Esempio n. 1
0
 def test_run(self):
     # Feed every test sequence through the GFF parser first.
     for record in self.sequences:
         gff_parser.run(record, self.config)
     # Then count the CDS features found on the first and second record.
     detected_result = tuple(
         len(utils.get_cds_features(self.sequences[index]))
         for index in (0, 1))
     expected_result = (1, 0)
     failure_message = "\nResult : %s\nExpected : %s" % (detected_result,
                                                         expected_result)
     self.assertEqual(detected_result, expected_result, msg=failure_message)
Esempio n. 2
0
def fastaseqlengths(seq_record):
    """Return the amino acid sequence length of every CDS in ``seq_record``.

    The result is a dict keyed by ``utils.get_gene_acc(cds)``; each value is
    the length of ``str(utils.get_aa_sequence(cds))``.
    """
    lengths_by_accession = {}
    for feature in utils.get_cds_features(seq_record):
        protein = str(utils.get_aa_sequence(feature))
        lengths_by_accession[utils.get_gene_acc(feature)] = len(protein)
    return lengths_by_accession
Esempio n. 3
0
def find_nr_cds(clusterpositions, seq_record):
    """Count the CDSs touching a candidate cluster and snap its borders to them.

    A CDS counts as part of the cluster when its start or end lies within
    ``clusterpositions`` or when it spans one of the cluster borders.

    Returns a ``(positions, count)`` tuple. ``positions`` is the smallest
    ``[start, end]`` covering all counted CDSs. The unchanged
    ``clusterpositions`` is returned instead when no CDS was counted, when
    ``seq_record`` is None, or when the counted CDSs reach both position 0 and
    the end of the sequence (a CDS wrapping around the genome origin would
    otherwise turn the whole genome into one cluster).
    """
    overlapping = []
    for feature in utils.get_cds_features(seq_record):
        location = feature.location
        if (clusterpositions[0] <= int(location.start) <= clusterpositions[1]
                or clusterpositions[0] <= int(location.end) <= clusterpositions[1]
                or int(location.start) <= clusterpositions[0] <= int(location.end)
                or int(location.start) <= clusterpositions[1] <= int(location.end)):
            overlapping.append(feature)
    if not overlapping:
        return clusterpositions, 0

    starts = [int(feature.location.start) for feature in overlapping]
    ends = [int(feature.location.end) for feature in overlapping]
    # Keep the original borders when the CDSs touch both ends of the sequence.
    keep_original = seq_record is None or (
        0 in starts and len(seq_record.seq) in ends)
    if keep_original:
        return clusterpositions, len(overlapping)
    return [min(starts), max(ends)], len(overlapping)
Esempio n. 4
0
def generate_searchgtr_htmls(seq_records, options):
    """Write a SEARCHGTr submission form for every matching CDS feature.

    A form is generated for each CDS whose smCOG annotation (as returned by
    ``utils.get_smcog_annotations``) is listed in ``gtrcoglist``. The form is
    written to ``<options.full_outputfolder_path>/html`` (the folder is
    created on demand). It consists of the first template part with
    "GlycTr" replaced by the gene id, the gene id and amino acid sequence,
    and the second template part.

    ``options.searchgtr_links`` is reset to an empty dict on every call and
    then filled with the relative location of each form, keyed by
    ``<seq_record.id>_<gene_id>``.
    """
    #Generate lists of COGs that are glycosyltransferases or transporters
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    searchgtrformtemplateparts = load_searchgtr_search_form_template()
    options.searchgtr_links = {}
    for seq_record in seq_records:
        smcogdict, _ = utils.get_smcog_annotations(seq_record)
        for feature in utils.get_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            # "in" replaces dict.has_key(), which was removed in Python 3.
            if gene_id not in smcogdict or smcogdict[gene_id] not in gtrcoglist:
                continue

            html_folder = options.full_outputfolder_path + os.sep + "html"
            if not os.path.exists(html_folder):
                os.mkdir(html_folder)
            form_name = gene_id + "_searchgtr.html"
            # The stored link is relative to the output folder.
            options.searchgtr_links[seq_record.id + "_" + gene_id] = \
                "html" + os.sep + form_name
            # The context manager closes the file even if a write fails.
            with open(html_folder + os.sep + form_name, "w") as formfile:
                formfile.write(
                    searchgtrformtemplateparts[0].replace("GlycTr", gene_id))
                formfile.write("%s\n%s" %
                               (gene_id, utils.get_aa_sequence(feature)))
                formfile.write(searchgtrformtemplateparts[1])
Esempio n. 5
0
def predict_class_from_gene_cluster(seq_record, cluster):
    '''
    Predict the lantipeptide class from the gene cluster

    Collects the domain names listed in the "Domains detected:" entries of
    the sec_met qualifiers of all CDS features lying completely inside the
    cluster, then derives the class from the domains found.

    Returns 'Class-I', 'Class-II', 'Class-III', or None when no
    class-defining domain was found.
    '''
    prefix = 'Domains detected:'
    found_domains = set()
    for feature in utils.get_cds_features(seq_record):
        if feature.location.start < cluster.location.start or \
           feature.location.end > cluster.location.end:
            continue

        if 'sec_met' not in feature.qualifiers:
            continue

        for entry in feature.qualifiers['sec_met']:
            if not entry.startswith(prefix):
                continue
            for domain in entry[len(prefix):].split(';'):
                fields = domain.split()
                # An empty chunk (e.g. after a trailing ';' or when no domain
                # is listed) has no name to record; skip it instead of
                # failing with an IndexError.
                if fields:
                    found_domains.add(fields[0])

    if 'Lant_dehyd_N' in found_domains or 'Lant_dehyd_C' in found_domains:
        return 'Class-I'
    if 'DUF4135' in found_domains:
        return 'Class-II'
    if 'Pkinase' in found_domains:
        # this could be class 3 or class 4, but as nobody has seen class 4
        # in vivo yet, we'll ignore that
        return 'Class-III'

    # Ok, no biosynthetic enzymes found, let's try the prepeptide
    if 'Gallidermin' in found_domains:
        return 'Class-I'

    return None
Esempio n. 6
0
def find_lan_a_features(seq_record, cluster):
    """Collect the CDS features inside ``cluster`` that may encode a precursor.

    A CDS lying completely inside the cluster is kept when its amino acid
    sequence is shorter than 80 residues, or when the first domain named in
    its "Domains detected:" sec_met entry is in ``known_precursor_domains``.
    """
    candidates = []
    for cds in utils.get_cds_features(seq_record):
        inside_cluster = (cds.location.start >= cluster.location.start
                          and cds.location.end <= cluster.location.end)
        if not inside_cluster:
            continue

        # Short peptides are accepted regardless of their annotation.
        if len(utils.get_aa_sequence(cds)) < 80:
            candidates.append(cds)
            continue

        if 'sec_met' not in cds.qualifiers:
            continue

        # The third whitespace-separated token of the first
        # "Domains detected:" entry is the first domain name.
        first_domain = next(
            (qualifier.split()[2] for qualifier in cds.qualifiers['sec_met']
             if qualifier.startswith('Domains detected:')),
            None)

        if first_domain is not None and first_domain in known_precursor_domains:
            candidates.append(cds)

    return candidates
Esempio n. 7
0
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the ClusterBlast cluster and protein tables for ``searchtype``.

    A mapping from ``utils.get_gene_acc(cds)`` to
    ``utils.get_gene_accession(cds)`` is built for every CDS of
    ``seq_record`` and handed to ``load_geneclusterproteins``.

    Returns a ``(clusters, proteins)`` tuple.
    """
    accession_by_gene = {}
    for feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(feature)
        accession_by_gene[utils.get_gene_acc(feature)] = accession
    return (load_geneclusters(searchtype),
            load_geneclusterproteins(accession_by_gene, searchtype))
Esempio n. 8
0
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the ClusterBlast cluster and protein tables for ``searchtype``.

    A mapping from ``utils.get_gene_acc(cds)`` to
    ``utils.get_gene_accession(cds)`` is built for every CDS of
    ``seq_record`` and handed to ``load_geneclusterproteins``.

    Returns a 5-tuple ``(clusters, proteinlocations, proteinstrands,
    proteinannotations, proteintags)``.
    """
    # The configuration object was previously fetched here but never used,
    # so the lookup has been dropped.
    accessiondict = {}
    for cds in utils.get_cds_features(seq_record):
        accessiondict[utils.get_gene_acc(cds)] = utils.get_gene_accession(cds)
    clusters = load_geneclusters(searchtype)
    proteinlocations, proteinstrands, proteinannotations, proteintags = \
        load_geneclusterproteins(accessiondict, searchtype)
    return clusters, proteinlocations, proteinstrands, proteinannotations, proteintags
Esempio n. 9
0
def seq_record_convert_nucl_to_prot(seq_records, options):
    """Convert the CDS features of the first nucleotide record into protein records.

    Only ``seq_records[0]`` is processed. For every CDS feature a protein
    ``SeqRecord`` is built from its ``translation`` qualifier, using the first
    ``product`` qualifier as id and name. The description is the text after
    "NRPS/PKS subtype: " in the ``sec_met`` qualifier, or "Unknown protein"
    when there is no such entry. Each protein record receives CDS_motif
    features for its "NRPS/PKS Domain: " entries and for the CDS_motif
    features of the nucleotide record that start inside the CDS.

    Returns the list of protein records, one per CDS feature.

    NOTE: the CDS_motif features of the nucleotide record are re-used, and
    their ``location`` is overwritten in place with protein coordinates.
    """
    seq_record = seq_records[0]
    cdsfeatures = utils.get_cds_features(seq_record)
    cdsmotifs = utils.get_all_features_of_type(seq_record, ["CDS_motif"])
    #Find corresponding cdsmotifs for each cdsfeature
    cdsmotifdict = {}
    for cdsfeature in cdsfeatures:
        for cdsmotif in cdsmotifs:
            if cdsfeature.location.start <= cdsmotif.location.start <= cdsfeature.location.end:
                # setdefault replaces the has_key()/else dance; has_key() no
                # longer exists in Python 3.
                cdsmotifdict.setdefault(cdsfeature.qualifiers['product'][0],
                                        []).append(cdsmotif)
    #For each cdsfeature, write a protein SeqRecord with CDS_motif features (abMotifs AND sec_met)
    subtype_tag = "NRPS/PKS subtype: "
    domain_tag = "NRPS/PKS Domain: "
    prot_seq_records = []
    for cdsfeature in cdsfeatures:
        #Extract sec_met info from feature
        sec_met = cdsfeature.qualifiers.get('sec_met', [])
        subtypes = [qual for qual in sec_met if subtype_tag in qual]
        if subtypes:
            cds_description = subtypes[0].partition(subtype_tag)[2]
        else:
            cds_description = "Unknown protein"
        cds_domains = [qual for qual in sec_met if domain_tag in qual]
        #Create protein seq_record
        translation = cdsfeature.qualifiers['translation'][0]
        product = cdsfeature.qualifiers['product'][0]
        prot_seq_record = SeqRecord(Seq(translation, IUPAC.protein),
                                    id=product, name=product,
                                    description=cds_description)
        utils.fix_record_name_id(prot_seq_record, options)
        #Add CDS_motif features based on NRPS/PKS domains
        cdsmotif_features = []
        for cds_domain in cds_domains:
            # The coordinates are the "start-end" text between " (" and "). "
            domainstart, domainend = cds_domain.partition(" (")[2].partition("). ")[0].split("-")
            domainlocation = FeatureLocation(int(domainstart), int(domainend))
            domain_feature = SeqFeature(domainlocation, type="CDS_motif")
            domain_feature.qualifiers['note'] = [cds_domain]
            cdsmotif_features.append(domain_feature)
        #Add CDS_motif features based on NRPS/PKS abMotifs
        # NOTE(review): motifs are grouped by product name, so CDSs sharing a
        # product name would share (and re-shift) the same motifs -- confirm
        # product names are unique.
        for cdsmotif in cdsmotifdict.get(product, []):
            # Nucleotide offsets become residue offsets; floor division keeps
            # them integral on Python 3 as well.
            newstart = (cdsmotif.location.start - cdsfeature.location.start) // 3
            newend = (cdsmotif.location.end - cdsfeature.location.start) // 3
            cdsmotif.location = FeatureLocation(newstart, newend)
            cdsmotif_features.append(cdsmotif)
        prot_seq_record.features.extend(cdsmotif_features)
        prot_seq_records.append(prot_seq_record)
    return prot_seq_records
Esempio n. 10
0
def find_col_id(geo_dataset, seq_records):
    """Determine which column of ``geo_dataset`` holds gene identifiers.

    For datasets of type "CSV" the column is always 0. Otherwise the first
    row of every entry in ``geo_dataset["data"]`` is scanned for a cell that
    equals, case-insensitively, the gene id of any CDS feature in
    ``seq_records``; the index of the first such cell is used. When nothing
    matches the column is set to -1.

    The result is stored in ``geo_dataset["info"]["col_id"]`` and the same
    ``geo_dataset`` object is returned.
    """
    if geo_dataset["info"]["type"] == "CSV":
        geo_dataset["info"]["col_id"] = 0
        return geo_dataset

    # Collect the gene ids once instead of re-walking every feature for every
    # cell of every row.
    known_gene_ids = set(
        utils.get_gene_id(feature).upper()
        for seq_record in seq_records
        for feature in utils.get_cds_features(seq_record))

    # Without gene ids nothing can match, so the data is not inspected at all.
    if known_gene_ids:
        for data in geo_dataset["data"].values():
            # enumerate() replaces the Python 2 only xrange() index loop.
            for i, cell in enumerate(data[0]):
                if cell.upper() in known_gene_ids:
                    geo_dataset["info"]["col_id"] = i
                    return geo_dataset

    geo_dataset["info"]["col_id"] = -1
    return geo_dataset
Esempio n. 11
0
def getECs(seq_record, options):
    """Annotate the CDS features of ``seq_record`` with EFICAz EC predictions.

    Returns immediately unless ``name`` (presumably this plugin's
    module-level name -- confirm) is listed in ``options.ecpred``.
    ``options.cpus`` defaults to 1 when unset.

    For every CDS feature:
    - a 4-digit prediction overwrites any existing ``EC_number`` qualifier;
    - a 3-digit prediction is used only when there is no 4-digit prediction,
      and it replaces an existing ``EC_number`` only when that annotation
      contains no 4-digit EC number;
    - a note describing each prediction is appended to the ``note`` qualifier.
    """
    logging.debug("Predicting EC numbers with EFICAz")
    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning..." % name)
        return

    if 'cpus' not in options:
        options.cpus = 1

    EFICAzECs = EFICAzECPrediction(seq_record, options)
    EFICAzECs.runECpred()
    logging.debug("Found %s predictions for EC4" %
                  len(EFICAzECs.getEC4Dict().keys()))

    for feature in utils.get_cds_features(seq_record):
        featureID = utils.get_gene_id(feature)
        qualifiers = feature.qualifiers

        # "in"/get() replace dict.has_key(), which was removed in Python 3.
        notes = qualifiers.get('note', [])

        # Look the predictions up once per feature.
        # NOTE(review): assumes the getters are side-effect free -- confirm.
        ec4 = EFICAzECs.getEC4(featureID)
        ec3 = EFICAzECs.getEC3(featureID)

        if ec4:
            logging.debug("Annotating %s" % featureID)
            if 'EC_number' in qualifiers:
                # logging.warn() is a deprecated alias of logging.warning()
                logging.warning('ECpredictor[eficaz]: Overwriting existing EC annotation: %s  with %s' %
                                (", ".join(qualifiers['EC_number']), ", ".join(ec4)))
            qualifiers['EC_number'] = ec4
            notes.append("EFICAz EC number prediction: EC4: {0}; {1}".format(
                ", ".join(ec4), "; ".join(EFICAzECs.getEC4Info(featureID))))
        # Only annotate 3 digit EC if no 4 digit EC is available
        if ec3 and not ec4:
            if 'EC_number' in qualifiers:
                # Raw string: "\d" and "\." are not valid string escapes.
                if not re.search(r"\d+\.\d+\.\d+\.\d+",
                                 " ".join(qualifiers['EC_number'])):
                    logging.warning('ECpredictor[eficaz]: Overwriting existing EC annotation: %s  with %s' %
                                    (", ".join(qualifiers['EC_number']), ", ".join(ec3)))
                    qualifiers['EC_number'] = ec3

        ec3_info = EFICAzECs.getEC3Info(featureID)
        if ec3_info:
            notes.append("EFICAz EC number prediction: EC3: {0}; {1}".format(
                ", ".join(ec3), "; ".join(ec3_info)))
            if 'EC_number' not in qualifiers:
                qualifiers['EC_number'] = ec3

        qualifiers['note'] = notes
    logging.debug("Finished EC number prediction with EFICAz")
Esempio n. 12
0
def find_nr_cds(clusterpositions, seq_record):
    """Count the CDSs touching a candidate cluster and snap its borders to them.

    A CDS counts as part of the cluster when its start or end lies within
    ``clusterpositions`` or when it spans one of the cluster borders.

    Returns ``(positions, count)``. ``positions`` is the smallest
    ``[start, end]`` covering all counted CDSs, or the unchanged
    ``clusterpositions`` when no CDS was counted.
    """
    overlapping = []
    for feature in utils.get_cds_features(seq_record):
        location = feature.location
        if (clusterpositions[0] <= int(location.start) <= clusterpositions[1]
                or clusterpositions[0] <= int(location.end) <= clusterpositions[1]
                or int(location.start) <= clusterpositions[0] <= int(location.end)
                or int(location.start) <= clusterpositions[1] <= int(location.end)):
            overlapping.append(feature)
    if not overlapping:
        return clusterpositions, 0

    first_start = min([int(feature.location.start) for feature in overlapping])
    last_end = max([int(feature.location.end) for feature in overlapping])
    return [first_start, last_end], len(overlapping)
Esempio n. 13
0
    def _getMultiFastaList(self):
        """Return one FASTA-formatted string per translated CDS of the record.

        The sequence is the first ``translation`` qualifier of each CDS of
        ``self.seq_record`` with any "-" characters removed. CDSs whose
        sequence is empty are skipped and logged at debug level.
        """
        entries = []
        for cds in utils.get_cds_features(self.seq_record):
            identifier = utils.get_gene_id(cds)
            protein = cds.qualifiers['translation'][0]
            if "-" in str(protein):
                gapless = str(protein).replace("-", "")
                protein = Seq(gapless, generic_protein)

            # Never write empty fasta entries
            if len(protein) != 0:
                entries.append(">%s\n%s\n" % (identifier, protein))
            else:
                logging.debug("No translation for %s, skipping" % identifier)

        return entries
Esempio n. 14
0
def remove_irrelevant_allorfs(seq_record):
    """Prune redundant ORF features from ``seq_record.features`` in place.

    CDS features whose ``note`` qualifier contains "auto-all-orf" are
    treated as automatically detected ORFs; all other CDS features are
    "other" features. For every auto ORF:

    - without a ``sec_met`` qualifier: the auto ORF is removed;
    - with ``sec_met`` and overlapping an other feature that also has
      ``sec_met``: the auto ORF is removed;
    - with ``sec_met`` and no such overlap: every other feature overlapping
      the auto ORF is removed instead.
    """
    def is_auto_orf(feature):
        # "in" replaces dict.has_key(), which was removed in Python 3.
        return ('note' in feature.qualifiers
                and "auto-all-orf" in feature.qualifiers['note'])

    allfeatures = utils.get_cds_features(seq_record)
    auto_orf_features = [feature for feature in allfeatures
                         if is_auto_orf(feature)]
    other_features = [feature for feature in allfeatures
                      if not is_auto_orf(feature)]

    to_delete = []
    for autofeature in auto_orf_features:
        if 'sec_met' not in autofeature.qualifiers:
            to_delete.append(autofeature)
            continue
        overlapping = [otherfeature for otherfeature in other_features
                       if overlaps(autofeature, otherfeature)]
        if any('sec_met' in otherfeature.qualifiers
               for otherfeature in overlapping):
            # An annotated feature already covers this region.
            to_delete.append(autofeature)
        else:
            # None of the overlapping features carries sec_met here, so the
            # auto ORF wins and all of them are dropped.
            to_delete.extend(overlapping)

    featurenrs = [idx for idx, feature in enumerate(seq_record.features)
                  if feature in to_delete]
    # Delete from the back so the remaining indices stay valid.
    for featurenr in reversed(featurenrs):
        del seq_record.features[featurenr]
Esempio n. 15
0
def find_flavoprotein(seq_record, cluster):
    """Look for an epiD-like flavoprotein responsible for aminovinylcystein

    Returns True when a CDS lying completely inside ``cluster`` has a
    "Domains detected:" sec_met entry whose first domain is 'Flavoprotein',
    False otherwise.
    """
    for feature in utils.get_cds_features(seq_record):
        if feature.location.start < cluster.location.start or \
           feature.location.end > cluster.location.end:
            continue

        if 'sec_met' not in feature.qualifiers:
            continue

        domain = None
        for entry in feature.qualifiers['sec_met']:
            if entry.startswith('Domains detected:'):
                # Third whitespace-separated token is the first domain name
                domain = entry.split()[2]
                break

        if domain is None:
            continue

        # Compare for equality: "domain in 'Flavoprotein'" was a substring
        # test and also accepted any fragment of the name.
        if domain == 'Flavoprotein':
            return True

    return False
Esempio n. 16
0
def find_short_chain_dehydrogenase(seq_record, cluster):
    """Look for an eciO-like short-chain dehydrogenase responsible for N-terminal lactone

    Returns True when a CDS lying completely inside ``cluster`` has a
    "Domains detected:" sec_met entry whose first domain is 'adh_short' or
    'adh_short_C2', False otherwise.
    """
    for cds in utils.get_cds_features(seq_record):
        inside_cluster = (cds.location.start >= cluster.location.start
                          and cds.location.end <= cluster.location.end)
        if not inside_cluster or 'sec_met' not in cds.qualifiers:
            continue

        # Third whitespace-separated token of the first matching entry
        first_domain = next(
            (qualifier.split()[2] for qualifier in cds.qualifiers['sec_met']
             if qualifier.startswith('Domains detected:')),
            None)

        if first_domain is not None and \
           first_domain in ('adh_short', 'adh_short_C2'):
            return True

    return False
Esempio n. 17
0
def find_p450_oxygenase(seq_record, cluster):
    """Look for a p450 oxygenase

    Returns True when a CDS lying completely inside ``cluster`` has a
    "Domains detected:" sec_met entry whose first domain is 'p450', False
    otherwise.
    """
    for feature in utils.get_cds_features(seq_record):
        if feature.location.start < cluster.location.start or \
           feature.location.end > cluster.location.end:
            continue

        if 'sec_met' not in feature.qualifiers:
            continue

        domain = None
        for entry in feature.qualifiers['sec_met']:
            if entry.startswith('Domains detected:'):
                # Third whitespace-separated token is the first domain name
                domain = entry.split()[2]
                break

        if domain is None:
            continue

        # Compare for equality: "domain in 'p450'" was a substring test and
        # also accepted any fragment of the name.
        if domain == 'p450':
            return True

    return False
Esempio n. 18
0
def find_clusters(seq_record, rulesdict, overlaps):
    """Detect gene clusters from the identified core genes of ``seq_record``.

    CDS features are visited in the order returned by
    ``utils.get_cds_features``. Only features whose ``sec_met`` qualifier has
    a "Type: " entry are considered. Consecutive core features are merged
    into one cluster while they stay within the distance cutoff and/or the
    gene-number cutoff (see ``cfg.gene_num_cutoff_only``); otherwise a new
    cluster is started.

    ``rulesdict[type][1]`` is used as the cutoff and ``rulesdict[type][2]``
    as the extension of a cluster type. ``overlaps`` is passed to
    ``get_dynamic_cutoff_multiplier`` and ``overlaps[1]`` is indexed by gene
    id (presumably giving each gene's ordinal position -- confirm).

    The resulting "cluster" features are appended to ``seq_record.features``
    and ``cfg.next_clusternr`` is advanced; nothing is returned.
    """
    features = utils.get_cds_features(seq_record)
    # NOTE(review): clustertype is never read in this function.
    clustertype = ""
    clusters = []
    cfg = config.get_config()
    clusternr = cfg.next_clusternr
    # Cutoff of the most recently processed core feature
    last_cutoff = 0
    # Gene ids of the core features in the cluster currently being built
    cluster_cds = []

    for feature in features:
        within_cutoff = False
        if ('sec_met' in feature.qualifiers) and (len([
                feat
                for feat in feature.qualifiers['sec_met'] if "Type: " in feat
        ]) > 0):
            feature_start = min(feature.location.start, feature.location.end)
            feature_end = max(feature.location.start, feature.location.end)
            # The type is the text following "Type: " in the first such entry
            feature_type = [
                feat for feat in feature.qualifiers['sec_met']
                if "Type: " in feat
            ][0].partition("Type: ")[2]
            # Hybrid types are hyphen-separated; use the largest cutoff and
            # extension among the component types
            feature_cutoff = max(
                [rulesdict[value][1] for value in feature_type.split("-")])
            feature_extension = max(
                [rulesdict[value][2] for value in feature_type.split("-")])
            if (cfg.enable_dynamic_cutoff):
                # Scale both distances by a per-gene multiplier
                multiply_cutoff = get_dynamic_cutoff_multiplier(
                    utils.get_gene_id(feature), overlaps)
                feature_cutoff = int(feature_cutoff * multiply_cutoff)
                feature_extension = int(feature_extension * multiply_cutoff)
            cluster = None

            if len(clusters) > 0:
                cluster = clusters[-1]
                # NOTE(review): cluster_start is never read.
                cluster_start = cluster.location.start
                cluster_end = cluster.location.end
                # Check cutoff
                cutoff = max(last_cutoff, feature_cutoff)
                within_cutoff = feature_start <= cluster_end + cutoff
                # Smallest difference between this gene's overlaps[1] value
                # and that of any core gene already in the cluster, minus one
                within_gene_num_cutoff = (min([
                    abs(overlaps[1][utils.get_gene_id(feature)] -
                        overlaps[1][ncds]) for ncds in cluster_cds
                ]) - 1 <= cfg.gene_num_cutoff)
                if (cfg.gene_num_cutoff_only):
                    within_cutoff = within_gene_num_cutoff
                else:
                    within_cutoff = within_cutoff or within_gene_num_cutoff

            if not within_cutoff:
                if len(clusters) > 0:
                    # Finalize the last extended cluster
                    # (grow it by its extension, clamped to the record bounds)
                    cluster = clusters[-1]
                    cluster.location = FeatureLocation(
                        max(
                            0, cluster.location.start -
                            cluster.qualifiers['extension'][0]),
                        min(
                            len(seq_record), cluster.location.end +
                            cluster.qualifiers['extension'][0]))
                # Create new cluster
                new_cluster = SeqFeature(FeatureLocation(
                    feature_start, feature_end),
                                         type="cluster")
                new_cluster.qualifiers['note'] = [
                    "Cluster number: " + str(clusternr)
                ]
                new_cluster.qualifiers['cutoff'] = [feature_cutoff]
                new_cluster.qualifiers['extension'] = [feature_extension]
                new_cluster.qualifiers['product'] = [feature_type]
                clusters.append(new_cluster)
                cluster = clusters[-1]
                cluster_cds = [utils.get_gene_id(feature)]
                clusternr += 1

            # Update cluster
            # (widen it to cover this feature and keep the largest cutoff and
            # extension seen so far)
            last_cutoff = feature_cutoff
            cluster.location = FeatureLocation(
                min(cluster.location.start, feature_start),
                max(cluster.location.end, feature_end))
            cluster.qualifiers['cutoff'] = [
                max(cluster.qualifiers['cutoff'][0], feature_cutoff)
            ]
            cluster.qualifiers['extension'] = [
                max(cluster.qualifiers['extension'][0], feature_extension)
            ]
            # Merge the hyphen-separated type names of cluster and feature
            cluster.qualifiers['product'] = [
                "-".join(
                    list(
                        set(cluster.qualifiers['product'][0].split('-'))
                        | set(feature_type.split('-'))))
            ]
            # In a hybrid product the generic "other" type is dropped
            if "-" in cluster.qualifiers['product'][0]:
                cluster.qualifiers['product'] = [
                    "-".join([
                        ct
                        for ct in cluster.qualifiers['product'][0].split('-')
                        if ct != "other"
                    ])
                ]
            if (utils.get_gene_id(feature) not in cluster_cds):
                cluster_cds.append(utils.get_gene_id(feature))

    if len(clusters) > 0:
        # Finalize the last extended cluster
        cluster = clusters[-1]
        cluster.location = FeatureLocation(
            max(0,
                cluster.location.start - cluster.qualifiers['extension'][0]),
            min(len(seq_record),
                cluster.location.end + cluster.qualifiers['extension'][0]))

    seq_record.features.extend(clusters)
    # Persist the counter so later calls continue the numbering
    cfg.next_clusternr = clusternr
Esempio n. 19
0
def find_clusters(seq_record, rulesdict):
    """Detect gene clusters from the identified core genes of ``seq_record``.

    Only CDS features whose ``sec_met`` qualifier has a "Type: " entry other
    than "none" are considered. A core feature joins the previous cluster
    when it starts within reach of that cluster's end; otherwise the previous
    cluster is finalized and a new one is started. ``rulesdict[type][1]`` is
    used as the cutoff and ``rulesdict[type][2]`` as the extension of a type.

    The resulting "cluster" features get a ``contig_edge`` qualifier
    ("True"/"False") and are appended to ``seq_record.features``.
    ``cfg.next_clusternr`` is advanced; nothing is returned.
    """
    def apply_extension(finished):
        # Grow a finished cluster by its extension, clamped to the record.
        extension = finished.qualifiers['extension'][0]
        finished.location = FeatureLocation(
            max(0, finished.location.start - extension),
            min(len(seq_record), finished.location.end + extension))

    features = utils.get_cds_features(seq_record)
    clusters = []
    cfg = config.get_config()
    clusternr = cfg.next_clusternr

    for feature in features:
        if 'sec_met' not in feature.qualifiers:
            continue
        type_entries = [
            entry for entry in feature.qualifiers['sec_met']
            if "Type: " in entry
        ]
        if not type_entries:
            continue
        feature_start = min(feature.location.start, feature.location.end)
        feature_end = max(feature.location.start, feature.location.end)
        feature_type = type_entries[0].partition("Type: ")[2]
        if feature_type == "none":
            continue
        # Hybrid types are hyphen-separated; take the largest values.
        feature_cutoff = max(
            [rulesdict[rule][1] for rule in feature_type.split("-")])
        feature_extension = max(
            [rulesdict[rule][2] for rule in feature_type.split("-")])

        joins_previous = False
        if clusters:
            previous = clusters[-1]
            # Check cutoff: the larger of both cutoffs, or the sum of both
            # extensions when that is larger still.
            reach = max(previous.qualifiers['cutoff'][0], feature_cutoff)
            reach = max(
                reach, previous.qualifiers['extension'][0] + feature_extension)
            joins_previous = feature_start <= previous.location.end + reach

        if joins_previous:
            cluster = clusters[-1]
        else:
            if clusters:
                # Finalize the last extended cluster
                apply_extension(clusters[-1])
            # Create new cluster
            cluster = SeqFeature(FeatureLocation(feature_start, feature_end),
                                 type="cluster")
            cluster.qualifiers['note'] = ["Cluster number: " + str(clusternr)]
            cluster.qualifiers['cutoff'] = [feature_cutoff]
            cluster.qualifiers['extension'] = [feature_extension]
            cluster.qualifiers['product'] = [feature_type]
            clusters.append(cluster)
            clusternr += 1

        # Update cluster
        cluster.location = FeatureLocation(
            min(cluster.location.start, feature_start),
            max(cluster.location.end, feature_end))
        cluster.qualifiers['cutoff'] = [
            max(cluster.qualifiers['cutoff'][0], feature_cutoff)
        ]
        cluster.qualifiers['extension'] = [
            max(cluster.qualifiers['extension'][0], feature_extension)
        ]
        merged_types = (set(cluster.qualifiers['product'][0].split('-'))
                        | set(feature_type.split('-')))
        product = "-".join(list(merged_types))
        if "-" in product:
            # In a hybrid product the generic "other" type is dropped.
            product = "-".join(
                [ct for ct in product.split('-') if ct != "other"])
        cluster.qualifiers['product'] = [product]

    if clusters:
        # Finalize the last extended cluster
        apply_extension(clusters[-1])

    for cluster in clusters:
        #Add a note to specify whether a cluster lies on the contig/scaffold edge or not
        on_edge = (cluster.location.start == 0
                   or cluster.location.end == len(seq_record))
        cluster.qualifiers['contig_edge'] = "True" if on_edge else "False"

    seq_record.features.extend(clusters)
    cfg.next_clusternr = clusternr
Esempio n. 20
0
def main():
    """Run a mock-up antiSMASH detection pass per genome file and dump statistics.

    The first command-line argument (``sys.argv[1]``) is the path of a text
    file holding one input sequence path per line.  For every listed file the
    records are parsed, HMM hits are looked up in ``options.hmm_results``,
    clusters are detected, and per-record counters are stored in a nested
    dict keyed by file path and then by record id.

    The collected dict is serialised to ``result.js`` (as a JavaScript
    ``var result = ...;`` assignment) after each input file, so the output
    file always reflects the files processed so far.
    """
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files; the context manager closes the list file again
    with open(sys.argv[1], 'r') as file_list:
        files = [
            path.expanduser(line.replace("\n", "")) for line in file_list
        ]

    # mockup antismash run per files
    for i, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        # single parenthesised argument: valid as Python 2 statement and
        # as Python 3 function call
        print("Processing %s... (%d/%d)" % (fpath, i, len(files)))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(
            options)  #To-DO: get antismash logging to works!

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        for seq_record in seq_records:
            if options.input_type == 'nucl':
                # NOTE(review): rebinding seq_records here does not affect
                # the list this loop is iterating, so records of <= 1000 bp
                # are still processed unless *no* record is longer than
                # that. Kept as-is; confirm whether the filter was meant to
                # run once before the loop.
                seq_records = [
                    record for record in seq_records if len(record.seq) > 1000
                ]
                if len(seq_records) < 1:
                    continue
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for feature in utils.get_cds_features(seq_record):
                gene_id = utils.get_gene_id(feature)
                hmm_key = prefix + gene_id
                if hmm_key in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[hmm_key]
                    results.extend(results_by_id[gene_id])

            # ignore short aa's: temporarily take CDSs shorter than
            # min_length_aa without any HMM hit out of the record.  The
            # feature list is rebuilt instead of calling remove() while
            # iterating, which would skip the element following each removal.
            min_length_aa = 100
            short_cds_buffer = []
            kept_features = []
            for f in seq_record.features:
                if (f.type == "CDS"
                        and len(f.qualifiers['translation'][0]) < min_length_aa
                        and utils.get_gene_id(f) not in results_by_id):
                    short_cds_buffer.append(f)
                else:
                    kept_features.append(f)
            # slice assignment keeps the same list object on the record
            seq_record.features[:] = kept_features

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)
            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            # stats is the very dict stored in res_object, so every update
            # below is visible in the serialised result
            stats = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }
            res_object[fpath][seq_record.id] = stats

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            per_hits = stats["per_hits"]
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    stats["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    per_hits[domain_name] = per_hits.get(domain_name, 0) + 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            # put the temporarily removed short CDSs back
            seq_record.features.extend(short_cds_buffer)
            stats["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            #Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            #before writing to output, remove all hmm_detection's subdir prefixes from clustertype
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = []
                    for name in prod.split('-'):
                        prod_name.append(name.split('/')[-1])
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [
                                (ct.split('/')[-1])
                                for ct in row.split('Type: ')[-1].split('-')
                            ]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                # strip the subdir prefix from the domain
                                # name, keep the " (E-value..." tail
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split(
                                        '/')[-1] + ' (E-value' +
                                    cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            #on plants, remove plant clustertype from hybrid types, and replace single
            #plant clustertype with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(
                                set(row.split('Type: ')[-1].split('-')))
                            if (len(clustertypes) > 1) and ("plant"
                                                            in clustertypes):
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find largest cdhit number & largest domain diversity in a cluster
            # (the "average_*" entries hold the median over all clusters)
            stats["average_cdhit"] = 0
            stats["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            cluster_types = stats["cluster_types"]
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                cluster_types[cluster_type] = cluster_types.get(
                    cluster_type, 0) + 1
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(utils.get_cluster_domains(
                    cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > stats["largest_cdhit"]:
                    stats["largest_cdhit"] = num_cdhit
                if num_domain > stats["largest_domain_variations"]:
                    stats["largest_domain_variations"] = num_domain
            if len(cdhit_numbers) > 0:
                stats["average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                stats["average_domain_variations"] = numpy.median(
                    domain_numbers)

        # rewritten after every input file with everything gathered so far
        with open('result.js', 'w') as h:
            h.write('var result = %s;' % json.dumps(res_object, indent=4))
Esempio n. 21
0
def get_targetGenomeInfo(seq_records, options):
    """Collect EC numbers and products per locus tag and write a protein FASTA.

    Every CDS feature of every record is visited.  A CDS without a
    ``locus_tag`` qualifier gets one assigned: the id returned by
    ``utils.get_gene_id`` unless that is ``"no_tag_found"``, in which case a
    generated ``automodelorfNNNNN`` tag is used.  CDSs that carry a
    ``translation`` qualifier are written to
    ``<options.metabolicmodeldir>/targetGenome_locusTag_aaSeq.fa`` with the
    locus tag as FASTA header.

    Parameters:
        seq_records: sequence records whose CDS features are scanned; the
            features' ``locus_tag`` qualifiers may be filled in as a side
            effect.
        options: object providing ``metabolicmodeldir``, the directory the
            FASTA file is written to.

    Returns:
        A 3-tuple ``(ec_dict, prod_dict, fasta_filename)`` where ``ec_dict``
        maps locus tag to the list stored under ``EC_number``, ``prod_dict``
        maps locus tag to the first ``product`` value, and
        ``fasta_filename`` is the path of the FASTA file written.
    """
    targetFastaFilename = options.metabolicmodeldir + os.sep + 'targetGenome_locusTag_aaSeq.fa'
    targetGenome_locusTag_ec_dict = {}
    targetGenome_locusTag_prod_dict = {}

    counter_for_temp_locusTags = 1
    # The context manager closes the FASTA file even if a record fails
    with open(targetFastaFilename, 'w') as fp:
        # Reads GenBank file
        for seq_record in seq_records:
            logging.debug(
                '[MetabolicModeling] processing sequence id "%s" out of %s sequences',
                seq_record.id, len(seq_records))
            for feature in utils.get_cds_features(seq_record):
                qualifiers = feature.qualifiers

                # Retrieving "locus_tag (i.e., ORF name)" for each CDS
                locusTag = qualifiers.get('locus_tag', ['-'])[0]
                logging.debug("Found locus_tag %s for feature %s", locusTag,
                              utils.get_gene_id(feature))
                # Assign own locus tag, if not set:
                if locusTag == "-":
                    gene_id = utils.get_gene_id(feature)
                    if not gene_id == "no_tag_found":
                        locusTag = gene_id
                    else:
                        locusTag = "automodelorf{0:05d}".format(
                            counter_for_temp_locusTags)
                    qualifiers['locus_tag'] = [locusTag]
                    # get_gene_id is queried again on purpose: the feature
                    # now carries the new locus_tag qualifier
                    logging.debug("replaced locus tag to %s for %s.",
                                  locusTag, utils.get_gene_id(feature))
                    # The counter advances for every CDS lacking a locus
                    # tag, not only for those given a generated name
                    counter_for_temp_locusTags += 1

                # Note that the numbers of CDS and "translation" do not match.
                # There are occasions that CDS does not have "translation".
                if 'translation' in qualifiers:
                    # Retrieving "translation (i.e., amino acid sequences)" for each CDS
                    translation = qualifiers['translation']
                    fp.write('>%s\n%s\n' % (str(locusTag),
                                            str(translation[0])))

                # Used to find "and" relationship in the GPR association
                if 'product' in qualifiers:
                    targetGenome_locusTag_prod_dict[locusTag] = qualifiers[
                        'product'][0]

                # Watch multiple EC_number's: the whole list is stored
                if 'EC_number' in qualifiers:
                    targetGenome_locusTag_ec_dict[locusTag] = qualifiers[
                        'EC_number']

    # Check if the gbk file has EC_number
    # Additional conditions should be given upon setup of in-house EC_number assigner
    logging.debug("len(targetGenome_locusTag_ec_dict.keys):")
    logging.debug(len(targetGenome_locusTag_ec_dict))
    logging.debug("len(targetGenome_locusTag_prod_dict.keys):")
    logging.debug(len(targetGenome_locusTag_prod_dict))

    return targetGenome_locusTag_ec_dict, targetGenome_locusTag_prod_dict, targetFastaFilename
Esempio n. 22
0
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Function to be executed by module: detect gene clusters on one record.

    HMM hits for the record's CDS features are taken from
    ``options.hmm_results`` (keyed ``"<record id with ':' -> '_'>:<gene id>"``),
    filtered, written back to ``options.hmm_results``, and turned into
    cluster annotations on ``seq_record`` by the cluster rules.

    Parameters:
        seq_record: record to annotate; its feature list is modified in place.
        enabled_clustertypes: cluster types passed to the rule creation and
            rule application steps.
        options: run options; reads ``hmm_results``, ``ignore_short_aa``,
            ``eukaryotic``, ``all_orfs`` and ``enable_cdhit`` and updates
            ``hmm_results``.
    """
    logging.info('Detecting gene clusters using HMM library')
    feature_by_id = utils.get_feature_dict(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    results = []
    # NOTE(review): sig_by_name is not read again in this function; the call
    # is kept in case get_sig_profiles() is relied on for validation.
    sig_by_name = {}
    results_by_id = {}
    for sig in get_sig_profiles():
        sig_by_name[sig.name] = sig

    # Key prefix used for this record in options.hmm_results
    prefix = "%s:" % seq_record.id.replace(":", "_")

    for feature in utils.get_cds_features(seq_record):
        gene_id = utils.get_gene_id(feature)
        hmm_key = prefix + gene_id
        if hmm_key in options.hmm_results:
            results_by_id[gene_id] = options.hmm_results[hmm_key]
            results.extend(results_by_id[gene_id])

    short_cds_buffer = []
    if options.ignore_short_aa:
        # Temporarily filter out cds with < prot_min_length AA length
        min_length_aa = 100 if options.eukaryotic else 50
        # Rebuild the list rather than remove() while iterating, which would
        # skip the feature following every removed one.
        kept_features = []
        for f in seq_record.features:
            if (f.type == "CDS"
                    and len(f.qualifiers['translation'][0]) < min_length_aa
                    and utils.get_gene_id(f) not in results_by_id):
                short_cds_buffer.append(f)
            else:
                kept_features.append(f)
        # slice assignment keeps the same list object on the record
        seq_record.features[:] = kept_features

    #Get overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    #Filter results by comparing scores of different models (for PKS systems)
    ids_before_filtering = list(results_by_id)
    results, results_by_id = filter_results(results, results_by_id, overlaps,
                                            feature_by_id)

    #Update filtered results back to the options.hmm_results
    for gene_id in results_by_id:
        hmm_key = prefix + gene_id
        if hmm_key in options.hmm_results:
            options.hmm_results[hmm_key] = results_by_id[gene_id]
    # Genes that lost all their hits in the filter are dropped entirely
    for gene_id in ids_before_filtering:
        hmm_key = prefix + gene_id
        if gene_id not in results_by_id and hmm_key in options.hmm_results:
            del options.hmm_results[hmm_key]

    #Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    #Rearrange hybrid clusters name in typedict alphabetically
    fix_hybrid_clusters_typedict(typedict)

    #Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    #Save final results to seq_record
    for cds in results_by_id.keys():
        feature = feature_by_id[cds]
        if typedict[cds] != "none":
            _update_sec_met_entry(feature, results_by_id[cds], typedict[cds],
                                  nseqdict)

    find_clusters(seq_record, rulesdict, overlaps)

    #Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    #Rearrange hybrid clusters name alphabetically
    fix_hybrid_clusters(seq_record)

    #Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    # Re-add the short CDSs
    seq_record.features.extend(short_cds_buffer)
    utils.sort_features(seq_record)

    #If all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)

    #Display %identity
    if options.enable_cdhit:
        store_percentage_identities(seq_record)
Esempio n. 23
0
 def test_get_cds_features(self):
     """utils.get_cds_features() must equal the generic lookup for type CDS."""
     # Result of the dedicated CDS accessor on the fixture record
     detected = utils.get_cds_features(self.rec)
     # Reference list obtained through the type-based accessor
     expected = utils.get_all_features_of_type(self.rec, "CDS")
     self.assertListEqual(detected, expected)