Python get_cluster_cds_features 예제들, antismash.utils.get_cluster_cds_features Python 예제들

예제 #1

0

파일 보기

파일: test_utils.py 프로젝트: chevrm/transPACT

    def test_get_cluster_cds_features(self):
        """Check utils.get_cluster_cds_features() against the fixture features."""
        first_cluster, last_cluster = utils.get_cluster_features(self.record)
        self.assertEqual(self.features[0], first_cluster)
        self.assertEqual(self.features[-1], last_cluster)

        # Each cluster is paired with the slice of fixture features that the
        # lookup is expected to return for it.
        expectations = (
            (first_cluster, self.features[3:6]),
            (last_cluster, self.features[-3:-1]),
        )
        for cluster, expected_cds in expectations:
            found_cds = utils.get_cluster_cds_features(cluster, self.record)
            self.assertEqual(expected_cds, found_cds)

예제 #2

0

파일 보기

def write(seq_records, options):
    """Export antiSMASH cluster information as TXT tables.

    For every record that has at least one cluster feature, one file per
    table name is created in the ``txt`` subdirectory of
    ``options.full_outputfolder_path`` and filled by the matching
    ``write_tables.write_<table>`` function.

    Args:
        seq_records: Iterable of sequence records to export.
        options: Run options; ``input_type`` and ``full_outputfolder_path``
            are read here and the object is passed on to the table writers.

    Returns:
        None. Nothing is written when ``options.input_type`` is ``'prot'``.
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    # Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    # Define table names
    tables = ("genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS",
              "smCOG", "RiPP", "transltable")
    # For each record with gene clusters, write out info to TXT files
    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        if len(clusters) == 0:
            continue
        txt_files = {}
        try:
            # Open up TXT files
            for table in tables:
                txt_files[table] = open(
                    path.join(
                        txt_outfolder, "%s_%s.txt" %
                        (seq_record.id.partition(".")[0], table)), "w")
            # Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions = {}, {}, {}
            info.cdsmotifs, info.clusternrs = {}, []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                # One lookup serves both the gene ID and accession lists.
                cluster_cds = utils.get_cluster_cds_features(
                    cluster, seq_record)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds) for cds in cluster_cds
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds) for cds in cluster_cds
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            # Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
        finally:
            # Close whatever was opened, even if opening a later file or one
            # of the table writers raised.
            for handle in txt_files.values():
                handle.close()

예제 #3

0

파일 보기

파일: __init__.py 프로젝트: tina-r/galaxytools

def write(seq_records, options):
    """Dump the protein sequence of every cluster CDS into one FASTA file.

    The output file lives in ``options.outputfoldername`` and is named after
    the id of the first record, which is read via ``seq_records[0]``.

    Args:
        seq_records (sequence): Bio.SeqRecords to export
        options (argparse.Namespace): The options passed to the program
    """
    first_id = seq_records[0].id
    fasta_name = "%s_genecluster_proteins.fa" % first_id
    output_name = path.join(options.outputfoldername, fasta_name)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cds_features = utils.get_cluster_cds_features(
                    cluster, seq_record)
                for feature in cds_features:
                    qualifiers = feature.qualifiers
                    header_fields = (
                        qualifiers['locus_tag'][0],
                        qualifiers['protein_id'][0],
                        clustertype,
                        clusternr,
                        qualifiers['product'][0],
                    )
                    handle.write('>%s:%s %s #%s - %s\n' % header_fields)
                    # Sequence lines are wrapped at 60 characters.
                    wrapped = textwrap.wrap(qualifiers['translation'][0], 60)
                    handle.write('%s\n' % '\n'.join(wrapped))

예제 #4

0

파일 보기

파일: __init__.py 프로젝트: abner24/plantismash

def store_percentage_identities(seq_record):
    """Annotate grouped sec_met CDS features with percentage identities.

    For every cluster in ``seq_record``, the CDS features that carry a
    ``sec_met`` qualifier are grouped with ``utils.get_cdhit_table``. For each
    group holding more than one gene, a ``"Percentage identity: ..."`` entry
    listing comma-separated ``other_gene=value`` pairs is stored in the
    ``sec_met`` qualifier of every member, replacing any earlier entry.

    Args:
        seq_record: Record whose CDS feature qualifiers are updated in place.
    """
    clusters = utils.get_cluster_features(seq_record)
    cfg = config.get_config()
    for cluster in clusters:
        features = [
            feature
            for feature in utils.get_cluster_cds_features(cluster, seq_record)
            if 'sec_met' in feature.qualifiers
        ]
        # The second return value is not needed here.
        cdhit_table, _gene_to_cluster = utils.get_cdhit_table(
            features, float(cfg.cdh_display_cutoff))
        for cdhit_cluster in cdhit_table:
            group_genes = cdhit_cluster["genes"]
            if len(group_genes) <= 1:
                continue
            cl_features = [
                feature for feature in features
                if utils.get_gene_id(feature) in group_genes
            ]
            pct_table = utils.get_pct_identity_table(cl_features)
            for cds in cl_features:
                identities = pct_table[utils.get_gene_id(cds)]
                result = ",".join([
                    "%s=%s" % (othercds, identities[othercds])
                    for othercds in identities
                ])
                sec_met = cds.qualifiers['sec_met']
                # Drop stale entries so repeated runs do not pile up
                # duplicates. The list is rebuilt in place because deleting
                # a loop variable would not remove anything from it.
                sec_met[:] = [
                    ann for ann in sec_met
                    if not ann.startswith("Percentage identity")
                ]
                sec_met.append("Percentage identity: %s" % (result))

예제 #5

0

파일 보기

def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    Clusters returned by ``find_cf_clusters`` are merged with the cluster
    features already on the record. An existing cluster that overlaps one has
    its location adjusted and gets a ``probability`` qualifier. A
    ClusterFinder cluster that overlaps no existing cluster is appended as a
    new ``cluster`` feature. Afterwards all clusters are sorted and numbered
    consecutively, starting at ``options.clusternr_offset``.

    Args:
        seq_record: Record whose features are modified in place.
        options: ``borderpredict`` and ``clusternr_offset`` are read here,
            the object is passed on to ``find_cf_clusters``, and
            ``next_clusternr`` is set to the first unused cluster number.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    #Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            # CDS features returned by both get_secmet_cds_features and
            # get_cluster_cds_features for this cluster; collected before the
            # cluster location is changed below.
            cluster_sig_genes = [gene for gene in utils.get_secmet_cds_features(seq_record) if gene in utils.get_cluster_cds_features(cluster, seq_record)]
            if utils.features_overlap(cf_cluster, cluster):
                overlaps = True
                if options.borderpredict: #Predict gene cluster borders using ClusterFinder
                    # Take over the ClusterFinder borders only if the midpoint
                    # of the existing cluster lies inside the cf cluster.
                    # NOTE(review): `/` is integer division on Python 2 ints
                    # but yields a float on Python 3 - confirm the target.
                    if ((cluster.location.end + cluster.location.start) / 2) in cf_cluster.location:
                        cluster.location = cf_cluster.location
                        # Widen the new borders again where needed so every
                        # gene collected above stays inside the cluster.
                        for sig_gene in cluster_sig_genes:
                            startpoint = min([sig_gene.location.start, sig_gene.location.end])
                            endpoint = max([sig_gene.location.start, sig_gene.location.end])
                            if cluster.location.start > startpoint:
                                cluster.location = FeatureLocation(startpoint, cluster.location.end)
                            if cluster.location.end < endpoint:
                                cluster.location = FeatureLocation(cluster.location.start, endpoint)
                # Without border prediction the existing cluster is only ever
                # extended: on both sides, on the left, or on the right.
                elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                    cluster.location = cf_cluster.location
                elif cf_cluster.location.start < cluster.location.start:
                    cluster.location = FeatureLocation(cf_cluster.location.start, cluster.location.end)
                elif cf_cluster.location.end > cluster.location.end:
                    cluster.location = FeatureLocation(cluster.location.start, cf_cluster.location.end)
                # Every overlapping cluster records the cf cluster's probability.
                cluster.qualifiers['probability'] = [ "%01.4f" % cf_cluster.probability ]
        if not overlaps:
            # No existing cluster overlaps: derive the product type from the
            # "Type: " sec_met qualifiers of the CDS features inside it.
            cf_cluster_CDSs = utils.get_cluster_cds_features(cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [feat for feat in CDS.qualifiers['sec_met'] if "Type: " in feat]
                    for qualifier in type_sec_met_qualifiers:
                        # Seeing both kinds yields the combined
                        # "cf_fatty_acid-saccharide" type.
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [ "%01.4f" % cf_cluster.probability ]
            newclusters.append(new_cluster)
    seq_record.features.extend(newclusters)
    #Re-number clusters
    clusters = utils.get_cluster_features(seq_record)
    # NOTE(review): passing a comparison function positionally to list.sort()
    # only works on Python 2; Python 3 needs key=functools.cmp_to_key(...).
    clusters.sort(compare_feature_locations)
    clusternr = options.clusternr_offset
    for cluster in clusters:
        cluster.qualifiers['note'] = ["Cluster number: %s" % clusternr]
        clusternr += 1
    # Expose the first unused number on the options object.
    options.next_clusternr = clusternr

예제 #6

0

파일 보기

def write_gene(txt, info, options):
    """Emit the gene table: a header line, then one row per cluster CDS.

    Columns: gene ID, gene start, gene end, gene strand, smCOG, locus_tag,
    annotation. Fields are tab-separated and each row ends with a newline.
    """
    header = ("gene ID", "gene start", "gene end", "gene strand", "smCOG",
              "locus_tag", "annotation")
    txt.write("\t".join(header) + "\n")
    for BGCnr in info.clusternrs:
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        for cds in utils.get_cluster_cds_features(cluster_feature,
                                                  info.seq_record):
            row = [
                utils.get_gene_acc(cds).partition(".")[0],
                str(cds.location.start),
                str(cds.location.end),
                # Anything other than strand 1 is reported as "-".
                "+" if cds.strand == 1 else "-",
                "",  # smCOG column is not used for now
                utils.get_gene_id(cds).partition(".")[0],
                utils.get_gene_annotation(cds),
            ]
            txt.write("\t".join(row) + "\n")

예제 #7

0

파일 보기

파일: html_output.py 프로젝트: abner24/plantismash

def generate_details_div(cluster,
                         seq_record,
                         options,
                         js_domains,
                         details=None):
    """Build or extend the 'Detailed annotation' div for one cluster.

    Collects the 'NRPS/PKS Domain:' entries of every sec_met CDS in the
    cluster. If any ORF has parsed domains, a ``details-svg`` div is appended
    to ``details`` and the collected data is appended to ``js_domains``.

    Returns ``details`` unchanged when the cluster cannot be found. Otherwise
    returns the given div, or a freshly created one if none was passed.
    """
    cluster_rec = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if cluster_rec is None:
        return details

    if details is None:
        details = pq('<div>')
        details.addClass('details')

        header = pq('<h3>')
        header.text('Detailed annotation')
        details.append(header)

    details_id = "cluster-%s-details" % cluster['idx']
    orfs = []
    for feature in utils.get_cluster_cds_features(cluster_rec, seq_record):
        qualifiers = feature.qualifiers
        if 'sec_met' not in qualifiers:
            continue

        # Prefer the stored translation; otherwise ask utils for the sequence.
        if 'translation' in qualifiers:
            sequence = qualifiers['translation'][0]
        else:
            sequence = str(utils.get_aa_sequence(feature))

        gene_id = utils.get_gene_id(feature)
        domains = []
        for qual in qualifiers['sec_met']:
            if qual.startswith('NRPS/PKS Domain:'):
                js_domain = _parse_domain(qual, feature, seq_record)
                if len(js_domain) > 0:
                    domains.append(js_domain)

        # ORFs without any parsed domain are left out.
        if len(domains) > 0:
            orfs.append({
                'id': gene_id,
                'sequence': sequence,
                'domains': domains,
            })

    if len(orfs) > 0:
        details_svg = pq('<div>')
        details_svg.addClass('details-svg')
        details_svg.attr('id', '%s-svg' % details_id)
        details.append(details_svg)

        js_domains.append({'id': details_id, 'orfs': orfs})

    return details

예제 #8

0

파일 보기

def write_RiPP(txt, info, options):
    """Write RiPP table to TXT.

    Columns: RiPP ID, annotation, core peptide, molecular weight,
    monoisotopic_mass, alternative molecular weights, number of bridges.
    One row is written per core peptide feature returned by
    ``_find_core_peptides`` for each cluster number in ``info.clusternrs``.

    Args:
        txt: Writable text handle that receives the table.
        info: Object providing ``clusternrs`` and ``seq_record``.
        options: Unused here; kept for the common table-writer signature.

    Raises:
        IndexError: If a peptide's ``note`` qualifiers lack one of the
            mandatory entries (class, core seq, molecular weight,
            monoisotopic mass, number of bridges).
    """
    txt.write("\t".join([
        "RiPP ID", "annotation", "core peptide", "molecular weight",
        "monoisotopic_mass", "alternative molecular weights",
        "number of bridges"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        for RiPP in _find_core_peptides(cluster_feature, info.seq_record):
            # Resolve the ID per peptide so a peptide without an overlapping
            # CDS cannot shift the IDs of the peptides after it; such a
            # peptide gets an empty ID.
            RiPP_ID = ""
            for cds in cluster_gene_features:
                if utils.features_overlap(cds, RiPP):
                    RiPP_ID = utils.get_gene_acc(cds).partition(".")[0]
                    break
            note_quals = RiPP.qualifiers['note']
            annotation = [
                qual.partition("predicted class: ")[2] for qual in note_quals
                if "predicted class:" in qual
            ][0]
            core_peptide = [
                qual.partition("predicted core seq: ")[2]
                for qual in note_quals if "predicted core seq:" in qual
            ][0]
            mol_weight = [
                qual.partition("molecular weight: ")[2] for qual in note_quals
                if "molecular weight: " in qual
            ][0]
            monoiso_mass = [
                qual.partition("monoisotopic mass: ")[2] for qual in note_quals
                if "monoisotopic mass: " in qual
            ][0]
            # Search inside each note: membership on the list itself would
            # only match a note that equals the marker exactly.
            alt_weight_quals = [
                qual for qual in note_quals if "alternative weights:" in qual
            ]
            if alt_weight_quals:
                alt_mol_weights = alt_weight_quals[0].partition(
                    "alternative weights: ")[2].replace(" ", "")
            else:
                alt_mol_weights = ""
            nr_bridges = [
                qual.partition("number of bridges: ")[2] for qual in note_quals
                if "number of bridges: " in qual
            ][0]
            txt.write("\t".join([
                RiPP_ID, annotation, core_peptide, mol_weight, monoiso_mass,
                alt_mol_weights, nr_bridges
            ]) + "\n")

예제 #9

0

파일 보기

파일: js.py 프로젝트: chevrm/transPACT

def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON.

    Builds one dict per cluster feature of ``record``. Each holds the start
    (location start plus one), end, cluster number, converted ORFs, border
    features, TTA codon features and type, plus the probability and best
    knownclusterblast hit when the cluster has them.

    Args:
        record: Sequence record whose cluster features are converted.
        annotations: Passed through to ``convert_cds_features``.
        options: Run options; ``input_type`` is read here and the object is
            passed through to ``convert_cds_features``.

    Returns:
        list: One dict per cluster, in the order returned by
        ``utils.get_cluster_features``.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        # misc_features overlapping the cluster that carry a TTA codon note
        tta_codons = []
        all_misc_features = utils.get_all_features_of_type(
            record, 'misc_feature')
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue

            for note in feature.qualifiers['note']:
                if note.startswith('tta leucine codon'):
                    tta_codons.append(feature)
                    break

        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        # Placeholders, kept unless a best knownclusterblast hit is parsed
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # Raw string: the pattern relies on regex escapes that are
                # not valid Python string escapes.
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                if best_hit is None:
                    # Keep the "-" placeholders instead of failing on a hit
                    # that does not have the expected "<rank>. <id>\t<name>"
                    # layout.
                    logging.warning(
                        "Could not parse best knowncluster hit %r",
                        bestcluster[0])
                else:
                    js_cluster['knowncluster'] = best_hit.group(2)
                    js_cluster['BGCid'] = best_hit.group(1)
                    logging.debug(
                        'Found closest cluster "%s" for cluster no. %s' %
                        (js_cluster['knowncluster'],
                         utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters

예제 #10

0

파일 보기

def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene display data for one gene cluster.

    Args:
        seq_record: Record containing the cluster.
        smcogdict: Mapping keyed by gene ID; the first element of each
            non-empty value is compared against the two lists below.
        gtrcoglist: Values that mark a gene as a "gtr" gene.
        transportercoglist: Values that mark a gene as a transporter.
        geneclusternr: Number of the cluster to look up in ``seq_record``.

    Returns:
        tuple: ``(clustergenes, clustertype, annotations, colors, starts,
        ends, strands, pksnrpsprots, gtrs, transporters, clustersize)``.
        The lists follow the order of the cluster's CDS features.
        ``annotations`` maps gene ID to product text and ``clustersize`` is
        ``max(ends) - min(starts)``.

    Raises:
        ValueError: If the cluster has no CDS features, because ``max`` and
            ``min`` are then called on empty lists.
    """
    # Sets make the per-gene membership tests below constant time.
    allcoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record))
    pksnrpscoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record))
    feature_by_id = utils.get_feature_dict(seq_record)
    # Look the cluster up once and reuse it for genes and type.
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(cluster, seq_record)
    ]
    clustertype = utils.get_cluster_type(cluster)
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for j in clustergenes:
        cdsfeature = feature_by_id[j]
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if 'product' in cdsfeature.qualifiers:
            annotations[j] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[j] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        # Anything other than strand -1 is reported as "+".
        if cdsfeature.strand == -1:
            strands.append("-")
        else:
            strands.append("+")
        if j in allcoregenes:
            colors.append("#810E15")
        else:
            colors.append("grey")
        if j in pksnrpscoregenes:
            pksnrpsprots.append(j)
        if j in smcogdict:
            smcogs = smcogdict[j]
            if len(smcogs) > 0 and smcogs[0] in gtrcoglist:
                gtrs.append(j)
            if len(smcogs) > 0 and smcogs[0] in transportercoglist:
                transporters.append(j)
    clustersize = max(ends) - min(starts)
    return clustergenes, clustertype, annotations, colors, starts, ends, strands, pksnrpsprots, gtrs, transporters, clustersize

예제 #11

0

파일 보기

파일: clusterblast.py 프로젝트: abner24/plantismash

def create_blast_inputs(genecluster, seq_record):
    """Collect names, sequences and accessions of a cluster's CDS features.

    For the "plants" taxon the CDS features are first passed through
    ``filter_overlap``.

    Returns:
        tuple: ``(names, sequences, accessions)`` as three parallel lists.
        Each name is the "|"-joined string
        ``input|c<cluster nr>|<start>-<end>|<strand>|<accession>|<annotation>``.
    """
    options = config.get_config()
    # Create input fasta files for BLAST search
    is_plant = options.taxon == "plants"
    queryclusterprots = utils.get_cluster_cds_features(genecluster, seq_record)
    if is_plant:
        queryclusterprots = filter_overlap(queryclusterprots)

    names, sequences, accessions = [], [], []
    for cds in queryclusterprots:
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        # Strip the ">" / "<" characters from the printed positions.
        start = str(cds.location.start).replace(">", "").replace("<", "")
        end = str(cds.location.end).replace(">", "").replace("<", "")
        fields = [
            "input",
            cluster_tag,
            start + "-" + end,
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ]
        fullname = "|".join(fields)
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(fullname)
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions

예제 #12

0

파일 보기

def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON.

    Builds one dict per cluster feature of ``record``. Each holds the start
    (location start plus one), end, cluster number, converted ORFs, type and
    domains. Optional entries are the geo table, probability, CD-HIT
    clusters and best knownclusterblast hit.

    Args:
        record: Sequence record whose cluster features are converted.
        annotations: Passed through to ``convert_cds_features``.
        options: Run options; ``coexpress``, ``input_type`` and
            ``enable_cdhit`` are read here and the object is passed through
            to ``convert_cds_features``.

    Returns:
        list: One dict per cluster, in the order returned by
        ``utils.get_cluster_features``.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)

        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        # Placeholders, kept unless a best knownclusterblast hit is parsed
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)

        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # Raw string: the pattern relies on regex escapes that are
                # not valid Python string escapes.
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                if best_hit is None:
                    # Keep the "-" placeholders instead of failing on a hit
                    # that does not have the expected "<rank>. <id>\t<name>"
                    # layout.
                    logging.warning(
                        "Could not parse best knowncluster hit %r",
                        bestcluster[0])
                else:
                    js_cluster['knowncluster'] = best_hit.group(2)
                    js_cluster['BGCid'] = best_hit.group(1)
                    logging.debug(
                        'Found closest cluster "%s" for cluster no. %s' %
                        (js_cluster['knowncluster'],
                         utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters

예제 #13

0

파일 보기

파일: clusterblast.py 프로젝트: chevrm/transPACT

def create_blast_inputs(genecluster, seq_record):
    """Collect names, sequences and accessions of a cluster's CDS features.

    Returns:
        tuple: ``(names, sequences, accessions)`` as three parallel lists.
        Each name is the "|"-joined string
        ``input|c<cluster nr>|<start>-<end>|<strand>|<accession>|<annotation>``.
    """
    # Create input fasta files for BLAST search
    names, sequences, accessions = [], [], []
    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = (str(cds.location.nofuzzy_start) + "-" +
                str(cds.location.nofuzzy_end))
        fields = [
            "input",
            cluster_tag,
            span,
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ]
        fullname = "|".join(fields)
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(fullname)
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions

예제 #14

0

파일 보기

파일: __init__.py 프로젝트: Ahsanzia/galaxytools

def write(seq_records, options):
    """Write the proteins of all gene clusters to a single FASTA file.

    The file name is derived from the id of ``seq_records[0]`` and the file
    is created inside ``options.outputfoldername``.

    Args:
        seq_records (sequence): Bio.SeqRecords to export
        options (argparse.Namespace): The options passed to the program
    """
    basename = seq_records[0].id
    output_name = path.join(
        options.outputfoldername,
        "%s_genecluster_proteins.fa" % basename,
    )
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                for cds in utils.get_cluster_cds_features(cluster, seq_record):
                    qual = cds.qualifiers
                    locus_tag = qual['locus_tag'][0]
                    protein_id = qual['protein_id'][0]
                    product = qual['product'][0]
                    handle.write('>%s:%s %s #%s - %s\n' % (
                        locus_tag, protein_id, clustertype, clusternr,
                        product))
                    # Sequence body, wrapped at 60 characters per line
                    body_lines = textwrap.wrap(qual['translation'][0], 60)
                    handle.write('%s\n' % '\n'.join(body_lines))

예제 #15

0

파일 보기

def write_signature_gene_info(txt, info, options):
    """Emit the signature gene table: a header line, then one row per domain.

    Columns: signature gene, pHMM hits, e-value, bit score, number of seeds.
    Only CDS features whose ``sec_met`` qualifier holds a
    'Domains detected: ' entry produce rows; the first such entry is split
    on ";" into its individual domains.
    """
    marker = 'Domains detected: '
    header = ("signature gene", "pHMM hits", "e-value", "bit score",
              "number of seeds")
    txt.write("\t".join(header) + "\n")
    for BGCnr in info.clusternrs:
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        for cds in cluster_gene_features:
            if 'sec_met' not in cds.qualifiers:
                continue
            detected = [
                qual for qual in cds.qualifiers['sec_met']
                if qual.startswith(marker)
            ]
            if len(detected) == 0:
                continue
            gene_ID = utils.get_gene_acc(cds).partition(".")[0]
            # split() yields a one-element list when no ";" is present, so
            # single-domain entries need no special case.
            for domain in detected[0].partition(marker)[2].split(";"):
                row = [
                    gene_ID,
                    domain.partition(" (")[0].replace(" ", ""),
                    domain.partition("E-value: ")[2].partition(",")[0],
                    domain.partition("bitscore: ")[2].partition(",")[0],
                    domain.partition("seeds: ")[2].partition(")")[0],
                ]
                txt.write("\t".join(row) + "\n")

예제 #16

0

파일 보기

파일: test_hmm_detection.py 프로젝트: chevrm/transPACT

 def test_find_clusters(self):
     """find_clusters() groups the annotated genes into the expected clusters."""
     nseqdict = {"Metabolite0": "?", "Metabolite1": "?"}
     self.config.next_clusternr = 1
     # Every gene except GENE_X gets a sec_met entry, with the cluster type
     # alternating between Metabolite0 and Metabolite1.
     annotated = 0
     for gene_id in self.feature_by_id:
         if gene_id == "GENE_X":
             continue
         clustertype = "Metabolite%d" % (annotated % 2)
         hmm_detection._update_sec_met_entry(
             self.feature_by_id[gene_id], self.results_by_id[gene_id],
             clustertype, nseqdict)
         annotated += 1
     hmm_detection.find_clusters(self.record, self.rulesdict)

     result_clusters = []
     for feature in utils.get_cluster_features(self.record):
         gene_ids = [
             utils.get_gene_id(f)
             for f in utils.get_cluster_cds_features(feature, self.record)
         ]
         result_clusters.append(sorted(gene_ids))

     expected_clusters = [["GENE_1", "GENE_2"], ["GENE_3"],
                          ["GENE_4", "GENE_5"]]
     failure_msg = "\nResult : %s\nExpected : %s" % (result_clusters,
                                                      expected_clusters)
     self.assertEqual(result_clusters, expected_clusters, msg=failure_msg)

예제 #17

0

파일 보기

def run_coexpress(seq_record, all_gene_expressions, geo):
    """Compute co-expression correlation/distance data for each cluster.

    For every cluster feature of ``seq_record`` the CDS genes that have an
    entry in ``all_gene_expressions[seq_record.id]`` are collected. Pairwise
    correlation ("cor") and distance ("dist") values are stored on those
    entries, strongly correlated genes from outside the cluster ("remote"
    genes) are added to the distance tables and then filtered, and finally
    ``update_features`` is called with the result.

    The per-gene entries are the very objects stored in
    ``all_gene_expressions``; the "cor" and "dist" keys written here are
    therefore visible to the caller and to later invocations.

    Args:
        seq_record: record whose cluster features are analysed.
        all_gene_expressions: mapping of record id -> gene id -> expression
            entry. Only entries containing an "exp" key take part in the
            correlation calculation.
        geo: GEO record description; ``geo["info"]["id"]`` is used for
            logging and ``geo`` is passed on to ``update_features``.
    """
    options = get_config()
    cl_list = utils.get_cluster_features(seq_record)

    gene_expressions = all_gene_expressions[seq_record.id]

    logging.info('Running CoExpress analysis on the clusters..')
    for cl_count, cluster in enumerate(cl_list, 1):
        logging.debug(
            'Running CoExpress analysis on record "%s".. (Cluster %s of %s)',
            geo["info"]["id"], cl_count, len(cl_list))
        features = utils.get_cluster_cds_features(cluster, seq_record)
        cluster_genes = {}

        for feature in features:
            gene_id = utils.get_gene_id(feature)
            if gene_id in gene_expressions:
                cluster_genes[gene_id] = gene_expressions[gene_id]

        #calculate correlation value between genes
        for gene_1 in cluster_genes:
            if "cor" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["cor"] = {}
            if "exp" not in cluster_genes[gene_1]:
                continue
            for gene_2 in cluster_genes:
                if "cor" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["cor"] = {}
                if gene_2 == gene_1:
                    continue
                if "exp" not in cluster_genes[gene_2]:
                    continue
                # the pair was already handled from gene_2's side
                if gene_1 in cluster_genes[gene_2]["cor"]:
                    continue
                cor_val = calc_correlation_value(cluster_genes[gene_1],
                                                 cluster_genes[gene_2])
                cluster_genes[gene_1]["cor"][gene_2] = cor_val
                cluster_genes[gene_2]["cor"][gene_1] = cor_val

        #calculate distance value for building dendogram
        for gene_1 in cluster_genes:
            if "dist" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["dist"] = {}
            for gene_2 in cluster_genes:
                if "dist" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["dist"] = {}
                # pairs without a correlation value get the maximum distance
                dist = 100.0
                if "cor" in cluster_genes[gene_1] and gene_2 in cluster_genes[
                        gene_1]["cor"]:
                    cor_val = min(1.00, cluster_genes[gene_1]["cor"][gene_2])
                    dist = 100.0 * (1.0 - cor_val)
                cluster_genes[gene_1]["dist"][gene_2] = dist
                cluster_genes[gene_2]["dist"][gene_1] = dist

        # check for remote genes, add if correlation value >= 0.9
        for gene_1 in cluster_genes:
            for seqid in all_gene_expressions:
                prefix = "%s:" % seqid.replace(":", "_")
                for gene_2 in all_gene_expressions[seqid]:
                    # only add biosynthetic remote genes
                    if (prefix + gene_2) not in options.hmm_results:
                        continue
                    # genes of this cluster (gene_1 included) are not remote
                    if gene_2 in cluster_genes:
                        continue
                    cor_val = min(
                        1.00,
                        calc_correlation_value(
                            cluster_genes[gene_1],
                            all_gene_expressions[seqid][gene_2]))
                    if 1.00 > cor_val >= 0.9:
                        cluster_genes[gene_1]["dist"][gene_2] = 100.0 * (
                            1.0 - cor_val)

        # review the remote genes, discard genes with less than 2 edges
        edges_count = {}
        for gene_1 in cluster_genes:
            for gene_2 in cluster_genes[gene_1]["dist"]:
                if gene_2 not in cluster_genes:
                    edges_count[gene_2] = edges_count.get(gene_2, 0) + 1
        for gene_1 in cluster_genes:
            new_dists = {}
            for gene_2 in cluster_genes[gene_1]["dist"]:
                if (gene_2 in cluster_genes) or (edges_count[gene_2] >= 2):
                    new_dists[gene_2] = cluster_genes[gene_1]["dist"][gene_2]
            cluster_genes[gene_1]["dist"] = new_dists

        # review the remote genes, discard genes without any connection to cluster's biosynthetic genes
        have_connections = set()
        prefix = "%s:" % seq_record.id.replace(":", "_")
        for gene_1 in cluster_genes:
            if (prefix + gene_1) in options.hmm_results:
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if gene_2 not in cluster_genes:
                        have_connections.add(gene_2)
        for gene_1 in cluster_genes:
            new_dists = {}
            for gene_2 in cluster_genes[gene_1]["dist"]:
                if (gene_2 in cluster_genes) or (gene_2 in have_connections):
                    new_dists[gene_2] = cluster_genes[gene_1]["dist"][gene_2]
            cluster_genes[gene_1]["dist"] = new_dists

        #update seq_record
        update_features(features, cluster_genes, geo)

    # NOTE: the block below never runs. It is kept unchanged for a later
    # version; it references `overlaps`, which is not assigned anywhere in
    # this function, and still uses the Python 2 `xrange`.
    if False:  #This feature is temporarily disabled, saved for next version #options.coexpress_signal_cluster_size < len(overlaps):
        logging.info('Running expression signal analysis on seq_record..')
        signals = []
        n = options.coexpress_signal_cluster_size - 1
        #build list of cluster locations (for annotating signal regions)
        clrefs = []
        for cluster in cl_list:
            clrefs.append(((cluster.location.start, cluster.location.end),
                           utils.get_cluster_number(cluster)))
        clrefs = sorted(clrefs, key=lambda cl: cl[0][0])
        #build signals
        for i in xrange(0, len(overlaps) - n):
            genes = []
            for overlap in overlaps[i:i + n]:
                gene = overlap[0]
                for feature in overlap:
                    if utils.get_gene_id(feature) in gene_expressions:
                        gene = feature
                        break
                genes.append(gene)
            cors = []
            checked = []
            hits = []
            for x in xrange(0, len(genes)):
                gene_x = utils.get_gene_id(genes[x])
                if prefix + gene_x in options.hmm_results:
                    hits.append(options.hmm_results[prefix +
                                                    gene_x][0].query_id)
                for y in xrange(0, len(genes)):
                    if ((x, y) in checked) or ((y, x) in checked):
                        continue
                    cor_val = 0
                    gene_y = utils.get_gene_id(genes[y])
                    if (gene_x in gene_expressions) and (gene_y
                                                         in gene_expressions):
                        cor_val = calc_correlation_value(
                            gene_expressions[gene_x], gene_expressions[gene_y])
                    cors.append(cor_val)
                    checked.append((x, y))
            sloc = (genes[0].location.start + genes[-1].location.end) / 2
            cor_val = 0
            if len(cors) > 0 and len(list(set(hits))) > 1:
                cor_val = np.median(cors)
            cl_idx = -1
            for clref in clrefs:
                if sloc < clref[0][0]:
                    continue
                if sloc <= clref[0][1]:
                    cl_idx = clref[1]
                    break
            signals.append((sloc, cor_val, cl_idx))
        if "coexpress_signal" not in options:
            options.coexpress_signal = {}
        if geo["info"]["id"] not in options.coexpress_signal:
            options.coexpress_signal[geo["info"]["id"]] = {}
        options.coexpress_signal[geo["info"]["id"]][seq_record.id] = signals

예제 #18

0

파일 보기

def write(seq_records, options):
    """Write the gene cluster overview as a TXT table and an XLS workbook.

    Nothing is written for protein input (``options.input_type == 'prot'``).
    Otherwise ``geneclusters.txt`` is created in
    ``options.full_outputfolder_path`` with one tab-separated line per
    cluster feature, and the same data is saved to
    ``<id of the last record>.geneclusters.xls``. When ``seq_records`` is
    empty no workbook is saved, because there is no record id to name it
    after.

    Args:
        seq_records: iterable of records to export.
        options: configuration object; ``input_type``,
            ``full_outputfolder_path`` and ``knownclusterblast`` are read.
    """
    if options.input_type == 'prot':
        return
    outfolder = options.full_outputfolder_path

    #Set up the XLS sheet with a bold header row
    wb = Workbook()
    font1 = Font()
    font1.bold = True
    style1 = XFStyle()
    style1.font = font1
    ws0 = wb.add_sheet('0')
    headers = [
        "Input accession number", "Input name", "Gene cluster type",
        "Gene cluster genes", "Gene cluster gene accessions"
    ]
    if options.knownclusterblast:
        headers.append("Compound with gene cluster of highest homology")
    for col, header in enumerate(headers):
        ws0.write(0, col, header, style1)

    def write_cell(row, col, value, fallback):
        "Write value to the sheet; use the fallback text if the cell rejects it"
        try:
            ws0.write(row, col, value)
        except Exception:
            ws0.write(row, col, fallback)

    #For each gene cluster, write out info
    row = 1
    last_record_id = None
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        for seq_record in seq_records:
            last_record_id = seq_record.id
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(
                    cluster, seq_record)
                genes_joined = ";".join(
                    [utils.get_gene_id(cds) for cds in cluster_cds])
                accessions_joined = ";".join(
                    [utils.get_gene_acc(cds) for cds in cluster_cds])

                ws0.write(row, 0, seq_record.id)
                write_cell(
                    row, 1, seq_record.description,
                    "Name to long to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                ws0.write(row, 2, clustertype)
                write_cell(
                    row, 3, genes_joined,
                    "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                write_cell(
                    row, 4, accessions_joined,
                    "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                if hasattr(seq_record, 'closestcompounddict') and \
                   clusternr in seq_record.closestcompounddict:
                    ws0.write(row, 5,
                              seq_record.closestcompounddict[clusternr])
                row += 1
                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    genes_joined, accessions_joined
                ]) + "\n")

    #The workbook is named after the last record; without records there is
    #no name to use, so skip saving instead of failing on an unbound name
    if last_record_id is None:
        return
    wb.save(path.join(outfolder, "%s.geneclusters.xls" % last_record_id))

예제 #19

0

파일 보기

def get_inter_cluster_relation(seq_records, geo_id):
    """Predict co-expression relations between pairs of gene clusters.

    A graph of all cluster genes that carry geo data for ``geo_id`` is built
    (edges are added through ``update_g``), nodes without edges are dropped
    and the graph is partitioned with ``community.best_partition``. Each
    pair of clusters is then inspected community by community and connected
    component by connected component; a component is reported when, after
    pruning nodes with a clustering coefficient of 0, it contains at least
    2 genes and 1 'sec_met' gene of each cluster and at least 3 'sec_met'
    genes overall.

    Args:
        seq_records: records whose cluster features are inspected. Clusters
            are numbered consecutively from 1 across all records.
        geo_id: only geo entries whose 'rec_id' equals this value are used.

    Returns:
        A list with one dict per reported component::

            {'source': {'id': i, 'links': edges inside cluster i},
             'target': {'id': j, 'links': edges inside cluster j},
             'links': edges between cluster i and cluster j}
    """
    logging.debug('Calculating inter cluster relations on geo_record "%s"..' %
                  (geo_id))
    data = []
    full_g = nx.Graph()
    cluster_genes = {}
    bio_genes = set()
    cur_cluster1 = 0
    # First, inspect all cluster to get cluster_genes
    for record in seq_records:
        for cluster in utils.get_cluster_features(record):
            cur_cluster1 += 1
            cluster_genes[cur_cluster1] = set()

            for cluster_gene in utils.get_cluster_cds_features(
                    cluster, record):
                # We only care about cluster_genes that have a geo match
                for cluster_gene_geo in utils.parse_geo_feature(cluster_gene):
                    # We only care about data from the current geo_id
                    if cluster_gene_geo['rec_id'] != geo_id:
                        continue
                    cur_gene1 = utils.get_gene_id(cluster_gene)
                    cur_gene1_distances = cluster_gene_geo['dist']
                    cur_gene1_neighbors = set(cur_gene1_distances)

                    # Add each gene to cluster_genes, and to the full_g(raph) and to bio_genes
                    cluster_genes[cur_cluster1].add(cur_gene1)
                    full_g.add_node(cur_gene1)
                    if 'sec_met' in cluster_gene.qualifiers:
                        bio_genes.add(cur_gene1)

                    # Get intra-cluster edges
                    interactions = cur_gene1_neighbors.intersection(
                        cluster_genes[cur_cluster1])
                    update_g(cur_gene1, interactions, cur_gene1_distances,
                             full_g)

                    # Add inter-cluster edges backwards to every cluster seen
                    # so far, i.e.: 2-1, 3-1, 3-2, 4-1, 4-2, etc...
                    # (compare by value; `is not` on ints is an identity test)
                    for cur_cluster2 in cluster_genes:
                        if cur_cluster2 != cur_cluster1:
                            interactions = cur_gene1_neighbors.intersection(
                                cluster_genes[cur_cluster2])
                            update_g(cur_gene1, interactions,
                                     cur_gene1_distances, full_g)

    # Remove single nodes. Iterate over a copy: removing nodes while
    # iterating the live node view fails on networkx >= 2.
    for node in list(full_g.nodes()):
        if full_g.degree(node) == 0:
            full_g.remove_node(node)

    # Get communities
    community_dict = community.best_partition(full_g)

    number_of_clusters = len(cluster_genes)

    # Now check inter-cluster interactions
    for i in range(1, number_of_clusters + 1):
        cluster1 = cluster_genes[i]

        for j in range(i + 1, number_of_clusters + 1):
            cluster2 = cluster_genes[j]
            cluster3 = cluster1.union(cluster2)

            cluster_pair_g = full_g.subgraph(cluster3)

            communities_present = np.unique(
                [community_dict[n] for n in cluster3 if n in community_dict])

            # CRITERIA 1 = only intra-community edges
            for cur_community in communities_present:
                cur_community_nodes = [
                    n for n in cluster3 if n in community_dict
                    and community_dict[n] == cur_community
                ]
                cur_community_g = cluster_pair_g.subgraph(cur_community_nodes)

                # nx.connected_component_subgraphs() was removed in
                # networkx 2.4; this form works on old and new versions.
                decomposed_g = [
                    cur_community_g.subgraph(component) for component in
                    nx.connected_components(cur_community_g)
                ]
                for cur_g in decomposed_g:
                    # CRITERIA 2 = no isolates. anything with a clustering_coefficient=0 will be pruned out.
                    clustering_coefficient = nx.clustering(cur_g)

                    pred_nodes = [
                        n for n in clustering_coefficient
                        if clustering_coefficient[n] > 0
                    ]
                    pred_g = cur_g.subgraph(pred_nodes)
                    pred_edges = list(pred_g.edges())

                    prediction = set(pred_g.nodes())
                    prediction_cluster1 = prediction.intersection(cluster1)
                    prediction_cluster2 = prediction.intersection(cluster2)

                    bio_prediction = prediction.intersection(bio_genes)
                    bio_prediction_cluster1 = prediction_cluster1.intersection(
                        bio_genes)
                    bio_prediction_cluster2 = prediction_cluster2.intersection(
                        bio_genes)

                    #CRITERIA 3 = at least 2 genes per cluster
                    #CRITERIA 5 = at least 1 bio per cluster
                    #CRITERIA 4 = at least 3 bio
                    if not (len(prediction_cluster1) >= 2
                            and len(prediction_cluster2) >= 2
                            and len(bio_prediction_cluster1) >= 1
                            and len(bio_prediction_cluster2) >= 1
                            and len(bio_prediction) >= 3):
                        continue

                    pred_edges1 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster1
                    ]
                    pred_edges2 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster2
                    ]

                    # edges are undirected, so collect both orientations
                    pred_edges12 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster2
                    ]
                    pred_edges21 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster1
                    ]

                    data.append({
                        'source': {'id': i, 'links': pred_edges1},
                        'target': {'id': j, 'links': pred_edges2},
                        'links': pred_edges12 + pred_edges21,
                    })
    return data

예제 #20

0

파일 보기

def write_BGC(txt, info, options):
    """Write BGC table to TXT

    A header line is written first, followed by one tab-separated line per
    cluster number in ``info.clusternrs``.

    Args:
        txt: writable text file object.
        info: storage object; ``clusternrs``, ``seq_record``,
            ``clustertypes`` and ``accessions`` are read.
        options: not used by this function.
    """
    #TXT columns: BGC ID, BGC_type, detection_rules_used, BGC_range, genes, subclusters,
    # NRPSs_PKSs, signature_genes, RiPPs, pred_structure, monomers
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])
        if 'subclusterblast' in cluster_feature.qualifiers:
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        #TODO The subclusterblast module should probably be changed for the precalcs to provide a list here of the 100% hits instead of all hits
        #Signature genes are all CDSs carrying a sec_met qualifier; the
        #NRPSs/PKSs are the subset with at least one NRPS/PKS domain entry
        secmet_genes = [
            cds for cds in cluster_gene_features if 'sec_met' in cds.qualifiers
        ]
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0] for cds in secmet_genes
            if any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0] for cds in secmet_genes
        ])
        #Look the core peptides up once and reuse the result
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if core_peptides:
            ripp_list = []
            for peptide in core_peptides:
                #Report the first CDS overlapping each core peptide
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"
        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)
        #Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")

예제 #21

0

파일 보기

def _first_qualifier_value(qualifiers, prefix, separator):
    """Return the text after separator in the first qualifier starting with prefix.

    An empty string is returned when no qualifier starts with ``prefix`` or
    when the matching qualifier does not contain ``separator``.
    """
    for qual in qualifiers:
        if qual.startswith(prefix):
            return qual.partition(separator)[2]
    return ""


def write_NRPS_PKS(txt, info, options):
    """Write NRPS/PKS table to TXT

    A header line is written first, followed by one tab-separated line per
    aSDomain feature of every NRPS/PKS gene (a CDS with a
    'NRPS/PKS Domain:' sec_met qualifier) of each cluster in
    ``info.clusternrs``.

    Args:
        txt: writable text file object.
        info: storage object; ``clusternrs`` and ``seq_record`` are read.
        options: not used by this function.
    """
    #TXT columns: NRPS/PKS ID, annotation, aSDomain, score, evalue, domain type, subtype, range, activity, NRPSPredictor2, Stachelhaus, Minowa, pkssignature, consensus
    txt.write("\t".join([
        "Cluster_ID", "NRPSPKS_ID", "annotation", "aSDomain", "score",
        "evalue", "domain_type", "subtype", "domain_start", "domain_end",
        "KR activity", "KR stereochemistry", "NRPSPredictor2", "Stachelhaus",
        "Minowa", "pkssignature", "consensus"
    ]) + "\n")
    #(qualifier prefix, text preceding the value) for the specificity
    #columns, in output order
    specificity_fields = (
        ("KR activity", "KR activity: "),
        ("KR stereochemistry", "KR stereochemistry: "),
        ("NRPSpredictor2", "NRPSpredictor2 SVM: "),
        ("Stachelhaus", "Stachelhaus code: "),
        ("Minowa", "Minowa: "),
        ("PKS signature", "PKS signature: "),
        ("consensus", "consensus: "),
    )
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        cluster_id = "{seq_id}_c{cluster_nr}".format(seq_id=info.seq_record.id,
                                                     cluster_nr=BGCnr)
        NRPSs_PKSs = [
            cds for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ]
        if not NRPSs_PKSs:
            continue
        #The domain features only depend on the cluster, so fetch them once
        cluster_domains = list(
            utils.get_cluster_aSDomain_features(cluster_feature,
                                                info.seq_record))
        for cds in NRPSs_PKSs:
            enzyme_ID = utils.get_gene_acc(cds).partition(".")[0]
            #Test and selection use the same prefix, so a match always
            #yields a value
            subtype_quals = [
                qual for qual in cds.qualifiers['sec_met']
                if qual.startswith("NRPS/PKS subtype: ")
            ]
            if subtype_quals:
                enzyme_annotation = subtype_quals[0].partition(
                    "NRPS/PKS subtype: ")[2]
            else:
                logging.warning("No enzyme annotation for %s", enzyme_ID)
                enzyme_annotation = ""
            gene_id = utils.get_gene_id(cds)
            #The gene id is compared with both the locus_tag qualifier
            #itself and its first element
            aSDomains = [
                dom for dom in cluster_domains
                if utils.features_overlap(cds, dom) and gene_id in
                [dom.qualifiers['locus_tag'], dom.qualifiers['locus_tag'][0]]
            ]
            for aSDomain in aSDomains:
                domtype = aSDomain.qualifiers['domain'][0]
                if "domain_subtype" in aSDomain.qualifiers:
                    subtype = aSDomain.qualifiers['domain_subtype'][0]
                else:
                    subtype = ""
                aSDomain_ID = aSDomain.qualifiers['asDomain_id'][0]
                score = str(aSDomain.qualifiers['score'][0])
                evalue = str(aSDomain.qualifiers['evalue'][0])
                dom_start = str(aSDomain.location.start)
                dom_end = str(aSDomain.location.end)
                if 'specificity' in aSDomain.qualifiers:
                    specificity = aSDomain.qualifiers['specificity']
                else:
                    specificity = []
                #Missing predictions are written as empty columns
                predictions = [
                    _first_qualifier_value(specificity, prefix, separator)
                    for prefix, separator in specificity_fields
                ]

                txt.write("\t".join([
                    cluster_id, enzyme_ID, enzyme_annotation, aSDomain_ID,
                    score, evalue, domtype, subtype, dom_start, dom_end
                ] + predictions) + "\n")

예제 #22

0

파일 보기

파일: html_output.py 프로젝트: abner24/plantismash

def generate_sidepanel(cluster, seq_record, options, sidepanel=None):
    """Generate sidepanel div

    The panel gets a structure image block, a "Prediction details" list with
    the monomer prediction and the per-gene substrate predictions (including
    NORINE search links), and, for cluster types containing 'nrps', a block
    of NORINE database cross-links.

    If no cluster record exists for ``cluster['idx']`` the given
    ``sidepanel`` is returned untouched (which may be None). When
    ``options`` has no 'docking' entry, ``options.docking`` is set to an
    empty dict.
    """
    cluster_rec = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if cluster_rec is None:
        return sidepanel

    def new_tab_link(url, label):
        "Build an <a> element that opens url in a new window"
        link = pq('<a>')
        link.attr('href', url)
        link.attr('target', '_new')
        link.text(label)
        return link

    if sidepanel is None:
        sidepanel = pq('<div>')
        sidepanel.addClass('sidepanel')

    # Predicted structure image, linked to itself, plus a disclaimer
    structure_box = pq('<div>')
    structure_box.addClass('structure')
    structure_title = pq('<h3>')
    structure_title.text('Predicted core structure')
    structure_box.append(structure_title)
    image_link = pq('<a>')
    image_link.attr(
        'href',
        _get_structure_image_url(cluster_rec, options.outputfoldername))
    image_link.attr('target', '_new')
    structure_box.append(image_link)
    image = pq('<img>')
    image.attr(
        'src', _get_structure_image_url(cluster_rec, options.outputfoldername))
    image_link.append(image)
    disclaimer = pq('<div>')
    disclaimer.addClass('as-structure-warning')
    if 'docking' not in options:
        options.docking = {}
    if cluster['idx'] in options.docking and options.docking[cluster['idx']]:
        disclaimer.text('Rough prediction of core scaffold based on assumed '
                        'PKS linker matching; tailoring reactions not taken '
                        'into account')
    else:
        disclaimer.text('Rough prediction of core scaffold based on assumed '
                        'PKS/NRPS colinearity; tailoring reactions not taken '
                        'into account')
    structure_box.append(disclaimer)
    sidepanel.append(structure_box)

    # Prediction details list, starting with the monomer prediction
    details_box = pq('<div>')
    details_box.addClass('more-details')
    details_title = pq('<h3>')
    details_title.text('Prediction details')
    details_box.append(details_title)
    details_list = pq('<dl>')
    details_list.addClass('prediction-text')

    details_box.append(details_list)
    sidepanel.append(details_box)
    term = pq('<dt>')
    term.text('Monomers prediction:')
    details_list.append(term)
    entry = pq('<dd>')
    entry.text(_get_monomer_prediction(cluster_rec))
    details_list.append(entry)

    # Substrate predictions of every sec_met CDS in the cluster
    for feature in utils.get_cluster_cds_features(cluster_rec, seq_record):
        if 'sec_met' not in feature.qualifiers:
            continue

        gene_header_done = False
        per_CDS_predictions = []
        for qual in feature.qualifiers['sec_met']:
            if not qual.startswith('NRPS/PKS Domain:'):
                continue
            preds = _parse_substrate_predictions(qual)
            is_a_domain = qual.startswith("NRPS/PKS Domain: AMP-binding")

            per_Adomain_predictions = []
            for key, val in preds:
                # the gene id heads the first prediction shown for this CDS
                if not gene_header_done:
                    term = pq('<dt>')
                    term.text(utils.get_gene_id(feature))
                    details_list.append(term)
                    gene_header_done = True
                entry = pq('<dd>')
                entry.html('%s: %s<br>' % (key, val))
                details_list.append(entry)
                if is_a_domain:
                    norine_values = _filter_norine_as(val.split(","))
                    if len(norine_values) > 0:
                        per_Adomain_predictions.extend(val.split(","))

            if len(preds) > 0:
                if is_a_domain:
                    per_CDS_predictions.append(
                        list(set(per_Adomain_predictions)))
                # blank entry separating the domains
                spacer = pq('<dd>')
                spacer.append(pq('<br>'))
                details_list.append(spacer)

        if len(per_CDS_predictions) > 0:
            strict_url = _get_norine_url_for_specArray(per_CDS_predictions)
            if strict_url:
                entry = pq('<dd>')
                entry.append("Search NORINE for peptide in ")
                entry.append(new_tab_link(strict_url, "strict mode"))
                entry.append(" // ")
                relaxed_url = _get_norine_url_for_specArray(
                    per_CDS_predictions, be_strict=False)
                entry.append(new_tab_link(relaxed_url, "relaxed mode"))
                entry.append(pq('<br>'))
                entry.append(pq('<br>'))
                details_list.append(entry)

    # NORINE cross-links for cluster types containing 'nrps'
    if 'nrps' in cluster['type']:
        cross_refs = pq("<div>")
        refs_title = pq('<h3>')
        refs_title.text('Database cross-links')
        cross_refs.append(refs_title)
        links = pq("<div>")
        links.addClass('prediction-text')

        links.append(
            new_tab_link('http://bioinfo.lifl.fr/norine/form2.jsp',
                         "Link to NORINE database query form"))
        links.append("<br>")

        strict_url = _get_norine_url_for_cluster(cluster_rec)
        logging.debug("NORINE URL string: %s" % strict_url)
        strict_link = new_tab_link(strict_url, "strict mode")
        links.append("Direct lookup in NORINE database in ")
        links.append(strict_link)
        links.append(" // ")
        relaxed_url = _get_norine_url_for_cluster(cluster_rec,
                                                  be_strict=False)
        links.append(new_tab_link(relaxed_url, "relaxed mode"))
        cross_refs.append(links)
        sidepanel.append(cross_refs)

    return sidepanel

예제 #23

0

파일 보기

def _classify_cf_cluster(cds_features):
    """Return the ClusterFinder product type implied by the given CDS features.

    Only 'sec_met' qualifier entries that contain "Type: " are inspected.
    The type starts out as "cf_putative" and is upgraded to "cf_fatty_acid",
    "cf_saccharide", or the combined "cf_fatty_acid-saccharide" depending on
    which of those markers are found in the inspected entries.
    """
    cf_type = "cf_putative"
    for cds in cds_features:
        if 'sec_met' not in cds.qualifiers:
            continue
        for entry in cds.qualifiers['sec_met']:
            if "Type: " not in entry:
                continue
            # The two checks are deliberately independent (not elif): a single
            # entry mentioning both markers upgrades the type twice.
            if "cf_fatty_acid" in entry:
                if cf_type == "cf_putative":
                    cf_type = "cf_fatty_acid"
                elif cf_type == "cf_saccharide":
                    cf_type = "cf_fatty_acid-saccharide"
            if "cf_saccharide" in entry:
                if cf_type == "cf_putative":
                    cf_type = "cf_saccharide"
                elif cf_type == "cf_fatty_acid":
                    cf_type = "cf_fatty_acid-saccharide"
    return cf_type


def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record using ClusterFinder hits.

    For every ClusterFinder cluster returned by find_cf_clusters():

    * If it overlaps existing cluster features, each overlapping cluster gets
      a 'probability' qualifier. With ``options.borderpredict`` set, a
      "cluster_border" feature is appended to the record when the midpoint of
      the existing cluster lies inside the ClusterFinder cluster (after the
      ClusterFinder location has been widened to keep the existing cluster's
      signature genes). Without it, the existing cluster's location is
      widened to cover the ClusterFinder cluster.
    * If it overlaps nothing, a new "cluster" feature is created for it,
      unless ``options.borderpredict_only`` is present and true.

    Args:
        seq_record: record whose ``features`` list is read and modified
            in place.
        options: settings object; ``borderpredict`` is read as an attribute
            and ``borderpredict_only`` is read if present. It is also passed
            on to find_cf_clusters() and renumber_clusters().

    Returns:
        None. All results are written into ``seq_record``; when new clusters
        were added, renumber_clusters() is called on the record.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_cluster in cf_clusters:
        overlaps = False
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue

            overlaps = True

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer; a float
                # position cannot be tested for membership in a location.
                midpoint = (cluster.location.end +
                            cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Get signature genes from antiSMASH-predicted cluster;
                    # they are only needed when a border is being drawn.
                    features_in_cluster = utils.get_cluster_cds_features(
                        cluster, seq_record)
                    cluster_sig_genes = [
                        gene for gene in secmet_cds_features
                        if gene in features_in_cluster
                    ]
                    # Make sure that antiSMASH signature genes are still
                    # included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    # NOTE(review): probability is stored unformatted here but
                    # as a "%01.4f" string on cluster features -- confirm
                    # whether downstream readers rely on the difference.
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                # ClusterFinder cluster extends past both ends
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]

        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [
                _classify_cf_cluster(cf_cluster_CDSs)
            ]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)