예제 #1
0
def store_percentage_identities(seq_record):
    """Store pairwise percentage identities on grouped ``sec_met`` CDS features.

    For every cluster of ``seq_record``, the CDS features that carry a
    ``sec_met`` qualifier are grouped with ``utils.get_cdhit_table`` using the
    configured ``cdh_display_cutoff``.  For each group holding more than one
    gene, every member gets a single
    ``"Percentage identity: <gene>=<value>,..."`` entry in its ``sec_met``
    qualifier.  Entries written by an earlier run are replaced rather than
    accumulated.

    The features are modified in place; nothing is returned.
    """
    clusters = utils.get_cluster_features(seq_record)
    cfg = config.get_config()
    for cluster in clusters:
        features = [
            feature
            for feature in utils.get_cluster_cds_features(cluster, seq_record)
            if 'sec_met' in feature.qualifiers
        ]
        # The second return value (gene -> group mapping) is not needed here.
        cdhit_table, _ = utils.get_cdhit_table(
            features, float(cfg.cdh_display_cutoff))
        for cdhit_cluster in cdhit_table:
            group_genes = cdhit_cluster["genes"]
            if len(group_genes) <= 1:
                continue
            cl_features = [
                feature for feature in features
                if utils.get_gene_id(feature) in group_genes
            ]
            pct_table = utils.get_pct_identity_table(cl_features)
            for cds in cl_features:
                row = pct_table[utils.get_gene_id(cds)]
                result = ",".join(
                    ["%s=%s" % (othercds, pct) for othercds, pct in row.items()])
                sec_met = cds.qualifiers['sec_met']
                # ``del ann`` inside a loop only unbinds the loop variable, so
                # filter the list itself (in place) to drop stale entries.
                sec_met[:] = [
                    ann for ann in sec_met
                    if not ann.startswith("Percentage identity")
                ]
                sec_met.append("Percentage identity: %s" % result)
예제 #2
0
def generate_searchgtr_htmls(seq_records, options):
    """Write one SEARCHGTr form HTML file per CDS with a matching smCOG.

    For every CDS feature whose smCOG annotation is listed in ``gtrcoglist``,
    a file ``<output folder>/html/<gene id>_searchgtr.html`` is written.  It
    consists of the first template part (with ``"GlycTr"`` replaced by the
    gene id), the gene id and amino-acid sequence, and the second template
    part.  The relative link to each file is recorded in
    ``options.searchgtr_links`` under the key ``"<record id>_<gene id>"``.

    ``options.searchgtr_links`` is reset to an empty dict on every call.
    """
    # smCOG identifiers that trigger form generation
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    searchgtrformtemplateparts = load_searchgtr_search_form_template()
    options.searchgtr_links = {}
    html_dir = options.full_outputfolder_path + os.sep + "html"
    for seq_record in seq_records:
        smcogdict, _ = utils.get_smcog_annotations(seq_record)
        for feature in utils.get_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            # ``in`` replaces dict.has_key(), which no longer exists in Python 3
            if gene_id not in smcogdict:
                continue
            if smcogdict[gene_id] not in gtrcoglist:
                continue

            # Create the output directory lazily, only once a form is needed
            if not os.path.exists(html_dir):
                os.mkdir(html_dir)
            form_name = gene_id + "_searchgtr.html"
            options.searchgtr_links[seq_record.id + "_" + gene_id] = (
                "html" + os.sep + form_name)
            specificformtemplate = searchgtrformtemplateparts[0].replace(
                "GlycTr", gene_id)
            # The context manager closes the file even if a write fails
            with open(html_dir + os.sep + form_name, "w") as formfile:
                formfile.write(specificformtemplate)
                formfile.write("%s\n%s" %
                               (gene_id, utils.get_aa_sequence(feature)))
                formfile.write(searchgtrformtemplateparts[1])
예제 #3
0
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene display data for cluster number ``geneclusternr``.

    Returns an 11-tuple:
    ``(clustergenes, clustertype, annotations, colors, starts, ends, strands,
    pksnrpsprots, gtrs, transporters, clustersize)`` where

    * ``clustergenes`` is the list of gene ids of the cluster's CDS features,
    * ``annotations`` maps gene id to the first ``product`` qualifier, or
      ``'Unannotated gene'`` when that qualifier is absent,
    * ``colors``, ``starts``, ``ends`` and ``strands`` are lists parallel to
      ``clustergenes``,
    * ``pksnrpsprots``, ``gtrs`` and ``transporters`` are the subsets of gene
      ids found among the PKS/NRPS CDS features, in ``gtrcoglist`` and in
      ``transportercoglist`` respectively (the latter two are matched on the
      first element of the gene's ``smcogdict`` entry),
    * ``clustersize`` is ``max(ends) - min(starts)``.

    Raises ``ValueError`` (from ``max``/``min``) if the cluster has no CDS
    features.
    """
    # Sets give O(1) membership tests in the per-gene loop below
    allcoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record))
    pksnrpscoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record))
    feature_by_id = utils.get_feature_dict(seq_record)
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(cluster, seq_record)
    ]
    clustertype = utils.get_cluster_type(cluster)
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for gene_id in clustergenes:
        cdsfeature = feature_by_id[gene_id]
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3
        if 'product' in cdsfeature.qualifiers:
            annotations[gene_id] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[gene_id] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        # Anything that is not explicitly reverse is drawn as forward
        strands.append("-" if cdsfeature.strand == -1 else "+")
        colors.append("#810E15" if gene_id in allcoregenes else "grey")
        if gene_id in pksnrpscoregenes:
            pksnrpsprots.append(gene_id)
        if gene_id in smcogdict and len(smcogdict[gene_id]) > 0:
            smcog = smcogdict[gene_id][0]
            if smcog in gtrcoglist:
                gtrs.append(gene_id)
            if smcog in transportercoglist:
                transporters.append(gene_id)
    clustersize = max(ends) - min(starts)
    return clustergenes, clustertype, annotations, colors, starts, ends, strands, pksnrpsprots, gtrs, transporters, clustersize
예제 #4
0
def match_exp_to_genes(features, geo_dataset):
    """Map gene ids of ``features`` to their rows in ``geo_dataset``.

    ``geo_dataset["info"]["col_id"]`` selects which entry of each row's first
    element holds the gene id; a negative value means no such column is known
    and an empty dict is returned.  Matching is case-insensitive.

    Returns a dict keyed by gene id whose values hold ``"ref"`` (the row key
    in ``geo_dataset["data"]``), ``"evalue"`` (always ``-1.0``) and ``"exp"``
    (a copy of the row's second element, sample -> value).
    """
    info = geo_dataset["info"]
    table = geo_dataset["data"]

    id_column = info["col_id"]
    if id_column < 0:
        # no gene id column was found
        return {}

    # upper-cased gene id -> key of the row it came from
    ref_by_gene = {}
    for ref_id, row in table.items():
        ref_by_gene[row[0][id_column].upper()] = ref_id

    # keep only the features that have a row in the dataset
    matched = {}
    for feature in features:
        gene_id = utils.get_gene_id(feature)
        lookup = gene_id.upper()
        if lookup in ref_by_gene:
            matched[gene_id] = {
                "ref": ref_by_gene[lookup],
                "evalue": float(-1),
            }

    # attach a copy of the per-sample values to every matched gene
    for entry in matched.values():
        if "ref" in entry:
            samples = table[entry["ref"]][1]
            entry["exp"] = {sample: value for sample, value in samples.items()}

    return matched
예제 #5
0
def getECs(seq_record, options):
    """Annotate CDS features with EC numbers obtained from KEGG queries.

    Does nothing unless this predictor's ``name`` (defined outside this
    function) is listed in ``options.ecpred``.  For every key of the EC
    dictionary returned by ``_get_ECNumberDict`` that has at least one EC
    number, the matching feature's ``EC_number`` qualifier is overwritten and
    a note describing the prediction is appended to its ``note`` qualifier.
    Keys without EC numbers only produce a warning.

    Features are modified in place; nothing is returned.
    """
    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning..." % name)
        return

    CDSFeatureDict = utils.get_feature_dict(seq_record)
    logging.debug("Predicting EC numbers using KEGG online queries")
    KEGGspeciesLocusTagDict = _getKEGG_speciesLocusTag(CDSFeatureDict)
    ECDict = _get_ECNumberDict(KEGGspeciesLocusTagDict)

    for key, ec_numbers in ECDict.items():
        feature = CDSFeatureDict[key]

        if len(ec_numbers) == 0:
            logging.warning('ECpredictor[KEGG]: Could not find EC number for %s' %
                            utils.get_gene_id(feature))
            continue

        # Start from this feature's own notes.  A list shared across loop
        # iterations would leak one feature's notes into the next feature
        # that has no 'note' qualifier yet.
        if 'note' in feature.qualifiers:
            notes = feature.qualifiers['note']
        else:
            notes = []

        logging.debug("Found EC numbers: %s" % ", ".join(ec_numbers))
        notes.append('EC number prediction based on KEGG query: %s' %
                     ec_numbers)
        feature.qualifiers['note'] = notes
        if 'EC_number' in feature.qualifiers:
            logging.warning('ECpredictor[kegg]: Overwriting existing EC annotation: %s  with %s' %
                            (", ".join(feature.qualifiers['EC_number']), ", ".join(ec_numbers)))

        feature.qualifiers['EC_number'] = ec_numbers
예제 #6
0
def convert_cds_features(record, features, annotations, options):
    """Convert CDS SeqFeatures to a list of JSON-ready dicts.

    Each dict holds ``start`` (feature start + 1), ``end``, ``strand``
    (``1`` when the feature has none), ``locus_tag``, ``type``,
    ``description`` and ``domains``; ``geo`` is added when
    ``options.coexpress`` is truthy.  ``domains`` lists the ``query_id`` of
    the gene's entries in ``options.hmm_results``, highest bitscore first.
    """
    converted = []
    for cds in features:
        lower = int(cds.location.start) + 1
        upper = int(cds.location.end)
        if lower > upper:
            # some input files have their coordinates the wrong way around
            lower, upper = upper, lower

        orf = {
            'start': lower,
            'end': upper,
            'strand': 1 if cds.strand is None else cds.strand,
            'locus_tag': utils.get_gene_id(cds),
        }
        orf['type'] = get_biosynthetic_type(cds, annotations)
        orf['description'] = utils.ascii_string(
            get_description(record, cds, orf['type'], options))

        # hmm_results keys look like "<record id with ':' -> '_'>:<locus tag>"
        hmm_key = "%s:" % record.id.replace(":", "_") + orf['locus_tag']
        hit_names = []
        if hmm_key in options.hmm_results:
            ranked = sorted(options.hmm_results[hmm_key],
                            key=lambda hit: hit.bitscore,
                            reverse=True)
            hit_names = [hit.query_id for hit in ranked]
        orf['domains'] = hit_names

        if options.coexpress:
            orf['geo'] = utils.parse_geo_feature(cds)
        converted.append(orf)
    return converted
예제 #7
0
    def test_get_gene_id_no_id(self):
        """Test utils.get_gene_id() without any usable id"""
        # No identifying qualifier is added to the feature in this test
        feature = FakeFeature("CDS")

        self.assertEqual(utils.get_gene_id(feature), 'no_tag_found')
예제 #8
0
def write_gene(txt, info, options):
    """Write the tab-separated gene table to the open file object ``txt``.

    One header line is written first, followed by one line per CDS feature of
    every cluster number in ``info.clusternrs``.  Columns: gene ID, gene
    start, gene end, gene strand, smCOG (currently always empty), locus_tag,
    annotation.  Gene ID and locus tag are cut at the first ``"."``.
    """
    header = [
        "gene ID", "gene start", "gene end", "gene strand", "smCOG",
        "locus_tag", "annotation"
    ]
    txt.write("\t".join(header) + "\n")
    for cluster_nr in info.clusternrs:
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_nr)
        for cds in utils.get_cluster_cds_features(cluster, info.seq_record):
            row = [
                utils.get_gene_acc(cds).partition(".")[0],
                str(cds.location.start),
                str(cds.location.end),
                # only a strand of exactly 1 is reported as forward
                "+" if cds.strand == 1 else "-",
                "",  # smCOG column, not used for now
                utils.get_gene_id(cds).partition(".")[0],
                utils.get_gene_annotation(cds),
            ]
            txt.write("\t".join(row) + "\n")
예제 #9
0
def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict,
                                       seq_record, options):
    """Extract PKS KR domain sequences and run the KR analysis on them.

    Every ``"PKS_KR"`` entry of ``domaindict[<gene id>]`` contributes the
    slice ``[start:end]`` of the gene's amino-acid sequence, named
    ``<gene id>_KR<n>`` with ``n`` counting per gene from 1.  If any were
    found they are written to ``ctg<record_idx>_krseqs.fasta`` in
    ``options.raw_predictions_outputfolder`` and passed to
    ``kr_analysis.run_kr_analysis`` together with the output path
    ``ctg<record_idx>_krpredoutput.txt``.

    Returns ``(names, sequences)`` as two parallel lists.
    """
    names = []
    sequences = []
    logging.info("Predicting PKS KR activity and stereochemistry using KR "
                 "fingerprints from Starcevic et al.")
    for cds in pksnrpscoregenes:
        gene_id = utils.get_gene_id(cds)
        kr_count = 0
        for domain in domaindict[gene_id]:
            if domain[0] != "PKS_KR":
                continue
            kr_count += 1
            first, last = int(domain[1]), int(domain[2])
            sequences.append(str(utils.get_aa_sequence(cds))[first:last])
            names.append(gene_id + "_KR" + str(kr_count))
    if names:
        path_prefix = (options.raw_predictions_outputfolder + os.sep + "ctg" +
                       str(options.record_idx))
        utils.writefasta(names, sequences, path_prefix + "_krseqs.fasta")
        with TemporaryDirectory(change=True):
            kr_analysis.run_kr_analysis(path_prefix + "_krseqs.fasta",
                                        path_prefix + "_krpredoutput.txt")
    return names, sequences
예제 #10
0
def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record,
                                 options):
    """Extract PKS CAL domain sequences and run the Minowa CAL predictor.

    Every ``"CAL_domain"`` entry of ``domaindict[<gene id>]`` contributes the
    slice ``[start:end]`` of the gene's amino-acid sequence, named
    ``<gene id>_CAL<n>`` with ``n`` counting per gene from 1.  If any were
    found they are written to ``ctg<record_idx>_calseqs.fasta`` in
    ``options.raw_predictions_outputfolder`` and passed to
    ``minowa_CAL.run_minowa_cal`` together with the output path
    ``ctg<record_idx>_minowa_calpredoutput.txt``.

    Returns ``(names, sequences)`` as two parallel lists.
    """
    names = []
    sequences = []
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method"
    )
    for cds in pksnrpscoregenes:
        gene_id = utils.get_gene_id(cds)
        cal_count = 0
        for domain in domaindict[gene_id]:
            if domain[0] != "CAL_domain":
                continue
            cal_count += 1
            first, last = int(domain[1]), int(domain[2])
            sequences.append(str(utils.get_aa_sequence(cds))[first:last])
            names.append(gene_id + "_CAL" + str(cal_count))
    if names:
        path_prefix = (options.raw_predictions_outputfolder + os.sep + "ctg" +
                       str(options.record_idx))
        utils.writefasta(names, sequences, path_prefix + "_calseqs.fasta")
        with TemporaryDirectory(change=True):
            minowa_CAL.run_minowa_cal(
                path_prefix + "_calseqs.fasta",
                path_prefix + "_minowa_calpredoutput.txt")
    return names, sequences
예제 #11
0
def find_cluster_modular_enzymes(clusterpksnrpsgenes, pksnrpsvars):
    """Count the PKS, NRPS and hybrid genes among ``clusterpksnrpsgenes``.

    Classification is driven by each gene's entry in
    ``pksnrpsvars.nrpspkstypedict``.  Entries containing ``"PKS/NRPS"`` are
    resolved through the gene's domain names in
    ``pksnrpsvars.domainnamesdict``; when those are inconclusive the gene is
    not counted at all.

    Returns ``(pksgenes, clusterpksgenes, nrpsgenes, hybridgenes)`` where
    ``clusterpksgenes`` lists the ids of the genes counted as PKS.
    """
    gene_ids = [utils.get_gene_id(cds) for cds in clusterpksnrpsgenes]
    pks_total = 0
    nrps_total = 0
    hybrid_total = 0
    pks_gene_ids = []
    for gene_id in gene_ids:
        enzyme_type = pksnrpsvars.nrpspkstypedict[gene_id]
        mentions_pks = "PKS" in enzyme_type
        mentions_nrps = "NRPS" in enzyme_type
        if mentions_pks and not mentions_nrps:
            pks_total += 1
            pks_gene_ids.append(gene_id)
        elif mentions_nrps and not mentions_pks:
            nrps_total += 1
        elif "PKS/NRPS" in enzyme_type:
            # Ambiguous type: decide by which kind of domains is present
            domains = pksnrpsvars.domainnamesdict[gene_id]
            pks_domains = "PKS_KS" in domains or "PKS_AT" in domains
            nrps_domains = ("AMP-binding" in domains or "A-OX" in domains
                            or "Condensation" in domains)
            if pks_domains and not nrps_domains:
                pks_total += 1
                pks_gene_ids.append(gene_id)
            elif nrps_domains and not pks_domains:
                nrps_total += 1
        elif mentions_pks and mentions_nrps:
            hybrid_total += 1
    return pks_total, pks_gene_ids, nrps_total, hybrid_total
예제 #12
0
def update_dist_between_clusters(seq_records, all_gene_expressions, geo):
    """Add distances between within-cluster genes correlated at >= 0.9.

    Expression entries of all within-cluster CDS features (over all records)
    are gathered first.  For every ordered pair of distinct genes:

    * if either gene already lists the other in its ``"dist"`` dict, the
      missing direction is copied from the existing one;
    * otherwise the correlation value (capped at 1.0) is computed and, when
      it lies in ``[0.9, 1.0)``, ``100 * (1 - correlation)`` is stored in
      both directions.

    Finally ``update_features`` is called per record with the gathered data.
    """
    expr_by_gene = {}
    for record in seq_records:
        record_expr = all_gene_expressions[record.id]
        for cds in utils.get_withincluster_cds_features(record):
            gene_id = utils.get_gene_id(cds)
            if gene_id in record_expr:
                expr_by_gene[gene_id] = record_expr[gene_id]

    for first in expr_by_gene:
        for second in expr_by_gene:
            if second == first:
                continue
            second_dist = expr_by_gene[second]["dist"]
            first_dist = expr_by_gene[first]["dist"]
            if first in second_dist or second in first_dist:
                # one direction is known already: mirror it to the other
                if first not in second_dist:
                    second_dist[first] = first_dist[second]
                if second not in first_dist:
                    first_dist[second] = second_dist[first]
                continue
            correlation = min(
                1.00,
                calc_correlation_value(expr_by_gene[first],
                                       expr_by_gene[second]))
            if 0.9 <= correlation < 1.00:
                first_dist[second] = 100.0 * (1.0 - correlation)
                second_dist[first] = 100.0 * (1.0 - correlation)

    for record in seq_records:
        update_features(utils.get_withincluster_cds_features(record),
                        expr_by_gene, geo)
예제 #13
0
def generate_details_div(cluster,
                         seq_record,
                         options,
                         js_domains,
                         details=None):
    """Generate the details div for one cluster and collect its domain data.

    ``details`` is returned untouched when the cluster cannot be looked up.
    Otherwise a new ``<div class="details">`` with a heading is created if
    none was passed in.  For every CDS feature with a ``sec_met`` qualifier,
    the ``NRPS/PKS Domain:`` entries are parsed; ORFs with at least one
    parsed domain are gathered.  If any ORF was gathered, a
    ``details-svg`` div is appended to ``details`` and the gathered data is
    appended to ``js_domains``.
    """
    cluster_feature = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if cluster_feature is None:
        return details

    if details is None:
        details = pq('<div>')
        details.addClass('details')

        heading = pq('<h3>')
        heading.text('Detailed annotation')
        details.append(heading)

    cluster_entry = {
        'id': "cluster-%s-details" % cluster['idx'],
        'orfs': []
    }
    for cds in utils.get_cluster_cds_features(cluster_feature, seq_record):
        if 'sec_met' not in cds.qualifiers:
            continue

        # Prefer the stored translation over deriving the sequence
        if 'translation' in cds.qualifiers:
            protein = cds.qualifiers['translation'][0]
        else:
            protein = str(utils.get_aa_sequence(cds))

        parsed_domains = []
        orf_entry = {
            'id': utils.get_gene_id(cds),
            'sequence': protein,
            'domains': parsed_domains,
        }

        for qualifier in cds.qualifiers['sec_met']:
            if qualifier.startswith('NRPS/PKS Domain:'):
                parsed = _parse_domain(qualifier, cds, seq_record)
                if len(parsed) > 0:
                    parsed_domains.append(parsed)

        if len(parsed_domains) > 0:
            cluster_entry['orfs'].append(orf_entry)

    if len(cluster_entry['orfs']) > 0:
        svg_holder = pq('<div>')
        svg_holder.addClass('details-svg')
        svg_holder.attr('id', '%s-svg' % cluster_entry['id'])
        details.append(svg_holder)

        js_domains.append(cluster_entry)

    return details
예제 #14
0
def result_vec_to_features(orig_feature, res_vec):
    """Build leader and core ``CDS_motif`` features from a prediction result.

    The leader spans from the start of ``orig_feature`` to
    ``start + res_vec.end * 3``; the core spans from there to the end of
    ``orig_feature``.  Both share the original strand and carry the original
    gene id as ``locus_tag``.  Prediction details from ``res_vec`` are stored
    as ``note`` qualifiers.

    Returns ``[leader_feature, core_feature]``.
    """
    strand = orig_feature.location.strand
    leader_start = orig_feature.location.start
    # res_vec.end is multiplied by 3 to get an offset in location coordinates
    boundary = orig_feature.location.start + (res_vec.end * 3)

    leader_feature = SeqFeature(
        FeatureLocation(leader_start, boundary, strand=strand),
        type='CDS_motif')
    leader_feature.qualifiers['note'] = [
        'leader peptide',
        'predicted leader seq: %s' % res_vec.leader,
    ]
    leader_feature.qualifiers['locus_tag'] = [utils.get_gene_id(orig_feature)]

    core_feature = SeqFeature(
        FeatureLocation(boundary, orig_feature.location.end, strand=strand),
        type='CDS_motif')
    core_notes = [
        'core peptide',
        'monoisotopic mass: %0.1f' % res_vec.monoisotopic_mass,
        'molecular weight: %0.1f' % res_vec.molecular_weight,
    ]
    core_feature.qualifiers['note'] = core_notes
    if res_vec.alternative_weights:
        formatted = ["%0.1f" % weight for weight in res_vec.alternative_weights]
        core_notes.append('alternative weights: %s' % "; ".join(formatted))
    core_notes.append('number of bridges: %s' % res_vec.number_of_lan_bridges)
    core_notes.append('predicted core seq: %s' % res_vec.core)
    core_notes.append('predicted class: %s' % res_vec.lantype)
    core_notes.append('score: %0.2f' % res_vec.score)
    # One note per additional modification flagged on the result
    modifications = (
        (res_vec.aminovinyl_group, 'AviCys'),
        (res_vec.chlorinated, 'Cl'),
        (res_vec.oxygenated, 'OH'),
        (res_vec.lactonated, 'Lac'),
    )
    for present, label in modifications:
        if present:
            core_notes.append('predicted additional modification: %s' % label)
    core_feature.qualifiers['locus_tag'] = [utils.get_gene_id(orig_feature)]

    return [leader_feature, core_feature]
예제 #15
0
    def test_get_gene_id_locus_tag(self):
        """Test utils.get_gene_id() with locus tag"""
        wanted = 'test_tag'
        feature = FakeFeature("CDS")
        feature.qualifiers['locus_tag'] = [wanted]

        self.assertEqual(utils.get_gene_id(feature), wanted)
예제 #16
0
def _get_transatpks_geneclusters(pksnrpsvars, seq_record):
    """Map each ``transatpks`` cluster number to its PKS/NRPS core gene ids.

    The cluster features of type ``"transatpks"`` are de-duplicated; for each
    one, the gene ids of the features returned by
    ``find_clusterpksnrpsgenes`` are stored under the cluster's number.
    """
    unique_clusters = list(
        set(utils.get_cluster_features_of_type(seq_record, "transatpks")))
    genes_by_cluster = {}
    for cluster in unique_clusters:
        cluster_nr = utils.get_cluster_number(cluster)
        core_features = find_clusterpksnrpsgenes(
            cluster, pksnrpsvars.pksnrpscoregenes)
        genes_by_cluster[cluster_nr] = [
            utils.get_gene_id(cds) for cds in core_features
        ]
    return genes_by_cluster
예제 #17
0
    def test_get_gene_id_protein_id(self):
        """Test utils.get_gene_id() with protein_id tag"""
        wanted = 'test_id'
        feature = FakeFeature("CDS")
        feature.qualifiers['protein_id'] = [wanted]

        self.assertEqual(utils.get_gene_id(feature), wanted)
예제 #18
0
    def test_get_gene_id_gene(self):
        """Test utils.get_gene_id() with gene tag"""
        wanted = 'test_gene'
        feature = FakeFeature("CDS")
        feature.qualifiers['gene'] = [wanted]

        self.assertEqual(utils.get_gene_id(feature), wanted)
예제 #19
0
def generate_domainnamesdict(pksnrpsvars):
    """Fill ``pksnrpsvars.domainnamesdict`` with per-gene domain names.

    The attribute is reset to an empty dict, then for every feature in
    ``pksnrpsvars.pksnrpscoregenes`` the first element of each entry in
    ``pksnrpsvars.domaindict[<gene id>]`` is collected, in order, under the
    gene id.  Nothing is returned.
    """
    pksnrpsvars.domainnamesdict = {}
    for cds in pksnrpsvars.pksnrpscoregenes:
        gene_id = utils.get_gene_id(cds)
        pksnrpsvars.domainnamesdict[gene_id] = [
            domain[0] for domain in pksnrpsvars.domaindict[gene_id]
        ]
예제 #20
0
def add_additional_nrpspks_genes(typedict, results_by_id, seq_record,
                                 nseqdict):
    """Mark untyped within-cluster genes that carry NRPS/PKS domains.

    Considers every within-cluster CDS feature that has an entry in
    ``results_by_id`` and whose ``typedict`` value is ``"none"``.  If any of
    its results has a ``query_id`` from the NRPS/PKS domain list,
    ``_update_sec_met_entry`` is called for it with the type ``"other"``.
    """
    # Built once; membership is tested for every candidate gene
    nrpspksdomains = set([
        "PKS_KS", "PKS_AT", "ATd", "ene_KS", "mod_KS", "hyb_KS", "itr_KS",
        "tra_KS", "Condensation", "AMP-binding", "A-OX"
    ])
    for cds in utils.get_withincluster_cds_features(seq_record):
        gene_id = utils.get_gene_id(cds)
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3
        if gene_id not in results_by_id:
            continue
        if typedict[gene_id] != "none":
            continue
        hits = results_by_id[gene_id]
        if nrpspksdomains.intersection(res.query_id for res in hits):
            _update_sec_met_entry(cds, hits, "other", nseqdict)
예제 #21
0
def count_pks_genes(pksnrpscoregenes, domaindict, seq_record):
    """Count the PKS_AT, CAL_domain and PKS_KR domains of the given genes.

    Despite the name, the result is a number of domain entries (summed over
    all features in ``pksnrpscoregenes``), not a number of genes.
    ``seq_record`` is not used.
    """
    counted_domains = ("PKS_AT", "CAL_domain", "PKS_KR")
    total = 0
    for cds in pksnrpscoregenes:
        gene_domains = domaindict[utils.get_gene_id(cds)]
        total += sum(1 for domain in gene_domains
                     if domain[0] in counted_domains)
    return total
예제 #22
0
def find_colinear_order(clusterpksnrpsgenes, seq_record, domainnamesdict):
    """Return the gene ids of ``clusterpksnrpsgenes`` in assumed colinear order.

    Colinearity is assumed (NRPS genes, mixed NRPS/PKS genes, PKS genes
    without detected docking domains, or clusters with 1-3 PKS genes).  The
    list is reversed when most genes lie on the reverse strand, and reversed
    (again) when the first gene then has more than one domain and one of them
    is ``"Thioesterase"`` or ``"TD"``.

    Note: ``clusterpksnrpsgenes`` is reordered in place.  An empty input
    yields an empty list instead of raising ``IndexError``.
    """
    if not clusterpksnrpsgenes:
        return []

    feature_by_id = utils.get_feature_dict(seq_record)
    # Net strand direction: +1 per forward gene, -1 per reverse gene
    direction = 0
    for feature in clusterpksnrpsgenes:
        strand = feature_by_id[utils.get_gene_id(feature)].strand
        if strand == 1:
            direction += 1
        elif strand == -1:
            direction -= 1
    if direction < 0:
        clusterpksnrpsgenes.reverse()

    # Reverse if first gene encodes a multidomain protein with a TE/TD domain
    first_domains = domainnamesdict[utils.get_gene_id(clusterpksnrpsgenes[0])]
    if "Thioesterase" in first_domains or "TD" in first_domains:
        if len(first_domains) > 1:
            clusterpksnrpsgenes.reverse()
    return [utils.get_gene_id(feature) for feature in clusterpksnrpsgenes]
예제 #23
0
def run_lantipred(seq_record, query, lant_class):
    """Predict the leader/core split of a candidate precursor peptide.

    The amino-acid sequence of ``query`` is run against the HMM profile that
    belongs to ``lant_class`` (``'Class-I'``, ``'Class-II'`` or
    ``'Class-III'``).  ``None`` is returned when no cleavage site is
    predicted, when the score is below ``thresh_dict[lant_class]``, or when
    the predicted core contains no ``'C'``.

    On success the result object gets ``leader`` and ``core`` attributes,
    ``query`` is marked as ``'Kind: biosynthetic'`` in its ``sec_met``
    qualifier (if not marked already), and the result is returned.
    """
    hmmer_profiles = {
        'Class-I': 'class1.hmm',
        'Class-II': 'class2.hmm',
        'Class-III': 'class3.hmm',
    }

    gene_id = utils.get_gene_id(query)
    query_sequence = utils.get_aa_sequence(query, to_stop=True)
    lan_a_fasta = ">%s\n%s" % (gene_id, query_sequence)

    # Run the sequence against the class profile; the result holds the
    # predicted cleavage position (end) and the score.
    profile = utils.get_full_path(__file__, hmmer_profiles[lant_class])
    result = predict_cleavage_site(profile, lan_a_fasta)

    if result is None:
        logging.debug('%r: No cleavage site predicted' % gene_id)
        return

    if thresh_dict[lant_class] > result.score:
        logging.debug('%r: Score %0.2f below threshold %0.2f for class %r' %
                      (gene_id, result.score, thresh_dict[lant_class],
                       lant_class))
        return

    # The cleavage position splits the sequence into leader and core peptide
    result.leader = query_sequence[:result.end]
    result.core = query_sequence[result.end:]
    if result.core.find('C') < 0:
        logging.debug(
            '%r: No Cysteine residues found in core, false positive' %
            gene_id)
        return
    if 'sec_met' not in query.qualifiers:
        query.qualifiers['sec_met'] = []

    # Check the entries themselves.  Searching the ';'-joined string for
    # ';Kind: biosynthetic' misses the entry when it is the first one in the
    # list and would then append a duplicate.
    sec_met = query.qualifiers['sec_met']
    if not any(entry.startswith('Kind: biosynthetic') for entry in sec_met):
        sec_met.append('Kind: biosynthetic')

    return result
예제 #24
0
def _annotate(geneclustergenes, smcogvars, options):
    #Annotate smCOGS in CDS features
    for feature in geneclustergenes:
        gene_id = utils.get_gene_id(feature)
        if smcogvars.smcogdict.has_key(gene_id):
            detailslist = smcogvars.smcogdict[gene_id]
            if not feature.qualifiers.has_key('note'):
                feature.qualifiers['note'] = []
            if len(detailslist) > 0:
                feature.qualifiers['note'].append("smCOG: " + detailslist[0][0] + " (Score: " + str(detailslist[0][4]) + "; E-value: " + str(detailslist[0][3]) + ");")
        if smcogvars.smcogtreedict.has_key(gene_id):
            if not feature.qualifiers.has_key('note'):
                feature.qualifiers['note'] = []
            feature.qualifiers['note'].append("smCOG tree PNG image: smcogs/%s"  % smcogvars.smcogtreedict[gene_id])
예제 #25
0
def find_col_id(geo_dataset, seq_records):
    """Determine which column of ``geo_dataset`` holds gene ids.

    The result is stored in ``geo_dataset["info"]["col_id"]`` and the (same)
    ``geo_dataset`` object is returned:

    * ``0`` when ``geo_dataset["info"]["type"]`` is ``"CSV"``;
    * otherwise the index of the first cell (rows in iteration order, cells
      left to right in each row's first element) that equals,
      case-insensitively, the gene id of any CDS feature in ``seq_records``;
    * ``-1`` when no cell matches.
    """
    info = geo_dataset["info"]
    if info["type"] == "CSV":
        info["col_id"] = 0
        return geo_dataset

    # Collect the gene ids once instead of rescanning every feature of every
    # record for each table cell.
    known_ids = set(
        utils.get_gene_id(feature).upper()
        for seq_record in seq_records
        for feature in utils.get_cds_features(seq_record))

    if known_ids:
        for data in geo_dataset["data"].values():
            for i, cell in enumerate(data[0]):
                if cell.upper() in known_ids:
                    info["col_id"] = i
                    return geo_dataset

    info["col_id"] = -1
    return geo_dataset
예제 #26
0
def getECs(seq_record, options):
    """Annotate CDS features with EC numbers predicted by EFICAz.

    Does nothing unless this predictor's ``name`` (defined outside this
    function) is listed in ``options.ecpred``.  ``options.cpus`` defaults to
    1 when not set.  For every CDS feature:

    * a 4-digit prediction overwrites ``EC_number`` and adds an EC4 note;
    * a 3-digit prediction without a 4-digit one overwrites an existing
      ``EC_number`` only if that holds no 4-digit EC number;
    * when 3-digit prediction info is available, an EC3 note is added and
      ``EC_number`` is set if the feature still has none.

    The feature's ``note`` qualifier is always (re)assigned.  Features are
    modified in place; nothing is returned.
    """
    logging.debug("Predicting EC numbers with EFICAz")
    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning..." % name)
        return

    if 'cpus' not in options:
        options.cpus = 1

    EFICAzECs = EFICAzECPrediction(seq_record, options)
    EFICAzECs.runECpred()
    logging.debug("Found %s predictions for EC4" %
                  len(EFICAzECs.getEC4Dict().keys()))

    for feature in utils.get_cds_features(seq_record):
        featureID = utils.get_gene_id(feature)
        qualifiers = feature.qualifiers

        # ``in`` replaces dict.has_key(), which no longer exists in Python 3
        if "note" in qualifiers:
            notes = qualifiers['note']
        else:
            notes = []

        ec4 = EFICAzECs.getEC4(featureID)
        ec3 = EFICAzECs.getEC3(featureID)

        if ec4:
            logging.debug("Annotating %s" % featureID)
            if 'EC_number' in qualifiers:
                logging.warning('ECpredictor[eficaz]: Overwriting existing EC annotation: %s  with %s' %
                                (", ".join(qualifiers['EC_number']), ", ".join(ec4)))
            qualifiers['EC_number'] = ec4
            notes.append("EFICAz EC number prediction: EC4: {0}; {1}".format(
                ", ".join(ec4), "; ".join(EFICAzECs.getEC4Info(featureID))))

        # Only annotate 3 digit EC if no 4 digit EC is available
        if ec3 and not ec4:
            if 'EC_number' in qualifiers:
                # Raw string keeps the regex escapes valid on newer Pythons
                if not re.search(r"\d+\.\d+\.\d+\.\d+",
                                 " ".join(qualifiers['EC_number'])):
                    logging.warning('ECpredictor[eficaz]: Overwriting existing EC annotation: %s  with %s' %
                                    (", ".join(qualifiers['EC_number']), ", ".join(ec3)))
                    qualifiers['EC_number'] = ec3

        ec3_info = EFICAzECs.getEC3Info(featureID)
        if ec3_info:
            notes.append("EFICAz EC number prediction: EC3: {0}; {1}".format(
                ", ".join(ec3), "; ".join(ec3_info)))
            if 'EC_number' not in qualifiers:
                qualifiers['EC_number'] = ec3

        qualifiers['note'] = notes
    logging.debug("Finished EC number prediction with EFICAz")
예제 #27
0
def write(seq_records, options):
    """Export cluster information of every record as tab-separated TXT tables.

    Nothing is written for protein input (``options.input_type == 'prot'``).
    Otherwise a ``txt`` subdirectory is created in the output folder and, for
    each record with at least one cluster, one file
    ``<record id up to the first '.'>_<table>.txt`` is written per table by
    calling ``write_tables.write_<table>(file, info, options)``.

    All table files of a record are closed even if gathering the information
    or one of the table writers raises.
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    # Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    # Define table names
    tables = "genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS", "smCOG", "RiPP", "transltable"
    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        if len(clusters) == 0:
            continue
        record_name = seq_record.id.partition(".")[0]
        txt_files = {}
        try:
            # Open up TXT files
            for table in tables:
                txt_files[table] = open(
                    os.path.join(txt_outfolder,
                                 "%s_%s.txt" % (record_name, table)), "w")
            # Gather all information
            info = utils.Storage()
            info.clustertypes = {}
            info.clustergenes = {}
            info.accessions = {}
            info.cdsmotifs = {}
            info.clusternrs = []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            # Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
        finally:
            # Close whatever was opened, also when something above failed
            for handle in txt_files.values():
                handle.close()
예제 #28
0
파일: js.py 프로젝트: chevrm/transPACT
def convert_cds_features(record, features, annotations, options):
    """Build one JSON-ready dictionary per CDS feature.

    Each dictionary carries the keys 'start', 'end', 'strand', 'locus_tag',
    'type' and 'description'. The dictionaries are returned in the same
    order as the incoming features.
    """
    converted = []
    for cds in features:
        # Start is shifted by one, end is used as-is
        # (presumably 0-based -> 1-based conversion; confirm against caller).
        first = int(cds.location.start) + 1
        last = int(cds.location.end)
        # Some input files list the coordinates in reverse order; normalise.
        if first > last:
            first, last = last, first
        # Features without strand information are treated as forward strand.
        strand = 1 if cds.strand is None else cds.strand
        locus_tag = utils.get_gene_id(cds)
        orf_type = get_biosynthetic_type(cds, annotations)
        description = utils.ascii_string(
            get_description(record, cds, orf_type, options))
        converted.append({
            'start': first,
            'end': last,
            'strand': strand,
            'locus_tag': locus_tag,
            'type': orf_type,
            'description': description,
        })
    return converted
예제 #29
0
def retrieve_pksnrps_info(seq_record, geneclusternr, pksnrpsprots):
    """Collect domain annotations and predictions for PKS/NRPS proteins.

    Args:
        seq_record: record handed to the ``utils`` lookup helpers.
        geneclusternr: number of the cluster whose structure prediction is
            fetched via ``utils.get_cluster_by_nr``.
        pksnrpsprots: iterable of protein identifiers; each one must be a key
            of the dictionary returned by ``utils.get_nrpspks_domain_dict``.

    Returns:
        A tuple of eight items, in this order:
        - list of gene ids of all features returned by
          ``utils.get_pksnrps_cds_features(seq_record)``
        - dict: protein id -> [numbered domain names, {domain name: [j[1], j[2]]}]
          (the two values are presumably domain start/end -- TODO confirm)
        - dict: "<protein>_A<n>" -> [NRPS code prediction, NRPS SVM prediction]
        - dict: "<protein>_A<n>" / "_AT<n>" / "_CAL<n>" -> Minowa prediction
        - dict: "<protein>_AT<n>" -> PKS signature (code) prediction
        - dict: "<protein>_A<n>" / "_AT<n>" / "_CAL<n>" -> consensus prediction
        - dict: "<protein>_KR<n>" -> [KR activity prediction, KR stereo prediction]
        - structure prediction of the requested cluster

    Raises:
        KeyError: if a protein id is missing from the domain dictionary or a
            prediction key is missing from one of the prediction mappings
            (assuming those mappings behave like plain dicts).
    """
    pksnrpsprotsnames = [utils.get_gene_id(cds) for cds in utils.get_pksnrps_cds_features(seq_record)]
    domaindict = utils.get_nrpspks_domain_dict(seq_record)
    substr_spec_preds = utils.get_nrpspks_substr_spec_preds(seq_record)
    pksnrpsdomains = {}
    substrspecnrpspredictordict = {}
    substrspecminowadict = {}
    substrspecpkssigdict = {}
    substrspecconsensusdict = {}
    krpredictionsdict = {}
    for i in pksnrpsprots:
        domlist = []
        domsdetails = {}
        for j in domaindict[i]:
            # Number repeated domains of the same kind: first free suffix wins.
            nr = 1
            while j[0] + str(nr) in domlist:
                nr += 1
            domname = j[0] + str(nr)
            domlist.append(domname)
            domsdetails[domname] = [j[1], j[2]]
            # The prediction mappings are keyed "<protein>_<short type><nr>";
            # build that key once per branch and reuse it.
            if "AMP-binding" in domname or "A-OX" in domname:
                key = i + "_A" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_nrps_preds[key]
                substrspecnrpspredictordict[key] = [substr_spec_preds.nrps_code_preds[key], substr_spec_preds.nrps_svm_preds[key]]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_AT" in domname:
                key = i + "_AT" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_pks_preds[key]
                substrspecpkssigdict[key] = substr_spec_preds.pks_code_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            # NOTE: this branch used to be duplicated verbatim; one copy is
            # enough because it only re-assigned the same keys and values.
            if "CAL_domain" in domname:
                key = i + "_CAL" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_cal_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_KR" in domname:
                key = i + "_KR" + str(nr)
                krpredictionsdict[key] = [substr_spec_preds.kr_activity_preds[key], substr_spec_preds.kr_stereo_preds[key]]
        pksnrpsdomains[i] = [domlist, domsdetails]
    structpred = utils.get_structure_pred(utils.get_cluster_by_nr(seq_record, geneclusternr))
    return pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict, substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict, krpredictionsdict, structpred
예제 #30
0
def extract_pks_genes(pksnrpscoregenes, domaindict, seq_record):
    """Slice out the amino-acid sequence of every PKS_AT domain.

    For each feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned; every entry whose
    first element is "PKS_AT" yields a name "<gene id>_AT<n>" (n counts the
    AT domains of that gene, starting at 1) and the matching slice of the
    feature's amino-acid sequence.

    ``seq_record`` is accepted but not used by this function.

    Returns a tuple ``(names, sequences)`` of two parallel lists.
    """
    at_names, at_seqs = [], []
    for cds in pksnrpscoregenes:
        gene_id = utils.get_gene_id(cds)
        at_entries = (entry for entry in domaindict[gene_id]
                      if entry[0] == "PKS_AT")
        for index, entry in enumerate(at_entries, 1):
            lower, upper = int(entry[1]), int(entry[2])
            at_seq = str(utils.get_aa_sequence(cds))[lower:upper]
            at_names.append(gene_id + "_AT" + str(index))
            at_seqs.append(at_seq)
    return at_names, at_seqs