def store_percentage_identities(seq_record):
    """Annotate grouped sec_met CDS features with pairwise percentage identities.

    For every cluster feature of ``seq_record`` the CDS features that carry a
    ``sec_met`` qualifier are grouped with ``utils.get_cdhit_table`` using the
    configured ``cdh_display_cutoff``. For every group holding more than one
    gene, each member gets a ``"Percentage identity: gene=value,..."`` entry
    appended to its ``sec_met`` qualifier, built from
    ``utils.get_pct_identity_table``.

    Existing ``"Percentage identity"`` entries are removed before the new one
    is appended, so repeated runs do not accumulate stale annotations.
    """
    clusters = utils.get_cluster_features(seq_record)
    cfg = config.get_config()
    for cluster in clusters:
        features = [
            feature
            for feature in utils.get_cluster_cds_features(cluster, seq_record)
            if 'sec_met' in feature.qualifiers
        ]
        cdhit_table, _ = utils.get_cdhit_table(
            features, float(cfg.cdh_display_cutoff))
        for cdhit_cluster in cdhit_table:
            genes = cdhit_cluster["genes"]
            if len(genes) <= 1:
                continue
            cl_features = [
                feature for feature in features
                if utils.get_gene_id(feature) in genes
            ]
            pct_table = utils.get_pct_identity_table(cl_features)
            for cds in cl_features:
                identities = pct_table[utils.get_gene_id(cds)]
                result = ",".join(
                    "%s=%s" % (othercds, identities[othercds])
                    for othercds in identities)
                sec_met = cds.qualifiers['sec_met']
                # The previous `del ann` inside a for loop only unbound the
                # loop variable and never touched the list; filter the list
                # in place so old entries are really dropped.
                sec_met[:] = [
                    ann for ann in sec_met
                    if not ann.startswith("Percentage identity")
                ]
                sec_met.append("Percentage identity: %s" % result)
def generate_searchgtr_htmls(seq_records, options):
    """Write one ``<gene_id>_searchgtr.html`` form per matching CDS feature.

    A CDS matches when its smCOG annotation (from
    ``utils.get_smcog_annotations``) is one of the identifiers in
    ``gtrcoglist``. The form is assembled from the two template parts returned
    by ``load_searchgtr_search_form_template`` with the gene id and amino acid
    sequence in between, and is written into the ``html`` subdirectory of
    ``options.full_outputfolder_path`` (created on demand).

    Side effects: resets ``options.searchgtr_links`` to a dict mapping
    ``"<record id>_<gene id>"`` to the relative path of the written form.
    """
    # smCOG identifiers for which a form is generated
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    searchgtrformtemplateparts = load_searchgtr_search_form_template()
    options.searchgtr_links = {}
    for seq_record in seq_records:
        smcogdict, _ = utils.get_smcog_annotations(seq_record)
        for feature in utils.get_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            if gene_id not in smcogdict:
                continue
            if smcogdict[gene_id] not in gtrcoglist:
                continue
            html_dir = options.full_outputfolder_path + os.sep + "html"
            if not os.path.exists(html_dir):
                os.mkdir(html_dir)
            form_name = gene_id + "_searchgtr.html"
            formfileloc = html_dir + os.sep + form_name
            link_loc = "html" + os.sep + form_name
            options.searchgtr_links[seq_record.id + "_" + gene_id] = link_loc
            # `with` guarantees the handle is closed even if a write fails
            with open(formfileloc, "w") as formfile:
                formfile.write(
                    searchgtrformtemplateparts[0].replace("GlycTr", gene_id))
                formfile.write(
                    "%s\n%s" % (gene_id, utils.get_aa_sequence(feature)))
                formfile.write(searchgtrformtemplateparts[1])
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene drawing and annotation data for one gene cluster.

    Returns an 11-tuple:
    ``(clustergenes, clustertype, annotations, colors, starts, ends, strands,
    pksnrpsprots, gtrs, transporters, clustersize)``.

    * ``clustergenes`` -- gene ids of the CDS features in the cluster
    * ``annotations`` -- gene id -> first ``product`` qualifier value, or
      ``'Unannotated gene'`` when the qualifier is missing
    * ``colors`` -- ``"#810E15"`` for genes returned by
      ``utils.get_secmet_cds_features``, ``"grey"`` otherwise
    * ``strands`` -- ``"-"`` for strand ``-1``, ``"+"`` for anything else
    * ``gtrs`` / ``transporters`` -- genes whose first ``smcogdict`` entry is
      in ``gtrcoglist`` / ``transportercoglist``
    * ``clustersize`` -- ``max(ends) - min(starts)``; ``0`` when the cluster
      has no CDS features (previously this raised ``ValueError``)
    """
    # Sets give O(1) membership tests inside the per-gene loop
    allcoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record))
    pksnrpscoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record))
    feature_by_id = utils.get_feature_dict(seq_record)
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(cluster, seq_record)
    ]
    clustertype = utils.get_cluster_type(cluster)

    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for gene in clustergenes:
        cdsfeature = feature_by_id[gene]
        if 'product' in cdsfeature.qualifiers:
            annotations[gene] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[gene] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        strands.append("-" if cdsfeature.strand == -1 else "+")
        colors.append("#810E15" if gene in allcoregenes else "grey")
        if gene in pksnrpscoregenes:
            pksnrpsprots.append(gene)
        if gene in smcogdict and len(smcogdict[gene]) > 0:
            smcog = smcogdict[gene][0]
            if smcog in gtrcoglist:
                gtrs.append(gene)
            if smcog in transportercoglist:
                transporters.append(gene)

    # Guard against clusters without CDS features
    clustersize = max(ends) - min(starts) if clustergenes else 0
    return (clustergenes, clustertype, annotations, colors, starts, ends,
            strands, pksnrpsprots, gtrs, transporters, clustersize)
def match_exp_to_genes(features, geo_dataset):
    """Map gene ids of ``features`` to their expression data in ``geo_dataset``.

    ``geo_dataset["info"]["col_id"]`` selects the column of each data row's
    first element that holds the gene id; a negative value means no such
    column was found and an empty dict is returned. Gene ids are compared
    case-insensitively.

    Returns a dict ``gene_id -> {"ref": ..., "evalue": -1.0, "exp": {...}}``
    where ``"exp"`` is a copy of the sample -> value mapping stored as the
    second element of the referenced data row.
    """
    geo_data = geo_dataset["data"]
    id_column = geo_dataset["info"]["col_id"]
    if id_column < 0:
        # gene_id column not found
        return {}

    # upper-cased gene id -> reference id of the data row
    ref_by_gene = {}
    for ref_id, row in geo_data.items():
        ref_by_gene[row[0][id_column].upper()] = ref_id

    matched = {}
    for feature in features:
        gene_id = utils.get_gene_id(feature)
        lookup_key = gene_id.upper()
        if lookup_key not in ref_by_gene:
            continue
        matched[gene_id] = {
            "ref": ref_by_gene[lookup_key],
            "evalue": float(-1),
        }

    # attach the expression values of the referenced row to every match
    for entry in matched.values():
        entry["exp"] = dict(geo_data[entry["ref"]][1].items())
    return matched
def getECs(seq_record, options):
    """Annotate CDS features with EC numbers obtained via KEGG queries.

    Does nothing unless the module-level ``name`` is listed in
    ``options.ecpred``. For every key of the EC dictionary built by
    ``_get_ECNumberDict`` that has at least one EC number, a note is appended
    to the feature's ``note`` qualifier and its ``EC_number`` qualifier is
    set, with a warning logged when an existing ``EC_number`` is replaced.
    Keys without EC numbers only produce a warning.
    """
    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning...", name)
        return
    CDSFeatureDict = utils.get_feature_dict(seq_record)
    logging.debug("Predicting EC numbers using KEGG online queries")
    KEGGspeciesLocusTagDict = _getKEGG_speciesLocusTag(CDSFeatureDict)
    ECDict = _get_ECNumberDict(KEGGspeciesLocusTagDict)
    for key in ECDict:
        Feature = CDSFeatureDict[key]
        ec_numbers = ECDict[key]
        if len(ec_numbers) == 0:
            logging.warning(
                'ECpredictor[KEGG]: Could not find EC number for %s',
                utils.get_gene_id(Feature))
            continue
        # Start from this feature's own notes. The list used to be created
        # once before the loop, so a feature without a 'note' qualifier
        # inherited (and shared) the notes list of the previous feature.
        if 'note' in Feature.qualifiers:
            notes = Feature.qualifiers['note']
        else:
            notes = []
        logging.debug("Found EC numbers: %s", ", ".join(ec_numbers))
        notes.append(
            'EC number prediction based on KEGG query: %s' % ec_numbers)
        Feature.qualifiers['note'] = notes
        if 'EC_number' in Feature.qualifiers:
            logging.warning(
                'ECpredictor[kegg]: Overwriting existing EC annotation: %s with %s',
                ", ".join(Feature.qualifiers['EC_number']),
                ", ".join(ec_numbers))
        Feature.qualifiers['EC_number'] = ec_numbers
def convert_cds_features(record, features, annotations, options):
    """Turn CDS SeqFeatures into a list of JSON-serialisable ORF dicts.

    Each dict holds ``start`` (location start + 1), ``end``, ``strand``
    (``1`` when the feature has none), ``locus_tag``, ``type``,
    ``description`` and ``domains`` -- the query ids of the feature's entries
    in ``options.hmm_results``, ordered by descending bitscore. When
    ``options.coexpress`` is truthy a ``geo`` entry is added as well.
    """
    js_orfs = []
    for feature in features:
        begin = int(feature.location.start) + 1
        finish = int(feature.location.end)
        if begin > finish:
            # Fix for files that have their coordinates the wrong way around
            begin, finish = finish, begin
        strand = 1 if feature.strand is None else feature.strand
        locus_tag = utils.get_gene_id(feature)
        orf_type = get_biosynthetic_type(feature, annotations)
        description = utils.ascii_string(
            get_description(record, feature, orf_type, options))

        # hmm results are keyed by "<record id with ':' replaced>:<locus tag>"
        hmm_key = ("%s:" % record.id.replace(":", "_")) + locus_tag
        domains = []
        if hmm_key in options.hmm_results:
            ranked = sorted(options.hmm_results[hmm_key],
                            key=lambda hit: hit.bitscore, reverse=True)
            domains = [hit.query_id for hit in ranked]

        js_orf = {
            'start': begin,
            'end': finish,
            'strand': strand,
            'locus_tag': locus_tag,
            'type': orf_type,
            'description': description,
            'domains': domains,
        }
        if options.coexpress:
            js_orf['geo'] = utils.parse_geo_feature(feature)
        js_orfs.append(js_orf)
    return js_orfs
def test_get_gene_id_no_id(self):
    """utils.get_gene_id() yields 'no_tag_found' for a feature without qualifiers set here"""
    feature = FakeFeature("CDS")
    self.assertEqual(utils.get_gene_id(feature), 'no_tag_found')
def write_gene(txt, info, options):
    """Write the tab-separated gene table to the open file ``txt``.

    Columns: gene ID, gene start, gene end, gene strand, smCOG, locus_tag,
    annotation. One row is written for every CDS feature of every cluster
    number in ``info.clusternrs``. The smCOG column is always empty, and the
    strand is ``"+"`` for strand ``1`` and ``"-"`` for anything else.
    """
    header = ("gene ID", "gene start", "gene end", "gene strand", "smCOG",
              "locus_tag", "annotation")
    txt.write("\t".join(header) + "\n")
    for cluster_nr in info.clusternrs:
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, cluster_nr)
        for cds in utils.get_cluster_cds_features(cluster_feature,
                                                  info.seq_record):
            row = (
                # ids are cut at the first "." (version suffix dropped)
                utils.get_gene_acc(cds).partition(".")[0],
                str(cds.location.start),
                str(cds.location.end),
                "+" if cds.strand == 1 else "-",
                "",  # smCOG: not used for now
                utils.get_gene_id(cds).partition(".")[0],
                utils.get_gene_annotation(cds),
            )
            txt.write("\t".join(row) + "\n")
def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict,
                                       seq_record, options):
    """Extract PKS_KR domain sequences and run the KR analysis on them.

    Every ``"PKS_KR"`` entry in ``domaindict`` of each core gene yields a name
    ``<locus>_KR<n>`` (n counted per gene, starting at 1) and the matching
    slice of the gene's amino acid sequence. When at least one domain was
    found, the sequences are written to ``ctg<record_idx>_krseqs.fasta`` in
    ``options.raw_predictions_outputfolder`` and ``kr_analysis.run_kr_analysis``
    is run from inside a temporary directory.

    Returns ``(krnames, krseqs)``.
    """
    logging.info("Predicting PKS KR activity and stereochemistry using KR "
                 "fingerprints from Starcevic et al.")
    krnames, krseqs = [], []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        kr_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "PKS_KR":
                continue
            kr_count += 1
            begin = int(domain[1])
            stop = int(domain[2])
            krseqs.append(str(utils.get_aa_sequence(feature))[begin:stop])
            krnames.append(locus + "_KR" + str(kr_count))

    if krnames:
        file_prefix = (options.raw_predictions_outputfolder + os.sep + "ctg"
                       + str(options.record_idx))
        fasta_path = file_prefix + "_krseqs.fasta"
        utils.writefasta(krnames, krseqs, fasta_path)
        with TemporaryDirectory(change=True):
            kr_analysis.run_kr_analysis(fasta_path,
                                        file_prefix + "_krpredoutput.txt")
    return krnames, krseqs
def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record,
                                 options):
    """Extract CAL domain sequences and run the Minowa CAL predictor on them.

    Every ``"CAL_domain"`` entry in ``domaindict`` of each core gene yields a
    name ``<locus>_CAL<n>`` (n counted per gene, starting at 1) and the
    matching slice of the gene's amino acid sequence. When at least one domain
    was found, the sequences are written to ``ctg<record_idx>_calseqs.fasta``
    in ``options.raw_predictions_outputfolder`` and
    ``minowa_CAL.run_minowa_cal`` is run from inside a temporary directory.

    Returns ``(calnames, calseqs)``.
    """
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method"
    )
    calnames, calseqs = [], []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        cal_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "CAL_domain":
                continue
            cal_count += 1
            begin = int(domain[1])
            stop = int(domain[2])
            calseqs.append(str(utils.get_aa_sequence(feature))[begin:stop])
            calnames.append(locus + "_CAL" + str(cal_count))

    if calnames:
        file_prefix = (options.raw_predictions_outputfolder + os.sep + "ctg"
                       + str(options.record_idx))
        fasta_path = file_prefix + "_calseqs.fasta"
        utils.writefasta(calnames, calseqs, fasta_path)
        with TemporaryDirectory(change=True):
            minowa_CAL.run_minowa_cal(
                fasta_path, file_prefix + "_minowa_calpredoutput.txt")
    return calnames, calseqs
def find_cluster_modular_enzymes(clusterpksnrpsgenes, pksnrpsvars):
    """Count PKS, NRPS and hybrid genes among the given cluster genes.

    Classification uses the type string in ``pksnrpsvars.nrpspkstypedict``:

    * contains "PKS" but not "NRPS" -> PKS gene
    * contains "NRPS" but not "PKS" -> NRPS gene
    * contains "PKS/NRPS" -> decided by the gene's domain names: PKS when it
      has PKS_KS/PKS_AT and none of AMP-binding/A-OX/Condensation, NRPS in the
      reverse situation, otherwise not counted
    * any other string containing both "PKS" and "NRPS" -> hybrid gene

    Returns ``(pksgenes, clusterpksgenes, nrpsgenes, hybridgenes)`` where
    ``clusterpksgenes`` lists the ids of the genes counted as PKS.
    """
    gene_names = [utils.get_gene_id(feature) for feature in clusterpksnrpsgenes]
    pks_total = 0
    nrps_total = 0
    hybrid_total = 0
    pks_gene_names = []
    for gene_name in gene_names:
        gene_type = pksnrpsvars.nrpspkstypedict[gene_name]
        mentions_pks = "PKS" in gene_type
        mentions_nrps = "NRPS" in gene_type
        if mentions_pks and not mentions_nrps:
            pks_total += 1
            pks_gene_names.append(gene_name)
        elif mentions_nrps and not mentions_pks:
            nrps_total += 1
        elif "PKS/NRPS" in gene_type:
            domains = pksnrpsvars.domainnamesdict[gene_name]
            has_pks_domain = "PKS_KS" in domains or "PKS_AT" in domains
            has_nrps_domain = ("AMP-binding" in domains
                               or "A-OX" in domains
                               or "Condensation" in domains)
            if has_pks_domain and not has_nrps_domain:
                pks_total += 1
                pks_gene_names.append(gene_name)
            elif has_nrps_domain and not has_pks_domain:
                nrps_total += 1
        elif mentions_pks and mentions_nrps:
            hybrid_total += 1
    return pks_total, pks_gene_names, nrps_total, hybrid_total
def update_dist_between_clusters(seq_records, all_gene_expressions, geo):
    """Fill in expression distances between genes that lie within clusters.

    Collects the expression entries of all within-cluster CDS features over
    all records, then visits every ordered pair of distinct genes:

    * if either gene already lists a distance to the other, the missing
      direction is copied so the distance is known both ways;
    * otherwise the correlation from ``calc_correlation_value`` (capped at
      1.0) is computed and, when it is at least 0.9 and below 1.0, stored in
      both directions as ``100.0 * (1.0 - correlation)``.

    Finally ``update_features`` is called for the within-cluster CDS features
    of every record.
    """
    cluster_genes = {}
    for seq_record in seq_records:
        record_expressions = all_gene_expressions[seq_record.id]
        for feature in utils.get_withincluster_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            if gene_id in record_expressions:
                cluster_genes[gene_id] = record_expressions[gene_id]

    for first in cluster_genes:
        for second in cluster_genes:
            if first == second:
                continue
            first_dist = cluster_genes[first]["dist"]
            second_dist = cluster_genes[second]["dist"]
            first_in_second = first in second_dist
            second_in_first = second in first_dist
            if first_in_second or second_in_first:
                # distance known in at least one direction: mirror it
                if not first_in_second:
                    second_dist[first] = first_dist[second]
                if not second_in_first:
                    first_dist[second] = second_dist[first]
                continue
            correlation = min(
                1.00,
                calc_correlation_value(cluster_genes[first],
                                       cluster_genes[second]))
            if 0.9 <= correlation < 1.00:
                first_dist[second] = 100.0 * (1.0 - correlation)
                second_dist[first] = 100.0 * (1.0 - correlation)

    for seq_record in seq_records:
        update_features(utils.get_withincluster_cds_features(seq_record),
                        cluster_genes, geo)
def generate_details_div(cluster, seq_record, options, js_domains, details=None):
    """Build the "Detailed annotation" div and collect domain data for a cluster.

    Returns ``details`` unchanged when no cluster with number
    ``cluster['idx']`` exists. Otherwise a new details div (with an ``h3``
    header) is created if none was passed in. For every CDS feature with a
    ``sec_met`` qualifier, the ``'NRPS/PKS Domain:'`` entries are parsed with
    ``_parse_domain``; features with at least one non-empty domain are
    recorded. If any were recorded, a ``details-svg`` div is appended to
    ``details`` and the collected data is appended to ``js_domains``.
    """
    cluster_idx = cluster['idx']
    cluster_rec = utils.get_cluster_by_nr(seq_record, cluster_idx)
    if cluster_rec is None:
        return details

    if details is None:
        details = pq('<div>')
        details.addClass('details')
        header = pq('<h3>')
        header.text('Detailed annotation')
        details.append(header)

    orfs = []
    for feature in utils.get_cluster_cds_features(cluster_rec, seq_record):
        qualifiers = feature.qualifiers
        if 'sec_met' not in qualifiers:
            continue
        # prefer the stored translation over translating the feature again
        if 'translation' in qualifiers:
            sequence = qualifiers['translation'][0]
        else:
            sequence = str(utils.get_aa_sequence(feature))
        gene_id = utils.get_gene_id(feature)

        domains = []
        for qual in qualifiers['sec_met']:
            if qual.startswith('NRPS/PKS Domain:'):
                js_domain = _parse_domain(qual, feature, seq_record)
                if len(js_domain) > 0:
                    domains.append(js_domain)
        if domains:
            orfs.append({
                'id': gene_id,
                'sequence': sequence,
                'domains': domains,
            })

    if orfs:
        js_cluster_domains = {
            'id': "cluster-%s-details" % cluster_idx,
            'orfs': orfs,
        }
        details_svg = pq('<div>')
        details_svg.addClass('details-svg')
        details_svg.attr('id', '%s-svg' % js_cluster_domains['id'])
        details.append(details_svg)
        js_domains.append(js_cluster_domains)
    return details
def result_vec_to_features(orig_feature, res_vec):
    """Create leader and core peptide ``CDS_motif`` features for a prediction.

    The leader feature spans from the original feature's start to
    ``start + res_vec.end * 3``; the core feature spans from there to the
    original feature's end. Both inherit the strand and carry the original
    gene id as ``locus_tag``. Prediction details are stored as ``note``
    qualifiers.

    Returns ``[leader_feature, core_feature]``.
    """
    strand = orig_feature.location.strand
    # NOTE(review): the split point is always measured from location.start,
    # regardless of strand -- confirm this is intended for reverse-strand genes.
    leader_start = orig_feature.location.start
    split_point = orig_feature.location.start + (res_vec.end * 3)

    leader_feature = SeqFeature(
        FeatureLocation(leader_start, split_point, strand=strand),
        type='CDS_motif')
    leader_feature.qualifiers['note'] = [
        'leader peptide',
        'predicted leader seq: %s' % res_vec.leader,
    ]
    leader_feature.qualifiers['locus_tag'] = [utils.get_gene_id(orig_feature)]

    core_feature = SeqFeature(
        FeatureLocation(split_point, orig_feature.location.end, strand=strand),
        type='CDS_motif')
    core_notes = [
        'core peptide',
        'monoisotopic mass: %0.1f' % res_vec.monoisotopic_mass,
        'molecular weight: %0.1f' % res_vec.molecular_weight,
    ]
    if res_vec.alternative_weights:
        formatted = ["%0.1f" % weight
                     for weight in res_vec.alternative_weights]
        core_notes.append('alternative weights: %s' % "; ".join(formatted))
    core_notes.append('number of bridges: %s' % res_vec.number_of_lan_bridges)
    core_notes.append('predicted core seq: %s' % res_vec.core)
    core_notes.append('predicted class: %s' % res_vec.lantype)
    core_notes.append('score: %0.2f' % res_vec.score)

    # optional modifications, reported in this fixed order
    modifications = (
        (res_vec.aminovinyl_group, 'AviCys'),
        (res_vec.chlorinated, 'Cl'),
        (res_vec.oxygenated, 'OH'),
        (res_vec.lactonated, 'Lac'),
    )
    for present, label in modifications:
        if present:
            core_notes.append('predicted additional modification: %s' % label)

    core_feature.qualifiers['note'] = core_notes
    core_feature.qualifiers['locus_tag'] = [utils.get_gene_id(orig_feature)]
    return [leader_feature, core_feature]
def test_get_gene_id_locus_tag(self):
    """utils.get_gene_id() yields the locus_tag qualifier value"""
    feature = FakeFeature("CDS")
    feature.qualifiers['locus_tag'] = ['test_tag']
    self.assertEqual(utils.get_gene_id(feature), 'test_tag')
def _get_transatpks_geneclusters(pksnrpsvars, seq_record):
    """Map each "transatpks" cluster number to the gene ids of its PKS/NRPS genes.

    Cluster features are de-duplicated before processing; the genes per
    cluster come from ``find_clusterpksnrpsgenes`` applied to
    ``pksnrpsvars.pksnrpscoregenes``.
    """
    unique_clusters = list(
        set(utils.get_cluster_features_of_type(seq_record, "transatpks")))
    genes_in_cluster = {}
    for cluster in unique_clusters:
        cluster_nr = utils.get_cluster_number(cluster)
        core_genes = find_clusterpksnrpsgenes(cluster,
                                              pksnrpsvars.pksnrpscoregenes)
        genes_in_cluster[cluster_nr] = [
            utils.get_gene_id(gene) for gene in core_genes
        ]
    return genes_in_cluster
def test_get_gene_id_protein_id(self):
    """utils.get_gene_id() yields the protein_id qualifier value when only that is set"""
    feature = FakeFeature("CDS")
    feature.qualifiers['protein_id'] = ['test_id']
    self.assertEqual(utils.get_gene_id(feature), 'test_id')
def test_get_gene_id_gene(self):
    """utils.get_gene_id() yields the gene qualifier value when only that is set"""
    feature = FakeFeature("CDS")
    feature.qualifiers['gene'] = ['test_gene']
    self.assertEqual(utils.get_gene_id(feature), 'test_gene')
def generate_domainnamesdict(pksnrpsvars):
    """Store a gene id -> list of domain names mapping on ``pksnrpsvars``.

    For every feature in ``pksnrpsvars.pksnrpscoregenes`` the first element of
    each entry in ``pksnrpsvars.domaindict[<gene id>]`` is taken as the domain
    name. The result replaces ``pksnrpsvars.domainnamesdict``.
    """
    names_by_locus = {}
    pksnrpsvars.domainnamesdict = names_by_locus
    for feature in pksnrpsvars.pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        names_by_locus[locus] = [
            domain[0] for domain in pksnrpsvars.domaindict[locus]
        ]
def add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict):
    """Mark untyped within-cluster genes that carry NRPS/PKS domains as "other".

    A within-cluster CDS qualifies when it has an entry in ``results_by_id``,
    its ``typedict`` value is ``"none"`` and at least one of its results has a
    ``query_id`` from the NRPS/PKS domain list below. For each qualifying CDS
    ``_update_sec_met_entry`` is called with type ``"other"``.
    """
    nrpspksdomains = frozenset((
        "PKS_KS", "PKS_AT", "ATd", "ene_KS", "mod_KS", "hyb_KS", "itr_KS",
        "tra_KS", "Condensation", "AMP-binding", "A-OX",
    ))
    # Select candidates first, update afterwards (same two phases as before)
    candidates = []
    for cds in utils.get_withincluster_cds_features(seq_record):
        gene_id = utils.get_gene_id(cds)
        # `in` replaces dict.has_key(), which no longer exists in Python 3
        if gene_id in results_by_id and typedict[gene_id] == "none":
            candidates.append((cds, gene_id))

    for cds, gene_id in candidates:
        results = results_by_id[gene_id]
        if nrpspksdomains.intersection(res.query_id for res in results):
            _update_sec_met_entry(cds, results, "other", nseqdict)
def count_pks_genes(pksnrpscoregenes, domaindict, seq_record):
    """Count PKS_AT, CAL_domain and PKS_KR domains over all given core genes.

    ``seq_record`` is accepted but not used.
    """
    counted_domains = ("PKS_AT", "CAL_domain", "PKS_KR")
    pkscount = 0
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        pkscount += sum(
            1 for domain in domaindict[locus] if domain[0] in counted_domains)
    return pkscount
def find_colinear_order(clusterpksnrpsgenes, seq_record, domainnamesdict):
    """Return the gene ids of the cluster's PKS/NRPS genes in colinear order.

    The genes are assumed to act in genomic order. The order is reversed when
    more genes lie on the reverse strand than on the forward strand, and
    reversed (again) when the resulting first gene has more than one domain
    and one of them is "Thioesterase" or "TD".

    Note: ``clusterpksnrpsgenes`` is reversed in place, so the caller's list
    reflects the chosen order afterwards. An empty list yields ``[]``
    (previously this raised ``IndexError``).
    """
    if not clusterpksnrpsgenes:
        return []

    feature_by_id = utils.get_feature_dict(seq_record)
    # Majority vote over the strands of all genes
    direction = 0
    for feature in clusterpksnrpsgenes:
        strand = feature_by_id[utils.get_gene_id(feature)].strand
        if strand == 1:
            direction += 1
        elif strand == -1:
            direction -= 1
    if direction < 0:
        clusterpksnrpsgenes.reverse()

    # Reverse if first gene encodes a multidomain protein with a TE/TD domain
    first_domains = domainnamesdict[utils.get_gene_id(clusterpksnrpsgenes[0])]
    if "Thioesterase" in first_domains or "TD" in first_domains:
        if len(first_domains) > 1:
            clusterpksnrpsgenes.reverse()

    return [utils.get_gene_id(feature) for feature in clusterpksnrpsgenes]
def run_lantipred(seq_record, query, lant_class):
    """Predict the leader/core cleavage site of a lanthipeptide precursor.

    The amino acid sequence of ``query`` is searched with the HMM profile
    belonging to ``lant_class`` ('Class-I', 'Class-II' or 'Class-III').

    Returns ``None`` when no cleavage site is predicted, when the score is
    below ``thresh_dict[lant_class]`` or when the predicted core contains no
    cysteine. Otherwise returns the prediction result with ``leader`` and
    ``core`` set, after making sure the ``sec_met`` qualifier of ``query``
    contains a ``'Kind: biosynthetic'`` entry.
    """
    hmmer_profiles = {
        'Class-I': 'class1.hmm',
        'Class-II': 'class2.hmm',
        'Class-III': 'class3.hmm',
    }
    gene_id = utils.get_gene_id(query)
    query_sequence = utils.get_aa_sequence(query, to_stop=True)
    lan_a_fasta = ">%s\n%s" % (gene_id, query_sequence)

    # Run the sequence against the class profile
    profile = utils.get_full_path(__file__, hmmer_profiles[lant_class])
    result = predict_cleavage_site(profile, lan_a_fasta)
    if result is None:
        logging.debug('%r: No cleavage site predicted', gene_id)
        return

    if thresh_dict[lant_class] > result.score:
        logging.debug('%r: Score %0.2f below threshold %0.2f for class %r',
                      gene_id, result.score, thresh_dict[lant_class],
                      lant_class)
        return

    # The cleavage position splits the precursor into leader and core
    result.leader = query_sequence[:result.end]
    result.core = query_sequence[result.end:]
    if result.core.find('C') < 0:
        logging.debug(
            '%r: No Cysteine residues found in core, false positive',
            gene_id)
        return

    if 'sec_met' not in query.qualifiers:
        query.qualifiers['sec_met'] = []
    sec_met = query.qualifiers['sec_met']
    # The old check searched the ';'-joined list for ';Kind: biosynthetic',
    # which cannot match when that entry comes first, so a duplicate was
    # appended. Check every entry instead.
    if not any(entry.startswith('Kind: biosynthetic') for entry in sec_met):
        sec_met.append('Kind: biosynthetic')
    return result
def _annotate(geneclustergenes, smcogvars, options): #Annotate smCOGS in CDS features for feature in geneclustergenes: gene_id = utils.get_gene_id(feature) if smcogvars.smcogdict.has_key(gene_id): detailslist = smcogvars.smcogdict[gene_id] if not feature.qualifiers.has_key('note'): feature.qualifiers['note'] = [] if len(detailslist) > 0: feature.qualifiers['note'].append("smCOG: " + detailslist[0][0] + " (Score: " + str(detailslist[0][4]) + "; E-value: " + str(detailslist[0][3]) + ");") if smcogvars.smcogtreedict.has_key(gene_id): if not feature.qualifiers.has_key('note'): feature.qualifiers['note'] = [] feature.qualifiers['note'].append("smCOG tree PNG image: smcogs/%s" % smcogvars.smcogtreedict[gene_id])
def find_col_id(geo_dataset, seq_records):
    """Determine which data column of ``geo_dataset`` holds the gene ids.

    For datasets of type ``"CSV"`` the column is always 0. Otherwise the first
    column (rows and columns visited in order) whose value equals, ignoring
    case, the gene id of any CDS feature in ``seq_records`` is used; ``-1``
    is stored when no column matches.

    The result is written to ``geo_dataset["info"]["col_id"]`` and the same
    ``geo_dataset`` object is returned.
    """
    info = geo_dataset["info"]
    if info["type"] == "CSV":
        info["col_id"] = 0
        return geo_dataset

    # Collect the gene ids once instead of re-reading every feature for each
    # column of each row.
    gene_ids = set()
    for seq_record in seq_records:
        for feature in utils.get_cds_features(seq_record):
            gene_ids.add(utils.get_gene_id(feature).upper())

    if gene_ids:
        for data in geo_dataset["data"].values():
            for column, value in enumerate(data[0]):
                if value.upper() in gene_ids:
                    info["col_id"] = column
                    return geo_dataset

    info["col_id"] = -1
    return geo_dataset
def getECs(seq_record, options):
    """Annotate CDS features with EC numbers predicted by EFICAz.

    Does nothing unless the module-level ``name`` is listed in
    ``options.ecpred``. ``options.cpus`` defaults to 1 when not set.

    Per CDS feature:

    * a four-digit prediction replaces any existing ``EC_number`` qualifier
      (with a warning) and adds an "EC4" note;
    * a three-digit prediction is used only when no four-digit prediction
      exists; it replaces an existing ``EC_number`` only when that holds no
      four-digit EC number, and adds an "EC3" note when info is available.

    The ``note`` qualifier is (re)assigned for every feature, even when no
    note was added.
    """
    logging.debug("Predicting EC numbers with EFICAz")
    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning...", name)
        return
    if 'cpus' not in options:
        options.cpus = 1

    EFICAzECs = EFICAzECPrediction(seq_record, options)
    EFICAzECs.runECpred()
    logging.debug("Found %s predictions for EC4",
                  len(EFICAzECs.getEC4Dict().keys()))

    for feature in utils.get_cds_features(seq_record):
        featureID = utils.get_gene_id(feature)
        qualifiers = feature.qualifiers
        notes = qualifiers['note'] if 'note' in qualifiers else []

        ec4 = EFICAzECs.getEC4(featureID)
        if ec4:
            logging.debug("Annotating %s", featureID)
            if 'EC_number' in qualifiers:
                logging.warning(
                    'ECpredictor[eficaz]: Overwriting existing EC annotation: %s with %s',
                    ", ".join(qualifiers['EC_number']), ", ".join(ec4))
            qualifiers['EC_number'] = ec4
            notes.append("EFICAz EC number prediction: EC4: {0}; {1}".format(
                ", ".join(ec4), "; ".join(EFICAzECs.getEC4Info(featureID))))

        # Only annotate 3 digit EC if no 4 digit EC is available
        ec3 = EFICAzECs.getEC3(featureID)
        if ec3 and not ec4:
            if 'EC_number' in qualifiers:
                # keep an existing annotation that already has four digits
                if not re.search(r"\d+\.\d+\.\d+\.\d+",
                                 " ".join(qualifiers['EC_number'])):
                    logging.warning(
                        'ECpredictor[eficaz]: Overwriting existing EC annotation: %s with %s',
                        ", ".join(qualifiers['EC_number']), ", ".join(ec3))
                    qualifiers['EC_number'] = ec3
            ec3_info = EFICAzECs.getEC3Info(featureID)
            if ec3_info:
                notes.append(
                    "EFICAz EC number prediction: EC3: {0}; {1}".format(
                        ", ".join(ec3), "; ".join(ec3_info)))
            if 'EC_number' not in qualifiers:
                qualifiers['EC_number'] = ec3

        qualifiers['note'] = notes
    logging.debug("Finished EC number prediction with EFICAz")
def write(seq_records, options):
    """Export per-record information as tab-separated TXT tables.

    Nothing is written for protein input (``options.input_type == 'prot'``).
    Otherwise a ``txt`` subdirectory is created in
    ``options.full_outputfolder_path`` and, for every record with at least one
    cluster feature, one ``<record id>_<table>.txt`` file per table is written
    by the matching ``write_tables.write_<table>`` function. The record id is
    cut at its first ".".
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return

    # Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)

    tables = ("genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS",
              "smCOG", "RiPP", "transltable")

    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        if len(clusters) == 0:
            continue
        record_prefix = seq_record.id.partition(".")[0]
        txt_files = {}
        # try/finally closes every opened file even when gathering data or
        # one of the table writers raises.
        try:
            for table in tables:
                txt_files[table] = open(
                    path.join(txt_outfolder,
                              "%s_%s.txt" % (record_prefix, table)),
                    "w")

            # Gather all information
            info = utils.Storage()
            info.clustertypes = {}
            info.clustergenes = {}
            info.accessions = {}
            info.cdsmotifs = {}
            info.clusternrs = []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record

            # Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(
                    txt_files[table], info, options)
        finally:
            for handle in txt_files.values():
                handle.close()
def convert_cds_features(record, features, annotations, options):
    """Turn CDS SeqFeatures into a list of JSON-serialisable ORF dicts.

    Each dict holds ``start`` (location start + 1), ``end``, ``strand``
    (``1`` when the feature has none), ``locus_tag``, ``type`` and
    ``description``.
    """
    js_orfs = []
    for feature in features:
        begin = int(feature.location.start) + 1
        finish = int(feature.location.end)
        if begin > finish:
            # Fix for files that have their coordinates the wrong way around
            begin, finish = finish, begin
        strand = 1 if feature.strand is None else feature.strand
        locus_tag = utils.get_gene_id(feature)
        orf_type = get_biosynthetic_type(feature, annotations)
        description = utils.ascii_string(
            get_description(record, feature, orf_type, options))
        js_orfs.append({
            'start': begin,
            'end': finish,
            'strand': strand,
            'locus_tag': locus_tag,
            'type': orf_type,
            'description': description,
        })
    return js_orfs
def retrieve_pksnrps_info(seq_record, geneclusternr, pksnrpsprots):
    """Collect domain layout and substrate predictions for PKS/NRPS proteins.

    For every gene id in ``pksnrpsprots`` the domains from
    ``utils.get_nrpspks_domain_dict`` are numbered per domain name
    (``<name><n>``, n starting at 1) and the stored predictions from
    ``utils.get_nrpspks_substr_spec_preds`` are looked up for A
    (AMP-binding / A-OX), AT, CAL and KR domains under the key
    ``<gene id>_<A|AT|CAL|KR><n>``.

    Returns an 8-tuple:
    ``(pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
    substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict,
    krpredictionsdict, structpred)`` where ``pksnrpsdomains`` maps a gene id to
    ``[domain name list, {domain name: [second, third element of the entry]}]``.
    """
    pksnrpsprotsnames = [
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record)
    ]
    domaindict = utils.get_nrpspks_domain_dict(seq_record)
    substr_spec_preds = utils.get_nrpspks_substr_spec_preds(seq_record)
    pksnrpsdomains = {}
    substrspecnrpspredictordict = {}
    substrspecminowadict = {}
    substrspecpkssigdict = {}
    substrspecconsensusdict = {}
    krpredictionsdict = {}
    for i in pksnrpsprots:
        domlist = []
        domsdetails = {}
        for j in domaindict[i]:
            # first free running number for this domain name within the gene
            nr = 1
            while j[0] + str(nr) in domlist:
                nr += 1
            domname = j[0] + str(nr)
            domlist.append(domname)
            domsdetails[domname] = [j[1], j[2]]
            if "AMP-binding" in domname or "A-OX" in domname:
                key = i + "_A" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_nrps_preds[key]
                substrspecnrpspredictordict[key] = [
                    substr_spec_preds.nrps_code_preds[key],
                    substr_spec_preds.nrps_svm_preds[key],
                ]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_AT" in domname:
                key = i + "_AT" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_pks_preds[key]
                substrspecpkssigdict[key] = substr_spec_preds.pks_code_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            # The CAL_domain lookup used to be written out twice; doing it
            # once stores exactly the same values.
            if "CAL_domain" in domname:
                key = i + "_CAL" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_cal_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_KR" in domname:
                key = i + "_KR" + str(nr)
                krpredictionsdict[key] = [
                    substr_spec_preds.kr_activity_preds[key],
                    substr_spec_preds.kr_stereo_preds[key],
                ]
        pksnrpsdomains[i] = [domlist, domsdetails]
    structpred = utils.get_structure_pred(
        utils.get_cluster_by_nr(seq_record, geneclusternr))
    return (pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
            substrspecminowadict, substrspecpkssigdict,
            substrspecconsensusdict, krpredictionsdict, structpred)
def extract_pks_genes(pksnrpscoregenes, domaindict, seq_record):
    """Extract the sequences of all PKS_AT domains of the given core genes.

    Every ``"PKS_AT"`` entry in ``domaindict`` of a gene yields the name
    ``<locus>_AT<n>`` (n counted per gene, starting at 1) and the matching
    slice of the gene's amino acid sequence. ``seq_record`` is accepted but
    not used.

    Returns ``(pksnames, pksseqs)``.
    """
    pksnames, pksseqs = [], []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        at_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "PKS_AT":
                continue
            at_count += 1
            begin = int(domain[1])
            stop = int(domain[2])
            pksseqs.append(str(utils.get_aa_sequence(feature))[begin:stop])
            pksnames.append(locus + "_AT" + str(at_count))
    return pksnames, pksseqs