def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict, seq_record, options):
    """Predict PKS KR domain activity and stereochemistry.

    Extracts the sequence of every PKS_KR domain from the core genes and runs
    the KR fingerprint analysis published by Starcevic et al. (ClustScan).

    Returns a tuple (krnames, krseqs) of extracted domain names and sequences.
    """
    krnames = []
    krseqs = []
    logging.info("Predicting PKS KR activity and stereochemistry using KR "
                 "fingerprints from Starcevic et al.")
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        nr = 0
        for tab in domaindict[locus]:
            if tab[0] != "PKS_KR":
                continue
            nr += 1
            start = int(tab[1])
            end = int(tab[2])
            seq = str(utils.get_aa_sequence(feature))[start:end]
            krnames.append(locus + "_KR" + str(nr))
            krseqs.append(seq)
    if krnames:
        # Build the shared output-file prefix once instead of repeating the
        # same os.sep concatenation for every file name.
        prefix = os.path.join(options.raw_predictions_outputfolder,
                              "ctg" + str(options.record_idx))
        fasta_file = prefix + "_krseqs.fasta"
        utils.writefasta(krnames, krseqs, fasta_file)
        with TemporaryDirectory(change=True):
            kr_analysis.run_kr_analysis(fasta_file,
                                        prefix + "_krpredoutput.txt")
    return krnames, krseqs
def extract_nterminus(da_dir, clusterpksgenes, seq_record, startergene, feature_by_id):
    """Extract the N-terminal 50 residues of each non-starting protein,
    profile-align them with muscle against the nterm.fasta reference
    alignment, and parse the alignment to locate the interacting residues.

    Returns a dict mapping gene name -> interacting residues.
    """
    ntermintresdict = {}
    ntermnames = []
    ntermseqs = []
    nterm_file = os.path.join(da_dir, 'nterm.fasta')
    for gene in clusterpksgenes:
        if gene != startergene:
            ntermnames.append(gene)
            seq = str(utils.get_aa_sequence(feature_by_id[gene]))
            ntermseqs.append(seq[:50])
    ntermfasta = "input.fasta"
    # enumerate() replaces the original manually maintained counter variable.
    for idx, name in enumerate(ntermnames):
        utils.writefasta([name], [ntermseqs[idx]], ntermfasta)
        utils.execute([
            "muscle", "-profile", "-quiet",
            "-in1", nterm_file,
            "-in2", "input.fasta",
            "-out", "muscle.fasta",
        ])
        # Positions 2 and 15 of the EryAIII_5_6 reference mark the
        # docking-interaction residues to extract.
        intresidues = extractpositions("muscle.fasta", [2, 15],
                                       "EryAIII_5_6_ref", name)
        ntermintresdict[name] = intresidues
    return ntermintresdict
def extract_cterminus(da_dir, clusterpksgenes, seq_record, endinggene, feature_by_id):
    """Extract the C-terminal 100 residues of each non-ending protein,
    profile-align them with muscle against the cterm.fasta reference
    alignment, and parse the alignment to locate the interacting residues.

    Returns a dict mapping gene name -> interacting residues.
    """
    ctermintresdict = {}
    ctermnames = []
    ctermseqs = []
    cterm_file = os.path.join(da_dir, 'cterm.fasta')
    for gene in clusterpksgenes:
        if gene != endinggene:
            ctermnames.append(gene)
            seq = str(utils.get_aa_sequence(feature_by_id[gene]))
            ctermseqs.append(seq[-100:])
    ctermfasta = "input.fasta"
    # enumerate() replaces the original manually maintained counter variable.
    for idx, name in enumerate(ctermnames):
        utils.writefasta([name], [ctermseqs[idx]], ctermfasta)
        utils.execute([
            "muscle", "-profile", "-quiet",
            "-in1", cterm_file,
            "-in2", "input.fasta",
            "-out", "muscle.fasta",
        ])
        # Positions 55 and 64 of the EryAII reference mark the
        # docking-interaction residues to extract.
        intresidues = extractpositions("muscle.fasta", [55, 64],
                                       "EryAII_ref", name)
        ctermintresdict[name] = intresidues
    return ctermintresdict
def parse_subject(tabs, seqlengths, geneclustergenes, seq_record):
    """Parse one tab-split BLAST hit line into a Subject record.

    tabs is the already-split tabular BLAST line; seqlengths maps query
    accession to protein length and is used to compute percent coverage
    (falling back to the record's own translation if the key is missing).
    """
    if len(tabs) < 12:
        logging.error("Malformed blast pairing: %s", "\t".join(tabs))
    query = tabs[0]
    subject_parts = tabs[1].split("|")
    subject = subject_parts[4]
    if subject == "no_locus_tag":
        subject = subject_parts[6]
    # Prefix hits on genes of the query cluster itself to mark self-hits.
    if subject in geneclustergenes:
        subject = "h_" + subject
    if len(subject_parts) > 6:
        locustag = subject_parts[6]
    else:
        locustag = ""
    genecluster = "{}_{}".format(subject_parts[0], subject_parts[1])
    start, end = subject_parts[2].split("-")[:2]
    strand = subject_parts[3]
    annotation = subject_parts[5]
    perc_ident = int(float(tabs[2]) + 0.5)  # round to the nearest integer
    evalue = str(tabs[10])
    blastscore = int(float(tabs[11]) + 0.5)
    query_key = query.split("|")[4]
    # dict.has_key() was removed in Python 3; 'in' works in both versions.
    if query_key in seqlengths:
        perc_coverage = (float(tabs[3]) / seqlengths[query_key]) * 100
    else:
        feature_by_id = utils.get_feature_dict_protein_id(seq_record)
        seqlength = len(utils.get_aa_sequence(feature_by_id[query_key]))
        perc_coverage = (float(tabs[3]) / seqlength) * 100
    return Subject(subject, genecluster, start, end, strand, annotation,
                   perc_ident, blastscore, perc_coverage, evalue, locustag)
def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record, options):
    """Predict PKS CAL domain substrate specificities (Minowa et al. method).

    Extracts every CAL_domain sequence from the core genes, writes them to a
    fasta file and runs the Minowa CAL predictor on it.

    Returns a tuple (calnames, calseqs) of extracted domain names/sequences.
    """
    calnames = []
    calseqs = []
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method"
    )
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        nr = 0
        for tab in domaindict[locus]:
            if tab[0] != "CAL_domain":
                continue
            nr += 1
            start = int(tab[1])
            end = int(tab[2])
            seq = str(utils.get_aa_sequence(feature))[start:end]
            calnames.append(locus + "_CAL" + str(nr))
            calseqs.append(seq)
    if calnames:
        # Build the shared output-file prefix once instead of repeating the
        # same os.sep concatenation for every file name.
        prefix = os.path.join(options.raw_predictions_outputfolder,
                              "ctg" + str(options.record_idx))
        fasta_file = prefix + "_calseqs.fasta"
        utils.writefasta(calnames, calseqs, fasta_file)
        with TemporaryDirectory(change=True):
            minowa_CAL.run_minowa_cal(fasta_file,
                                      prefix + "_minowa_calpredoutput.txt")
    return calnames, calseqs
def find_lan_a_features(seq_record, cluster):
    """Collect CDS features inside the cluster that qualify as LanA precursors.

    A feature qualifies when its translation is shorter than 80 residues, or
    when its sec_met qualifier reports a detected domain from the set of
    known precursor domains.
    """
    candidates = []
    for feature in utils.get_cds_features(seq_record):
        outside = (feature.location.start < cluster.location.start
                   or feature.location.end > cluster.location.end)
        if outside:
            continue
        translation = utils.get_aa_sequence(feature)
        # Very short peptides qualify without any domain annotation.
        if len(translation) < 80:
            candidates.append(feature)
            continue
        if 'sec_met' not in feature.qualifiers:
            continue
        detected = None
        for entry in feature.qualifiers['sec_met']:
            if entry.startswith('Domains detected:'):
                detected = entry.split()[2]
                break
        if detected is not None and detected in known_precursor_domains:
            candidates.append(feature)
    return candidates
def fastaseqlengths(seq_record):
    """Map each CDS accession in the record to its protein length."""
    return dict(
        (utils.get_gene_acc(cds), len(str(utils.get_aa_sequence(cds))))
        for cds in utils.get_cds_features(seq_record)
    )
def generate_searchgtr_htmls(seq_records, options):
    """Write SEARCHGTr query forms for glycosyltransferase genes.

    For every CDS whose smCOG annotation marks it as a glycosyltransferase,
    an HTML search form pre-filled with the protein sequence is written to
    the html/ subfolder of the output directory, and a link to it is stored
    in options.searchgtr_links keyed by "<record id>_<gene id>".
    """
    # smCOG ids corresponding to glycosyltransferases
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    searchgtrformtemplateparts = load_searchgtr_search_form_template()
    options.searchgtr_links = {}
    html_dir = os.path.join(options.full_outputfolder_path, "html")
    for seq_record in seq_records:
        smcogdict, _ = utils.get_smcog_annotations(seq_record)
        for feature in utils.get_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            # dict.has_key() is Python-2-only; 'in' works in both versions.
            # Guard clauses replace the original deeply nested conditionals.
            if gene_id not in smcogdict:
                continue
            if smcogdict[gene_id] not in gtrcoglist:
                continue
            if not os.path.exists(html_dir):
                os.mkdir(html_dir)
            formfileloc = os.path.join(html_dir, gene_id + "_searchgtr.html")
            link_loc = os.path.join("html", gene_id + "_searchgtr.html")
            options.searchgtr_links[seq_record.id + "_" + gene_id] = link_loc
            # Context manager guarantees the file is closed even on error.
            with open(formfileloc, "w") as formfile:
                formfile.write(
                    searchgtrformtemplateparts[0].replace("GlycTr", gene_id))
                formfile.write("%s\n%s" % (gene_id,
                                           utils.get_aa_sequence(feature)))
                formfile.write(searchgtrformtemplateparts[1])
def _parse_domain(domain, feature, seq_record):
    "Convert a NRPS/PKS domain string to a dict useable by json.dumps"
    # Strip the leading 'NRPS/PKS Domain: ' prefix (17 characters).
    text = domain[17:]
    type_, location, prediction_string = text.split(' ', 2)
    predictions = _parse_substrate_predictions(prediction_string)
    location = location.strip('().')
    coordinates = location.split('-')
    #Create url_link to NaPDoS for C and KS domains
    napdoslink = ""
    domainseq = str(utils.get_aa_sequence(
        feature))[int(coordinates[0]):int(coordinates[-1])]
    if "PKS_KS" in text:
        napdoslink = "http://napdos.ucsd.edu/cgi-bin/process_request.cgi?query_type=aa&ref_seq_file=all_KS_public_12062011.faa&Sequence=%3EKS_domain_from_antiSMASH%0D" + domainseq
    elif "Condensation" in text:
        napdoslink = "http://napdos.ucsd.edu/cgi-bin/process_request.cgi?query_type=aa&ref_seq_file=all_C_public_12062011.faa&Sequence=%3EC_domain_from_antiSMASH%0D" + domainseq
    blastlink = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + domainseq + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch"
    try:
        js_domain = {
            'type': type_,
            'start': int(coordinates[0]),
            'end': int(coordinates[1]),
            'predictions': predictions,
            'napdoslink': napdoslink,
            'blastlink': blastlink,
            'sequence': domainseq
        }
        return js_domain
    except ValueError:
        # Lazy %-style logging arguments instead of eager string formatting;
        # the messages are only built if debug logging is enabled.
        logging.debug('%r', text)
        logging.debug('%r %r', type_, location)
        logging.debug(coordinates)
        raise
def generate_details_div(cluster, seq_record, options, js_domains, details=None):
    """Generate details div"""
    cluster_rec = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if cluster_rec is None:
        return details
    if details is None:
        details = pq('<div>')
        details.addClass('details')
        header = pq('<h3>')
        header.text('Detailed annotation')
        details.append(header)
    cluster_domains = {
        'id': "cluster-%s-details" % cluster['idx'],
        'orfs': []
    }
    for feature in utils.get_cluster_cds_features(cluster_rec, seq_record):
        quals = feature.qualifiers
        if 'sec_met' not in quals:
            continue
        # Prefer the annotated translation; fall back to translating the CDS.
        if 'translation' in quals:
            sequence = quals['translation'][0]
        else:
            sequence = str(utils.get_aa_sequence(feature))
        orf_domains = []
        for qual in quals['sec_met']:
            if not qual.startswith('NRPS/PKS Domain:'):
                continue
            parsed = _parse_domain(qual, feature, seq_record)
            if parsed:
                orf_domains.append(parsed)
        if orf_domains:
            cluster_domains['orfs'].append({
                'id': utils.get_gene_id(feature),
                'sequence': sequence,
                'domains': orf_domains,
            })
    if cluster_domains['orfs']:
        details_svg = pq('<div>')
        details_svg.addClass('details-svg')
        details_svg.attr('id', '%s-svg' % cluster_domains['id'])
        details.append(details_svg)
        js_domains.append(cluster_domains)
    return details
def test_get_aa_sequence(self):
    "Test utils.get_aa_sequence() for straightforward translation"
    feature = FakeFeature("CDS")
    feature.qualifiers['translation'] = ['MAGIC']
    # A plain translation must come back unchanged.
    self.assertEqual('MAGIC', utils.get_aa_sequence(feature))
def filter_overlap(cdsfeatures):
    """For groups of overlapping CDSs (e.g. alternative transcripts), keep
    only the longest feature of each group.

    Returns a list with one representative feature per overlapping group.
    """
    # max() with a key function selects the first longest feature directly,
    # replacing the manual lengths-list + index(max(...)) bookkeeping.
    return [
        max(group, key=lambda feature: len(utils.get_aa_sequence(feature)))
        for group in find_overlapping_groups(cdsfeatures)
    ]
def test_get_aa_sequence_to_stop(self):
    "Test utils.get_aa_sequence() for translation up to a stop codon"
    feature = FakeFeature("CDS")
    feature.qualifiers['translation'] = ['MAGIC*SEQ']
    # With to_stop=True the translation must be truncated at '*'.
    self.assertEqual('MAGIC', utils.get_aa_sequence(feature, to_stop=True))
def test_get_aa_sequence_gap(self):
    "Test utils.get_aa_sequence() for translation including a gap"
    feature = FakeFeature("CDS")
    feature.qualifiers['translation'] = ['MA-GIC']
    # Gap characters ('-') must be stripped from the returned sequence.
    self.assertEqual('MAGIC', utils.get_aa_sequence(feature))
def smcog_analysis(inputgenes, inputnr, seq_record, smcogdict, smcogsoutputfolder):
    "run smCOG search on all gene cluster CDS features"
    for feature in inputgenes:
        gene_id = utils.get_gene_id(feature)
        seq = str(utils.get_aa_sequence(feature))
        #create input.fasta file with single query sequence to be used as input for MSA
        utils.writefasta([gene_id], [seq], "input" + str(inputnr) + ".fasta")
        # dict.has_key() is Python-2-only; 'in' works in both versions.
        # The redundant 'tag = k' alias of the gene id has been removed.
        if gene_id in smcogdict and len(smcogdict[gene_id]) > 0:
            smcog = (smcogdict[gene_id][0][0]).split(":")[0]
            alignsmcogs(smcog, inputnr)
            #Generate trimmed alignment
            trimalignment(inputnr)
            #Draw phylogenetic tree
            drawtree(inputnr)
            #Convert tree to draw PNG image
            converttree(inputnr, smcogsoutputfolder, gene_id)
def extract_nrps_genes(pksnrpscoregenes, domaindict, seq_record, extra_aa=0):
    """Collect the adenylation-domain sequences of all NRPS core genes.

    Both AMP-binding and A-OX hits count as A-domains; the extracted region
    can be extended by extra_aa residues past the annotated domain end.
    Returns parallel lists of domain names and amino acid sequences.
    """
    nrpsnames = []
    nrpsseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        counter = 0
        for details in domaindict[locus]:
            if details[0] not in ("AMP-binding", "A-OX"):
                continue
            counter += 1
            begin = int(details[1])
            finish = int(details[2]) + extra_aa
            nrpsnames.append(locus + "_A" + str(counter))
            nrpsseqs.append(str(utils.get_aa_sequence(feature))[begin:finish])
    return nrpsnames, nrpsseqs
def extract_pks_genes(pksnrpscoregenes, domaindict, seq_record):
    """Collect the PKS_AT domain sequences of all PKS core genes.

    Returns parallel lists of domain names and amino acid sequences.
    """
    pksnames = []
    pksseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        counter = 0
        for details in domaindict[locus]:
            if details[0] != "PKS_AT":
                continue
            counter += 1
            begin = int(details[1])
            finish = int(details[2])
            pksnames.append(locus + "_AT" + str(counter))
            pksseqs.append(str(utils.get_aa_sequence(feature))[begin:finish])
    return pksnames, pksseqs
def tresholdblasthitfilter(blastlines, minseqcoverage, minpercidentity, seqlengths, seq_record):
    """Filter blastlines, keeping only hits that meet the minimum percent
    identity and minimum query coverage criteria."""
    kept = []
    for line in blastlines:
        tabs = line.split("\t")
        # Hoisted: the query accession was recomputed via split three times.
        query_key = tabs[0].split("|")[4]
        perc_ident = int(float(tabs[2]) + 0.5)  # round to nearest integer
        alignmentlength = float(tabs[3])
        # dict.has_key() is Python-2-only; 'in' works in both versions.
        if query_key in seqlengths:
            perc_coverage = (alignmentlength / seqlengths[query_key]) * 100
        else:
            # Fall back to measuring the query protein from the record itself.
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            seqlength = len(utils.get_aa_sequence(feature_by_id[query_key]))
            perc_coverage = (alignmentlength / seqlength) * 100
        if perc_ident > minpercidentity and perc_coverage > minseqcoverage:
            kept.append(line)
    return kept
def create_blast_inputs(genecluster, seq_record):
    """Build the fasta headers, sequences and accession list used as BLAST
    input for every CDS of the given gene cluster."""
    queryclusternames = []
    queryclusterseqs = []
    queryclusterprotsnames = []
    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        strand = "+" if cds.strand == 1 else "-"
        # Header fields are joined with '|' for downstream parsing.
        fullname = "|".join([
            "input",
            "c" + str(utils.get_cluster_number(genecluster)),
            str(cds.location.nofuzzy_start) + "-" + str(cds.location.nofuzzy_end),
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ])
        queryclusternames.append(fullname)
        queryclusterseqs.append(str(utils.get_aa_sequence(cds)))
        queryclusterprotsnames.append(utils.get_gene_acc(cds))
    return queryclusternames, queryclusterseqs, queryclusterprotsnames
def _get_nrpspks_domains_ks(pksnrpsvars, seq_record, domain):
    """Extract named domain sequences (typically KS) from trans-AT PKS genes.

    For every core gene that belongs to a trans-AT PKS gene cluster, all hits
    of the given domain type are extracted and labelled with a '|'-separated
    fasta-style header carrying record id, CDS location, strand, gene id,
    product, protein id, domain coordinates and a per-gene KS counter.

    Returns parallel lists (ksnames, ksseqs).
    """
    transatpks_geneclusters = _get_transatpks_geneclusters(pksnrpsvars, seq_record)
    # Flatten the cluster -> gene-list mapping into a deduplicated gene list.
    transatpks_genes = list(set([g for g_list in transatpks_geneclusters.values() for g in g_list]))
    ksnames = []
    ksseqs = []
    if len(transatpks_geneclusters) >= 1:
        job_id = seq_record.id
        for feature in pksnrpsvars.pksnrpscoregenes:
            start_cds = str(feature.location.nofuzzy_start)
            end_cds = str(feature.location.nofuzzy_end)
            strand = feature.location.strand
            if strand == 1:
                strand_char = '+'
            else:
                strand_char = '-'
            loc = '-'.join((start_cds, end_cds))
            prot_id = product = ''
            if 'protein_id' in feature.qualifiers:
                prot_id = feature.qualifiers["protein_id"][0]
            if 'product' in feature.qualifiers:
                product = feature.qualifiers["product"][0].replace(' ', '_').replace('|', '')  # We use | as a separator later
            assert '|' not in product, product
            gene_id = utils.get_gene_id(feature)
            # Only genes that are part of a trans-AT PKS cluster are scanned.
            if gene_id in transatpks_genes:
                domaindetails = pksnrpsvars.domaindict[gene_id]
                nr = 0
                for tab in domaindetails:
                    if tab[0] == domain:
                        nr += 1
                        start = int(tab[1])
                        end = int(tab[2])
                        loc_domain = '-'.join((str(start), str(end)))
                        ks_index = ''.join(('KS', str(nr)))
                        name1 = '|'.join(
                            [''.join(['>', job_id]), 'c', loc, strand_char, gene_id, product, prot_id, loc_domain, ks_index])
                        # Strip punctuation that would break downstream tools.
                        name = re.sub(r'(\:|\'|\(|\)|\,|\?|\;)', '', name1)
                        seq = str(utils.get_aa_sequence(feature))[start:end]
                        ksnames.append(name)
                        ksseqs.append(seq)
    return ksnames, ksseqs
def run_lantipred(seq_record, query, lant_class):
    """Predict the lantipeptide cleavage site and core peptide of a query CDS.

    Runs the class-specific HMM profile over the query translation, rejects
    low-scoring or cysteine-free candidates, and on success marks the query
    as biosynthetic and returns the prediction result with leader and core
    peptide filled in. Returns None when no acceptable prediction is found.
    """
    hmmer_profiles = {
        'Class-I': 'class1.hmm',
        'Class-II': 'class2.hmm',
        'Class-III': 'class3.hmm',
    }
    query_sequence = utils.get_aa_sequence(query, to_stop=True)
    lan_a_fasta = ">%s\n%s" % (utils.get_gene_id(query), query_sequence)
    #run sequence against profiles and parse them in a vector containing START, END, SCORE and LANTYPE
    profile = utils.get_full_path(__file__, hmmer_profiles[lant_class])
    result = predict_cleavage_site(profile, lan_a_fasta)
    if result is None:
        # Lazy %-style logging arguments avoid eager string formatting.
        logging.debug('%r: No cleavage site predicted',
                      utils.get_gene_id(query))
        return
    if thresh_dict[lant_class] > result.score:
        logging.debug('%r: Score %0.2f below threshold %0.2f for class %r',
                      utils.get_gene_id(query), result.score,
                      thresh_dict[lant_class], lant_class)
        return
    #extract now (that class is known and thus the END component) the core peptide
    result.leader = query_sequence[:result.end]
    result.core = query_sequence[result.end:]
    # 'in' is the idiomatic membership test, replacing str.find() < 0.
    if 'C' not in result.core:
        logging.debug('%r: No Cysteine residues found in core, false positive',
                      utils.get_gene_id(query))
        return
    if not 'sec_met' in query.qualifiers:
        query.qualifiers['sec_met'] = []
    if ";".join(query.qualifiers['sec_met']).find(';Kind: biosynthetic') < 0:
        query.qualifiers['sec_met'].append('Kind: biosynthetic')
    return result
def create_blast_inputs(genecluster, seq_record):
    """Build the fasta headers, sequences and accession list used as BLAST
    input for every CDS of the given gene cluster.

    For the plant taxon, overlapping CDS features are first collapsed to the
    longest feature of each overlap group.
    """
    options = config.get_config()
    cds_features = utils.get_cluster_cds_features(genecluster, seq_record)
    if options.taxon == "plants":
        cds_features = filter_overlap(cds_features)
    queryclusternames = []
    queryclusterseqs = []
    queryclusterprotsnames = []
    for cds in cds_features:
        strand = "+" if cds.strand == 1 else "-"
        # Fuzzy-position markers ('<'/'>') are stripped from the coordinates.
        location = "%s-%s" % (
            str(cds.location.start).replace(">", "").replace("<", ""),
            str(cds.location.end).replace(">", "").replace("<", ""),
        )
        fullname = "|".join([
            "input",
            "c" + str(utils.get_cluster_number(genecluster)),
            location,
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ])
        queryclusternames.append(fullname)
        queryclusterseqs.append(str(utils.get_aa_sequence(cds)))
        queryclusterprotsnames.append(utils.get_gene_acc(cds))
    return queryclusternames, queryclusterseqs, queryclusterprotsnames
def filter_nonterminal_docking_domains(seq_record, pksnrpsvars):
    """Drop docking-domain hits that are not at a protein terminus.

    Docking domains are only meaningful within the first or last 50 residues
    of a protein; hits elsewhere are removed, and genes left without any
    remaining hits are deleted from pksnrpsvars.domaindict entirely.
    """
    dockingdomains = [
        'NRPS-COM_Nterm', 'NRPS-COM_Cterm',
        'PKS_Docking_Cterm', 'PKS_Docking_Nterm'
    ]
    feature_by_id = utils.get_feature_dict(seq_record)
    # Snapshot the keys: entries may be deleted while iterating. (In the
    # original, iterating .keys() directly would break on Python 3.)
    for hitgene in list(pksnrpsvars.domaindict.keys()):
        cdsfeature = feature_by_id[hitgene]
        hitgenelength = len(utils.get_aa_sequence(cdsfeature))
        kept = []
        for hit in pksnrpsvars.domaindict[hitgene]:
            # Keep every non-docking hit, and docking hits lying within 50
            # residues of either terminus; drop the rest. This replaces the
            # original collect-indices-and-delete-in-reverse bookkeeping.
            if hit[0] in dockingdomains and not (
                    hitgenelength - max(hit[1], hit[2]) < 50
                    or min(hit[1], hit[2]) < 50):
                continue
            kept.append(hit)
        if kept:
            pksnrpsvars.domaindict[hitgene] = kept
        else:
            del pksnrpsvars.domaindict[hitgene]
def blastparse(blasttext, minseqcoverage, minpercidentity, seqlengths, seq_record):
    """Parse tabular BLAST output into per-query hit dictionaries.

    After de-duplicating and threshold-filtering the lines, the hits are
    grouped by query: for every query a subject list and a subject->details
    dict are accumulated and stored in blastdict when the next query starts
    (or on the last line). Per-cluster percent identities are tracked so the
    plant-specific >60% identity filter can be applied at the end.

    Returns [blastdict, querylist, hitclusters].
    """
    options = config.get_config()
    geneclustergenes = [utils.get_gene_acc(cds) for cds in utils.get_withincluster_cds_features(seq_record)]
    blastdict = {}
    querylist = []
    hitclusters = []
    # Drop the trailing empty string produced by the final newline.
    blastlines = blasttext.split("\n")[:-1]
    blastlines = uniqueblasthitfilter(blastlines)
    blastlines = tresholdblasthitfilter(blastlines, minseqcoverage, minpercidentity, seqlengths, seq_record)
    #Goes through the blastlines. For each query, creates a querydict and hitlist, and adds these to the blastdict when finding the next query
    firstquery = "y"
    percid_per_cluster = {}
    for i in blastlines:
        tabs = i.split("\t")
        query = tabs[0]
        # Subject header fields are '|'-separated; field 4 is the accession.
        subject = tabs[1].split("|")[4]
        if subject == "no_locus_tag":
            subject = tabs[1].split("|")[6]
        # Prefix self-hits on genes of the query's own cluster.
        if subject in geneclustergenes:
            subject = "h_" + subject
        if len(tabs[1].split("|")) > 6:
            locustag = tabs[1].split("|")[6]
        else:
            locustag = ""
        subject_genecluster = tabs[1].split("|")[0] + "_" + tabs[1].split("|")[1]
        subject_start = (tabs[1].split("|")[2]).split("-")[0]
        subject_end = (tabs[1].split("|")[2]).split("-")[1]
        subject_strand = tabs[1].split("|")[3]
        subject_annotation = tabs[1].split("|")[5]
        perc_ident = int(float(tabs[2]) + 0.5)
        evalue = str(tabs[10])
        blastscore = int(float(tabs[11])+0.5)
        # Coverage relative to the query protein length; fall back to the
        # record's own translation when the accession is unknown.
        if seqlengths.has_key(query.split("|")[4]):
            perc_coverage = (float(tabs[3]) / seqlengths[query.split("|")[4]]) * 100
        else:
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            seqlength = len(utils.get_aa_sequence(feature_by_id[query.split("|")[4]]))
            perc_coverage = (float(tabs[3]) / seqlength) * 100
        if firstquery == "y": #Only until the first blastline with good hit
            firstquery = "n"
            querylist.append(query)
            subjectlist = []
            querydict = {}
            subjectlist.append(subject)
            querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
            if subject_genecluster not in hitclusters:
                percid_per_cluster[subject_genecluster] = [perc_ident]
                hitclusters.append(subject_genecluster)
            last_query = query
        elif i == blastlines[-1]: #Only for the last blastline
            # NOTE(review): this branch compares line *content* to the last
            # line, and when the last line starts a new query the accumulated
            # data of the previous query is never written to blastdict —
            # looks like hits can be dropped here; confirm intent.
            if query not in querylist:
                subjectlist = []
                querydict = {}
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                blastdict[query] = [subjectlist,querydict]
                querylist.append(query)
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
            else:
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                blastdict[query] = [subjectlist,querydict]
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
        else: #For all but the first and last blastlines
            if query not in querylist:
                # New query started: flush the previous query's accumulators.
                blastdict[last_query] = [subjectlist,querydict]
                querylist.append(query)
                subjectlist = []
                querydict = {}
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
                last_query = query
            else:
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
    #For plants, filter hitclusters to only keep those hits with at least one hit > 60% ID
    if options.taxon == "plants":
        hitclusters = [cluster for cluster in hitclusters if len([int(pid) for pid in percid_per_cluster[cluster] if int(pid) > 60]) > 0]
    return [blastdict,querylist,hitclusters]
def get_description(record, feature, type_, options):
    "Get the description text of a feature"
    # Placeholder values substituted into the HTML template at the end;
    # optional entries are deleted again below when not applicable.
    replacements = {
        'locus_tag': ", ".join(feature.qualifiers.get('locus_tag', ['-'])),
        'protein_id': ", ".join(feature.qualifiers.get('protein_id', ['-'])),
        'smcog': '-',
        'ecnumber': '-',
        'transport_blast_line': '',
        'smcog_tree_line': '',
        'searchgtr_line': '',
        'start': int(feature.location.start) + 1,  # 1-based display coordinate
        'end': int(feature.location.end),
        'model_details': get_model_details(feature),
        'asf': ''
    }
    blastp_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&" \
                 "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&" \
                 "LINK_LOC=protein&PAGE_TYPE=BlastSearch"
    genomic_context_url = "http://www.ncbi.nlm.nih.gov/projects/sviewer/?" \
                          "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"\
                          "id=%s&from=%s&to=%s"
    # The template grows conditionally depending on available annotations.
    template = '<span class="svgene-tooltip-bold">%(product)s</span><br>\n'
    template += 'Locus-tag: %(locus_tag)s; Protein-ID: %(protein_id)s<br>\n'
    if 'EC_number' in feature.qualifiers:
        template += "EC-number(s): %(ecnumber)s<br>\n"
    if options.smcogs:
        template += "smCOG: %(smcog)s<br>\n"
    if options.input_type == 'nucl':
        template += "Location: %(start)s - %(end)s<br><br>\n"
    if 'sec_met' in feature.qualifiers:
        template += '<span class="bold">Signature pHMM hits:</span><br>\n%(model_details)s<br>\n'
    if options.knownclusterblast:
        # Look for a pre-computed MiBIG homology table for this gene.
        mibig_homology_path = glob(
            os.path.join(options.full_outputfolder_path, "knownclusterblast",
                         "cluster*",
                         utils.get_gene_acc(feature) + '_mibig_hits.txt'))
        if mibig_homology_path:
            mibig_homology_file = mibig_homology_path[0]
            generate_html_table(mibig_homology_file)
            html_file = mibig_homology_file.split('.txt')[0] + '.html'
            # Store the path relative to the output folder for the link.
            replacements['mibig_homology_path'] = html_file[
                len(options.full_outputfolder_path) + 1:]
            template += '<a href="%(mibig_homology_path)s" target="_new">MiBIG Hits</a><br><br>\n'
    template += """
%(transport_blast_line)s
%(searchgtr_line)s
<a href="%(blastp_url)s" target="_new">NCBI BlastP on this gene</a><br>
<a href="%(genomic_context_url)s"
 target="_new">View genomic context</a><br>
%(smcog_tree_line)s<br>"""
    if not get_ASF_predictions(feature) == "":
        template += '<span class="bold">Active Site Finder results:</span><br>\n%(asf)s<br><br>\n'
    template += """AA sequence: <a href="javascript:copyToClipboard('%(sequence)s')">Copy to clipboard</a><br>"""
    # Remove placeholders whose template lines were never added above.
    if not options.smcogs:
        del replacements['smcog']
    if options.input_type == 'prot':
        del replacements['start']
        del replacements['end']
    replacements['product'] = feature.qualifiers.get('product', ['-'])[0]
    # Prefer the annotated translation; fall back to translating the CDS.
    if 'translation' in feature.qualifiers:
        sequence = feature.qualifiers['translation'][0]
    else:
        sequence = str(utils.get_aa_sequence(feature))
    replacements['blastp_url'] = blastp_url % sequence
    replacements['sequence'] = sequence
    # NOTE(review): integer division here relies on Python 2 '/' semantics;
    # under Python 3 len_seq would become a float — confirm before porting.
    if len(sequence) > 2000:
        len_seq = 30
    else:
        len_seq = (len(sequence) / 80) + 1
    replacements['len_seq'] = len_seq
    # Genomic context window: ~10 kb on either side, clamped to the record.
    replacements['genomic_context_url'] = genomic_context_url % \
        (
            record.id,
            max(feature.location.start - 9999, 0),
            min(feature.location.end + 10000, len(record))
        )
    if 'EC_number' in feature.qualifiers:
        replacements['ecnumber'] = ", ".join(
            feature.qualifiers.get('EC_number', ['-']))
    else:
        del replacements['ecnumber']
    if options.smcogs:
        # Pull the smCOG assignment and optional tree image out of the notes.
        for note in feature.qualifiers.get('note', []):
            if note.startswith('smCOG:') and '(' in note:
                text = note[6:].split('(', 1)[0]
                smcog, desc = text.split(':', 1)
                desc = desc.replace('_', ' ')
                replacements['smcog'] = '%s (%s)' % (smcog, desc)
            elif note.startswith('smCOG tree PNG image:'):
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>'
                url = note.split(':')[-1]
                replacements['smcog_tree_line'] = entry % url
    if type_ == 'transport':
        url = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;" \
              "program=blastp;database=pub/transporter.pep;" \
              "sequence=sequence%%0A%s" % sequence
        transport_blast_line = '<a href="%s" target="_new">TransportDB BLAST on this gene<br>' % url
        replacements['transport_blast_line'] = transport_blast_line
    # NOTE(review): has_key() is Python-2-only; would need 'in' for Python 3.
    if options.searchgtr_links.has_key(record.id + "_" + utils.get_gene_id(feature)):
        url = options.searchgtr_links[record.id + "_" + utils.get_gene_id(feature)]
        searchgtr_line = '<a href="%s" target="_new">SEARCHGTr on this gene<br>' % url
        replacements['searchgtr_line'] = searchgtr_line
    replacements['asf'] = get_ASF_predictions(feature)
    if replacements['asf'] == "":
        del replacements['asf']
    return template % replacements