def _parse_domain(record: Record, domain: NRPSPKSQualifier.Domain,
                  feature: CDSFeature) -> JSONDomain:
    """ Build the JSON-ready representation of a single NRPS/PKS domain

        Arguments:
            record: the Record whose sequence the feature's DNA is extracted from
            domain: the NRPSPKSQualifier.Domain to convert
            feature: the CDSFeature that the domain belongs to

        Returns:
            a populated JSONDomain instance
    """
    prediction_pairs = list(domain.predictions.items())

    # the domain's amino acids are a slice of the parent feature's translation
    protein_fragment = str(feature.translation)[domain.start:domain.end]

    # NaPDoS links are only created for KS and condensation domains,
    # every other domain gets an empty link
    napdos_template = ("http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
                       "query_type=aa&ref_seq_file=all_{0}_public_12062011.faa"
                       "&Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
    if domain.name == "PKS_KS":
        napdos_kind = "KS"
    elif "Condensation" in domain.name:
        napdos_kind = "C"
    else:
        napdos_kind = ""
    napdos_url = ""
    if napdos_kind:
        napdos_url = napdos_template.format(napdos_kind, protein_fragment)

    blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                      "&PROGRAM=blastp&BLAST_PROGRAMS=blastp"
                      "&QUERY={}"
                      "&LINK_LOC=protein&PAGE_TYPE=BlastSearch")
    blast_url = blast_template.format(protein_fragment)

    nucleotides = feature.extract(record.seq)
    short_name = _get_domain_abbreviation(domain.name)
    domain_class = _get_domain_class(short_name, domain.name)

    return JSONDomain(domain, prediction_pairs, napdos_url, blast_url,
                      protein_fragment, nucleotides, short_name, domain_class)
def parse_domain(self, domain: NRPSPKSQualifier.Domain,
                 feature: CDSFeature) -> JSONDomain:
    """ Build the JSON-ready representation of a single NRPS/PKS domain

        Arguments:
            domain: the NRPSPKSQualifier.Domain to convert
            feature: the CDSFeature that the domain belongs to

        Returns:
            a populated JSONDomain instance
    """
    substrate_predictions = parse_substrate_predictions(domain.predictions)

    # the domain's amino acids are a slice of the parent feature's translation
    protein_fragment = str(feature.translation)[domain.start:domain.end]

    napdos_template = ("http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
                       "query_type=aa&ref_seq_file=all_{0}_public_12062011.faa"
                       "&Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
    # only KS and condensation domains get a NaPDoS link
    is_ketosynthase = domain.name == "PKS_KS"
    if is_ketosynthase or "Condensation" in domain.name:
        napdos_url = napdos_template.format("KS" if is_ketosynthase else "C",
                                            protein_fragment)
    else:
        napdos_url = ""

    blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                      "&PROGRAM=blastp&BLAST_PROGRAMS=blastp"
                      "&QUERY={}"
                      "&LINK_LOC=protein&PAGE_TYPE=BlastSearch")
    blast_url = blast_template.format(protein_fragment)

    nucleotides = feature.extract(self.record.seq_record.seq)
    return JSONDomain(domain, substrate_predictions, napdos_url, blast_url,
                      protein_fragment, nucleotides)
# Residue order used for every per-amino-acid count column
_RODEO_AMINO_ACIDS = "ARDNCQEGHILKMFPSTWYV"
# Residue groups, in column order:
# aromatic, negative, positive, charged, aliphatic, hydroxyl
_RODEO_RESIDUE_GROUPS = ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST")


def _rodeo_net_charge(sequence: str) -> int:
    """Estimate the charge of a peptide: -1 for each E/D, +1 for each K/H/R"""
    charge_by_residue = {"E": -1, "D": -1, "K": 1, "H": 1, "R": 1}
    return sum(charge_by_residue.get(aa, 0) for aa in sequence)


def _rodeo_composition_columns(sequence: str) -> List[int]:
    """Count each amino acid in the sequence, followed by the residue group totals"""
    columns = [sequence.count(aa) for aa in _RODEO_AMINO_ACIDS]
    columns.extend(sum(sequence.count(aa) for aa in group)
                   for group in _RODEO_RESIDUE_GROUPS)
    return columns


def generate_rodeo_svm_csv(record: Record, query: CDSFeature, leader: str,
                           core: str,
                           previously_gathered_tabs: List[Union[float, int]],
                           fimo_motifs: List[int],
                           fimo_scores: Dict[int, float]
                           ) -> List[Union[float, int]]:
    """ Generates all the items for a single precursor peptide candidate

        Arguments:
            record: the Record passed to the PFAM distance lookup and whose
                    sequence the query's DNA is extracted from
            query: the CDSFeature of the precursor candidate
            leader: the leader portion of the precursor
            core: the core portion of the precursor
            previously_gathered_tabs: values inserted directly after the
                                      first two columns
            fimo_motifs: the numbers of the motifs that were hit
            fimo_scores: motif number to score, read for each motif in
                         1..16 that is also in fimo_motifs

        Returns:
            a list of numeric column values, in a fixed order

        Raises:
            ZeroDivisionError: if leader is empty (core/leader ratio)
            IndexError: if core is empty (first core position)
            KeyError: if a motif in fimo_motifs within 1..16 has no entry
                      in fimo_scores
    """
    columns = []  # type: List[Union[float, int]]
    # Precursor Index
    columns.append(1)
    # classification
    columns.append(0)
    columns += previously_gathered_tabs

    # Cluster has PF00733 / PF05402 / PF13471?
    # A distance of -1 or more than 10000 is recorded as 0.
    # The lookup is done once per PFAM instead of once per comparison.
    for pfam in ('PF00733', 'PF05402', 'PF13471'):
        pfam_distance = utils.distance_to_pfam(record, query, [pfam])
        columns.append(0 if pfam_distance == -1 or pfam_distance > 10000 else 1)

    # Leader has LxxxxxT motif?
    columns.append(1 if re.search('(L[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader) else 0)
    # Core has adjacent identical aas (doubles)?
    columns.append(1 if any(left == right for left, right in zip(core, core[1:])) else 0)
    # Core length (aa)
    columns.append(len(core))
    # Leader length (aa)
    columns.append(len(leader))
    # Precursor length (aa)
    columns.append(len(leader) + len(core))
    # Leader/core ratio
    # NOTE(review): despite the label, the value is core length divided by
    # leader length; left unchanged because consumers of these columns may
    # depend on it - confirm before "fixing"
    columns.append(len(core) / len(leader))
    # Number of Pro in first 9 aa of core?
    columns.append(core[:9].count("P"))

    # Estimated core, leader and precursor charge, then their absolute values.
    # The precursor is leader + core, so its charge is the sum of both parts.
    core_charge = _rodeo_net_charge(core)
    leader_charge = _rodeo_net_charge(leader)
    precursor_charge = leader_charge + core_charge
    columns += [core_charge, leader_charge, precursor_charge,
                abs(core_charge), abs(leader_charge), abs(precursor_charge)]

    # Counts of AAs and residue group totals in leader
    columns += _rodeo_composition_columns(leader)
    # Counts of AAs and residue group totals in core
    columns += _rodeo_composition_columns(core)
    # Counts (0 or 1) of amino acids within first AA position of core sequence
    first_core_residue = core[0]
    columns += [first_core_residue.count(aa) for aa in _RODEO_AMINO_ACIDS]
    # Counts of AAs and residue group totals in leader+core
    columns += _rodeo_composition_columns(leader + core)

    # Motifs
    motif_numbers = range(1, 17)
    columns += [1 if motif in fimo_motifs else 0 for motif in motif_numbers]
    # Total motifs hit
    columns.append(len(fimo_motifs))
    # Motif scores
    motif_scores = [fimo_scores[motif] if motif in fimo_motifs else 0
                    for motif in motif_numbers]
    columns += motif_scores
    # Sum of MEME scores
    columns.append(sum(motif_scores))
    # No Motifs?
    columns.append(0 if fimo_motifs else 1)
    # Alternate Start Codon?
    columns.append(0 if str(query.extract(record.seq)).startswith("ATG") else 1)
    return columns
def acquire_rodeo_heuristics(record: Record, cluster: Cluster,
                             query: CDSFeature, leader: str, core: str
                             ) -> Tuple[int, List[Union[float, int]]]:
    """ Calculate heuristic scores for RODEO

        Arguments:
            record: the Record passed to the PFAM distance lookups and whose
                    sequence the query's DNA is extracted from
            cluster: the Cluster passed to the same-strand check
            query: the CDSFeature of the precursor candidate
            leader: the leader portion of the precursor
            core: the core portion of the precursor

        Returns:
            a tuple of the total heuristic score and the list of values
            gathered along the way (core mass, distance, then a 0/1 flag
            for each rule that records one)

        Raises:
            ZeroDivisionError: if the leader/core ratio rule is reached with
                               an empty core
    """
    tabs = []  # type: List[Union[float, int]]
    score = 0

    def apply_rule(matched: bool, weight: int) -> None:
        """Record a 0/1 flag for the rule and add its weight when it matched"""
        nonlocal score
        if matched:
            score += weight
            tabs.append(1)
        else:
            tabs.append(0)

    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core, monoisotopic=True,
                                                ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    distance = utils.distance_to_pfam(record, query,
                                      ['PF13471', 'PF00733', 'PF05402'])
    tabs.append(distance)
    # NOTE(review): the PF13471 check at the end of this function tests for a
    # distance of -1; that value would also satisfy the two "within" rules
    # below - confirm that is intended
    # Within 500 nucleotides of any biosynthetic protein (E, B, C) +1
    apply_rule(distance < 500, 1)
    # Within 150 nucleotides of any biosynthetic protein (E, B, C) +1
    apply_rule(distance < 150, 1)
    # Greater than 1000 nucleotides from every biosynthetic protein (E, B, C) -2
    apply_rule(distance > 1000, -2)

    cys_in_core = core.count("C")
    # Core region has 2 or 4 Cys residues +1
    apply_rule(cys_in_core in (2, 4), 1)
    # Leader region is longer than core region +2
    apply_rule(len(leader) > len(core), 2)
    # Core has 7 (Glu) or 8(Glu/Asp) or 9 (Asp) membered ring possible +1
    apply_rule('E' in core[6:8] or 'D' in core[7:9], 1)
    # Leader region contains GxxxxxT +3
    apply_rule(bool(re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader)), 3)
    # Core starts with G +2
    apply_rule(core.startswith("G"), 2)
    # Peptide and lasso cyclase are on same strand +1
    apply_rule(is_on_same_strand_as(cluster, query, 'PF00733'), 1)
    # Leader/core region length ratio between 0.5 and 2 (both inclusive) +1
    apply_rule(0.5 <= len(leader) / len(core) <= 2, 1)
    # Core starts with Cys and has an even number of Cys 0
    # (no effect on the score, but the flag is still recorded)
    apply_rule(core.startswith("C") and cys_in_core % 2 == 0, 0)
    # Core contains no Gly -4
    apply_rule("G" not in core, -4)

    aromatics_in_core = sum(core.count(aa) for aa in "FWY")
    # Core has at least one aromatic residue +1
    apply_rule(aromatics_in_core >= 1, 1)
    # Core has at least 2 aromatic residues +2
    apply_rule(aromatics_in_core >= 2, 2)
    # Core has odd number of Cys -2
    apply_rule(cys_in_core % 2 != 0, -2)
    # Leader region contains Trp -1
    apply_rule("W" in leader, -1)
    # Leader region contains Lys +1
    apply_rule("K" in leader, 1)
    # Leader region has Cys -2
    apply_rule("C" in leader, -2)

    # The final two rules only change the score, no flag is recorded for them
    # Gene cluster does not contain PF13471 -2
    pf13471_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if pf13471_distance == -1 or pf13471_distance > 10000:
        score -= 2
    # Peptide utilizes alternate start codon -1
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1
    return score, tabs
def get_description(record: Record, feature: CDSFeature, type_: str,
                    options: ConfigType,
                    mibig_result: List[clusterblast.results.MibigEntry]) -> str:
    """ Get the description text of a CDS feature

        Builds an HTML fragment with the feature's title, identifiers,
        location, external search links and sequences. When mibig_result is
        not empty, an HTML table of the hits is also generated under the
        "knownclusterblast" directory of options.output_dir and linked from
        the description.

        Arguments:
            record: the Record containing the feature
            feature: the CDSFeature to describe
            type_: the feature's type, "transport" adds a TransportDB link
            options: the config, read for output_dir and smcogs_trees
            mibig_result: the MiBIG entries to link to, may be empty

        Returns:
            the description as a string with all non-printable characters
            removed
    """
    blastp_url = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&"
                  "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&"
                  "LINK_LOC=protein&PAGE_TYPE=BlastSearch") % feature.translation
    genomic_context_url = ("http://www.ncbi.nlm.nih.gov/projects/sviewer/?"
                           "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"
                           "id=%s&from=%s&to=%s")

    sections = []  # type: List[str]

    # the fallback has to be chosen before formatting: '%' binds tighter than
    # 'or', so without the parentheses the name would never be used
    title = feature.product or feature.get_name()
    sections.append('<span class="svgene-tooltip-bold">%s</span><br>\n' % title)
    sections.append('Locus-tag: %s; Protein-ID: %s<br>\n' % (feature.locus_tag,
                                                             feature.protein_id))

    ec_numbers = feature.get_qualifier('EC_number')
    if ec_numbers:
        sections.append("EC-number(s): %s<br>\n" % ",".join(ec_numbers))

    for gene_function in feature.gene_functions:
        sections.append("%s<br>\n" % str(gene_function))

    sections.append("Location: %d - %d<br><br>\n" % (
        feature.location.start + 1,  # 1-indexed
        feature.location.end))

    if mibig_result:
        cluster_number = feature.cluster.get_cluster_number()
        mibig_homology_file = os.path.join(
            options.output_dir, "knownclusterblast",
            "cluster%d" % cluster_number,
            feature.get_accession() + '_mibig_hits.html')
        generate_html_table(mibig_homology_file, mibig_result)
        # link relative to the output directory
        mibig_path = mibig_homology_file[len(options.output_dir) + 1:]
        sections.append('<br><a href="%s" target="_new">MiBIG Hits</a><br>\n' % mibig_path)

    if type_ == 'transport':
        url = ("http://blast.jcvi.org/er-blast/index.cgi?project=transporter;"
               "program=blastp;database=pub/transporter.pep;"
               "sequence=sequence%%0A%s") % feature.translation
        # the anchor is closed so that following content isn't part of the link
        sections.append('<a href="%s" target="_new">TransportDB BLAST on this gene</a><br>' % url)

    key = record.id + "_" + feature.get_name()
    if key in searchgtr_links:
        url = searchgtr_links[key]
        # the anchor is closed so that following content isn't part of the link
        sections.append('<a href="%s" target="_new">SEARCHGTr on this gene</a><br>\n' % url)

    sections.append('<a href="%s" target="_new">NCBI BlastP on this gene</a><br>\n' % blastp_url)

    # the context window is clamped to the bounds of the record
    context = genomic_context_url % (
        record.id,
        max(feature.location.start - 9999, 0),
        min(feature.location.end + 10000, len(record)))
    sections.append("""<a href="%s" target="_new">View genomic context</a><br>\n""" % context)

    if options.smcogs_trees:
        for note in feature.notes:  # TODO find a better way to store image urls
            if note.startswith('smCOG tree PNG image:'):
                # split only once, after the prefix, so a url that itself
                # contains a colon is kept whole
                url = note.split(':', 1)[1]
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>\n'
                sections.append(entry % url)
                break

    sections.append(generate_asf_tooltip_section(record, feature))

    go_notes = generate_pfam2go_tooltip(record, feature)
    if go_notes:
        sections.append('<br><span class="bold">Gene Ontology terms for PFAM domains:</span><br>\n'
                        '%s<br><br>\n' % "<br>".join(go_notes))

    clipboard_fragment = """<a href="javascript:copyToClipboard('%s')">Copy to clipboard</a>"""
    sections.append("AA sequence: %s<br>\n" % (clipboard_fragment % feature.translation))
    sections.append("Nucleotide sequence: %s<br>\n" % (clipboard_fragment % feature.extract(record.seq)))

    # drop everything that isn't a printable character
    printable = set(string.printable)
    return "".join(char for char in "".join(sections) if char in printable)