def create_feature_from_location(record: Record, location: FeatureLocation,
                                 counter: int = 1, label: Optional[str] = None) -> CDSFeature:
    """ Builds a new CDSFeature spanning the given location.

        Arguments:
            record: the Record used to generate the translation for the location
            location: the FeatureLocation the new CDSFeature will cover
            counter: the number formatted into the default label 'allorfNNN',
                     ignored when a label is supplied
            label: the value to use as locus tag, protein id and gene name

        Returns:
            the new CDSFeature, flagged as created by antiSMASH
    """
    name = label if label is not None else 'allorf%03d' % counter
    translation = str(record.get_aa_translation_from_location(location))
    new_cds = CDSFeature(location, translation, locus_tag=name, protein_id=name, gene=name)
    new_cds.created_by_antismash = True
    return new_cds
def run_lassopred(record: Record, cluster: Cluster, query: CDSFeature) -> Optional[LassopeptideMotif]:
    """ Predicts whether a CDS encodes a lasso peptide precursor and, if it does,
        converts the prediction into a motif.

        Arguments:
            record: the Record containing the query
            cluster: the Cluster being analysed
            query: the CDSFeature to analyse

        Returns:
            the motif built by result_vec_to_motif, or None if the query is not
            judged to be a precursor peptide
    """
    # Run checks to determine whether an ORF encodes a precursor peptide
    result = determine_precursor_peptide_candidate(record, cluster, query, query.translation)
    if result is None:
        return None

    # prediction of cleavage in C-terminal based on lasso's core sequence,
    # only the second half of the core is searched
    c_term_hmmer_profile = 'tail_cut.hmm'
    thresh_c_hit = -7.5

    aux = result.core[(len(result.core) // 2):]
    core_a_fasta = ">%s\n%s" % (query.get_name(), aux)

    profile = path.get_full_path(__file__, 'data', c_term_hmmer_profile)
    hmmer_res = subprocessing.run_hmmpfam2(profile, core_a_fasta)

    # every hit above the threshold overwrites the cut, so the last one wins
    for res in hmmer_res:
        for hits in res:
            for seq in hits:
                if seq.bitscore > thresh_c_hit:
                    result.c_cut = aux[seq.query_start + 1:]

    # No further None check is needed here: result was validated above and is
    # never rebound. A missing C-terminal cut is not an error, as
    # result_vec_to_motif only trims the core when a tail is present.
    query.gene_functions.add(GeneFunction.ADDITIONAL, "lassopeptides",
                             "predicted lassopeptide")

    return result_vec_to_motif(query, result)
def get_description(record: Record, feature: CDSFeature, type_: str,
                    options: ConfigType, mibig_result: List[clusterblast.results.MibigEntry]) -> str:
    """ Renders the description of a CDS feature using the 'cds_detail.html'
        template.

        Arguments:
            record: the Record containing the feature
            feature: the CDSFeature to describe
            type_: the type of the feature, 'transport' adds a transporter BLAST link
            options: the config, read for output_dir and smcog_trees
            mibig_result: MiBIG entries for the feature, if any, which are written
                          to a separate HTML table that is then linked

        Returns:
            the rendered template as a string
    """
    translation = feature.translation
    blastp_link = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&"
                   "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&"
                   "LINK_LOC=protein&PAGE_TYPE=BlastSearch") % translation
    urls = {
        "blastp": blastp_link,
        "mibig": "",
        "transport": "",
        "smcog_tree": "",
    }

    if mibig_result:
        assert feature.region
        region_dir = "region%d" % feature.region.get_region_number()
        hits_filename = feature.get_accession() + '_mibig_hits.html'
        mibig_homology_file = os.path.join(options.output_dir, "knownclusterblast",
                                           region_dir, hits_filename)
        generate_html_table(mibig_homology_file, mibig_result)
        # the link is relative to the output directory
        urls["mibig"] = mibig_homology_file[len(options.output_dir) + 1:]

    if type_ == 'transport':
        urls["transport"] = ("http://blast.jcvi.org/er-blast/index.cgi?project=transporter;"
                             "program=blastp;database=pub/transporter.pep;"
                             "sequence=sequence%%0A%s") % translation

    # genomic context covers roughly 10kb either side, clamped to the record
    context_start = max(feature.location.start - 9999, 0)
    context_end = min(feature.location.end + 10000, len(record))
    urls["context"] = ("http://www.ncbi.nlm.nih.gov/projects/sviewer/?"
                       "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"
                       "id=%s&from=%s&to=%s") % (record.id, context_start, context_end)

    if options.smcog_trees:
        # TODO find a better way to store image urls
        tree_note = next((note for note in feature.notes
                          if note.startswith('smCOG tree PNG image:')), None)
        if tree_note is not None:
            urls["smcog_tree"] = tree_note.split(':')[-1]

    asf_notes = generate_asf_tooltip_section(record, feature)
    go_notes = generate_pfam2go_tooltip(record, feature)
    pfam_notes = generate_pfam_tooltip(record, feature)
    tigr_notes = generate_tigr_tooltip(record, feature)

    urls["searchgtr"] = searchgtr_links.get("{}_{}".format(record.id, feature.get_name()), "")

    template = html_renderer.FileTemplate(path.get_full_path(__file__, "templates", "cds_detail.html"))

    ec_number_qual = feature.get_qualifier("EC_number")
    ec_numbers = ",".join(ec_number_qual) if isinstance(ec_number_qual, list) else ""

    return template.render(feature=feature, ec_numbers=ec_numbers, go_notes=go_notes,
                           asf_notes=asf_notes, pfam_notes=pfam_notes, tigr_notes=tigr_notes,
                           record=record, urls=urls)
def run_lanthi_on_genes(record: Record, focus: CDSFeature, cluster: Protocluster,
                        genes: List[CDSFeature], results: LanthiResults) -> None:
    """ Runs the lanthipeptide prediction on each candidate gene around a focus
        gene and stores any precursors found in the results object.

        Arguments:
            record: the Record instance containing the genes
            focus: a core lanthipeptide gene
            cluster: the Protocluster being analysed
            genes: a list of candidate precursor genes
            results: a LanthiResults object to update

        Returns:
            None
    """
    if not genes:
        return

    cluster_genes = cluster.cds_children
    domains = get_detected_domains(cluster_genes)
    neighbours = find_neighbours_in_range(focus, cluster_genes)

    # tailoring enzymes near the focus gene decide which modifications are set
    has_flavoprotein = contains_feature_with_single_domain(neighbours, {"Flavoprotein"})
    has_halogenase = contains_feature_with_single_domain(neighbours, {"Trp_halogenase"})
    has_oxygenase = contains_feature_with_single_domain(neighbours, {"p450"})
    has_dehydrogenase = contains_feature_with_single_domain(neighbours, {"adh_short", "adh_short_C2"})

    lant_class = predict_class_from_genes(focus, cluster_genes)
    if not lant_class:
        return

    for candidate in genes:
        prediction = run_lanthipred(record, candidate, lant_class, domains)
        if prediction is not None:
            prediction.aminovinyl_group = has_flavoprotein
            prediction.chlorinated = has_halogenase
            prediction.oxygenated = has_oxygenase
            prediction.lactonated = has_dehydrogenase and prediction.core.startswith('S')
            motif = result_vec_to_feature(candidate, prediction)
            results.motifs_by_locus[focus.get_name()].append(motif)
            results.clusters[cluster.get_protocluster_number()].add(focus.get_name())
            # track new CDSFeatures if found with all_orfs
            if candidate.region is None:
                results.new_cds_features.add(candidate)
def from_feature(feature: secmet.CDSFeature) -> 'Gene':  # string because forward reference
    """ Builds a Gene from the location, accession and product of a CDS feature """
    location = feature.location
    return Gene(int(location.start), int(location.end), location.strand,
                feature.get_accession(), product=feature.product)
def find_tail(query: secmet.CDSFeature, core: str) -> str:
    """ Searches the end of a prepeptide core for a tail.

        Arguments:
            query: the CDS feature being checked, used to name the search sequence
            core: the core of the prepeptide as a string

        Returns:
            the tail as a string, or an empty string if none was found
    """
    # prediction of cleavage in C-terminal based on thiopeptide's core sequence
    # if last core residue != S or T or C > great chance of a tail cut
    if core[-1] in "SCT":
        return ''

    min_bitscore = -9
    # only the last ten residues of the core are searched
    segment = core[-10:]
    fasta = ">%s\n%s" % (query.get_name(), segment)
    profile = path.get_full_path(__file__, "data", 'thio_tail.hmm')
    hmmer_results = subprocessing.run_hmmpfam2(profile, fasta)

    found = ''
    all_hsps = (hsp for result in hmmer_results for hit in result for hsp in hit)
    for hsp in all_hsps:
        # later passing hits replace earlier ones
        if hsp.bitscore > min_bitscore:
            found = segment[hsp.query_end - 1:]
    return found
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences.

        The record is written as FASTA into a temporary directory, prodigal is
        run on it, and each gene line of the result file is added to the record
        as a CDSFeature.

        Arguments:
            record: the Record to find genes in and add CDS features to
            options: the config, read for the optional prodigal basedir and for
                     genefinding_tool

        Returns:
            None

        Raises:
            RuntimeError: if prodigal's stderr contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    found = 0
    with TemporaryDirectory(change=True):
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        # the context manager ensures the result file is closed again
        with open(result_file, 'r') as result_handle:
            for line in result_handle:
                # only lines starting with '>' describe genes
                if not line.startswith('>'):
                    continue
                try:
                    # unpacking is inside the try so that a line with the wrong
                    # number of fields is reported and skipped like any other
                    # malformatted line, instead of aborting the run
                    gene_id, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r', line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                if start > end:
                    strand = -1
                    start, end = end, start

                # prodigal coordinates are converted to a 0-based start
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc, locus_tag='ctg%s_%s' % (record.record_index, gene_id),
                                     translation=translation, translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
def determine_precursor_peptide_candidate(cluster: secmet.Protocluster, query: secmet.CDSFeature,
                                          query_sequence: str, domains: Dict[str, int]
                                          ) -> Optional[secmet.Prepeptide]:
    """ Splits a candidate sequence into leader and core and builds a
        sactipeptide Prepeptide if RODEO judges the split valid.

        Arguments:
            cluster: the Protocluster containing the query
            query: the CDS feature being checked
            query_sequence: the amino acid sequence of the query
            domains: the domains passed on to RODEO

        Returns:
            a Prepeptide if the candidate is valid, otherwise None
    """
    # only sequences of 20 to 100 AA (inclusive) are considered
    length = len(query_sequence)
    if length < 20 or length > 100:
        return None

    # the first quarter is taken as the leader
    split = length // 4  # TODO: this seems very arbitrary
    leader, core = query_sequence[:split], query_sequence[split:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    valid, score = run_rodeo(cluster, query, leader, core, domains)
    if valid:
        return secmet.Prepeptide(query.location, "sactipeptide", core, query.get_name(),
                                 tool="sactipeptides", leader=leader, score=score)
    return None
def result_vec_to_motif(query: CDSFeature, result: Lassopeptide) -> Prepeptide:
    """ Builds a Prepeptide feature from a Lassopeptide prediction.

        Arguments:
            query: the CDSFeature the prediction was made for
            result: the Lassopeptide prediction

        Returns:
            a Prepeptide carrying a LassoQualifier as detailed information
    """
    tail = result.c_cut
    # when a tail was cut, it is removed from the end of the reported core
    core = result.core[:-len(tail)] if tail else result.core

    prepeptide = Prepeptide(query.location, "lassopeptide", core, query.get_name(),
                            "lassopeptides", peptide_subclass=result.lasso_class,
                            score=result.score, monoisotopic_mass=result.monoisotopic_mass,
                            molecular_weight=result.molecular_weight,
                            leader=result.leader, tail=tail)
    prepeptide.detailed_information = LassoQualifier(result.rodeo_score, result.number_bridges,
                                                     result.macrolactam, result.cut_mass,
                                                     result.cut_weight)
    return prepeptide
def _parse_domain(record: Record, domain: NRPSPKSQualifier.Domain,
                  feature: CDSFeature) -> JSONDomain:
    """ Builds a JSONDomain for a NRPS/PKS domain, including external search links.

        Arguments:
            record: the Record containing the domain
            domain: the NRPSPKSQualifier.Domain in question
            feature: the CDSFeature that the domain belongs to

        Returns:
            a populated JSONDomain instance
    """
    predictions = list(domain.predictions.items())

    domainseq = str(feature.translation)[domain.start:domain.end]

    # NaPDoS links are only created for KS and C domains
    if domain.name == "PKS_KS":
        napdos_kind = "KS"
    elif "Condensation" in domain.name:
        napdos_kind = "C"
    else:
        napdos_kind = ""

    napdoslink = ""
    if napdos_kind:
        napdos_template = ("http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
                           "query_type=aa&ref_seq_file=all_{0}_public_12062011.faa"
                           "&Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
        napdoslink = napdos_template.format(napdos_kind, domainseq)

    blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                      "&PROGRAM=blastp&BLAST_PROGRAMS=blastp"
                      "&QUERY={}"
                      "&LINK_LOC=protein&PAGE_TYPE=BlastSearch")
    blastlink = blast_template.format(domainseq)

    dna_sequence = feature.extract(record.seq)
    abbreviation = _get_domain_abbreviation(domain.name)
    domain_class = _get_domain_class(abbreviation, domain.name)
    return JSONDomain(domain, predictions, napdoslink, blastlink, domainseq,
                      dna_sequence, abbreviation, domain_class)
def determine_precursor_peptide_candidate(record: Record, cluster: Cluster, query: CDSFeature,
                                          query_sequence: str) -> Optional[Lassopeptide]:
    """ Finds a cleavage site in a candidate sequence and builds a Lassopeptide
        if RODEO judges the resulting leader and core valid.

        Arguments:
            record: the Record containing the query
            cluster: the Cluster being analysed
            query: the CDSFeature being checked
            query_sequence: the amino acid sequence of the query

        Returns:
            a Lassopeptide if the candidate is valid, otherwise None
    """
    # only sequences of 20 to 100 AA (inclusive) are considered
    if not 20 <= len(query_sequence) <= 100:
        return None

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), query_sequence)

    # Run sequence against pHMM; if positive, parse into a vector containing START, END and SCORE
    start, end, score = run_cleavage_site_phmm(fasta, 'precursor_2637.hmm', -20.00)

    if score is None:
        # If no pHMM hit, try regular expression
        start, end, score = run_cleavage_site_regex(fasta)
        regex_unusable = score is None or end > len(query_sequence) - 3
        if regex_unusable:
            # last resort is a fixed split a little before the midpoint
            start = 0
            end = len(query_sequence) // 2 - 5
            score = 0.

    # Determine the leader and core peptide
    leader = query_sequence[:end]
    core = query_sequence[end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    valid, rodeo_score = run_rodeo(record, cluster, query, leader, core)
    if not valid:
        return None

    return Lassopeptide(start, end + 1, score, rodeo_score, leader, core)
def result_vec_to_feature(orig_feature: secmet.CDSFeature, res_vec: Thiopeptide) -> ThiopeptideMotif:
    """ Builds a ThiopeptideMotif from a Thiopeptide prediction.

        Note that the core of res_vec is trimmed in place when a C-terminal cut
        is present.

        Arguments:
            orig_feature: the original CDS feature that the Motif will attach to
            res_vec: a Thiopeptide object containing results

        Returns:
            a ThiopeptideMotif
    """
    tail = res_vec.c_cut
    if tail:
        # must happen before any of the weights are read from res_vec
        res_vec.core = res_vec.core[:-len(tail)]

    # mature weights are not reported for Type III
    if res_vec.thio_type == "Type III":
        mature_weights = []  # type: List[float]
    else:
        mature_weights = res_vec.mature_alt_weights

    return ThiopeptideMotif(orig_feature.location, res_vec.core, res_vec.leader,
                            orig_feature.get_name(), res_vec.monoisotopic_mass,
                            res_vec.molecular_weight, res_vec.alternative_weights,
                            res_vec.thio_type, res_vec.score, res_vec.rodeo_score,
                            res_vec.macrocycle, res_vec.c_cut, res_vec.mature_features,
                            mature_weights, res_vec.amidation)
def smcog_tree_analysis(cds: CDSFeature, input_number: int, smcog: str, output_dir: str) -> None:
    """ Aligns a single CDS against an smCOG, trims the alignment and draws a
        tree from it.

        Arguments:
            cds: the CDSFeature to analyse
            input_number: the number used to name the intermediate files
            smcog: the smCOG to align against
            output_dir: the directory passed on to the tree drawing step

        Returns:
            None
    """
    name = cds.get_name()
    # create input.fasta file with single query sequence to be used as input for MSA
    query_file = "".join(["input", str(input_number), ".fasta"])
    fasta.write_fasta([name], [cds.translation], query_file)
    alignment = alignsmcogs(smcog, input_number)
    # Generate trimmed alignment
    trim_alignment(input_number, alignment)
    # Draw phylogenetic tree
    draw_tree(input_number, output_dir, name)
def run_thiopred(query: secmet.CDSFeature, thio_type: str, domains: Set[str]) -> Optional[Thiopeptide]:
    """ Checks whether a CDS feature contains a thiopeptide precursor.

        Arguments:
            query: the CDS feature to analyse
            thio_type: the suspected type of the thiopeptide
            domains: the set of domains found within the cluster containing the query

        Returns:
            A Thiopeptide instance if a precursor is found, otherwise None
    """
    # Run checks to determine whether an ORF encodes a precursor peptide
    candidate = determine_precursor_peptide_candidate(query, domains)
    if candidate is None:
        return None

    # Determine thiopeptide type
    candidate.thio_type = thio_type

    # leader cleavage "validation"
    profile = path.get_full_path(__file__, "data", 'thiopep2.hmm')
    fasta = ">%s\n%s" % (query.get_name(), candidate.core)
    hmmer_results = subprocessing.run_hmmpfam2(profile, fasta)

    min_bitscore = -2
    # at least one hit has to pass the threshold for the core to be kept
    if not any(hsp.bitscore > min_bitscore
               for result in hmmer_results for hit in result for hsp in hit):
        return None

    # additional filter(s) for peptide prediction
    core_match = re.search("[ISTV][SACNTW][STNCVG][ATCSGM][SVTFC][CGSTEAV][TCGVY].*", candidate.core)
    if not core_match:
        return None
    trimmed_core = core_match.group()

    if 10 < len(trimmed_core) < 20:
        # everything in front of the match moves from the core to the leader
        shift = len(candidate.core) - len(trimmed_core)
        candidate.leader = candidate.leader + candidate.core[:shift]
        candidate.core = trimmed_core

    candidate.c_cut = find_tail(query, candidate.core)

    query.gene_functions.add(secmet.GeneFunction.ADDITIONAL, "thiopeptides",
                             "predicted thiopeptide")
    return candidate
def run_lanthipred(record: Record, query: CDSFeature, lant_class: str,
                   domains: List[str]) -> Optional[Lanthipeptide]:
    """ Predicts whether a CDS is a lanthipeptide precursor, using the class to
        choose the prediction method.

        Arguments:
            record: the parent Record of the feature
            query: the CDSFeature to analyse
            lant_class: a string representing the class
            domains: a list of domain names in the current cluster

        Returns:
            a Lanthipeptide if the query is predicted to be one, otherwise None
    """
    hmmer_profiles = {
        'Class-I': 'data/class1.hmm',
        'Class-II': 'data/class2.hmm',
        'Class-III': 'data/class3.hmm',
    }
    sequence = query.translation

    if lant_class not in ("Class-II", "Class-III"):
        # everything else goes through the full candidate check
        result = determine_precursor_peptide_candidate(record, query, domains,
                                                       hmmer_profiles[lant_class], lant_class)
        if result is None:
            return None
    else:
        profile = path.get_full_path(__file__, hmmer_profiles[lant_class])
        fasta = ">%s\n%s" % (query.get_name(), sequence)
        cleavage = predict_cleavage_site(profile, fasta)

        if cleavage is None:
            return None
        if cleavage.score < THRESH_DICT[lant_class]:
            return None
        # if the cleavage results in no core, that's not valid
        if cleavage.end == len(sequence):
            return None

        cleavage.lantype = lant_class
        result = Lanthipeptide(cleavage, 0, sequence[:cleavage.end], sequence[cleavage.end:])

    # a prediction without any lan bridges is discarded
    if result.number_of_lan_bridges == 0:
        return None

    query.gene_functions.add(GeneFunction.ADDITIONAL, "lanthipeptides",
                             "predicted lanthipeptide")
    return result
def create_feature_from_location(record: Record, location: FeatureLocation,
                                 label: Optional[str] = None) -> CDSFeature:
    """ Builds a new CDSFeature spanning the given location.

        Arguments:
            record: the Record used to generate the translation for the location,
                    its length also sets the zero padding of the default label
            location: the FeatureLocation the new CDSFeature will cover
            label: the value to use as locus tag, protein id and gene name,
                   defaults to 'allorf_<start>_<end>' with a 1-indexed start

        Returns:
            the new CDSFeature, flagged as created by antiSMASH
    """
    name = label
    if name is None:
        # pad coordinates to the width of the largest possible coordinate
        width = len(str(len(record)))
        name = 'allorf_{start:0{digits}}_{end:0{digits}}'.format(
            digits=width, start=(location.start + 1), end=location.end)
    translation = str(record.get_aa_translation_from_location(location))
    new_cds = CDSFeature(location, translation, locus_tag=name, protein_id=name, gene=name)
    new_cds.created_by_antismash = True
    return new_cds
def determine_precursor_peptide_candidate(record: secmet.Record, query: secmet.CDSFeature,
                                          query_sequence: str, domains: List[str],
                                          hmmer_profile: str) -> Optional[Lanthipeptide]:
    """ Finds a cleavage site in a candidate sequence and builds a Lanthipeptide
        if the RODEO score is high enough, only valid for Class-I lanthipeptides.

        Arguments:
            record: the Record containing the query
            query: the CDS feature being checked
            query_sequence: the amino acid sequence of the query
            domains: the domain names passed on to RODEO
            hmmer_profile: the profile used for the pHMM cleavage site search

        Returns:
            a Lanthipeptide if the candidate is valid, otherwise None
    """
    # only sequences of 20 to 200 AA (inclusive) are considered
    if not 20 <= len(query_sequence) <= 200:
        return None

    # the cleavage site has to leave at least 8 residues for the core
    latest_cleavage = len(query_sequence) - 8

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), query_sequence)

    # Run sequence against pHMM; if positive, parse into a vector containing START, END and SCORE
    phmm_hit = run_cleavage_site_phmm(fasta, hmmer_profile, THRESH_DICT["Class-I"])

    if phmm_hit is None or phmm_hit.end > latest_cleavage:
        # If no pHMM hit, try regular expression
        start, end, score = run_cleavage_site_regex(fasta)
        if score is None or end > latest_cleavage:
            # abort, since RODEO will predict duplicates based only on cluster
            # attributes
            return None
        lanthi_type = "lanthipeptide"
    else:
        start, end, score = phmm_hit.start, phmm_hit.end, phmm_hit.score
        lanthi_type = phmm_hit.lantype

    # if the cleavage results in no core, that's not valid
    if end == len(query_sequence):
        return None

    # Determine the leader and core peptide
    leader, core = query_sequence[:end], query_sequence[end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    rodeo_score = run_rodeo(record, query, leader, core, domains)
    if rodeo_score < 14:
        return None

    prediction = Lanthipeptide(start, end, score, rodeo_score, lanthi_type)
    prediction.leader = leader
    prediction.core = core
    return prediction
def determine_precursor_peptide_candidate(query: secmet.CDSFeature, domains: Set[str]
                                          ) -> Optional[Thiopeptide]:
    """ Identify precursor peptide candidates and split into two

        Arguments:
            query: the CDS feature to check for motifs
            domains: the set of domain ids found in the cluster

        Returns:
            a Thiopeptide instance if a valid precursor found, otherwise None
    """
    query_sequence = query.translation

    # Skip sequences not in the size range desired (41 to 199 AA)
    if not 40 < len(query_sequence) < 200:
        return None

    # Create FASTA sequence for feature under study
    thio_a_fasta = ">%s\n%s" % (query.get_name(), query_sequence)

    # Run sequence against pHMM; if positive, parse into END and SCORE
    end, score = run_cleavage_site_phmm(thio_a_fasta, 'thio_cleave.hmm', -3.00)

    # If no pHMM hit, try regular expression
    if end is None:
        score = 0.
        end = run_cleavage_site_regex(query_sequence)
        if end is None or end > len(query_sequence) - 5:
            # neither method gave a usable site, so fall back to a fixed
            # position relative to the sequence length
            end = int(len(query_sequence) * 0.60) - 14

    # ensure there's a valid value for end before trying to use it
    assert isinstance(end, int) and end > 0

    # Determine the leader and core peptide
    leader = query_sequence[:end]
    core = query_sequence[end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    rodeo_result = run_rodeo(leader, core, domains)
    if not rodeo_result[0]:
        # a rejected candidate is not a precursor, so report that with None as
        # documented, rather than with a Thiopeptide lacking leader and core
        return None

    thiopeptide = Thiopeptide(end + 1, score, rodeo_result[1])
    thiopeptide.leader = leader
    thiopeptide.core = core
    return thiopeptide
def determine_precursor_peptide_candidate(record: Record, query: CDSFeature, domains: List[str],
                                          hmmer_profile: str, lant_class: str
                                          ) -> Optional[Lanthipeptide]:
    """ Finds a cleavage site in the translation of a CDS and builds a
        Lanthipeptide if the RODEO score is high enough, only valid for Class-I
        lanthipeptides.

        Arguments:
            record: the Record containing the query
            query: the CDSFeature being checked
            domains: the domain names passed on to RODEO
            hmmer_profile: the profile used for the pHMM cleavage site search
            lant_class: the class, used for the threshold and the regex search

        Returns:
            a Lanthipeptide if the candidate is valid, otherwise None
    """
    sequence = query.translation

    # only sequences of 20 to 200 AA (inclusive) are considered
    if not 20 <= len(sequence) <= 200:
        return None

    # the cleavage site has to leave at least 8 residues for the core
    latest_cleavage = len(sequence) - 8

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), sequence)

    # Run sequence against pHMM; if positive, parse into a vector containing START, END and SCORE
    cleavage = run_cleavage_site_phmm(fasta, hmmer_profile, THRESH_DICT[lant_class])

    if cleavage is None or cleavage.end > latest_cleavage:
        # If no pHMM hit, try regular expression
        cleavage = run_cleavage_site_regex(fasta, lant_class)
        if cleavage is None or cleavage.end > latest_cleavage:
            # still no good, so abort, since RODEO will predict duplicates based
            # only on cluster attributes
            return None

    split = cleavage.end
    # if the cleavage results in no core, that's not valid
    if split == len(sequence):
        return None

    # Determine the leader and core peptide
    leader, core = sequence[:split], sequence[split:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    rodeo_score = run_rodeo(record, query, leader, core, domains)
    if rodeo_score < 14:
        return None

    return Lanthipeptide(cleavage, rodeo_score, leader, core)
def generate_motif_features(record: Record, feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Builds a CDSMotif feature for each HMMResult found within a CDS.

        Arguments:
            record: the Record containing the feature, used for the motif translations
            feature: the CDSFeature the motifs were found in
            motifs: the HMMResults to convert, with positions in amino acids

        Returns:
            a list of CDSMotifs, one per HMMResult and in the same order
    """
    # use a locus tag if one exists
    fallback_name = feature.get_name()
    locus_tag = feature.locus_tag or fallback_name

    # grab the translation table if it's there
    transl_table = feature.transl_table or 1

    cds_start = feature.location.start
    cds_end = feature.location.end
    forward = feature.location.strand == 1

    motif_features = []
    # numbering is user facing, so 1-indexed
    for index, hmm_result in enumerate(motifs, 1):
        # amino acid positions become nucleotide offsets from the CDS start,
        # which is location.end for anything not on the forward strand
        offset_near = 3 * hmm_result.query_start
        offset_far = 3 * hmm_result.query_end
        if forward:
            start, end = cds_start + offset_near, cds_start + offset_far
        else:
            start, end = cds_end - offset_far, cds_end - offset_near

        cds_motif = CDSMotif(FeatureLocation(start, end, strand=feature.strand))
        cds_motif.label = hmm_result.hit_id
        cds_motif.motif = hmm_result.hit_id  # TODO: why both label AND motif?
        cds_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, index)
        cds_motif.evalue = hmm_result.evalue
        cds_motif.score = hmm_result.bitscore
        cds_motif.tool = "pksnrpsmotif"
        cds_motif.detection = "hmmscan"
        cds_motif.database = "abmotifs"
        cds_motif.locus_tag = locus_tag

        cds_motif.translation = str(cds_motif.extract(record.seq).translate(table=transl_table))
        # TODO move to CDSMotif
        cds_motif.notes.append("NRPS/PKS Motif: %s (e-value: %s, bit-score: %s)" % (
                               hmm_result.hit_id, hmm_result.evalue, hmm_result.bitscore))

        motif_features.append(cds_motif)
    return motif_features
def result_vec_to_feature(orig_feature: CDSFeature, res_vec: Lanthipeptide) -> LanthipeptideMotif:
    """ Builds a LanthipeptideMotif from a Lanthipeptide prediction.

        Arguments:
            orig_feature: the CDSFeature the lanthipeptide was found in
            res_vec: the Lanthipeptide instance that was calculated

        Returns:
            a LanthipeptideMotif instance
    """
    # the motif shares the location and name of the CDS it was found in
    location = orig_feature.location
    peptide = (res_vec.core, res_vec.leader)
    name = orig_feature.get_name()
    weights = (res_vec.monoisotopic_mass, res_vec.molecular_weight, res_vec.alternative_weights)
    prediction = (res_vec.number_of_lan_bridges, res_vec.lantype, res_vec.score,
                  res_vec.rodeo_score)
    modifications = (res_vec.aminovinyl_group, res_vec.chlorinated, res_vec.oxygenated,
                     res_vec.lactonated)
    return LanthipeptideMotif(location, *peptide, name, *weights, *prediction, *modifications)
def result_vec_to_feature(orig_feature: CDSFeature, res_vec: Lanthipeptide) -> Prepeptide:
    """ Builds a Prepeptide feature from a Lanthipeptide prediction.

        Arguments:
            orig_feature: the CDSFeature the lanthipeptide was found in
            res_vec: the Lanthipeptide instance that was calculated

        Returns:
            a Prepeptide carrying a LanthiQualifier as detailed information
    """
    assert res_vec.leader is not None
    prepeptide = Prepeptide(
        orig_feature.location,
        "lanthipeptide",
        res_vec.core,
        orig_feature.get_name(),
        "lanthipeptides",
        res_vec.lantype,
        res_vec.score,
        res_vec.monoisotopic_mass,
        res_vec.molecular_weight,
        res_vec.alternative_weights,
        res_vec.leader,
    )
    prepeptide.detailed_information = LanthiQualifier(
        res_vec.number_of_lan_bridges,
        res_vec.rodeo_score,
        res_vec.aminovinyl_group,
        res_vec.chlorinated,
        res_vec.oxygenated,
        res_vec.lactonated,
    )
    return prepeptide
def result_vec_to_motif(query: CDSFeature, result: Lassopeptide) -> LassopeptideMotif:
    """ Builds a LassopeptideMotif from a Lassopeptide prediction.

        Arguments:
            query: the CDSFeature the prediction was made for
            result: the Lassopeptide prediction

        Returns:
            a LassopeptideMotif with the location and name of the query
    """
    tail = result.c_cut
    # when a tail was cut, it is removed from the end of the reported core
    core = result.core[:-len(tail)] if tail else result.core

    return LassopeptideMotif(
        query.location,
        result.leader,
        core,
        tail,
        query.get_name(),
        result.monoisotopic_mass,
        result.molecular_weight,
        result.cut_mass,
        result.cut_weight,
        result.number_bridges,
        result.lasso_class,
        result.score,
        result.rodeo_score,
        result.macrolactam,
    )
def parse_domain(self, domain: NRPSPKSQualifier.Domain, feature: CDSFeature) -> JSONDomain:
    """ Builds a JSONDomain for a NRPS/PKS domain, including external search links.

        Arguments:
            domain: the NRPSPKSQualifier.Domain in question
            feature: the CDSFeature that the domain belongs to

        Returns:
            a populated JSONDomain instance
    """
    predictions = parse_substrate_predictions(domain.predictions)

    domainseq = str(feature.translation)[domain.start:domain.end]

    # NaPDoS links are only created for KS and C domains
    if domain.name == "PKS_KS":
        napdos_kind = "KS"
    elif "Condensation" in domain.name:
        napdos_kind = "C"
    else:
        napdos_kind = ""

    napdoslink = ""
    if napdos_kind:
        napdos_template = ("http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
                           "query_type=aa&ref_seq_file=all_{0}_public_12062011.faa"
                           "&Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
        napdoslink = napdos_template.format(napdos_kind, domainseq)

    blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                      "&PROGRAM=blastp&BLAST_PROGRAMS=blastp"
                      "&QUERY={}"
                      "&LINK_LOC=protein&PAGE_TYPE=BlastSearch")
    blastlink = blast_template.format(domainseq)

    dna_sequence = feature.extract(self.record.seq_record.seq)

    return JSONDomain(domain, predictions, napdoslink, blastlink, domainseq, dna_sequence)
def result_vec_to_feature(orig_feature: secmet.CDSFeature, res_vec: Thiopeptide) -> secmet.Prepeptide:
    """ Builds a Prepeptide from a Thiopeptide prediction.

        Note that the core of res_vec is trimmed in place when a C-terminal cut
        is present.

        Arguments:
            orig_feature: the original CDS feature that the Motif will attach to
            res_vec: a Thiopeptide object containing results

        Returns:
            a Prepeptide carrying a ThioQualifier as detailed information
    """
    tail = res_vec.c_cut
    if tail:
        # must happen before any of the weights are read from res_vec
        res_vec.core = res_vec.core[:-len(tail)]

    # mature weights are not reported for Type III
    mature_weights: List[float] = []
    if res_vec.thio_type != "Type III":
        mature_weights = res_vec.mature_alt_weights

    prepeptide = secmet.Prepeptide(
        orig_feature.location,
        "thiopeptide",
        res_vec.core,
        orig_feature.get_name(),
        "thiopeptides",
        res_vec.thio_type,
        res_vec.score,
        res_vec.monoisotopic_mass,
        res_vec.molecular_weight,
        res_vec.alternative_weights,
        leader=res_vec.leader,
        tail=res_vec.c_cut,
    )
    prepeptide.detailed_information = ThioQualifier(
        res_vec.rodeo_score,
        res_vec.amidation,
        res_vec.macrocycle,
        res_vec.mature_features,
        mature_weights,
    )
    return prepeptide
def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
    """ Builds a SecMetQualifier.Domain for each HSP recorded under the name of
        the given CDS, or an empty list if there are none
    """
    hsps = results_by_id.get(cds.get_name(), [])
    return [SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                   num_seeds_per_hmm[hsp.query_id], tool)
            for hsp in hsps]
def get_description(record: Record, feature: CDSFeature, type_: str,
                    options: ConfigType, mibig_result: List[clusterblast.results.MibigEntry]) -> str:
    """ Get the description text of a CDS feature as an HTML fragment.

        Arguments:
            record: the Record containing the feature
            feature: the CDSFeature to describe
            type_: the type of the feature, 'transport' adds a transporter BLAST link
            options: the config, read for output_dir and smcogs_trees
            mibig_result: MiBIG entries for the feature, if any, which are written
                          to a separate HTML table that is then linked

        Returns:
            the HTML as a string, with all non-printable characters removed
    """
    blastp_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&" \
                 "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&" \
                 "LINK_LOC=protein&PAGE_TYPE=BlastSearch" % feature.translation
    genomic_context_url = "http://www.ncbi.nlm.nih.gov/projects/sviewer/?" \
                          "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"\
                          "id=%s&from=%s&to=%s"

    # the parentheses matter: '%' binds tighter than 'or', so without them the
    # feature name would never be used when the product is missing
    title = feature.product or feature.get_name()
    template = '<span class="svgene-tooltip-bold">%s</span><br>\n' % title
    template += 'Locus-tag: %s; Protein-ID: %s<br>\n' % (feature.locus_tag, feature.protein_id)

    ec_numbers = feature.get_qualifier('EC_number')
    if ec_numbers:
        template += "EC-number(s): %s<br>\n" % ",".join(ec_numbers)

    for gene_function in feature.gene_functions:
        template += "%s<br>\n" % str(gene_function)

    template += "Location: %d - %d<br><br>\n" % (feature.location.start + 1,  # 1-indexed
                                                 feature.location.end)

    if mibig_result:
        cluster_number = feature.cluster.get_cluster_number()
        mibig_homology_file = os.path.join(options.output_dir, "knownclusterblast",
                                           "cluster%d" % cluster_number,
                                           feature.get_accession() + '_mibig_hits.html')
        generate_html_table(mibig_homology_file, mibig_result)
        # the link is relative to the output directory
        mibig_path = mibig_homology_file[len(options.output_dir) + 1:]
        template += '<br><a href="%s" target="_new">MiBIG Hits</a><br>\n' % mibig_path

    if type_ == 'transport':
        url = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;" \
              "program=blastp;database=pub/transporter.pep;" \
              "sequence=sequence%%0A%s" % feature.translation
        # the anchor is closed so that it doesn't swallow the following links
        template += '<a href="%s" target="_new">TransportDB BLAST on this gene</a><br>\n' % url

    key = record.id + "_" + feature.get_name()
    if key in searchgtr_links:
        url = searchgtr_links[key]
        # the anchor is closed so that it doesn't swallow the following links
        template += '<a href="%s" target="_new">SEARCHGTr on this gene</a><br>\n' % url

    template += '<a href="%s" target="_new">NCBI BlastP on this gene</a><br>\n' % blastp_url

    # genomic context covers roughly 10kb either side, clamped to the record
    context = genomic_context_url % (record.id,
                                     max(feature.location.start - 9999, 0),
                                     min(feature.location.end + 10000, len(record)))
    template += """<a href="%s" target="_new">View genomic context</a><br>\n""" % context

    if options.smcogs_trees:
        for note in feature.notes:  # TODO find a better way to store image urls
            if note.startswith('smCOG tree PNG image:'):
                url = note.split(':')[-1]
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>\n'
                template += entry % url
                break

    template += generate_asf_tooltip_section(record, feature)

    go_notes = generate_pfam2go_tooltip(record, feature)
    if go_notes:
        template += '<br><span class="bold">Gene Ontology terms for PFAM domains:</span><br>\n' \
                    '%s<br><br>\n' % "<br>".join(go_notes)

    clipboard_fragment = """<a href="javascript:copyToClipboard('%s')">Copy to clipboard</a>"""
    template += "AA sequence: %s<br>\n" % (clipboard_fragment % feature.translation)
    template += "Nucleotide sequence: %s<br>\n" % (clipboard_fragment % feature.extract(record.seq))

    return "".join(char for char in template if char in string.printable)
def acquire_rodeo_heuristics(record: Record, cluster: Cluster, query: CDSFeature,
                             leader: str, core: str) -> Tuple[int, List[Union[float, int]]]:
    """ Calculates the heuristic RODEO score of a precursor peptide candidate.

        Arguments:
            record: the Record containing the candidate, used for the Pfam
                    distance lookups and to extract the candidate's
                    nucleotide sequence
            cluster: the Cluster handed to the strand comparison against PF00733
            query: the CDSFeature of the candidate
            leader: the leader sequence of the candidate
            core: the core sequence of the candidate

        Returns:
            a tuple of
                the summed heuristic score, and
                a list containing the molecular weight reported for the core,
                the distance reported by utils.distance_to_pfam() for
                PF13471/PF00733/PF05402, and a 1 or 0 for each tabulated rule
                (in rule order)
    """
    score = 0

    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core, monoisotopic=True, ignore_invalid=False)
    tabs = [float(core_analysis.molecular_weight())]  # type: List[Union[float, int]]

    # Distance to any biosynthetic protein (E, B, C)
    distance = utils.distance_to_pfam(record, query, ['PF13471', 'PF00733', 'PF05402'])
    tabs.append(distance)
    # NOTE(review): the PF13471 check at the end treats -1 as "no hit"; if this
    # combined lookup can also be -1, the "< 500" and "< 150" rules fire for a
    # missing hit - confirm against utils.distance_to_pfam before changing

    cys_in_core = core.count("C")
    aromatics_in_core = sum(core.count(aa) for aa in "FWY")

    # each rule is (condition, score change), the order fixes the column order
    rules = [
        # Within 500 nucleotides of any biosynthetic protein (E, B, C) +1
        (distance < 500, 1),
        # Within 150 nucleotides of any biosynthetic protein (E, B, C) +1
        (distance < 150, 1),
        # Greater than 1000 nucleotides from every biosynthetic protein (E, B, C) -2
        (distance > 1000, -2),
        # Core region has 2 or 4 Cys residues +1
        (cys_in_core in (2, 4), 1),
        # Leader region is longer than core region +2
        (len(leader) > len(core), 2),
        # Core has 7 (Glu) or 8 (Glu/Asp) or 9 (Asp) membered ring possible +1
        ('E' in core[6:8] or 'D' in core[7:9], 1),
        # Leader region contains GxxxxxT +3
        (re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader), 3),
        # Core starts with G +2
        (core.startswith("G"), 2),
        # Peptide and lasso cyclase are on same strand +1
        (is_on_same_strand_as(cluster, query, 'PF00733'), 1),
        # Leader/core region length ratio between 0.5 and 2 (inclusive) +1,
        # an empty core has no ratio and never matches
        (bool(core) and 0.5 <= len(leader) / len(core) <= 2, 1),
        # Core starts with Cys and has an even number of Cys 0
        (core.startswith("C") and cys_in_core % 2 == 0, 0),
        # Core contains no Gly -4
        ("G" not in core, -4),
        # Core has at least one aromatic residue +1
        (aromatics_in_core >= 1, 1),
        # Core has at least 2 aromatic residues +2
        (aromatics_in_core >= 2, 2),
        # Core has odd number of Cys -2
        (cys_in_core % 2 != 0, -2),
        # Leader region contains Trp -1
        ("W" in leader, -1),
        # Leader region contains Lys +1
        ("K" in leader, 1),
        # Leader region has Cys -2
        ("C" in leader, -2),
    ]
    for matched, change in rules:
        if matched:
            score += change
        tabs.append(1 if matched else 0)

    # the remaining two rules only change the score, they add no column

    # Gene cluster does not contain PF13471 -2
    precursor_pfam_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if precursor_pfam_distance == -1 or precursor_pfam_distance > 10000:
        score -= 2

    # Peptide utilizes alternate start codon -1
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1

    return score, tabs
def __init__(self, feature: CDSFeature) -> None:
    """ Initialises the instance from a CDS feature.

        Arguments:
            feature: the CDSFeature providing the translation and the name
    """
    # the parent constructor receives the names of the three attributes set
    # below (presumably the keys it permits - confirm against the parent class)
    super().__init__(['id', 'sequence', 'domains'])
    self.sequence = feature.translation
    self.id = feature.get_name()
    # no domains are known at construction time
    self.domains = []  # type: List[JSONDomain]
def generate_rodeo_svm_csv(record: Record, query: CDSFeature, leader: str, core: str,
                           previously_gathered_tabs: List[Union[float, int]],
                           fimo_motifs: List[int], fimo_scores: Dict[int, float]
                           ) -> List[Union[float, int]]:
    """ Generates all the items for a single precursor peptide candidate.

        Arguments:
            record: the Record containing the candidate, used for the Pfam
                    distance lookups and to extract the candidate's
                    nucleotide sequence
            query: the CDSFeature of the candidate
            leader: the leader sequence of the candidate
            core: the core sequence of the candidate
            previously_gathered_tabs: values inserted unchanged after the
                                      first two columns
            fimo_motifs: the numbers of the motifs that were hit, only
                         motifs 1 to 16 are given columns
            fimo_scores: a dictionary mapping motif number to score, needs an
                         entry for every motif from 1 to 16 in fimo_motifs

        Returns:
            a list of all the column values, in a fixed order

        Raises:
            ZeroDivisionError: if the leader is empty
            KeyError: if a motif from 1 to 16 is in fimo_motifs
                      but has no entry in fimo_scores
    """
    amino_acids = "ARDNCQEGHILKMFPSTWYV"
    # aromatic, negatively charged, positively charged, charged, aliphatic, hydroxyl
    residue_classes = ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST")
    charge_dict = {"E": -1, "D": -1, "K": 1, "H": 1, "R": 1}
    motif_numbers = range(1, 17)

    def composition(sequence: str) -> List[int]:
        """ Counts of each amino acid, followed by the total of each residue class """
        counts = [sequence.count(aa) for aa in amino_acids]
        counts.extend(sum(sequence.count(aa) for aa in group) for group in residue_classes)
        return counts

    def net_charge(sequence: str) -> int:
        """ Sum of the charges of all residues listed in charge_dict """
        return sum(charge_dict.get(aa, 0) for aa in sequence)

    precursor = leader + core

    # Precursor Index, then classification
    columns = [1, 0]  # type: List[Union[float, int]]
    columns += previously_gathered_tabs

    # Cluster has PF00733? PF05402? PF13471?
    for pfam in ('PF00733', 'PF05402', 'PF13471'):
        distance = utils.distance_to_pfam(record, query, [pfam])
        columns.append(0 if distance == -1 or distance > 10000 else 1)

    # Leader has LxxxxxT motif?
    columns.append(1 if re.search('(L[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader) else 0)
    # Core has adjacent identical aas (doubles)?
    columns.append(1 if any(first == second for first, second in zip(core, core[1:])) else 0)
    # Core length (aa)
    columns.append(len(core))
    # Leader length (aa)
    columns.append(len(leader))
    # Precursor length (aa)
    columns.append(len(leader) + len(core))
    # Leader/core ratio
    # NOTE(review): the value is core length / leader length despite the label;
    # left unchanged as it presumably has to match the classifier's training
    # data - confirm before swapping
    columns.append(len(core) / len(leader))
    # Number of Pro in first 9 aa of core?
    columns.append(core[:9].count("P"))

    # Estimated core, leader and precursor charges, then their absolute values
    estimated_charges = [net_charge(core), net_charge(leader), net_charge(precursor)]
    columns += estimated_charges
    columns += [abs(charge) for charge in estimated_charges]

    # Counts of AAs in leader, then aromatic, neg charged, pos charged,
    # charged, aliphatic and hydroxyl totals in leader
    columns += composition(leader)
    # The same for the core
    columns += composition(core)
    # Counts (0 or 1) of amino acids within first AA position of core sequence,
    # sliced so that an empty core gives all zeros instead of an IndexError
    first_residue = core[:1]
    columns += [first_residue.count(aa) for aa in amino_acids]
    # Counts of AAs and residue class totals in leader+core
    columns += composition(precursor)  # Temp to work with current training CSV

    # Motifs
    columns += [1 if motif in fimo_motifs else 0 for motif in motif_numbers]
    # Total motifs hit
    columns.append(len(fimo_motifs))
    # Motif scores
    motif_scores = [fimo_scores[motif] if motif in fimo_motifs else 0 for motif in motif_numbers]
    columns += motif_scores
    # Sum of MEME scores
    columns.append(sum(motif_scores))
    # No Motifs?
    columns.append(0 if fimo_motifs else 1)
    # Alternate Start Codon?
    columns.append(0 if str(query.extract(record.seq)).startswith("ATG") else 1)

    return columns