Example #1
0
def _parse_domain(record: Record, domain: NRPSPKSQualifier.Domain,
                  feature: CDSFeature) -> JSONDomain:
    """ Build the JSON-ready representation of a single NRPS/PKS domain

        Arguments:
            record: the Record whose sequence the feature is extracted from
            domain: the NRPSPKSQualifier.Domain being converted
            feature: the CDSFeature whose translation the domain is cut from

        Returns:
            a JSONDomain built from the domain, its predictions, the NaPDoS
            and BLAST links, the domain's amino acid sequence, the nucleotide
            sequence extracted for the feature, and the domain's abbreviation
            and class
    """
    prediction_pairs = list(domain.predictions.items())

    # amino acids of this domain only, sliced out of the CDS translation
    domain_aa = str(feature.translation)[domain.start:domain.end]

    # NaPDoS is only linked for KS and C domains, all others get no link
    if domain.name == "PKS_KS":
        napdos_kind = "KS"
    elif "Condensation" in domain.name:
        napdos_kind = "C"
    else:
        napdos_kind = ""

    napdos_url = ""
    if napdos_kind:
        napdos_template = (
            "http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
            "query_type=aa&ref_seq_file=all_{0}_public_12062011.faa"
            "&Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
        napdos_url = napdos_template.format(napdos_kind, domain_aa)

    blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                      "&PROGRAM=blastp&BLAST_PROGRAMS=blastp"
                      "&QUERY={}"
                      "&LINK_LOC=protein&PAGE_TYPE=BlastSearch")
    blast_url = blast_template.format(domain_aa)

    # note: this is extracted with the feature, not sliced to the domain
    feature_dna = feature.extract(record.seq)
    short_name = _get_domain_abbreviation(domain.name)
    domain_class = _get_domain_class(short_name, domain.name)
    return JSONDomain(domain, prediction_pairs, napdos_url, blast_url,
                      domain_aa, feature_dna, short_name, domain_class)
Example #2
0
    def parse_domain(self, domain: NRPSPKSQualifier.Domain, feature: CDSFeature
                     ) -> JSONDomain:
        """ Build the JSON-ready representation of a single NRPS/PKS domain

            Arguments:
                domain: the NRPSPKSQualifier.Domain being converted
                feature: the CDSFeature whose translation the domain is cut from

            Returns:
                a JSONDomain built from the domain, its parsed substrate
                predictions, the NaPDoS and BLAST links, the domain's amino
                acid sequence and the nucleotide sequence extracted for the
                feature
        """
        substrate_predictions = parse_substrate_predictions(domain.predictions)

        # amino acids of this domain only, sliced out of the CDS translation
        full_translation = str(feature.translation)
        domain_aa = full_translation[domain.start:domain.end]

        # NaPDoS is only linked for KS and C domains, all others get no link
        napdos_template = (
            "http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
            "query_type=aa&ref_seq_file=all_{0}_public_12062011.faa"
            "&Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
        if domain.name == "PKS_KS":
            napdos_url = napdos_template.format("KS", domain_aa)
        elif "Condensation" in domain.name:
            napdos_url = napdos_template.format("C", domain_aa)
        else:
            napdos_url = ""

        blast_template = (
            "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
            "&PROGRAM=blastp&BLAST_PROGRAMS=blastp"
            "&QUERY={}"
            "&LINK_LOC=protein&PAGE_TYPE=BlastSearch")
        blast_url = blast_template.format(domain_aa)

        # note: this is extracted with the feature, not sliced to the domain
        feature_dna = feature.extract(self.record.seq_record.seq)
        return JSONDomain(domain, substrate_predictions, napdos_url, blast_url,
                          domain_aa, feature_dna)
Example #3
0
def _svm_pfam_flag(record: Record, query: CDSFeature, pfam_id: str) -> int:
    """Return 0 if utils.distance_to_pfam gives -1 or more than 10000 for the
       given profile, otherwise 1"""
    distance = utils.distance_to_pfam(record, query, [pfam_id])
    if distance == -1 or distance > 10000:
        return 0
    return 1


def _svm_net_charge(sequence: str) -> int:
    """Estimate the charge of a sequence: -1 for each E/D, +1 for each K/H/R"""
    charges = {"E": -1, "D": -1, "K": 1, "H": 1, "R": 1}
    return sum(charges.get(aa, 0) for aa in sequence)


def _svm_composition(sequence: str) -> List[int]:
    """Return the 20 per-residue counts of a sequence followed by the totals
       for aromatic, negative, positive, charged, aliphatic and hydroxyl
       residues, in that order"""
    counts = [sequence.count(aa) for aa in "ARDNCQEGHILKMFPSTWYV"]
    for residue_class in ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST"):
        counts.append(sum(sequence.count(aa) for aa in residue_class))
    return counts


def generate_rodeo_svm_csv(
        record: Record, query: CDSFeature, leader: str, core: str,
        previously_gathered_tabs: List[Union[float,
                                             int]], fimo_motifs: List[int],
        fimo_scores: Dict[int, float]) -> List[Union[float, int]]:
    """ Generates all the items for a single precursor peptide candidate

        The column order is fixed: precursor index, classification, the
        previously gathered tabs, three PFAM presence flags, leader motif and
        core doubles flags, lengths and length ratio, charges, residue
        compositions of leader, core, first core residue and precursor,
        then the motif columns and the alternate start codon flag.

        Arguments:
            record: the Record containing the query
            query: the CDSFeature of the precursor candidate
            leader: the leader sequence of the candidate
            core: the core sequence of the candidate
            previously_gathered_tabs: values inserted unchanged after the
                                      first two columns
            fimo_motifs: the numbers of the motifs that were hit
            fimo_scores: a score for each motif number in fimo_motifs

        Returns:
            a list of numbers, one per column

        Raises:
            ZeroDivisionError: if leader is empty
            IndexError: if core is empty (and leader is not)
            KeyError: if a motif from 1 to 16 is in fimo_motifs but has no
                      entry in fimo_scores
    """
    # Precursor index, then classification
    columns = [1, 0]  # type: List[Union[float, int]]
    columns += previously_gathered_tabs

    # Cluster has PF00733? PF05402? PF13471? (one lookup per profile)
    for pfam_id in ('PF00733', 'PF05402', 'PF13471'):
        columns.append(_svm_pfam_flag(record, query, pfam_id))

    # Leader has LxxxxxT motif?
    columns.append(1 if re.search('(L[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader) else 0)
    # Core has adjacent identical aas (doubles)?
    columns.append(1 if any(first == second for first, second in zip(core, core[1:])) else 0)

    # Core, leader and precursor lengths (aa)
    columns.append(len(core))
    columns.append(len(leader))
    columns.append(len(leader) + len(core))
    # Labelled "Leader/core ratio" but computed as core length over leader
    # length; kept unchanged so the column values stay as they were
    # NOTE(review): presumably what the training data uses - confirm
    columns.append(len(core) / len(leader))
    # Number of Pro in first 9 aa of core
    columns.append(core[:9].count("P"))

    # Estimated charges, then their absolute values; the precursor is just
    # leader + core, so its charge is the sum of the two parts
    core_charge = _svm_net_charge(core)
    leader_charge = _svm_net_charge(leader)
    precursor_charge = leader_charge + core_charge
    columns += [core_charge, leader_charge, precursor_charge]
    columns += [abs(core_charge), abs(leader_charge), abs(precursor_charge)]

    # Residue counts and residue class totals for leader and core
    columns += _svm_composition(leader)
    columns += _svm_composition(core)
    # Counts (0 or 1) of amino acids within first AA position of core sequence
    first_residue = core[0]
    columns += [first_residue.count(aa) for aa in "ARDNCQEGHILKMFPSTWYV"]
    # Residue counts and residue class totals for leader+core
    columns += _svm_composition(leader + core)

    # Motifs: hit flags, total hits, per-motif scores and the score total
    motif_numbers = range(1, 17)
    columns += [1 if motif in fimo_motifs else 0 for motif in motif_numbers]
    columns.append(len(fimo_motifs))
    motif_scores = [fimo_scores[motif] if motif in fimo_motifs else 0
                    for motif in motif_numbers]
    columns += motif_scores
    columns.append(sum(motif_scores))
    # No Motifs?
    columns.append(0 if fimo_motifs else 1)

    # Alternate Start Codon?
    starts_with_atg = str(query.extract(record.seq)).startswith("ATG")
    columns.append(0 if starts_with_atg else 1)
    return columns
Example #4
0
def acquire_rodeo_heuristics(record: Record, cluster: Cluster,
                             query: CDSFeature, leader: str,
                             core: str) -> Tuple[int, List[Union[float, int]]]:
    """ Calculate heuristic scores for RODEO

        Arguments:
            record: the Record containing the query
            cluster: the Cluster used for the same-strand check
            query: the CDSFeature of the precursor candidate
            leader: the leader sequence of the candidate
            core: the core sequence of the candidate

        Returns:
            a tuple of the total score and a list of values: the calculated
            mass of the core, the distance to the nearest of the three
            biosynthetic profiles (-1 if none was found), then one 0/1 flag
            for each heuristic in the order they are checked. The final two
            checks (missing PF13471 and alternate start codon) only affect
            the score and add no flags.
    """
    tabs = []  # type: List[Union[float, int]]
    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core,
                                                monoisotopic=True,
                                                ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    distance = utils.distance_to_pfam(record, query,
                                      ['PF13471', 'PF00733', 'PF05402'])
    tabs.append(distance)
    # -1 is the "nothing found" value (it is tested for explicitly in the
    # PF13471 check further down), so it must not count as being nearby
    found_neighbour = distance != -1

    cys_in_core = core.count("C")
    aromatics_in_core = sum(core.count(aa) for aa in "FWY")

    # Each entry is (condition met, score change); every entry adds one flag
    # to tabs, in this order
    checks = [
        # Within 500 nucleotides of any biosynthetic protein (E, B, C)
        (found_neighbour and distance < 500, 1),
        # Within 150 nucleotides of any biosynthetic protein (E, B, C)
        (found_neighbour and distance < 150, 1),
        # Greater than 1000 nucleotides from every biosynthetic protein
        (distance > 1000, -2),
        # Core region has 2 or 4 Cys residues
        (cys_in_core in (2, 4), 1),
        # Leader region is longer than core region
        (len(leader) > len(core), 2),
        # Core has 7 (Glu) or 8 (Glu/Asp) or 9 (Asp) membered ring possible
        ('E' in core[6:8] or 'D' in core[7:9], 1),
        # Leader region contains GxxxxxT
        (bool(re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader)), 3),
        # Core starts with G
        (core.startswith("G"), 2),
        # Peptide and lasso cyclase are on same strand
        (bool(is_on_same_strand_as(cluster, query, 'PF00733')), 1),
        # Leader/core length ratio between 0.5 and 2 (inclusive); an empty
        # core has no ratio, so the check is simply not met
        (bool(core) and 0.5 <= len(leader) / len(core) <= 2, 1),
        # Core starts with Cys and has an even number of Cys (flag only)
        (core.startswith("C") and cys_in_core % 2 == 0, 0),
        # Core contains no Gly
        ("G" not in core, -4),
        # Core has at least one aromatic residue
        (aromatics_in_core >= 1, 1),
        # Core has at least 2 aromatic residues
        (aromatics_in_core >= 2, 2),
        # Core has odd number of Cys
        (cys_in_core % 2 != 0, -2),
        # Leader region contains Trp
        ("W" in leader, -1),
        # Leader region contains Lys
        ("K" in leader, 1),
        # Leader region has Cys
        ("C" in leader, -2),
    ]

    score = 0
    for condition_met, score_change in checks:
        if condition_met:
            score += score_change
            tabs.append(1)
        else:
            tabs.append(0)

    # Gene cluster does not contain PF13471 (score only, no flag)
    pf13471_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if pf13471_distance == -1 or pf13471_distance > 10000:
        score -= 2
    # Peptide utilizes alternate start codon (score only, no flag)
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1
    return score, tabs
Example #5
0
def get_description(
        record: Record, feature: CDSFeature, type_: str, options: ConfigType,
        mibig_result: List[clusterblast.results.MibigEntry]) -> str:
    """ Get the description text of a CDS feature, as an HTML fragment

        Arguments:
            record: the Record containing the feature
            feature: the CDSFeature to describe
            type_: the kind of gene; 'transport' adds a TransportDB BLAST link
            options: the config object, read for output_dir and smcogs_trees
            mibig_result: MiBIG entries for the feature; if not empty they are
                          passed to generate_html_table together with a path
                          inside the output directory, and a link to that
                          path is added

        Returns:
            the HTML fragment as a string, containing only characters that
            are in string.printable
    """
    blastp_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&" \
                 "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&" \
                 "LINK_LOC=protein&PAGE_TYPE=BlastSearch" % feature.translation
    genomic_context_url = "http://www.ncbi.nlm.nih.gov/projects/sviewer/?" \
                          "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"\
                          "id=%s&from=%s&to=%s"

    parts = []  # type: List[str]

    # the fallback has to be chosen before formatting: '%' binds tighter
    # than 'or', and a formatted string is never empty
    title = feature.product or feature.get_name()
    parts.append('<span class="svgene-tooltip-bold">%s</span><br>\n' % title)
    parts.append('Locus-tag: %s; Protein-ID: %s<br>\n' % (feature.locus_tag,
                                                          feature.protein_id))

    ec_numbers = feature.get_qualifier('EC_number')
    if ec_numbers:
        parts.append("EC-number(s): %s<br>\n" % ",".join(ec_numbers))

    for gene_function in feature.gene_functions:
        parts.append("%s<br>\n" % str(gene_function))

    parts.append("Location: %d - %d<br><br>\n" % (
        feature.location.start + 1,  # 1-indexed
        feature.location.end))

    if mibig_result:
        cluster_number = feature.cluster.get_cluster_number()
        mibig_homology_file = os.path.join(
            options.output_dir, "knownclusterblast",
            "cluster%d" % cluster_number,
            feature.get_accession() + '_mibig_hits.html')
        generate_html_table(mibig_homology_file, mibig_result)
        # the link is relative to the output directory
        mibig_path = mibig_homology_file[len(options.output_dir) + 1:]
        parts.append('<br><a href="%s" target="_new">MiBIG Hits</a><br>\n' % mibig_path)

    if type_ == 'transport':
        url = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;" \
              "program=blastp;database=pub/transporter.pep;" \
              "sequence=sequence%%0A%s" % feature.translation
        # the anchor must be closed or it swallows the links that follow
        parts.append('<a href="%s" target="_new">TransportDB BLAST on this gene</a><br>\n' % url)

    key = record.id + "_" + feature.get_name()
    if key in searchgtr_links:
        url = searchgtr_links[key]
        # the anchor must be closed or it swallows the links that follow
        parts.append('<a href="%s" target="_new">SEARCHGTr on this gene</a><br>\n' % url)

    parts.append('<a href="%s" target="_new">NCBI BlastP on this gene</a><br>\n' % blastp_url)

    # show the surroundings of the feature, clamped to the record's bounds
    context = genomic_context_url % (
        record.id,
        max(feature.location.start - 9999, 0),
        min(feature.location.end + 10000, len(record)))
    parts.append("""<a href="%s" target="_new">View genomic context</a><br>\n""" % context)

    if options.smcogs_trees:
        for note in feature.notes:  # TODO find a better way to store image urls
            if note.startswith('smCOG tree PNG image:'):
                url = note.split(':')[-1]
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>\n'
                parts.append(entry % url)
                break  # only the first matching note is used

    parts.append(generate_asf_tooltip_section(record, feature))

    go_notes = generate_pfam2go_tooltip(record, feature)
    if go_notes:
        parts.append('<br><span class="bold">Gene Ontology terms for PFAM domains:</span><br>\n'
                     '%s<br><br>\n' % "<br>".join(go_notes))

    clipboard_fragment = """<a href="javascript:copyToClipboard('%s')">Copy to clipboard</a>"""
    parts.append("AA sequence: %s<br>\n" % (clipboard_fragment %
                                            feature.translation))
    parts.append("Nucleotide sequence: %s<br>\n" % (clipboard_fragment %
                                                    feature.extract(record.seq)))

    # drop anything that is not a printable ASCII character
    template = "".join(parts)
    return "".join(char for char in template if char in string.printable)