Example #1
0
def create_feature_from_location(record: Record,
                                 location: FeatureLocation,
                                 counter: int = 1,
                                 label: Optional[str] = None) -> CDSFeature:
    """ Builds a new CDSFeature spanning the given location.

        Arguments:
            record: the Record supplying the amino acid translation of the location
            location: the FeatureLocation the new CDSFeature will cover
            counter: the number used to build the fallback label, 'allorf'
                     followed by the counter zero-padded to three digits,
                     ignored when a label is given
            label: the value to use as locus tag, protein id and gene name
                   of the new CDSFeature

        Returns:
            the new CDSFeature, flagged as created by antiSMASH
    """
    name = 'allorf%03d' % counter if label is None else label
    translation = str(record.get_aa_translation_from_location(location))
    cds = CDSFeature(location, translation,
                     locus_tag=name, protein_id=name, gene=name)
    cds.created_by_antismash = True
    return cds
Example #2
0
def run_lassopred(record: Record, cluster: Cluster,
                  query: CDSFeature) -> Optional[LassopeptideMotif]:
    """ General function to predict and analyse lasso peptides

        Arguments:
            record: the Record containing the query
            cluster: the Cluster containing the query
            query: the CDSFeature to check for a lasso peptide precursor

        Returns:
            the motif built from the prediction, or None if the query is
            not judged to be a precursor peptide candidate
    """
    # Run checks to determine whether an ORF encodes a precursor peptide
    result = determine_precursor_peptide_candidate(record, cluster, query,
                                                   query.translation)
    if result is None:
        return None

    # prediction of cleavage in C-terminal based on lasso's core sequence
    c_term_hmmer_profile = 'tail_cut.hmm'
    thresh_c_hit = -7.5

    # only the second half of the core is searched for a tail
    aux = result.core[(len(result.core) // 2):]
    core_a_fasta = ">%s\n%s" % (query.get_name(), aux)

    profile = path.get_full_path(__file__, 'data', c_term_hmmer_profile)
    hmmer_res = subprocessing.run_hmmpfam2(profile, core_a_fasta)

    # every hit above the threshold overwrites the tail, so the last one wins
    for res in hmmer_res:
        for hits in res:
            for seq in hits:
                if seq.bitscore > thresh_c_hit:
                    result.c_cut = aux[seq.query_start + 1:]

    # NOTE: a second "result is None" check used to follow here, it could
    # never trigger since result is not rebound after the check above.
    # A missing C-terminal cut does not discard the prediction.

    query.gene_functions.add(GeneFunction.ADDITIONAL, "lassopeptides",
                             "predicted lassopeptide")

    return result_vec_to_motif(query, result)
Example #3
0
def get_description(record: Record, feature: CDSFeature, type_: str,
                    options: ConfigType, mibig_result: List[clusterblast.results.MibigEntry]) -> str:
    """ Builds the description text of a CDS feature by rendering the
        'cds_detail.html' template.

        Arguments:
            record: the Record containing the feature
            feature: the CDSFeature to describe
            type_: the type of the feature, a transporter BLAST link is only
                   generated for 'transport'
            options: the antiSMASH config, used for the output directory and
                     the smCOG tree setting
            mibig_result: the MiBIG entries for the feature, if any are given
                          a HTML table is generated for them

        Returns:
            the rendered template as a string
    """
    translation = feature.translation
    blastp_url = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&"
                  "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&"
                  "LINK_LOC=protein&PAGE_TYPE=BlastSearch") % translation
    urls = {
        "blastp": blastp_url,
        "mibig": "",
        "transport": "",
        "smcog_tree": ""
    }

    if mibig_result:
        assert feature.region
        region_dir = "region%d" % feature.region.get_region_number()
        table_name = feature.get_accession() + '_mibig_hits.html'
        mibig_homology_file = os.path.join(options.output_dir, "knownclusterblast",
                                           region_dir, table_name)
        generate_html_table(mibig_homology_file, mibig_result)
        # the link drops the output directory prefix and the separator after it
        urls["mibig"] = mibig_homology_file[len(options.output_dir) + 1:]

    if type_ == 'transport':
        urls["transport"] = ("http://blast.jcvi.org/er-blast/index.cgi?project=transporter;"
                             "program=blastp;database=pub/transporter.pep;"
                             "sequence=sequence%%0A%s") % feature.translation

    # the genomic context window is clamped to the bounds of the record
    context_start = max(feature.location.start - 9999, 0)
    context_end = min(feature.location.end + 10000, len(record))
    urls["context"] = ("http://www.ncbi.nlm.nih.gov/projects/sviewer/?"
                       "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"
                       "id=%s&from=%s&to=%s") % (record.id, context_start, context_end)

    if options.smcog_trees:
        # TODO find a better way to store image urls
        tree_note = next((note for note in feature.notes
                          if note.startswith('smCOG tree PNG image:')), None)
        if tree_note is not None:
            urls["smcog_tree"] = tree_note.split(':')[-1]

    asf_notes = generate_asf_tooltip_section(record, feature)
    go_notes = generate_pfam2go_tooltip(record, feature)
    pfam_notes = generate_pfam_tooltip(record, feature)
    tigr_notes = generate_tigr_tooltip(record, feature)

    urls["searchgtr"] = searchgtr_links.get("{}_{}".format(record.id, feature.get_name()), "")
    template = html_renderer.FileTemplate(path.get_full_path(__file__, "templates", "cds_detail.html"))

    # only a list-valued qualifier contributes EC numbers
    ec_number_qual = feature.get_qualifier("EC_number")
    ec_numbers = ",".join(ec_number_qual) if isinstance(ec_number_qual, list) else ""

    return template.render(feature=feature, ec_numbers=ec_numbers, go_notes=go_notes,
                           asf_notes=asf_notes, pfam_notes=pfam_notes, tigr_notes=tigr_notes,
                           record=record, urls=urls)
Example #4
0
def run_lanthi_on_genes(record: Record, focus: CDSFeature,
                        cluster: Protocluster, genes: List[CDSFeature],
                        results: LanthiResults) -> None:
    """ Checks each candidate gene for a lanthipeptide precursor, in the
        context of a single focus gene, and stores any motifs found in the
        results object.

        Arguments:
            record: the Record instance containing the genes
            focus: a core lanthipeptide gene
            cluster: the Protocluster being analysed
            genes: a list of candidate precursor genes
            results: a LanthiResults object to update

        Returns:
            None
    """
    if not genes:
        return

    cluster_domains = get_detected_domains(cluster.cds_children)
    neighbours = find_neighbours_in_range(focus, cluster.cds_children)

    # single-domain neighbours of the focus gene decide which modification
    # flags are set on every prediction
    has_flavoprotein = contains_feature_with_single_domain(
        neighbours, {"Flavoprotein"})
    has_halogenase = contains_feature_with_single_domain(
        neighbours, {"Trp_halogenase"})
    has_oxygenase = contains_feature_with_single_domain(
        neighbours, {"p450"})
    has_dehydrogenase = contains_feature_with_single_domain(
        neighbours, {"adh_short", "adh_short_C2"})

    lant_class = predict_class_from_genes(focus, cluster.cds_children)
    if not lant_class:
        return

    for gene in genes:
        prediction = run_lanthipred(record, gene, lant_class, cluster_domains)
        if prediction is None:
            continue

        prediction.aminovinyl_group = has_flavoprotein
        prediction.chlorinated = has_halogenase
        prediction.oxygenated = has_oxygenase
        prediction.lactonated = has_dehydrogenase and prediction.core.startswith('S')

        new_motif = result_vec_to_feature(gene, prediction)
        results.motifs_by_locus[focus.get_name()].append(new_motif)
        results.clusters[cluster.get_protocluster_number()].add(focus.get_name())

        # a candidate without a region is tracked as a new CDS feature
        # (found with all_orfs)
        if gene.region is None:
            results.new_cds_features.add(gene)
Example #5
0
 def from_feature(feature: secmet.CDSFeature) -> 'Gene':  # string because forward reference
     """ Builds a Gene from a CDS feature

         Arguments:
             feature: the CDS feature to convert

         Returns:
             a Gene using the start and end of the feature's location as
             integers, the strand of the location, the feature's accession
             as the name and the feature's product
     """
     location = feature.location
     return Gene(int(location.start), int(location.end), location.strand,
                 feature.get_accession(), product=feature.product)
Example #6
0
def find_tail(query: secmet.CDSFeature, core: str) -> str:
    """ Finds the tail of a prepeptide, if it exists

        Arguments:
            query: the CDS feature being checked, only used to name the
                   sequence passed to the HMMer search
            core: the core of the prepeptide as a string, may be empty

        Returns:
            the translation of the tail, or an empty string if it wasn't found
    """
    # prediction of cleavage in C-terminal based on thiopeptide's core sequence
    # if last core residue != S or T or C > great chance of a tail cut
    # An empty core has no last residue to check and cannot contain a tail,
    # so it is handled here instead of raising an IndexError.
    if not core or core[-1] in "SCT":
        return ''
    thresh_c_hit = -9

    # only the last ten residues of the core are searched
    temp = core[-10:]
    core_a_fasta = ">%s\n%s" % (query.get_name(), temp)

    c_term_profile = path.get_full_path(__file__, "data", 'thio_tail.hmm')
    c_hmmer_res = subprocessing.run_hmmpfam2(c_term_profile, core_a_fasta)

    # every hit above the threshold overwrites the tail, so the last one wins
    tail = ''
    for res in c_hmmer_res:
        for hits in res:
            for seq in hits:
                if seq.bitscore > thresh_c_hit:
                    tail = temp[seq.query_end-1:]
    return tail
Example #7
0
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        The record is written to a FASTA file inside a temporary directory,
        prodigal is run on that file and every gene line of its output is
        added to the record as a new CDSFeature.

        Arguments:
            record: the Record to find genes in, new CDS features are added to it
            options: the antiSMASH config, used for the optional prodigal base
                     directory and the name of the genefinding tool

        Returns:
            None

        Raises:
            RuntimeError: if the stderr output of prodigal contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        # leading dashes are stripped from the record id before it is used
        # as a file name
        name = record.id.lstrip('-') or "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # the meta procedure is used on request and for records under 20000 bases
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        found = 0
        # the context manager ensures the result file is closed again, even
        # if adding a feature raises
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe genes
                if not line.startswith('>'):
                    continue

                # both a wrong number of '_' separated chunks and a
                # non-integer coordinate raise ValueError, so either kind of
                # malformatted line is logged and skipped
                try:
                    gene_name, start_chunk, end_chunk, prodigal_strand = \
                        line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r',
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1

                # reversed coordinates always mean the reverse strand
                if start > end:
                    strand = -1
                    start, end = end, start

                # NOTE(review): start - 1 presumably converts a 1-based
                # prodigal start into a 0-based FeatureLocation start
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc,
                                     locus_tag='ctg%s_%s' %
                                     (record.record_index, gene_name),
                                     translation=translation,
                                     translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
Example #8
0
def determine_precursor_peptide_candidate(
        cluster: secmet.Protocluster, query: secmet.CDSFeature,
        query_sequence: str,
        domains: Dict[str, int]) -> Optional[secmet.Prepeptide]:
    """ Splits a candidate sequence into leader and core and builds a
        sactipeptide Prepeptide from it if RODEO judges it valid.

        Arguments:
            cluster: the Protocluster passed on to RODEO
            query: the CDS feature the sequence belongs to
            query_sequence: the amino acid sequence of the candidate
            domains: the domains passed on to RODEO

        Returns:
            a Prepeptide, or None if the sequence length is outside 20 to 100
            (inclusive) or RODEO does not judge the candidate valid
    """
    length = len(query_sequence)
    # only sequences of 20 to 100 AA are considered
    if length < 20 or length > 100:
        return None

    # the first quarter is treated as the leader, the rest as the core
    split = length // 4  # TODO: this seems very arbitrary
    leader, core = query_sequence[:split], query_sequence[split:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    valid, score = run_rodeo(cluster, query, leader, core, domains)
    if not valid:
        return None

    return secmet.Prepeptide(query.location, "sactipeptide", core,
                             query.get_name(), tool="sactipeptides",
                             leader=leader, score=score)
Example #9
0
def result_vec_to_motif(query: CDSFeature, result: Lassopeptide) -> Prepeptide:
    """ Builds a Prepeptide from a Lassopeptide

        Arguments:
            query: the CDSFeature providing the location and name
            result: the Lassopeptide holding the prediction details

        Returns:
            a Prepeptide with a LassoQualifier as detailed information
    """
    tail = result.c_cut
    # when a tail was cut off, it is removed from the end of the core
    core = result.core[:-len(tail)] if tail else result.core

    motif = Prepeptide(query.location, "lassopeptide", core,
                       query.get_name(), "lassopeptides",
                       peptide_subclass=result.lasso_class,
                       score=result.score,
                       monoisotopic_mass=result.monoisotopic_mass,
                       molecular_weight=result.molecular_weight,
                       leader=result.leader,
                       tail=tail)
    motif.detailed_information = LassoQualifier(result.rodeo_score,
                                                result.number_bridges,
                                                result.macrolactam,
                                                result.cut_mass,
                                                result.cut_weight)
    return motif
Example #10
0
def _parse_domain(record: Record, domain: NRPSPKSQualifier.Domain,
                  feature: CDSFeature) -> JSONDomain:
    """ Builds a JSONDomain for a NRPS/PKS domain, including links to
        external searches with the domain's sequence

        Arguments:
            record: the Record containing the domain
            domain: the NRPSPKSQualifier.Domain in question
            feature: the CDSFeature that the domain belongs to

        Returns:
            a populated JSONDomain instance
    """
    predictions = list(domain.predictions.items())
    domainseq = str(feature.translation)[domain.start:domain.end]

    # NaPDoS links are only created for KS and C domains
    if domain.name == "PKS_KS":
        napdos_type = "KS"
    elif "Condensation" in domain.name:
        napdos_type = "C"
    else:
        napdos_type = ""

    napdoslink = ""
    if napdos_type:
        napdoslink = ("http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
                      "query_type=aa&amp;ref_seq_file=all_{0}_public_12062011.faa"
                      "&amp;Sequence=%3E{0}_domain_from_antiSMASH%0D{1}"
                      ).format(napdos_type, domainseq)

    blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                      "&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp"
                      "&amp;QUERY={}"
                      "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch")
    blastlink = blast_template.format(domainseq)

    dna_sequence = feature.extract(record.seq)
    abbreviation = _get_domain_abbreviation(domain.name)
    domain_class = _get_domain_class(abbreviation, domain.name)
    return JSONDomain(domain, predictions, napdoslink, blastlink, domainseq,
                      dna_sequence, abbreviation, domain_class)
Example #11
0
def determine_precursor_peptide_candidate(
        record: Record, cluster: Cluster, query: CDSFeature,
        query_sequence: str) -> Optional[Lassopeptide]:
    """ Locates a cleavage site in a candidate sequence and builds a
        Lassopeptide from the resulting leader and core if RODEO judges
        the candidate valid.

        Arguments:
            record: the Record passed on to RODEO
            cluster: the Cluster passed on to RODEO
            query: the CDSFeature the sequence belongs to
            query_sequence: the amino acid sequence of the candidate

        Returns:
            a Lassopeptide, or None if the sequence length is outside 20 to 100
            (inclusive) or RODEO does not judge the candidate valid
    """
    # only sequences of 20 to 100 AA are considered
    if not 20 <= len(query_sequence) <= 100:
        return None

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), query_sequence)

    # first choice for the cleavage site is the pHMM
    start, end, score = run_cleavage_site_phmm(fasta, 'precursor_2637.hmm', -20.00)

    if score is None:
        # no pHMM hit, so the regular expression is tried next
        start, end, score = run_cleavage_site_regex(fasta)
        if score is None or end > len(query_sequence) - 3:
            # last resort is a fixed position relative to the middle
            start, end, score = 0, len(query_sequence) // 2 - 5, 0.

    leader = query_sequence[:end]
    core = query_sequence[end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    valid, rodeo_score = run_rodeo(record, cluster, query, leader, core)
    if not valid:
        return None

    return Lassopeptide(start, end + 1, score, rodeo_score, leader, core)
Example #12
0
def result_vec_to_feature(orig_feature: secmet.CDSFeature,
                          res_vec: Thiopeptide) -> ThiopeptideMotif:
    """ Builds a ThiopeptideMotif from a Thiopeptide and the CDSFeature it
        was found in.

        If the Thiopeptide has a C-terminal cut, its core is shortened by
        the length of that cut, this changes the given Thiopeptide.

        Arguments:
            orig_feature: the original CDS feature that the Motif will attach to
            res_vec: a Thiopeptide object containing results

        Returns:
            a ThiopeptideMotif
    """
    # the core is updated before any other attribute is read
    if res_vec.c_cut:
        res_vec.core = res_vec.core[:-len(res_vec.c_cut)]

    # no mature weights are passed on for Type III
    if res_vec.thio_type == "Type III":
        mature_weights = []  # type: List[float]
    else:
        mature_weights = res_vec.mature_alt_weights

    return ThiopeptideMotif(
        orig_feature.location,
        res_vec.core,
        res_vec.leader,
        orig_feature.get_name(),
        res_vec.monoisotopic_mass,
        res_vec.molecular_weight,
        res_vec.alternative_weights,
        res_vec.thio_type,
        res_vec.score,
        res_vec.rodeo_score,
        res_vec.macrocycle,
        res_vec.c_cut,
        res_vec.mature_features,
        mature_weights,
        res_vec.amidation)
Example #13
0
def smcog_tree_analysis(cds: CDSFeature, input_number: int, smcog: str, output_dir: str) -> None:
    """ Runs the smCOG tree steps for a single CDS feature: writes the query
        FASTA file, aligns it, trims the alignment and draws the tree.

        Arguments:
            cds: the CDSFeature providing the name and translation of the query
            input_number: the number used in the name of the FASTA file and
                          passed on to the other steps
            smcog: the smCOG passed on to the alignment step
            output_dir: the directory passed on to the tree drawing step

        Returns:
            None
    """
    name = cds.get_name()
    translation = cds.translation

    # a FASTA file with the single query sequence is the input for the MSA
    query_file = "input{}.fasta".format(input_number)
    fasta.write_fasta([name], [translation], query_file)

    alignment = alignsmcogs(smcog, input_number)
    # the tree is drawn after the alignment has been trimmed
    trim_alignment(input_number, alignment)
    draw_tree(input_number, output_dir, name)
Example #14
0
def run_thiopred(query: secmet.CDSFeature, thio_type: str,
                 domains: Set[str]) -> Optional[Thiopeptide]:
    """ Checks whether a CDS feature contains a thiopeptide precursor

        Arguments:
            query: the CDS feature to analyse
            thio_type: the suspected type of the thiopeptide
            domains: the set of domains found within the cluster containing the query

        Returns:
            A Thiopeptide instance if a precursor is found, otherwise None
    """
    candidate = determine_precursor_peptide_candidate(query, domains)
    if candidate is None:
        return None

    candidate.thio_type = thio_type

    # leader cleavage "validation": at least one hit of the core against the
    # profile has to score above the threshold
    profile_pep = path.get_full_path(__file__, "data", 'thiopep2.hmm')
    core_fasta = ">%s\n%s" % (query.get_name(), candidate.core)
    hmmer_results = subprocessing.run_hmmpfam2(profile_pep, core_fasta)

    thresh_pep_hit = -2
    has_good_hit = any(seq.bitscore > thresh_pep_hit
                       for res in hmmer_results
                       for hits in res
                       for seq in hits)
    if not has_good_hit:
        return None

    # additional filter(s) for peptide prediction
    match = re.search(
        "[ISTV][SACNTW][STNCVG][ATCSGM][SVTFC][CGSTEAV][TCGVY].*", candidate.core)
    if not match:
        return None
    matched = match.group()

    # a match of 11 to 19 residues becomes the new core, whatever preceded
    # it in the old core is moved to the leader
    if 10 < len(matched) < 20:
        shift = len(candidate.core) - len(matched)
        candidate.leader = candidate.leader + candidate.core[:shift]
        candidate.core = matched

    candidate.c_cut = find_tail(query, candidate.core)

    query.gene_functions.add(secmet.GeneFunction.ADDITIONAL, "thiopeptides",
                             "predicted thiopeptide")
    return candidate
Example #15
0
def run_lanthipred(record: Record, query: CDSFeature, lant_class: str,
                   domains: List[str]) -> Optional[Lanthipeptide]:
    """ Checks whether a CDS is a predicted lanthipeptide, based on the class
        and any contained domains.

        Arguments:
            record: the parent Record of the feature
            query: the CDSFeature to analyse
            lant_class: a string representing the class
            domains: a list of domain names in the current cluster

        Returns:
            a Lanthipeptide if the CDS is predicted to be one and has at
            least one lan bridge, otherwise None
    """
    hmmer_profiles = {
        'Class-I': 'data/class1.hmm',
        'Class-II': 'data/class2.hmm',
        'Class-III': 'data/class3.hmm',
    }
    sequence = query.translation

    if lant_class not in ("Class-II", "Class-III"):
        # every other class goes through the full precursor candidate check
        result = determine_precursor_peptide_candidate(
            record, query, domains, hmmer_profiles[lant_class], lant_class)
        if result is None:
            return None
    else:
        profile = path.get_full_path(__file__, hmmer_profiles[lant_class])
        fasta = ">%s\n%s" % (query.get_name(), sequence)
        cleavage = predict_cleavage_site(profile, fasta)

        # no cleavage site, or one scoring under the class threshold
        if cleavage is None or THRESH_DICT[lant_class] > cleavage.score:
            return None

        # if the cleavage results in no core, that's not valid
        if cleavage.end == len(sequence):
            return None

        cleavage.lantype = lant_class
        result = Lanthipeptide(cleavage, 0,
                               sequence[:cleavage.end], sequence[cleavage.end:])

    # a prediction without any lan bridges is discarded
    if result.number_of_lan_bridges == 0:
        return None

    query.gene_functions.add(GeneFunction.ADDITIONAL, "lanthipeptides",
                             "predicted lanthipeptide")
    return result
Example #16
0
def create_feature_from_location(record: Record, location: FeatureLocation,
                                 label: Optional[str] = None) -> CDSFeature:
    """ Builds a new CDSFeature spanning the given location.

        Arguments:
            record: the Record supplying the amino acid translation of the
                    location, its length also sets the zero-padding width of
                    the fallback label
            location: the FeatureLocation the new CDSFeature will cover
            label: the value to use as locus tag, protein id and gene name
                   of the new CDSFeature, if not given a label is built from
                   the location's start plus one and the location's end

        Returns:
            the new CDSFeature, flagged as created by antiSMASH
    """
    if label is None:
        # coordinates are padded to the number of digits in the record length
        width = len(str(len(record)))
        label = 'allorf_{start:0{digits}}_{end:0{digits}}'.format(
            digits=width, start=(location.start + 1), end=location.end
        )
    translation = str(record.get_aa_translation_from_location(location))
    cds = CDSFeature(location, translation,
                     locus_tag=label, protein_id=label, gene=label)
    cds.created_by_antismash = True
    return cds
Example #17
0
def determine_precursor_peptide_candidate(
        record: secmet.Record, query: secmet.CDSFeature, query_sequence: str,
        domains: List[str], hmmer_profile: str) -> Optional[Lanthipeptide]:
    """ Locates a cleavage site in a candidate sequence and builds a
        Lanthipeptide with the resulting leader and core,
        only valid for Class-I lanthipeptides

        Arguments:
            record: the Record passed on to RODEO
            query: the CDS feature the sequence belongs to
            query_sequence: the amino acid sequence of the candidate
            domains: the domains passed on to RODEO
            hmmer_profile: the profile used for the pHMM cleavage site search

        Returns:
            a Lanthipeptide, or None if the sequence length is outside 20 to 200
            (inclusive), no usable cleavage site is found or the RODEO
            result is below 14
    """
    # only sequences of 20 to 200 AA are considered
    if not 20 <= len(query_sequence) <= 200:
        return None

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), query_sequence)

    # a cleavage site has to leave at least 8 residues after it
    latest_end = len(query_sequence) - 8

    cleavage_result = run_cleavage_site_phmm(fasta, hmmer_profile,
                                             THRESH_DICT["Class-I"])

    if cleavage_result is None or cleavage_result.end > latest_end:
        # no usable pHMM hit, try regular expression
        start, end, score = run_cleavage_site_regex(fasta)
        if score is None or end > latest_end:
            # abort, since RODEO will predict duplicates based only on cluster
            # attributes
            return None
        lanthi_type = "lanthipeptide"
    else:
        start = cleavage_result.start
        end = cleavage_result.end
        score = cleavage_result.score
        lanthi_type = cleavage_result.lantype

    # if the cleavage results in no core, that's not valid
    if end == len(query_sequence):
        return None

    leader = query_sequence[:end]
    core = query_sequence[end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    rodeo_result = run_rodeo(record, query, leader, core, domains)
    if rodeo_result < 14:
        return None

    lanthipeptide = Lanthipeptide(start, end, score, rodeo_result, lanthi_type)
    lanthipeptide.leader = leader
    lanthipeptide.core = core
    return lanthipeptide
Example #18
0
def determine_precursor_peptide_candidate(
        query: secmet.CDSFeature, domains: Set[str]) -> Optional[Thiopeptide]:
    """ Locates a cleavage site in the translation of a CDS feature and
        builds a Thiopeptide from it

        Arguments:
            query: the CDS feature to check for motifs
            domains: the set of domain ids found in the cluster

        Returns:
            None if the translation is not between 41 and 199 AA long
            (inclusive), otherwise a Thiopeptide. When RODEO does not judge
            the candidate valid, the Thiopeptide has a RODEO score of 0 and
            no leader or core is assigned to it.
    """
    sequence = query.translation
    # Skip sequences not in the size range desired
    if not 40 < len(sequence) < 200:
        return None

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), sequence)

    # first choice for the cleavage site is the pHMM
    end, score = run_cleavage_site_phmm(fasta, 'thio_cleave.hmm', -3.00)

    if end is None:
        # no pHMM hit, so the regular expression is tried with a score of zero
        score = 0.
        end = run_cleavage_site_regex(sequence)
        if end is None or end > len(sequence) - 5:
            # last resort is a position relative to the sequence length
            end = int(len(sequence) * 0.60) - 14

    # ensure there's a valid value for end before trying to use it
    assert isinstance(end, int) and end > 0

    leader = sequence[:end]
    core = sequence[end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    rodeo_result = run_rodeo(leader, core, domains)
    if not rodeo_result[0]:
        return Thiopeptide(end + 1, score, 0)

    thiopeptide = Thiopeptide(end + 1, score, rodeo_result[1])
    thiopeptide.leader = leader
    thiopeptide.core = core
    return thiopeptide
Example #19
0
def determine_precursor_peptide_candidate(
        record: Record, query: CDSFeature, domains: List[str],
        hmmer_profile: str, lant_class: str) -> Optional[Lanthipeptide]:
    """ Locates a cleavage site in the translation of a CDS feature and
        builds a Lanthipeptide from the resulting leader and core,
        only valid for Class-I lanthipeptides

        Arguments:
            record: the Record passed on to RODEO
            query: the CDSFeature to check
            domains: the domains passed on to RODEO
            hmmer_profile: the profile used for the pHMM cleavage site search
            lant_class: the class used to select the score threshold and
                        passed on to the regular expression search

        Returns:
            a Lanthipeptide, or None if the translation length is outside
            20 to 200 (inclusive), no usable cleavage site is found or the
            RODEO result is below 14
    """
    sequence = query.translation
    # only sequences of 20 to 200 AA are considered
    if not 20 <= len(sequence) <= 200:
        return None

    # Create FASTA sequence for feature under study
    fasta = ">%s\n%s" % (query.get_name(), sequence)

    # a cleavage site has to leave at least 8 residues after it
    latest_end = len(sequence) - 8

    cleavage = run_cleavage_site_phmm(fasta, hmmer_profile,
                                      THRESH_DICT[lant_class])
    if cleavage is None or cleavage.end > latest_end:
        # no usable pHMM hit, try regular expression
        cleavage = run_cleavage_site_regex(fasta, lant_class)
    if cleavage is None or cleavage.end > latest_end:
        # still no good, so abort, since RODEO will predict duplicates based
        # only on cluster attributes
        return None

    # if the cleavage results in no core, that's not valid
    if cleavage.end == len(sequence):
        return None

    leader = sequence[:cleavage.end]
    core = sequence[cleavage.end:]

    # Run RODEO to assess whether candidate precursor peptide is judged real
    rodeo_result = run_rodeo(record, query, leader, core, domains)
    if rodeo_result < 14:
        return None

    return Lanthipeptide(cleavage, rodeo_result, leader, core)
Example #20
0
def generate_motif_features(record: Record, feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Builds a CDSMotif feature for each HMMResult found in a CDS feature

        Arguments:
            record: the Record providing the sequence for the motif translations
            feature: the CDSFeature the motifs were found in
            motifs: the HMMResults to convert

        Returns:
            a list of CDSMotif features, in the same order as the given motifs
    """
    # the locus tag is preferred over the name, if the feature has one
    fallback_name = feature.get_name()
    locus_tag = feature.locus_tag or fallback_name
    # translation table 1 is used if the feature has none
    transl_table = feature.transl_table or 1

    motif_features = []
    # numbering is user facing, so 1-indexed
    for number, motif in enumerate(motifs, start=1):
        # motif positions are in amino acids, locations in nucleotides
        first_offset = 3 * motif.query_start
        last_offset = 3 * motif.query_end
        if feature.location.strand == 1:
            start = feature.location.start + first_offset
            end = feature.location.start + last_offset
        else:
            # offsets are counted back from the end of the feature's location
            start = feature.location.end - last_offset
            end = feature.location.end - first_offset

        new_motif = CDSMotif(FeatureLocation(start, end, strand=feature.strand))
        new_motif.label = motif.hit_id
        new_motif.motif = motif.hit_id  # TODO: why both label AND motif?
        new_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, number)
        new_motif.evalue = motif.evalue
        new_motif.score = motif.bitscore
        new_motif.tool = "pksnrpsmotif"
        new_motif.detection = "hmmscan"
        new_motif.database = "abmotifs"
        new_motif.locus_tag = locus_tag

        motif_dna = new_motif.extract(record.seq)
        new_motif.translation = str(motif_dna.translate(table=transl_table))

        # TODO move to CDSMotif
        note = "NRPS/PKS Motif: %s (e-value: %s, bit-score: %s)" % (
            motif.hit_id, motif.evalue, motif.bitscore)
        new_motif.notes.append(note)

        motif_features.append(new_motif)
    return motif_features
Example #21
0
def result_vec_to_feature(orig_feature: CDSFeature,
                          res_vec: Lanthipeptide) -> LanthipeptideMotif:
    """ Builds a LanthipeptideMotif from a Lanthipeptide and the CDSFeature
        it was found in

        Arguments:
            orig_feature: the CDSFeature providing the location and name
            res_vec: the Lanthipeptide holding the prediction details

        Returns:
            a LanthipeptideMotif instance
    """
    return LanthipeptideMotif(
        orig_feature.location,
        res_vec.core,
        res_vec.leader,
        orig_feature.get_name(),
        res_vec.monoisotopic_mass,
        res_vec.molecular_weight,
        res_vec.alternative_weights,
        res_vec.number_of_lan_bridges,
        res_vec.lantype,
        res_vec.score,
        res_vec.rodeo_score,
        res_vec.aminovinyl_group,
        res_vec.chlorinated,
        res_vec.oxygenated,
        res_vec.lactonated)
def result_vec_to_feature(orig_feature: CDSFeature, res_vec: Lanthipeptide) -> Prepeptide:
    """ Builds a Prepeptide from a Lanthipeptide and the CDSFeature it was
        found in

        Arguments:
            orig_feature: the CDSFeature providing the location and name
            res_vec: the Lanthipeptide holding the prediction details,
                     its leader must not be None

        Returns:
            a Prepeptide instance with a LanthiQualifier as detailed information
    """
    assert res_vec.leader is not None
    prepeptide = Prepeptide(
        orig_feature.location,
        "lanthipeptide",
        res_vec.core,
        orig_feature.get_name(),
        "lanthipeptides",
        res_vec.lantype,
        res_vec.score,
        res_vec.monoisotopic_mass,
        res_vec.molecular_weight,
        res_vec.alternative_weights,
        res_vec.leader)
    # the lanthipeptide specific details are kept in a separate qualifier
    prepeptide.detailed_information = LanthiQualifier(
        res_vec.number_of_lan_bridges,
        res_vec.rodeo_score,
        res_vec.aminovinyl_group,
        res_vec.chlorinated,
        res_vec.oxygenated,
        res_vec.lactonated)
    return prepeptide
Example #23
0
def result_vec_to_motif(query: CDSFeature,
                        result: Lassopeptide) -> LassopeptideMotif:
    """ Builds a LassopeptideMotif from a CDSFeature and a Lassopeptide result.

        Arguments:
            query: the CDSFeature supplying the motif's location and name
            result: the Lassopeptide holding the prediction values

        Returns:
            a LassopeptideMotif whose core excludes the C-terminal cut
            (result.c_cut), when there is one
    """
    tail = result.c_cut
    # the tail is reported separately, so strip it from the end of the core
    core = result.core[:-len(tail)] if tail else result.core

    return LassopeptideMotif(
        query.location,
        result.leader,
        core,
        tail,
        query.get_name(),
        result.monoisotopic_mass,
        result.molecular_weight,
        result.cut_mass,
        result.cut_weight,
        result.number_bridges,
        result.lasso_class,
        result.score,
        result.rodeo_score,
        result.macrolactam,
    )
Example #24
0
    def parse_domain(self, domain: NRPSPKSQualifier.Domain, feature: CDSFeature
                     ) -> JSONDomain:
        """ Converts an NRPS/PKS domain into a JSONDomain.

            Arguments:
                domain: the domain to convert
                feature: the CDSFeature containing the domain

            Returns:
                a JSONDomain built from the domain, its parsed substrate
                predictions, NaPDoS and BLAST links, the domain's amino acid
                sequence and the feature's nucleotide sequence
        """
        predictions = parse_substrate_predictions(domain.predictions)
        domainseq = str(feature.translation)[domain.start:domain.end]

        # NaPDoS links are only generated for KS and condensation domains
        if domain.name == "PKS_KS":
            napdos_type = "KS"
        elif "Condensation" in domain.name:
            napdos_type = "C"
        else:
            napdos_type = ""

        napdoslink = ""
        if napdos_type:
            napdos_template = ("http://napdos.ucsd.edu/cgi-bin/process_request.cgi?"
                               "query_type=aa&amp;ref_seq_file=all_{0}_public_12062011.faa"
                               "&amp;Sequence=%3E{0}_domain_from_antiSMASH%0D{1}")
            napdoslink = napdos_template.format(napdos_type, domainseq)

        blast_template = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins"
                          "&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp"
                          "&amp;QUERY={}"
                          "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch")
        blastlink = blast_template.format(domainseq)

        dna_sequence = feature.extract(self.record.seq_record.seq)
        return JSONDomain(domain, predictions, napdoslink, blastlink, domainseq, dna_sequence)
Example #25
0
def result_vec_to_feature(orig_feature: secmet.CDSFeature,
                          res_vec: Thiopeptide) -> secmet.Prepeptide:
    """ Builds a thiopeptide Prepeptide from a CDSFeature and a Thiopeptide.

        NOTE: when the Thiopeptide has a C-terminal cut, its core is trimmed
        in place, so the provided res_vec is modified by this call.

        Arguments:
            orig_feature: the CDS feature supplying the location and name
            res_vec: the Thiopeptide holding the prediction results

        Returns:
            a Prepeptide whose detailed_information is a ThioQualifier
    """
    tail = res_vec.c_cut
    if tail:
        # the tail is passed separately, so remove it from the end of the core
        res_vec.core = res_vec.core[:-len(tail)]

    # mature weights are left empty for Type III thiopeptides
    mature_weights: List[float] = [] if res_vec.thio_type == "Type III" else res_vec.mature_alt_weights

    prepeptide = secmet.Prepeptide(
        orig_feature.location,
        "thiopeptide",
        res_vec.core,
        orig_feature.get_name(),
        "thiopeptides",
        res_vec.thio_type,
        res_vec.score,
        res_vec.monoisotopic_mass,
        res_vec.molecular_weight,
        res_vec.alternative_weights,
        leader=res_vec.leader,
        tail=tail,
    )
    prepeptide.detailed_information = ThioQualifier(
        res_vec.rodeo_score,
        res_vec.amidation,
        res_vec.macrocycle,
        res_vec.mature_features,
        mature_weights,
    )
    return prepeptide
 def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
     """ Builds one SecMetQualifier.Domain per hit recorded for the CDS.

         Hits are looked up by the CDS name in results_by_id, which (like
         num_seeds_per_hmm and tool) is taken from the surrounding scope.

         Arguments:
             cds: the CDSFeature to gather domains for

         Returns:
             a list of domains, empty if the CDS has no recorded hits
     """
     return [
         SecMetQualifier.Domain(hit.query_id, hit.evalue, hit.bitscore,
                                num_seeds_per_hmm[hit.query_id], tool)
         for hit in results_by_id.get(cds.get_name(), [])
     ]
Example #27
0
def get_description(
        record: Record, feature: CDSFeature, type_: str, options: ConfigType,
        mibig_result: List[clusterblast.results.MibigEntry]) -> str:
    """ Builds the HTML description text of a CDS feature.

        Arguments:
            record: the Record containing the feature; supplies the id, length
                    and sequence used for the links and nucleotide sequence
            feature: the CDSFeature to describe
            type_: the gene type, 'transport' adds a TransportDB BLAST link
            options: the config, read for output_dir and smcogs_trees
            mibig_result: the MiBIG entries for the feature; if not empty, a
                          hits table is written with generate_html_table and
                          linked from the description

        Returns:
            the HTML fragment as a string, with every character not in
            string.printable removed
    """
    blastp_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&" \
                 "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&" \
                 "LINK_LOC=protein&PAGE_TYPE=BlastSearch" % feature.translation
    genomic_context_url = "http://www.ncbi.nlm.nih.gov/projects/sviewer/?" \
                          "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"\
                          "id=%s&from=%s&to=%s"

    # the fallback has to be resolved before formatting: '%' binds tighter than
    # 'or', so without the explicit grouping the feature name was never used
    title = feature.product or feature.get_name()
    parts = ['<span class="svgene-tooltip-bold">%s</span><br>\n' % title]
    parts.append('Locus-tag: %s; Protein-ID: %s<br>\n' % (feature.locus_tag,
                                                          feature.protein_id))

    ec_numbers = feature.get_qualifier('EC_number')
    if ec_numbers:
        parts.append("EC-number(s): %s<br>\n" % ",".join(ec_numbers))

    parts.extend("%s<br>\n" % str(gene_function)
                 for gene_function in feature.gene_functions)

    parts.append("Location: %d - %d<br><br>\n" % (
        feature.location.start + 1,  # 1-indexed
        feature.location.end))

    if mibig_result:
        cluster_number = feature.cluster.get_cluster_number()
        mibig_homology_file = os.path.join(
            options.output_dir, "knownclusterblast",
            "cluster%d" % cluster_number,
            feature.get_accession() + '_mibig_hits.html')
        generate_html_table(mibig_homology_file, mibig_result)
        # the link is relative to the output directory
        mibig_path = mibig_homology_file[len(options.output_dir) + 1:]
        parts.append('<br><a href="%s" target="_new">MiBIG Hits</a><br>\n' % mibig_path)

    if type_ == 'transport':
        url = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;" \
              "program=blastp;database=pub/transporter.pep;" \
              "sequence=sequence%%0A%s" % feature.translation
        # anchor is now closed, previously the following content became part of the link
        parts.append('<a href="%s" target="_new">TransportDB BLAST on this gene</a><br>' % url)

    key = record.id + "_" + feature.get_name()
    if key in searchgtr_links:
        url = searchgtr_links[key]
        # anchor is now closed, as for the TransportDB link
        parts.append('<a href="%s" target="_new">SEARCHGTr on this gene</a><br>\n' % url)

    parts.append('<a href="%s" target="_new">NCBI BlastP on this gene</a><br>\n' % blastp_url)

    # the context window is clamped to the bounds of the record
    context = genomic_context_url % (
        record.id,
        max(feature.location.start - 9999, 0),
        min(feature.location.end + 10000, len(record)))
    parts.append("""<a href="%s" target="_new">View genomic context</a><br>\n""" % context)

    if options.smcogs_trees:
        for note in feature.notes:  # TODO find a better way to store image urls
            if note.startswith('smCOG tree PNG image:'):
                url = note.split(':')[-1]
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>\n'
                parts.append(entry % url)
                break

    parts.append(generate_asf_tooltip_section(record, feature))

    go_notes = generate_pfam2go_tooltip(record, feature)
    if go_notes:
        parts.append('<br><span class="bold">Gene Ontology terms for PFAM domains:</span><br>\n'
                     '%s<br><br>\n' % "<br>".join(go_notes))

    clipboard_fragment = """<a href="javascript:copyToClipboard('%s')">Copy to clipboard</a>"""
    parts.append("AA sequence: %s<br>\n" % (clipboard_fragment %
                                            feature.translation))
    parts.append("Nucleotide sequence: %s<br>\n" % (clipboard_fragment %
                                                    feature.extract(record.seq)))

    # drop anything unprintable; a set keeps the per-character check constant time
    printable = set(string.printable)
    return "".join(char for char in "".join(parts) if char in printable)
Example #28
0
def acquire_rodeo_heuristics(record: Record, cluster: Cluster,
                             query: CDSFeature, leader: str,
                             core: str) -> Tuple[int, List[Union[float, int]]]:
    """ Calculates the heuristic score for RODEO, along with the values
        gathered while doing so.

        Arguments:
            record: the Record containing the query
            cluster: the Cluster used for the strand check against PF00733
            query: the CDSFeature of the precursor peptide candidate
            leader: the leader sequence of the candidate
            core: the core sequence of the candidate, must not be empty as
                  the leader/core length ratio divides by its length

        Returns:
            a tuple of
                the heuristic score, and
                a list of gathered values: the core weight as calculated by
                utils.RobustProteinAnalysis, the distance reported by
                utils.distance_to_pfam for the biosynthetic profiles, then a
                1 or 0 for each heuristic marking whether it applied

            The final two heuristics (missing PF13471 and alternate start
            codon) only change the score and add nothing to the list.
    """
    tabs = []  # type: List[Union[float, int]]
    score = 0

    def apply_rule(matched: bool, weight: int) -> None:
        """ Adds the weight to the score if the heuristic matched and records
            the match as 1 or 0
        """
        nonlocal score
        if matched:
            score += weight
            tabs.append(1)
        else:
            tabs.append(0)

    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core,
                                                monoisotopic=True,
                                                ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    hmmer_profiles = ['PF13471', 'PF00733', 'PF05402']
    distance = utils.distance_to_pfam(record, query, hmmer_profiles)
    tabs.append(distance)

    cysteines = core.count("C")

    # Within 500 nucleotides of any biosynthetic protein (E, B, C)	+1
    apply_rule(distance < 500, 1)
    # Within 150 nucleotides of any biosynthetic protein (E, B, C)	+1
    apply_rule(distance < 150, 1)
    # Greater than 1000 nucleotides from every biosynthetic protein (E, B, C)	-2
    apply_rule(distance > 1000, -2)
    # Core region has 2 or 4 Cys residues	+1
    apply_rule(cysteines in [2, 4], 1)
    # Leader region is longer than core region	+2
    apply_rule(len(leader) > len(core), 2)
    # Core has 7 (Glu) or 8(Glu/Asp) or 9 (Asp) membered ring possible	+1
    apply_rule('E' in core[6:8] or 'D' in core[7:9], 1)
    # Leader region contains GxxxxxT	+3
    apply_rule(bool(re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader)), 3)
    # Core starts with G	+2
    apply_rule(core.startswith("G"), 2)
    # Peptide and lasso cyclase are on same strand	+1
    apply_rule(bool(is_on_same_strand_as(cluster, query, 'PF00733')), 1)
    # Leader/core region length ratio between 0.5 and 2 (inclusive)	+1
    apply_rule(0.5 <= len(leader) / len(core) <= 2, 1)
    # Core starts with Cys and has an even number of Cys	0
    apply_rule(core.startswith("C") and cysteines % 2 == 0, 0)
    # Core contains no Gly	-4
    apply_rule("G" not in core, -4)
    # Core has at least one aromatic residue	+1
    apply_rule(bool(set("FWY") & set(core)), 1)
    # Core has at least 2 aromatic residues	+2
    apply_rule(sum(core.count(aa) for aa in "FWY") >= 2, 2)
    # Core has odd number of Cys	-2
    apply_rule(cysteines % 2 != 0, -2)
    # Leader region contains Trp	-1
    apply_rule("W" in leader, -1)
    # Leader region contains Lys	+1
    apply_rule("K" in leader, 1)
    # Leader region has Cys	-2
    apply_rule("C" in leader, -2)

    # the remaining heuristics affect the score only and are not recorded

    # Gene cluster does not contain PF13471	-2
    # (looked up once, instead of once per comparison)
    pf13471_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if pf13471_distance == -1 or pf13471_distance > 10000:
        score -= 2
    # Peptide utilizes alternate start codon	-1
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1
    return score, tabs
Example #29
0
 def __init__(self, feature: CDSFeature) -> None:
     """ Initialises the instance from a CDS feature, with no domains.

         Arguments:
             feature: the CDSFeature supplying the name and translation
     """
     super().__init__(['id', 'sequence', 'domains'])
     self.id = feature.get_name()
     self.sequence = feature.translation
     # starts empty, nothing is added to it here
     self.domains: List[JSONDomain] = []
Example #30
0
def generate_rodeo_svm_csv(
        record: Record, query: CDSFeature, leader: str, core: str,
        previously_gathered_tabs: List[Union[float,
                                             int]], fimo_motifs: List[int],
        fimo_scores: Dict[int, float]) -> List[Union[float, int]]:
    """ Generates all the items for a single precursor peptide candidate.

        Arguments:
            record: the Record containing the query
            query: the CDSFeature of the candidate
            leader: the leader sequence, must not be empty as the ratio
                    column divides by its length
            core: the core sequence, must not be empty as its first residue
                  is inspected
            previously_gathered_tabs: values gathered earlier, inserted
                                      directly after the first two columns
            fimo_motifs: the numbers of the motifs that were hit
            fimo_scores: a dictionary mapping motif number to score, must
                         contain every motif from 1 to 16 in fimo_motifs

        Returns:
            a list of column values, in a fixed order
    """
    amino_acids = "ARDNCQEGHILKMFPSTWYV"
    # aromatic, negative, positive, charged, aliphatic, hydroxyl
    residue_groups = ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST")
    charge_dict = {"E": -1, "D": -1, "K": 1, "H": 1, "R": 1}
    motif_numbers = range(1, 17)

    def has_pfam(pfam_id: str) -> int:
        """ Returns 0 if the distance to the profile is -1 or over 10000,
            otherwise 1. The distance is only looked up once.
        """
        distance = utils.distance_to_pfam(record, query, [pfam_id])
        return 0 if distance == -1 or distance > 10000 else 1

    def estimate_charge(sequence: str) -> int:
        """ Sums the charges of all residues in the sequence that have one """
        return sum(charge_dict.get(aa, 0) for aa in sequence)

    def composition(sequence: str) -> List[int]:
        """ Counts each amino acid in the sequence, followed by the totals
            for each residue group
        """
        counts = [sequence.count(aa) for aa in amino_acids]
        counts.extend(sum(sequence.count(aa) for aa in group)
                      for group in residue_groups)
        return counts

    precursor = leader + core

    columns = []  # type: List[Union[float, int]]
    # Precursor Index
    columns.append(1)
    # classification
    columns.append(0)
    columns += previously_gathered_tabs
    # Cluster has PF00733?
    columns.append(has_pfam('PF00733'))
    # Cluster has PF05402?
    columns.append(has_pfam('PF05402'))
    # Cluster has PF13471?
    columns.append(has_pfam('PF13471'))
    # Leader has LxxxxxT motif?
    columns.append(1 if re.search('(L[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader) else 0)
    # Core has adjacent identical aas (doubles)?
    columns.append(1 if any(first == second for first, second in zip(core, core[1:])) else 0)
    # Core length (aa)
    columns.append(len(core))
    # Leader length (aa)
    columns.append(len(leader))
    # Precursor length (aa)
    columns.append(len(precursor))
    # Leader/core ratio
    # NOTE(review): the value is core length over leader length, despite the
    # label; left as is since consumers of this column presumably expect it,
    # confirm before changing
    columns.append(len(core) / len(leader))
    # Number of Pro in first 9 aa of core?
    columns.append(core[:9].count("P"))

    core_charge = estimate_charge(core)
    leader_charge = estimate_charge(leader)
    precursor_charge = estimate_charge(precursor)
    # Estimated core, leader, and precursor charges
    columns += [core_charge, leader_charge, precursor_charge]
    # Absolute values of core, leader, and precursor charges
    columns += [abs(core_charge), abs(leader_charge), abs(precursor_charge)]

    # Counts of AAs in leader, then aromatic, neg charged, pos charged,
    # charged, aliphatic, and hydroxyl totals
    columns += composition(leader)
    # The same for the core
    columns += composition(core)
    # Counts (0 or 1) of amino acids within first AA position of core sequence
    columns += [core[0].count(aa) for aa in amino_acids]
    # The same counts and totals for leader+core
    columns += composition(precursor)  # Temp to work with current training CSV

    # Motifs
    columns += [1 if motif in fimo_motifs else 0 for motif in motif_numbers]
    # Total motifs hit
    columns.append(len(fimo_motifs))
    # Motif scores
    motif_scores = [fimo_scores[motif] if motif in fimo_motifs else 0
                    for motif in motif_numbers]
    columns += motif_scores
    # Sum of MEME scores
    columns.append(sum(motif_scores))
    # No Motifs?
    columns.append(0 if fimo_motifs else 1)
    # Alternate Start Codon?
    columns.append(0 if str(query.extract(record.seq)).startswith("ATG") else 1)
    return columns