예제 #1
0
def run_non_biosynthetic_phmms(fasta: str) -> Dict[str, List[str]]:
    """ Finds lanthipeptide-specific domains in the input fasta

        Each profile listed in data/non_biosyn_hmms/hmmdetails.txt is run
        against the fasta with hmmsearch, keeping only those HSPs with a
        bitscore above the cutoff listed for that profile.

        Arguments:
            fasta: a string containing gene sequences in fasta format

        Returns:
            a dictionary mapping the hit id to a list of matching query ids,
            which is empty if the fasta itself is empty
    """
    non_biosynthetic_hmms_by_id = defaultdict(
        list)  # type: Dict[str, List[str]]
    # with no sequences there can be no hits, so don't read the profile
    # details or start hmmsearch at all
    if not fasta:
        return non_biosynthetic_hmms_by_id

    details_path = path.get_full_path(__file__, "data", "non_biosyn_hmms",
                                      "hmmdetails.txt")
    with open(details_path, "r") as handle:
        # only lines containing exactly three tabs are treated as profile rows
        hmmdetails = [
            line.strip().split("\t") for line in handle
            if line.count("\t") == 3
        ]
    # the four columns are passed on positionally, the third as an integer cutoff
    signature_profiles = [
        HmmSignature(details[0], details[1], int(details[2]), details[3])
        for details in hmmdetails
    ]
    for sig in signature_profiles:
        # keep only the file name part of the signature's path and resolve it
        # within the same data/non_biosyn_hmms directory as the details file
        sig.path = path.get_full_path(__file__, "data", "non_biosyn_hmms",
                                      sig.path.rpartition(os.sep)[2])
        for runresult in subprocessing.run_hmmsearch(sig.path, fasta):
            for hsp in runresult.hsps:
                # store the result only if it is above the cutoff
                if hsp.bitscore > sig.cutoff:
                    non_biosynthetic_hmms_by_id[hsp.hit_id].append(
                        hsp.query_id)
    return non_biosynthetic_hmms_by_id
예제 #2
0
def run_non_biosynthetic_phmms(cluster_fasta: str) -> Dict[str, List[HSP]]:
    """ Runs every profile listed in data/non_biosyn_hmms/hmmdetails.txt
        against the given fasta and gathers the HSPs scoring above the
        cutoff of their profile

        Arguments:
            cluster_fasta: a string containing the sequences to search,
                           in fasta format

        Returns:
            a dictionary mapping hit id to a list of the HSPs for that hit,
            or an empty dictionary if the fasta is empty
    """
    # nothing to search
    if not cluster_fasta:
        return {}

    details_file = path.get_full_path(__file__, "data", "non_biosyn_hmms",
                                      "hmmdetails.txt")
    with open(details_file, "r") as handle:
        rows = handle.read().splitlines()

    signature_profiles = []
    for row in rows:
        # only rows with exactly four tab-separated columns are used
        if row.count("\t") != 3:
            continue
        name, description, cutoff, hmm_file = row.split("\t")
        signature_profiles.append(
            HmmSignature(name, description, int(cutoff), hmm_file))

    hits_by_id = defaultdict(list)  # type: Dict[str, List[HSP]]
    for profile in signature_profiles:
        # resolve the profile's path within the data/non_biosyn_hmms directory
        profile.path = path.get_full_path(__file__, "data", "non_biosyn_hmms",
                                          profile.path)
        search_results = subprocessing.run_hmmsearch(profile.path,
                                                     cluster_fasta)
        for search_result in search_results:
            for hsp in search_result.hsps:
                # anything at or below the cutoff is discarded
                if hsp.bitscore > profile.cutoff:
                    hits_by_id[hsp.hit_id].append(hsp)
    return hits_by_id
예제 #3
0
def acquire_rodeo_heuristics(
        cluster: secmet.Protocluster, query: secmet.CDSFeature, leader: str,
        core: str, domains: Dict[str,
                                 int]) -> Tuple[int, List[float], List[int]]:
    """ Calculates the RODEO heuristic score and feature values for a
        candidate precursor peptide

        Feature values are appended in the order the checks appear below;
        callers presumably rely on that order, so verify before reordering.

        Arguments:
            cluster: the protocluster containing the query, its parent record
                     and CDS children are searched for PF04055
            query: the CDS feature of the candidate precursor
            leader: the leader section of the precursor sequence
            core: the core section of the precursor sequence
            domains: the domains found in the cluster, only the presence of
                     keys is tested here

        Returns:
            a tuple of
                the heuristic score,
                the list of feature values (three masses, the distance to
                PF04055, then 0/1 flags),
                the hit end positions of all PF04055 HSPs with a bitscore over 40
    """
    tabs = []  # type: List[float]
    score = 0

    def flag(hit: bool, if_hit: int, if_miss: int = 0) -> None:
        """ Appends a single 1/0 field and adjusts the score to match """
        nonlocal score
        if hit:
            score += if_hit
            tabs.append(1)
        else:
            score += if_miss
            tabs.append(0)

    def flag_pair(hit: bool, if_hit: int, if_miss: int) -> None:
        """ Appends a 1/0 field followed by its binary opposite
            and adjusts the score to match
        """
        nonlocal score
        if hit:
            score += if_hit
            tabs.extend([1, 0])
        else:
            score += if_miss
            tabs.extend([0, 1])

    precursor = leader + core

    # Calcd. masses (Da) of the precursor, leader and core peptides, in that order
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence,
                                               monoisotopic=True,
                                               ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    # calculated once and reused for each of the rSAM distance checks
    distance = utils.distance_to_pfam(cluster.parent_record, query,
                                      ['PF04055'])
    tabs.append(distance)
    # NOTE(review): the `== -1` test below suggests -1 is the "no hit" value,
    # in which case it also satisfies both "within" checks - confirm intended
    # rSAM within 500 nt?
    flag(distance < 500, 1)
    # rSAM within 150 nt?
    flag(distance < 150, 1)
    # rSAM further than 1000 nt?
    # NOTE(review): the threshold used is 10000, not the 1000 of the
    # description above - confirm which is intended
    flag(distance == -1 or distance > 10000, -2)

    # Ratio of N-term to 1st Cys 0.25<x<0.60; Ratio of N-term to 1st Cys <0.25 or >0.60
    # a precursor without any Cys counts as being outside the range
    first_cys = precursor.find("C")
    flag_pair(first_cys != -1 and 0.25 <= first_cys / len(precursor) <= 0.60,
              2, -2)

    # Three or more Cys; Less than 3 Cys
    flag_pair(precursor.count("C") >= 3, 4, -4)

    # CxC/CxxC/CxxxC/CxxxxxC; # CC/CCC
    # each motif is a separate field, so a CCC also triggers the CC penalty
    motifs = ((r'C.{5}C', 2), (r'C.{3}C', 1), (r'C.{2}C', 1), (r'C.{1}C', 1),
              (r'CC', -2), (r'CCC', -2))
    for pattern, weight in motifs:
        flag(bool(re.search(pattern, core)), weight)

    # No Cys in last 1/4th?
    # negating before the floor division rounds the size of the section up
    quarter_length = -len(precursor) // 4
    flag("C" not in precursor[quarter_length:], 1, -1)

    # 2 Cys in first 2/3rds of precursor, 1 Cys in last 1/3rd of precursor
    two_thirds = 2 * len(precursor) // 3
    flag(precursor[:two_thirds].count("C") == 2
         and precursor[two_thirds:].count("C") == 1, 1)

    # Peptide matches SboA hmm, SkfA hmm, SCIFF hmm
    for profile, weight in (("Subtilosin_A", 3), ("TIGR04404", 3),
                            ("TIGR03973", 2)):
        flag(bool(cds_has_domains(query, {profile})), weight)

    # cluster has PqqD/RRE (PF05402)
    flag("PF05402" in domains, 1)
    # cluster has SPASM domain (PF13186)
    flag("PF13186" in domains, 1)

    # PF04055 (rSAM) domain start > 80
    runresults = subprocessing.run_hmmsearch(
        path.get_full_path(__file__, "data", "PF04055.hmm"),
        fasta.get_fasta_from_features(cluster.cds_children))
    max_start = 0
    hitends = []
    for runresult in runresults:
        for hsp in runresult.hsps:
            # only hits above the cut-off are considered
            if hsp.bitscore > 40:
                max_start = max(hsp.hit_start, max_start)
                hitends.append(hsp.hit_end)
    # max_start only exceeds its initial 0 if at least one hit was kept
    flag(max_start > 80, 1)

    # cluster has peptidase, one field and one point per peptidase domain
    peptidase_domains = [
        "Peptidase_M16_C", "Peptidase_S8", "Peptidase_M16", "Peptidase_S41"
    ]
    peptidase_hits = [pepdom in domains for pepdom in peptidase_domains]
    for hit in peptidase_hits:
        flag(hit, 1)

    # cluster has transporter (PF00005, PF00664)
    # cluster has response regulator (PF00072)
    # cluster has major facilitator (PF07690)
    # cluster has ATPase (PF13304)
    # cluster has Fer4_12 (PF13353)
    for name in ("PF00005", "PF00664", "PF00072", "PF07690", "PF13304",
                 "PF13353"):
        flag(name in domains, 1)

    # cluster has rSAM (PF04055)
    flag("PF04055" in domains or "TIGR03975" in domains, 2)
    # cluster has no recognized peptidase
    flag(not any(peptidase_hits), -2)

    # C-terminal portion is < 0.35 or > 0.65; C-terminal portion is defined as
    # the part from the last cysteine in the last identified Cx(n)C motif to the C-terminus
    # the binary opposite is also included as the next field
    # NOTE(review): last_motif_c is built from negative indices and so is
    # never positive, meaning the ratio can't reach 0.35 and the +3 case looks
    # unreachable; the loop also keeps scanning after the first match from the
    # C-terminus. Kept as is since it affects scoring - confirm against the
    # reference implementation before changing.
    last_motif_c = 0
    for offset, aa in enumerate(reversed(precursor), start=1):
        index = -offset
        if aa == "C" and "C" in precursor[index - 6:index]:
            last_motif_c = index + 1
    within_range = 0.35 <= last_motif_c / len(precursor) <= 0.65
    flag_pair(not within_range, -2, 3)

    # SS profile count > 1
    # is there more than one Cx..C structure in the sequence
    # the lookahead allows overlapping structures to be counted
    cysrex = '(?=(C.{%d,%d}C))' % (CHAIN_LOWER, CHAIN_UPPER)
    rex4 = re.compile(cysrex)
    flag(len(rex4.findall(core)) > 1, 2)

    return score, tabs, hitends
def detect_protoclusters_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                        rule_files: List[str], valid_categories: Set[str],
                                        filter_file: str, tool: str,
                                        annotate_existing_subregions: bool = True) -> RuleDetectionResults:
    """ Searches all CDS features of a record with the HMM signatures and
        builds protoclusters from the resulting hits, according to the
        provided detection rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            valid_categories: a set containing valid rule category strings
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. rule_based_clusters)
            annotate_existing_subregions: if True, subregions already present in the record
                    will have domains annotated even if no protocluster is found

        Returns:
            a RuleDetectionResults instance built from the CDS results of each
            protocluster, the tool name, and the CDS results found only in
            existing subregions

        Raises:
            ValueError: if no rule files are given or if a hit matches
                        none of the signatures by either id or accession
    """
    if not rule_files:
        raise ValueError("rules must be provided")

    full_fasta = fasta.get_fasta_from_record(record)
    # without CDS features the fasta is empty and there's nothing to search
    if not full_fasta:
        return RuleDetectionResults({}, tool, [])

    sig_by_name = {sig.name: sig for sig in get_signature_profiles(signature_file)}

    # each rule file builds on the rules and aliases of those before it
    rules: List[rule_parser.DetectionRule] = []
    aliases: Dict[str, List[rule_parser.Token]] = {}
    for rule_file in rule_files:
        rules = create_rules(rule_file, set(sig_by_name), valid_categories, aliases, rules)

    results = []
    results_by_id: Dict[str, List[HSP]] = {}
    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        # the accession without its version suffix is the fallback identifier
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            for identifier in (hsp.query_id, acc):
                if identifier in sig_by_name:
                    sig = sig_by_name[identifier]
                    break
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (
                                                    hsp.query_id, acc))
            # only hits above the cutoff of their signature are kept
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id, filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # annotate everything in detected protoclusters
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
        """ Builds a domain qualifier for each remaining hit of the given CDS """
        return [
            SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                   num_seeds_per_hmm[hsp.query_id], tool)
            for hsp in results_by_id.get(cds.get_name(), [])
        ]

    cds_results_by_cluster = {}
    cdses_with_annotations = set()
    for cluster in clusters:
        annotated = []
        for cds in record.get_cds_features_within_location(cluster.location):
            domains = get_domains_for_cds(cds)
            # CDS features without any hits are not reported
            if not domains:
                continue
            annotated.append(CDSResults(cds, domains, cds_domains_by_cluster.get(cds.get_name(), {})))
            cdses_with_annotations.add(cds)
        cds_results_by_cluster[cluster] = annotated

    # add detected profile annotations for any existing subregions, if enabled
    cds_results_outside_clusters = []
    if annotate_existing_subregions:
        for subregion in record.get_subregions():
            for cds in subregion.cds_children:
                # already covered by a protocluster or an earlier subregion
                if cds in cdses_with_annotations:
                    continue
                domains = get_domains_for_cds(cds)
                if not domains:
                    continue
                cds_results_outside_clusters.append(CDSResults(cds, domains, {}))
                cdses_with_annotations.add(cds)

    return RuleDetectionResults(cds_results_by_cluster, tool, cds_results_outside_clusters)
예제 #5
0
def detect_borders_and_signatures(record: Record, signature_file: str,
                                  seeds_file: str, rules_file: str,
                                  filter_file: str,
                                  tool: str) -> RuleDetectionResults:
    """ Searches all CDS features of a record with the HMM signatures, builds
        clusters from the resulting hits according to the detection rules
        and adds each cluster found to the record as a cluster border.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rules_file: the file containing all the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults instance built from the CDS results of each
            cluster and the tool name, or None (despite the annotation)
            if the record has no CDS features

        Raises:
            ValueError: if a hit matches none of the signatures by either
                        id or accession
    """
    full_fasta = fasta.get_fasta_from_record(record)
    # without CDS features the fasta is empty and there's nothing to search
    if not full_fasta:
        return None

    sig_by_name = {}
    for profile in get_signature_profiles(signature_file):
        sig_by_name[profile.name] = profile
    rules = create_rules(rules_file, set(sig_by_name))

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]
    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        # the accession without its version suffix is the fallback identifier
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            for identifier in (hsp.query_id, acc):
                if identifier in sig_by_name:
                    sig = sig_by_name[identifier]
                    break
            else:
                raise ValueError(
                    'Failed to find signature for ID %s / ACC %s' %
                    (hsp.query_id, acc))
            # only hits above the cutoff of their signature are kept
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id,
                                            filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(
        record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Save final results to record
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_clusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        record.add_cluster_border(cluster)
        annotated = []
        # the search area is the cluster's location widened by its extent
        # on both sides
        cluster_extent = FeatureLocation(
            cluster.location.start - cluster.extent,
            cluster.location.end + cluster.extent)
        for cds in record.get_cds_features_within_location(cluster_extent):
            domains = [
                SecMetQualifier.Domain(hsp.query_id, hsp.evalue,
                                       hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(cds.get_name(), [])
            ]
            # CDS features without any hits are not reported
            if not domains:
                continue
            annotated.append(
                CDSResults(cds, domains,
                           cds_domains_by_cluster.get(cds.get_name(), {})))
        cds_results_by_cluster[cluster] = annotated

    return RuleDetectionResults(cds_results_by_cluster, tool)