def find_ks_domains(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types

        Runs hmmscan with the bundled "ksdomains.hmm" profiles (using the
        "--cut_tc" option) and refines the raw hits in neighbour mode.

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of KS domain results for that feature
    """
    opts = ["--cut_tc"]
    ks_file = path.get_full_path(__file__, "data", "ksdomains.hmm")
    lengths = utils.get_hmm_lengths(ks_file)
    domains = subprocessing.run_hmmscan(ks_file, fasta, opts)
    # the refined mapping is the actual result; previously it was discarded
    # and the function unconditionally raised NotImplementedError
    return refine_hmmscan_results(domains, lengths, neighbour_mode=True)
Beispiel #2
0
def scan_for_functions(cds_features: List[CDSFeature], database: str,
                       hmmscan_opts: Optional[List[str]] = None) -> Dict[str, HMMResult]:
    """ Finds the best classification for each of the provided genes.

        Arguments:
            cds_features: a list of CDSFeatures to classify
            database: the path to the database to check
            hmmscan_opts: a list of extra options to provide to hmmscan

        Returns:
            a dictionary mapping CDS name to the first refined HMMResult for
                that CDS, CDSs without any hits are omitted
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    raw_hits = subprocessing.run_hmmscan(database, query_fasta, hmmscan_opts)
    profile_lengths = utils.get_hmm_lengths(database)
    refined = refine_hmmscan_results(raw_hits, profile_lengths)

    top_hit_by_cds = {}  # type: Dict[str, HMMResult]
    for name in (cds.get_name() for cds in cds_features):
        candidates = refined.get(name)
        # only CDSs with at least one refined hit get an entry
        if candidates:
            top_hit_by_cds[name] = candidates[0]
    return top_hit_by_cds
 def test_combined(self):
     """ Refining the fixture results should leave exactly one hit for the gene """
     refined = refinement.refine_hmmscan_results(self.results, self.hmm_lengths)
     # a single gene with a single surviving hit is expected
     assert len(refined) == 1
     assert len(refined[self.gene_id]) == 1
     top_hit = refined[self.gene_id][0]
     assert top_hit.hit_id == "SMCOG1048:sensor_histidine_kinase"
     assert top_hit.evalue == 3.6e-13
     assert top_hit.bitscore == 43.5
     assert top_hit.query_start == 91
     assert top_hit.query_end == 390
Beispiel #4
0
def find_ab_motifs(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Scans the given sequences for abMotifs

        Runs hmmscan against the bundled "abmotifs.hmm" profiles with an
        e-value option of 0.25 and refines the hits in neighbour mode.

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of motif results for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "abmotifs.hmm")
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["-E", "0.25"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_hits, profile_lengths, neighbour_mode=True)
    return refined
Beispiel #5
0
def run_t2pks_hmmscan(cluster: Cluster) -> Dict[str, List[HMMResult]]:
    """ Runs the type II PKS hmmscan over the CDS children of a cluster

        Arguments:
            cluster: the Cluster whose cds_children will be scanned

        Returns:
            a dictionary of refined hmmscan results, as returned by
            refine_hmmscan_results for the "t2pks.hmm" profiles
    """
    query_fasta = fasta.get_fasta_from_features(cluster.cds_children)
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    scan_options = ['--cut_tc']
    raw_hits = subprocessing.run_hmmscan(profile_path, query_fasta, opts=scan_options)
    profile_lengths = get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_hits, profile_lengths)
    return refined
Beispiel #6
0
def find_domains(fasta: str, record: Record) -> Dict[str, List[HMMResult]]:
    """ Scans the given sequences for NRPS/PKS domains
        (C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL)

        The refined hits are passed through filter_nonterminal_docking_domains
        before being returned.

        Arguments:
            fasta: a group of features in fasta format
            record: the Record that contains all the features

        Returns:
            a dictionary mapping feature name to a list of domain results for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "nrpspksdomains.hmm")
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["--cut_tc"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_hits, profile_lengths, neighbour_mode=True)
    return filter_nonterminal_docking_domains(record, refined)
Beispiel #7
0
def run_t2pks_hmmscan(
        cds_features: Iterable[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan for type II PKS proteins on the given CDSFeatures

        Arguments:
            cds_features: the CDSFeatures to run the type II PKS hmmscan on

        Returns:
            a dictionary of refined hmmscan results, as returned by
            refine_hmmscan_results for the "t2pks.hmm" profiles
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    scan_options = ['--cut_tc']
    raw_hits = subprocessing.run_hmmscan(profile_path, query_fasta, opts=scan_options)
    profile_lengths = get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_hits, profile_lengths)
    return refined
Beispiel #8
0
def classify_genes(
        cds_features: List[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Finds possible smCOG classifications for the provided genes.

        Runs hmmscan against the bundled "smcogs.hmm" profiles with an
        e-value option of 1E-6 and refines the hits.

        Arguments:
            cds_features: a list of CDSFeatures to classify

        Returns:
            a dictionary mapping CDS name to a list of HMMResult instances of
                classifications
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    scan_options = ["-E", "1E-6"]
    profile_path = path.get_full_path(__file__, "data", "smcogs.hmm")
    raw_hits = subprocessing.run_hmmscan(profile_path, query_fasta, scan_options)
    profile_lengths = utils.get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_hits, profile_lengths)
    return refined
Beispiel #9
0
def run_starter_unit_blastp(
        cluster: Cluster,
        cds_hmm_hits: Dict[str,
                           List[HMMResult]]) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on starter unit coding sequences in given cluster

        Only CDSs with at least one hit whose id is one of
        KSIII, AT, AMID or LIG are searched, each against the database
        named after that hit id in the "data" directory.

        Arguments:
            cluster: Cluster on which the blastp shall be run
            cds_hmm_hits: HMMResults by cds name from type II PKS hmmscan

        Returns:
            an empty dictionary if no starter unit CDSs are present,
            otherwise the refined blastp results for the cluster
    """
    starter_labels = ['KSIII', 'AT', 'AMID', 'LIG']

    # first pass: resolve every CDS name with starter unit hits to its feature
    hit_ids_by_cds = {}
    for cds_name, hmm_hits in cds_hmm_hits.items():
        matching_ids = [hit.hit_id for hit in hmm_hits
                        if hit.hit_id in starter_labels]
        if not matching_ids:
            continue
        feature = cluster.parent_record.get_cds_by_name(cds_name)
        hit_ids_by_cds[feature] = matching_ids

    if not hit_ids_by_cds:
        return {}

    # second pass: blast each CDS against the database of each of its hit ids
    raw_hits = []
    reference_fastas = set()
    for feature, matching_ids in hit_ids_by_cds.items():
        query = fasta.get_fasta_from_features([feature])
        for hit_id in matching_ids:
            database_path = path.get_full_path(__file__, 'data', hit_id)
            raw_hits.extend(subprocessing.run_blastp(database_path, query))
            reference_fastas.add(
                path.get_full_path(__file__, 'data', hit_id + '.fasta'))

    # lengths come from the fasta files of every database that was searched
    reference_lengths = {}
    for reference in reference_fastas:
        reference_lengths.update(get_fasta_lengths(reference))

    return refine_hmmscan_results(raw_hits, reference_lengths)
Beispiel #10
0
def run_starter_unit_blastp(
    cds_hmm_hits: Dict[CDSFeature,
                       List[HMMResult]]) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on starter unit coding sequences

        For every hit with an id of KSIII, AT, AMID or LIG, the CDS is
        searched against the database named after that hit id in the "data"
        directory. Refined hit ids lacking a "-CoA" suffix have it appended.

        Arguments:
            cds_hmm_hits: HMMResults by cds from type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMresults,
            empty if blastp produced no results
    """
    starter_labels = ['KSIII', 'AT', 'AMID', 'LIG']
    raw_hits = []
    reference_fastas = set()
    for feature, hmm_hits in cds_hmm_hits.items():
        query = fasta.get_fasta_from_features([feature])
        for hmm_hit in hmm_hits:
            if hmm_hit.hit_id in starter_labels:
                database_path = path.get_full_path(__file__, 'data', hmm_hit.hit_id)
                raw_hits.extend(subprocessing.run_blastp(database_path, query))
                reference_fastas.add(
                    path.get_full_path(__file__, 'data', hmm_hit.hit_id + '.fasta'))

    if not raw_hits:
        return {}

    # lengths come from the fasta files of every database that was searched
    reference_lengths = {}
    for reference in reference_fastas:
        reference_lengths.update(get_fasta_lengths(reference))

    refined = refine_hmmscan_results(raw_hits, reference_lengths)
    for cds_hits in refined.values():
        for index, blast_hit in enumerate(cds_hits):
            if blast_hit.hit_id.endswith("-CoA"):
                continue
            # replace in place with a copy carrying the "-CoA" suffix
            cds_hits[index] = HMMResult(blast_hit.hit_id + "-CoA",
                                        blast_hit.query_start,
                                        blast_hit.query_end,
                                        blast_hit.evalue,
                                        blast_hit.bitscore)
    return refined