def scan_for_functions(cds_features: List[CDSFeature], database: str,
                       hmmscan_opts: Optional[List[str]] = None) -> Dict[str, HMMResult]:
    """ Finds a single classification for each of the provided genes.

        Runs hmmscan over the given features against the given database,
        refines the raw results and keeps only the first refined result
        for each CDS.

        Arguments:
            cds_features: a list of CDSFeatures to classify
            database: the path to the database to check
            hmmscan_opts: a list of extra options to provide to hmmscan

        Returns:
            a dictionary mapping CDS name to the first HMMResult of the refined
            results for that CDS, CDS features without results are omitted
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    raw_results = subprocessing.run_hmmscan(database, query_fasta, hmmscan_opts)
    profile_lengths = utils.get_hmm_lengths(database)
    refined = refine_hmmscan_results(raw_results, profile_lengths)

    best_hits = {}  # type: Dict[str, HMMResult]
    for cds in cds_features:
        name = cds.get_name()
        candidates = refined.get(name)
        # a missing or empty result list means there is nothing to report
        if candidates:
            # presumably the refined results are ordered best-first - TODO confirm
            best_hits[name] = candidates[0]
    return best_hits
def run_hmmer(record: Record, features: Iterable[CDSFeature], max_evalue: float,
              min_score: float, database: str, tool: str,
              filter_overlapping: bool = True) -> HmmerResults:
    """ Build hmmer results for the given features

        Arguments:
            record: the Record instance to run hmmer over
            features: the CDSFeatures to run over specifically
            max_evalue: a maximum evalue allowed for hits (exclusive)
            min_score: a minimum score allowed for hits (exclusive)
            database: the database to search for hits within
            tool: the name of the specific tool calling into this module
            filter_overlapping: if True, hits are grouped by locus tag and
                                each group is passed through remove_overlapping
                                along with the cutoffs of the database

        Returns:
            a HmmerResults instance built from the record's id, the given
            thresholds, database and tool, and the resulting hits

        Raises:
            ValueError: if the given database path does not exist
    """
    if not os.path.exists(database):
        raise ValueError("Given database does not exist: %s" % database)

    query_fasta = fasta.get_fasta_from_features(features)
    raw_results = subprocessing.run_hmmscan(database, query_fasta, opts=["--cut_tc"])
    all_hits = build_hits(record, raw_results, min_score, max_evalue, database)

    # without filtering, every hit that was built is reported as-is
    if not filter_overlapping:
        return HmmerResults(record.id, max_evalue, min_score, database, tool, all_hits)

    # overlap removal is handled one locus at a time
    hits_by_locus = defaultdict(list)
    for hit in all_hits:
        hits_by_locus[hit.locus_tag].append(hit)

    cutoffs = pfamdb.get_pfam_cutoffs(database)
    kept_hits = [
        kept
        for locus_hits in hits_by_locus.values()
        for kept in remove_overlapping(locus_hits, cutoffs)
    ]
    return HmmerResults(record.id, max_evalue, min_score, database, tool, kept_hits)
def run_hmmer(record: Record, features: List[CDSFeature], max_evalue: float,
              min_score: float, database: str, tool: str) -> HmmerResults:
    """ Build hmmer results for the given features

        Arguments:
            record: the Record instance the features belong to
            features: the list of CDSFeatures to run hmmscan over
            max_evalue: the evalue threshold handed to build_hits
            min_score: the score threshold handed to build_hits
            database: the path of the database to search for hits within
            tool: the name of the specific tool calling into this module

        Returns:
            a HmmerResults instance built from the record's id, the given
            thresholds, database and tool, and the hits that were built

        Raises:
            ValueError: if the given database path does not exist
    """
    database_missing = not os.path.exists(database)
    if database_missing:
        raise ValueError("Given database does not exist: %s" % database)

    query_fasta = fasta.get_fasta_from_features(features)
    raw_results = subprocessing.run_hmmscan(database, query_fasta)
    found_hits = build_hits(record, raw_results, min_score, max_evalue, database)
    return HmmerResults(record.id, max_evalue, min_score, database, tool, found_hits)
def find_ks_domains(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Finds KS domains within the given sequences, as used for detecting
        NRPS/PKS types

        Runs hmmscan with the "--cut_tc" option against the "ksdomains.hmm"
        profiles located via this module's path and refines the results
        in neighbour mode.

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of KS domain results
            for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "ksdomains.hmm")
    profile_lengths = utils.get_hmm_lengths(profile_path)
    raw_results = subprocessing.run_hmmscan(profile_path, fasta, ["--cut_tc"])
    return refine_hmmscan_results(raw_results, profile_lengths, neighbour_mode=True)
def find_ab_motifs(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Finds abMotifs within the given sequences

        Runs hmmscan with the options "-E 0.25" against the "abmotifs.hmm"
        profiles located via this module's path and refines the results
        in neighbour mode.

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of motif results
            for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "abmotifs.hmm")
    raw_results = subprocessing.run_hmmscan(profile_path, fasta, ["-E", "0.25"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    return refine_hmmscan_results(raw_results, profile_lengths, neighbour_mode=True)
def run_t2pks_hmmscan(cluster: Cluster) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan for type II PKS proteins on the coding sequences
        of a cluster

        The CDS children of the cluster are converted to fasta and scanned
        with the "--cut_tc" option against the "t2pks.hmm" profiles located
        via this module's path.

        Arguments:
            cluster: the Cluster on which to run the type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMResults for that CDS
    """
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    query_fasta = fasta.get_fasta_from_features(cluster.cds_children)
    raw_results = subprocessing.run_hmmscan(profile_path, query_fasta, opts=['--cut_tc'])
    profile_lengths = get_hmm_lengths(profile_path)
    return refine_hmmscan_results(raw_results, profile_lengths)
def find_domains(fasta: str, record: Record) -> Dict[str, List[HMMResult]]:
    """ Finds C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
        within the given sequences

        Runs hmmscan with the "--cut_tc" option against the
        "nrpspksdomains.hmm" profiles located via this module's path, refines
        the results in neighbour mode, then passes them through
        filter_nonterminal_docking_domains.

        Arguments:
            fasta: a group of features in fasta format
            record: the Record that contains all the features

        Returns:
            a dictionary mapping feature name to a list of domain results
            for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "nrpspksdomains.hmm")
    raw_results = subprocessing.run_hmmscan(profile_path, fasta, ["--cut_tc"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_results, profile_lengths, neighbour_mode=True)
    return filter_nonterminal_docking_domains(record, refined)
def run_t2pks_hmmscan(cds_features: Iterable[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan for type II PKS proteins on the given CDSFeatures

        The features are converted to fasta and scanned with the "--cut_tc"
        option against the "t2pks.hmm" profiles located via this module's path.

        Arguments:
            cds_features: the CDSFeatures on which to run the type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMResults for that CDS
    """
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    query_fasta = fasta.get_fasta_from_features(cds_features)
    raw_results = subprocessing.run_hmmscan(profile_path, query_fasta, opts=['--cut_tc'])
    profile_lengths = get_hmm_lengths(profile_path)
    return refine_hmmscan_results(raw_results, profile_lengths)
def run_hmmer(record: Record, features: Iterable[CDSFeature], max_evalue: float,
              min_score: float, database: str, tool: str) -> HmmerResults:
    """ Build hmmer results for the given features

        Arguments:
            record: the Record instance to run hmmer over
            features: the CDSFeatures to run over specifically
            max_evalue: a maximum evalue allowed for hits (exclusive)
            min_score: a minimum score allowed for hits (exclusive)
            database: the database to search for hits within
            tool: the name of the specific tool calling into this module

        Returns:
            a HmmerResults instance built from the record's id, the given
            thresholds, database and tool, and the hits that were built

        Raises:
            ValueError: if the given database path does not exist
    """
    database_missing = not os.path.exists(database)
    if database_missing:
        raise ValueError("Given database does not exist: %s" % database)

    query_fasta = fasta.get_fasta_from_features(features)
    raw_results = subprocessing.run_hmmscan(database, query_fasta, opts=["--cut_tc"])
    found_hits = build_hits(record, raw_results, min_score, max_evalue, database)
    return HmmerResults(record.id, max_evalue, min_score, database, tool, found_hits)
def classify_genes(cds_features: List[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Finds possible classifications for the provided genes.

        The features are converted to fasta and scanned with the options
        "-E 1E-6" against the "smcogs.hmm" profiles located via this
        module's path.

        Arguments:
            cds_features: a list of CDSFeatures to classify

        Returns:
            a dictionary mapping CDS name to a list of HMMResult instances
            of classifications
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    profile_path = path.get_full_path(__file__, "data", "smcogs.hmm")
    raw_results = subprocessing.run_hmmscan(profile_path, query_fasta, ["-E", "1E-6"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    return refine_hmmscan_results(raw_results, profile_lengths)