def scan_for_functions(cds_features: List[CDSFeature], database: str,
                       hmmscan_opts: Optional[List[str]] = None) -> Dict[str, HMMResult]:
    """ Runs hmmscan for the provided genes against a single database and
        keeps one hit per gene.

        For each CDS only the first entry of its refined hit list is kept
        (presumably the best-ranked hit; the ordering is whatever
        refine_hmmscan_results produces). A CDS without any hits does not
        appear in the result at all.

        Arguments:
            cds_features: a list of CDSFeatures to classify
            database: the path to the database to check
            hmmscan_opts: a list of extra options to provide to hmmscan

        Returns:
            a dictionary mapping CDS name to the first HMMResult found for
            that CDS
    """
    raw_hits = subprocessing.run_hmmscan(
        database,
        fasta.get_fasta_from_features(cds_features),
        hmmscan_opts,
    )
    refined = refine_hmmscan_results(raw_hits, utils.get_hmm_lengths(database))

    # names are taken in feature order, so the result keeps that order too
    names = (cds.get_name() for cds in cds_features)
    # a missing name and an empty hit list are both skipped
    return {name: refined[name][0] for name in names if refined.get(name)}
def find_ks_domains(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Scans the given sequences against the KS domain profiles in
        "data/ksdomains.hmm" (path resolved from this file's location).

        The hits feed into detection of NRPS/PKS types.

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of KS domain results
            for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "ksdomains.hmm")
    # profile lengths are fetched before the scan itself is run
    profile_lengths = utils.get_hmm_lengths(profile_path)
    # --cut_tc: have hmmscan use the trusted cutoffs stored in the profiles
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["--cut_tc"])
    return refine_hmmscan_results(raw_hits, profile_lengths, neighbour_mode=True)
def find_ab_motifs(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Scans the given sequences against the abMotif profiles in
        "data/abmotifs.hmm" (path resolved from this file's location).

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of motif results
            for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "abmotifs.hmm")
    # -E 0.25: hmmscan's E-value reporting threshold
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["-E", "0.25"])
    return refine_hmmscan_results(
        raw_hits,
        utils.get_hmm_lengths(profile_path),
        neighbour_mode=True,
    )
def run_t2pks_hmmscan(cluster: Cluster) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan with the type II PKS profiles ("data/t2pks.hmm") over
        the coding sequences held in the cluster's cds_children.

        Arguments:
            cluster: the Cluster whose CDS children are to be scanned

        Returns:
            a dictionary mapping CDS name to a list of HMMResults for that CDS
    """
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    # --cut_tc: have hmmscan use the trusted cutoffs stored in the profiles
    raw_hits = subprocessing.run_hmmscan(
        profile_path,
        fasta.get_fasta_from_features(cluster.cds_children),
        opts=['--cut_tc'],
    )
    return refine_hmmscan_results(raw_hits, get_hmm_lengths(profile_path))
def find_domains(fasta: str, record: Record) -> Dict[str, List[HMMResult]]:
    """ Scans the given sequences for NRPS/PKS domains
        (C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL) using the
        profiles in "data/nrpspksdomains.hmm".

        The refined hits are passed through
        filter_nonterminal_docking_domains before being returned.

        Arguments:
            fasta: a group of features in fasta format
            record: the Record that contains all the features

        Returns:
            a dictionary mapping feature name to a list of domain results
            for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "nrpspksdomains.hmm")
    # --cut_tc: have hmmscan use the trusted cutoffs stored in the profiles
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["--cut_tc"])
    refined = refine_hmmscan_results(
        raw_hits,
        utils.get_hmm_lengths(profile_path),
        neighbour_mode=True,
    )
    return filter_nonterminal_docking_domains(record, refined)
def run_t2pks_hmmscan(cds_features: Iterable[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan with the type II PKS profiles ("data/t2pks.hmm") over
        the given CDSFeatures.

        Arguments:
            cds_features: the CDSFeatures on which to run the type II PKS
                          hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMResults for that CDS
    """
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    # --cut_tc: have hmmscan use the trusted cutoffs stored in the profiles
    raw_hits = subprocessing.run_hmmscan(
        profile_path,
        fasta.get_fasta_from_features(cds_features),
        opts=['--cut_tc'],
    )
    return refine_hmmscan_results(raw_hits, get_hmm_lengths(profile_path))
def classify_genes(cds_features: List[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Finds possible smCOG classifications for the provided genes by
        scanning them against "data/smcogs.hmm".

        Arguments:
            cds_features: a list of CDSFeatures to classify

        Returns:
            a dictionary mapping CDS name to a list of HMMResult instances
            of classifications
    """
    profile_path = path.get_full_path(__file__, "data", "smcogs.hmm")
    # -E 1E-6: hmmscan's E-value reporting threshold
    raw_hits = subprocessing.run_hmmscan(
        profile_path,
        fasta.get_fasta_from_features(cds_features),
        ["-E", "1E-6"],
    )
    return refine_hmmscan_results(raw_hits, utils.get_hmm_lengths(profile_path))