def from_json(data: Dict[str, Any]) -> "CDSResult":
    """ Rebuild a CDSResult from its JSON representation.

        Arguments:
            data: a dictionary with the keys "domain_hmms", "motif_hmms",
                  "type", "modules" and "ks_subtypes"

        Returns:
            a CDSResult constructed from the converted values
    """
    # each serialised hit/module is converted back via its own from_json
    domains = list(map(HMMResult.from_json, data["domain_hmms"]))
    motifs = list(map(HMMResult.from_json, data["motif_hmms"]))
    rebuilt_modules = list(map(Module.from_json, data["modules"]))
    return CDSResult(
        domains,
        motifs,
        data["type"],
        rebuilt_modules,
        data["ks_subtypes"],
    )
def test_results_reconstruction(self):
    """ Results rebuilt from their JSON form must pass the same checks
        as the results they were generated from.
    """
    def verify(candidate):
        # the same expectations apply before and after the round trip
        assert candidate.record_id == "rec_id"
        assert candidate.tool == "toolname"
        first_hit = candidate.best_hits["cds1"]
        assert isinstance(first_hit, HMMResult)
        assert first_hit.hit_id == 'desc1'
        assert candidate.best_hits["cds2"].bitscore == 20
        assert candidate.function_mapping["cds2"] == GeneFunction.REGULATORY

    best_hits = {
        "cds1": HMMResult("desc1", 0, 100, 2.3e-126, 416),
        "cds2": HMMResult("desc2", 5, 60, 3e-16, 20),
    }
    functions = {
        "cds1": GeneFunction.TRANSPORT,
        "cds2": GeneFunction.REGULATORY,
    }
    original = self.res_class("rec_id", "toolname", best_hits=best_hits,
                              function_mapping=functions)
    verify(original)

    as_json = original.to_json()
    # the hit id is the first element of each serialised hit
    assert as_json["best_hits"]["cds1"][0] == best_hits["cds1"].hit_id

    dummy_record = DummyRecord()
    dummy_record.id = "rec_id"
    rebuilt = self.res_class.from_json(as_json, dummy_record)
    verify(rebuilt)
def setUp(self):
    """ Prepare a dummy record and a FunctionResults instance with two
        hits for use by the tests.
    """
    self.res_class = genefunctions.core.FunctionResults

    best_hits = {
        "cds1": HMMResult("desc1", 0, 100, 2.3e-126, 416),
        "cds2": HMMResult("desc2", 5, 60, 3e-16, 20),
    }
    functions = {
        "cds1": GeneFunction.TRANSPORT,
        "cds2": GeneFunction.REGULATORY,
    }

    self.record = DummyRecord()
    self.record.id = "rec_id"
    self.results = self.res_class(
        self.record.id,
        "toolname",
        best_hits=best_hits,
        function_mapping=functions,
    )
def classify(record_id: str, cds_features: List[CDSFeature],  # an API, so hide unused warning
             options: ConfigType) -> FunctionResults:  # pylint: disable=unused-argument
    """ Finds smCOG classifications for the provided CDS features.

        Arguments:
            record_id: the record identifier to store in the results
            cds_features: a list of CDSFeatures to classify
            options: not used by this function

        Returns:
            a FunctionResults instance for the tool "smcogs", holding the hit
            found for each CDS name and the function mapped to each CDS name
    """
    database = path.get_full_path(__file__, "data", "smcogs.hmm")
    hits = scan_for_functions(cds_features, database,
                              hmmscan_opts=["-E", "1E-16"])
    function_by_id = build_function_mapping()

    function_by_cds = {}
    # only values of existing keys are swapped, so looping over the
    # live view of the dictionary is safe
    for name, hit in hits.items():
        # the identifier is everything before the first colon of the hit id
        smcog_id = hit.hit_id.partition(":")[0]
        function_by_cds[name] = function_by_id[smcog_id]
        # store a copy of the hit with underscores replaced by spaces
        readable_id = hit.hit_id.replace("_", " ")
        hits[name] = HMMResult(readable_id, hit.query_start, hit.query_end,
                               hit.evalue, hit.bitscore)

    return FunctionResults(record_id, "smcogs", hits, function_by_cds)
def test_biopython_compatibility(self):
    """ The qualifier must have a length and iterate as strings. """
    qualifier = NRPSPKSQualifier(strand=1)
    for name in ("PKS_AT", "AMP-binding"):
        hit = HMMResult(name, 1, 1, 1, 1)
        qualifier.add_domain(hit, "missing")
        qualifier.add_subtype(name + "dummy")

    # two domains and two subtypes were added
    assert len(qualifier) == 4
    assert all(isinstance(entry, str) for entry in qualifier)
def test_counter(self):
    """ Each added domain is expected to be labelled with the suffix for
        its type followed by a running count for that type.
    """
    qualifier = NRPSPKSQualifier(strand=1)
    suffix_by_type = [
        ("PKS_AT", "_AT"),
        ("PKS_KR", "_KR"),
        ("CAL_domain", "_CAL"),
        ("AMP-binding", "_A"),
        ("PKS_KS", "_KS"),
        ("ACP", "_OTHER"),
    ]
    repeats = 3

    expected_labels = set()
    for pks_type, suffix in suffix_by_type:
        hit = HMMResult(pks_type, 1, 1, 1, 1)
        template = suffix + "%d"
        # counts start from 1
        for count in range(1, repeats + 1):
            qualifier.add_domain(hit, "missing")
            expected_labels.add(template % count)

    assert len(qualifier.domains) == repeats * len(suffix_by_type)
    found_labels = {domain.label for domain in qualifier.domains}
    assert found_labels == expected_labels
def test_classification_with_colon(self):
    """ Descriptions containing colons must be kept whole. """
    # since SMCOG id and description are stored in a string separated by :,
    # ensure that descriptions containing : are properly handled
    cds = helpers.DummyCDS(locus_tag="test")
    record = helpers.DummyRecord(features=[cds], seq="A" * 100)
    record.add_cluster(helpers.DummyCluster(0, 100))

    results = SMCOGResults(record.id)
    hit = HMMResult("SMCOG1212:sodium:dicarboxylate_symporter",
                    0, 100, 2.3e-126, 416)
    results.best_hits[cds.get_name()] = hit
    results.add_to_record(record)

    smcog_functions = cds.gene_functions.get_by_tool("smcogs")
    assert len(smcog_functions) == 1
    expected_prefix = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                       " (Score: 416; E-value: 2.3e-126)")
    assert str(smcog_functions[0]).startswith(expected_prefix)
def from_json(json: Dict[str, Any], record: Record) -> Optional["FunctionResults"]:
    """ Rebuild a FunctionResults instance from its JSON representation.

        Arguments:
            json: the JSON representation as a dictionary
            record: the record the results are expected to belong to

        Returns:
            a FunctionResults instance, or None if the schema version or
            the record id in the JSON does not match
    """
    # work out whether the stored results can be used at all
    problem = None
    if json.get("schema_version") != FunctionResults.schema_version:
        problem = "Schema version mismatch, discarding FunctionResults for tool: %s"
    elif record.id != json.get("record_id"):
        problem = "Record ID mismatch, discarding FunctionResults for tool: %s"
    if problem is not None:
        logging.debug(problem, json.get("tool", "unknown"))
        return None

    # each stored hit is the list of positional arguments of an HMMResult
    best_hits = {
        cds_name: HMMResult(*fields)
        for cds_name, fields in json["best_hits"].items()
    }
    functions = {
        cds_name: GeneFunction.from_string(text)
        for cds_name, text in json["mapping"].items()
    }
    return FunctionResults(json["record_id"], json["tool"], best_hits, functions)
def test_biopython_conversion(self):
    """ A qualifier converted to a list of strings must be rebuildable
        from that list, and unusable strings must be rejected.
    """
    original = NRPSPKSQualifier(strand=1)
    for name in ("PKS_AT", "AMP-binding"):
        hit = HMMResult(name, 1, 1, 1, 1)
        original.add_domain(hit, "missing")
        original.add_subtype(name + "dummy")
    original.type = "some type"

    as_strings = list(original)
    assert all(isinstance(entry, str) for entry in as_strings)

    rebuilt = NRPSPKSQualifier(strand=1)
    rebuilt.add_from_qualifier(as_strings)
    assert list(original) == list(rebuilt)

    expected_error = "unknown NRPS/PKS qualifier|could not match"
    unusable_inputs = (["mismatching info"], ["Domain: missing info"])
    for unusable in unusable_inputs:
        with self.assertRaisesRegex(ValueError, expected_error):
            rebuilt.add_from_qualifier(unusable)
def run_starter_unit_blastp(cds_hmm_hits: Dict[CDSFeature, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp for every CDS with a starter unit related HMM hit

        Arguments:
            cds_hmm_hits: HMMResults by cds from type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMResults, each with
            a hit id ending in "-CoA", or an empty dictionary if blastp
            produced no results
    """
    # only hits with these ids have a blast database to search
    starter_ids = ('KSIII', 'AT', 'AMID', 'LIG')

    blast_hits = []
    reference_fastas = set()
    for cds, hmm_hits in cds_hmm_hits.items():
        query = fasta.get_fasta_from_features([cds])
        for hmm_hit in hmm_hits:
            if hmm_hit.hit_id in starter_ids:
                database = path.get_full_path(__file__, 'data', hmm_hit.hit_id)
                blast_hits.extend(subprocessing.run_blastp(database, query))
                reference = path.get_full_path(__file__, 'data',
                                               hmm_hit.hit_id + '.fasta')
                reference_fastas.add(reference)

    if not blast_hits:
        return {}

    # gather the sequence lengths from every reference file that was used
    lengths = {}
    for reference in reference_fastas:
        lengths.update(get_fasta_lengths(reference))

    refined = refine_hmmscan_results(blast_hits, lengths)
    for hits in refined.values():
        # replace, in place, any hit lacking the suffix with a renamed copy
        hits[:] = [
            hit if hit.hit_id.endswith("-CoA")
            else HMMResult(hit.hit_id + "-CoA", hit.query_start,
                           hit.query_end, hit.evalue, hit.bitscore)
            for hit in hits
        ]
    return refined
def from_json(cls, data: Dict[str, Any]) -> "Component":
    """ Build a component from its JSON representation.

        Arguments:
            data: a dictionary with the keys "domain" and "locus" and,
                  optionally, "subtype" (an empty string is used if absent)

        Returns:
            a new instance of the class
    """
    subtype = data.get("subtype", "")
    assert isinstance(subtype, str), subtype
    domain = HMMResult.from_json(data["domain"])
    locus = data["locus"]
    return cls(domain, locus, subtype)
def dummy_hmm(hit_id="dummy", start=1):
    """ Create an HMMResult for testing, ending 40 past the given start,
        with an evalue of 1e-5 and a bitscore of 10.
    """
    end = start + 40
    return HMMResult(hit_id, start, end, 1e-5, 10)
def from_json(data) -> "CDSResult":
    """ Rebuild a CDSResult from its JSON representation.

        Arguments:
            data: a dictionary with the keys "domain_hmms", "motif_hmms"
                  and "type"

        Returns:
            a CDSResult constructed from the converted values
    """
    # each serialised hit is converted back via HMMResult.from_json
    domains = list(map(HMMResult.from_json, data["domain_hmms"]))
    motifs = list(map(HMMResult.from_json, data["motif_hmms"]))
    return CDSResult(domains, motifs, data["type"])
def setUp(self):
    """ Load the test record, cover it with a single T2PKS cluster and
        mock out the hmmscan and blastp steps of the t2pks analysis.
    """
    test_file = path.get_full_path(__file__, 'data', 'NC_003888.3.cluster011.gbk')
    self.record = record_processing.parse_input_sequence(test_file)[0]

    # the cluster and its surroundings both cover the whole record
    record_length = len(self.record.seq)
    self.cluster = Cluster(
        FeatureLocation(0, record_length),
        surrounding_location=FeatureLocation(0, record_length),
        cutoff=20,
        neighbourhood_range=0,
        tool="test",
        product="T2PKS",
        detection_rule="dummy rule",
    )
    self.record.add_cluster(self.cluster)
    self.record.create_superclusters()
    self.record.create_regions()

    # (locus, hit id, start, end, evalue, bitscore), one hit per locus
    hit_table = (
        ('SCO5072', "KR", 1, 265, 3.1e-49, 159.4),
        ('SCO5079', "DIMER", 4, 293, 8.7e-131, 426.8),
        ('SCO5080', "OXY", 8, 377, 2.1e-14, 44.7),
        ('SCO5086', "KR_C9", 0, 261, 1.9e-134, 438.4),
        ('SCO5087', "KS", 44, 463, 3.5e-234, 768.6),
        ('SCO5088', "CLF_7", 1, 401, 1.2e-226, 743.5),
        ('SCO5089', "ACP", 4, 86, 5e-36, 114.2),
        ('SCO5090', "CYC_C7-C12", 1, 312, 7.8e-124, 404),
        ('SCO5091', "CYC_C5-C14", 3, 297, 4.4e-143, 467.3),
        ('SCO5094', "MET", 40, 155, 9.8e-11, 32.7),
        ('SCO5097', "KR", 3, 247, 3.3e-40, 129.8),
    )
    hmm_results = {
        locus: [HMMResult(hit_id, start, end, evalue=evalue, bitscore=bitscore)]
        for locus, hit_id, start, end, evalue, bitscore in hit_table
    }

    mock("t2pks_analysis.run_t2pks_hmmscan", returns=hmm_results)
    mock("t2pks_analysis.run_starter_unit_blastp", returns={})
def hmm_results(self):
    """ Build a dictionary mapping locus names to a list holding a single
        HMMResult each.
    """
    # (locus, hit id, start, end, evalue, bitscore), one hit per locus
    hit_table = (
        ('SCO5072', "KR", 1, 265, 3.1e-49, 159.4),
        ('SCO5079', "DIMER", 4, 293, 8.7e-131, 426.8),
        ('SCO5080', "OXY", 8, 377, 2.1e-14, 44.7),
        ('SCO5086', "KR_C9", 0, 261, 1.9e-134, 438.4),
        ('SCO5087', "KS", 44, 463, 3.5e-234, 768.6),
        ('SCO5088', "CLF_7", 1, 401, 1.2e-226, 743.5),
        ('SCO5089', "ACP", 4, 86, 5e-36, 114.2),
        ('SCO5090', "CYC_C7-C12", 1, 312, 7.8e-124, 404),
        ('SCO5091', "CYC_C5-C14", 3, 297, 4.4e-143, 467.3),
        ('SCO5094', "MET", 40, 155, 9.8e-11, 32.7),
        ('SCO5097', "KR", 3, 247, 3.3e-40, 129.8),
    )
    return {
        locus: [HMMResult(hit_id, start, end, evalue=evalue, bitscore=bitscore)]
        for locus, hit_id, start, end, evalue, bitscore in hit_table
    }