def from_json(data: Dict[str, Any]) -> "CDSResult":
    """ Rebuilds a CDSResult from its JSON representation

        Arguments:
            data: a dictionary with the keys "domain_hmms", "motif_hmms",
                  "modules", "type" and "ks_subtypes"

        Returns:
            a new CDSResult instance
    """
    domains = []
    for raw_domain in data["domain_hmms"]:
        domains.append(HMMResult.from_json(raw_domain))

    motifs = []
    for raw_motif in data["motif_hmms"]:
        motifs.append(HMMResult.from_json(raw_motif))

    rebuilt_modules = []
    for raw_module in data["modules"]:
        rebuilt_modules.append(Module.from_json(raw_module))

    return CDSResult(domains, motifs, data["type"], rebuilt_modules,
                     data["ks_subtypes"])
Example #2
0
    def test_results_reconstruction(self):
        """ Results must look the same before and after a JSON round trip """
        def assert_expected(candidate):
            # shared expectations for both the original and the rebuilt results
            assert candidate.record_id == "rec_id"
            assert candidate.tool == "toolname"
            best = candidate.best_hits
            assert isinstance(best["cds1"], HMMResult)
            assert best["cds1"].hit_id == 'desc1'
            assert best["cds2"].bitscore == 20
            assert candidate.function_mapping["cds2"] == GeneFunction.REGULATORY

        first_hit = HMMResult("desc1", 0, 100, 2.3e-126, 416)
        second_hit = HMMResult("desc2", 5, 60, 3e-16, 20)
        hits = {"cds1": first_hit, "cds2": second_hit}
        mapping = {"cds1": GeneFunction.TRANSPORT,
                   "cds2": GeneFunction.REGULATORY}

        original = self.res_class("rec_id", "toolname", best_hits=hits,
                                  function_mapping=mapping)
        assert_expected(original)

        # the serialised hit starts with the hit id
        serialised = original.to_json()
        assert serialised["best_hits"]["cds1"][0] == hits["cds1"].hit_id

        record = DummyRecord()
        record.id = "rec_id"
        rebuilt = self.res_class.from_json(serialised, record)
        assert_expected(rebuilt)
Example #3
0
    def setUp(self):
        """ Prepares a dummy record and a FunctionResults instance holding
            two hits and their function mapping
        """
        self.res_class = genefunctions.core.FunctionResults

        best_hits = {}
        best_hits["cds1"] = HMMResult("desc1", 0, 100, 2.3e-126, 416)
        best_hits["cds2"] = HMMResult("desc2", 5, 60, 3e-16, 20)

        functions = {}
        functions["cds1"] = GeneFunction.TRANSPORT
        functions["cds2"] = GeneFunction.REGULATORY

        self.record = DummyRecord()
        self.record.id = "rec_id"

        self.results = self.res_class(self.record.id, "toolname",
                                      best_hits=best_hits,
                                      function_mapping=functions)
Example #4
0
def classify(
    record_id: str,
    cds_features: List[CDSFeature],
    options: ConfigType  # an API, so hide unused warning
) -> FunctionResults:  # pylint: disable=unused-argument
    """ Finds possible classifications for the provided CDS features.

        Arguments:
            record_id: the record identifier to store in the returned results
            cds_features: a list of CDSFeatures to classify
            options: the run configuration, unused here but kept as part of
                     the API

        Returns:
            a FunctionResults instance for the "smcogs" tool, holding the hit
            and the mapped function for each CDS name that had a hit
    """

    hmm_file = path.get_full_path(__file__, "data", "smcogs.hmm")
    hits = scan_for_functions(cds_features,
                              hmm_file,
                              hmmscan_opts=["-E", "1E-16"])
    ids_to_function = build_function_mapping()
    # built separately so the scan results are not rewritten while iterating
    best_hits = {}
    cds_name_to_function = {}
    for cds_name, result in hits.items():
        # only the part of the hit id before the first colon is the SMCOG id
        smcog_id = result.hit_id.split(":", 1)[0]
        cds_name_to_function[cds_name] = ids_to_function[smcog_id]
        # store the hit with underscores in the id swapped for spaces
        best_hits[cds_name] = HMMResult(result.hit_id.replace("_", " "),
                                        result.query_start, result.query_end,
                                        result.evalue, result.bitscore)
    return FunctionResults(record_id, "smcogs", best_hits, cds_name_to_function)
Example #5
0
 def test_biopython_compatibility(self):
     """ Iterating over a qualifier must only ever yield strings """
     qualifier = NRPSPKSQualifier(strand=1)
     for name in ("PKS_AT", "AMP-binding"):
         hit = HMMResult(name, 1, 1, 1, 1)
         qualifier.add_domain(hit, "missing")
         qualifier.add_subtype(name + "dummy")
     # two domains plus two subtypes
     assert len(qualifier) == 4
     assert all(isinstance(entry, str) for entry in qualifier)
Example #6
0
 def test_counter(self):
     """ Each domain type must get its own running counter in the label """
     qualifier = NRPSPKSQualifier(strand=1)
     domain_types = (
         ("PKS_AT", "_AT"),
         ("PKS_KR", "_KR"),
         ("CAL_domain", "_CAL"),
         ("AMP-binding", "_A"),
         ("PKS_KS", "_KS"),
         ("ACP", "_OTHER"),
     )
     repeats = 3
     expected_labels = set()
     for hit_id, label_prefix in domain_types:
         hit = HMMResult(hit_id, 1, 1, 1, 1)
         # counters start at 1 for every type
         for number in range(1, repeats + 1):
             qualifier.add_domain(hit, "missing")
             expected_labels.add(label_prefix + str(number))
     assert len(qualifier.domains) == repeats * len(domain_types)
     found_labels = {domain.label for domain in qualifier.domains}
     assert found_labels == expected_labels
Example #7
0
 def test_classification_with_colon(self):
     """ Descriptions containing a colon must survive being added to a record """
     # the SMCOG id and its description share one string, separated by ':',
     # so a description with its own ':' must not be cut short
     cds = helpers.DummyCDS(locus_tag="test")
     record = helpers.DummyRecord(features=[cds], seq="A" * 100)
     record.add_cluster(helpers.DummyCluster(0, 100))

     results = SMCOGResults(record.id)
     hit = HMMResult("SMCOG1212:sodium:dicarboxylate_symporter",
                     0, 100, 2.3e-126, 416)
     results.best_hits[cds.get_name()] = hit
     results.add_to_record(record)

     gene_functions = cds.gene_functions.get_by_tool("smcogs")
     assert len(gene_functions) == 1
     expected_start = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                       " (Score: 416; E-value: 2.3e-126)")
     description = str(gene_functions[0])
     assert description.startswith(expected_start)
Example #8
0
    def from_json(json: Dict[str, Any], record: Record) -> Optional["FunctionResults"]:
        """ Rebuilds a FunctionResults instance from its JSON representation

            Arguments:
                json: the JSON representation to rebuild from
                record: the record the results are expected to belong to

            Returns:
                a FunctionResults instance, or None if the schema version or
                the record id of the JSON does not match
        """
        schema_matches = json.get("schema_version") == FunctionResults.schema_version
        if not schema_matches:
            logging.debug("Schema version mismatch, discarding FunctionResults for tool: %s",
                          json.get("tool", "unknown"))
            return None

        record_matches = record.id == json.get("record_id")
        if not record_matches:
            logging.debug("Record ID mismatch, discarding FunctionResults for tool: %s",
                          json.get("tool", "unknown"))
            return None

        # each stored hit is the list of positional arguments of HMMResult
        hits = {name: HMMResult(*parts)
                for name, parts in json["best_hits"].items()}
        mapping = {cds_name: GeneFunction.from_string(simple_function)
                   for cds_name, simple_function in json["mapping"].items()}

        return FunctionResults(json["record_id"], json["tool"], hits, mapping)
Example #9
0
    def test_biopython_conversion(self):
        """ A qualifier must be rebuildable from its own string form, and
            must reject strings it cannot interpret
        """
        original = NRPSPKSQualifier(strand=1)
        for name in ("PKS_AT", "AMP-binding"):
            original.add_domain(HMMResult(name, 1, 1, 1, 1), "missing")
            original.add_subtype(name + "dummy")
        original.type = "some type"

        as_strings = list(original)
        assert all(isinstance(value, str) for value in as_strings)

        rebuilt = NRPSPKSQualifier(strand=1)
        rebuilt.add_from_qualifier(as_strings)
        assert list(original) == list(rebuilt)

        error_pattern = "unknown NRPS/PKS qualifier|could not match"
        for bad in (["mismatching info"], ["Domain: missing info"]):
            with self.assertRaisesRegex(ValueError, error_pattern):
                rebuilt.add_from_qualifier(bad)
Example #10
0
def run_starter_unit_blastp(
    cds_hmm_hits: Dict[CDSFeature,
                       List[HMMResult]]) -> Dict[str, List[HMMResult]]:
    """ Runs blastp for every CDS that has a starter unit hit

        Only hits with the ids KSIII, AT, AMID or LIG trigger a blastp run,
        using the database of the same name in the data directory.

        Arguments:
            cds_hmm_hits: HMMResults by cds from type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMResults, with every
            hit id ending in "-CoA"; empty if blastp found nothing
    """
    starter_ids = ('KSIII', 'AT', 'AMID', 'LIG')
    coa_suffix = "-CoA"

    raw_hits = []
    reference_fastas = set()
    for cds, hmm_hits in cds_hmm_hits.items():
        query_sequence = fasta.get_fasta_from_features([cds])
        for hit in hmm_hits:
            if hit.hit_id in starter_ids:
                database = path.get_full_path(__file__, 'data', hit.hit_id)
                raw_hits.extend(
                    subprocessing.run_blastp(database, query_sequence))
                # remember the matching fasta, its lengths are needed later
                reference_fastas.add(
                    path.get_full_path(__file__, 'data', hit.hit_id + '.fasta'))

    if not raw_hits:
        return {}

    fasta_lengths = {}
    for fasta_file in reference_fastas:
        fasta_lengths.update(get_fasta_lengths(fasta_file))

    results = refine_hmmscan_results(raw_hits, fasta_lengths)
    for cds_hits in results.values():
        for index, hit in enumerate(cds_hits):
            if hit.hit_id.endswith(coa_suffix):
                continue
            # swap in a copy of the hit with the suffix added to its id
            cds_hits[index] = HMMResult(hit.hit_id + coa_suffix,
                                        hit.query_start, hit.query_end,
                                        hit.evalue, hit.bitscore)
    return results
Example #11
0
 def from_json(cls, data: Dict[str, Any]) -> "Component":
     """ Builds a component from its JSON representation, a missing
         "subtype" is treated as an empty string
     """
     subtype = data.get("subtype", "")
     assert isinstance(subtype, str), subtype
     domain = HMMResult.from_json(data["domain"])
     locus = data["locus"]
     return cls(domain, locus, subtype)
def dummy_hmm(hit_id="dummy", start=1):
    """ Builds a placeholder HMMResult that ends 40 positions after its
        start, with fixed values 1e-5 and 10 for the last two arguments
    """
    end = start + 40
    return HMMResult(hit_id, start, end, 1e-5, 10)
Example #13
0
 def from_json(data) -> "CDSResult":
     """ Rebuilds a CDSResult from the "domain_hmms", "motif_hmms" and
         "type" entries of its JSON representation
     """
     domains = []
     for raw_domain in data["domain_hmms"]:
         domains.append(HMMResult.from_json(raw_domain))

     motifs = []
     for raw_motif in data["motif_hmms"]:
         motifs.append(HMMResult.from_json(raw_motif))

     return CDSResult(domains, motifs, data["type"])
Example #14
0
 def setUp(self):
     """ Loads the test record, adds a T2PKS cluster spanning all of it and
         mocks the hmmscan and blastp steps with canned results
     """
     test_file = path.get_full_path(__file__, 'data',
                                    'NC_003888.3.cluster011.gbk')
     self.record = record_processing.parse_input_sequence(test_file)[0]

     # two separate location objects, both covering the full record
     core_location = FeatureLocation(0, len(self.record.seq))
     surrounding = FeatureLocation(0, len(self.record.seq))
     self.cluster = Cluster(core_location,
                            surrounding_location=surrounding,
                            cutoff=20,
                            neighbourhood_range=0,
                            tool="test",
                            product="T2PKS",
                            detection_rule="dummy rule")
     self.record.add_cluster(self.cluster)
     self.record.create_superclusters()
     self.record.create_regions()

     # (cds name, hit id, start, end, evalue, bitscore)
     hit_table = [
         ('SCO5072', "KR", 1, 265, 3.1e-49, 159.4),
         ('SCO5079', "DIMER", 4, 293, 8.7e-131, 426.8),
         ('SCO5080', "OXY", 8, 377, 2.1e-14, 44.7),
         ('SCO5086', "KR_C9", 0, 261, 1.9e-134, 438.4),
         ('SCO5087', "KS", 44, 463, 3.5e-234, 768.6),
         ('SCO5088', "CLF_7", 1, 401, 1.2e-226, 743.5),
         ('SCO5089', "ACP", 4, 86, 5e-36, 114.2),
         ('SCO5090', "CYC_C7-C12", 1, 312, 7.8e-124, 404),
         ('SCO5091', "CYC_C5-C14", 3, 297, 4.4e-143, 467.3),
         ('SCO5094', "MET", 40, 155, 9.8e-11, 32.7),
         ('SCO5097', "KR", 3, 247, 3.3e-40, 129.8),
     ]
     hmm_results = {
         name: [HMMResult(hit_id, start, end, evalue=evalue, bitscore=bitscore)]
         for name, hit_id, start, end, evalue, bitscore in hit_table
     }
     mock("t2pks_analysis.run_t2pks_hmmscan", returns=hmm_results)
     mock("t2pks_analysis.run_starter_unit_blastp", returns={})
Example #15
0
 def hmm_results(self):
     """ Returns canned hits as a dictionary mapping names such as 'SCO5072'
         to a list holding a single HMMResult
     """
     # (name, hit id, start, end, evalue, bitscore)
     rows = (
         ('SCO5072', "KR", 1, 265, 3.1e-49, 159.4),
         ('SCO5079', "DIMER", 4, 293, 8.7e-131, 426.8),
         ('SCO5080', "OXY", 8, 377, 2.1e-14, 44.7),
         ('SCO5086', "KR_C9", 0, 261, 1.9e-134, 438.4),
         ('SCO5087', "KS", 44, 463, 3.5e-234, 768.6),
         ('SCO5088', "CLF_7", 1, 401, 1.2e-226, 743.5),
         ('SCO5089', "ACP", 4, 86, 5e-36, 114.2),
         ('SCO5090', "CYC_C7-C12", 1, 312, 7.8e-124, 404),
         ('SCO5091', "CYC_C5-C14", 3, 297, 4.4e-143, 467.3),
         ('SCO5094', "MET", 40, 155, 9.8e-11, 32.7),
         ('SCO5097', "KR", 3, 247, 3.3e-40, 129.8),
     )
     by_name = {}
     for name, hit_id, start, end, evalue, bitscore in rows:
         by_name[name] = [HMMResult(hit_id, start, end,
                                    evalue=evalue, bitscore=bitscore)]
     return by_name