Exemple #1
0
 def setUp(self):
     """Create a cluster with a core location of 30-50 and three CDS fixtures.

     The fixtures sit at 40-45 (numerically within the core location),
     20-25 and 120-125 (both numerically outside it).
     """
     cluster = create_cluster()
     cluster.core_location = FeatureLocation(30, 50)
     self.cluster = cluster

     coordinates = ((40, 45), (20, 25), (120, 125))
     self.inside_cds, self.neighbour_cds, self.outside_cds = (
         DummyCDS(start, end) for start, end in coordinates)

     # a freshly built cluster must not own any CDS features yet
     assert not cluster.cds_children and not cluster.definition_cdses
    def test_add_cds(self):
        """A CDS spanning the collection is accepted; one at 120-140 is rejected."""
        location = FeatureLocation(20, 40)
        collection = CDSCollection(location, feature_type="test",
                                   child_collections=[])

        # a CDS with the same coordinates as the collection is added
        contained = DummyCDS(20, 40)
        collection.add_cds(contained)
        assert contained in collection.cds_children

        # a CDS entirely beyond the collection's location raises
        outsider = DummyCDS(120, 140)
        with self.assertRaisesRegex(ValueError, "not contained by"):
            collection.add_cds(outsider)
 def setUp(self):
     """Prepare record/length fixtures and stub out the CDS lookups."""
     self.geneclustergenes = {"CAG25752": ""}
     self.seq_record = Record("dummy")
     self.seqlengths = {"CAG25751.1": 253}

     # Both stubs are used by parse_subject, but only if the locus tag is
     # not in seqlengths.
     # NOTE(review): `mock` looks like minimock's helper (name + `returns`);
     # confirm against the file's imports.
     stubs = (
         ('core.get_cds_lengths', self.seqlengths),
         ('Record.get_cds_by_name', DummyCDS(1, 301)),
     )
     for target, result in stubs:
         mock(target, returns=result)
Exemple #4
0
    def test_add_cds_propagation(self):
        """Adding a CDS to the region must reach every nested collection."""
        cds = DummyCDS(0, 10)
        assert cds.is_contained_by(self.region)

        collections = (self.cluster, self.super, self.sub, self.region)

        # nothing owns a CDS before the add, and the CDS has no region
        for collection in collections:
            assert not collection.cds_children
        assert not cds.region

        self.region.add_cds(cds)

        # every collection now holds exactly this CDS
        expected = (cds, )
        for collection in collections:
            assert collection.cds_children == expected
        assert cds.region is self.region
Exemple #5
0
def create_cds(start, end, products):
    """Build a DummyCDS carrying one CORE gene function per product.

    The locus tag is "<start>-<end>-<products joined by '-'>".

    Arguments:
        start: start coordinate passed to DummyCDS
        end: end coordinate passed to DummyCDS
        products: iterable of product names

    Returns:
        the constructed DummyCDS
    """
    joined_products = "-".join(products)
    locus_tag = "%s-%s-%s" % (start, end, joined_products)
    result = DummyCDS(start, end, locus_tag=locus_tag)
    add_function = result.gene_functions.add
    for product in products:
        add_function(GeneFunction.CORE, "test", "dummy", product)
    return result
 def parse_subject_wrapper(self, subject_line):
     """Run core.parse_subject on one subject line with CDS lookups patched.

     Arguments:
         subject_line: the subject line handed to core.parse_subject

     Returns:
         whatever core.parse_subject returns for that line
     """
     # used by core.parse_subject, but only if locus tag not in self.seqlengths
     # `return_value` is the unittest.mock keyword; any other keyword (such
     # as `returns`) merely sets an attribute on the mock, leaving the call
     # result an unconfigured MagicMock
     with patch.object(core, 'get_cds_lengths',
                       return_value=self.seqlengths):
         with patch.object(Record,
                           'get_cds_by_name',
                           return_value=DummyCDS(1, 301)):
             return core.parse_subject(subject_line, self.seqlengths,
                                       self.seq_record)
 def parse_subject_wrapper(self, subject_line):
     """Parse one subject line against a dummy record with no known lengths.

     Arguments:
         subject_line: the subject line handed to core.parse_subject

     Returns:
         whatever core.parse_subject returns for that line
     """
     record = Record("dummy")
     known_lengths = {}
     # both patched callables are used by core.parse_subject, but only if
     # the locus tag is not in the supplied lengths
     lengths_patch = patch.object(core, 'get_cds_lengths', return_value={})
     lookup_patch = patch.object(Record, 'get_cds_by_name',
                                 return_value=DummyCDS(1, 101))
     with lengths_patch, lookup_patch:
         return core.parse_subject(subject_line, known_lengths, record)
    def build(self, early, late, strand=1, tail_strand_multiplier=1):
        """Combine two modules held on separate CDSes and check the bookkeeping.

        `early` is placed on a CDS at 50-110 and `late` on a CDS at 500-560.
        On strand -1 the 500-560 CDS is treated as "first" (holding the
        head) and the 50-110 CDS as "second" (holding the tail); otherwise
        the roles are swapped.

        Arguments:
            early: module placed on the 50-110 CDS
            late: module placed on the 500-560 CDS
            strand: strand of the 50-110 CDS
            tail_strand_multiplier: multiplied with `strand` to give the
                                    strand of the 500-560 CDS

        Returns:
            the result of combine_modules (falsy if nothing was combined)
        """
        upstream = CDSModuleInfo(DummyCDS(start=50, end=110, strand=strand),
                                 [early])
        downstream = CDSModuleInfo(
            DummyCDS(start=500, end=560,
                     strand=strand * tail_strand_multiplier),
            [late])

        # head/tail and first/second depend on strand
        if strand == -1:
            head, tail = late, early
            first, second = downstream, upstream
        else:
            head, tail = early, late
            first, second = upstream, downstream

        # snapshot the module lists before any merge
        first_modules = list(first.modules)
        second_modules = list(second.modules)

        # in either orientation the 50-110 CDS is the first argument
        module = combine_modules(upstream, downstream)

        if not module:
            # nothing should be changed
            assert first_modules == first.modules
            assert second_modules == second.modules
            return module

        # head is replaced
        assert head not in first.modules
        assert len(first_modules) == len(first.modules), (first, second)
        assert module in first.modules
        # tail removed
        assert tail not in second.modules
        # and not replaced
        assert len(second_modules) - 1 == len(second.modules)
        return module
 def test_single_file(self):
     """With no partitioning, every gene goes into one fasta file.

     Three CDS features are added per region, so the file is expected to
     hold len(self.regions) * 3 entries named L0/S0, L1/S1, ...
     """
     self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
     # NOTE(review): change=True presumably switches the working directory
     # into the temporary one, which is why relative paths are used - confirm
     with TemporaryDirectory(change=True):
         files = core.write_fastas_with_all_genes(self.regions,
                                                  "test.fasta")
         assert files == ["test.fasta"]
         assert os.path.exists("test.fasta")
         expected = "".join(">L{0}\nS{0}\n".format(i)
                            for i in range(len(self.regions) * 3))
         # close the handle explicitly rather than leaking it
         with open("test.fasta") as handle:
             contents = handle.read()
         assert contents == expected
Exemple #10
0
def add_module_references_to_record(module, record):
    """Register a module's domains, and any missing parent CDSes, in a record.

    Each domain is added to the record. If the record has no CDS named
    after the domain's locus tag (the lookup raises KeyError), a DummyCDS
    reaching 10 beyond each end of the module's location is added under
    that name.

    Arguments:
        module: the module whose domains are to be registered
        record: the record to add domains and placeholder CDSes to
    """
    for domain in module.domains:
        record.add_antismash_domain(domain)
        try:
            record.get_cds_by_name(domain.locus_tag)
        except KeyError:
            # no such CDS yet, so create one that surrounds the module
            padded_start = module.location.start - 10
            padded_end = module.location.end + 10
            placeholder = DummyCDS(start=padded_start,
                                   end=padded_end,
                                   locus_tag=domain.locus_tag)
            record.add_cds_feature(placeholder)
    def test_bad_child(self):
        """Constructing a collection with an invalid child must assert.

        Two cases: a child collection (10-50) larger than its would-be
        parent (20-40), and a CDS passed where a child collection is
        expected.
        """
        # fixtures are built outside the assertRaises blocks so that an
        # AssertionError from setup cannot be mistaken for the expected one
        child = CDSCollection(FeatureLocation(10, 50),
                              feature_type="test",
                              child_collections=[])
        with self.assertRaises(AssertionError):
            CDSCollection(FeatureLocation(20, 40),
                          feature_type="test",
                          child_collections=[child])

        cds = DummyCDS(25, 35)
        with self.assertRaises(AssertionError):
            CDSCollection(FeatureLocation(20, 40),
                          feature_type="test",
                          child_collections=[cds])
Exemple #12
0
 def test_multi_cds_tracking(self):
     """A multigene module is attached only to CDSes that hold its domains."""
     member_tags = "AB"
     module = create_module(
         domains=[DummyAntismashDomain(locus_tag=tag) for tag in member_tags])
     assert module.is_multigene_module()

     record = DummyRecord()
     add_module_references_to_record(module, record)
     # an extra CDS that holds none of the module's domains
     record.add_cds_feature(DummyCDS(locus_tag="C"))

     # before the module is added, nothing refers to it
     assert not any(cds.modules for cds in record.get_cds_features())
     assert not record.get_modules()

     record.add_module(module)

     # make sure it's not added to every CDS
     assert not record.get_cds_by_name("C").modules
     # but that it is added to all CDSes with a domain included
     expected = (module, )
     for tag in member_tags:
         assert record.get_cds_by_name(tag).modules == expected
Exemple #13
0
    def test_limited_add_cds_propagation(self):
        """A CDS must not be propagated to a subregion that doesn't span it.

        The subregion is rebuilt at 20-30, which does not cover the CDS at
        0-10, and the region is rebuilt around it.
        """
        cds = DummyCDS(0, 10)
        self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
        self.region = Region(superclusters=[self.super], subregions=[self.sub])

        # ensure all empty to start with
        for collection in (self.cluster, self.super, self.sub, self.region):
            assert not collection.cds_children
        assert not cds.region

        self.region.add_cds(cds)

        only_this_cds = (cds, )
        for collection in (self.cluster, self.super):
            assert collection.cds_children == only_this_cds
        # the subregion is skipped
        assert not self.sub.cds_children
        assert self.region.cds_children == only_this_cds
        assert cds.region is self.region
 def test_multiple_files(self):
     """Partitioned output is split evenly and in order across the files.

     Three CDS features are added per region. For 2 and 3 partitions,
     file N is expected to be named "testN.fasta" and to hold the N-th
     consecutive chunk of the L<i>/S<i> entries.
     """
     self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
     for partitions in [2, 3]:
         # NOTE(review): change=True presumably switches the working
         # directory into the temporary one - confirm
         with TemporaryDirectory(change=True):
             self.index = 0
             chunk_size = (len(self.regions) * 3) // partitions
             files = core.write_fastas_with_all_genes(self.regions,
                                                      "test.fasta",
                                                      partitions=partitions)
             expected_names = ["test%d.fasta" % i for i in range(partitions)]
             assert files == expected_names
             for index, name in enumerate(expected_names):
                 assert os.path.exists(name)
                 # close the handle explicitly rather than leaking it
                 with open(name) as handle:
                     contents = handle.read()
                 assert contents.count(">") == chunk_size
                 expected = "".join(
                     ">L{0}\nS{0}\n".format(i + index * chunk_size)
                     for i in range(chunk_size))
                 assert contents == expected
Exemple #15
0
 def test_adding_invalid_cds(self):
     """Adding a CDS the region does not contain raises a ValueError."""
     outsider = DummyCDS(50, 60)
     assert not outsider.is_contained_by(self.region)
     self.assertRaisesRegex(ValueError, "not contained by",
                            self.region.add_cds, outsider)
Exemple #16
0
 def test_parents(self):
     """A module reports the locus tag of its domain as its first parent CDS name."""
     parent = DummyCDS(0, 6, locus_tag="testCDS")
     child_domain = DummyAntismashDomain(2, 5)
     child_domain.locus_tag = "testCDS"
     parent_names = create_module([child_domain]).parent_cds_names
     assert parent_names[0] == parent.get_name()
Exemple #17
0
class TestBlastParsing(unittest.TestCase):
    """Tests for core's blast output parsing, run against sample data files.

    CDS lookups (core.get_cds_lengths and Record.get_cds_by_name) are
    patched wherever real sample data is parsed.
    """

    def setUp(self):
        """Load the default sample data, both raw and split into fields."""
        self.sample_data = self.read_sample_data()
        self.sample_data_as_lists = self.file_data_to_lists(self.sample_data)

    def parse_subject_wrapper(self, subject_line):
        """Parse one subject line against a dummy record with no known lengths.

        Arguments:
            subject_line: the subject line handed to core.parse_subject

        Returns:
            whatever core.parse_subject returns for that line
        """
        seq_record = Record("dummy")
        seqlengths = {}
        # used by core.parse_subject, but only if locus tag not in seqlengths
        with patch.object(core, 'get_cds_lengths', return_value={}), \
                patch.object(Record, 'get_cds_by_name',
                             return_value=DummyCDS(1, 101)):
            return core.parse_subject(subject_line, seqlengths, seq_record)

    def read_sample_data(self, filename="data/diamond_output_sample.txt"):
        """Return the contents of a data file located relative to this file.

        Arguments:
            filename: path of the data file, relative to this file's directory

        Returns:
            the file's full text
        """
        # dirname copes with a __file__ that has no directory component,
        # which splitting on os.sep does not
        data_path = os.path.join(os.path.dirname(__file__), filename)
        # the context manager closes the handle rather than leaking it
        with open(data_path, "r") as handle:
            return handle.read()

    def file_data_to_lists(self, data):
        """Split text into one list of tab-separated fields per line."""
        return [line.split("\t") for line in data.rstrip().split("\n")]

    def test_unique_pairings_filter(self):
        """remove_duplicate_hits keeps unique hits and drops repeats."""
        data = self.file_data_to_lists(self.sample_data)
        sample = core.remove_duplicate_hits(data)
        self.assertEqual(len(sample), len(data))
        self.assertEqual(sample, core.remove_duplicate_hits(data * 2))

        # test empty
        data = [[], ["a"], ["abc"]]
        results = core.remove_duplicate_hits(data)
        self.assertEqual(results, [])

    def verify_subjects_and_clusters_represented(self, subjects,
                                                 cluster_name_to_queries):
        """Check subjects and clusters cover exactly the same cluster names.

        Arguments:
            subjects: parsed subjects, each with a `genecluster` attribute
            cluster_name_to_queries: container keyed by cluster name
        """
        subject_clusters = set()
        for subject in subjects:
            self.assertIn(subject.genecluster, cluster_name_to_queries)
            subject_clusters.add(subject.genecluster)
        self.assertEqual(sorted(subject_clusters),
                         sorted(cluster_name_to_queries))

    @patch.object(core, 'get_cds_lengths', return_value={})
    @patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
    def test_blastparse(self, _mocked_record, _mocked_core):
        """blastparse honours the coverage and identity thresholds."""
        queries, clusters = core.blastparse(self.sample_data, Record(), 0, 0)

        # check we process the right number of queries
        self.assertEqual(len(queries),
                         len(set(i[0] for i in self.sample_data_as_lists)))

        # check we have entries for every gene_cluster we found
        subjects = [
            self.parse_subject_wrapper(i) for i in self.sample_data_as_lists
        ]
        self.verify_subjects_and_clusters_represented(subjects, clusters)

        # test perc_coverage threshold (value arbitrary due to mocking)
        coverage_threshold = 650
        queries, clusters = core.blastparse(self.sample_data, Record(),
                                            coverage_threshold, 0)
        new_subjects = [
            s for s in subjects if s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "coverage test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test perc_identity threshold
        ident_threshold = 35
        queries, clusters = core.blastparse(self.sample_data, Record(), 0,
                                            ident_threshold)
        new_subjects = [s for s in subjects if s.perc_ident > ident_threshold]
        assert new_subjects and len(new_subjects) < len(
            subjects), "identity% test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test combo threshold
        queries, clusters = core.blastparse(self.sample_data, Record(),
                                            coverage_threshold,
                                            ident_threshold)
        new_subjects = [
            s for s in subjects if s.perc_ident > ident_threshold
            and s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "combo test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

    def test_blastparse_on_empty(self):
        """blastparse yields no queries or clusters for blank input."""
        for blast in ["", "\n", "\r\n", "\n\n"]:
            queries, clusters = core.blastparse(blast, Record(), 0, 0)
            self.assertEqual(len(queries), 0)
            self.assertEqual(len(clusters), 0)

    @patch.object(core, 'get_cds_lengths', return_value={})
    @patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
    def test_parse_all_single_cluster(self, _mocked_record, _mocked_core):
        """parse_all_clusters honours thresholds on single-cluster data."""
        # single cluster to test the thresholds and content
        def parse_all_wrapper(coverage_threshold, ident_threshold):
            clusters_by_number, queries_by_number = core.parse_all_clusters(
                self.sample_data, Record(), coverage_threshold,
                ident_threshold)
            # make sure we only found one cluster number
            self.assertEqual(len(clusters_by_number), 1)
            self.assertEqual(list(clusters_by_number), [24])
            self.assertEqual(len(queries_by_number), 1)
            self.assertEqual(list(queries_by_number), [24])

            # now test the values of those queries
            queries = queries_by_number[24]
            clusters = clusters_by_number[24]
            return queries, clusters

        queries, clusters = parse_all_wrapper(0, 0)

        # check we process the right number of queries
        self.assertEqual(len(queries),
                         len(set(i[0] for i in self.sample_data_as_lists)))

        # check we have entries for every gene_cluster we found
        subjects = [
            self.parse_subject_wrapper(i) for i in self.sample_data_as_lists
        ]
        self.verify_subjects_and_clusters_represented(subjects, clusters)

        # test perc_coverage threshold (value arbitrary due to mocking)
        coverage_threshold = 650
        queries, clusters = parse_all_wrapper(coverage_threshold, 0)
        new_subjects = [
            s for s in subjects if s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "coverage test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test perc_identity threshold
        ident_threshold = 35
        queries, clusters = parse_all_wrapper(0, ident_threshold)
        new_subjects = [s for s in subjects if s.perc_ident > ident_threshold]
        assert new_subjects and len(new_subjects) < len(
            subjects), "identity% test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test combo threshold
        queries, clusters = parse_all_wrapper(coverage_threshold,
                                              ident_threshold)
        new_subjects = [
            s for s in subjects if s.perc_ident > ident_threshold
            and s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "combo test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

    @patch.object(core, 'get_cds_lengths', return_value={})
    @patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
    def test_parse_all_multi_cluster(self, _mocked_record, _mocked_core):
        """parse_all_clusters partitions multi-cluster data by cluster number."""
        # test we partition correctly by cluster number
        sample_data = self.read_sample_data(
            "data/diamond_output_sample_multicluster.txt")
        clusters_by_number, queries_by_number = core.parse_all_clusters(
            sample_data, Record(), 0, 0)
        self.assertEqual(len(clusters_by_number), 3)
        self.assertEqual(sorted(clusters_by_number), [1, 2, 4])
        self.assertEqual(len(queries_by_number), 3)
        self.assertEqual(sorted(queries_by_number), [1, 2, 4])
        for i in [1, 2, 4]:
            self.assertEqual(len(clusters_by_number[i]), i)
            self.assertEqual(len(queries_by_number[i]), i)

    def test_parse_all_empty(self):
        """parse_all_clusters yields nothing for every kind of blank input."""
        for sample_data in ["", "\n", "\r\n", "\n\n"]:
            clusters, queries = core.parse_all_clusters(
                sample_data, Record(), 0, 0)
            # checked per sample, so every blank variant is verified
            # rather than only the last one
            self.assertEqual(len(clusters), 0)
            self.assertEqual(len(queries), 0)
 def setUp(self):
     """Stub out the CDS lookups, then load the sample data in both forms."""
     # used by parse_subject; the original note says every sequence will
     # be 100 long as a result
     # NOTE(review): `mock` looks like minimock's helper (name + `returns`);
     # confirm against the file's imports.
     stubs = (
         ('Record.get_cds_by_name', DummyCDS(1, 101)),
         ('core.get_cds_lengths', {}),
     )
     for target, result in stubs:
         mock(target, returns=result)

     raw = self.read_sample_data()
     self.sample_data = raw
     self.sample_data_as_lists = self.file_data_to_lists(raw)
 def test_missing_modules(self):
     """Nothing is combined when either CDS has no modules, in either order."""
     empty = CDSModuleInfo(DummyCDS(start=50, end=110), [])
     populated = CDSModuleInfo(DummyCDS(start=150, end=210),
                               [self.generic_tail])
     for pair in ((empty, populated), (populated, empty)):
         assert not combine_modules(*pair)