Example #1
0
    def test_filter_fimo_results(self):
        """Check filter_fimo_results against a canned FIMO output file.

        A fake ``fimo.txt`` is copied into the ``+00_-03`` motif directory,
        the motifs returned by filter_fimo_results are compared with the
        expected hits, and the generated ``bs_per_promoter.csv`` is compared
        with the stored expected file.
        """
        fimo_dir = os.path.join(self.options.output_dir, "fimo")
        motifs = [Motif(0, 3)]
        # index 1 of the promoter list below, i.e. gene2, is the anchor promoter
        anchor_promoter = 1
        # need certain amount of promoters, otherwise the proportion of
        # promoters with a motif (motif frequency) will be too high --> error
        promoters = [
            Promoter("gene%d" % i, i * 10, i * 10 + 4) for i in range(1, 16)
        ]
        expected_motifs = [Motif(0, 3, hits={"gene1": 1, "gene2": 2})]

        # fake FIMO output file, corresponding to expected_motifs
        source = path.get_full_path(__file__, "data", "fake_short_fimo.txt")
        target = os.path.join(fimo_dir, "+00_-03")
        if not os.path.exists(target):
            os.makedirs(target)
        # overwrite fimo.txt if exists
        copy(source, os.path.join(target, "fimo.txt"))

        found_motifs = filter_fimo_results(motifs, fimo_dir, promoters,
                                           anchor_promoter)
        # unittest assertion instead of a bare assert: it is not stripped
        # under ``python -O`` and reports both operands on failure
        self.assertEqual(found_motifs, expected_motifs)

        bs_per_promoter, expected_bs_per_promoter = read_generated_expected_file(
            os.path.join(target, "bs_per_promoter.csv"),
            "expected_bs_per_promoter.csv")
        self.assertEqual(bs_per_promoter, expected_bs_per_promoter)
Example #2
0
    def test_regeneration(self):
        """Round-trip CassisResults through JSON and compare with the original.

        The results carry one subregion (built from a single prediction) and
        three promoters; after regenerate_previous_results the subregion
        locations, their biopython qualifiers and the promoters must match.
        """
        record = create_fake_record()
        original = cassis.CassisResults(record.id)

        # a prediction is needed, since it will generate a border with many
        # extra qualifiers
        left = ClusterMarker("gene1", Motif(3, 3, score=1))
        left.promoter = "gene1"
        left.abundance = 2
        right = ClusterMarker("gene4", Motif(3, 3, score=1))
        right.promoter = "gene3+gene4"
        assert right.abundance == 1
        prediction = cassis.ClusterPrediction(left, right)
        original.subregions = cassis.create_subregions(
            "gene1", [prediction], record)
        assert original.subregions

        original.promoters = [
            Promoter("gene1", 10, 20, seq=Seq("cgtacgtacgt")),
            Promoter("gene2", 30, 40, seq=Seq("cgtacgtacgt")),
            CombinedPromoter("gene3", "gene4", 50, 60, seq=Seq("cgtacgtacgt")),
        ]

        serialised = original.to_json()
        restored = cassis.regenerate_previous_results(serialised, record, None)
        assert isinstance(restored, cassis.CassisResults)
        assert len(original.subregions) == len(restored.subregions)
        for before, after in zip(original.subregions, restored.subregions):
            assert before.location == after.location
            qualifiers_before = before.to_biopython()[0].qualifiers
            qualifiers_after = after.to_biopython()[0].qualifiers
            assert qualifiers_before == qualifiers_after
        assert restored.promoters == original.promoters
Example #3
0
 def test_conversion(self):
     """Check that pairing_string follows changes made to plus and minus."""
     candidate = Motif(3, 3)
     assert candidate.pairing_string == "+03_-03"
     # apply the updates one at a time; each must show up in the string
     updates = (("plus", 4, "+04_-03"), ("minus", 2, "+04_-02"))
     for attribute, value, expected in updates:
         setattr(candidate, attribute, value)
         assert candidate.pairing_string == expected
Example #4
0
    def test_filter_meme_results(self):
        """Check filter_meme_results against a stored MEME XML output file.

        The XML file is copied into the ``+00_-03`` motif directory, the
        motifs yielded by filter_meme_results are compared with the expected
        motif, and the generated ``binding_sites.fasta`` is compared with the
        stored expected file.
        """
        anchor = "AFUA_6G09660"
        meme_dir = os.path.join(self.options.output_dir, "meme")
        promoter_sets = [Motif(0, 3)]

        wanted = Motif(0, 3, score=3.9e+003)
        wanted.seqs = [
            "TTTCGACCCGTC",
            "TTTCAAACCGTC",
            "TTTTGATTCGTC",
            "TTTTGACCGGTC",
            "TTTTAGACGGTC",
            "TTTTACCTCGTC",
            "TCTCGATCCGTC",
            "TTTCTATCCGTT",
            "TTTTGGACCGCC",
            "ATTTGGCCTGTC",
            "TGTTGTCTCGTC",
            "TTTGAGGCCGTC",
            "TTGTATTCTGTC",
            "TTTCTTCCTGTT",
        ]

        # a "real" MEME output file is used here instead of a hand-made fake
        xml_source = path.get_full_path(__file__, "data", "real_meme.xml")
        motif_dir = os.path.join(meme_dir, "+00_-03")
        if not os.path.exists(motif_dir):
            os.makedirs(motif_dir)
        # overwrite meme.xml if exists
        xml_target = os.path.join(motif_dir, "meme.xml")
        copy(xml_source, xml_target)

        found = list(filter_meme_results(meme_dir, promoter_sets, anchor))
        self.assertEqual(found, [wanted])

        generated, reference = read_generated_expected_file(
            os.path.join(motif_dir, "binding_sites.fasta"),
            "expected_binding_sites.fasta")
        self.assertEqual(generated, reference)
Example #5
0
    def test_check_cluster_predictions(self):
        """Check the promoter names and counts check_cluster_predictions fills in.

        A single gene1..gene4 prediction is passed in; the returned prediction
        is expected to carry the promoter names of both markers plus the gene
        and promoter counts set on the expected object below.
        """

        def build_prediction():
            # fresh, unannotated gene1 -- gene4 prediction
            return ClusterPrediction(
                ClusterMarker("gene1", Motif(3, 3, score=1)),
                ClusterMarker("gene4", Motif(3, 3, score=1)))

        seq_record = create_fake_record()
        promoters = [
            Promoter("gene1", 1, 5),
            Promoter("gene2", 6, 10),
            CombinedPromoter("gene3", "gene4", 11, 15),
        ]
        # see captured logging
        ignored_genes = [Gene(FeatureLocation(1, 5), locus_tag="gene5")]
        clusters = [build_prediction()]

        annotated = build_prediction()
        annotated.start.promoter = "gene1"
        annotated.end.promoter = "gene3+gene4"
        annotated.genes = 4
        annotated.promoters = 3
        expected = [annotated]

        checked = check_cluster_predictions(clusters, seq_record, promoters,
                                            ignored_genes)
        assert checked == expected
Example #6
0
    def test_cleanup_outdir(self):
        """Check which MEME/FIMO directories survive cassis.cleanup_outdir.

        The only prediction is stored under anchor gene1 and uses the motif
        Motif(3, 3), so of the directories created below only
        ``<tool>/gene1/+03_-03`` is expected to remain for each tool; the
        ``gene1/+04_-04`` directories and everything below ``gene4`` are
        expected to be removed.
        """
        anchor_genes = ["gene1", "gene4"]
        cluster = cassis.ClusterPrediction(
            ClusterMarker("gene1", Motif(3, 3, score=1)),
            ClusterMarker("gene4", Motif(3, 3, score=1)))
        cluster.start.promoter = "gene1"
        cluster.end.promoter = "gene3+gene4"
        cluster.genes = 4
        cluster.promoters = 3
        cluster_predictions = {"gene1": [cluster]}

        output_dir = self.options.output_dir
        tools = ("meme", "fimo")

        # create empty test dirs for every anchor/motif combination and both
        # tools; only gene1/+03_-03 is asserted to be kept further down
        layout = (
            ("gene1", "+03_-03"),  # keep
            ("gene1", "+04_-04"),  # delete
            ("gene4", "+03_-03"),  # delete
            ("gene4", "+04_-04"),  # delete
        )
        for anchor, pairing in layout:
            for tool in tools:
                os.makedirs(os.path.join(output_dir, tool, anchor, pairing))

        cassis.cleanup_outdir(anchor_genes, cluster_predictions, self.options)

        # assertIn/assertNotIn report the actual directory listing on failure,
        # unlike assertTrue(x in y)
        for tool in tools:
            tool_dir = os.path.join(output_dir, tool)

            # assert kept directories
            self.assertIn("gene1", os.listdir(tool_dir))
            remaining = os.listdir(os.path.join(tool_dir, "gene1"))
            self.assertIn("+03_-03", remaining)

            # assert deleted directories
            self.assertNotIn("gene4", os.listdir(tool_dir))
            self.assertNotIn("+04_-04", remaining)
Example #7
0
    def test_store_clusters(self):
        """Check the cluster border features built from two predictions.

        Borders are created with cassis.create_cluster_borders, added to a
        fake record, and then each border's type, tool and qualifiers are
        compared with the prediction it was built from.
        """
        # this test is similar to test_store_promoters
        anchor = "gene3"

        start_marker = ClusterMarker("gene1", Motif(3, 3, score=1))
        start_marker.promoter = "gene1"
        start_marker.abundance = 2
        end_marker = ClusterMarker("gene4", Motif(3, 3, score=1))
        end_marker.promoter = "gene3+gene4"
        # abundance was not set explicitly on this marker
        self.assertEqual(end_marker.abundance, 1)
        first_cluster = cassis.ClusterPrediction(start_marker, end_marker)
        first_cluster.promoters = 3
        first_cluster.genes = 4

        start_marker = ClusterMarker("gene1", Motif(4, 4, score=1))
        start_marker.promoter = "gene1"
        self.assertEqual(start_marker.abundance, 1)
        end_marker = ClusterMarker("gene5", Motif(4, 4, score=1))
        end_marker.promoter = "gene5"
        self.assertEqual(end_marker.abundance, 1)
        second_cluster = cassis.ClusterPrediction(start_marker, end_marker)
        second_cluster.promoters = 3
        second_cluster.genes = 4

        clusters = [first_cluster, second_cluster]

        record_with_clusters = create_fake_record()
        # just the same, without adding clusters
        record_without_clusters = create_fake_record()

        borders = cassis.create_cluster_borders(anchor, clusters,
                                                record_with_clusters)
        # creating the borders alone must leave the feature count unchanged
        self.assertEqual(record_with_clusters.get_feature_count(),
                         record_without_clusters.get_feature_count())

        for border in borders:
            record_with_clusters.add_cluster_border(border)

        # TODO: test if store_clusters changed any non-cluster feature
        # (should not!)

        # test cluster features
        self.assertEqual(
            record_without_clusters.get_feature_count() + len(clusters),
            record_with_clusters.get_feature_count())
        for i, cluster in enumerate(clusters):
            cluster_border = record_with_clusters.get_cluster_borders()[i]
            self.assertEqual(cluster_border.type, "cluster_border")
            self.assertEqual(cluster_border.tool, "cassis")
            self.assertEqual(cluster_border.get_qualifier("anchor"),
                             (anchor, ))
            self.assertEqual(cluster_border.get_qualifier("genes"),
                             (cluster.genes, ))
            self.assertEqual(cluster_border.get_qualifier("promoters"),
                             (cluster.promoters, ))
            self.assertEqual(cluster_border.get_qualifier("gene_left"),
                             (cluster.start.gene, ))
            self.assertEqual(cluster_border.get_qualifier("gene_right"),
                             (cluster.end.gene, ))
Example #8
0
    def test_store_subregions(self):
        """Check the subregion features built from two predictions.

        Subregions are created with cassis.create_subregions, added to a fake
        record, and then each subregion's type, tool, anchor and qualifiers
        are compared with the prediction it was built from.
        """
        # this test is similar to test_store_promoters
        anchor = "gene3"

        start_marker = ClusterMarker("gene1", Motif(3, 3, score=1))
        start_marker.promoter = "gene1"
        start_marker.abundance = 2
        end_marker = ClusterMarker("gene4", Motif(3, 3, score=1))
        end_marker.promoter = "gene3+gene4"
        # abundance was not set explicitly on this marker
        self.assertEqual(end_marker.abundance, 1)
        first_cluster = cassis.ClusterPrediction(start_marker, end_marker)
        first_cluster.promoters = 3
        first_cluster.genes = 4

        start_marker = ClusterMarker("gene1", Motif(4, 4, score=1))
        start_marker.promoter = "gene1"
        self.assertEqual(start_marker.abundance, 1)
        end_marker = ClusterMarker("gene5", Motif(4, 4, score=1))
        end_marker.promoter = "gene5"
        self.assertEqual(end_marker.abundance, 1)
        second_cluster = cassis.ClusterPrediction(start_marker, end_marker)
        second_cluster.promoters = 3
        second_cluster.genes = 4

        # order reversed because subregions are ordered by length when starts
        # are the same
        region_predictions = [second_cluster, first_cluster]

        record_with_subregions = create_fake_record()
        # just the same, without adding subregions
        record_without_subregions = create_fake_record()

        subregions = cassis.create_subregions(anchor, region_predictions,
                                              record_with_subregions)
        # creating the subregions alone must leave the feature count unchanged
        self.assertEqual(record_with_subregions.get_feature_count(),
                         record_without_subregions.get_feature_count())

        for region in subregions:
            record_with_subregions.add_subregion(region)

        # test subregion features
        expected_count = record_without_subregions.get_feature_count() + len(
            subregions)
        self.assertEqual(record_with_subregions.get_feature_count(),
                         expected_count)
        for i, region in enumerate(region_predictions):
            subregion = record_with_subregions.get_subregions()[i]
            self.assertEqual(subregion.type, "subregion")
            self.assertEqual(subregion.tool, "cassis")
            self.assertEqual(subregion.anchor, anchor)
            self.assertEqual(subregion.get_qualifier("genes"),
                             (region.genes, ))
            self.assertEqual(subregion.get_qualifier("promoters"),
                             (region.promoters, ))
            self.assertEqual(subregion.get_qualifier("gene_left"),
                             (region.start.gene, ))
            self.assertEqual(subregion.get_qualifier("gene_right"),
                             (region.end.gene, ))
Example #9
0
    def test_get_promoter_sets(self):
        """Check the plus/minus motif combinations returned by generate_motifs.

        Eight promoters are supplied with the anchor at list index 5; the
        result is compared with every Motif(plus, minus) for plus in 0..2 and
        minus in (3 - plus)..5, in that order.
        """

        def single(name, position):
            # promoter whose start and end coincide, with its own Seq object
            return Promoter(name, position, position,
                            seq=Seq("acgtacgtacgtacgt"))

        meme_dir = os.path.join(self.options.output_dir, "meme")
        anchor_promoter = 5
        promoters = [
            single("gene1", 1),
            single("gene2", 2),
            CombinedPromoter("gene3", "gene4", 3, 4,
                             seq=Seq("acgtacgtacgtacgt")),
            single("gene5", 5),
            single("gene6", 6),
            # promoter with index=5 --> anchor promoter
            single("gene7", 7),
            single("gene8", 8),
            single("gene9", 9),
        ]

        expected_motifs = []
        for plus in range(3):
            for minus in range(3 - plus, 6):
                expected_motifs.append(Motif(plus, minus))

        generated = generate_motifs(meme_dir, anchor_promoter, promoters)
        self.assertEqual(generated, expected_motifs)
Example #10
0
 def test_get_islands(self):
     """Check the islands get_islands reports for two motifs around gene2."""
     narrow = Motif(0, 3, hits={"gene1": 1, "gene2": 2})
     wide = Motif(0, 4, hits={"gene2": 3, "gene4": 2, "gene5": 1})
     # index 1 of the promoter list, i.e. gene2, will be the anchor promoter
     anchor_promoter = 1
     promoters = [
         Promoter("gene%d" % number, number * 10, number * 10 + 4)
         for number in range(1, 7)
     ]

     # resulting in 2 different islands (this example)
     expected_islands = [
         # promoter (pos): 1 2 3 4 5 6
         # binding sites:  1 2 0 0 0 0
         # island:         |-|
         Island(promoters[0], promoters[1], narrow),
         # promoter (pos): 1 2 3 4 5 6
         # binding sites:  0 3 0 2 1 0
         # island:           |---|
         Island(promoters[1], promoters[4], wide),
     ]

     found = get_islands(anchor_promoter, [narrow, wide], promoters)
     assert found == expected_islands
Example #11
0
    def test_sort_by_abundance(self):
        """Check the order of the predictions create_predictions builds.

        Three islands give two distinct left borders (2x gene1, 1x gene2) and
        two distinct right borders (2x gene5, 1x gene2); the expected list
        holds all four border combinations in the order asserted below.
        """

        def marker(gene, plus, minus, score, abundance):
            # cluster marker with an explicitly assigned abundance
            built = ClusterMarker(gene, Motif(plus, minus, score=score))
            built.abundance = abundance
            return built

        islands = [
            # island 1: [gene1 -- gene2]
            Island(Promoter("gene1", 1, 1), Promoter("gene2", 2, 2),
                   Motif(0, 3, score=3, hits={"gene1": 1, "gene2": 1})),
            # island 2: [gene2 -- gene5]
            Island(Promoter("gene2", 2, 2), Promoter("gene5", 5, 5),
                   Motif(3, 0, score=2,
                         hits={"gene2": 1, "gene3": 1, "gene4": 1,
                               "gene5": 1})),
            # island 3: [gene1 -- gene5]
            Island(Promoter("gene1", 1, 1), Promoter("gene5", 5, 5),
                   Motif(3, 3, score=1,
                         hits={"gene1": 1, "gene2": 1, "gene3": 1,
                               "gene4": 1, "gene5": 1})),
        ]

        # abundance: as high as possible
        # score: as low as possible
        expected_clusters = [
            # [gene1 -- gene5] --> abundance 2+2 (most abundant)
            ClusterPrediction(marker("gene1", 3, 3, 1, 2),
                              marker("gene5", 3, 3, 1, 2)),
            # [gene2 -- gene5] --> abundance 1+2, score 2+1 (better/lower)
            ClusterPrediction(marker("gene2", 3, 0, 2, 1),
                              marker("gene5", 3, 3, 1, 2)),
            # [gene1 -- gene2] --> abundance 2+1, score 1+3 (worse, higher)
            ClusterPrediction(marker("gene1", 3, 3, 1, 2),
                              marker("gene2", 0, 3, 3, 1)),
            # [gene2 -- gene2] --> abundance 1+1
            ClusterPrediction(marker("gene2", 3, 0, 2, 1),
                              marker("gene2", 0, 3, 3, 1)),
        ]

        self.assertEqual(create_predictions(islands), expected_clusters)