def test_filter_fimo_results(self):
    """Run filter_fimo_results against a canned FIMO output file.

    Verifies the motifs returned (including their per-promoter hits) and
    compares the pair of contents returned by read_generated_expected_file
    for bs_per_promoter.csv.
    """
    # need a certain amount of promoters, otherwise the proportion of
    # promoters with a motif (motif frequency) will be too high --> error
    all_promoters = [
        Promoter("gene%d" % number, number * 10, number * 10 + 4)
        for number in range(1, 16)
    ]
    # gene2 will be the anchor promoter
    anchor_index = 1
    candidates = [Motif(0, 3)]
    wanted = [Motif(0, 3, hits={"gene1": 1, "gene2": 2})]

    # fake FIMO output file, corresponding to the wanted motifs
    fimo_root = os.path.join(self.options.output_dir, "fimo")
    motif_dir = os.path.join(fimo_root, "+00_-03")
    fake_output = path.get_full_path(__file__, "data", "fake_short_fimo.txt")
    if not os.path.exists(motif_dir):
        os.makedirs(motif_dir)
    # overwrite fimo.txt if it exists
    copy(fake_output, os.path.join(motif_dir, "fimo.txt"))

    found = filter_fimo_results(candidates, fimo_root, all_promoters,
                                anchor_index)
    assert found == wanted

    csv_path = os.path.join(motif_dir, "bs_per_promoter.csv")
    generated, reference = read_generated_expected_file(
        csv_path, "expected_bs_per_promoter.csv")
    self.assertEqual(generated, reference)
def test_regeneration(self):
    """Round-trip CassisResults through to_json and regenerate_previous_results.

    The regenerated results must be a CassisResults instance with the same
    subregion locations, the same biopython qualifiers and equal promoters.
    """
    record = create_fake_record()
    results = cassis.CassisResults(record.id)

    # create a prediction, since it will generate a border with many extra qualifiers
    left = ClusterMarker("gene1", Motif(3, 3, score=1))
    left.promoter = "gene1"
    left.abundance = 2
    right = ClusterMarker("gene4", Motif(3, 3, score=1))
    right.promoter = "gene3+gene4"
    # abundance was never set on this marker, so it must still hold 1
    assert right.abundance == 1
    prediction = cassis.ClusterPrediction(left, right)

    results.subregions = cassis.create_subregions("gene1", [prediction], record)
    assert results.subregions

    bases = "cgtacgtacgt"
    results.promoters = [
        Promoter("gene1", 10, 20, seq=Seq(bases)),
        Promoter("gene2", 30, 40, seq=Seq(bases)),
        CombinedPromoter("gene3", "gene4", 50, 60, seq=Seq(bases)),
    ]

    as_json = results.to_json()
    regenerated = cassis.regenerate_previous_results(as_json, record, None)
    assert isinstance(regenerated, cassis.CassisResults)
    assert len(results.subregions) == len(regenerated.subregions)
    for before, after in zip(results.subregions, regenerated.subregions):
        assert before.location == after.location
        qualifiers_before = before.to_biopython()[0].qualifiers
        qualifiers_after = after.to_biopython()[0].qualifiers
        assert qualifiers_before == qualifiers_after
    assert regenerated.promoters == results.promoters
def test_conversion(self):
    """Check that Motif.pairing_string follows changes to plus and minus."""
    motif = Motif(3, 3)
    assert motif.pairing_string == "+03_-03"

    # each step changes one attribute and names the string expected afterwards;
    # the changes accumulate on the same motif
    steps = (
        ("plus", 4, "+04_-03"),
        ("minus", 2, "+04_-02"),
    )
    for attribute, value, expected in steps:
        setattr(motif, attribute, value)
        assert motif.pairing_string == expected
def test_filter_meme_results(self):
    """Run filter_meme_results against a stored MEME XML output file.

    Verifies the motifs yielded and compares the pair of contents returned
    by read_generated_expected_file for binding_sites.fasta.
    """
    anchor = "AFUA_6G09660"
    promoter_sets = [Motif(0, 3)]

    wanted = Motif(0, 3, score=3.9e3)
    wanted.seqs = [
        "TTTCGACCCGTC",
        "TTTCAAACCGTC",
        "TTTTGATTCGTC",
        "TTTTGACCGGTC",
        "TTTTAGACGGTC",
        "TTTTACCTCGTC",
        "TCTCGATCCGTC",
        "TTTCTATCCGTT",
        "TTTTGGACCGCC",
        "ATTTGGCCTGTC",
        "TGTTGTCTCGTC",
        "TTTGAGGCCGTC",
        "TTGTATTCTGTC",
        "TTTCTTCCTGTT",
    ]

    # this is a "real" MEME output file, used instead of a hand-made fake XML file
    meme_root = os.path.join(self.options.output_dir, "meme")
    motif_dir = os.path.join(meme_root, "+00_-03")
    real_output = path.get_full_path(__file__, "data", "real_meme.xml")
    if not os.path.exists(motif_dir):
        os.makedirs(motif_dir)
    # overwrite meme.xml if it exists
    copy(real_output, os.path.join(motif_dir, "meme.xml"))

    found = list(filter_meme_results(meme_root, promoter_sets, anchor))
    self.assertEqual(found, [wanted])

    fasta_path = os.path.join(motif_dir, "binding_sites.fasta")
    generated, reference = read_generated_expected_file(
        fasta_path, "expected_binding_sites.fasta")
    self.assertEqual(generated, reference)
def test_check_cluster_predictions(self):
    """Check the prediction returned by check_cluster_predictions.

    The expected prediction is built exactly like the input one and then
    given the promoter names and the gene/promoter counts the test expects.
    """
    def build_prediction():
        # fresh objects on every call, so input and expectation share nothing
        return ClusterPrediction(
            ClusterMarker("gene1", Motif(3, 3, score=1)),
            ClusterMarker("gene4", Motif(3, 3, score=1)))

    seq_record = create_fake_record()
    promoters = [
        Promoter("gene1", 1, 5),
        Promoter("gene2", 6, 10),
        CombinedPromoter("gene3", "gene4", 11, 15),
    ]
    # see captured logging
    ignored_genes = [Gene(FeatureLocation(1, 5), locus_tag="gene5")]

    clusters = [build_prediction()]

    wanted = build_prediction()
    wanted.start.promoter = "gene1"
    wanted.end.promoter = "gene3+gene4"
    wanted.genes = 4
    wanted.promoters = 3

    checked = check_cluster_predictions(clusters, seq_record, promoters,
                                        ignored_genes)
    assert checked == [wanted]
def test_cleanup_outdir(self):
    """Check which meme/fimo directories survive cassis.cleanup_outdir.

    Only anchor gene "gene1" has a prediction, and that prediction uses
    Motif(3, 3). The test expects the "gene1/+03_-03" directories to be
    kept, and the "gene4" directories as well as "gene1/+04_-04" to be gone
    after the cleanup.
    """
    anchor_genes = ["gene1", "gene4"]
    cluster = cassis.ClusterPrediction(
        ClusterMarker("gene1", Motif(3, 3, score=1)),
        ClusterMarker("gene4", Motif(3, 3, score=1)))
    cluster.start.promoter = "gene1"
    cluster.end.promoter = "gene3+gene4"
    cluster.genes = 4
    cluster.promoters = 3
    cluster_predictions = {"gene1": [cluster]}

    out_dir = self.options.output_dir
    tools = ("meme", "fimo")

    # create some empty test dirs; all but the first pair should be deleted
    # during the test
    test_dirs = (
        ("gene1", "+03_-03"),  # prediction! --> keep!
        ("gene1", "+04_-04"),  # prediction for this gene, but not from this motif --> delete
        ("gene4", "+03_-03"),  # no prediction for this gene --> delete
        ("gene4", "+04_-04"),  # no prediction for this gene --> delete
    )
    for gene, pairing in test_dirs:
        for tool in tools:
            os.makedirs(os.path.join(out_dir, tool, gene, pairing))

    cassis.cleanup_outdir(anchor_genes, cluster_predictions, self.options)

    # assert kept directories
    # (assertIn/assertNotIn report the directory listing on failure)
    for tool in tools:
        self.assertIn("gene1", os.listdir(os.path.join(out_dir, tool)))
    for tool in tools:
        self.assertIn("+03_-03",
                      os.listdir(os.path.join(out_dir, tool, "gene1")))

    # assert deleted directories
    for tool in tools:
        self.assertNotIn("gene4", os.listdir(os.path.join(out_dir, tool)))
    for tool in tools:
        self.assertNotIn("+04_-04",
                         os.listdir(os.path.join(out_dir, tool, "gene1")))
def test_store_clusters(self):
    """Check the cluster borders created from two cluster predictions.

    This test is similar to test_store_promoters.
    """
    anchor = "gene3"

    # first prediction: gene1 -- gene4
    left = ClusterMarker("gene1", Motif(3, 3, score=1))
    left.promoter = "gene1"
    left.abundance = 2
    right = ClusterMarker("gene4", Motif(3, 3, score=1))
    right.promoter = "gene3+gene4"
    assert right.abundance == 1
    first_cluster = cassis.ClusterPrediction(left, right)
    first_cluster.promoters = 3
    first_cluster.genes = 4

    # second prediction: gene1 -- gene5
    left = ClusterMarker("gene1", Motif(4, 4, score=1))
    left.promoter = "gene1"
    assert left.abundance == 1
    right = ClusterMarker("gene5", Motif(4, 4, score=1))
    right.promoter = "gene5"
    assert right.abundance == 1
    second_cluster = cassis.ClusterPrediction(left, right)
    second_cluster.promoters = 3
    second_cluster.genes = 4

    clusters = [first_cluster, second_cluster]

    record_with_clusters = create_fake_record()
    # just the same, without adding clusters
    record_without_clusters = create_fake_record()

    borders = cassis.create_cluster_borders(anchor, clusters,
                                            record_with_clusters)
    # creating the borders alone must not add any feature to the record
    feature_count = record_with_clusters.get_feature_count()
    assert feature_count == record_without_clusters.get_feature_count()

    for border in borders:
        record_with_clusters.add_cluster_border(border)

    # test if store_clusters changed any non-cluster feature (should not!)  # TODO

    # test cluster features
    baseline = record_without_clusters.get_feature_count()
    assert baseline + len(clusters) == record_with_clusters.get_feature_count()

    for index, cluster in enumerate(clusters):
        cluster_border = record_with_clusters.get_cluster_borders()[index]
        self.assertEqual(cluster_border.type, "cluster_border")
        self.assertEqual(cluster_border.tool, "cassis")
        # every qualifier is expected as a one-element tuple
        expected_qualifiers = (
            ("anchor", anchor),
            ("genes", cluster.genes),
            ("promoters", cluster.promoters),
            ("gene_left", cluster.start.gene),
            ("gene_right", cluster.end.gene),
        )
        for name, value in expected_qualifiers:
            self.assertEqual(cluster_border.get_qualifier(name), (value, ))
def test_store_subregions(self):
    """Check the subregions created from two cluster predictions.

    This test is similar to test_store_promoters.
    """
    anchor = "gene3"

    # prediction spanning gene1 -- gene4
    begin = ClusterMarker("gene1", Motif(3, 3, score=1))
    begin.promoter = "gene1"
    begin.abundance = 2
    finish = ClusterMarker("gene4", Motif(3, 3, score=1))
    finish.promoter = "gene3+gene4"
    assert finish.abundance == 1
    first_cluster = cassis.ClusterPrediction(begin, finish)
    first_cluster.promoters = 3
    first_cluster.genes = 4

    # prediction spanning gene1 -- gene5
    begin = ClusterMarker("gene1", Motif(4, 4, score=1))
    begin.promoter = "gene1"
    assert begin.abundance == 1
    finish = ClusterMarker("gene5", Motif(4, 4, score=1))
    finish.promoter = "gene5"
    assert finish.abundance == 1
    second_cluster = cassis.ClusterPrediction(begin, finish)
    second_cluster.promoters = 3
    second_cluster.genes = 4

    # order reversed because subregions are ordered by length when starts are the same
    region_predictions = [second_cluster, first_cluster]

    record_with_subregions = create_fake_record()
    # just the same, without adding subregions
    record_without_subregions = create_fake_record()

    subregions = cassis.create_subregions(anchor, region_predictions,
                                          record_with_subregions)
    # creating the subregions alone must not add any feature to the record
    feature_count = record_with_subregions.get_feature_count()
    assert feature_count == record_without_subregions.get_feature_count()

    for region in subregions:
        record_with_subregions.add_subregion(region)

    # test subregion features
    baseline = record_without_subregions.get_feature_count()
    expected_count = baseline + len(subregions)
    assert record_with_subregions.get_feature_count() == expected_count

    for index, region in enumerate(region_predictions):
        subregion = record_with_subregions.get_subregions()[index]
        self.assertEqual(subregion.type, "subregion")
        self.assertEqual(subregion.tool, "cassis")
        self.assertEqual(subregion.anchor, anchor)
        # every qualifier is expected as a one-element tuple
        expected_qualifiers = (
            ("genes", region.genes),
            ("promoters", region.promoters),
            ("gene_left", region.start.gene),
            ("gene_right", region.end.gene),
        )
        for name, value in expected_qualifiers:
            self.assertEqual(subregion.get_qualifier(name), (value, ))
def test_get_promoter_sets(self):
    """Check the motifs returned by generate_motifs for anchor index 5.

    Expected are Motif(plus, minus) for plus in 0..2 and, for each plus,
    minus from 3 - plus up to 5, in that order.
    """
    bases = "acgtacgtacgtacgt"
    promoters = [
        Promoter("gene1", 1, 1, seq=Seq(bases)),
        Promoter("gene2", 2, 2, seq=Seq(bases)),
        CombinedPromoter("gene3", "gene4", 3, 4, seq=Seq(bases)),
        Promoter("gene5", 5, 5, seq=Seq(bases)),
        Promoter("gene6", 6, 6, seq=Seq(bases)),
        # promoter with index=5 --> anchor promoter
        Promoter("gene7", 7, 7, seq=Seq(bases)),
        Promoter("gene8", 8, 8, seq=Seq(bases)),
        Promoter("gene9", 9, 9, seq=Seq(bases)),
    ]
    anchor_promoter = 5

    expected_motifs = []
    for plus in range(3):
        for minus in range(3 - plus, 6):
            expected_motifs.append(Motif(plus, minus))

    meme_dir = os.path.join(self.options.output_dir, "meme")
    generated = generate_motifs(meme_dir, anchor_promoter, promoters)
    self.assertEqual(generated, expected_motifs)
def test_get_islands(self):
    """Check the islands returned by get_islands for two motifs with hits."""
    narrow_motif = Motif(0, 3, hits={"gene1": 1, "gene2": 2})
    wide_motif = Motif(0, 4, hits={"gene2": 3, "gene4": 2, "gene5": 1})
    # gene2 will be the anchor promoter
    anchor_promoter = 1
    promoters = [
        Promoter("gene%d" % number, number * 10, number * 10 + 4)
        for number in range(1, 7)
    ]

    # resulting in 2 different islands (this example)
    # promoter (pos): 1 2 3 4 5 6
    # binding sites:  1 2 0 0 0 0
    # island:         |-|
    first_island = Island(promoters[0], promoters[1], narrow_motif)
    # promoter (pos): 1 2 3 4 5 6
    # binding sites:  0 3 0 2 1 0
    # island:           |-----|
    second_island = Island(promoters[1], promoters[4], wide_motif)

    found = get_islands(anchor_promoter, [narrow_motif, wide_motif], promoters)
    assert found == [first_island, second_island]
def test_sort_by_abundance(self):
    """Check the order of predictions returned by create_predictions.

    Expected order: abundance as high as possible first, then score as low
    as possible.
    """
    def marker(gene, motif, abundance):
        # build a ClusterMarker and set its abundance explicitly
        built = ClusterMarker(gene, motif)
        built.abundance = abundance
        return built

    islands = [
        # island 1: [gene1 -- gene2]
        Island(Promoter("gene1", 1, 1), Promoter("gene2", 2, 2),
               Motif(0, 3, score=3, hits={"gene1": 1, "gene2": 1})),
        # island 2: [gene2 -- gene5]
        Island(Promoter("gene2", 2, 2), Promoter("gene5", 5, 5),
               Motif(3, 0, score=2,
                     hits={"gene2": 1, "gene3": 1, "gene4": 1, "gene5": 1})),
        # island 3: [gene1 -- gene5]
        Island(Promoter("gene1", 1, 1), Promoter("gene5", 5, 5),
               Motif(3, 3, score=1,
                     hits={"gene1": 1, "gene2": 1, "gene3": 1, "gene4": 1,
                           "gene5": 1})),
    ]
    # left border: 2x gene1, 1x gene2
    # right border: 2x gene5, 1x gene2

    expected_clusters = [
        # cluster 1: [gene1 -- gene5] --> abundance 2+2 (most abundant)
        ClusterPrediction(marker("gene1", Motif(3, 3, score=1), 2),
                          marker("gene5", Motif(3, 3, score=1), 2)),
        # cluster 3: [gene2 -- gene5] --> abundance 1+2, score 2+1 (better/lower)
        ClusterPrediction(marker("gene2", Motif(3, 0, score=2), 1),
                          marker("gene5", Motif(3, 3, score=1), 2)),
        # cluster 2: [gene1 -- gene2] --> abundance 2+1, score 1+3 (worse, higher)
        ClusterPrediction(marker("gene1", Motif(3, 3, score=1), 2),
                          marker("gene2", Motif(0, 3, score=3), 1)),
        # cluster 4: [gene2 -- gene2] --> abundance 1+1
        ClusterPrediction(marker("gene2", Motif(3, 0, score=2), 1),
                          marker("gene2", Motif(0, 3, score=3), 1)),
    ]

    # abundance: as high as possible
    # score: as low as possible
    self.assertEqual(create_predictions(islands), expected_clusters)