Beispiel #1
0
 def test_306_BedDataSet_array_from_query(self):
     """Check the boolean array that array_from_query builds for a window.

     With a window from 10 to 20 and query intervals 15-25 and 5-13, the
     expected array is True everywhere except at the two positions (13 and
     14) that neither interval covers.
     """
     window_start, window_end = 10, 20
     # Tuples as handed to array_from_query; presumably
     # (chromosome index, start, end) -- TODO confirm against the library.
     intervals = [(0, 15, 25), (0, 5, 13)]
     expected = np.array([
         [True, True, True, False, False, True, True, True, True, True]
     ])

     dataset = peaksql.BedDataSet(DATABASE_BED, seq_length=10, stride=10)
     observed = dataset.array_from_query(intervals, window_start, window_end)

     # Element-wise comparison (with broadcasting) reduced to a single bool.
     assert np.all(observed == expected)
Beispiel #2
0
    def test_310_BedDataSet_random_pos_distribution(self):
        """Check that random positions are spread evenly over the chromosomes.

        Each per-chromosome count, as a fraction of all requested positions,
        must lie between 0.245 and 0.255.
        """
        total_positions = 100_000
        dataset = peaksql.BedDataSet(DATABASE_BED,
                                     seq_length=10,
                                     nr_rand_pos=total_positions)

        # Undo the cumulative sum by subtracting the array shifted one slot to
        # the right. The first element wraps around to the last entry, so it
        # is dropped.
        shifted = np.roll(dataset.cumsum, shift=1)
        per_chromosome = (dataset.cumsum - shifted)[1:]

        # Chromosomes are of equal size, so each should receive an equal share.
        for positions in per_chromosome:
            share = positions / total_positions
            assert 0.245 <= share <= 0.255
Beispiel #3
0
    def test_302_BedDataSet_stride_sequences(self):
        """Check that every strided sample equals one expected reference window.

        The reference DNA is cut into consecutive, non-overlapping windows of
        the same length as the dataset's seq_length/stride, each window is
        one-hot encoded, and every sequence yielded by the dataset must match
        at least one of them.
        """
        window = 10
        dataset = peaksql.BedDataSet(DATABASE_BED,
                                     seq_length=window,
                                     stride=window)

        all_dna = ("AAAACCCCGGGGTTTTAAACCCGGGTTTAACCGGTTACGT" +
                   "TTTTGGGGCCCCAAAATTTGGGCCCAAATTGGCCAATGCA" +
                   "ATGCGTAGCTGATCGATGCTAGCTAGCTAGCTAGCTAAAA" +
                   "ATGGTGAATGTGAGTAGTGATGATGAGTGTAGTGAGGGGG")

        dna_strided = [
            all_dna[i:i + window] for i in range(0, len(all_dna), window)
        ]
        dna_onehot = [
            peaksql.util.sequence_to_onehot(dna) for dna in dna_strided
        ]
        for seq, _label in dataset:
            # any() rather than np.sum(<generator>): passing a generator to
            # np.sum is deprecated and only worked by falling back to the
            # builtin sum. any() expresses "matches at least one window"
            # directly and stops at the first hit.
            assert any(
                np.all(seq == potential_seq) for potential_seq in dna_onehot
            ), "strided sequence does not match any expected reference window"
Beispiel #4
0
 def test_309_BedDataSet_random_pos_sequences(self):
     """Check that every randomly positioned sample occurs in the reference DNA.

     Each sequence yielded by the dataset is compared against every
     full-length window of every one-hot encoded reference sequence; the
     test fails if a sample matches none of them.
     """
     seq_length = 10
     dataset = peaksql.BedDataSet(DATABASE_BED,
                                  seq_length=seq_length,
                                  nr_rand_pos=20)
     all_dna = [
         "AAAACCCCGGGGTTTTAAACCCGGGTTTAACCGGTTACGT",
         "TTTTGGGGCCCCAAAATTTGGGCCCAAATTGGCCAATGCA",
         "ATGCGTAGCTGATCGATGCTAGCTAGCTAGCTAGCTAAAA",
         "ATGGTGAATGTGAGTAGTGATGATGAGTGTAGTGAGGGGG",
     ]
     dna_onehot = [peaksql.util.sequence_to_onehot(dna) for dna in all_dna]
     for seq, _label in dataset:
         found = any(
             np.all(seq == onehot[start:start + seq_length])
             for dna, onehot in zip(all_dna, dna_onehot)
             # "+ 1" includes the last full-length window (the one ending at
             # the final base); a fixed range(0, 30) skipped it, so a valid
             # sample from the very end of a sequence could never be found.
             for start in range(len(dna) - seq_length + 1)
         )
         assert found, "random sequence not found in any reference sequence"
 def test_402_Integration_PyTorch_DataLoader(self):
     """Check the batch shapes when the dataset is wrapped in a DataLoader.

     Every batch must hold sequences of shape (batch, seq_length, 4) and
     labels of shape (batch, 1).
     """
     batch_size = 10
     seq_length = 3
     dataset = peaksql.BedDataSet(DATABASE_BED,
                                  nr_rand_pos=100,
                                  seq_length=seq_length)
     loader = DataLoader(dataset, batch_size=batch_size)

     expected_seq_shape = (batch_size, seq_length, 4)
     expected_label_shape = (batch_size, 1)
     for seq_batch, label_batch in loader:
         # Convert the shape objects to plain tuples before comparing.
         assert tuple(seq_batch.shape) == expected_seq_shape
         assert tuple(label_batch.shape) == expected_label_shape
 def test_401_iterable(self):
     """Check the shapes of samples when iterating the dataset directly.

     Each item must unpack into a (3, 4) sequence and a (1,) label.
     """
     expected_seq_shape = (3, 4)
     expected_label_shape = (1,)
     dataset = peaksql.BedDataSet(DATABASE_BED, nr_rand_pos=100, seq_length=3)
     for sequence, target in dataset:
         assert sequence.shape == expected_seq_shape
         assert target.shape == expected_label_shape
Beispiel #7
0
 def test_308_BedDataSet_random_pos_length(self):
     """Check that the dataset length equals the requested nr_rand_pos."""
     requested_positions = 20
     dataset = peaksql.BedDataSet(DATABASE_BED,
                                  seq_length=10,
                                  nr_rand_pos=requested_positions)
     observed_length = len(dataset)
     assert observed_length == requested_positions
Beispiel #8
0
 def test_305_Bed_label_fraction(self):
     """Check the labels produced by `fraction` with ratio set to 0.4.

     The input is `self.positions`, a fixture defined outside this method.
     """
     expected = [False, True, False, True]
     dataset = peaksql.BedDataSet(DATABASE_BED, seq_length=10, stride=10)
     dataset.ratio = 0.4

     labels = dataset.fraction(self.positions)
     # Element-wise comparison against the expected labels, then reduce.
     matches = labels == expected
     assert all(matches)
Beispiel #9
0
 def test_304_Bed_label_all(self):
     """Check the labels produced by `all` for `self.positions`.

     `self.positions` is a fixture defined outside this method.
     """
     expected = [False, False, False, True]
     dataset = peaksql.BedDataSet(DATABASE_BED, seq_length=10, stride=10)

     labels = dataset.all(self.positions)
     # Element-wise comparison against the expected labels, then reduce.
     matches = labels == expected
     assert all(matches)
Beispiel #10
0
 def test_301_BedDataSet_stride_length(self):
     """Check that a seq_length=10, stride=10 dataset reports 16 samples."""
     expected_length = 16
     dataset = peaksql.BedDataSet(DATABASE_BED, seq_length=10, stride=10)
     observed_length = len(dataset)
     assert observed_length == expected_length