def test_dna_genomic_interval_access(tmpdir):
    """Check indexing of a Bioseq dataset by genomic interval.

    Without ``store_whole_genome=True`` indexing by an interval object is
    expected to raise.  With it, the integer index, the interval object and
    a ``(chrom, start, end)`` key (as a tuple or unpacked) must all yield
    the same array.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    common = dict(
        refgenome=os.path.join(resources, 'sample_genome.fa'),
        roi=os.path.join(resources, 'sample.gtf'),
        storage='ndarray',
        order=2,
    )

    data = Bioseq.create_from_refgenome('train', **common)
    with pytest.raises(Exception):
        # due to store_whole_genome = False
        data[data.gindexer[0]]

    data = Bioseq.create_from_refgenome('train', store_whole_genome=True,
                                        **common)
    np.testing.assert_equal(data[0], data[data.gindexer[0]])

    first = data.gindexer[0]
    chrom, start, end = first.chrom, first.start, first.end
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])
def test_dnaconv():
    """Compare ``DnaConv2D`` with a hand-built two-strand convolution.

    The wrapped layer's output is compared against the element-wise maximum
    of (a) its forward layer applied to the input and (b) the same layer
    applied to the reversed, complemented input and reversed back.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome,
                                       storage='ndarray',
                                       roi=regions, order=1)

    xin = Input(dna.shape[1:])

    # model 1: the DnaConv2D wrapper itself
    both_strands = Model(
        xin, DnaConv2D(Conv2D(30, (21, 1), activation='relu'))(xin))
    res_wrapper = both_strands.predict(dna[0])[0, 0, 0, :]

    # model 2: forward only, sharing the wrapper's forward layer
    clayer = both_strands.layers[1].forward_layer
    forward_only = Model(xin, clayer(xin))
    res_forward = forward_only.predict(dna[0])[0, 0, 0, :]

    # model 3: same layer applied to the reversed, complemented input
    flipped_input = Reverse()(Complement()(xin))
    reverse_only = Model(xin, Reverse()(clayer(flipped_input)))
    res_reverse = reverse_only.predict(dna[0])[0, 0, 0, :]

    expected = np.maximum(res_reverse, res_forward)
    np.testing.assert_allclose(res_wrapper, expected, rtol=1e-4)
def test_dnaconv2():
    """Check ``DnaConv2D`` wrapping of an already-built ``Conv2D`` layer.

    This checks if the DnaConv2D layer is instantiated correctly if the
    conv2d layer has been instantiated beforehand: the wrapper's forward
    layer must carry the same kernel weights, and the original layer must
    still own exactly two weight tensors.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome,
                                       storage='ndarray',
                                       roi=regions, order=1)

    xin = Input(dna.shape[1:])

    # Build the convolution layer before handing it to the wrapper.
    clayer = Conv2D(30, (21, 1), activation='relu')
    clayer(xin)

    model = Model(xin, DnaConv2D(clayer)(xin))

    # The prediction value itself is not inspected; the call only has to work.
    model.predict(dna[0])[0, 0, 0, :]

    original_kernel = clayer.get_weights()[0]
    wrapped_kernel = model.layers[1].forward_layer.get_weights()[0]
    np.testing.assert_allclose(original_kernel, wrapped_kernel)
    assert len(clayer.weights) == 2
def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions with ``Cover.create_from_array``.

    DNA and peak labels are loaded with ``store_whole_genome=False``, a
    DnaConv2D model predicts on the DNA, and the predictions are converted
    into a Cover object that must agree with the raw prediction array in
    shape, content and number of regions.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # load the dataset
    # The pseudo genome represents just a concatenation of all sequences
    # in sample.fa and sample2.fa. Therefore, the results should be almost
    # identical to the models obtained from classify_fasta.py.
    genome_fasta = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # ROI contains regions spanning positive and negative examples
    roi_bed = resource_filename('janggu', 'resources/roi_train.bed')
    # the peak file only contains positive examples
    peak_bed = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_fasta,
        roi=roi_bed,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=False,
        datatags=['ref'])

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_bed,
        bedfiles=peak_bed,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=False,
        datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        with inputs.use('dna') as layer:
            conv = Conv2D(params[0], (params[1], 1), activation=params[2])
            layer = DnaConv2D(conv)(layer)

        pooled = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1], name='motif')(layer)
        return inputs, pooled

    K.clear_session()

    # create a new model object
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)
    model.compile(optimizer='adadelta', loss='binary_crossentropy',
                  metrics=['acc'])

    pred = model.predict(dna)
    cov_out = Cover.create_from_array('BindingProba', pred, labels.gindexer,
                                      store_whole_genome=False)

    assert pred.shape == cov_out.shape
    np.testing.assert_equal(pred, cov_out[:])

    n_pred = len(pred)
    assert len(cov_out.gindexer) == n_pred
    assert len(cov_out.garray.handle) == n_pred
def get_data(refgenome, flank):
    """Load the DNA for the regions in ``input.bed`` and reshape it.

    Args:
        refgenome: Reference genome forwarded to
            ``Bioseq.create_from_refgenome``.
        flank: Flank forwarded to ``Bioseq.create_from_refgenome``; it also
            determines the second dimension of the result (``flank * 2 + 1``).

    Returns:
        The dataset reshaped with ``np.reshape`` to
        ``(n, flank * 2 + 1, 4, 1)``, where ``n`` is the first dimension of
        the loaded dataset.  ``n`` is also printed.
    """
    sequences = Bioseq.create_from_refgenome(name='dna',
                                             refgenome=refgenome,
                                             roi="input.bed",
                                             flank=flank)
    n_regions = sequences.shape[0]
    print(n_regions)

    target_shape = (n_regions, flank * 2 + 1, 4, 1)
    return np.reshape(sequences, target_shape)
def complement_layer(order):
    """Check the ``Complement`` layer for a given k-mer ``order``.

    Applying the layer once must change the input, applying it twice must
    restore it.  ``binsize`` and ``flank`` are read from a scope outside
    this function (not visible in this chunk).
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=genome,
                                        roi=regions,
                                        storage='ndarray',
                                        binsize=binsize,
                                        flank=flank,
                                        order=order)

    dna_in = Input(shape=data.shape[1:], name='dna')
    cmod = Model(dna_in, Complement()(dna_in))

    # actual shape of DNA
    original = data[0]
    once = cmod.predict(original)
    twice = cmod.predict(once)

    # A single complement must differ from the input ...
    with pytest.raises(Exception):
        np.testing.assert_equal(original, once)
    # ... while complementing twice gives the input back.
    np.testing.assert_equal(original, twice)
def test_split_train_test():
    """Check ``split_train_test`` with chr2 held out.

    Both a single dataset and a list of datasets are split; each half is
    expected to contain 50 of the 100 regions.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome,
                                       storage='ndarray',
                                       roi=regions,
                                       binsize=200, stepsize=200,
                                       order=1,
                                       store_whole_genome=True)

    # single dataset
    train, test = split_train_test(dna, holdout_chroms='chr2')
    assert len(train) == 50
    assert len(test) == 50
    assert len(dna) == len(train) + len(test)

    # list of datasets: the split is applied element-wise
    train_list, test_list = split_train_test([dna, dna],
                                             holdout_chroms='chr2')
    assert len(train_list[0]) == 50
    assert len(test_list[0]) == 50
    assert len(dna) == len(train_list[0]) + len(test_list[0])
def get_data(params):
    """Create training, validation and test (sequence, labels) pairs.

    Labels and sequences are loaded once for ``train_roi`` with
    ``store_whole_genome=True``; the validation and test sets are views of
    those datasets on ``val_roi`` and ``test_roi``.  ``bedfiles``,
    ``refgenome`` and the three ROI names come from a scope outside this
    function (not visible in this chunk).

    Args:
        params: Mapping providing ``'order'`` and ``'flank'`` for the
            sequence dataset.

    Returns:
        ``((train_seq, train_labels), (val_seq, val_labels),
        (test_seq, test_labels))`` with every label dataset wrapped in
        ``ReduceDim``.
    """
    labels_train = Cover.create_from_bed('labels',
                                         bedfiles=bedfiles,
                                         roi=train_roi,
                                         resolution=200,
                                         store_whole_genome=True,
                                         storage='sparse',
                                         cache=True,
                                         dtype='int8',
                                         minoverlap=.5,
                                         verbose=True)
    labels_test = view(labels_train, test_roi)
    labels_val = view(labels_train, val_roi)

    seq_train = Bioseq.create_from_refgenome('dna',
                                             refgenome=refgenome,
                                             roi=train_roi,
                                             store_whole_genome=True,
                                             storage='ndarray',
                                             cache=True,
                                             order=params['order'],
                                             flank=params['flank'],
                                             verbose=True)
    seq_test = view(seq_train, test_roi)
    seq_val = view(seq_train, val_roi)

    train_pair = (seq_train, ReduceDim(labels_train))
    val_pair = (seq_val, ReduceDim(labels_val))
    test_pair = (seq_test, ReduceDim(labels_test))
    return (train_pair, val_pair, test_pair)
def test_dna_dims_order_2(tmpdir):
    """Check shape and content of an order-2 (dinucleotide) Bioseq dataset.

    The dataset must hold 100 regions of shape ``(199, 1, 16)``.  The first
    ten positions of region 0 and the last ten positions of region 50 are
    compared against one-hot rows derived from the expected sequences.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=genome,
                                        roi=regions,
                                        binsize=200,
                                        storage='ndarray',
                                        order=2)

    # for order 2
    assert len(data) == 100
    assert data.shape == (100, 199, 1, 16)

    def _dinucleotide_one_hot(seq):
        # One row per overlapping dinucleotide; the active channel is
        # 4 * index(first base) + index(second base) with bases ordered ACGT.
        alphabet = 'ACGT'
        channels = [4 * alphabet.index(left) + alphabet.index(right)
                    for left, right in zip(seq, seq[1:])]
        return np.eye(16, dtype='int8')[channels]

    # the correctness of the sequence extraction was also
    # validated using:
    # >bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGAC...
    # -> AT TT TG GT TG GG GT TG GA AC
    np.testing.assert_equal(data[0][0, :10, 0, :],
                            _dinucleotide_one_hot('ATTGTGGTGAC'))

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaag...
    # this sequence is read from the reverse strand
    # so we have ...cttgcttcccc
    # -> CT TT TG GC CT TT TC CC CC CC
    np.testing.assert_equal(data[50][0, -10:, 0, :],
                            _dinucleotide_one_hot('CTTGCTTCCCC'))
def test_janggu_variant_prediction(tmpdir):
    """Run ``predict_variant_effect`` for orders 1 to 3 and check its output.

    For every order a small dense model is created, variant effects for
    ``sample.vcf`` are written to the output folder, and the resulting
    ``scores.hdf5`` and ``snps.bed.gz`` files are loaded back into a Cover
    object.  The difference score is expected to be non-zero at
    ``chr2:59-60`` and zero at ``chr2:54-55``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_dir = os.environ['JANGGU_OUTPUT']

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    # These inputs and output locations do not depend on the order.
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')
    scores_file = os.path.join(output_dir, 'scores.hdf5')
    snps_file = os.path.join(output_dir, 'snps.bed.gz')

    def _cnn_model(inputs, inp, oup, params):
        # 50 is the binsize of the dataset created below; a k-mer encoding
        # of order k leaves 50 - k + 1 positions with 4**k channels.
        inputs = Input(
            (50 - params['order'] + 1, 1, pow(4, params['order'])))
        layer = Flatten()(inputs)
        layer = Dense(params['hiddenunits'])(layer)
        output = Dense(4, activation='sigmoid')(layer)
        return inputs, output

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        model = Janggu.create(_cnn_model,
                              modelparams={'hiddenunits': 2, 'order': order},
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_dir)

        assert os.path.exists(scores_file)
        assert os.path.exists(snps_file)

        # The context manager closes the HDF5 file even when one of the
        # assertions below fails.
        with h5py.File(scores_file, 'r') as f:
            gindexer = GenomicIndexer.create_from_file(snps_file, None, None)

            cov = Cover.create_from_array('snps', f['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
def test_janggu_use_dnaconv_max(tmpdir):
    """Compare ``DnaConv2D(merge_mode='max')`` with an explicit construction.

    Model 1 uses the DnaConv2D wrapper.  Model 2 applies one shared Conv2D
    to the input and to its reversed, complemented version and merges both
    with ``Maximum``.  After copying the weights, both models must predict
    the same values.  Model 1 is then compiled, saved and reloaded by name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, 'sample.bed')
    posfile = os.path.join(resources, 'positive.bed')
    refgenome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file, order=1)

    @inputlayer
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            wrapper = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                                merge_mode='max', name='bothstrands')
            layer = wrapper(inlayer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn1')

    pred_wrapper = bwm1.predict(dna[1:2])
    weights = bwm1.kerasmodel.get_layer('bothstrands').get_weights()

    @inputlayer
    def _cnn_model2(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            conv = Conv2D(5, (3, 1), name='singlestrand')
            forward = conv(inlayer)
            reverse = Reverse()(conv(Complement()(Reverse()(inlayer))))
            layer = Maximum()([forward, reverse])
        return inputs, layer

    bwm2 = Janggu.create(_cnn_model2, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn2')

    # Give the explicit model the weights of the wrapper model.
    bwm2.kerasmodel.get_layer('singlestrand').set_weights(weights)

    pred_manual = bwm2.predict(dna[1:2])
    np.testing.assert_allclose(pred_wrapper, pred_manual,
                               rtol=1e-4, atol=1e-3)

    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = bwm1._storage_path(bwm1.name, outputdir=tmpdir.strpath)

    bwm1.save()
    bwm1.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn1')
def test_dna_dims_order_1_from_reference(tmpdir):
    """Check an order-1 whole-genome Bioseq with an externally set indexer.

    The dataset is created without a ROI (``store_whole_genome=True``) and
    a ``GenomicIndexer`` built from ``sample.gtf`` is attached afterwards.
    Chromosome handles, dataset length, shape and two ten-base stretches
    are verified.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.gtf')
    genome = os.path.join(resources, 'sample_genome.fa')

    gindexer = GenomicIndexer.create_from_file(regions, 200, 200)

    data = Bioseq.create_from_refgenome('train', refgenome=genome,
                                        storage='ndarray',
                                        order=1,
                                        store_whole_genome=True)
    data.gindexer = gindexer

    handle = data.garray.handle
    assert len(handle) == 2
    assert 'chr1' in handle
    assert 'chr2' in handle

    # for order 1
    assert len(data) == 100
    assert data.shape == (100, 200, 1, 4)

    # Channel indices of the one-hot encoding.
    A, C, G, T = range(4)
    one_hot = np.eye(4, dtype='int8')

    # the correctness of the sequence extraction was also
    # validated using:
    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGA...
    # this sequence is read from the forward strand
    np.testing.assert_equal(data[0][0, :10, 0, :],
                            one_hot[[A, T, T, G, T, G, G, T, G, A]])

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaa...
    # this sequence is read from the reverse strand
    # so we have ...ttgcttcccc
    np.testing.assert_equal(data[50][0, -10:, 0, :],
                            one_hot[[T, T, G, C, T, T, C, C, C, C]])
def get_data(params):
    """Assemble the training inputs and labels selected by ``params``.

    Labels are read from the BAM file given by the ``RNA`` template.
    Depending on ``params['inputs']`` the inputs consist of DNA
    (``'dna_only'``, ``'epi_dna'``) and/or a BAM track from the ``DNASE``
    template plus a bigwig track from the ``H3K4me3`` template
    (``'epi_only'``, ``'epi_dna'``).  ``RNA``, ``DNASE``, ``H3K4me3``,
    ``REFGENOME`` and ``ROI_INPUT_TRAIN`` come from a scope outside this
    function (not visible in this chunk).

    Args:
        params: Mapping with the keys ``'traincell'``, ``'trainrep'``,
            ``'cageflank'`` and ``'inputs'``, plus ``'dnaflank'`` and
            ``'order'`` for DNA input and ``'dnaseflank'`` for the two
            coverage tracks.

    Returns:
        Tuple ``(train_input, train_labels)`` where ``train_input`` is a
        list of datasets.

    Raises:
        ValueError: If ``params['inputs']`` selects no input at all.
    """
    # Each coverage dataset gets its own ZScore instance.
    label_zscore = ZScore()
    train_labels = ReduceDim(
        Cover.create_from_bam(
            'geneexpr',
            bamfiles=RNA.format(params['traincell'], params['trainrep']),
            roi=ROI_INPUT_TRAIN,
            flank=params['cageflank'],
            conditions=['GeneExpr'],
            resolution=None,
            store_whole_genome=False,
            storage='ndarray',
            normalizer=[LogTransform(), label_zscore],
            stranded=False,
            cache=True),
        aggregator="mean")

    train_input = []

    if params['inputs'] in ['dna_only', 'epi_dna']:
        # DNA
        dna_train = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False)
        train_input.append(dna_train)

    if params['inputs'] in ['epi_only', 'epi_dna']:
        dnase_zscore = ZScore()
        dnase_train = ReduceDim(
            Cover.create_from_bam(
                'dnase',
                bamfiles=DNASE.format(params['traincell']),
                roi=ROI_INPUT_TRAIN,
                flank=params['dnaseflank'],
                resolution=None,
                store_whole_genome=False,
                normalizer=[LogTransform(), dnase_zscore],
                cache=True),
            aggregator="mean")
        train_input.append(dnase_train)

        h3k4_zscore = ZScore()
        h3k4_train = ReduceDim(
            Cover.create_from_bigwig(
                'h3k4',
                bigwigfiles=[H3K4me3.format(params['traincell'])],
                roi=ROI_INPUT_TRAIN,
                flank=params['dnaseflank'],
                store_whole_genome=False,
                normalizer=[LogTransform(), h3k4_zscore],
                cache=True),
            aggregator="mean")
        train_input.append(h3k4_train)

    if not train_input:
        raise ValueError('no input')

    return (train_input, train_labels)
def test_janggu_influence_genomic(tmpdir):
    """Check ``input_attribution`` on a genomic interval.

    The attribution for the first region of the dataset is compared with
    the attribution for the same region extended by one position on each
    side; after trimming the extra positions both must be equal.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    csvfile = os.path.join(data_path, 'sample.csv')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       roi=bed_file, order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    model = Janggu.create(_cnn_model, modelparams=(2,),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn')

    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    # check with an interval taken directly from the dataset's indexer
    iv = dna.gindexer[0]
    chrom, start, end = iv.chrom, iv.start, iv.end
    influence = input_attribution(model, dna,
                                  chrom=chrom, start=start, end=end)

    # check with an odd offset: one extra position on either side
    influence2 = input_attribution(model, dna,
                                   chrom=chrom, start=start - 1, end=end + 1)

    # Trimming the extra positions must recover the first attribution.
    np.testing.assert_equal(influence[0][:], influence2[0][:][:, 1:-1])
def test_dna_loading_from_seqrecord(tmpdir):
    """Create a Bioseq dataset from pre-loaded sequence records.

    The reference genome is first read with ``sequences_from_fasta`` and
    the result is passed as ``refgenome``.  The test only requires that the
    dataset creation completes.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.gtf')
    genome = os.path.join(resources, 'sample_genome.fa')

    records = sequences_from_fasta(genome)

    Bioseq.create_from_refgenome('train', refgenome=records,
                                 roi=regions,
                                 storage='ndarray',
                                 order=2)
def test_dna_first_last_channel():
    """Check the ``channel_last`` option of ``Bioseq.create_from_refgenome``.

    With ``channel_last=True`` the four nucleotide channels are the last
    axis, with ``channel_last=False`` they come right after the sample
    axis.  Transposing the channel-first data must reproduce the
    channel-last data.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.gtf')
    genome = os.path.join(resources, 'sample_genome.fa')

    channels_last = Bioseq.create_from_refgenome('train', refgenome=genome,
                                                 roi=regions,
                                                 storage='ndarray',
                                                 channel_last=True)
    assert channels_last.shape == (2, 10000, 1, 4)
    assert channels_last[0].shape == (1, 10000, 1, 4)

    channels_first = Bioseq.create_from_refgenome('train', refgenome=genome,
                                                  roi=regions,
                                                  storage='ndarray',
                                                  channel_last=False)
    assert channels_first.shape == (2, 4, 10000, 1)
    assert channels_first[0].shape == (1, 4, 10000, 1)

    # Move the channel axis to the end before comparing.
    moved = np.transpose(channels_first[0], (0, 2, 3, 1))
    np.testing.assert_equal(channels_last[0], moved)
def test_janggu_variant_streamer_order_12_ignore_ref_match(tmpdir):
    """Check ``VariantStreamer`` with ``ignore_reference_match=True``.

    For orders 1 and 2 and for an even and an odd binsize, the first record
    must be named ``'refmismatch'`` and its reference and alternative
    encodings must differ by an absolute sum of ``2 * order``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    for order in [1, 2]:
        dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        # even binsize first, then odd binsize
        for binsize in (10, 3):
            vcf = VariantStreamer(dna, vcffile, binsize=binsize,
                                  batch_size=1,
                                  ignore_reference_match=True)
            it_vcf = iter(vcf.flow())

            # annotated in the original test as "C to T"
            names, chroms, poss, ra, aa, reference, alternative = \
                next(it_vcf)
            print(names, chroms, poss, ra, aa)
            print(reference)
            print(alternative)

            assert names[0] == 'refmismatch'
            np.testing.assert_equal(
                np.abs(reference - alternative).sum(), 2 * order)
def test_dnabed_overreaching_ends_partial_genome():
    """Check regions whose flanks exceed the chromosome ends.

    The dataset is created with ``store_whole_genome=False``.  The sum of
    the first and last one-hot encoded regions is expected to be smaller
    than the window length of 42.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, "bed_test.bed")
    genome = os.path.join(resources, 'sample_genome.fa')

    binsize, flank = 2, 20
    bioseq = Bioseq.create_from_refgenome('test',
                                          refgenome=genome,
                                          roi=regions,
                                          binsize=binsize,
                                          flank=flank,
                                          store_whole_genome=False,
                                          storage='ndarray')

    assert len(bioseq) == 9
    assert bioseq.shape == (9, 2 + 2 * 20, 1, 4)

    first_total = bioseq[0].sum()
    last_total = bioseq[-1].sum()
    np.testing.assert_equal(first_total, 22)
    np.testing.assert_equal(last_total, 42 - 4)
def test_janggu_chr2_validation(tmpdir):
    """Fit a model with a chromosome name given as validation data.

    A small DnaConv2D model is trained on overlapping 200 bp windows while
    ``validation_data=['chr2']`` is passed to ``fit``.  The test only
    requires that fitting completes.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, 'sample.bed')
    posfile = os.path.join(resources, 'scored_sample.bed')
    refgenome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       binsize=200, stepsize=50,
                                       roi=bed_file, order=1)

    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=posfile,
                                 roi=bed_file,
                                 binsize=200, stepsize=50,
                                 resolution=None,
                                 flank=0,
                                 collapser='max',
                                 storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            wrapper = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                                merge_mode='max', name='bothstrands')
            layer = wrapper(inlayer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1, modelparams=(2,),
                         inputs=dna, outputs=ctcf,
                         name='dna_ctcf_HepG2-cnn1')

    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')

    # The returned value is not inspected.
    bwm1.fit(dna, ctcf, validation_data=['chr2'])
def test_subset_exclude_chrname_test():
    """Check ``subset`` when a chromosome name is excluded.

    Excluding ``'chr2'`` is expected to leave 50 regions.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome,
                                       storage='ndarray',
                                       roi=regions,
                                       binsize=200, stepsize=200,
                                       order=1,
                                       store_whole_genome=True)

    remaining = subset(dna, exclude_regions='chr2')

    assert len(remaining) == 50
def test_dna_dims_order_1_from_subset(tmpdir):
    """Check dimensions of an order-1 Bioseq restricted to the ROI.

    Indexing by integer and by interval object must agree, two handles are
    expected in the underlying array, and the dataset must contain two
    regions of 10000 positions each.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.gtf')
    genome = os.path.join(resources, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=genome,
                                        roi=regions,
                                        storage='ndarray',
                                        order=1)

    first_interval = data.gindexer[0]
    np.testing.assert_equal(data[0], data[first_interval])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
def test_view_bed_test():
    """Check ``view`` with the regions given as a BED file.

    Viewing the dataset on ``scored_sample.bed`` is expected to yield four
    regions.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    sub_regions = os.path.join(resources, 'scored_sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome,
                                       storage='ndarray',
                                       roi=regions,
                                       binsize=200, stepsize=200,
                                       order=1,
                                       store_whole_genome=True)

    restricted = view(dna, use_regions=sub_regions)

    assert len(restricted) == 4
def test_dnabed_overreaching_ends_whole_genome():
    """Check padding at chromosome ends with ``store_whole_genome=True``.

    The first region and an interval queried at ``chr1:29990-30010`` are
    expected to sum to fewer one-hot entries than their window length.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, "bed_test.bed")
    genome = os.path.join(resources, 'sample_genome.fa')

    binsize, flank = 2, 20
    bioseq = Bioseq.create_from_refgenome('test',
                                          refgenome=genome,
                                          roi=regions,
                                          binsize=binsize,
                                          flank=flank,
                                          store_whole_genome=True,
                                          storage='ndarray',
                                          cache=False)

    assert len(bioseq) == 9
    assert bioseq.shape == (9, 2 + 2 * 20, 1, 4)

    # test if beginning is correctly padded
    head_total = bioseq[0].sum()
    np.testing.assert_equal(head_total, 22)

    # test if end is correctly padded
    tail_total = bioseq['chr1', 29990, 30010].sum()
    np.testing.assert_equal(tail_total, 10)
def test_dna_props_extraction(tmpdir):
    """Check ``_data_props`` on a Bioseq dataset and on invalid input.

    The properties must contain an entry named ``'dna'`` with shape
    ``(200, 1, 4)``; passing a plain tuple is expected to raise.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome,
                                       storage='ndarray',
                                       roi=regions,
                                       binsize=200, stepsize=200,
                                       order=1)

    props = _data_props(dna)
    assert 'dna' in props
    assert props['dna']['shape'] == (200, 1, 4)

    # A tuple of plain numbers is not an accepted input.
    with pytest.raises(Exception):
        _data_props((0, ))
def test_dna_loading_from_seqrecord(tmpdir):
    """Create a whole-genome Bioseq from pre-loaded sequence records.

    The reference genome is read with ``sequences_from_fasta`` and passed
    as ``refgenome``.  Indexing by integer, by interval object and by
    ``(chrom, start, end)`` (as a tuple or unpacked) must agree.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.gtf')
    genome = os.path.join(resources, 'sample_genome.fa')

    records = sequences_from_fasta(genome)

    data = Bioseq.create_from_refgenome('train', refgenome=records,
                                        roi=regions,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=2)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])

    first = data.gindexer[0]
    chrom, start, end = first.chrom, first.start, first.end
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])
def reverse_layer(order):
    """Check the ``Reverse`` layer for a given k-mer ``order``.

    The layer's prediction must equal the input flipped along its second
    axis.  ``binsize`` and ``flank`` are read from a scope outside this
    function (not visible in this chunk).
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = os.path.join(resources, 'sample.bed')
    genome = os.path.join(resources, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=genome,
                                        roi=regions,
                                        storage='ndarray',
                                        binsize=binsize,
                                        flank=flank,
                                        order=order)

    dna_in = Input(shape=data.shape[1:], name='dna')
    rmod = Model(dna_in, Reverse()(dna_in))

    # actual shape of DNA
    dna = data[0]
    flipped = dna[:, ::-1, :, :]
    np.testing.assert_equal(flipped, rmod.predict(dna))
def test_janggu_variant_streamer_order_1_revcomp(tmpdir):
    """Check ``VariantStreamer`` output with a minus-strand annotation.

    The third VCF record is read once without annotation and once with an
    annotation interval ``chr2:110-130`` on the ``'-'`` strand.  The
    annotated reference and alternative arrays must equal the unannotated
    ones after flipping the second and the last axis.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    annot = BedTool([Interval('chr2', 110, 130, '-')])

    # even binsize, no annotation
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())
    next(it_vcf)  # skipped; annotated in the original test as "C to T"
    next(it_vcf)  # skipped; annotated in the original test as "C to T"
    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)

    # even binsize, with the minus-strand annotation
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1,
                          annotation=annot)
    it_vcf = iter(vcf.flow())
    next(it_vcf)  # skipped; annotated in the original test as "C to T"
    next(it_vcf)  # skipped; annotated in the original test as "C to T"
    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    # Show the arrays that were just read for the annotated case.
    print(reference2)
    print(alternative2)

    # Flipping the position axis and the channel axis of the annotated
    # arrays must give back the unannotated arrays.
    np.testing.assert_equal(reference, reference2[:, ::-1, :, ::-1])
    np.testing.assert_equal(alternative, alternative2[:, ::-1, :, ::-1])
def test_dna_dims_order_1_from_subset_dataframe(tmpdir):
    """Check that different ROI containers yield the same Bioseq dataset.

    The ROI is supplied as a pandas DataFrame, as a ``BedTool`` and as a
    list of intervals.  Each time the dataset must consist of two regions
    of 10000 positions whose one-hot encoding sums to 20000.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(resources, 'sample.gtf')
    refgenome = os.path.join(resources, 'sample_genome.fa')

    def _check_dataset(roi):
        # Build the dataset for one ROI representation and verify it.
        data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                            roi=roi,
                                            storage='ndarray',
                                            store_whole_genome=True,
                                            order=1)
        np.testing.assert_equal(data[0], data[data.gindexer[0]])
        assert len(data.garray.handle) == 2

        # for order 1
        assert len(data) == 2
        assert data.shape == (2, 10000, 1, 4)
        assert data[:].sum() == 20000

    # 1) ROI as a DataFrame read from selected GTF columns
    frame = pandas.read_csv(
        bed_merged, sep='\t', header=None,
        usecols=[0, 2, 3, 4, 5, 6], skiprows=2,
        names=['chrom', 'name', 'start', 'end', 'score', 'strand'])
    # shift the start coordinate by one, as the original test does
    frame.start -= 1
    print(frame)
    _check_dataset(frame)

    # 2) ROI as a BedTool object
    _check_dataset(BedTool(bed_merged))

    # 3) ROI as a list of intervals
    _check_dataset([iv for iv in BedTool(bed_merged)])
def test_dna_dataset_sanity(tmpdir):
    """Check argument validation and caching of ``create_from_refgenome``.

    Invalid arguments must raise, the ``datatags`` and ``overwrite``
    arguments must emit a ``FutureWarning``, and a cache file must appear
    only after datasets have been created with ``cache=True``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, 'sample.bed')
    refgenome = os.path.join(resources, 'sample_genome.fa')

    create = Bioseq.create_from_refgenome

    # (name, keyword arguments) combinations that must be rejected
    rejected = [
        # name must be a string
        (1.23, dict(refgenome='', storage='ndarray', roi=bed_file, order=1)),
        ('train', dict(refgenome='', storage='ndarray', roi=bed_file,
                       order=1)),
        ('train', dict(refgenome='test', storage='ndarray', roi=bed_file,
                       order=1)),
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       order=0)),
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       flank=-1)),
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       binsize=0)),
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       stepsize=0)),
    ]
    for name, kwargs in rejected:
        with pytest.raises(Exception):
            create(name, **kwargs)

    # arguments that must trigger a FutureWarning
    for extra in (dict(datatags=['help']), dict(overwrite=True)):
        with pytest.warns(FutureWarning):
            create('train', refgenome=refgenome, storage='ndarray',
                   roi=bed_file, **extra)

    with pytest.raises(Exception):
        create('train', refgenome=refgenome, storage='step',
               roi=bed_file, order=1)

    assert not os.path.exists(
        os.path.join(tmpdir.strpath, 'train', 'storage.h5'))

    # combinations that must raise a ValueError specifically
    value_errors = [
        dict(refgenome=refgenome, storage='sparse', roi=None, order=1,
             store_whole_genome=True),
        dict(refgenome=refgenome, roi=bed_file, order=0,
             store_whole_genome=True),
        dict(refgenome=refgenome, roi=None, store_whole_genome=False),
    ]
    for kwargs in value_errors:
        with pytest.raises(ValueError):
            create('train', **kwargs)

    create('train', refgenome=refgenome, storage='ndarray',
           roi=None, order=1, store_whole_genome=True)

    cache_pattern = os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5')
    assert len(glob.glob(cache_pattern)) == 0

    print(refgenome)
    print(bed_file)

    create('train', refgenome=refgenome, storage='ndarray',
           roi=bed_file, order=1, cache=True)
    create('train', refgenome=refgenome, storage='hdf5',
           roi=bed_file, order=1, cache=True)

    # a cache file must exist now
    assert len(glob.glob(cache_pattern)) == 1

    # reload the cached file
    create('train', refgenome=refgenome, storage='hdf5',
           roi=bed_file, order=1, cache=True)
def test_janggu_variant_streamer_order_2(tmpdir):
    """Check ``VariantStreamer`` output for dinucleotide (order 2) encoding.

    For two bin sizes the first record must be named ``'refmismatch'`` and
    carry identical reference and alternative arrays.  For the following
    records the two dinucleotide positions overlapping the variant must be
    active in the expected channels of the reference and the alternative
    array.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    order = 2

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    # Expected active channels for the records following the leading
    # 'refmismatch' record, in stream order:
    # (reference left, reference right, alternative left, alternative right)
    # A dinucleotide XY maps to channel 4 * index(X) + index(Y), bases ACGT.
    expected_channels = [
        (1, 7, 3, 15),   # ACT -> ATT
        (7, 13, 5, 5),   # CTC -> CCC
        (8, 1, 10, 9),   # GAC -> GGC
        (6, 10, 4, 2),   # CGG -> CAG
    ]

    # (binsize, position of the left dinucleotide, number of records checked)
    for binsize, left, n_records in [(10, 3, 4), (5, 1, 2)]:
        right = left + 1

        vcf = VariantStreamer(dna, vcffile, binsize=binsize, batch_size=1)
        it_vcf = iter(vcf.flow())

        # The first record is flagged as a reference mismatch and is
        # returned with identical reference and alternative arrays.
        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
        print(names, chroms, poss, ra, aa)
        print(reference)
        print(alternative)
        assert names[0] == 'refmismatch'
        np.testing.assert_equal(reference, alternative)

        for ref_l, ref_r, alt_l, alt_r in expected_channels[:n_records]:
            names, chroms, poss, ra, aa, reference, alternative = \
                next(it_vcf)
            print(names, chroms, poss, ra, aa)
            print(reference)
            print(alternative)

            np.testing.assert_equal(reference[0, left, 0, ref_l], 1)
            np.testing.assert_equal(reference[0, right, 0, ref_r], 1)
            np.testing.assert_equal(alternative[0, left, 0, alt_l], 1)
            np.testing.assert_equal(alternative[0, right, 0, alt_r], 1)