def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions in a Cover built with store_whole_genome=False.

    A DNA dataset and a label dataset are created over the same ROI, a
    freshly created (never fitted) Janggu model predicts on the DNA, and
    the prediction array is turned into a Cover via
    ``Cover.create_from_array``. The resulting Cover must match the
    prediction array in shape, content and length.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Per the original note: the pseudo genome is a concatenation of all
    # sequences in sample.fa and sample2.fa, so results should be almost
    # identical to the models obtained from classify_fasta.py.
    genome_path = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # The ROI spans both positive and negative examples ...
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # ... whereas the peak file only contains positive examples.
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    # Both datasets share the same binning and storage layout.
    binning = dict(binsize=200, stepsize=200, store_whole_genome=False)

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        roi=roi_path,
        order=1,
        datatags=['ref'],
        **binning
    )
    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=peak_path,
        resolution=200,
        datatags=['train'],
        **binning
    )

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is (number of filters, filter length, activation).
        n_filters, filter_length, activation = params[0], params[1], params[2]
        with inputs.use('dna') as layer:
            conv = Conv2D(n_filters, (filter_length, 1), activation=activation)
            layer = DnaConv2D(conv)(layer)
        pooling = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1], name='motif')
        output = pooling(layer)
        return inputs, output

    K.clear_session()

    # Create a new model object; it is compiled but never fitted.
    model = Janggu.create(
        template=double_stranded_model_dnaconv,
        modelparams=(30, 21, 'relu'),
        inputs=dna,
        outputs=labels,
    )
    model.compile(
        optimizer='adadelta',
        loss='binary_crossentropy',
        metrics=['acc'],
    )

    pred = model.predict(dna)
    cov_out = Cover.create_from_array(
        'BindingProba', pred, labels.gindexer, store_whole_genome=False)

    n_pred = len(pred)
    assert pred.shape == cov_out.shape
    np.testing.assert_equal(pred, cov_out[:])
    assert len(cov_out.gindexer) == n_pred
    assert len(cov_out.garray.handle) == n_pred
def test_create_from_array_whole_genome_true(tmpdir):
    """Round-trip a whole-genome Cover through ``Cover.create_from_array``.

    The label coverage is loaded from five copies of the same peak file,
    dumped to an array, and re-wrapped once per storage backend. Every
    re-wrapped Cover must reproduce the original content and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # The ROI spans both positive and negative examples ...
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # ... whereas the peak file only contains positive examples.
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=[peak_path] * 5,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=True,
    )

    pred = labels[:]

    for storage in ('ndarray', 'sparse', 'hdf5'):
        print(storage)
        cov_out = Cover.create_from_array(
            'BindingProba',
            pred,
            labels.gindexer,
            cache=True,
            storage=storage,
            store_whole_genome=True,
        )

        np.testing.assert_equal(cov_out[:], labels[:])
        np.testing.assert_equal(cov_out.shape, labels.shape)
def test_janggu_variant_prediction(tmpdir):
    """Check the files written by ``predict_variant_effect`` for orders 1-3.

    For every sequence order the test builds a small dense model, runs
    ``predict_variant_effect`` on the sample VCF, and verifies that
    ``scores.hdf5`` and ``snps.bed.gz`` exist in the output folder. The
    ``diffscore`` dataset is then wrapped in a whole-genome Cover and
    probed on chr2: the bin 59-60 must carry a non-zero score while the
    bin 54-55 must be exactly zero.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_dir = os.environ['JANGGU_OUTPUT']
    scores_path = os.path.join(output_dir, 'scores.hdf5')
    snps_path = os.path.join(output_dir, 'snps.bed.gz')

    # Input files do not depend on the order, so resolve them once.
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        def _cnn_model(inputs, inp, oup, params):
            # Input length shrinks by order - 1, and the channel count is
            # 4 ** order, as computed from params['order'] below.
            inputs = Input(
                (50 - params['order'] + 1, 1, pow(4, params['order'])))
            layer = Flatten()(inputs)
            layer = Dense(params['hiddenunits'])(layer)
            output = Dense(4, activation='sigmoid')(layer)
            return inputs, output

        model = Janggu.create(_cnn_model,
                              modelparams={'hiddenunits': 2, 'order': order},
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna, vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_dir)

        assert os.path.exists(scores_path)
        assert os.path.exists(snps_path)

        # The context manager closes the HDF5 handle even when one of the
        # assertions below fails, so the file is not left open.
        with h5py.File(scores_path, 'r') as scores:
            gindexer = GenomicIndexer.create_from_file(snps_path, None, None)

            cov = Cover.create_from_array('snps', scores['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            # Presumably sample.vcf holds a variant at chr2:59 and none at
            # chr2:54 -- confirm against the resource file.
            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
# Build and compile the Keras model from the input/output tensors defined
# earlier in the script, then train it with a held-out validation set.
model = Model(xin, output)
model.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

hist = model.fit(
    DNA,
    LABELS,
    epochs=100,
    validation_data=(DNA_TEST, LABELS_TEST),
)

# Report the metrics of the last training epoch between two rulers.
final_loss = hist.history['loss'][-1]
final_acc = hist.history['acc'][-1]
print('#' * 40)
print('loss: {}, acc: {}'.format(final_loss, final_acc))
print('#' * 40)

# Convert the test-set predictions into a Cover object that shares the
# genomic coordinates of the test labels.
pred = model.predict(DNA_TEST)
cov_pred = Cover.create_from_array(
    'BindingProba', pred, LABELS_TEST.gindexer)

# The first four regions are shown as Oct4 examples ...
print('Prediction score examples for Oct4')
for i in range(4):
    print('{}.: {}'.format(i, cov_pred[i]))

# ... and the last four regions (indexed from the end) as Mafk examples.
print('Prediction score examples for Mafk')
for i in range(1, 5):
    print('{}.: {}'.format(i, cov_pred[-i]))

# Predictions (or feature activities) can finally be exported to bigwig.
cov_pred.export_to_bigwig(output_dir=args.path)
inputs=DNA, outputs=ReduceDim(LABELS)) model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['acc']) hist = model.fit(DNA, ReduceDim(LABELS), epochs=args.epochs) print('#' * 40) print('loss: {}, acc: {}'.format(hist.history['loss'][-1], hist.history['acc'][-1])) print('#' * 40) pred = model.predict(DNA_TEST) cov_pred = Cover.create_from_array('BindingProba', pred, LABELS_TEST.gindexer) print('Oct4 predictions scores should be greater than Mafk scores:') print('Prediction score examples for Oct4') for i in range(4): print('{}.: {}'.format(i, cov_pred[i])) print('Prediction score examples for Mafk') for i in range(1, 5): print('{}.: {}'.format(i, cov_pred[-i])) # Extract the 4th interval to perform input feature importance attribution # which represents an Oct4 bound region gi = DNA.gindexer[3] chrom = gi.chrom start = gi.start end = gi.end