def _model():
    """Return a compiled Janggu model mapping a 10-feature input to one sigmoid unit."""
    input_layer = Input((10,), name='x')
    sigmoid_unit = Dense(1, name='y', activation='sigmoid')

    model = Janggu(inputs=input_layer,
                   outputs=sigmoid_unit(input_layer),
                   name='test_model')
    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
def test_janggu_use_dnaconv_max(tmpdir):
    """Compare DnaConv2D(merge_mode='max') against a hand-assembled equivalent.

    The first model wraps a Conv2D in DnaConv2D; the second applies one shared
    Conv2D to the input and to its reverse complement and merges both with
    Maximum. After copying the weights over, both predictions must agree
    within tolerance. Finally the first model is compiled, saved and reloaded
    by name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    # Not referenced further down; retained so the rewrite mirrors the original.
    posfile = os.path.join(resources, 'positive.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       storage='ndarray',
                                       roi=roi_path,
                                       order=1)

    @inputlayer
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            both_strands = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                                     merge_mode='max',
                                     name='bothstrands')
            layer = both_strands(inlayer)
        return inputs, layer

    wrapped = Janggu.create(_cnn_model1, modelparams=(2,),
                            inputs=dna, name='dna_ctcf_HepG2-cnn1')
    p1 = wrapped.predict(dna[1:2])
    weights = wrapped.kerasmodel.get_layer('bothstrands').get_weights()

    @inputlayer
    def _cnn_model2(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            conv = Conv2D(5, (3, 1), name='singlestrand')
            forward = conv(inlayer)
            flip_back = Reverse()
            revcomp_input = Complement()(Reverse()(inlayer))
            backward = flip_back(conv(revcomp_input))
            layer = Maximum()([forward, backward])
        return inputs, layer

    manual = Janggu.create(_cnn_model2, modelparams=(2,),
                           inputs=dna, name='dna_ctcf_HepG2-cnn2')
    manual.kerasmodel.get_layer('singlestrand').set_weights(weights)
    p2 = manual.predict(dna[1:2])

    np.testing.assert_allclose(p1, p2, rtol=1e-4, atol=1e-3)

    # Persist the wrapped model and make sure it can be restored by name.
    wrapped.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = wrapped._storage_path(wrapped.name, outputdir=tmpdir.strpath)
    wrapped.save()
    wrapped.summary()
    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn1')
def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions in a Cover via create_from_array (store_whole_genome=False).

    The resulting Cover must have the same shape, values and length as the
    raw prediction array.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Load the dataset.
    # The pseudo genome is just a concatenation of all sequences in sample.fa
    # and sample2.fa, so the results should be almost identical to the models
    # obtained from classify_fasta.py.
    genome_fasta = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # The ROI contains regions spanning positive and negative examples.
    roi_bed = resource_filename('janggu', 'resources/roi_train.bed')
    # The peak file only contains positive examples.
    peak_bed = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_fasta,
                                       roi=roi_bed,
                                       binsize=200,
                                       stepsize=200,
                                       order=1,
                                       store_whole_genome=False,
                                       datatags=['ref'])

    labels = Cover.create_from_bed('peaks',
                                   roi=roi_bed,
                                   bedfiles=peak_bed,
                                   binsize=200,
                                   stepsize=200,
                                   resolution=200,
                                   store_whole_genome=False,
                                   datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        with inputs.use('dna') as layer:
            conv = Conv2D(params[0], (params[1], 1), activation=params[2])
            layer = DnaConv2D(conv)(layer)
        pooling = LocalAveragePooling2D(window_size=layer.shape.as_list()[1],
                                        name='motif')
        output = pooling(layer)
        return inputs, output

    modeltemplate = double_stranded_model_dnaconv

    K.clear_session()

    # Create a new model object.
    model = Janggu.create(template=modeltemplate,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)
    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    pred = model.predict(dna)
    cov_out = Cover.create_from_array('BindingProba', pred, labels.gindexer,
                                      store_whole_genome=False)

    n_pred = len(pred)
    assert pred.shape == cov_out.shape
    np.testing.assert_equal(pred, cov_out[:])
    assert len(cov_out.gindexer) == n_pred
    assert len(cov_out.garray.handle) == n_pred
def test_janggu_train_predict_option1(tmpdir):
    """Train, predict and evaluate on dummy data.

    The model is created with Janggu.create from Array datasets
    (random 100x10 inputs, random binary 100x1 labels). The test checks that
    the model storage file appears only after fitting and that the prediction
    shape matches the label shape.
    """
    # The docstring must be the first statement; it previously followed this
    # assignment and was therefore a no-op string expression.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("X", np.random.random((100, 10)))
    outputs = Array('y', np.random.randint(2, size=(100, 1)),
                    conditions=['random'])

    @inputlayer
    @outputdense('sigmoid')
    def test_model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(test_model,
                        inputs=inputs,
                        outputs=outputs,
                        name='nptest')
    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)
    # Nothing has been written before training ...
    assert not os.path.exists(storage)

    bwm.fit(inputs, outputs, epochs=2, batch_size=32)

    # ... and the storage file exists once fit() has returned.
    assert os.path.exists(storage)

    pred = bwm.predict(inputs)
    np.testing.assert_equal(len(pred[:, np.newaxis]), len(inputs))
    np.testing.assert_equal(pred.shape, outputs.shape)
    bwm.evaluate(inputs, outputs)
def test_output_export_tsne(tmpdir):
    """Run the t-SNE exporter on the 'hidden' layer and expect PNG and EPS plots."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("x", numpy.random.random((100, 10)))
    outputs = Array('y', numpy.random.randint(2, size=(100, 1)),
                    conditions=['random'])

    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        with inputs.use('x') as layer:
            outputs = Dense(3, name='hidden')(layer)
        return inputs, outputs

    bwm = Janggu.create(_model,
                        inputs=inputs,
                        outputs=outputs,
                        name='nptest')
    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    tsne_scorer = Scorer('tsne', exporter=export_tsne)

    # First export without exporter options, then request the 'eps' format.
    bwm.predict(inputs, layername='hidden', callbacks=[tsne_scorer])
    bwm.predict(inputs, layername='hidden', callbacks=[tsne_scorer],
                exporter_kwargs={'fform': 'eps'})

    # Check that both plots were produced.
    for plot_file in ("tsne.png", "tsne.eps"):
        assert os.path.exists(
            os.path.join(tmpdir.strpath, "evaluation", bwm.name,
                         'hidden', plot_file))
def test_janggu_variant_prediction(tmpdir):
    """Run predict_variant_effect for sequence orders 1-3 and check its output.

    For every order a small dense model is created, variant effects for
    sample.vcf are written to the JANGGU_OUTPUT folder, and the exported
    'diffscore' dataset is loaded back as a Cover to verify that the score
    differences are non-zero at one position and zero at another.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    outdir = os.environ['JANGGU_OUTPUT']

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    # These paths do not depend on the order, so they are built once.
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')
    scores_file = os.path.join(outdir, 'scores.hdf5')
    snps_file = os.path.join(outdir, 'snps.bed.gz')

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        def _cnn_model(inputs, inp, oup, params):
            # Input length shrinks by order - 1; the channel count is 4**order.
            inputs = Input(
                (50 - params['order'] + 1, 1, pow(4, params['order'])))
            layer = Flatten()(inputs)
            layer = Dense(params['hiddenunits'])(layer)
            output = Dense(4, activation='sigmoid')(layer)
            return inputs, output

        model = Janggu.create(_cnn_model,
                              modelparams={'hiddenunits': 2, 'order': order},
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna, vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=outdir)

        assert os.path.exists(scores_file)
        assert os.path.exists(snps_file)

        # Use a context manager so the HDF5 handle is released even when one
        # of the assertions below fails.
        with h5py.File(scores_file, 'r') as f:
            gindexer = GenomicIndexer.create_from_file(snps_file, None, None)

            cov = Cover.create_from_array('snps', f['diffscore'], gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            # NOTE(review): presumably sample.vcf holds a variant at chr2:59
            # and none at chr2:54 - confirm against the resource file.
            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
def test_janggu_generate_name(tmpdir):
    """Create a model without an explicit name, save it and reload it by its name."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def _cnn_model(inputs, inp, oup, params):
        inputs = Input((10, 1))
        flat = Flatten()(inputs)
        output = Dense(params[0])(flat)
        return inputs, output

    # No name argument is passed, so Janggu has to supply one itself.
    model = Janggu.create(_cnn_model, modelparams=(2, ))
    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = model._storage_path(model.name, outputdir=model.outputdir)

    model.save()
    model.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name(model.name)
def test_janggu_influence_genomic(tmpdir):
    """Check that input_attribution is consistent for shifted genomic intervals.

    Attribution is computed for the first interval of the dataset and again
    for the same interval widened by one position on each side. After
    trimming the extra positions, both results must be equal.
    """
    # The docstring must be the first statement; it previously followed this
    # assignment and was therefore a no-op string expression.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    csvfile = os.path.join(data_path, 'sample.csv')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       roi=bed_file,
                                       order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn')
    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    # Check with an offset that coincides with a dataset interval.
    iv = dna.gindexer[0]
    chrom, start, end = iv.chrom, iv.start, iv.end
    influence = input_attribution(model, dna,
                                  chrom=chrom, start=start, end=end)

    # Check with an odd offset: one extra position on either side.
    influence2 = input_attribution(model, dna,
                                   chrom=chrom, start=start - 1, end=end + 1)

    # Drop the two flanking positions before comparing.
    np.testing.assert_equal(influence[0][:], influence2[0][:][:, 1:-1])
def test_localaveragepooling2D(tmpdir):
    """Check LocalAveragePooling2D(3) on a constant signal and on a 3-position ramp."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Case 1: each channel is constant along the pooled axis (values 1, 2, 3),
    # so the window average reproduces the input on the 8 remaining positions.
    constant_input = np.ones((1, 10, 1, 3)) + np.arange(3)

    layer_in = Input((10, 1, 3))
    layer_out = LocalAveragePooling2D(3)(layer_in)
    model = Janggu(layer_in, layer_out)

    pooled = model.predict(constant_input)
    np.testing.assert_equal(pooled, constant_input[:, :8, :, :])

    # Case 2: values 0, 1, 2 along the pooled axis, with the second channel
    # shifted by one; a single window of size 3 yields the means 1 and 2.
    ramp_input = np.arange(3.).reshape((1, 3, 1, 1)) + np.arange(2)

    layer_in = Input((3, 1, 2))
    layer_out = LocalAveragePooling2D(3)(layer_in)
    model = Janggu(layer_in, layer_out)

    pooled = model.predict(ramp_input)
    np.testing.assert_equal(pooled.shape, (1, 1, 1, 2))
    for channel, expected_mean in enumerate((1, 2)):
        np.testing.assert_equal(pooled[0, 0, 0, channel], expected_mean)
def get_janggu(inputs, outputs):
    """Create and compile the 'nptest' Janggu model for the given datasets.

    Asserts that no storage file exists yet for the model and returns the
    compiled model.
    """
    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        return inputs, inputs[0]

    model = Janggu.create(_model,
                          inputs=inputs,
                          outputs=outputs,
                          name='nptest')
    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage_file = model._storage_path(model.name, outputdir=model.outputdir)
    assert not os.path.exists(storage_file)

    return model
def test_janggu_chr2_validation(tmpdir):
    """Fit a DnaConv2D model while passing validation_data=['chr2'].

    The test only requires that fitting with a chromosome name as validation
    specification runs through.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    scored_path = os.path.join(resources, 'scored_sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       binsize=200,
                                       stepsize=50,
                                       roi=roi_path,
                                       order=1)

    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=scored_path,
                                 roi=roi_path,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=None,
                                 flank=0,
                                 collapser='max',
                                 storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            both_strands = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                                     merge_mode='max',
                                     name='bothstrands')
            layer = both_strands(inlayer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    model = Janggu.create(_cnn_model1,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn1')
    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    fit_result = model.fit(dna, ctcf, validation_data=['chr2'])
def test_janggu_train_predict_sequence(tmpdir):
    """Fit, predict and evaluate through a JangguSequence on dummy data.

    The same sequence object serves as training and validation data. The
    storage file must not exist before fitting and must exist afterwards,
    and the predictions must match the labels in shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    features = Array("x", np.random.random((100, 10)))
    labels = Array('y', np.random.randint(2, size=(100, 1)),
                   conditions=['random'])
    inputs = {'x': features}
    outputs = {'y': labels}

    jseq = JangguSequence(10, inputs, outputs)

    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(_model,
                        inputs=jseq.inputs['x'],
                        outputs=jseq.outputs['y'],
                        name='nptest')
    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    # Diagnostic output describing where the model is going to be stored.
    for label, value in (('storage', storage),
                         ('env', os.environ['JANGGU_OUTPUT']),
                         ('name', bwm.name),
                         ('outputdir', bwm.outputdir)):
        print(label, value)

    assert not os.path.exists(storage)

    bwm.fit(jseq, epochs=2, validation_data=jseq, use_multiprocessing=False)

    assert os.path.exists(storage)

    pred = bwm.predict(jseq, use_multiprocessing=False)
    np.testing.assert_equal(len(pred[:, np.newaxis]), len(inputs['x']))
    np.testing.assert_equal(pred.shape, outputs['y'].shape)

    bwm.evaluate(jseq, use_multiprocessing=False)
def test_janggu_train_predict_option0(tmpdir):
    """Train, predict and evaluate on dummy data with a ReduceDim-wrapped label.

    The labels are an Array with an extra axis that ReduceDim collapses along
    axis 1. Besides the storage and shape checks, the test evaluates with the
    'auc' callback and verifies that the condition name 'random' is the first
    column of the written auc.tsv.
    """
    # The docstring must be the first statement; it previously followed this
    # assignment and was therefore a no-op string expression.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("X", np.random.random((100, 10)))
    outputs = ReduceDim(Array('y',
                              np.random.randint(2, size=(100, 1))[:, None],
                              conditions=['random']),
                        axis=(1, ))

    @inputlayer
    @outputdense('sigmoid')
    def test_model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(test_model,
                        inputs=inputs,
                        outputs=outputs,
                        name='nptest')
    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)
    assert not os.path.exists(storage)

    bwm.fit(inputs, outputs, epochs=2, batch_size=32)

    assert os.path.exists(storage)

    pred = bwm.predict(inputs)
    np.testing.assert_equal(len(pred[:, np.newaxis]), len(inputs))
    np.testing.assert_equal(pred.shape, outputs.shape)

    # Test if the condition name is correctly used in the output table.
    bwm.evaluate(inputs, outputs, callbacks=['auc'])

    outputauc = os.path.join(tmpdir.strpath, 'evaluation', 'nptest',
                             'auc.tsv')
    assert os.path.exists(outputauc)
    assert pd.read_csv(outputauc).columns[0] == 'random'
def test_janggu_influence_fasta(tmpdir):
    """Check that input_attribution by coordinates and by index agree.

    Sequences are loaded from sample.fa; attribution for the first interval,
    addressed via chrom/start/end, must equal attribution addressed via
    idx=0.
    """
    # Direct Janggu output to the temporary directory, as the sibling tests
    # do; previously the tmpdir fixture was requested but never used.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1
    filename = os.path.join(data_path, 'sample.fa')

    dna = Bioseq.create_from_seq('dna',
                                 fastafile=filename,
                                 order=order,
                                 cache=False)

    @inputlayer
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        output = Dense(1, activation='sigmoid')(output)
        return inputs, output

    # The model is used uncompiled; attribution below does not call fit().
    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          name='dna_ctcf_HepG2-cnn')

    # Address the first interval by genomic coordinates ...
    iv = dna.gindexer[0]
    chrom, start, end = iv.chrom, iv.start, iv.end
    influence = input_attribution(model, dna,
                                  chrom=chrom, start=start, end=end)

    # ... and by its dataset index.
    influence2 = input_attribution(model, dna, idx=0)

    np.testing.assert_equal(influence[0][:], influence2[0][:])
def objective(params):
    """Train and evaluate one binary-classification model for ``params``.

    Args:
        params: Mapping of settings. This function reads the keys 'name',
            'opt' and 'epochs'; the whole mapping is also forwarded to
            ``get_data`` and to ``Janggu.create`` as model parameters.

    Returns:
        ``{'status': 'fail'}`` if model creation, compilation or fitting
        raises ``ValueError``. Otherwise a dict with 'status' set to 'ok',
        the final validation loss under 'loss', the full training history,
        the average-precision scores on validation and test data, the Keras
        model config and weights, the parameters and the model name.
    """
    train_data, val_data, test_data = get_data(params)

    # define a keras model only based on DNA
    try:
        K.clear_session()
        model = Janggu.create(get_model,
                              params,
                              train_data[0],
                              train_data[1],
                              name=params['name'])
        model.compile(optimizer=get_opt(params['opt']),
                      loss='binary_crossentropy',
                      metrics=['acc'])
        hist = model.fit(
            train_data[0],
            train_data[1],
            epochs=params['epochs'],
            batch_size=64,
            validation_data=val_data,
            callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])
    except ValueError:
        # print_exc() reports the exception type, message and traceback.
        # The previous print_stack() call only showed the call stack of this
        # handler and never the error itself.
        traceback.print_exc()
        return {'status': 'fail'}

    # Report the last recorded value of every tracked training metric.
    print('#' * 40)
    for key in hist.history:
        print('{}: {}'.format(key, hist.history[key][-1]))
    print('#' * 40)

    pred_test = model.predict(test_data[0])
    pred_val = model.predict(val_data[0])

    model.evaluate(val_data[0], val_data[1],
                   callbacks=['auprc', 'auroc'],
                   datatags=['val'])
    model.evaluate(test_data[0], test_data[1],
                   callbacks=['auprc', 'auroc'],
                   datatags=['test'])

    auprc_val = average_precision_score(val_data[1][:], pred_val)
    auprc_test = average_precision_score(test_data[1][:], pred_test)

    model.summary()
    print('auprc_val: {:.2%}'.format(auprc_val))
    print('auprc_test: {:.2%}'.format(auprc_test))

    return {
        'loss': hist.history['val_loss'][-1],
        'status': 'ok',
        'all_losses': hist.history,
        'auprc_val': auprc_val,
        'auprc_test': auprc_test,
        'model_config': model.kerasmodel.to_json(),
        'model_weights': model.kerasmodel.get_weights(),
        'concrete_params': params,
        'modelname': model.name,
    }
""" with inputs.use('dna') as layer: # the name in inputs.use() should be the same as the dataset name. layer = DnaConv2D( Conv2D(params[0], (params[1], 1), activation=params[2]))(layer) output = GlobalAveragePooling2D(name='motif')(layer) return inputs, output modeltemplate = double_stranded_model_dnaconv K.clear_session() # create a new model object model = Janggu.create(template=modeltemplate, modelparams=(30, 21, 'relu'), inputs=DNA, outputs=ReduceDim(LABELS)) model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['acc']) hist = model.fit(DNA, ReduceDim(LABELS), epochs=100, shuffle=False) print('#' * 40) print('loss: {}, acc: {}'.format(hist.history['loss'][-1], hist.history['acc'][-1])) print('#' * 40) # clustering plots based on hidden features heatmap_eval = Scorer('heatmap', exporter=ExportClustermap(z_score=1.))
def test_janggu_instance_dense(tmpdir):
    """Exercise Janggu.create with invalid and valid input selectors.

    Invalid selectors (unknown input name, unsupported index type) and a
    non-string model name must raise. Valid selectors (``inputs()[0]``,
    ``inputs[0]``, ``inputs['dna']``) must build a model. The last model is
    serialized to JSON and YAML, saved to disk and reloaded by name.
    """
    # The docstring must be the first statement; it previously followed this
    # assignment and was therefore a no-op string expression.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    csvfile = os.path.join(data_path, 'sample.csv')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file,
                                       order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['.']
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    with pytest.raises(Exception):
        # due to No input name . defined
        Janggu.create(_cnn_model,
                      modelparams=(2, ),
                      inputs=dna,
                      outputs=ctcf,
                      name='dna_ctcf_HepG2-cnn')

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs[list()]
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    with pytest.raises(Exception):
        # due to Wrong type for indexing
        Janggu.create(_cnn_model,
                      modelparams=(2, ),
                      inputs=dna,
                      outputs=ctcf,
                      name='dna_ctcf_HepG2-cnn')

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs()[0]
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    with pytest.raises(Exception):
        # the model name must be a string
        Janggu.create(_cnn_model,
                      modelparams=(2, ),
                      inputs=dna,
                      outputs=ctcf,
                      name=12342134)

    # test with given model name
    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    # NOTE(review): the original comment here announced a test with an
    # automatically generated model name, but a name is passed explicitly,
    # so this repeats the previous call.
    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs[0]
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    # Round-trip the architecture through both serializers; only successful
    # deserialization matters, the results are not needed.
    model_from_json(bwm.kerasmodel.to_json())
    model_from_yaml(bwm.kerasmodel.to_yaml())

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
'order': order, 'stranded': strand, 'flank': flank, 'rep': 'r{}'.format(rep), 'flatten': flatten } DATA = get_data(pars) mname = '{}_s{}_o{}_f{}_a{}_r{}'.format(modelname, pars['stranded'], pars['order'], pars['flank'], pars['flatten'], pars['rep']) if not evaluate: model = Janggu.create(dna_model, pars, inputs=DATA[0][0], outputs=DATA[0][1], name=mname) model.summary() model.compile(optimizer=get_opt('amsgrad'), loss='binary_crossentropy', metrics=['accuracy']) train_data = DATA[0] val_data = DATA[1] test_data = DATA[2] hist = model.fit( train_data[0], train_data[1], epochs=epochs,
val_data = DATA[1] test_data = DATA[2] auprc_pre_val = [] auprc_pre_test = [] auprc_rand_val = [] auprc_rand_test = [] # Next, we concatenate the individual models and fine-tune them. # Furthermore, the combined models are reset with random weights and trained from scratch # as a comparison. for dnarun, dnaserun in zip([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]): # load pre-trained models dnaname = dnamodelname.format(dnarun) dnasename = dnasemodelname.format(dnaserun) dnamodel = Janggu.create_by_name(dnaname) dnasemodel = Janggu.create_by_name(dnasename) # remove output layer, concatenate the top-hidden layers, append output hidden_dna = dnamodel.kerasmodel.layers[-2].output hidden_dnase = dnasemodel.kerasmodel.layers[-2].output joint_hidden = Concatenate(name='concat')([hidden_dna, hidden_dnase]) output = Dense(1, activation='sigmoid', name='peaks')(joint_hidden) # fit the model with preinitialized weights jointmodel = Janggu(dnamodel.kerasmodel.inputs + dnasemodel.kerasmodel.inputs, output, name='pretrained_dnase_dna_joint_model_{}_{}'.format( dnasename, dnaname))
shared_space['pretrained'] = False res = objective(shared_space) write_results(shared_space, res) else: print('no training') shared_space['val_chrom'] = "chr22" shared_space['order'] = dnaorder shared_space['pretrained'] = False shared_space['seq_dropout'] = 0.2 shared_space['inputs'] = 'epi_dna' params = shared_space train_data = get_data(params) train, test = split_train_test(train_data, [test_chrom]) model = Janggu.create_by_name('cage_promoters_epi_dna') testpred = model.predict(test[0]) fig, ax = plt.subplots() ax.scatter(test[1][:], testpred) ax.set_xlabel('Observed normalized CAGE signal') ax.set_ylabel('Predicted normalized CAGE signal') fig.savefig( os.path.join(os.environ['JANGGU_OUTPUT'], 'cage_promoter_testchrom_agreement.png')) fig, ax = plt.subplots() ax.scatter(test[1][:], testpred) ax.set_xlabel('Observed normalized CAGE signal') ax.set_ylabel('Predicted normalized CAGE signal')
def objective(params):
    """Build, train and score one CAGE-signal regression model for ``params``.

    For ``params['inputs'] == 'epi_dna'`` the stored 'cage_promoters_dna_only'
    and 'cage_promoters_epi_only' models are joined at their second-to-last
    layers; unless ``params['pretrained']`` is set, the joint model is rebuilt
    from its JSON config before training. For any other input type a model is
    created from ``get_model``.

    Returns ``{'status': 'fail'}`` when data loading or model construction
    raises ``ValueError``; otherwise a dict holding the validation loss, the
    training history, train/val/test correlations, the Keras config and
    weights, and the parameters.
    """
    print(params)

    try:
        train_data = get_data(params)
        train_data, test = split_train_test(train_data, [test_chrom])
        train, val = split_train_test(train_data, [params['val_chrom']])

        # define a keras model only based on DNA
        K.clear_session()

        if params['inputs'] != 'epi_dna':
            model = Janggu.create(
                get_model, params, train_data[0], train_data[1],
                name='cage_promoters_{}'.format(params['inputs']))
        else:
            dnam = Janggu.create_by_name('cage_promoters_dna_only')
            epim = Janggu.create_by_name('cage_promoters_epi_only')

            joint = Concatenate()([dnam.kerasmodel.layers[-2].output,
                                   epim.kerasmodel.layers[-2].output])
            joint = Dense(1, name='geneexpr')(joint)

            model = Janggu([dnam.kerasmodel.input] + epim.kerasmodel.input,
                           joint,
                           name='cage_promoters_epi_dna')

            if not params['pretrained']:
                # This part randomly reinitializes the network
                # so that we can train it from scratch
                fresh = model_from_json(model.kerasmodel.to_json())
                model = Janggu(fresh.inputs, fresh.outputs,
                               name='cage_promoters_epi_dna_randominit')
    except ValueError:
        main_logger.exception('objective:')
        return {'status': 'fail'}

    model.compile(optimizer=get_opt(params['opt']),
                  loss='mae',
                  metrics=['mse'])

    hist = model.fit(
        train_data[0],
        train_data[1],
        epochs=params['epochs'],
        batch_size=64,
        validation_data=[params['val_chrom']],
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

    banner = '#' * 40
    print(banner)
    for key in hist.history:
        print('{}: {}'.format(key, hist.history[key][-1]))
    print(banner)

    pred_train = model.predict(train[0])
    pred_val = model.predict(val[0])
    pred_test = model.predict(test[0])

    # A fresh callback list is handed to every evaluate() call.
    score_names = ('var_explained', 'mse', 'mae', 'cor')

    model.evaluate(train[0], train[1],
                   callbacks=list(score_names),
                   datatags=['train'])
    val_scores = model.evaluate(val[0], val[1],
                                callbacks=list(score_names),
                                datatags=['val'])
    # The first entry of the evaluation result is reported as the loss.
    mae_val = val_scores[0]
    model.evaluate(test[0], test[1],
                   callbacks=list(score_names),
                   datatags=['test'])

    def _correlation(dataset, predicted):
        # Correlation between the first label column and the first
        # prediction column.
        return np.corrcoef(dataset[1][:][:, 0], predicted[:, 0])[0, 1]

    cor_train = _correlation(train, pred_train)
    cor_val = _correlation(val, pred_val)
    cor_test = _correlation(test, pred_test)

    model.summary()
    main_logger.info('cor [train/val/test]: {:.2f}/{:.2f}/{:.2f}'.format(
        cor_train, cor_val, cor_test))

    result = {
        'loss': mae_val,
        'status': 'ok',
        'all_losses': hist.history,
        'cor_train': cor_train,
        'cor_val': cor_val,
        'cor_test': cor_test,
        'model_config': model.kerasmodel.to_json(),
        'model_weights': model.kerasmodel.get_weights(),
        'concrete_params': params,
    }
    return result
def test_janggu_instance_conv(tmpdir):
    """Create, save and reload a convolution-style Janggu model.

    The labels are built with Cover.create_from_bed twice, once with
    store_whole_genome=False and once with True; only the second dataset is
    used for the model. The model applies Complement and Reverse to the DNA
    input, is compiled, saved, and reloaded by name.
    """
    # The docstring must be the first statement; it previously followed this
    # assignment and was therefore a no-op string expression.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    posfile = os.path.join(data_path, 'scored_sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file,
                                       order=1,
                                       binsize=200,
                                       stepsize=50)

    # NOTE(review): this first dataset is overwritten right below; presumably
    # the call is kept to exercise the store_whole_genome=False path - confirm.
    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=posfile,
                                 roi=bed_file,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=50,
                                 store_whole_genome=False,
                                 flank=0,
                                 collapser=None,
                                 storage='ndarray')

    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=posfile,
                                 roi=bed_file,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=50,
                                 store_whole_genome=True,
                                 flank=0,
                                 collapser=None,
                                 storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = Complement()(layer)
            layer = Reverse()(layer)
        return inputs, layer

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
return inputs, output if args.model == 'single': modeltemplate = single_stranded_model elif args.model == 'double': modeltemplate = double_stranded_model else: modeltemplate = double_stranded_model_dnaconv K.clear_session() # create a new model object model = Janggu.create(template=modeltemplate, modelparams=(30, 21, 'relu'), inputs=DNA, outputs=LABELS, name='fasta_seqs_m{}_o{}'.format(args.model, args.order)) model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['acc']) model.summary() # fit the model hist = model.fit(DNA, LABELS, epochs=100) print('#' * 40) print('loss: {}, acc: {}'.format(hist.history['loss'][-1], hist.history['acc'][-1])) print('#' * 40)
performs the convolution operation with the normal kernel weights and the reverse complemented weights. """ with inputs.use('dna') as layer: # the name in inputs.use() should be the same as the dataset name. layer = DnaConv2D( Conv2D(params[0], (params[1], 1), activation=params[2]))(layer) output = GlobalAveragePooling2D(name='motif')(layer) return inputs, output K.clear_session() # create a new model object model = Janggu.create(template=double_stranded_model_dnaconv, modelparams=(30, 21, 'relu'), inputs=DNA, outputs=ReduceDim(LABELS)) model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['acc']) model.fit(DNA, ReduceDim(LABELS), epochs=100, validation_data=['pseudo2']) # do the evaluation on the independent test data model.evaluate(DNA_TEST, ReduceDim(LABELS_TEST), datatags=['test'], callbacks=['auc', 'auprc', 'roc', 'prc']) pred = model.predict(DNA_TEST)