def test_load_cover_bigwig_resolutionNone(tmpdir):
    """Compare collapsed bigwig coverage against base-pair coverage.

    For every storage backend, a cover built with ``resolution=None`` and
    ``collapser='sum'`` is expected to hold one value per region and to
    equal the base-pair resolution cover summed along the position axis.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bigwig_path = os.path.join(resources, "sample.bw")
    roi_path = os.path.join(resources, "sample.bed")

    # Arguments shared by the base-pair and the collapsed dataset.
    common = dict(bigwigfiles=bigwig_path, roi=roi_path,
                  binsize=200, stepsize=200, cache=True)

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print(backend)
        per_base = Cover.create_from_bigwig(
            "cov", resolution=1, storage=backend, **common)
        collapsed = Cover.create_from_bigwig(
            "cov", resolution=None, storage=backend,
            datatags=['None'], collapser='sum', **common)

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 1, 1))
        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
def test_create_from_array_whole_genome_true(tmpdir):
    """Round-trip a whole-genome label cover through ``create_from_array``.

    Labels are loaded from five copies of the peak file, extracted as an
    array and wrapped again for each storage backend; content and shape of
    the re-created cover must match the original.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # The ROI spans positive and negative examples,
    # the peak file contains positive examples only.
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=[peak_path] * 5,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=True)
    label_array = labels[:]

    for backend in ('ndarray', 'sparse', 'hdf5'):
        print(backend)
        rebuilt = Cover.create_from_array(
            'BindingProba', label_array, labels.gindexer,
            cache=True, storage=backend, store_whole_genome=True)

        np.testing.assert_equal(rebuilt[:], labels[:])
        np.testing.assert_equal(rebuilt.shape, labels.shape)
def test_bam_store_whole_genome_option():
    """BAM coverage must not depend on the ``store_whole_genome`` flag."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    whole = Cover.create_from_bam(
        'test', bamfiles=bam_path, regions=roi_path,
        store_whole_genome=True,
        binsize=200, stepsize=200, storage='ndarray')
    partial = Cover.create_from_bam(
        'test2', bamfiles=bam_path, regions=roi_path,
        store_whole_genome=False,
        binsize=200, stepsize=200, storage='ndarray')

    assert len(whole) == 100
    assert len(partial) == len(whole)
    assert whole.shape == (100, 200, 2, 1)
    assert whole.shape == partial.shape
    np.testing.assert_equal(whole[:], partial[:])
    assert whole[:].sum() == 29.
def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions as a cover without whole-genome storage.

    A small DnaConv2D model is created and compiled (not trained), its
    predictions on the DNA dataset are converted with
    ``Cover.create_from_array`` and the resulting cover must mirror the
    prediction array in shape, content and number of regions.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    refgenome_path = resource_filename('janggu',
                                       'resources/pseudo_genome.fa')
    # The ROI spans positive and negative examples,
    # the peak file contains positive examples only.
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=refgenome_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=False,
        datatags=['ref'])

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=peak_path,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=False,
        datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params = (number of filters, filter length, activation)
        with inputs.use('dna') as layer:
            layer = DnaConv2D(Conv2D(params[0], (params[1], 1),
                                     activation=params[2]))(layer)
        output = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1],
            name='motif')(layer)
        return inputs, output

    K.clear_session()

    # Build and compile a fresh model object.
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)
    model.compile(optimizer='adadelta', loss='binary_crossentropy',
                  metrics=['acc'])

    predictions = model.predict(dna)
    wrapped = Cover.create_from_array('BindingProba', predictions,
                                      labels.gindexer,
                                      store_whole_genome=False)

    assert predictions.shape == wrapped.shape
    np.testing.assert_equal(predictions, wrapped[:])
    assert len(wrapped.gindexer) == len(predictions)
    assert len(wrapped.garray.handle) == len(predictions)
def test_plotgenometracks():
    """Smoke test: plot a genome track for a list of covers and one cover."""
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    bigwig_path = pkg_resources.resource_filename('janggu',
                                                  'resources/sample.bw')

    def _load_cover():
        # Both tracks are built with identical arguments.
        return Cover.create_from_bigwig('coverage2',
                                        bigwigfiles=bigwig_path,
                                        roi=roi_path,
                                        binsize=200,
                                        stepsize=200,
                                        resolution=50)

    first_track = _load_cover()
    second_track = _load_cover()

    figure = plotGenomeTrack([first_track, second_track],
                             'chr1', 16000, 18000)
    figure = plotGenomeTrack(first_track, 'chr1', 16000, 18000)
def test_channel_last_first():
    """Check array layout for ``channel_last=True`` versus ``False``.

    The channel-first item must equal the channel-last item after moving
    axis 1 to the end.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bigwig_path = os.path.join(resources, "sample.bw")

    def _load(channel_last):
        return Cover.create_from_bigwig('test',
                                        bigwigfiles=bigwig_path,
                                        resolution=1,
                                        binsize=200,
                                        regions=roi_path,
                                        store_whole_genome=True,
                                        channel_last=channel_last,
                                        storage='ndarray')

    channel_last_cover = _load(True)
    assert channel_last_cover.shape == (100, 200, 1, 1)
    assert channel_last_cover[0].shape == (1, 200, 1, 1)

    channel_first_cover = _load(False)
    assert channel_first_cover.shape == (100, 1, 200, 1)
    assert channel_first_cover[0].shape == (1, 1, 200, 1)

    np.testing.assert_equal(
        channel_last_cover[0],
        np.transpose(channel_first_cover[0], (0, 2, 3, 1)))
def test_cover_from_bam_sanity():
    """Sanity checks for indexing and argument validation of BAM covers."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    roi_cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        roi=roi_path,
        binsize=200, stepsize=200,
        flank=0,
        storage='ndarray')
    roi_cover[0]

    with pytest.raises(IndexError):
        # a float is not a valid index
        roi_cover[1.2]

    genome_cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        storage='ndarray',
        store_whole_genome=True)

    assert len(roi_cover.gindexer) == len(roi_cover.garray.handle)
    assert len(genome_cover.garray.handle) != len(roi_cover.garray.handle)

    # Each entry is (name, extra keyword arguments) of a call that must fail.
    invalid_calls = [
        (1.2, dict(binsize=1, stepsize=1)),             # name must be a string
        ('test', dict(binsize=1, stepsize=1, flank=-1)),
        ('test', dict(binsize=1, stepsize=-1, flank=0)),
        ('test', dict(binsize=-1, stepsize=1, flank=0)),
    ]
    for bad_name, bad_kwargs in invalid_calls:
        with pytest.raises(Exception):
            Cover.create_from_bam(
                bad_name,
                bamfiles=bam_path,
                roi=roi_path,
                storage='ndarray',
                **bad_kwargs)
def test_bed_unsync_roi_targets():
    """Load shifted BED targets over the ROI at several resolutions.

    Each scenario lists the varying keyword arguments, the expected cover
    shape and the expected total signal.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "positive.bed")
    shifted_path = os.path.join(resources, "positive_shift.bed")

    scenarios = [
        (dict(resolution=None), (25, 1, 1, 1), 25),
        (dict(resolution=50), (25, 4, 1, 1), 25 * 4),
        (dict(resolution=50, store_whole_genome=True),
         (25, 4, 1, 1), 25 * 4),
        (dict(resolution=1, store_whole_genome=False),
         (25, 200, 1, 1), 25 * 200 - 2),
        (dict(resolution=1, store_whole_genome=True),
         (25, 200, 1, 1), 25 * 200 - 2),
    ]

    for options, expected_shape, expected_total in scenarios:
        cover = Cover.create_from_bed(
            'test',
            bedfiles=shifted_path,
            roi=roi_path,
            storage='ndarray',
            **options)
        assert len(cover) == 25
        assert cover.shape == expected_shape
        assert cover[:].sum() == expected_total
def get_data(params):
    """Build the training inputs and labels selected by ``params``.

    Labels are log-transformed, z-scored BAM coverage wrapped in
    ``ReduceDim`` with a mean aggregator. Depending on
    ``params['inputs']`` the input list holds the DNA sequence
    ('dna_only', 'epi_dna') and/or the dnase and h3k4 coverage tracks
    ('epi_only', 'epi_dna').

    Keys read from ``params``: 'traincell', 'trainrep', 'cageflank',
    'inputs', and, depending on the branch, 'dnaflank', 'order' and
    'dnaseflank'.

    Returns
    -------
    tuple
        ``(train_input, train_labels)`` where ``train_input`` is a list.

    Raises
    ------
    ValueError
        If ``params['inputs']`` selects no input dataset.
    """
    label_zscore = ZScore()
    label_cover = Cover.create_from_bam(
        'geneexpr',
        bamfiles=RNA.format(params['traincell'], params['trainrep']),
        roi=ROI_INPUT_TRAIN,
        flank=params['cageflank'],
        conditions=['GeneExpr'],
        resolution=None,
        store_whole_genome=False,
        storage='ndarray',
        normalizer=[LogTransform(), label_zscore],
        stranded=False,
        cache=True)
    train_labels = ReduceDim(label_cover, aggregator="mean")

    train_input = []
    input_mode = params['inputs']

    if input_mode in ['dna_only', 'epi_dna']:
        # DNA sequence
        dna_train = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False)
        train_input.append(dna_train)

    if input_mode in ['epi_only', 'epi_dna']:
        # Every track receives its own ZScore instance.
        dnase_zscore = ZScore()
        dnase_cover = Cover.create_from_bam(
            'dnase',
            bamfiles=DNASE.format(params['traincell']),
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            resolution=None,
            store_whole_genome=False,
            normalizer=[LogTransform(), dnase_zscore],
            cache=True)
        train_input.append(ReduceDim(dnase_cover, aggregator="mean"))

        h3k4_zscore = ZScore()
        h3k4_cover = Cover.create_from_bigwig(
            'h3k4',
            bigwigfiles=[H3K4me3.format(params['traincell'])],
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            store_whole_genome=False,
            normalizer=[LogTransform(), h3k4_zscore],
            cache=True)
        train_input.append(ReduceDim(h3k4_cover, aggregator="mean"))

    if not train_input:
        raise ValueError('no input')

    return (train_input, train_labels)
def test_load_cover_bed_binary(tmpdir):
    """Load BED intervals in 'binary' mode for all storage backends.

    Three variants are checked per backend: one bin per region
    (resolution 200), four bins per region (resolution 50) and
    ``dimmode='first'``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    score_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    def _verify(dataset, expected_shape, expected_hit):
        np.testing.assert_equal(len(dataset), 100)
        np.testing.assert_equal(dataset.shape, expected_shape)
        np.testing.assert_equal(dataset[0].sum(), 0)
        np.testing.assert_equal(dataset[4].sum(), expected_hit)

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print('store', backend)

        coarse = Cover.create_from_bed(
            "cov",
            bedfiles=score_path,
            regions=roi_path,
            binsize=200, stepsize=200,
            resolution=200,
            storage=backend,
            mode='binary', cache=True)
        _verify(coarse, (100, 1, 1, 1), 1)

        fine = Cover.create_from_bed(
            "cov50",
            bedfiles=score_path,
            regions=roi_path,
            binsize=200, stepsize=200,
            storage=backend,
            resolution=50,
            mode='binary', cache=True)
        _verify(fine, (100, 4, 1, 1), 4 * 1)

        first_dim = Cover.create_from_bed(
            "cov50_firstdim",
            bedfiles=score_path,
            regions=roi_path,
            binsize=200, stepsize=200,
            storage=backend,
            dimmode='first',
            mode='binary', cache=True)
        _verify(first_dim, (100, 1, 1, 1), 1)
def test_load_cover_bed_categorical():
    """Load BED intervals in 'categorical' mode.

    The last axis is expected to hold six entries; three variants are
    checked per backend (resolution 200, resolution 50 and
    ``dimmode='first'``).
    """
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    score_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    def _verify(dataset, expected_shape, expected_hit):
        np.testing.assert_equal(len(dataset), 100)
        np.testing.assert_equal(dataset.shape, expected_shape)
        np.testing.assert_equal(dataset[0].sum(), 0)
        np.testing.assert_equal(dataset[4].sum(), expected_hit)

    for backend in ('ndarray', 'sparse'):
        coarse = Cover.create_from_bed(
            "cov",
            bedfiles=score_path,
            regions=roi_path,
            binsize=200, stepsize=200,
            resolution=200,
            storage=backend,
            mode='categorical')
        _verify(coarse, (100, 1, 1, 6), 1)

        fine = Cover.create_from_bed(
            "cov50",
            bedfiles=score_path,
            regions=roi_path,
            binsize=200, stepsize=200,
            resolution=50,
            storage=backend,
            mode='categorical')
        _verify(fine, (100, 4, 1, 6), 4 * 1)

        first_dim = Cover.create_from_bed(
            "cov50",
            bedfiles=score_path,
            regions=roi_path,
            binsize=200, stepsize=200,
            storage=backend,
            dimmode='first',
            mode='categorical')
        _verify(first_dim, (100, 1, 1, 6), 1)
def test_load_cover_bed_scored():
    """Load BED intervals in 'score' mode.

    Checked per backend: resolution 200, resolution 50 and
    ``resolution=None`` collapsed with ``collapser='max'``.
    """
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    score_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    def _verify(dataset, expected_shape, expected_score):
        np.testing.assert_equal(len(dataset), 100)
        np.testing.assert_equal(dataset.shape, expected_shape)
        np.testing.assert_equal(dataset[0].sum(), 0)
        np.testing.assert_equal(dataset[4].sum(), expected_score)

    for backend in ('ndarray', 'sparse'):
        coarse = Cover.create_from_bed(
            "cov",
            bedfiles=score_path,
            roi=roi_path,
            binsize=200, stepsize=200,
            resolution=200,
            storage=backend,
            mode='score')
        _verify(coarse, (100, 1, 1, 1), 5)

        fine = Cover.create_from_bed(
            "cov50",
            bedfiles=score_path,
            roi=roi_path,
            binsize=200, stepsize=200,
            storage=backend,
            resolution=50,
            mode='score')
        _verify(fine, (100, 4, 1, 1), 4*5)

        collapsed = Cover.create_from_bed(
            "cov50",
            bedfiles=score_path,
            roi=roi_path,
            storage=backend,
            resolution=None,
            binsize=200, stepsize=200,
            collapser='max',
            mode='score')
        _verify(collapsed, (100, 1, 1, 1), 5)
def test_load_cover_bigwig_default(tmpdir):
    """Load bigwig coverage without passing ``resolution``.

    The chromosome sizes are read from ``sample.chrom.sizes`` and handed
    over as ``genomesize``; the test is repeated for every storage backend.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bigwig_path = os.path.join(resources, "sample.bw")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')
    roi_path = os.path.join(resources, "sample.bed")

    size_table = pandas.read_csv(sizes_path, sep='\t',
                                 names=['chr', 'length'],
                                 index_col='chr')
    chrom_sizes = size_table.to_dict()['length']

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print(backend)
        cover = Cover.create_from_bigwig("cov",
                                         bigwigfiles=bigwig_path,
                                         regions=roi_path,
                                         binsize=200, stepsize=200,
                                         genomesize=chrom_sizes,
                                         storage=backend, cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

        # Expected totals for regions 4 and 52.
        np.testing.assert_allclose(cover[4].sum(), 36.)
        np.testing.assert_allclose(cover[52].sum(), 2 * 36.)
def get_data(params):
    """Create sequence/label pairs for training, validation and testing.

    Labels and DNA sequences are loaded once over the training ROI with
    ``store_whole_genome=True``; the validation and test datasets are
    obtained from them with ``view``. ``params`` supplies 'order' and
    'flank' for the sequence dataset.

    Returns
    -------
    tuple
        ``((train_seq, train_labels), (val_seq, val_labels),
        (test_seq, test_labels))`` with every label dataset wrapped in
        ``ReduceDim``.
    """
    label_cover = Cover.create_from_bed(
        'labels',
        bedfiles=bedfiles,
        roi=train_roi,
        resolution=200,
        store_whole_genome=True,
        storage='sparse',
        cache=True,
        dtype='int8',
        minoverlap=.5,
        verbose=True)
    test_label_view = view(label_cover, test_roi)
    val_label_view = view(label_cover, val_roi)

    sequence = Bioseq.create_from_refgenome(
        'dna',
        refgenome=refgenome,
        roi=train_roi,
        store_whole_genome=True,
        storage='ndarray',
        cache=True,
        order=params['order'],
        flank=params['flank'],
        verbose=True)
    test_sequence_view = view(sequence, test_roi)
    val_sequence_view = view(sequence, val_roi)

    train_pair = (sequence, ReduceDim(label_cover))
    val_pair = (val_sequence_view, ReduceDim(val_label_view))
    test_pair = (test_sequence_view, ReduceDim(test_label_view))
    return (train_pair, val_pair, test_pair)
def test_janggu_variant_prediction(tmpdir):
    """Test variant effect prediction for sequence orders 1, 2 and 3.

    For each order a small dense model is created and
    ``predict_variant_effect`` is run on the sample VCF file. The call is
    expected to write ``scores.hdf5`` and ``snps.bed.gz`` into the output
    folder; the 'diffscore' dataset is then wrapped as a whole-genome
    cover and checked to be non-zero at chr2:59-60 and zero at chr2:54-55.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_dir = os.environ['JANGGU_OUTPUT']

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    # These paths do not depend on the order, so build them once.
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')
    scores_path = os.path.join(output_dir, 'scores.hdf5')
    snps_path = os.path.join(output_dir, 'snps.bed.gz')

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        def _cnn_model(inputs, inp, oup, params):
            # The sequence length shrinks by order - 1, while the
            # alphabet grows to 4 ** order.
            inputs = Input(
                (50 - params['order'] + 1, 1, pow(4, params['order'])))
            layer = Flatten()(inputs)
            layer = Dense(params['hiddenunits'])(layer)
            output = Dense(4, activation='sigmoid')(layer)
            return inputs, output

        model = Janggu.create(_cnn_model,
                              modelparams={
                                  'hiddenunits': 2,
                                  'order': order
                              },
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_dir)

        assert os.path.exists(scores_path)
        assert os.path.exists(snps_path)

        gindexer = GenomicIndexer.create_from_file(snps_path, None, None)

        # The context manager closes the HDF5 file even if an assertion
        # below fails. The cover is only used while the file is open.
        with h5py.File(scores_path, 'r') as f:
            cov = Cover.create_from_array('snps', f['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
def test_load_bam_resolution10(tmpdir):
    """Load stranded BAM coverage at ``resolution=10``.

    With a binsize of 200 each region is expected to hold 20 bins and two
    strand channels. Selected regions are checked for the total count and
    for the bin and strand index of every entry equal to one.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')
    roi_path = os.path.join(resources, "sample.bed")

    size_table = pandas.read_csv(sizes_path, sep='\t',
                                 names=['chr', 'length'],
                                 index_col='chr')
    chrom_sizes = size_table.to_dict()['length']

    for backend in ('ndarray', 'hdf5', 'sparse'):
        cover = Cover.create_from_bam("yeast_I_II_III.bam",
                                      bamfiles=bam_path,
                                      regions=roi_path,
                                      binsize=200, stepsize=200,
                                      genomesize=chrom_sizes,
                                      resolution=10,
                                      storage=backend, cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 20, 2, 1))

        # Region 4: a single count in bin 17 of strand channel 1.
        hits = np.where(cover[4] == 1)
        np.testing.assert_equal(cover[4].sum(), 1.)
        np.testing.assert_equal(hits[1][0], 17)  # pos
        np.testing.assert_equal(hits[2][0], 1)  # strand

        # Region 13: two counts in strand channel 0.
        hits = np.where(cover[13] == 1)
        np.testing.assert_equal(cover[13].sum(), 2.)
        np.testing.assert_equal(hits[1], np.asarray([16, 17]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([0, 0]))  # strand

        # Region 52: two counts in strand channel 0.
        hits = np.where(cover[52] == 1)
        np.testing.assert_equal(cover[52].sum(), 2.)
        np.testing.assert_equal(hits[1], np.asarray([0, 8]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([0, 0]))  # strand

        # Region 96: a single count in bin 2 of strand channel 1.
        hits = np.where(cover[96] == 1)
        np.testing.assert_equal(cover[96].sum(), 1.)
        np.testing.assert_equal(hits[1], np.asarray([2]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([1]))  # strand
def test_load_cover_bigwig_resolution1(tmpdir):
    """Check base-pair resolution bigwig coverage against fixed profiles.

    For every storage backend the cover is expected to hold 100 regions of
    200 positions each. Regions 4 and 52 are compared position by position
    with hard-coded profiles of 200 values.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bwfile_ = os.path.join(data_path, "sample.bw")
    bed_file = os.path.join(data_path, "sample.bed")
    for store in ['ndarray', 'hdf5', 'sparse']:
        # base pair binsize
        print(store)
        cover = Cover.create_from_bigwig(
            "cov", bigwigfiles=bwfile_, roi=bed_file, binsize=200,
            stepsize=200, resolution=1, storage=store, cache=True)
        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 200, 1, 1))
        # there is one read in the region 4
        # (the expected profile below contains a single run of ones)
        np.testing.assert_allclose(cover[4].sum(), 36)
        np.testing.assert_equal(cover[4][0, :, 0, 0], np.asarray(
            [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
        # and two reads in region 52
        # (the expected profile below contains two separate runs of ones)
        np.testing.assert_allclose(cover[52].sum(), 2*36)
        np.testing.assert_equal(cover[52][0, :, 0, 0], np.asarray(
            [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
def test_bed_store_whole_genome_option():
    """BED coverage must not depend on the ``store_whole_genome`` flag.

    The same file serves as target and as region of interest, so every
    position of every region is expected to be one.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")

    whole = Cover.create_from_bed(
        'test', bedfiles=bed_path, regions=bed_path,
        store_whole_genome=True, storage='ndarray')
    partial = Cover.create_from_bed(
        'test2', bedfiles=bed_path, regions=bed_path,
        store_whole_genome=False, storage='ndarray')

    assert len(whole) == 25
    assert len(partial) == len(whole)
    assert whole.shape == (25, 200, 1, 1)
    assert whole.shape == partial.shape

    all_ones = np.ones(whole.shape)
    np.testing.assert_equal(whole[:], all_ones)
    np.testing.assert_equal(partial[:], all_ones)
def test_bam_inferred_binsize():
    """Create a BAM cover without passing ``binsize``.

    The expected shape has 200 positions per region although no binsize
    is given.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "positive.bed")
    bam_path = os.path.join(resources, "sample.bam")

    cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        regions=roi_path,
        flank=0,
        storage='ndarray')

    assert len(cover) == 25
    assert cover.shape == (25, 200, 2, 1)
def test_bed_inferred_binsize():
    """Create a BED cover without passing ``binsize``.

    The expected shape has 200 positions per region although no binsize
    is given.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bed(
        'test',
        bedfiles=bed_path,
        regions=bed_path,
        resolution=1,
        storage='ndarray')

    assert len(cover) == 25
    assert cover.shape == (25, 200, 1, 1)
def test_load_bam_resolutionNone(tmpdir):
    """Compare collapsed BAM coverage against base-pair coverage.

    For every storage backend, the cover built with ``resolution=None``
    is expected to hold one value per region and strand channel and to
    equal the base-pair resolution cover summed along the position axis.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')
    roi_path = os.path.join(resources, "sample.bed")

    size_table = pandas.read_csv(sizes_path, sep='\t',
                                 names=['chr', 'length'],
                                 index_col='chr')
    chrom_sizes = size_table.to_dict()['length']

    # Arguments shared by the base-pair and the collapsed dataset.
    common = dict(bamfiles=bam_path, roi=roi_path,
                  binsize=200, stepsize=200,
                  genomesize=chrom_sizes, cache=True)

    for backend in ('ndarray', 'hdf5', 'sparse'):
        per_base = Cover.create_from_bam(
            "yeast_I_II_III.bam",
            resolution=1,
            storage=backend,
            **common)
        collapsed = Cover.create_from_bam(
            "yeast_I_II_III.bam",
            resolution=None,
            storage=backend,
            datatags=['None'],
            **common)

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 2, 1))
        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
def test_cover_export_bigwig(tmpdir):
    """Export a cover to bigwig and load the exported file again.

    The round trip is checked for resolutions 1 and 50, with and without
    ``store_whole_genome``. The original and the reloaded cover must have
    the same shape and the same expected total signal.
    """
    path = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, "sample.bed")
    bwfile_ = os.path.join(data_path, "sample.bw")

    for resolution in [1, 50]:
        for whole_genome in [True, False]:
            print('resolution=', resolution)
            print('store_whole_genome', whole_genome)

            cover = Cover.create_from_bigwig(
                'test',
                bigwigfiles=bwfile_,
                resolution=resolution,
                binsize=200,
                roi=bed_file,
                store_whole_genome=whole_genome,
                storage='ndarray')

            cover.export_to_bigwig(output_dir=path)

            # The exported file is expected at <path>/<name>.<sample>.bigwig.
            exported_file = os.path.join(
                path,
                '{name}.{sample}.bigwig'.format(
                    name=cover.name, sample=cover.conditions[0]))

            cov2 = Cover.create_from_bigwig(
                'test',
                bigwigfiles=exported_file,
                resolution=resolution,
                binsize=200,
                roi=bed_file,
                store_whole_genome=whole_genome,
                storage='ndarray')

            assert cover.shape == (100, 200 // resolution, 1, 1)
            assert cover.shape == cov2.shape

            np.testing.assert_allclose(cover[:].sum(), 1044.0 / resolution)
            np.testing.assert_allclose(cov2[:].sum(), 1044.0 / resolution)
def test_bed_genomic_interval_access_part_genome():
    """Access a partially stored BED cover by genomic coordinates.

    Coordinate-based access is compared with index-based access after
    repeating the indexed item ``cover.garray.resolution`` times along the
    position axis. For ``shift == 1`` the interval is moved by one
    resolution step and the overlapping part of both views is compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    target_path = os.path.join(resources, "sample.bed")

    whole_genome = False

    for reso in (1, 50):
        for shift in (0, 1):
            cover = Cover.create_from_bed(
                'test',
                bedfiles=target_path,
                roi=roi_path,
                flank=0,
                storage='ndarray',
                store_whole_genome=whole_genome,
                resolution=reso)

            for i in range(len(cover)):
                print('storage :', whole_genome,
                      '/ resolution :', reso,
                      '/ shift :', shift)
                print(i, cover.gindexer[i])

                interval = cover.gindexer[i]
                expanded = np.repeat(cover[i], cover.garray.resolution,
                                     axis=1)
                np.testing.assert_equal(expanded, cover[interval])

                chrom = interval.chrom
                start = interval.start
                end = interval.end
                strand = interval.strand
                np.testing.assert_equal(expanded,
                                        cover[chrom, start, end, strand])

                if shift == 0:
                    continue

                # Move the interval by one resolution step.
                offset = shift * reso
                start += offset
                end += offset
                moved = cover[chrom, start, end, strand]

                if strand != '-':
                    gicov = moved[:, :(-offset), :, :]
                    indexed = cover[i][:, shift:, :, :]
                else:
                    gicov = moved[:, offset:, :, :]
                    indexed = cover[i][:, :-shift, :, :]

                # Keep the first base pair of every resolution window.
                windows = gicov.reshape(
                    (1, gicov.shape[1] // reso, reso, 1, 1))
                np.testing.assert_equal(indexed, windows[:, :, 0, :, :])
def test_cover_bam_unstranded():
    """Load BAM coverage with ``stranded=False``.

    The cover is expected to have a single strand channel. Selected
    regions are checked for the total count and for the positions of all
    entries equal to one.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')
    roi_path = os.path.join(resources, "sample.bed")

    size_table = pandas.read_csv(sizes_path, sep='\t',
                                 names=['chr', 'length'],
                                 index_col='chr')
    chrom_sizes = size_table.to_dict()['length']

    cover = Cover.create_from_bam("yeast_I_II_III.bam",
                                  bamfiles=bam_path,
                                  regions=roi_path,
                                  binsize=200, stepsize=200,
                                  genomesize=chrom_sizes,
                                  stranded=False)

    np.testing.assert_equal(len(cover), 100)
    np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

    # Region 4: a single count at position 179.
    hits = np.where(cover[4] == 1)
    np.testing.assert_equal(cover[4].sum(), 1.)
    np.testing.assert_equal(hits[1][0], 179)  # pos

    # Region 13: two counts.
    hits = np.where(cover[13] == 1)
    np.testing.assert_equal(cover[13].sum(), 2.)
    np.testing.assert_equal(hits[1], np.asarray([162, 178]))  # pos

    # Region 52: two counts.
    hits = np.where(cover[52] == 1)
    np.testing.assert_equal(cover[52].sum(), 2.)
    np.testing.assert_equal(hits[1], np.asarray([9, 89]))  # pos

    # Region 96: a single count at position 25.
    hits = np.where(cover[96] == 1)
    np.testing.assert_equal(cover[96].sum(), 1.)
    np.testing.assert_equal(hits[1], np.asarray([25]))  # pos
def test_janggu_chr2_validation(tmpdir):
    """Fit a model with ``validation_data=['chr2']``.

    Smoke test: the datasets are created with overlapping windows
    (binsize 200, stepsize 50), a small convolutional model is compiled
    and ``fit`` is called with a chromosome name as validation data.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    positives_path = os.path.join(resources, 'scored_sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome_path,
                                       binsize=200, stepsize=50,
                                       roi=roi_path, order=1)

    ctcf = Cover.create_from_bed(
        "positives",
        bedfiles=positives_path,
        roi=roi_path,
        binsize=200, stepsize=50,
        resolution=None,
        flank=0,
        collapser='max',
        storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                              merge_mode='max', name='bothstrands')(layer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    model = Janggu.create(_cnn_model1, modelparams=(2,),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn1')

    model.compile(optimizer='adadelta', loss='binary_crossentropy')
    history = model.fit(dna, ctcf, validation_data=['chr2'])
def test_bed_overreaching_ends_part_genome():
    """Load flanked BED regions without whole-genome storage.

    Regions of size 2 are extended by a flank of 20 on both sides; the
    expected signal per region is 18.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "bed_test.bed")

    for backend in ('ndarray', 'sparse'):
        print(backend)
        cover = Cover.create_from_bed(
            'test',
            bedfiles=bed_path,
            roi=bed_path,
            binsize=2,
            flank=20,
            resolution=1,
            store_whole_genome=False,
            storage=backend)

        assert len(cover) == 9
        assert cover.shape == (9, 2+2*20, 1, 1)
        np.testing.assert_equal(cover[0].sum(), 18)
        np.testing.assert_equal(cover[:].sum(), 9*18)
def test_bam_genomic_interval_access():
    """Access a BAM cover by genomic coordinates.

    For both ``store_whole_genome`` settings and resolutions 1 and 50,
    access by interval object and by coordinate tuple must agree with
    index-based access. For ``shift == 1`` the interval is moved by one
    resolution step and the overlapping part of both views is compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    for whole_genome in (True, False):
        for reso in (1, 50):
            for shift in (0, 1):
                cover = Cover.create_from_bam(
                    'test',
                    bamfiles=bam_path,
                    regions=roi_path,
                    flank=0,
                    storage='ndarray',
                    store_whole_genome=whole_genome,
                    resolution=reso)

                for i in range(len(cover)):
                    print('storage :', whole_genome,
                          '/ resolution :', reso,
                          '/ shift :', shift)
                    print(i, cover.gindexer[i])

                    interval = cover.gindexer[i]
                    np.testing.assert_equal(cover[i], cover[interval])

                    chrom = interval.chrom
                    start = interval.start
                    end = interval.end
                    strand = interval.strand
                    np.testing.assert_equal(
                        cover[i], cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    # Move the interval by one resolution step.
                    start += shift * reso
                    end += shift * reso
                    moved = cover[chrom, start, end, strand]

                    if strand != '-':
                        indexed = cover[i][:, shift:, :, :]
                        overlap = moved[:, :-shift, :, :]
                    else:
                        indexed = cover[i][:, :-shift, :, :]
                        overlap = moved[:, shift:, :, :]
                    np.testing.assert_equal(indexed, overlap)
def test_bed_overreaching_ends():
    """Load flanked BED regions that reach beyond the chromosome ends.

    After the first stored chr1 entry is set to one, the first 550
    positions of region 0 are expected to be zero, position 550 is
    expected to be one, and the following slice must equal the stored
    chr1 array.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bed(
        'test',
        bedfiles=bed_path,
        regions=bed_path,
        flank=2000,
        resolution=1,
        store_whole_genome=True,
        storage='ndarray')

    # Mark the first stored chr1 entry so that its offset can be located.
    cover.garray.handle['chr1'][0] = 1

    assert len(cover) == 25
    assert cover.shape == (25, 200 + 2 * 2000, 1, 1)

    first_region = cover[0]
    np.testing.assert_equal(first_region[0, :550, 0, 0].sum(), 0)
    np.testing.assert_equal(first_region[0, 550, 0, 0], 1.)

    chr1_length = len(cover.garray.handle['chr1'])
    np.testing.assert_equal(
        cover[0][0, 550:(550 + chr1_length), :, :],
        cover.garray.handle['chr1'])
def test_cover_bam_paired_midpoint():
    """Count paired-end reads with ``pairedend='midpoint'``.

    sample2.bam contains paired-end examples, unmapped examples, an
    unmapped mate and a low quality example. With ``min_mapq=30`` exactly
    two counts are expected on 'ref'.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample2.bam")

    cover = Cover.create_from_bam(
        "yeast_I_II_III.bam",
        bamfiles=bam_path,
        stranded=False,
        pairedend='midpoint',
        min_mapq=30,
        store_whole_genome=True)

    ref_counts = cover.garray.handle['ref']
    assert ref_counts.sum() == 2, ref_counts
    print(ref_counts)

    # the read starts at index 6 and tlen is 39
    assert ref_counts[6 + 39 // 2, 0, 0] == 1
    # another read maps to index 34
    assert ref_counts[34, 0, 0] == 1
# identically to the models obtained from classify_fasta.py. REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa') # ROI contains regions spanning positive and negative examples ROI_TRAIN = resource_filename('janggu', 'resources/roi_train.bed') ROI_TEST = resource_filename('janggu', 'resources/roi_test.bed') # PEAK_FILE only contains positive examples PEAK_FILE = resource_filename('janggu', 'resources/scores.bed') DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_TEST, binsize=200) LABELS_TEST = Cover.create_from_bed('peaks', bedfiles=PEAK_FILE, roi=ROI_TEST, binsize=200, resolution=None) # Training input and labels are purely defined genomic coordinates DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_TRAIN, binsize=200) LABELS = Cover.create_from_bed('peaks', roi=ROI_TRAIN, bedfiles=PEAK_FILE, binsize=200, resolution=None)