def test_plotgenometracks():
    """Smoke-test plotGenomeTrack with a list of tracks and with a single track.

    The test only checks that both calls complete without raising; nothing is
    asserted about the returned plot object.
    """
    locate = pkg_resources.resource_filename
    roi = locate('janggu', 'resources/sample.bed')
    bw_file = locate('janggu', 'resources/sample.bw')

    # Both tracks are built from the same file with identical settings
    # (and, as in the original test, the same dataset name).
    track_settings = dict(bigwigfiles=bw_file,
                          roi=roi,
                          binsize=200,
                          stepsize=200,
                          resolution=50)
    cover = Cover.create_from_bigwig('coverage2', **track_settings)
    cover2 = Cover.create_from_bigwig('coverage2', **track_settings)

    # A list of coverage tracks ...
    plotGenomeTrack([cover, cover2], 'chr1', 16000, 18000)
    # ... and a single bare track are both accepted.
    plotGenomeTrack(cover, 'chr1', 16000, 18000)
def test_bigwig_store_whole_genome_option():
    """Coverage must be identical with and without store_whole_genome."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bwfile_ = os.path.join(resources, "sample.bw")

    # Everything except the dataset name and the store_whole_genome flag
    # is shared between the two datasets.
    shared = dict(bigwigfiles=bwfile_,
                  regions=bed_file,
                  binsize=200,
                  stepsize=200,
                  storage='ndarray')
    cover1 = Cover.create_from_bigwig('test', store_whole_genome=True,
                                      **shared)
    cover2 = Cover.create_from_bigwig('test2', store_whole_genome=False,
                                      **shared)

    # Same number of regions and same array layout ...
    assert len(cover1) == 100
    assert len(cover2) == len(cover1)
    assert cover1.shape == (100, 200, 1, 1)
    assert cover1.shape == cover2.shape

    # ... and the same content.
    np.testing.assert_equal(cover1[:], cover2[:])
    assert cover1[:].sum() == 1044.0
def test_channel_last_first():
    """Check the array layout for channel_last=True versus channel_last=False.

    With channel_last=True the tested shape is (100, 200, 1, 1); with
    channel_last=False it is (100, 1, 200, 1). Transposing the latter with
    axes (0, 2, 3, 1) must reproduce the former.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bwfile_ = os.path.join(resources, "sample.bw")

    def _load(channel_last):
        # Identical settings apart from the channel_last flag.
        return Cover.create_from_bigwig('test',
                                        bigwigfiles=bwfile_,
                                        resolution=1,
                                        binsize=200,
                                        regions=bed_file,
                                        store_whole_genome=True,
                                        channel_last=channel_last,
                                        storage='ndarray')

    last = _load(True)
    assert last.shape == (100, 200, 1, 1)
    assert last[0].shape == (1, 200, 1, 1)

    first = _load(False)
    assert first.shape == (100, 1, 200, 1)
    assert first[0].shape == (1, 1, 200, 1)

    np.testing.assert_equal(last[0], np.transpose(first[0], (0, 2, 3, 1)))
def test_load_cover_bigwig_resolutionNone(tmpdir):
    """Compare resolution=None (collapsed with 'sum') against resolution=1.

    For every storage backend the collapsed dataset must have shape
    (100, 1, 1, 1), and its per-region sums must equal those of the
    base-pair resolution dataset.
    """
    # Redirect janggu's output/cache directory into the pytest tmpdir.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bwfile_ = os.path.join(resources, "sample.bw")
    bed_file = os.path.join(resources, "sample.bed")

    for store in ['ndarray', 'hdf5', 'sparse']:
        print(store)

        # Reference: base pair resolution.
        per_base = Cover.create_from_bigwig(
            "cov",
            bigwigfiles=bwfile_,
            roi=bed_file,
            binsize=200,
            stepsize=200,
            resolution=1,
            storage=store,
            cache=True)

        # Same regions, one summed value per region.
        collapsed = Cover.create_from_bigwig(
            "cov",
            bigwigfiles=bwfile_,
            roi=bed_file,
            binsize=200,
            stepsize=200,
            resolution=None,
            storage=store,
            cache=True,
            datatags=['None'],
            collapser='sum')

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 1, 1))
        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
def test_load_cover_bigwig_default(tmpdir):
    """Load bigwig coverage without an explicit resolution, per storage backend.

    The genome size is passed explicitly as a {chromosome: length} mapping
    read from ``sample.chrom.sizes``.
    """
    # Redirect janggu's output/cache directory into the pytest tmpdir.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bwfile_ = os.path.join(resources, "sample.bw")
    bed_file = os.path.join(resources, "sample.bed")
    gsfile_ = os.path.join(resources, 'sample.chrom.sizes')

    # Two-column, tab-separated table: chromosome name and its length.
    chrom_table = pandas.read_csv(gsfile_, sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    for store in ['ndarray', 'hdf5', 'sparse']:
        print(store)
        cover = Cover.create_from_bigwig("cov",
                                         bigwigfiles=bwfile_,
                                         regions=bed_file,
                                         binsize=200,
                                         stepsize=200,
                                         genomesize=gsize,
                                         storage=store,
                                         cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

        # Region 4 holds a total signal of 36, region 52 twice that.
        np.testing.assert_allclose(cover[4].sum(), 36.)
        np.testing.assert_allclose(cover[52].sum(), 2 * 36.)
def get_data(params):
    """Assemble the training inputs and labels selected by ``params``.

    Keys read from ``params``:
      * 'traincell', 'trainrep', 'cageflank' -- always (label dataset)
      * 'inputs' -- one of 'dna_only', 'epi_only', 'epi_dna'
      * 'dnaflank', 'order' -- when DNA input is requested
      * 'dnaseflank' -- when epigenetic input is requested

    Returns:
        tuple: ``(train_input, train_labels)`` where ``train_input`` is a
        list of datasets and ``train_labels`` is the label dataset.

    Raises:
        ValueError: if ``params['inputs']`` selects neither DNA nor
            epigenetic inputs, i.e. the input list ends up empty.
    """
    # Labels: log-transformed, z-scored coverage from BAM.
    label_zscore = ZScore()
    label_cover = Cover.create_from_bam(
        'geneexpr',
        bamfiles=RNA.format(params['traincell'], params['trainrep']),
        roi=ROI_INPUT_TRAIN,
        flank=params['cageflank'],
        conditions=['GeneExpr'],
        resolution=None,
        store_whole_genome=False,
        storage='ndarray',
        normalizer=[LogTransform(), label_zscore],
        stranded=False,
        cache=True)
    train_labels = ReduceDim(label_cover, aggregator="mean")

    selection = params['inputs']
    train_input = []

    if selection in ('dna_only', 'epi_dna'):
        # DNA sequence input
        dna_train = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False)
        train_input.append(dna_train)

    if selection in ('epi_only', 'epi_dna'):
        # Each epigenetic track gets its own ZScore instance.
        dnase_zscore = ZScore()
        dnase_cover = Cover.create_from_bam(
            'dnase',
            bamfiles=DNASE.format(params['traincell']),
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            resolution=None,
            store_whole_genome=False,
            normalizer=[LogTransform(), dnase_zscore],
            cache=True)
        train_input.append(ReduceDim(dnase_cover, aggregator="mean"))

        h3k4_zscore = ZScore()
        h3k4_cover = Cover.create_from_bigwig(
            'h3k4',
            bigwigfiles=[H3K4me3.format(params['traincell'])],
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            store_whole_genome=False,
            normalizer=[LogTransform(), h3k4_zscore],
            cache=True)
        train_input.append(ReduceDim(h3k4_cover, aggregator="mean"))

    if not train_input:
        raise ValueError('no input')

    return (train_input, train_labels)
def test_load_cover_bigwig_resolution1(tmpdir):
    """Check base-pair resolution coverage profiles for regions 4 and 52.

    For each storage backend, the per-position signal of region 4 must show
    one run of 36 ones and region 52 two such runs.
    """
    # Redirect janggu's output/cache directory into the pytest tmpdir.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bwfile_ = os.path.join(resources, "sample.bw")
    bed_file = os.path.join(resources, "sample.bed")

    # Expected 200-position profiles, written as runs of zeros and ones.
    # Region 4: 143 zeros, 36 ones, 21 zeros (143 + 36 + 21 = 200).
    one_read = np.concatenate([np.zeros(143), np.ones(36), np.zeros(21)])
    # Region 52: 10 zeros, 36 ones, 44 zeros, 36 ones, 74 zeros (= 200).
    two_reads = np.concatenate([np.zeros(10), np.ones(36), np.zeros(44),
                                np.ones(36), np.zeros(74)])

    for store in ['ndarray', 'hdf5', 'sparse']:
        print(store)
        cover = Cover.create_from_bigwig(
            "cov",
            bigwigfiles=bwfile_,
            roi=bed_file,
            binsize=200,
            stepsize=200,
            resolution=1,
            storage=store,
            cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

        # there is one read in the region 4
        np.testing.assert_allclose(cover[4].sum(), 36)
        np.testing.assert_equal(cover[4][0, :, 0, 0], one_read)

        # and two reads in region 52
        np.testing.assert_allclose(cover[52].sum(), 2*36)
        np.testing.assert_equal(cover[52][0, :, 0, 0], two_reads)
def test_bigwig_inferred_binsize():
    """Create a Cover without passing binsize and check the resulting shape.

    With ``positive.bed`` as regions the test expects 25 regions of
    200 positions each.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions_path = os.path.join(resources, "positive.bed")
    bigwig_path = os.path.join(resources, "sample.bw")

    # Note: no binsize/stepsize arguments are given here.
    cover = Cover.create_from_bigwig('test',
                                     bigwigfiles=bigwig_path,
                                     resolution=1,
                                     regions=regions_path,
                                     storage='ndarray')

    assert len(cover) == 25
    assert cover.shape == (25, 200, 1, 1)
def test_cover_export_bigwig(tmpdir):
    """Round-trip a Cover through ``export_to_bigwig`` and reload it.

    For each combination of resolution (1, 50) and store_whole_genome
    (True, False) the dataset is exported into the pytest tmpdir, reloaded
    from the file expected at ``<name>.<condition>.bigwig``, and compared
    with the original in shape and total signal.
    """
    path = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, "sample.bed")
    bwfile_ = os.path.join(data_path, "sample.bw")

    for resolution in [1, 50]:
        # Named after the keyword it feeds, so it is not confused with the
        # unrelated storage='ndarray' argument below.
        for store_whole_genome in [True, False]:
            print('resolution=', resolution)
            print('store_whole_genome', store_whole_genome)

            cover = Cover.create_from_bigwig(
                'test',
                bigwigfiles=bwfile_,
                resolution=resolution,
                binsize=200,
                roi=bed_file,
                store_whole_genome=store_whole_genome,
                storage='ndarray')

            cover.export_to_bigwig(output_dir=path)

            # Build the exported file's path portably rather than with a
            # hard-coded '/' separator.
            exported = os.path.join(
                path,
                '{name}.{sample}.bigwig'.format(name=cover.name,
                                                sample=cover.conditions[0]))

            cov2 = Cover.create_from_bigwig(
                'test',
                bigwigfiles=exported,
                resolution=resolution,
                binsize=200,
                roi=bed_file,
                store_whole_genome=store_whole_genome,
                storage='ndarray')

            assert cover.shape == (100, 200 // resolution, 1, 1)
            assert cover.shape == cov2.shape

            # The total signal scales inversely with the resolution.
            np.testing.assert_allclose(cover[:].sum(), 1044.0 / resolution)
            np.testing.assert_allclose(cov2[:].sum(), 1044.0 / resolution)
def test_bigwig_genomic_interval_access_part_genome():
    """Compare index access with interval access when store_whole_genome=False.

    Interval-based access is compared against the index-based result
    repeated ``cover.garray.resolution`` times along axis 1. For a non-zero
    shift, the interval is moved by ``shift * reso`` and the overlapping
    part is compared after taking the first position of each
    ``reso``-sized bin.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bw_path = os.path.join(resources, "sample.bw")

    def _first_per_bin(arr, width):
        # Split axis 1 into bins of `width` positions and keep the first
        # position of every bin.
        binned = arr.reshape((1, arr.shape[1]//width, width, 1, 1))
        return binned[:, :, 0, :, :]

    whole_genome = False
    for reso in [1, 50]:
        for shift in [0, 1]:
            cover = Cover.create_from_bigwig(
                'test',
                bigwigfiles=bw_path,
                roi=bed_file,
                flank=0,
                storage='ndarray',
                store_whole_genome=whole_genome,
                resolution=reso)

            for i in range(len(cover)):
                print('storage :', whole_genome,
                      '/ resolution :', reso,
                      '/ shift :', shift)
                interval = cover.gindexer[i]
                print(i, interval)

                # Access by GenomicInterval object.
                np.testing.assert_equal(
                    np.repeat(cover[i], cover.garray.resolution, axis=1),
                    cover[interval])

                # Access by explicit (chrom, start, end, strand) tuple.
                chrom, strand = interval.chrom, interval.strand
                start, end = interval.start, interval.end
                np.testing.assert_equal(
                    np.repeat(cover[i], cover.garray.resolution, axis=1),
                    cover[chrom, start, end, strand])

                if shift == 0:
                    continue

                # Move the window by `shift` bins and compare the overlap.
                offset = shift * reso
                start += offset
                end += offset
                shifted = cover[chrom, start, end, strand]

                if strand != '-':
                    gicov = shifted[:, :-offset, :, :]
                    np.testing.assert_equal(cover[i][:, shift:, :, :],
                                            _first_per_bin(gicov, reso))
                else:
                    gicov = shifted[:, offset:, :, :]
                    np.testing.assert_equal(cover[i][:, :-shift, :, :],
                                            _first_per_bin(gicov, reso))
def test_bigwig_genomic_interval_access():
    """Index access and genomic-interval access must return the same data.

    Runs for store_whole_genome in (True, False), resolution in (1, 50) and
    shift in (0, 1). For a non-zero shift the interval is moved by
    ``shift * reso`` and only the overlapping part is compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bw_path = os.path.join(resources, "sample.bw")

    for whole_genome in [True, False]:
        for reso in [1, 50]:
            for shift in [0, 1]:
                cover = Cover.create_from_bigwig(
                    'test',
                    bigwigfiles=bw_path,
                    regions=bed_file,
                    flank=0,
                    storage='ndarray',
                    store_whole_genome=whole_genome,
                    resolution=reso)

                for i in range(len(cover)):
                    print('storage :', whole_genome,
                          '/ resolution :', reso,
                          '/ shift :', shift)
                    interval = cover.gindexer[i]
                    print(i, interval)

                    # Access by GenomicInterval object.
                    np.testing.assert_equal(cover[i], cover[interval])

                    # Access by explicit (chrom, start, end, strand) tuple.
                    chrom, strand = interval.chrom, interval.strand
                    start, end = interval.start, interval.end
                    np.testing.assert_equal(
                        cover[i], cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    # Move the window and compare the overlapping part.
                    start += shift * reso
                    end += shift * reso
                    shifted = cover[chrom, start, end, strand]

                    if strand != '-':
                        np.testing.assert_equal(
                            cover[i][:, shift:, :, :],
                            shifted[:, :-shift, :, :])
                    else:
                        np.testing.assert_equal(
                            cover[i][:, :-shift, :, :],
                            shifted[:, shift:, :, :])
# Split the input into high/low groups and derive the regions of interest.
data, high, low = get_high_low_data(input, high_hbf, low_hbf)
roi_A, roi = get_roi(high + low)

# Attach the flanked sequence of every region to the data table.
seq = roi2fasta(roi_A, refgenome, flank)
seq_table = pd.DataFrame.from_dict(seq, orient='index')
data['seq'] = seq_table[0]

# 1. One-hot DNA and bigwig coverage via janggu.
dna_A = Bioseq.create_from_refgenome(name='dna',
                                     refgenome=refgenome,
                                     roi=roi_A,
                                     flank=flank)
Tn5 = Cover.create_from_bigwig('bigwig_coverage',
                               bigwigfiles=bw_file,
                               roi=roi,
                               binsize=1,
                               stepsize=1,
                               flank=flank)

# 2. Reshape to (regions, window[, 4]); the window spans `flank` positions
#    on either side of the centre position.
n_regions = len(high + low)
window = flank * 2 + 1
dna_A = np.reshape(dna_A, (n_regions, window, 4))
bw_values = np.reshape(Tn5, (n_regions, window))

# 3. Read the motifs from the MEME file; each entry carries two matrices,
#    passed on below as motifs[m][0] and motifs[m][1].
motifs = read_motif(meme_file)

# 4. Scan every motif against the one-hot DNA in parallel.
scan_motif = delayed(DNA_motif_scan)
run_parallel = Parallel(n_jobs=-1)
score_list_A = run_parallel(
    scan_motif(dna_A, motifs[m][0], motifs[m][1]) for m in motifs)
import matplotlib.pyplot as plt

from pkg_resources import resource_filename

from janggu.data import Cover
from janggu.data import plotGenomeTrack

# Example: plot two bigwig-derived coverage datasets for chr1:16000-18000
# and save the figure as 'coverage.png'.
roi = resource_filename('janggu', 'resources/sample.bed')
bw_file = resource_filename('janggu', 'resources/sample.bw')

# Binning shared by both datasets.
binning = dict(roi=roi, binsize=200, stepsize=200, resolution=50)

# First dataset: the same bigwig file loaded twice as conditions
# 'rep1' and 'rep2'.
cover = Cover.create_from_bigwig('coverage1',
                                 bigwigfiles=[bw_file] * 2,
                                 conditions=['rep1', 'rep2'],
                                 **binning)

# Second dataset: a single bigwig file with default condition naming.
cover2 = Cover.create_from_bigwig('coverage2',
                                  bigwigfiles=bw_file,
                                  **binning)

a = plotGenomeTrack([cover, cover2], 'chr1', 16000, 18000)
a.savefig('coverage.png')
# plt.show(a)
def test_cover_from_bigwig_sanity():
    """Sanity checks for Cover.create_from_bigwig.

    Covers the number of regions and storage handles with and without
    store_whole_genome, a whole-genome dataset created without regions,
    and a set of invalid arguments that must be rejected.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bwfile_ = os.path.join(resources, "sample.bw")

    # Overlapping windows (stepsize < binsize), one stored array per window.
    cover = Cover.create_from_bigwig('test',
                                     bigwigfiles=bwfile_,
                                     regions=bed_file,
                                     binsize=200, stepsize=50,
                                     resolution=50,
                                     flank=0,
                                     storage='ndarray')
    cover[0]
    assert len(cover.gindexer) == 394
    assert len(cover.garray.handle) == 394

    # Same windows, but with store_whole_genome=True only two handles exist.
    cover = Cover.create_from_bigwig('test',
                                     bigwigfiles=bwfile_,
                                     regions=bed_file,
                                     binsize=200, stepsize=50,
                                     resolution=50,
                                     flank=0,
                                     storage='ndarray',
                                     store_whole_genome=True)
    cover[0]
    assert len(cover.gindexer) == 394
    assert len(cover.garray.handle) == 2

    # Without regions: interval access works, but shape and integer
    # indexing are expected to raise.
    cov2 = Cover.create_from_bigwig('test',
                                    bigwigfiles=bwfile_,
                                    resolution=7,
                                    storage='ndarray',
                                    store_whole_genome=True)
    assert len(cov2.garray.handle) == 2
    assert cov2['chr1', 100, 200].shape == (1, 100 // 7 + 1, 1, 1)

    with pytest.raises(Exception):
        cov2.shape

    with pytest.raises(Exception):
        cov2[0]

    # Each entry is (name, extra keyword arguments); every call must raise.
    rejected = [
        # name must be a string
        (1.2, dict(binsize=1, stepsize=1)),
        # negative flank
        ('test', dict(binsize=1, stepsize=1, flank=-1)),
        # negative stepsize
        ('test', dict(binsize=1, stepsize=-1, flank=0)),
        # negative binsize
        ('test', dict(binsize=-1, stepsize=1, flank=0)),
        # resolution (300) larger than both binsize (200) and stepsize (50)
        ('test', dict(binsize=200, stepsize=50, resolution=300, flank=0)),
    ]
    for bad_name, bad_kwargs in rejected:
        with pytest.raises(Exception):
            Cover.create_from_bigwig(bad_name,
                                     bigwigfiles=bwfile_,
                                     regions=bed_file,
                                     storage='ndarray',
                                     **bad_kwargs)