Exemple #1
0
def test_bam_store_whole_genome_option():
    """Compare coverage loaded with and without ``store_whole_genome``.

    Both datasets are built from the same BAM file and regions; the test
    asserts that their length, shape and contents are identical.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bamfile_ = os.path.join(resources, "sample.bam")
    bed_file = os.path.join(resources, "sample.bed")

    # Arguments shared by both datasets; only the name and the
    # store_whole_genome flag differ between them.
    shared = dict(bamfiles=bamfile_,
                  regions=bed_file,
                  binsize=200,
                  stepsize=200,
                  storage='ndarray')

    whole = Cover.create_from_bam('test', store_whole_genome=True, **shared)
    partial = Cover.create_from_bam('test2', store_whole_genome=False,
                                    **shared)

    assert len(whole) == 100
    assert len(partial) == len(whole)
    assert whole.shape == (100, 200, 2, 1)
    assert whole.shape == partial.shape
    np.testing.assert_equal(whole[:], partial[:])
    assert whole[:].sum() == 29.
Exemple #2
0
def test_cover_from_bam_sanity():
    """Sanity-check indexing and argument validation of BAM-backed covers."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bamfile_ = os.path.join(resources, "sample.bam")

    roi_cover = Cover.create_from_bam('test',
                                      bamfiles=bamfile_,
                                      roi=bed_file,
                                      binsize=200,
                                      stepsize=200,
                                      flank=0,
                                      storage='ndarray')
    # An integer index is expected to work ...
    roi_cover[0]

    # ... whereas a float index is expected to raise.
    with pytest.raises(IndexError):
        roi_cover[1.2]

    genome_cover = Cover.create_from_bam('test',
                                         bamfiles=bamfile_,
                                         storage='ndarray',
                                         store_whole_genome=True)

    assert len(roi_cover.gindexer) == len(roi_cover.garray.handle)
    assert len(genome_cover.garray.handle) != len(roi_cover.garray.handle)

    # The dataset name must be a string.
    with pytest.raises(Exception):
        Cover.create_from_bam(1.2,
                              bamfiles=bamfile_,
                              roi=bed_file,
                              binsize=1,
                              stepsize=1,
                              storage='ndarray')

    # Each combination carries exactly one negative value and must fail.
    invalid_settings = (
        dict(binsize=1, stepsize=1, flank=-1),   # negative flank
        dict(binsize=1, stepsize=-1, flank=0),   # negative stepsize
        dict(binsize=-1, stepsize=1, flank=0),   # negative binsize
    )
    for settings in invalid_settings:
        with pytest.raises(Exception):
            Cover.create_from_bam('test',
                                  bamfiles=bamfile_,
                                  roi=bed_file,
                                  storage='ndarray',
                                  **settings)
def get_data(params):
    """Assemble training inputs and labels as selected by ``params``.

    The labels are the mean-reduced coverage of the RNA BAM file.
    ``params['inputs']`` selects which inputs are built: a DNA sequence
    dataset for ``'dna_only'``/``'epi_dna'`` and mean-reduced DNase and
    H3K4me3 coverage for ``'epi_only'``/``'epi_dna'``.

    Returns:
        A tuple ``(train_input, train_labels)`` where ``train_input`` is a
        list of datasets.

    Raises:
        ValueError: if ``params['inputs']`` selects no input at all.
    """
    label_zscore = ZScore()
    expression = Cover.create_from_bam(
        'geneexpr',
        bamfiles=RNA.format(params['traincell'], params['trainrep']),
        roi=ROI_INPUT_TRAIN,
        flank=params['cageflank'],
        conditions=['GeneExpr'],
        resolution=None,
        store_whole_genome=False,
        storage='ndarray',
        normalizer=[LogTransform(), label_zscore],
        stranded=False,
        cache=True)
    train_labels = ReduceDim(expression, aggregator="mean")

    selection = params['inputs']
    train_input = []

    if selection in ('dna_only', 'epi_dna'):
        # DNA sequence input
        sequence = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False)
        train_input.append(sequence)

    if selection in ('epi_only', 'epi_dna'):
        # Each track gets its own ZScore instance.
        dnase_zscore = ZScore()
        dnase = Cover.create_from_bam(
            'dnase',
            bamfiles=DNASE.format(params['traincell']),
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            resolution=None,
            store_whole_genome=False,
            normalizer=[LogTransform(), dnase_zscore],
            cache=True)
        train_input.append(ReduceDim(dnase, aggregator="mean"))

        h3k4_zscore = ZScore()
        h3k4 = Cover.create_from_bigwig(
            'h3k4',
            bigwigfiles=[H3K4me3.format(params['traincell'])],
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            store_whole_genome=False,
            normalizer=[LogTransform(), h3k4_zscore],
            cache=True)
        train_input.append(ReduceDim(h3k4, aggregator="mean"))

    if not train_input:
        raise ValueError('no input')
    return train_input, train_labels
Exemple #4
0
def test_load_bam_resolution10(tmpdir):
    """Check BAM coverage loaded with ``resolution=10`` for each storage mode.

    With ``binsize=200`` the test expects 20 bins per region and spot-checks
    where individual counts fall in regions 4, 13, 52 and 96.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bamfile_ = os.path.join(resources, "sample.bam")
    gsfile_ = os.path.join(resources, 'sample.chrom.sizes')
    bed_file = os.path.join(resources, "sample.bed")

    chrom_table = pandas.read_csv(gsfile_,
                                  sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    # (region index, total count, expected positions, expected strand indices)
    spot_checks = (
        (13, 2., [16, 17], [0, 0]),
        (52, 2., [0, 8], [0, 0]),
        (96, 1., [2], [1]),
    )

    for store in ('ndarray', 'hdf5', 'sparse'):
        cover = Cover.create_from_bam("yeast_I_II_III.bam",
                                      bamfiles=bamfile_,
                                      regions=bed_file,
                                      binsize=200,
                                      stepsize=200,
                                      genomesize=gsize,
                                      resolution=10,
                                      storage=store,
                                      cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 20, 2, 1))

        # Region 4 holds a single count; only the first hit is inspected.
        hits = np.where(cover[4] == 1)
        np.testing.assert_equal(cover[4].sum(), 1.)
        np.testing.assert_equal(hits[1][0], 17)  # pos
        np.testing.assert_equal(hits[2][0], 1)  # strand

        # Remaining regions: compare all hit positions and strand indices.
        for region, total, positions, strands in spot_checks:
            hits = np.where(cover[region] == 1)
            np.testing.assert_equal(cover[region].sum(), total)
            np.testing.assert_equal(hits[1], np.asarray(positions))  # pos
            np.testing.assert_equal(hits[2], np.asarray(strands))  # strand
Exemple #5
0
def test_bam_inferred_binsize():
    """Load BAM coverage without an explicit binsize and check its shape.

    No ``binsize``/``stepsize`` is passed; the test expects 25 regions of
    length 200 from ``positive.bed``.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bamfile_ = os.path.join(resources, "sample.bam")
    bed_file = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bam(
        'test',
        bamfiles=bamfile_,
        regions=bed_file,
        flank=0,
        storage='ndarray')

    expected_regions = 25
    assert len(cover) == expected_regions
    assert cover.shape == (expected_regions, 200, 2, 1)
Exemple #6
0
def test_load_bam_resolutionNone(tmpdir):
    """Compare ``resolution=None`` coverage against base-pair resolution.

    For every storage mode, the dataset loaded with ``resolution=None`` is
    expected to have a single bin per region, and summing the
    ``resolution=1`` dataset over axis 1 must give the same totals.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bamfile_ = os.path.join(resources, "sample.bam")
    gsfile_ = os.path.join(resources, 'sample.chrom.sizes')
    bed_file = os.path.join(resources, "sample.bed")

    chrom_table = pandas.read_csv(gsfile_, sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    for store in ('ndarray', 'hdf5', 'sparse'):
        # Arguments common to both datasets for this storage mode.
        shared = dict(bamfiles=bamfile_,
                      roi=bed_file,
                      binsize=200,
                      stepsize=200,
                      genomesize=gsize,
                      storage=store,
                      cache=True)

        per_base = Cover.create_from_bam("yeast_I_II_III.bam",
                                         resolution=1,
                                         **shared)
        collapsed = Cover.create_from_bam("yeast_I_II_III.bam",
                                          resolution=None,
                                          datatags=['None'],
                                          **shared)

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 2, 1))

        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
Exemple #7
0
def test_bam_genomic_interval_access_part_genome():
    """Check interval-based access on a cover that stores only the ROI.

    For resolutions 1 and 50, indexing by integer (repeated along axis 1 by
    the resolution) must match indexing by the corresponding genomic
    interval, and intervals shifted by one bin must match the overlapping
    part of the integer-indexed data.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bamfile_ = os.path.join(resources, "sample.bam")

    storage = False
    for reso in (1, 50):
        for shift in (0, 1):
            cover = Cover.create_from_bam('test',
                                          bamfiles=bamfile_,
                                          roi=bed_file,
                                          flank=0,
                                          storage='ndarray',
                                          store_whole_genome=storage,
                                          resolution=reso)

            for idx in range(len(cover)):
                print('storage :', storage, '/ resolution :', reso,
                      '/ shift :', shift)
                print(idx, cover.gindexer[idx])

                # Access via the interval object itself.
                np.testing.assert_equal(
                    np.repeat(cover[idx], cover.garray.resolution, axis=1),
                    cover[cover.gindexer[idx]])

                interval = cover.gindexer[idx]
                chrom, strand = interval.chrom, interval.strand
                start, end = interval.start, interval.end

                # Access via explicit coordinates.
                np.testing.assert_equal(
                    np.repeat(cover[idx], cover.garray.resolution, axis=1),
                    cover[chrom, start, end, strand])

                if shift == 0:
                    continue

                offset = shift * reso
                start += offset
                end += offset
                moved = cover[chrom, start, end, strand]

                if strand == '-':
                    gicov = moved[:, offset:, :, :]
                    reference = cover[idx][:, :-shift, :, :]
                else:
                    gicov = moved[:, :-offset, :, :]
                    reference = cover[idx][:, shift:, :, :]

                # Keep the first base of every bin of the interval view.
                per_bin = gicov.reshape(
                    (1, gicov.shape[1] // reso, reso, 2, 1))[:, :, 0, :, :]
                np.testing.assert_equal(reference, per_bin)
Exemple #8
0
def test_cover_bam_unstranded():
    """Check BAM coverage loaded with ``stranded=False``.

    The test expects a single strand channel and spot-checks the positions
    of individual counts in regions 4, 13, 52 and 96.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bamfile_ = os.path.join(resources, "sample.bam")
    gsfile_ = os.path.join(resources, 'sample.chrom.sizes')
    bed_file = os.path.join(resources, "sample.bed")

    chrom_table = pandas.read_csv(gsfile_,
                                  sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    cover = Cover.create_from_bam("yeast_I_II_III.bam",
                                  bamfiles=bamfile_,
                                  regions=bed_file,
                                  binsize=200,
                                  stepsize=200,
                                  genomesize=gsize,
                                  stranded=False)

    np.testing.assert_equal(len(cover), 100)
    np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

    # Region 4 holds a single count; only the first hit is inspected.
    hits = np.where(cover[4] == 1)
    np.testing.assert_equal(cover[4].sum(), 1.)
    np.testing.assert_equal(hits[1][0], 179)  # pos

    # (region index, total count, expected positions)
    spot_checks = (
        (13, 2., [162, 178]),
        (52, 2., [9, 89]),
        (96, 1., [25]),
    )
    for region, total, positions in spot_checks:
        hits = np.where(cover[region] == 1)
        np.testing.assert_equal(cover[region].sum(), total)
        np.testing.assert_equal(hits[1], np.asarray(positions))  # pos
Exemple #9
0
def test_bam_genomic_interval_access():
    """Check that integer and genomic-interval indexing agree.

    Runs for both ``store_whole_genome`` settings and resolutions 1 and 50.
    Intervals shifted by one bin must match the overlapping part of the
    integer-indexed data.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, "sample.bed")
    bamfile_ = os.path.join(resources, "sample.bam")

    for storage in (True, False):
        for reso in (1, 50):
            for shift in (0, 1):
                cover = Cover.create_from_bam(
                    'test',
                    bamfiles=bamfile_,
                    regions=bed_file,
                    flank=0,
                    storage='ndarray',
                    store_whole_genome=storage,
                    resolution=reso)

                for idx in range(len(cover)):
                    print('storage :', storage, '/ resolution :', reso,
                          '/ shift :', shift)
                    print(idx, cover.gindexer[idx])

                    # Access via the interval object itself.
                    np.testing.assert_equal(cover[idx],
                                            cover[cover.gindexer[idx]])

                    interval = cover.gindexer[idx]
                    chrom, strand = interval.chrom, interval.strand
                    start, end = interval.start, interval.end

                    # Access via explicit coordinates.
                    np.testing.assert_equal(
                        cover[idx], cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    start += shift * reso
                    end += shift * reso

                    if strand == '-':
                        np.testing.assert_equal(
                            cover[idx][:, :-shift, :, :],
                            cover[chrom, start, end, strand][:, shift:, :, :])
                    else:
                        np.testing.assert_equal(
                            cover[idx][:, shift:, :, :],
                            cover[chrom, start, end, strand][:, :-shift, :, :])
Exemple #10
0
def test_cover_bam_paired_midpoint():
    """Check midpoint counting of paired-end reads with a MAPQ filter.

    sample2.bam contains paired-end examples, unmapped examples, an
    unmapped mate and a low-quality example; exactly two counts are
    expected on the 'ref' sequence.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bamfile_ = os.path.join(resources, "sample2.bam")

    cover = Cover.create_from_bam(
        "yeast_I_II_III.bam",
        bamfiles=bamfile_,
        stranded=False,
        pairedend='midpoint',
        min_mapq=30,
        store_whole_genome=True)

    ref_counts = cover.garray.handle['ref']
    assert ref_counts.sum() == 2, ref_counts
    print(ref_counts)

    # One read starts at index 6 with a template length of 39, so its
    # midpoint is expected at 6 + 39 // 2.
    read_start, template_len = 6, 39
    assert ref_counts[read_start + template_len // 2, 0, 0] == 1

    # Another read maps to index 34.
    assert ref_counts[34, 0, 0] == 1
Exemple #11
0
def get_data(params):
    """Build train/validation/test datasets for the selected model type.

    Labels come from the JunD peak BED file. The datasets are split with
    ``split_train_test`` on 'chr3' (test) and 'chr2' (validation).

    Args:
        params: mapping with at least ``'binsize'`` and ``'type'``.
            ``'type'`` must be one of ``'dna_only'``, ``'dnase_dna'`` or
            ``'dnase_bam_only'``. DNA types additionally read
            ``'dnaflank'`` and ``'order'``; accessibility types read
            ``'dnaseflank'``, ``'normalize'`` and, optionally,
            ``'augment'`` (``'orient'``, ``'scale'`` or ``'both'``).

    Returns:
        Three ``(inputs, labels)`` tuples for training, validation and test.

    Raises:
        ValueError: if ``params['type']`` is not a supported type.
    """
    binsize = params['binsize']

    # PEAKS
    LABELS = ReduceDim(Cover.create_from_bed('peaks',
                                             bedfiles=PEAKS,
                                             roi=ROI,
                                             binsize=binsize,
                                             conditions=['JunD'],
                                             resolution=binsize,
                                             store_whole_genome=True,
                                             storage='sparse',
                                             cache=True),
                       aggregator='max')

    # training on chr1, validation on chr2, test on chr3 with swapped Dnase samples
    LABELS, LABELS_TEST = split_train_test(LABELS, 'chr3')
    LABELS_TRAIN, LABELS_VAL = split_train_test(LABELS, 'chr2')

    model_type = params['type']

    if model_type in ('dna_only', 'dnase_dna'):
        dnaflank = params['dnaflank']
        order = params['order']
        # DNA
        DNA = Bioseq.create_from_refgenome('dna',
                                           refgenome=REFGENOME,
                                           roi=ROI,
                                           binsize=binsize,
                                           flank=dnaflank,
                                           order=order,
                                           cache=True,
                                           store_whole_genome=True)

        DNA, DNA_TEST = split_train_test(DNA, 'chr3')
        DNA_TRAIN, DNA_VAL = split_train_test(DNA, 'chr2')

    if model_type in ('dnase_bam_only', 'dnase_dna'):
        dnaseflank = params['dnaseflank']
        # ACCESSIBILITY
        # The test dataset lists the two BAM files/conditions in the
        # opposite order to the training dataset (swapped DNase samples).
        ACCESS_TEST = Cover.create_from_bam(
            'dnase',
            bamfiles=[DNASE_STAM_ENCODE, DNASE_STAM_ROADMAP],
            roi=ROI,
            binsize=binsize,
            conditions=['Encode', 'Roadmap'],
            flank=dnaseflank,
            resolution=50,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)
        ACCESS = Cover.create_from_bam(
            'dnase',
            roi=ROI,
            bamfiles=[DNASE_STAM_ROADMAP, DNASE_STAM_ENCODE],
            binsize=binsize,
            conditions=['Roadmap', 'Encode'],
            resolution=50,
            flank=dnaseflank,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)

        _, ACCESS_TEST = split_train_test(ACCESS_TEST, 'chr3')
        ACCESS, _ = split_train_test(ACCESS, 'chr3')
        ACCESS_TRAIN, ACCESS_VAL = split_train_test(ACCESS, 'chr2')

        # Augment the training accessibility data for every type that uses
        # it. The guard must name 'dnase_dna' (the spelling used by all
        # other branches); a misspelled type would skip augmentation for
        # the combined model. ``.get`` keeps callers that omit 'augment'
        # working without augmentation.
        augment = params.get('augment')
        if augment == 'orient':
            ACCESS_TRAIN = RandomOrientation(ACCESS_TRAIN)
        elif augment == 'scale':
            ACCESS_TRAIN = RandomSignalScale(ACCESS_TRAIN, 0.1)
        elif augment == 'both':
            ACCESS_TRAIN = RandomSignalScale(RandomOrientation(ACCESS_TRAIN),
                                             0.1)

    if model_type == 'dna_only':
        return (DNA_TRAIN, LABELS_TRAIN), (DNA_VAL, LABELS_VAL), \
               (DNA_TEST, LABELS_TEST)
    if model_type == 'dnase_dna':
        return ([DNA_TRAIN, ACCESS_TRAIN], LABELS_TRAIN), \
               ([DNA_VAL, ACCESS_VAL], LABELS_VAL), \
               ([DNA_TEST, ACCESS_TEST], LABELS_TEST)
    if model_type == 'dnase_bam_only':
        return ([ACCESS_TRAIN], LABELS_TRAIN), \
               ([ACCESS_VAL], LABELS_VAL), \
               ([ACCESS_TEST], LABELS_TEST)

    # Fail loudly instead of implicitly returning None for an unknown type.
    raise ValueError(
        "unsupported params['type']: {!r}; expected 'dna_only', "
        "'dnase_dna' or 'dnase_bam_only'".format(model_type))