Ejemplo n.º 1
0
def test_load_cover_bigwig_resolutionNone(tmpdir):
    """Compare a ``resolution=None`` bigwig track with a per-base track.

    For each storage backend two coverage tracks are built from the same
    bigwig file and ROI: one with ``resolution=1`` and one with
    ``resolution=None`` and ``collapser='sum'``. The collapsed track must
    have shape ``(100, 1, 1, 1)`` and equal the per-base track summed over
    axis 1.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bigwig_path = os.path.join(resources, "sample.bw")
    roi_path = os.path.join(resources, "sample.bed")

    # Keyword arguments shared by both tracks.
    shared = dict(bigwigfiles=bigwig_path,
                  roi=roi_path,
                  binsize=200,
                  stepsize=200,
                  cache=True)

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print(backend)

        # Base-pair resolution reference track.
        per_base = Cover.create_from_bigwig(
            "cov", resolution=1, storage=backend, **shared)

        # Track with one collapsed value per region.
        collapsed = Cover.create_from_bigwig(
            "cov",
            resolution=None,
            storage=backend,
            datatags=['None'],
            collapser='sum',
            **shared)

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 1, 1))

        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
Ejemplo n.º 2
0
def test_create_from_array_whole_genome_true(tmpdir):
    """Round-trip a whole-genome Cover through ``Cover.create_from_array``.

    A label track is created from a BED file, its values are extracted as
    an array and fed back into ``Cover.create_from_array`` for each storage
    backend. The rebuilt track must match the original in values and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    # The same peak file is supplied five times.
    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=[peak_path] * 5,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=True)

    values = labels[:]

    for backend in ('ndarray', 'sparse', 'hdf5'):
        print(backend)
        rebuilt = Cover.create_from_array(
            'BindingProba',
            values,
            labels.gindexer,
            cache=True,
            storage=backend,
            store_whole_genome=True)

        np.testing.assert_equal(rebuilt[:], labels[:])
        np.testing.assert_equal(rebuilt.shape, labels.shape)
Ejemplo n.º 3
0
def test_bam_store_whole_genome_option():
    """Check that ``store_whole_genome`` does not change BAM coverage.

    Two tracks are loaded from the same BAM file and regions, differing
    only in ``store_whole_genome`` (and name). Length, shape and values
    must agree, and the total count must be 29.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    # Keyword arguments shared by both tracks.
    common = dict(bamfiles=bam_path,
                  regions=roi_path,
                  binsize=200,
                  stepsize=200,
                  storage='ndarray')

    whole = Cover.create_from_bam('test', store_whole_genome=True, **common)
    partial = Cover.create_from_bam('test2', store_whole_genome=False,
                                    **common)

    assert len(whole) == 100
    assert len(partial) == len(whole)
    assert whole.shape == (100, 200, 2, 1)
    assert whole.shape == partial.shape
    np.testing.assert_equal(whole[:], partial[:])
    assert whole[:].sum() == 29.
Ejemplo n.º 4
0
def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions in a Cover with ``store_whole_genome=False``.

    A DNA input and a label track are loaded for the same ROI, a small
    convolutional model is created and compiled, and its predictions are
    converted into a Cover via ``Cover.create_from_array``. The Cover must
    reproduce the prediction array, and both its genomic indexer and its
    array handle must have one entry per prediction.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    genome_path = resource_filename('janggu', 'resources/pseudo_genome.fa')
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=False,
        datatags=['ref'])

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=peak_path,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=False,
        datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is indexed as (number of filters, kernel length, activation).
        n_filters = params[0]
        kernel_len = params[1]
        activation = params[2]
        with inputs.use('dna') as layer:
            conv = Conv2D(n_filters, (kernel_len, 1), activation=activation)
            layer = DnaConv2D(conv)(layer)
        # Pool over the full extent of axis 1 of the convolution output.
        pooling = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1], name='motif')
        output = pooling(layer)
        return inputs, output

    K.clear_session()

    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)

    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    predictions = model.predict(dna)

    wrapped = Cover.create_from_array('BindingProba',
                                      predictions,
                                      labels.gindexer,
                                      store_whole_genome=False)

    assert predictions.shape == wrapped.shape

    np.testing.assert_equal(predictions, wrapped[:])

    n_predictions = len(predictions)
    assert len(wrapped.gindexer) == n_predictions
    assert len(wrapped.garray.handle) == n_predictions
Ejemplo n.º 5
0
def test_plotgenometracks():
    """Smoke-test ``plotGenomeTrack`` with a list of tracks and a single track.

    Two bigwig coverage tracks are created with identical arguments and
    plotted for chr1:16000-18000, first together as a list and then the
    first track alone. The test only requires that both calls complete.
    """
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    bigwig_path = pkg_resources.resource_filename('janggu',
                                                  'resources/sample.bw')

    # Both tracks are built from exactly the same arguments.
    track_args = dict(bigwigfiles=bigwig_path,
                      roi=roi_path,
                      binsize=200,
                      stepsize=200,
                      resolution=50)
    first = Cover.create_from_bigwig('coverage2', **track_args)
    second = Cover.create_from_bigwig('coverage2', **track_args)

    region = ('chr1', 16000, 18000)
    result = plotGenomeTrack([first, second], *region)
    result = plotGenomeTrack(first, *region)
Ejemplo n.º 6
0
def test_channel_last_first():
    """Check the array layout produced by the ``channel_last`` option.

    The same bigwig track is loaded with ``channel_last=True`` and
    ``channel_last=False``. The shapes must be ``(100, 200, 1, 1)`` and
    ``(100, 1, 200, 1)`` respectively, and transposing the second layout
    with axes ``(0, 2, 3, 1)`` must reproduce the first.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bigwig_path = os.path.join(resources, "sample.bw")

    # Everything except channel_last is shared between the two tracks.
    common = dict(bigwigfiles=bigwig_path,
                  resolution=1,
                  binsize=200,
                  regions=roi_path,
                  store_whole_genome=True,
                  storage='ndarray')

    last = Cover.create_from_bigwig('test', channel_last=True, **common)
    assert last.shape == (100, 200, 1, 1)
    assert last[0].shape == (1, 200, 1, 1)

    first = Cover.create_from_bigwig('test', channel_last=False, **common)
    assert first.shape == (100, 1, 200, 1)
    assert first[0].shape == (1, 1, 200, 1)

    np.testing.assert_equal(last[0], np.transpose(first[0], (0, 2, 3, 1)))
Ejemplo n.º 7
0
def test_cover_from_bam_sanity():
    """Sanity checks for ``Cover.create_from_bam`` indexing and arguments.

    Verifies that integer indexing works, that a float index raises
    ``IndexError``, that a ROI-restricted track and a whole-genome track
    have different numbers of stored arrays, and that a non-string name,
    a negative flank, a negative stepsize and a negative binsize are each
    rejected with an exception.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    roi_cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        flank=0,
        storage='ndarray')

    # Integer indexing must succeed.
    roi_cover[0]

    with pytest.raises(IndexError):
        # A float is not accepted as an index.
        roi_cover[1.2]

    genome_cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        storage='ndarray',
        store_whole_genome=True)

    assert len(roi_cover.gindexer) == len(roi_cover.garray.handle)
    assert len(genome_cover.garray.handle) != len(roi_cover.garray.handle)

    # Each entry is (name, extra keyword arguments) for a call that must fail.
    invalid_calls = (
        (1.2, dict(binsize=1, stepsize=1)),                # name not a string
        ('test', dict(binsize=1, stepsize=1, flank=-1)),   # negative flank
        ('test', dict(binsize=1, stepsize=-1, flank=0)),   # negative stepsize
        ('test', dict(binsize=-1, stepsize=1, flank=0)),   # negative binsize
    )
    for name, bad_kwargs in invalid_calls:
        with pytest.raises(Exception):
            Cover.create_from_bam(
                name,
                bamfiles=bam_path,
                roi=roi_path,
                storage='ndarray',
                **bad_kwargs)
Ejemplo n.º 8
0
def test_bed_unsync_roi_targets():
    """Load BED targets from a file that differs from the ROI file.

    The targets come from ``positive_shift.bed`` and the ROI from
    ``positive.bed``. For several combinations of ``resolution`` and
    ``store_whole_genome`` the test checks the number of regions, the
    dataset shape and the total of all values.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "positive.bed")
    shifted_path = os.path.join(resources, "positive_shift.bed")

    # (extra keyword arguments, expected shape, expected total)
    scenarios = (
        (dict(resolution=None),
         (25, 1, 1, 1), 25),
        (dict(resolution=50),
         (25, 4, 1, 1), 25 * 4),
        (dict(resolution=50, store_whole_genome=True),
         (25, 4, 1, 1), 25 * 4),
        (dict(resolution=1, store_whole_genome=False),
         (25, 200, 1, 1), 25 * 200 - 2),
        (dict(resolution=1, store_whole_genome=True),
         (25, 200, 1, 1), 25 * 200 - 2),
    )

    for extra, expected_shape, expected_total in scenarios:
        cover = Cover.create_from_bed(
            'test',
            bedfiles=shifted_path,
            roi=roi_path,
            storage='ndarray',
            **extra)
        assert len(cover) == 25
        assert cover.shape == expected_shape
        assert cover[:].sum() == expected_total
Ejemplo n.º 9
0
def get_data(params):
    """Build the training inputs and labels selected by ``params``.

    The labels are a BAM coverage track wrapped in ``ReduceDim``. Depending
    on ``params['inputs']`` the inputs contain a DNA sequence dataset
    (``'dna_only'`` or ``'epi_dna'``) and/or two coverage tracks named
    ``'dnase'`` and ``'h3k4'`` (``'epi_only'`` or ``'epi_dna'``).

    Keys read from ``params``: ``traincell``, ``trainrep``, ``cageflank``,
    ``inputs``, and, depending on the selected inputs, ``dnaflank``,
    ``order`` and ``dnaseflank``.

    Returns:
        A tuple ``(train_input, train_labels)`` where ``train_input`` is a
        list of datasets.

    Raises:
        ValueError: if ``params['inputs']`` selects no input at all.
    """
    label_cover = Cover.create_from_bam(
        'geneexpr',
        bamfiles=RNA.format(params['traincell'], params['trainrep']),
        roi=ROI_INPUT_TRAIN,
        flank=params['cageflank'],
        conditions=['GeneExpr'],
        resolution=None,
        store_whole_genome=False,
        storage='ndarray',
        normalizer=[LogTransform(), ZScore()],
        stranded=False,
        cache=True)
    train_labels = ReduceDim(label_cover, aggregator="mean")

    selection = params['inputs']
    train_input = []

    if selection in ('dna_only', 'epi_dna'):
        # DNA sequence input
        train_input.append(Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False))

    if selection in ('epi_only', 'epi_dna'):
        # Each track gets its own ZScore instance.
        dnase_cover = Cover.create_from_bam(
            'dnase',
            bamfiles=DNASE.format(params['traincell']),
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            resolution=None,
            store_whole_genome=False,
            normalizer=[LogTransform(), ZScore()],
            cache=True)
        train_input.append(ReduceDim(dnase_cover, aggregator="mean"))

        # The h3k4 track reuses the 'dnaseflank' setting.
        h3k4_cover = Cover.create_from_bigwig(
            'h3k4',
            bigwigfiles=[H3K4me3.format(params['traincell'])],
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            store_whole_genome=False,
            normalizer=[LogTransform(), ZScore()],
            cache=True)
        train_input.append(ReduceDim(h3k4_cover, aggregator="mean"))

    if not train_input:
        raise ValueError('no input')
    return (train_input, train_labels)
Ejemplo n.º 10
0
def test_load_cover_bed_binary(tmpdir):
    """Load a scored BED file in ``mode='binary'`` for every storage backend.

    Three variants are checked per backend: ``resolution=200``,
    ``resolution=50`` and ``dimmode='first'`` without an explicit
    resolution. For each the number of regions, the shape, and the sums of
    regions 0 and 4 are asserted.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    scored_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    # (dataset name, extra keyword arguments, expected shape,
    #  expected sum of region 4)
    variants = (
        ("cov", dict(resolution=200), (100, 1, 1, 1), 1),
        ("cov50", dict(resolution=50), (100, 4, 1, 1), 4 * 1),
        ("cov50_firstdim", dict(dimmode='first'), (100, 1, 1, 1), 1),
    )

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print('store', backend)
        for name, extra, expected_shape, region4_sum in variants:
            cover = Cover.create_from_bed(
                name,
                bedfiles=scored_path,
                regions=roi_path,
                binsize=200,
                stepsize=200,
                storage=backend,
                mode='binary',
                cache=True,
                **extra)
            np.testing.assert_equal(len(cover), 100)
            np.testing.assert_equal(cover.shape, expected_shape)
            np.testing.assert_equal(cover[0].sum(), 0)
            np.testing.assert_equal(cover[4].sum(), region4_sum)
Ejemplo n.º 11
0
def test_load_cover_bed_categorical():
    """Load a scored BED file in ``mode='categorical'``.

    For the ``ndarray`` and ``sparse`` backends three variants are checked:
    ``resolution=200``, ``resolution=50`` and ``dimmode='first'`` without
    an explicit resolution. The last dimension of the dataset must have
    size 6 in all cases.
    """
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    scored_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    # (dataset name, extra keyword arguments, expected shape,
    #  expected sum of region 4)
    variants = (
        ("cov", dict(resolution=200), (100, 1, 1, 6), 1),
        ("cov50", dict(resolution=50), (100, 4, 1, 6), 4 * 1),
        ("cov50", dict(dimmode='first'), (100, 1, 1, 6), 1),
    )

    for backend in ('ndarray', 'sparse'):
        for name, extra, expected_shape, region4_sum in variants:
            cover = Cover.create_from_bed(
                name,
                bedfiles=scored_path,
                regions=roi_path,
                binsize=200,
                stepsize=200,
                storage=backend,
                mode='categorical',
                **extra)

            np.testing.assert_equal(len(cover), 100)
            np.testing.assert_equal(cover.shape, expected_shape)
            np.testing.assert_equal(cover[0].sum(), 0)
            np.testing.assert_equal(cover[4].sum(), region4_sum)
Ejemplo n.º 12
0
def test_load_cover_bed_scored():
    """Load a scored BED file in ``mode='score'``.

    For the ``ndarray`` and ``sparse`` backends three variants are checked:
    ``resolution=200``, ``resolution=50`` and ``resolution=None`` with
    ``collapser='max'``. Region 0 must sum to 0, and region 4 must sum to
    5 (or 4*5 at resolution 50).
    """
    roi_path = pkg_resources.resource_filename('janggu',
                                               'resources/sample.bed')
    scored_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    # (dataset name, extra keyword arguments, expected shape,
    #  expected sum of region 4)
    variants = (
        ("cov", dict(resolution=200), (100, 1, 1, 1), 5),
        ("cov50", dict(resolution=50), (100, 4, 1, 1), 4*5),
        ("cov50", dict(resolution=None, collapser='max'), (100, 1, 1, 1), 5),
    )

    for backend in ('ndarray', 'sparse'):
        for name, extra, expected_shape, region4_sum in variants:
            cover = Cover.create_from_bed(
                name,
                bedfiles=scored_path,
                roi=roi_path,
                binsize=200,
                stepsize=200,
                storage=backend,
                mode='score',
                **extra)

            np.testing.assert_equal(len(cover), 100)
            np.testing.assert_equal(cover.shape, expected_shape)
            np.testing.assert_equal(cover[0].sum(), 0)
            np.testing.assert_equal(cover[4].sum(), region4_sum)
Ejemplo n.º 13
0
def test_load_cover_bigwig_default(tmpdir):
    """Load a bigwig track without specifying ``resolution``.

    The chromosome sizes are read from ``sample.chrom.sizes`` and passed
    as ``genomesize``. For every storage backend the dataset must have
    shape ``(100, 200, 1, 1)``, region 4 must sum to 36 and region 52 to
    twice that.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')

    bigwig_path = os.path.join(resources, "sample.bw")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # Chromosome name -> length lookup built from the tab-separated file.
    sizes_table = pandas.read_csv(sizes_path,
                                  sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = sizes_table.to_dict()['length']

    roi_path = os.path.join(resources, "sample.bed")

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print(backend)
        cover = Cover.create_from_bigwig(
            "cov",
            bigwigfiles=bigwig_path,
            regions=roi_path,
            binsize=200,
            stepsize=200,
            genomesize=gsize,
            storage=backend,
            cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

        # Expected totals for regions 4 and 52.
        np.testing.assert_allclose(cover[4].sum(), 36.)
        np.testing.assert_allclose(cover[52].sum(), 2 * 36.)
Ejemplo n.º 14
0
def get_data(params):
    """Create sequence/label datasets for the train, validation and test ROIs.

    The labels and the DNA sequences are each loaded once for the training
    ROI with ``store_whole_genome=True``; the test and validation datasets
    are obtained from them with ``view``.

    Keys read from ``params``: ``order`` and ``flank``.

    Returns:
        A tuple of three ``(sequence, ReduceDim(labels))`` pairs, in the
        order train, validation, test.
    """
    train_labels = Cover.create_from_bed(
        'labels',
        bedfiles=bedfiles,
        roi=train_roi,
        resolution=200,
        store_whole_genome=True,
        storage='sparse',
        cache=True,
        dtype='int8',
        minoverlap=.5,
        verbose=True)
    test_labels = view(train_labels, test_roi)
    val_labels = view(train_labels, val_roi)

    train_seq = Bioseq.create_from_refgenome(
        'dna',
        refgenome=refgenome,
        roi=train_roi,
        store_whole_genome=True,
        storage='ndarray',
        cache=True,
        order=params['order'],
        flank=params['flank'],
        verbose=True)
    test_seq = view(train_seq, test_roi)
    val_seq = view(train_seq, val_roi)

    train_pair = (train_seq, ReduceDim(train_labels))
    val_pair = (val_seq, ReduceDim(val_labels))
    test_pair = (test_seq, ReduceDim(test_labels))
    return (train_pair, val_pair, test_pair)
Ejemplo n.º 15
0
def test_janggu_variant_prediction(tmpdir):
    """Test ``predict_variant_effect`` for sequence orders 1, 2 and 3.

    For each order a whole-genome DNA dataset and a small dense model are
    created, variant effects are predicted for ``sample.vcf`` and written
    to the output folder. The test then checks that ``scores.hdf5`` and
    ``snps.bed.gz`` exist and that the ``diffscore`` dataset, wrapped in a
    Cover, is non-zero at chr2:59-60 and zero at chr2:54-55.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_dir = os.environ['JANGGU_OUTPUT']
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    # Paths that do not depend on the order are resolved once.
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')
    scores_path = os.path.join(output_dir, 'scores.hdf5')
    snps_path = os.path.join(output_dir, 'snps.bed.gz')

    def _cnn_model(inputs, inp, oup, params):
        # The input shape is derived from the order passed via params.
        inputs = Input(
            (50 - params['order'] + 1, 1, pow(4, params['order'])))
        layer = Flatten()(inputs)
        layer = Dense(params['hiddenunits'])(layer)
        output = Dense(4, activation='sigmoid')(layer)
        return inputs, output

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        model = Janggu.create(_cnn_model,
                              modelparams={
                                  'hiddenunits': 2,
                                  'order': order
                              },
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_dir)
        assert os.path.exists(scores_path)
        assert os.path.exists(snps_path)

        gindexer = GenomicIndexer.create_from_file(snps_path, None, None)

        # The context manager closes the HDF5 file even when an assertion
        # fails, so a later iteration or test never finds it still open.
        with h5py.File(scores_path, 'r') as f:
            cov = Cover.create_from_array('snps',
                                          f['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
Ejemplo n.º 16
0
def test_load_bam_resolution10(tmpdir):
    """Load BAM coverage at ``resolution=10`` for every storage backend.

    The dataset must have shape ``(100, 20, 2, 1)``. For regions 4, 13, 52
    and 96 the total count and the indices (along axes 1 and 2) of the
    entries equal to 1 are checked.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # Chromosome name -> length lookup built from the tab-separated file.
    sizes_table = pandas.read_csv(sizes_path,
                                  sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = sizes_table.to_dict()['length']

    roi_path = os.path.join(resources, "sample.bed")

    for backend in ('ndarray', 'hdf5', 'sparse'):
        cover = Cover.create_from_bam(
            "yeast_I_II_III.bam",
            bamfiles=bam_path,
            regions=roi_path,
            binsize=200,
            stepsize=200,
            genomesize=gsize,
            resolution=10,
            storage=backend,
            cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 20, 2, 1))

        # Region 4: a single count, at position 17 on strand index 1.
        hits = np.where(cover[4] == 1)
        np.testing.assert_equal(cover[4].sum(), 1.)
        np.testing.assert_equal(hits[1][0], 17)  # pos
        np.testing.assert_equal(hits[2][0], 1)  # strand

        # Region 13: two counts, at positions 16 and 17 on strand index 0.
        hits = np.where(cover[13] == 1)
        np.testing.assert_equal(cover[13].sum(), 2.)
        np.testing.assert_equal(hits[1], np.asarray([16, 17]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([0, 0]))  # strand

        # Region 52: two counts, at positions 0 and 8 on strand index 0.
        hits = np.where(cover[52] == 1)
        np.testing.assert_equal(cover[52].sum(), 2.)
        np.testing.assert_equal(hits[1], np.asarray([0, 8]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([0, 0]))  # strand

        # Region 96: a single count, at position 2 on strand index 1.
        hits = np.where(cover[96] == 1)
        np.testing.assert_equal(cover[96].sum(), 1.)
        np.testing.assert_equal(hits[1], np.asarray([2]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([1]))  # strand
Ejemplo n.º 17
0
def test_load_cover_bigwig_resolution1(tmpdir):
    """Load a bigwig track at ``resolution=1`` and check per-base values.

    For every storage backend the dataset must have shape
    ``(100, 200, 1, 1)``. Region 4 must contain a single run of 36 ones at
    positions 143..178, and region 52 two runs of 36 ones at positions
    10..45 and 90..125; all other positions must be zero.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bigwig_path = os.path.join(resources, "sample.bw")
    roi_path = os.path.join(resources, "sample.bed")

    # Expected per-base profile of region 4: one run of 36 ones.
    expected_region4 = np.zeros(200)
    expected_region4[143:179] = 1.

    # Expected per-base profile of region 52: two runs of 36 ones.
    expected_region52 = np.zeros(200)
    expected_region52[10:46] = 1.
    expected_region52[90:126] = 1.

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print(backend)
        cover = Cover.create_from_bigwig(
            "cov",
            bigwigfiles=bigwig_path,
            roi=roi_path,
            binsize=200,
            stepsize=200,
            resolution=1,
            storage=backend,
            cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

        np.testing.assert_allclose(cover[4].sum(), 36)
        np.testing.assert_equal(cover[4][0, :, 0, 0], expected_region4)

        np.testing.assert_allclose(cover[52].sum(), 2*36)
        np.testing.assert_equal(cover[52][0, :, 0, 0], expected_region52)
Ejemplo n.º 18
0
def test_bed_store_whole_genome_option():
    """Check that ``store_whole_genome`` does not change BED coverage.

    The same BED file serves as both target and regions file. With and
    without ``store_whole_genome`` the dataset must have 25 regions, shape
    ``(25, 200, 1, 1)`` and consist entirely of ones.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")

    # Keyword arguments shared by both tracks.
    common = dict(bedfiles=bed_path, regions=bed_path, storage='ndarray')

    whole = Cover.create_from_bed('test', store_whole_genome=True, **common)
    partial = Cover.create_from_bed('test2', store_whole_genome=False,
                                    **common)

    assert len(whole) == 25
    assert len(partial) == len(whole)
    assert whole.shape == (25, 200, 1, 1)
    assert whole.shape == partial.shape
    np.testing.assert_equal(whole[:], np.ones(whole.shape))
    np.testing.assert_equal(partial[:], np.ones(whole.shape))
Ejemplo n.º 19
0
def test_bam_inferred_binsize():
    """Create BAM coverage without passing an explicit ``binsize``.

    The regions come from ``positive.bed``; the resulting dataset is expected
    to contain 25 regions and to have shape (25, 200, 2, 1).
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions_path = os.path.join(resources, "positive.bed")
    bam_path = os.path.join(resources, "sample.bam")

    cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        regions=regions_path,
        flank=0,
        storage='ndarray')

    assert len(cover) == 25
    assert cover.shape == (25, 200, 2, 1)
Ejemplo n.º 20
0
def test_bed_inferred_binsize():
    """Create BED coverage without passing an explicit ``binsize``.

    ``positive.bed`` serves as both signal and region file. At
    ``resolution=1`` the dataset is expected to contain 25 regions and to
    have shape (25, 200, 1, 1).
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    positives = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bed(
        'test',
        bedfiles=positives,
        regions=positives,
        resolution=1,
        storage='ndarray')

    assert len(cover) == 25
    assert cover.shape == (25, 200, 1, 1)
Ejemplo n.º 21
0
def test_load_bam_resolutionNone(tmpdir):
    """Compare BAM coverage at ``resolution=1`` and ``resolution=None``.

    For each storage backend ('ndarray', 'hdf5', 'sparse') two datasets are
    created over the same 200-wide, non-overlapping bins of ``sample.bed``.
    The ``resolution=None`` dataset must have shape (100, 1, 2, 1), and its
    values summed over axis 1 must equal the ``resolution=1`` dataset summed
    over axis 1.

    ``tmpdir`` is used as ``JANGGU_OUTPUT`` so cached files stay inside the
    test's temporary directory.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # Chromosome name -> length mapping read from the chrom.sizes table.
    content = pandas.read_csv(sizes_path, sep='\t', names=['chr', 'length'],
                              index_col='chr')
    gsize = content.to_dict()['length']

    roi_path = os.path.join(resources, "sample.bed")

    # Arguments shared by both dataset variants.
    common = dict(
        bamfiles=bam_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        genomesize=gsize,
        cache=True)

    for store in ['ndarray', 'hdf5', 'sparse']:
        per_base = Cover.create_from_bam(
            "yeast_I_II_III.bam",
            resolution=1,
            storage=store,
            **common)
        # The extra datatag keeps this variant apart from the one above,
        # which is created under the same dataset name with caching enabled.
        collapsed = Cover.create_from_bam(
            "yeast_I_II_III.bam",
            resolution=None,
            storage=store,
            datatags=['None'],
            **common)

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 2, 1))

        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
Ejemplo n.º 22
0
def test_cover_export_bigwig(tmpdir):
    """Round-trip bigwig coverage through ``Cover.export_to_bigwig``.

    For every combination of resolution (1, 50) and ``store_whole_genome``
    (True, False), coverage is loaded from ``sample.bw``, exported into
    ``tmpdir`` and loaded again from the exported file. The original and the
    re-imported dataset must have the same shape, (100, 200 // resolution,
    1, 1), and the same total signal, 1044 / resolution.
    """
    path = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, "sample.bed")
    bwfile_ = os.path.join(data_path, "sample.bw")

    for resolution in [1, 50]:
        for store_whole_genome in [True, False]:
            print('resolution=', resolution)
            print('store_whole_genome', store_whole_genome)
            cover = Cover.create_from_bigwig(
                'test',
                bigwigfiles=bwfile_,
                resolution=resolution,
                binsize=200,
                roi=bed_file,
                store_whole_genome=store_whole_genome,
                storage='ndarray')

            cover.export_to_bigwig(output_dir=path)

            # The exported file is looked up as
            # "<dataset name>.<first condition>.bigwig" inside output_dir.
            exported = os.path.join(
                path,
                '{name}.{sample}.bigwig'.format(name=cover.name,
                                                sample=cover.conditions[0]))

            cov2 = Cover.create_from_bigwig(
                'test',
                bigwigfiles=exported,
                resolution=resolution,
                binsize=200,
                roi=bed_file,
                store_whole_genome=store_whole_genome,
                storage='ndarray')

            assert cover.shape == (100, 200 // resolution, 1, 1)
            assert cover.shape == cov2.shape
            np.testing.assert_allclose(cover[:].sum(), 1044.0 / resolution)
            np.testing.assert_allclose(cov2[:].sum(), 1044.0 / resolution)
Ejemplo n.º 23
0
def test_bed_genomic_interval_access_part_genome():
    """Check genomic-interval indexing of BED coverage (partial genome).

    With ``store_whole_genome=False`` and resolutions 1 and 50, each region
    is fetched by integer index, by its interval object and by an explicit
    ``(chrom, start, end, strand)`` tuple. The integer-indexed array,
    repeated ``cover.garray.resolution`` times along axis 1, must equal the
    interval-indexed arrays. For ``shift=1`` the interval is moved by one
    resolution step and the overlapping parts are compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    # The same BED file supplies both the signal and the regions.
    signal_path = os.path.join(resources, "sample.bed")

    storage = False
    for reso in [1, 50]:
        for shift in [0, 1]:
            cover = Cover.create_from_bed(
                'test',
                bedfiles=signal_path,
                roi=roi_path,
                flank=0,
                storage='ndarray',
                store_whole_genome=storage,
                resolution=reso)

            for i in range(len(cover)):
                print('storage :', storage, '/ resolution :', reso,
                      '/ shift :', shift)
                region = cover.gindexer[i]
                print(i, region)

                # Integer access, expanded along axis 1, versus access by
                # the interval object.
                expanded = np.repeat(cover[i], cover.garray.resolution,
                                     axis=1)
                np.testing.assert_equal(expanded, cover[region])

                chrom = region.chrom
                start = region.start
                end = region.end
                strand = region.strand

                # Same comparison using an explicit coordinate tuple.
                expanded = np.repeat(cover[i], cover.garray.resolution,
                                     axis=1)
                np.testing.assert_equal(expanded,
                                        cover[chrom, start, end, strand])

                if shift == 0:
                    continue

                offset = shift * reso
                start += offset
                end += offset
                shifted = cover[chrom, start, end, strand]

                if strand != '-':
                    # Drop the trailing part that lies beyond the original
                    # region, then keep one value per resolution step.
                    gicov = shifted[:, :(-offset), :, :]
                    binned = gicov.reshape(
                        (1, gicov.shape[1] // reso, reso, 1, 1))
                    np.testing.assert_equal(cover[i][:, shift:, :, :],
                                            binned[:, :, 0, :, :])
                else:
                    # For '-' regions the leading part is dropped instead.
                    gicov = shifted[:, offset:, :, :]
                    binned = gicov.reshape(
                        (1, gicov.shape[1] // reso, reso, 1, 1))
                    np.testing.assert_equal(cover[i][:, :-shift, :, :],
                                            binned[:, :, 0, :, :])
Ejemplo n.º 24
0
def test_cover_bam_unstranded():
    """Check unstranded BAM coverage at a few hand-picked regions.

    With ``stranded=False`` the dataset over ``sample.bed`` (200-wide,
    non-overlapping bins) must have shape (100, 200, 1, 1). Four regions are
    then inspected for their total count and for the positions along axis 1
    at which the count equals one.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # Chromosome name -> length mapping read from the chrom.sizes table.
    content = pandas.read_csv(sizes_path,
                              sep='\t',
                              names=['chr', 'length'],
                              index_col='chr')
    gsize = content.to_dict()['length']

    regions_path = os.path.join(resources, "sample.bed")

    cover = Cover.create_from_bam(
        "yeast_I_II_III.bam",
        bamfiles=bam_path,
        regions=regions_path,
        binsize=200,
        stepsize=200,
        genomesize=gsize,
        stranded=False)

    np.testing.assert_equal(len(cover), 100)
    np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

    # Region 4: a single count, located at position 179.
    hits = np.where(cover[4] == 1)
    np.testing.assert_equal(cover[4].sum(), 1.)
    np.testing.assert_equal(hits[1][0], 179)

    # Region 13: two counts, at positions 162 and 178.
    hits = np.where(cover[13] == 1)
    np.testing.assert_equal(cover[13].sum(), 2.)
    np.testing.assert_equal(hits[1], np.asarray([162, 178]))

    # Region 52: two counts, at positions 9 and 89.
    hits = np.where(cover[52] == 1)
    np.testing.assert_equal(cover[52].sum(), 2.)
    np.testing.assert_equal(hits[1], np.asarray([9, 89]))

    # Region 96: a single count, at position 25.
    hits = np.where(cover[96] == 1)
    np.testing.assert_equal(cover[96].sum(), 1.)
    np.testing.assert_equal(hits[1], np.asarray([25]))
Ejemplo n.º 25
0
def test_janggu_chr2_validation(tmpdir):
    """Fit a small CNN using the 'chr2' regions as validation data.

    DNA sequence (input) and BED-derived labels (output) are built over the
    same regions of ``sample.bed`` with ``binsize=200`` and ``stepsize=50``.
    A model with one two-strand convolution and a max-pooling layer is
    created, compiled and fitted with ``validation_data=['chr2']``. The test
    makes no assertion on the fit result; it passes when the whole pipeline
    runs without raising.

    ``tmpdir`` is used as ``JANGGU_OUTPUT`` so generated files stay inside
    the test's temporary directory.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    posfile = os.path.join(data_path, 'scored_sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       binsize=200,
                                       stepsize=50,
                                       roi=bed_file,
                                       order=1)

    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=posfile,
                                 roi=bed_file,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=None,
                                 flank=0,
                                 collapser='max',
                                 storage='ndarray')

    # Network body: the decorators supply the input layer and append a
    # sigmoid output layer around the layers defined here.
    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                              merge_mode='max',
                              name='bothstrands')(inlayer)
            # Pool over 198 positions; presumably 200 - 3 + 1, the length
            # left after an unpadded width-3 convolution -- TODO confirm.
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1,
                         modelparams=(2, ),
                         inputs=dna,
                         outputs=ctcf,
                         name='dna_ctcf_HepG2-cnn1')

    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')
    # Only successful completion is checked; the return value is not used.
    bwm1.fit(dna, ctcf, validation_data=['chr2'])
Ejemplo n.º 26
0
def test_bed_overreaching_ends_part_genome():
    """Check flanked BED coverage when only the regions are stored.

    ``bed_test.bed`` is used as both signal and region file with
    ``binsize=2`` and ``flank=20``. For the 'ndarray' and 'sparse' backends
    the dataset must hold 9 regions of width 2 + 2 * 20, each summing to 18.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "bed_test.bed")

    binsize = 2
    flank = 20
    n_regions = 9

    for store in ['ndarray', 'sparse']:
        print(store)
        cover = Cover.create_from_bed(
            'test',
            bedfiles=bed_path,
            roi=bed_path,
            binsize=binsize,
            flank=flank,
            resolution=1,
            store_whole_genome=False,
            storage=store)

        assert len(cover) == n_regions
        assert cover.shape == (n_regions, binsize + 2 * flank, 1, 1)
        np.testing.assert_equal(cover[0].sum(), 18)
        np.testing.assert_equal(cover[:].sum(), n_regions * 18)
Ejemplo n.º 27
0
def test_bam_genomic_interval_access():
    """Check genomic-interval indexing of BAM coverage.

    For every combination of ``store_whole_genome`` (True, False) and
    resolution (1, 50), each region is fetched by integer index, by its
    interval object and by an explicit ``(chrom, start, end, strand)``
    tuple; all three must agree. For ``shift=1`` the interval is moved by
    one resolution step and the overlapping parts are compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    for storage in [True, False]:
        for reso in [1, 50]:
            for shift in [0, 1]:
                cover = Cover.create_from_bam(
                    'test',
                    bamfiles=bam_path,
                    regions=regions_path,
                    flank=0,
                    storage='ndarray',
                    store_whole_genome=storage,
                    resolution=reso)

                for i in range(len(cover)):
                    print('storage :', storage, '/ resolution :', reso,
                          '/ shift :', shift)
                    region = cover.gindexer[i]
                    print(i, region)

                    # Integer access versus access by the interval object.
                    np.testing.assert_equal(cover[i], cover[region])

                    chrom = region.chrom
                    start = region.start
                    end = region.end
                    strand = region.strand

                    # Integer access versus an explicit coordinate tuple.
                    np.testing.assert_equal(
                        cover[i], cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    start += shift * reso
                    end += shift * reso
                    moved = cover[chrom, start, end, strand]

                    if strand != '-':
                        # The tail of the original region must match the
                        # head of the shifted one.
                        np.testing.assert_equal(
                            cover[i][:, shift:, :, :],
                            moved[:, :-shift, :, :])
                    else:
                        # For '-' regions the comparison is mirrored.
                        np.testing.assert_equal(
                            cover[i][:, :-shift, :, :],
                            moved[:, shift:, :, :])
Ejemplo n.º 28
0
def test_bed_overreaching_ends():
    """Check flanked BED coverage when the whole genome is stored.

    With ``flank=2000`` every region is 200 + 2 * 2000 wide. The first
    stored position of 'chr1' is set to one; in region 0 the first 550
    positions must then be zero, position 550 must carry that one, and the
    slice starting at 550 must reproduce the stored 'chr1' array.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")
    flank = 2000

    cover = Cover.create_from_bed(
        'test',
        bedfiles=bed_path,
        regions=bed_path,
        flank=flank,
        resolution=1,
        store_whole_genome=True,
        storage='ndarray')

    # Mark the first stored position of chr1 so it can be located below.
    cover.garray.handle['chr1'][0] = 1

    assert len(cover) == 25
    assert cover.shape == (25, 200 + 2 * flank, 1, 1)

    first_region = cover[0]
    chr1_track = cover.garray.handle['chr1']

    # Everything before offset 550 is empty; the marker sits at offset 550.
    np.testing.assert_equal(first_region[0, :550, 0, 0].sum(), 0)
    np.testing.assert_equal(first_region[0, 550, 0, 0], 1.)
    # From offset 550 onwards the region mirrors the stored chr1 array.
    np.testing.assert_equal(
        first_region[0, 550:(550 + len(chr1_track)), :, :],
        chr1_track)
Ejemplo n.º 29
0
def test_cover_bam_paired_midpoint():
    """Check midpoint counting of paired-end reads from ``sample2.bam``.

    sample2.bam contains paired-end examples, unmapped examples, an unmapped
    mate and a low-quality example. With ``pairedend='midpoint'`` and
    ``min_mapq=30`` exactly two counts are expected on contig 'ref'.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample2.bam")

    cover = Cover.create_from_bam(
        "yeast_I_II_III.bam",
        bamfiles=bam_path,
        stranded=False,
        pairedend='midpoint',
        min_mapq=30,
        store_whole_genome=True)

    ref_track = cover.garray.handle['ref']
    assert ref_track.sum() == 2, ref_track
    print(ref_track)

    # The read starts at index 6 and its template length is 39, so the
    # count is expected at the midpoint.
    read_start, tlen = 6, 39
    assert ref_track[read_start + tlen // 2, 0, 0] == 1
    # Another read maps to index 34.
    assert ref_track[34, 0, 0] == 1
Ejemplo n.º 30
0
# NOTE(review): the beginning of this comment is cut off in this excerpt.
# It ends by stating that results should be almost identical to the models
# obtained from classify_fasta.py.
# Reference genome used as the sequence source for both test and training data
REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
# ROI contains regions spanning positive and negative examples
ROI_TRAIN = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST = resource_filename('janggu', 'resources/roi_test.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')

# Test input: sequence from REFGENOME over the ROI_TEST regions (binsize 200)
DNA_TEST = Bioseq.create_from_refgenome('dna',
                                        refgenome=REFGENOME,
                                        roi=ROI_TEST,
                                        binsize=200)

# Test labels: PEAK_FILE coverage over the same ROI_TEST regions
LABELS_TEST = Cover.create_from_bed('peaks',
                                    bedfiles=PEAK_FILE,
                                    roi=ROI_TEST,
                                    binsize=200,
                                    resolution=None)

# Training input and labels are defined purely by genomic coordinates (ROI_TRAIN)
DNA = Bioseq.create_from_refgenome('dna',
                                   refgenome=REFGENOME,
                                   roi=ROI_TRAIN,
                                   binsize=200)

# Training labels: PEAK_FILE coverage over the ROI_TRAIN regions
LABELS = Cover.create_from_bed('peaks',
                               roi=ROI_TRAIN,
                               bedfiles=PEAK_FILE,
                               binsize=200,
                               resolution=None)