Esempio n. 1
0
def test_invalid_access():
    """Integer indexing must fail for both the ndarray and sparse storage."""
    for backend in ('ndarray', 'sparse'):
        gindexer = GenomicIndexer.create_from_genomesize({'chr10': 300})
        ga = create_genomic_array(gindexer,
                                  stranded=False,
                                  typecode='int8',
                                  storage=backend)

        # reading is only possible via a genomic interval
        with pytest.raises(Exception):
            ga[1]

        # writing is only possible via a genomic interval and condition
        with pytest.raises(Exception):
            ga[1] = 1
Esempio n. 2
0
def test_gindexer_short_interval():
    """Check how many intervals ``sample_equalsize.bed`` yields per binning."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bedfile = os.path.join(resources, 'sample_equalsize.bed')

    # (keyword arguments for create_from_file, expected number of intervals)
    cases = [
        (dict(binsize=200, stepsize=200), 4),
        (dict(binsize=180, stepsize=20), 8),
        (dict(binsize=210, stepsize=20, zero_padding=False), 0),
        (dict(binsize=210, stepsize=20, zero_padding=True), 4),
    ]

    for kwargs, expected in cases:
        gi = GenomicIndexer.create_from_file(bedfile, **kwargs)
        assert len(gi) == expected
Esempio n. 3
0
def test_gindexer_merged_variable_length_ranges():
    """Compare binning of ``sample.bed`` with and without zero padding."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bedfile = os.path.join(resources, 'sample.bed')

    def as_tuple(region):
        # flatten an interval into the fields that are compared below
        return (region.chrom, region.start, region.end, region.strand)

    # without zero padding
    fixed = GenomicIndexer.create_from_file(bedfile,
                                            binsize=3000,
                                            stepsize=3000,
                                            zero_padding=False)
    np.testing.assert_equal(len(fixed), 6)
    np.testing.assert_equal(as_tuple(fixed[0]),
                            ('chr1', 15000, 18000, '+'))
    np.testing.assert_equal(as_tuple(fixed[-1]),
                            ('chr2', 21000, 24000, '-'))

    # with zero padding
    padded = GenomicIndexer.create_from_file(bedfile,
                                             binsize=3000,
                                             stepsize=3000,
                                             zero_padding=True)
    np.testing.assert_equal(len(padded), 8)
    np.testing.assert_equal(as_tuple(padded[0]),
                            ('chr1', 15000, 18000, '+'))
    np.testing.assert_equal(as_tuple(padded[-1]),
                            ('chr2', 24000, 25000, '-'))
Esempio n. 4
0
def test_gindexer_errors():
    """Invalid binning arguments raise ``ValueError``; valid ones do not.

    The first three calls use ``sample.bed`` with an invalid ``binsize``,
    ``stepsize`` or ``flank`` and must raise. The last two calls load
    ``scores.bed`` and are only required to complete without an exception.
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    sample_bed = os.path.join(data_path, 'sample.bed')
    scores_bed = os.path.join(data_path, 'scores.bed')

    with pytest.raises(ValueError):
        # due to binsize == 0
        GenomicIndexer.create_from_file(sample_bed,
                                        binsize=0,
                                        stepsize=50)

    with pytest.raises(ValueError):
        # due to stepsize == 0
        GenomicIndexer.create_from_file(sample_bed,
                                        binsize=10,
                                        stepsize=0)

    with pytest.raises(ValueError):
        # due to flank < 0
        GenomicIndexer.create_from_file(sample_bed,
                                        binsize=200,
                                        stepsize=50,
                                        flank=-1)

    # scores.bed must load when no binsize/stepsize is given ...
    GenomicIndexer.create_from_file(scores_bed,
                                    binsize=None,
                                    stepsize=None,
                                    flank=0)
    # ... and when a fixed binsize/stepsize is requested
    GenomicIndexer.create_from_file(scores_bed,
                                    binsize=200,
                                    stepsize=200,
                                    flank=0)
Esempio n. 5
0
def test_tmp_normalization(tmpdir):
    """Check 'tpm'-normalized values at resolution 50 for two storages."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        # constant signal: 10 along chr1 and 1 along chr2
        chr1_values = np.repeat(10, 150).reshape(-1, 1)
        garray[Interval('chr1', 0, 150), 0] = chr1_values
        chr2_values = np.repeat(1, 300).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = chr2_values
        return garray

    for store in ['ndarray', 'hdf5']:
        gindexer = GenomicIndexer.create_from_genomesize({'chr1': 150,
                                                          'chr2': 300})
        ga = create_genomic_array(gindexer,
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache="cache_file",
                                  resolution=50,
                                  loader=loading,
                                  collapser='sum',
                                  normalizer=['tpm'])

        expected_chr1 = np.asarray([[[10 * 1000 / 50 * 1e6 / (720.)]]])
        np.testing.assert_allclose(ga[Interval('chr1', 100, 101)],
                                   expected_chr1)

        expected_chr2 = np.asarray([[[1 * 1000 / 50 * 1e6 / (720.)]]])
        np.testing.assert_allclose(ga[Interval('chr2', 100, 101)],
                                   expected_chr2)
Esempio n. 6
0
def test_bwga_instance_unstranded_taged(tmpdir):
    """Read/write round trip on an unstranded array created with datatags."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    region = Interval('chr10', 100, 120, strand='.')
    gindexer = GenomicIndexer.create_from_genomesize({'chr10': 300})
    ga = create_genomic_array(gindexer,
                              stranded=False,
                              typecode='int8',
                              storage='ndarray',
                              datatags='test_bwga_instance_unstranded')

    # reading is only possible via a genomic interval
    with pytest.raises(Exception):
        ga[1]

    # writing is only possible via a genomic interval and condition
    with pytest.raises(Exception):
        ga[1] = 1

    # the array starts out with zeros for the 20 requested positions
    np.testing.assert_equal(ga[region].shape, (20, 1, 1))
    np.testing.assert_equal(ga[region], np.zeros((20, 1, 1)))

    # fill the region with ones and read the values back
    ga[region, 0] = np.ones((20, 1))
    np.testing.assert_equal(ga[region], np.ones((20, 1, 1)))
    np.testing.assert_equal(ga[region].sum(), 20)

    # nothing outside the written region was touched
    whole_chrom = Interval('chr10', 0, 300, strand='.')
    np.testing.assert_equal(ga[whole_chrom].sum(), 20)
Esempio n. 7
0
def test_output_bigwig_loss_resolution_unequal_stepsize(tmpdir):
    """Export a constant dummy loss to bigwig and check the stored values.

    The genomic indexer is built from ``10regions.bed`` with binsize=200 and
    stepsize=50. One bigwig file per condition must be written, and the mean
    of the values read back for condition ``c1`` must equal the constant
    score 0.2 produced by the dummy scorer.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("x", numpy.random.random((7, 4, 1, 10)))
    outputs = Array('y', numpy.random.random((7, 4, 1, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)
    data_path = pkg_resources.resource_filename('janggu',
                                                'resources/10regions.bed')

    gi = GenomicIndexer.create_from_file(data_path,
                                         binsize=200,
                                         stepsize=50)

    # the scorer ignores its inputs and reports 0.2 for every sample
    dummy_eval = Scorer('loss', lambda t, p: [0.2] * len(t),
                        exporter=ExportBigwig(gindexer=gi))

    bwm.evaluate(inputs, outputs, callbacks=[dummy_eval])

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'loss.{}.bigwig')

    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(file_.format(cond))

    bw = pyBigWig.open(file_.format('c1'))
    try:
        co = bw.values('chr1', 600, 2000-150)
    finally:
        # release the file handle even if reading the values fails
        bw.close()

    numpy.testing.assert_allclose(numpy.mean(co), 0.2, rtol=1e-5)
Esempio n. 8
0
def test_janggu_variant_prediction(tmpdir):
    """Run variant effect prediction and load the result into a Cover.

    For sequence orders 1 to 3, a small model is created and
    ``predict_variant_effect`` is called on ``sample.vcf``. The test checks
    that ``scores.hdf5`` and ``snps.bed.gz`` are written to the output
    folder, and that the ``diffscore`` dataset wrapped in a ``Cover`` is
    non-zero at chr2:59-60 and zero at chr2:54-55.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_folder = os.environ['JANGGU_OUTPUT']
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')
    scoresfile = os.path.join(output_folder, 'scores.hdf5')
    variantsfile = os.path.join(output_folder, 'snps.bed.gz')

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        def _cnn_model(inputs, inp, oup, params):
            # the input length shrinks with the order: 50 - order + 1
            inputs = Input(
                (50 - params['order'] + 1, 1, pow(4, params['order'])))
            layer = Flatten()(inputs)
            layer = Dense(params['hiddenunits'])(layer)
            output = Dense(4, activation='sigmoid')(layer)
            return inputs, output

        model = Janggu.create(_cnn_model,
                              modelparams={
                                  'hiddenunits': 2,
                                  'order': order
                              },
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_folder)
        assert os.path.exists(scoresfile)
        assert os.path.exists(variantsfile)

        gindexer = GenomicIndexer.create_from_file(variantsfile, None, None)

        # the context manager closes the file even if an assertion fails
        with h5py.File(scoresfile, 'r') as f:
            cov = Cover.create_from_array('snps',
                                          f['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
Esempio n. 9
0
def test_resolution_negative():
    """Creating a genomic array with resolution=-1 must raise."""
    options = dict(stranded=True,
                   typecode='int8',
                   storage='ndarray',
                   cache=False,
                   resolution=-1)

    with pytest.raises(Exception):
        create_genomic_array(
            GenomicIndexer.create_from_genomesize({'chr10': 300}),
            **options)
Esempio n. 10
0
def test_invalid_storage():
    """An unknown storage name must be rejected when creating the array."""
    options = dict(stranded=True,
                   typecode='int8',
                   storage='storgae',  # deliberately not a valid storage
                   resolution=1,
                   cache=False)

    with pytest.raises(Exception):
        create_genomic_array(
            GenomicIndexer.create_from_genomesize({'chr10': 300}),
            **options)
Esempio n. 11
0
def test_create_from_array(tmpdir):
    """Check that ``trim_bed`` aligns region boundaries to multiples of 5.

    The first region of the original ``bed_test.bed`` starts on a multiple
    of 5 but does not end on one; after ``trim_bed(inbed, outbed, 5)`` both
    the start and the end of the first region must be multiples of 5.
    """
    inbed = resource_filename('janggu', 'resources/bed_test.bed')

    outbed = os.path.join(tmpdir.strpath, 'out.bed')
    trim_bed(inbed, outbed, 5)

    # original file
    gindexer = GenomicIndexer.create_from_file(inbed, None, None)
    reg = gindexer[0]
    assert (reg.start % 5) == 0
    assert (reg.end % 5) > 0

    # trimmed file
    gindexer = GenomicIndexer.create_from_file(outbed, None, None)
    reg = gindexer[0]
    assert (reg.start % 5) == 0
    assert (reg.end % 5) == 0
Esempio n. 12
0
def test_dna_dims_order_1_from_reference(tmpdir):
    """Check shape and one-hot content of an order-1 Bioseq dataset.

    The sequence is loaded from ``sample_genome.fa`` for the whole genome
    and then indexed with 200 bp bins taken from ``sample.gtf``. The one-hot
    rows compared below use the column order A, C, G, T.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    gtf_file = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    gindexer = GenomicIndexer.create_from_file(gtf_file, 200, 200)

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                        storage='ndarray',
                                        order=order,
                                        store_whole_genome=True)
    data.gindexer = gindexer
    assert len(data.garray.handle) == 2
    assert 'chr1' in data.garray.handle
    assert 'chr2' in data.garray.handle

    # for order 1
    assert len(data) == 100
    assert data.shape == (100, 200, 1, 4)
    # the correctness of the sequence extraction was also
    # validated using:
    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGA...
    # this sequence is read from the forward strand
    np.testing.assert_equal(data[0][0, :10, 0, :],
                            np.asarray([[1, 0, 0, 0],  # A
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [1, 0, 0, 0]],  # A
                            dtype='int8'))

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaa...
    # this sequence is read from the reverse strand
    # so we have ...ttgcttcccc
    np.testing.assert_equal(data[50][0, -10:, 0, :],
                            np.asarray([[0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 1, 0, 0],  # C
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0]],  # C
                            dtype='int8'))
Esempio n. 13
0
def test_hdf5_no_cache():
    """Requesting hdf5 storage with cache=None must raise."""
    options = dict(stranded=True,
                   typecode='int8',
                   storage='hdf5',
                   cache=None)

    # hdf5 storage is expected to be rejected when no cache is given
    with pytest.raises(Exception):
        create_genomic_array(
            GenomicIndexer.create_from_genomesize({'chr10': 300}),
            **options)
Esempio n. 14
0
def test_gindexer_short_interval_with_dataframe():
    """Same interval counts as the file-based test, fed from a DataFrame."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    regions = pd.read_csv(os.path.join(resources, 'sample_equalsize.bed'),
                          sep='\t',
                          header=None,
                          names=['chrom', 'start', 'end'])

    # (keyword arguments for create_from_file, expected number of intervals)
    cases = [
        (dict(binsize=200, stepsize=200), 4),
        (dict(binsize=180, stepsize=20), 8),
        (dict(binsize=210, stepsize=20, zero_padding=False), 0),
        (dict(binsize=210, stepsize=20, zero_padding=True), 4),
    ]

    for kwargs, expected in cases:
        gi = GenomicIndexer.create_from_file(regions, **kwargs)
        assert len(gi) == expected
Esempio n. 15
0
def test_bwga_instance_unstranded(tmpdir):
    """Read/write round trip on an unstranded, uncached ndarray array."""
    region = Interval('chr10', 100, 120, strand='.')
    gindexer = GenomicIndexer.create_from_genomesize({'chr10': 300})
    ga = create_genomic_array(gindexer,
                              stranded=False,
                              typecode='int8',
                              storage='ndarray',
                              cache=False)

    # the array starts out with zeros for the 20 requested positions
    np.testing.assert_equal(ga[region].shape, (20, 1, 1))
    np.testing.assert_equal(ga[region], np.zeros((20, 1, 1)))

    # fill the region with ones and read the values back
    ga[region, 0] = np.ones((20, 1))
    np.testing.assert_equal(ga[region], np.ones((20, 1, 1)))
    np.testing.assert_equal(ga[region].sum(), 20)

    # nothing outside the written region was touched
    whole_chrom = Interval('chr10', 0, 300, strand='.')
    np.testing.assert_equal(ga[whole_chrom].sum(), 20)
Esempio n. 16
0
def test_gindexer_merged():
    """Check include/exclude filtering by chromosome on ``sample.bed``."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bedfile = os.path.join(resources, 'sample.bed')

    gi = GenomicIndexer.create_from_file(bedfile, binsize=200, stepsize=200)
    np.testing.assert_equal(len(gi), 100)

    only_chr1 = gi.filter_by_region(include='chr1')
    only_chr10 = gi.filter_by_region(include='chr10')
    without_chr2 = gi.filter_by_region(exclude='chr2')
    without_chr10 = gi.filter_by_region(exclude='chr10')

    # (filtered indexer, expected number of remaining intervals)
    expectations = [
        (only_chr1, 50),
        (only_chr10, 0),
        (without_chr2, 50),
        (without_chr10, 100),
    ]
    for filtered, expected in expectations:
        np.testing.assert_equal(len(filtered), expected)
Esempio n. 17
0
def test_output_bed_loss_resolution_equal_stepsize(tmpdir):
    """Export a constant dummy loss to bed files and count the regions.

    The indexer uses binsize=200 and stepsize=200 and the exporter is given
    resolution=200. One bed file per condition must be written, and the
    file for ``c1`` must hold 7 regions that all carry the score 0.1.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    conditions = ['c1', 'c2', 'c3', 'c4']

    inputs = Array("x", numpy.random.random((7, 1, 1, 10)))
    outputs = Array('y',
                    numpy.random.random((7, 1, 1, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu_conv(inputs, outputs)
    regions_bed = pkg_resources.resource_filename('janggu',
                                                  'resources/10regions.bed')
    gi = GenomicIndexer.create_from_file(regions_bed,
                                         binsize=200,
                                         stepsize=200)

    # the scorer ignores its inputs and reports 0.1 for every sample
    dummy_eval = Scorer('loss',
                        lambda t, p: [0.1] * len(t),
                        exporter=export_bed)

    exporter_kwargs = {'gindexer': gi, 'resolution': 200}
    bwm.evaluate(inputs,
                 outputs,
                 callbacks=[dummy_eval],
                 exporter_kwargs=exporter_kwargs)

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'loss.nptest.y.{}.bed')

    for cond in conditions:
        assert os.path.exists(file_.format(cond))

    bed = iter(HTSeq.BED_Reader(file_.format('c1')))

    nreg = 0
    for nreg, reg in enumerate(bed, start=1):
        numpy.testing.assert_equal(reg.score, 0.1)

    assert nreg == 7, 'There should be 7 regions in the bed file.'
Esempio n. 18
0
def test_bwga_instance_stranded(tmpdir):
    """Read/write round trip on a stranded ndarray genomic array."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    region = Interval('chr10', 100, 120, strand='+')
    gindexer = GenomicIndexer.create_from_genomesize({'chr10': 300})
    ga = create_genomic_array(gindexer,
                              stranded=True,
                              typecode='int8',
                              storage='ndarray')

    # a stranded array exposes two entries along the second axis
    np.testing.assert_equal(ga[region].shape, (20, 2, 1))
    np.testing.assert_equal(ga[region], np.zeros((20, 2, 1)))

    # ones in the first entry of the second axis, zeros in the other
    expected = np.zeros((20, 2, 1))
    expected[:, :1, :] = 1
    ga[region, 0] = expected[:, :, 0]
    np.testing.assert_equal(ga[region], expected)
    np.testing.assert_equal(ga[region].sum(), 20)

    # nothing outside the written region was touched
    whole_chrom = Interval('chr10', 0, 300)
    np.testing.assert_equal(ga[whole_chrom].sum(), 20)
Esempio n. 19
0
def test_output_bed_loss_resolution_unequal_stepsize(tmpdir):
    """Export a dummy loss to bed files with resolution below the stepsize.

    The indexer uses binsize=200 and stepsize=200 while the exporter is
    created with resolution=50. The bed file for ``c1`` must hold 28
    regions that all carry the score 0.1.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    conditions = ['c1', 'c2', 'c3', 'c4']

    inputs = Array("x", numpy.random.random((7, 4, 1, 10)))
    outputs = Array('y', numpy.random.random((7, 4, 1, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)
    regions_bed = pkg_resources.resource_filename('janggu',
                                                  'resources/10regions.bed')
    gi = GenomicIndexer.create_from_file(regions_bed,
                                         binsize=200,
                                         stepsize=200)

    # the scorer ignores its inputs and reports 0.1 for every sample
    exporter = ExportBed(gindexer=gi, resolution=50)
    dummy_eval = Scorer('loss', lambda t, p: [0.1] * len(t),
                        exporter=exporter)

    bwm.evaluate(inputs, outputs, callbacks=[dummy_eval])

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'loss.{}.bed')

    for cond in conditions:
        assert os.path.exists(file_.format(cond))

    bed = BedTool(file_.format('c1'))

    nreg = 0
    for nreg, reg in enumerate(bed, start=1):
        numpy.testing.assert_equal(float(reg.score), 0.1)

    assert nreg == 28, 'There should be 28 regions in the bed file.'
Esempio n. 20
0
def test_perctrim(tmpdir):
    """Smoke test: 'binsizenorm' + 'perctrim' normalization must not raise."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        # random signal centred at 10 on chr1 and at 100 on chr2
        chr1_values = np.random.normal(loc=10, size=150).reshape(-1, 1)
        garray[Interval('chr1', 0, 150), 0] = chr1_values
        chr2_values = np.random.normal(loc=100, size=300).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = chr2_values
        return garray

    for store in ['ndarray', 'hdf5']:
        gindexer = GenomicIndexer.create_from_genomesize({'chr1': 150,
                                                          'chr2': 300})
        # only the construction itself is exercised; nothing is asserted
        ga = create_genomic_array(gindexer,
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache="cache_file",
                                  loader=loading,
                                  normalizer=['binsizenorm', 'perctrim'])
Esempio n. 21
0
def test_zscore_normalization(tmpdir):
    """Check 'zscore'-normalized statistics and values for two storages."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    tolerance = dict(rtol=1e-5, atol=1e-5)

    def loading(garray):
        # constant signal: +1 along chr1 and -1 along chr2
        chr1_values = np.repeat(1, 150).reshape(-1, 1)
        garray[Interval('chr1', 0, 150), 0] = chr1_values
        chr2_values = np.repeat(-1, 300).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = chr2_values
        return garray

    for store in ['ndarray', 'hdf5']:
        gindexer = GenomicIndexer.create_from_genomesize({'chr1': 150,
                                                          'chr2': 300})
        ga = create_genomic_array(gindexer,
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache=True,
                                  loader=loading,
                                  normalizer=['zscore'])

        # after normalization: weighted mean 0 and weighted sd 1
        np.testing.assert_allclose(ga.weighted_mean(),
                                   np.asarray([0.0]),
                                   **tolerance)
        np.testing.assert_allclose(ga.weighted_sd(),
                                   np.asarray([1.]),
                                   **tolerance)

        # individual positions on both chromosomes
        np.testing.assert_allclose(ga[Interval('chr1', 100, 101)],
                                   np.asarray([[[1.412641340027806]]]),
                                   **tolerance)
        np.testing.assert_allclose(ga[Interval('chr2', 100, 101)],
                                   np.asarray([[[-0.706320670013903]]]),
                                   **tolerance)
Esempio n. 22
0
def test_output_bed_predict_denseout(tmpdir):
    """Export constant dummy predictions to bed files and count the regions.

    One bed file per condition must be written, and the file for ``c1``
    must hold 7 regions that all carry the score 0.1.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    conditions = ['c1', 'c2', 'c3', 'c4']

    inputs = Array("x", numpy.random.random((7, 10)))
    outputs = Array('y',
                    numpy.random.random((7, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)
    regions_bed = pkg_resources.resource_filename('janggu',
                                                  'resources/10regions.bed')
    gi = GenomicIndexer.create_from_file(regions_bed,
                                         binsize=200,
                                         stepsize=200)

    # the scorer ignores the predictions and reports 0.1 for every sample
    exporter = ExportBed(gindexer=gi, resolution=200)
    dummy_eval = Scorer('pred',
                        lambda p: [0.1] * len(p),
                        exporter=exporter,
                        conditions=['c1', 'c2', 'c3', 'c4'])

    bwm.predict(inputs, callbacks=[dummy_eval])

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'pred.nptest.y.{}.bed')

    for cond in conditions:
        assert os.path.exists(file_.format(cond))

    bed = iter(HTSeq.BED_Reader(file_.format('c1')))

    nreg = 0
    for nreg, reg in enumerate(bed, start=1):
        numpy.testing.assert_equal(reg.score, 0.1)

    assert nreg == 7, 'There should be 7 regions in the bed file.'
Esempio n. 23
0
def test_output_bigwig_predict_denseout(tmpdir):
    """Export constant dummy predictions to bigwig and check the values.

    The genomic indexer is built from ``10regions.bed`` with binsize=200 and
    stepsize=200. One bigwig file per condition must be written, and the
    mean of the values read back for condition ``c1`` must equal the
    constant score 0.1 produced by the dummy scorer.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("x", numpy.random.random((7, 10)))
    outputs = Array('y',
                    numpy.random.random((7, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)
    data_path = pkg_resources.resource_filename('janggu',
                                                'resources/10regions.bed')

    gi = GenomicIndexer.create_from_file(data_path, binsize=200, stepsize=200)

    # the scorer ignores the predictions and reports 0.1 for every sample
    dummy_eval = Scorer('pred',
                        lambda p: [0.1] * len(p),
                        exporter=export_bigwig,
                        conditions=['c1', 'c2', 'c3', 'c4'])

    bwm.predict(inputs,
                callbacks=[dummy_eval],
                exporter_kwargs={'gindexer': gi})

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'pred.nptest.y.{}.bigwig')

    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(file_.format(cond))

    bw = pyBigWig.open(file_.format('c1'))
    try:
        co = bw.values('chr1', 600, 2000)
    finally:
        # release the file handle even if reading the values fails
        bw.close()

    numpy.testing.assert_allclose(numpy.mean(co), 0.1, rtol=1e-5)
Esempio n. 24
0
def test_check_resolution_collapse_compatibility(tmpdir):
    """Check which resolution/collapser combinations are accepted.

    With ``collapser=None``, both ``resolution=50`` and ``resolution=None``
    must raise. The three remaining constructions must complete without an
    exception.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        # constant signal: 10 along chr1 and 1 along chr2
        chr1_values = np.repeat(10, 150).reshape(-1, 1)
        garray[Interval('chr1', 0, 150), 0] = chr1_values
        chr2_values = np.repeat(1, 300).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = chr2_values
        return garray

    # keyword arguments shared by every construction in this test
    shared = dict(stranded=False,
                  typecode='float32',
                  storage="ndarray",
                  loader=loading)

    # rejected: no collapser defined for resolution=50 or resolution=None
    for rejected_resolution in (50, None):
        with pytest.raises(Exception):
            ga = create_genomic_array(
                GenomicIndexer.create_from_genomesize({'chr1': 150,
                                                       'chr2': 300}),
                cache=None,
                resolution=rejected_resolution,
                collapser=None,
                normalizer=['tpm'],
                **shared)

    # accepted: resolution=1 on equally sized 150 bp bins
    binned = GenomicIndexer.create_from_file(
        [
            Interval('chr1', 0, 150),
            Interval('chr2', 0, 150),
            Interval('chr2', 150, 300)
        ],
        binsize=150,
        stepsize=None,
    )
    ga = create_genomic_array(binned,
                              cache=None,
                              resolution=1,
                              **shared)

    # accepted: resolution=None with a collapser, cached
    collapsed = GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150),
         Interval('chr2', 0, 300)],
        binsize=None,
        stepsize=None,
        collapse=True)
    ga = create_genomic_array(collapsed,
                              cache='test',
                              resolution=None,
                              store_whole_genome=None,
                              collapser='sum',
                              **shared)

    # accepted: resolution=None with a collapser and 'tpm' normalization
    collapsed = GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150),
         Interval('chr2', 0, 300)],
        binsize=None,
        stepsize=None,
        collapse=True)
    ga = create_genomic_array(collapsed,
                              cache=None,
                              resolution=None,
                              collapser='sum',
                              normalizer=['tpm'],
                              **shared)
Esempio n. 25
0
# Script excerpt: run variant effect prediction and wrap the resulting
# scores in a Cover object. `vcfoutput`, `model`, `DNA` and `VCFFILE` are
# defined outside this excerpt.
# create the output folder; an already existing folder is not an error
os.makedirs(vcfoutput, exist_ok=True)

# perform variant effect prediction using Bioseq object and
# a VCF file
scoresfile, variantsfile = model.predict_variant_effect(
    DNA, VCFFILE, conditions=['feature'], output_folder=vcfoutput)

# NOTE(review): the values returned above are overwritten right away with
# fixed file names inside vcfoutput -- confirm whether both are needed
scoresfile = os.path.join(vcfoutput, 'scores.hdf5')
variantsfile = os.path.join(vcfoutput, 'snps.bed.gz')

# parse the variant effect predictions (difference between
# reference and alternative variant) into a Cover object
# for the purpose of visualization
# NOTE(review): the file is opened read-only and not closed within this
# excerpt -- confirm it is closed further down
f = h5py.File(scoresfile, 'r')

# binsize and stepsize are passed as None
gindexer = GenomicIndexer.create_from_file(variantsfile, None, None)

# NOTE(review): snpcov is assigned twice; the store_whole_genome=True
# object is replaced immediately by the store_whole_genome=False one.
# Presumably both variants are shown for illustration -- TODO confirm
snpcov = Cover.create_from_array('snps',
                                 f['diffscore'],
                                 gindexer,
                                 store_whole_genome=True,
                                 padding_value=np.nan)
snpcov = Cover.create_from_array('snps',
                                 f['diffscore'],
                                 gindexer,
                                 store_whole_genome=False,
                                 padding_value=np.nan)

# pick the region at index 3 of the DNA indexer and keep its chromosome
# and start coordinate
gi = DNA.gindexer[3]
chrom = gi.chrom
start = gi.start
Esempio n. 26
0
def test_filter_by_region():
    """Check ``filter_by_region`` with start/end limits on 2 bp bins."""
    roi_file = pkg_resources.resource_filename('janggu',
                                               'resources/bed_test.bed')

    def render(indexer):
        # one line per interval, each terminated by a newline
        return "".join(str(region) + "\n" for region in indexer)

    f1 = GenomicIndexer.create_from_file(regions=roi_file,
                                         binsize=2,
                                         stepsize=2)
    np.testing.assert_equal(len(f1), 9)

    # the unfiltered indexer covers chr1:0-18 in nine bins
    j = render(f1)
    prv = ("chr1:[0,2)/+\n"
           "chr1:[2,4)/+\n"
           "chr1:[4,6)/+\n"
           "chr1:[6,8)/+\n"
           "chr1:[8,10)/+\n"
           "chr1:[10,12)/+\n"
           "chr1:[12,14)/+\n"
           "chr1:[14,16)/+\n"
           "chr1:[16,18)/+\n")
    np.testing.assert_equal(j, prv)

    # filtering with the full range keeps every bin
    test1 = f1.filter_by_region(include='chr1', start=0, end=18)
    np.testing.assert_equal(j, render(test1))

    # start=5, end=10
    test2 = f1.filter_by_region(include='chr1', start=5, end=10)
    prv2 = ("chr1:[4,6)/+\n"
            "chr1:[6,8)/+\n"
            "chr1:[8,10)/+\n")
    np.testing.assert_equal(render(test2), prv2)

    # start=5, end=11
    test3 = f1.filter_by_region(include='chr1', start=5, end=11)
    prv3 = ("chr1:[4,6)/+\n"
            "chr1:[6,8)/+\n"
            "chr1:[8,10)/+\n"
            "chr1:[10,12)/+\n")
    np.testing.assert_equal(render(test3), prv3)

    # start=6, end=10
    test4 = f1.filter_by_region(include='chr1', start=6, end=10)
    prv4 = ("chr1:[6,8)/+\n"
            "chr1:[8,10)/+\n")
    np.testing.assert_equal(render(test4), prv4)

    # start=6, end=11
    test5 = f1.filter_by_region(include='chr1', start=6, end=11)
    prv5 = ("chr1:[6,8)/+\n"
            "chr1:[8,10)/+\n"
            "chr1:[10,12)/+\n")
    np.testing.assert_equal(render(test5), prv5)

    # a range beyond the last bin leaves nothing
    test6 = f1.filter_by_region(include='chr1', start=20, end=30)
    np.testing.assert_equal(len(test6), 0)