def test_create_hdf5_with_chunks(self):
    """Check that put_chunks can rebuild a variation store from chunks.

    Scenarios, in order:
      * full copy of 1000snps.hdf5 into a new HDF5 file,
      * copy restricted to '/calls/GT' through kept_fields,
      * random subsample (rate 0.2) of ril.hdf5 into VariationsArrays,
      * putting empty chunk collections (None and []),
      * sample rates 0 and 0.01.
    """
    # Full copy: every call field is expected in the new file.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    # Only the unique path is needed; closing deletes the temporary file so
    # that VariationsH5 can create it from scratch.
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Copy keeping a single field: only GT may show up under 'calls'.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random subsample into an in-memory store.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    # The sampled positions should not differ significantly from the
    # positions of the whole data set.
    _, prob = ttest_ind(hdf5['/variations/pos'][:],
                        hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # Two-sided tolerance: without abs() a sample that is far too small
    # (even an empty one) would satisfy the inequality.
    sampled_ratio = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_ratio - 0.2) < 0.1

    # The first sampled variation must carry the same genotypes as the
    # variation found at that chrom/pos in the source.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    # NOTE(review): presumably put_chunks appends, so row 0 is expected to
    # be unchanged after the extra chunk -- confirm against put_chunks.
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A sample rate of 0 must keep nothing.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # Very low sample rate: smoke check only, nothing is asserted.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
def test_create_hdf5_with_chunks(self):
    """Check that put_chunks can rebuild a variation store from chunks.

    Scenarios, in order:
      * full copy of 1000snps.hdf5 into a new HDF5 file,
      * copy restricted to '/calls/GT' through kept_fields,
      * random subsample (rate 0.2) of ril.hdf5 into VariationsArrays,
      * putting empty chunk collections (None and []),
      * sample rates 0 and 0.01.
    """
    # Full copy: every call field is expected in the new file.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    # Only the unique path is needed; closing deletes the temporary file so
    # that VariationsH5 can create it from scratch.
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Copy keeping a single field: only GT may show up under 'calls'.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random subsample into an in-memory store.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    # The sampled positions should not differ significantly from the
    # positions of the whole data set.
    _, prob = scipy.stats.ttest_ind(hdf5['/variations/pos'][:],
                                    hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # Two-sided tolerance: without abs() a sample that is far too small
    # (even an empty one) would satisfy the inequality.
    sampled_ratio = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_ratio - 0.2) < 0.1

    # The first sampled variation must carry the same genotypes as the
    # variation found at that chrom/pos in the source.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    # NOTE(review): presumably put_chunks appends, so row 0 is expected to
    # be unchanged after the extra chunk -- confirm against put_chunks.
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A sample rate of 0 must keep nothing.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # Very low sample rate: smoke check only, nothing is asserted.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
def test_index(self):
    """Exercise PosIndex lookups on a small four-chromosome data set."""
    variations = VariationsArrays()
    variations[CHROM_FIELD] = numpy.array(
        [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])
    variations[POS_FIELD] = numpy.array(
        [1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 4, 6])
    pos_idx = PosIndex(variations)

    # (chrom, pos, expected row). On chromosome 4 the queried positions 1
    # and 3 are not stored; the expected row is that of the next stored
    # position.
    expected_rows = (
        (1, 1, 0),
        (2, 1, 3),
        (3, 1, 6),
        (4, 1, 9),
        (4, 2, 9),
        (4, 3, 10),
        (4, 4, 10),
    )
    for chrom, position, row in expected_rows:
        assert pos_idx.index_pos(chrom, position) == row

    assert pos_idx.get_chrom_range_index(1) == (0, 2)
    assert pos_idx.get_chrom_range_pos(1) == (1, 3)
    # NOTE(review): 10 matches the sum of per-chromosome spans
    # (2 + 2 + 2 + 4) -- confirm against PosIndex.covered_length.
    assert pos_idx.covered_length == 10
def test_index(self):
    """Exercise PosIndex lookups on a small four-chromosome data set."""
    variations = VariationsArrays()
    variations['/variations/chrom'] = numpy.array(
        [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])
    variations['/variations/pos'] = numpy.array(
        [1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 4, 6])
    pos_idx = PosIndex(variations)

    # (chrom, pos, expected row). On chromosome 4 the queried positions 1
    # and 3 are not stored; the expected row is that of the next stored
    # position.
    expected_rows = (
        (1, 1, 0),
        (2, 1, 3),
        (3, 1, 6),
        (4, 1, 9),
        (4, 2, 9),
        (4, 3, 10),
        (4, 4, 10),
    )
    for chrom, position, row in expected_rows:
        assert pos_idx.index_pos(chrom, position) == row

    assert pos_idx.get_chrom_range_index(1) == (0, 2)
    assert pos_idx.get_chrom_range_pos(1) == (1, 3)
    # NOTE(review): 10 matches the sum of per-chromosome spans
    # (2 + 2 + 2 + 4) -- confirm against PosIndex.covered_length.
    assert pos_idx.covered_length == 10
def pos_index(self):
    """Return the PosIndex of this object, building it on first use.

    The index is kept in self._index, so later calls return the stored
    object instead of constructing a new PosIndex.
    """
    # Anything other than None (even a falsy value) counts as already built.
    if self._index is not None:
        return self._index
    self._index = PosIndex(self)
    return self._index