Exemple #1
0
    def test_count_alleles(self):
        '''Exercise counts_by_row on HDF5-backed and literal genotypes.'''
        # First chunk of the 1000-SNP fixture against hand-written counts.
        h5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        first_chunk = first(h5.iterate_chunks())
        expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
        counts = counts_by_row(first_chunk['/calls/GT'], missing_value=-1)
        assert numpy.all(expected == counts)

        # Larger matrix built from the RIL fixture.  The result is not
        # asserted on; this part only checks that the call completes.
        h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        gt_chunks = (part['/calls/GT']
                     for part in h5.iterate_chunks(kept_fields=['/calls/GT']))
        matrix = first(gt_chunks)
        for _ in range(20):
            extend_matrix(matrix, gt_chunks)
        counts = counts_by_row(matrix, missing_value=-1)

        # Literal genotypes paired with the counts they must compare equal to.
        literal_cases = [
            ([[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]],
             [[6]]),
            ([[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]]],
             [[6, 6]]),
        ]
        for raw_gts, expected_counts in literal_cases:
            counts = counts_by_row(numpy.array(raw_gts), missing_value=-1)
            assert numpy.all(counts == expected_counts)
    def test_count_alleles(self):
        '''Exercise counts_by_row on HDF5-backed and literal genotypes.'''
        # First chunk of the 1000-SNP fixture against hand-written counts.
        h5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        first_chunk = first(h5.iterate_chunks())
        expected = [[3, 3, 0],
                    [5, 1, 0],
                    [0, 2, 4],
                    [6, 0, 0],
                    [2, 3, 1]]
        counts = counts_by_row(first_chunk['/calls/GT'], missing_value=-1)
        assert numpy.all(expected == counts)

        # Larger matrix built from the RIL fixture.  The result is not
        # asserted on; this part only checks that the call completes.
        h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        gt_chunks = (part['/calls/GT']
                     for part in h5.iterate_chunks(kept_fields=['/calls/GT']))
        matrix = first(gt_chunks)
        for _ in range(20):
            extend_matrix(matrix, gt_chunks)
        counts = counts_by_row(matrix, missing_value=-1)

        # Literal genotypes paired with the counts they must compare equal to.
        literal_cases = [
            ([[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]],
             [[6]]),
            ([[[0, 0], [0, 0], [0, 0]],
              [[0, 0], [0, 0], [0, 0]]],
             [[6, 6]]),
        ]
        for raw_gts, expected_counts in literal_cases:
            counts = counts_by_row(numpy.array(raw_gts), missing_value=-1)
            assert numpy.all(counts == expected_counts)
Exemple #3
0
    def gts_as_mat012(self):
        '''Recode the GT matrix relative to each row's major allele.

        Each cell of the result is the number of entries along axis 2 of
        the genotype that differ from the row's major allele, i.e.
        0 (homozygous for the major allele), 1 (het) or 2 (homozygous
        for another allele) for diploid calls.  Cells whose genotype
        contains MISSING_INT are set to MISSING_INT.  When counts_by_row
        returns None, a matrix filled with MISSING_INT is returned.
        '''
        genotypes = self[GT_FIELD]
        allele_counts = counts_by_row(genotypes, missing_value=MISSING_INT)
        if allele_counts is None:
            n_rows = genotypes.shape[0]
            n_cols = genotypes.shape[1]
            return numpy.full((n_rows, n_cols), fill_value=MISSING_INT)

        # Materialise an on-disk dataset before the element-wise comparisons.
        if is_dataset(genotypes):
            genotypes = genotypes[:]

        # The major allele is taken as the index of the largest count per row.
        major = numpy.argmax(allele_counts, axis=1)
        differs_from_major = genotypes != major[:, None, None]
        recoded = numpy.sum(differs_from_major, axis=2)

        has_missing = numpy.any(genotypes == MISSING_INT, axis=2)
        recoded[has_missing] = MISSING_INT
        return recoded
    def gts_as_mat012(self):
        '''Recode the GT matrix relative to each row's major allele.

        Each cell of the result is the number of entries along axis 2 of
        the genotype that differ from the row's major allele, i.e.
        0 (homozygous for the major allele), 1 (het) or 2 (homozygous
        for another allele) for diploid calls.  Cells whose genotype
        contains MISSING_INT are set to MISSING_INT.  When counts_by_row
        returns None, a matrix filled with MISSING_INT is returned.
        '''
        genotypes = self[GT_FIELD]
        allele_counts = counts_by_row(genotypes, missing_value=MISSING_INT)
        if allele_counts is None:
            n_rows = genotypes.shape[0]
            n_cols = genotypes.shape[1]
            return numpy.full((n_rows, n_cols), fill_value=MISSING_INT)

        # Materialise an on-disk dataset before the element-wise comparisons.
        if is_dataset(genotypes):
            genotypes = genotypes[:]

        # The major allele is taken as the index of the largest count per row.
        major = numpy.argmax(allele_counts, axis=1)
        differs_from_major = genotypes != major[:, None, None]
        recoded = numpy.sum(differs_from_major, axis=2)

        has_missing = numpy.any(genotypes == MISSING_INT, axis=2)
        recoded[has_missing] = MISSING_INT
        return recoded
Exemple #5
0
 def allele_count(self):
     '''Return the allele counts of every '/calls/GT' chunk, stacked by row.

     counts_by_row is run on each chunk and the per-chunk results are
     concatenated along axis 0.  When two results differ in their
     trailing shape, the narrower one is right-padded with zero columns
     before concatenating.  Returns None if no chunk produced counts.
     '''
     def _widen(narrow, wide, dtype):
         # Right-pad ``narrow`` with zeros up to ``wide``'s last-axis size.
         pad_shape = list(narrow.shape)
         pad_shape[-1] = wide.shape[-1] - narrow.shape[-1]
         return numpy.hstack((narrow, numpy.zeros(pad_shape, dtype=dtype)))

     total = None
     gt_chunks = select_dset_from_chunks(self.iterate_chunks(), '/calls/GT')
     for gt_chunk in gt_chunks:
         partial = counts_by_row(gt_chunk,
                                 missing_value=MISSING_VALUES[int])
         if total is None:
             total = partial
             continue

         # NOTE(review): counts_by_row appears able to return None (other
         # callers check for it); such a chunk would fail on ``.shape``
         # below -- confirm whether that can happen here.
         if total.shape[1:] < partial.shape[1:]:
             total = _widen(total, partial, partial.dtype)
         elif total.shape[1:] > partial.shape[1:]:
             partial = _widen(partial, total, partial.dtype)
         total = numpy.concatenate([total, partial], axis=0)
     return total
 def allele_count(self):
     '''Return the allele counts of every '/calls/GT' chunk, stacked by row.

     counts_by_row is run on each chunk and the per-chunk results are
     concatenated along axis 0.  When two results differ in their
     trailing shape, the narrower one is right-padded with zero columns
     before concatenating.  Returns None if no chunk produced counts.
     '''
     def _widen(narrow, wide, dtype):
         # Right-pad ``narrow`` with zeros up to ``wide``'s last-axis size.
         pad_shape = list(narrow.shape)
         pad_shape[-1] = wide.shape[-1] - narrow.shape[-1]
         return numpy.hstack((narrow, numpy.zeros(pad_shape, dtype=dtype)))

     total = None
     gt_chunks = select_dset_from_chunks(self.iterate_chunks(), '/calls/GT')
     for gt_chunk in gt_chunks:
         partial = counts_by_row(gt_chunk,
                                 missing_value=MISSING_VALUES[int])
         if total is None:
             total = partial
             continue

         # NOTE(review): counts_by_row appears able to return None (other
         # callers check for it); such a chunk would fail on ``.shape``
         # below -- confirm whether that can happen here.
         if total.shape[1:] < partial.shape[1:]:
             total = _widen(total, partial, partial.dtype)
         elif total.shape[1:] > partial.shape[1:]:
             partial = _widen(partial, total, partial.dtype)
         total = numpy.concatenate([total, partial], axis=0)
     return total
Exemple #7
0
 def allele_count(self):
     '''Return counts_by_row for the whole '/calls/GT' matrix.

     MISSING_VALUES[int] is passed as the missing-value marker.
     '''
     return counts_by_row(self['/calls/GT'],
                          missing_value=MISSING_VALUES[int])
 def allele_count(self):
     '''Return counts_by_row for the whole '/calls/GT' matrix.

     MISSING_VALUES[int] is passed as the missing-value marker.
     '''
     return counts_by_row(self['/calls/GT'],
                          missing_value=MISSING_VALUES[int])