def test_count_value_per_row(self):
    """Check the counters built by row_value_counter_fact.

    The counter is exercised on a small in-memory matrix and on the
    '/calls/GT' dataset of the first chunk of the 1000snps.hdf5 fixture,
    both as absolute per-row counts and as per-row ratios (ratio=True).
    """
    mat = numpy.array([[0, 0], [1, -1], [2, -1], [-1, -1]])

    # Absolute number of -1 values found in every row.
    missing_counter = row_value_counter_fact(value=-1)
    assert numpy.all(missing_counter(mat) == [0, 1, 1, 2])

    # Same count expressed as a fraction of the row.
    missing_counter = row_value_counter_fact(value=-1, ratio=True)
    assert numpy.allclose(missing_counter(mat), [0., 0.5, 0.5, 1.])

    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    # Close the file even when an assertion fails, so a failing test does
    # not leave the HDF5 handle open for the rest of the test run.
    try:
        chunks = list(hdf5.iterate_chunks())
        gt_chunk = first(select_dset_from_chunks(chunks, '/calls/GT'))

        homo_counter = row_value_counter_fact(value=2)
        assert numpy.all(homo_counter(gt_chunk) == [0, 0, 4, 0, 1])

        homo_ratio_counter = row_value_counter_fact(value=2, ratio=True)
        expected = [0., 0, 0.66666, 0., 0.166666]
        assert numpy.allclose(homo_ratio_counter(gt_chunk), expected)
    finally:
        hdf5.close()
def _count_gts(variations):
    """Count, for every row, how many times each genotype appears.

    The genotype array is read from ``variations[GT_FIELD]``; its third
    axis holds the alleles of one genotype, so its length is the ploidy.
    Genotypes with at least one allele equal to MISSING_INT are ignored.

    Returns a dict that maps each genotype tuple (as built by
    _packed_gt_to_tuple) to the per-row counts produced by
    row_value_counter_fact.

    NOTE(review): when there are no rows a 3-tuple of empty arrays is
    returned instead of a dict. This is kept as is for backward
    compatibility, but callers should confirm that is what they expect.
    """
    gts = variations[GT_FIELD]
    if gts.shape[0] == 0:
        return numpy.array([]), numpy.array([]), numpy.array([])

    # presumably this loads a lazily backed dataset into memory -- confirm
    gts = gts[...]
    ploidy = gts.shape[2]

    # A genotype is missing as soon as any of its alleles is missing.
    miss_gts = numpy.any(gts == MISSING_INT, axis=2)

    # Pack the alleles found along the third axis into a single integer,
    # using two decimal digits per allele (base 100):
    # (1, 1) -> 101, (1, 0) -> 1, (0, 1) -> 100, (0, 0) -> 0.
    # int64 is used because int16 cannot hold the packed value once the
    # ploidy is higher than two (e.g. 4 * 100**2 > 32767). This packing
    # assumes allele indexes below 100 and a ploidy of at most 9.
    packed_gts = numpy.zeros(gts.shape[:2], dtype=numpy.int64)
    for idx in range(ploidy):
        packed_gts += gts[:, :, idx].astype(numpy.int64) * (100 ** idx)
    packed_gts[miss_gts] = MISSING_INT

    counts = {}
    for gt in numpy.unique(packed_gts):
        # Skip missing genotypes before doing any counting work for them.
        if gt == MISSING_INT:
            continue
        gt_counts = row_value_counter_fact(gt)(packed_gts)
        unpacked_gt = _packed_gt_to_tuple(gt, ploidy)
        # Different packed values can be unpacked into the same tuple
        # (presumably 0/1 and 1/0 are normalised to one genotype --
        # confirm in _packed_gt_to_tuple), so their counts are added up.
        if unpacked_gt in counts:
            counts[unpacked_gt] += gt_counts
        else:
            counts[unpacked_gt] = gt_counts
    return counts
def _count_gts(variations):
    """Count, for every row, how many times each genotype appears.

    The genotype array is read from ``variations[GT_FIELD]``; its third
    axis holds the alleles of one genotype, so its length is the ploidy.
    Genotypes with at least one allele equal to MISSING_INT are ignored.

    Returns a dict that maps each genotype tuple (as built by
    _packed_gt_to_tuple) to the per-row counts produced by
    row_value_counter_fact.

    NOTE(review): when there are no rows a 3-tuple of empty arrays is
    returned instead of a dict. This is kept as is for backward
    compatibility, but callers should confirm that is what they expect.
    """
    gts = variations[GT_FIELD]
    if gts.shape[0] == 0:
        return numpy.array([]), numpy.array([]), numpy.array([])

    # presumably this loads a lazily backed dataset into memory -- confirm
    gts = gts[...]
    ploidy = gts.shape[2]

    # A genotype is missing as soon as any of its alleles is missing.
    miss_gts = numpy.any(gts == MISSING_INT, axis=2)

    # Pack the alleles found along the third axis into a single integer,
    # using two decimal digits per allele (base 100):
    # (1, 1) -> 101, (1, 0) -> 1, (0, 1) -> 100, (0, 0) -> 0.
    # int64 is used because int16 cannot hold the packed value once the
    # ploidy is higher than two (e.g. 4 * 100**2 > 32767). This packing
    # assumes allele indexes below 100 and a ploidy of at most 9.
    packed_gts = numpy.zeros(gts.shape[:2], dtype=numpy.int64)
    for idx in range(ploidy):
        packed_gts += gts[:, :, idx].astype(numpy.int64) * (100 ** idx)
    packed_gts[miss_gts] = MISSING_INT

    counts = {}
    for gt in numpy.unique(packed_gts):
        # Skip missing genotypes before doing any counting work for them.
        if gt == MISSING_INT:
            continue
        gt_counts = row_value_counter_fact(gt)(packed_gts)
        unpacked_gt = _packed_gt_to_tuple(gt, ploidy)
        # Different packed values can be unpacked into the same tuple
        # (presumably 0/1 and 1/0 are normalised to one genotype --
        # confirm in _packed_gt_to_tuple), so their counts are added up.
        if unpacked_gt in counts:
            counts[unpacked_gt] += gt_counts
        else:
            counts[unpacked_gt] = gt_counts
    return counts