def test_itereate_chunks(self):
    """Check the first two chunks yielded by ``iterate_matrix_chunks``.

    A 2x3 matrix is iterated with ``chunk_size=1``, first as a whole and
    then restricted with ``sample_idx=1``.
    """
    matrix = numpy.array([[1, 2, 3], [4, 5, 6]])

    # Whole rows, one row per chunk.
    expected = [[[1, 2, 3]], [[4, 5, 6]]]
    chunks = list(iterate_matrix_chunks(matrix, chunk_size=1))
    for position in (0, 1):
        assert numpy.all(chunks[position] == expected[position])

    # Same iteration, but selecting only index 1 via ``sample_idx``.
    expected = [[2], [5]]
    chunks = list(iterate_matrix_chunks(matrix, chunk_size=1, sample_idx=1))
    for position in (0, 1):
        assert numpy.all(chunks[position] == expected[position])
def _row_value_counter(mat, value, ratio=False, by_chunk=False): ndims = len(mat.shape) if ndims == 1: raise ValueError('The matrix has to have at least 2 dimensions') elif ndims == 2: axes = 1 else: axes = tuple([i + 1 for i in range(ndims - 1)]) if by_chunk: chunks = iterate_matrix_chunks(mat) result = numpy.zeros(mat.shape[0]) start = 0 for chunk in chunks: chunk_result = _row_value_counter_array(chunk, value, axes) end = start + chunk_result.shape[0] result[start:end] = chunk_result start = end else: if is_dataset(mat): mat = mat[...] result = _row_value_counter_array(mat, value, axes) if ratio: num_items_per_row = reduce(operator.mul, mat.shape[1:], 1) result = result / num_items_per_row return result
def is_variant(chunks):
    """Flag rows whose genotype block contains at least one value > 0.

    The matrix returned by ``_get_chunks_gt(chunks)`` is walked with
    ``iterate_matrix_chunks``; each chunk is reduced over axes 1 and 2.

    Returns:
        A 1-D array with one entry per row. Note that the output buffer
        uses numpy's default float dtype, so flags are stored as
        ``1.0`` / ``0.0`` rather than as booleans.
    """
    genotypes = _get_chunks_gt(chunks)
    flags = numpy.empty(genotypes.shape[0])

    offset = 0
    for block in iterate_matrix_chunks(genotypes):
        upper = offset + block.shape[0]
        # "at least one positive entry" over the two trailing axes
        flags[offset:upper] = numpy.any(block > 0, axis=(1, 2))
        offset = upper
    return flags
def allele_count(chunks, allele=1):
    """Count, per variant, how many genotype entries equal ``allele``.

    The matrix returned by ``_get_chunks_gt(chunks)`` is walked with
    ``iterate_matrix_chunks``; each chunk is reduced over axes 1 and 2.

    Args:
        chunks: Input passed to ``_get_chunks_gt``.
        allele: Value to look for in the genotype matrix (default ``1``).

    Returns:
        A 1-D array (numpy's default float dtype) with one count per row.
    """
    genotypes = _get_chunks_gt(chunks)
    counts = numpy.empty(genotypes.shape[0])

    offset = 0
    for block in iterate_matrix_chunks(genotypes):
        upper = offset + block.shape[0]
        matches = block == allele
        counts[offset:upper] = numpy.count_nonzero(matches, axis=(1, 2))
        offset = upper
    return counts
def allele_number(chunks):
    """Count the number of non-missing allele calls per variant.

    An entry is counted when it is ``>= 0`` (negative values presumably
    encode missing calls). The matrix returned by
    ``_get_chunks_gt(chunks)`` is walked with ``iterate_matrix_chunks``
    and each chunk is reduced over axes 1 and 2.

    Returns:
        A 1-D array (numpy's default float dtype) with one count per row.
    """
    genotypes = _get_chunks_gt(chunks)
    called = numpy.empty(genotypes.shape[0])

    offset = 0
    for block in iterate_matrix_chunks(genotypes):
        upper = offset + block.shape[0]
        present = block >= 0
        called[offset:upper] = numpy.count_nonzero(present, axis=(1, 2))
        offset = upper
    return called
def is_doubleton(chunks, allele=1):
    """Find variants where ``allele`` is observed exactly twice.

    The matrix returned by ``_get_chunks_gt(chunks)`` is walked with
    ``iterate_matrix_chunks``; each chunk is reduced over axes 1 and 2.

    Args:
        chunks: Input passed to ``_get_chunks_gt``.
        allele: Value to look for in the genotype matrix (default ``1``).

    Returns:
        A 1-D array with one entry per row. The output buffer uses
        numpy's default float dtype, so flags are stored as
        ``1.0`` / ``0.0`` rather than as booleans.
    """
    genotypes = _get_chunks_gt(chunks)
    flags = numpy.empty(genotypes.shape[0])

    offset = 0
    for block in iterate_matrix_chunks(genotypes):
        upper = offset + block.shape[0]
        occurrences = numpy.count_nonzero(block == allele, axis=(1, 2))
        flags[offset:upper] = occurrences == 2
        offset = upper
    return flags