def _select_mono(self, chunk):
    keep_monomorphic = self.keep_monomorphic

    gts = chunk[GT_FIELD]
    if is_dataset(gts):
        gts = gts[:]
    shape = gts.shape

    # we count how many different alleles there are per row
    # we do it by adding a complex part to each number. The complex part
    # is related to the row. Then we use unique
    weight = 1j * numpy.arange(0, shape[0])
    weight = numpy.repeat(weight, shape[1] * shape[2]).reshape(shape)
    b = gts + weight
    _, ind = numpy.unique(b, return_index=True)
    b = numpy.zeros_like(gts)
    c = numpy.ones_like(gts)
    numpy.put(b, ind, c.flat[ind])
    c = numpy.sum(b, axis=(2, 1))

    # we remove the missing values from the count
    rows_with_missing = numpy.any(gts == -1, axis=(1, 2))
    c -= rows_with_missing

    if keep_monomorphic:
        selected_rows = (c <= 2)
    else:
        selected_rows = (c == 2)

    return selected_rows
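# A minimal standalone sketch of the complex-offset trick used in
# _select_mono, with made-up genotypes: adding a distinct imaginary weight
# per row makes equal allele codes in different rows compare as different
# values, so a single numpy.unique call counts distinct alleles for every
# row at once.
import numpy

gts = numpy.array([[[0, 0], [0, 0]],     # monomorphic: 1 distinct allele
                   [[0, 1], [1, 1]],     # polymorphic: 2 distinct alleles
                   [[0, -1], [1, 1]]])   # a missing call adds a third value
shape = gts.shape
weight = 1j * numpy.arange(0, shape[0])
weight = numpy.repeat(weight, shape[1] * shape[2]).reshape(shape)
_, ind = numpy.unique(gts + weight, return_index=True)
mask = numpy.zeros_like(gts)
numpy.put(mask, ind, 1)           # mark the first occurrence of each value
print(numpy.sum(mask, axis=(2, 1)))  # -> [1 2 3]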
def _row_value_counter(mat, value, ratio=False, by_chunk=False):
    ndims = len(mat.shape)
    if ndims == 1:
        raise ValueError('The matrix has to have at least 2 dimensions')
    elif ndims == 2:
        axes = 1
    else:
        axes = tuple([i + 1 for i in range(ndims - 1)])

    if by_chunk:
        chunks = iterate_matrix_chunks(mat)
        result = numpy.zeros(mat.shape[0])
        start = 0
        for chunk in chunks:
            chunk_result = _row_value_counter_array(chunk, value, axes)
            end = start + chunk_result.shape[0]
            result[start:end] = chunk_result
            start = end
    else:
        if is_dataset(mat):
            mat = mat[...]
        result = _row_value_counter_array(mat, value, axes)

    if ratio:
        num_items_per_row = reduce(operator.mul, mat.shape[1:], 1)
        result = result / num_items_per_row
    return result
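# Standalone sketch of the per-row counting and ratio computed above, with
# a made-up 3D matrix: count occurrences of a value along every axis but
# the first, then divide by the number of items per row.
import operator
from functools import reduce

import numpy

mat = numpy.array([[[0, -1], [-1, -1]],
                   [[1, 1], [0, -1]]])
counts = numpy.sum(mat == -1, axis=(1, 2))
num_items_per_row = reduce(operator.mul, mat.shape[1:], 1)
print(counts / num_items_per_row)  # -> [0.75 0.25]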
def _calc_stat(self, variations):
    stat = variations['/variations/qual']
    if is_dataset(stat):
        stat = stat[:]
    if numpy.issubdtype(stat.dtype, numpy.floating):
        # replace infinities with the largest representable value so that
        # downstream histograms do not choke on them
        stat[numpy.isinf(stat)] = numpy.finfo(stat.dtype).max
    return stat
def __call__(self, variations):
    gts = variations[GT_FIELD][:]
    mat_to_check = variations[self.field_path]
    if is_dataset(variations[GT_FIELD]):
        # load the dataset into memory before masking
        mat_to_check = mat_to_check[:]
    gts[mat_to_check < self.min] = MISSING_INT

    result = {}
    if self.do_filtering:
        copied_vars = variations.get_chunk(slice(None, None),
                                           ignored_fields=[GT_FIELD])
        copied_vars[GT_FIELD] = gts
        result[FLT_VARS] = copied_vars

    if self.do_histogram:
        counts, edges = histogram(mat_to_check, n_bins=self.n_bins,
                                  range_=self.range)
        result[COUNTS] = counts
        result[EDGES] = edges
    return result
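# Standalone sketch of the masking step used by the two __call__ methods
# above and below, with made-up depths: a 2D (snps x samples) boolean mask
# broadcasts over the ploidy axis of the 3D genotype matrix.
import numpy

MISSING_INT = -1  # assumed missing-genotype sentinel
gts = numpy.array([[[0, 0], [0, 1]],
                   [[1, 1], [0, 0]]])
depths = numpy.array([[10, 3],
                      [2, 25]])
gts[depths < 5] = MISSING_INT
print(gts)  # calls with depth < 5 become [-1, -1]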
def __call__(self, variations):
    gts = variations[GT_FIELD][:]
    mat_to_check = variations[self.field_path]
    if is_dataset(variations[GT_FIELD]):
        mat_to_check = mat_to_check[:]
    gts[mat_to_check < self.min] = MISSING_INT

    ignore_fields_to_copy = [GT_FIELD]
    if self.query_field_to_missing:
        mat_to_check[mat_to_check < self.min] = MISSING_INT
        ignore_fields_to_copy.append(self.field_path)

    result = {}
    if self.do_filtering:
        copied_vars = variations.get_chunk(
            slice(None, None), ignored_fields=ignore_fields_to_copy)
        copied_vars[GT_FIELD] = gts
        if self.query_field_to_missing:
            copied_vars[self.field_path] = mat_to_check
        result[FLT_VARS] = copied_vars

    if self.do_histogram:
        counts, edges = histogram(mat_to_check, n_bins=self.n_bins,
                                  range_=self.range)
        result[COUNTS] = counts
        result[EDGES] = edges
    return result
def _calc_stat(self, variations):
    vars_for_stat = self._filter_samples_for_stats(variations)
    assert len(vars_for_stat.samples) == self.sample_dp_means.shape[0]

    dps = vars_for_stat[DP_FIELD]
    if dps.shape[0] == 0:
        # No SNPs
        raise ValueError('No SNPs to filter')
    if is_dataset(dps):
        dps = dps[:]
    num_no_miss_calls = numpy.sum(dps > 0, axis=1)

    high_dp_calls = dps > self._too_high_dps

    het_calls = call_is_het(vars_for_stat[GT_FIELD])
    het_and_high_dp_calls = numpy.logical_and(high_dp_calls, het_calls)
    num_high_dp_and_het_calls = numpy.sum(het_and_high_dp_calls, axis=1)

    with numpy.errstate(all='ignore'):
        # This is the stat
        freq_high_dp_and_het_calls = (num_high_dp_and_het_calls /
                                      num_no_miss_calls)
    return freq_high_dp_and_het_calls
def __call__(self, variations):
    vars_for_stat = self._filter_samples_for_stats(variations)
    assert len(vars_for_stat.samples) == self.sample_dp_means.shape[0]

    dps = vars_for_stat[DP_FIELD]
    if is_dataset(dps):
        dps = dps[:]
    num_no_miss_calls = numpy.sum(dps > 0, axis=1)

    high_dp_calls = dps > self._too_high_dps
    num_high_dp_calls = numpy.sum(high_dp_calls, axis=1)

    with numpy.errstate(all='ignore'):
        # This is the stat
        freq_high_dp = num_high_dp_calls / num_no_miss_calls

    result = {}
    if self.do_histogram:
        counts, edges = histogram(freq_high_dp, n_bins=self.n_bins,
                                  range_=self.range)
        result[COUNTS] = counts
        result[EDGES] = edges

    if self.do_filtering or self.report_selection:
        het_call = call_is_het(vars_for_stat[GT_FIELD])
        with numpy.errstate(all='ignore'):
            obs_het = numpy.sum(het_call, axis=1) / num_no_miss_calls
        with numpy.errstate(all='ignore'):
            too_much_het = numpy.greater(obs_het, self.max_obs_het)
        with numpy.errstate(all='ignore'):
            snps_too_high = numpy.greater(freq_high_dp,
                                          self.max_high_dp_freq)
        to_remove = numpy.logical_and(too_much_het, snps_too_high)
        selected_snps = numpy.logical_not(to_remove)

        if self.report_selection:
            result[SELECTED_VARS] = selected_snps

        if self.do_filtering:
            flt_vars = variations.get_chunk(selected_snps)
            n_kept = numpy.count_nonzero(selected_snps)
            tot = selected_snps.shape[0]
            n_filtered_out = tot - n_kept
            result[FLT_VARS] = flt_vars
            result[FLT_STATS] = {N_KEPT: n_kept,
                                 N_FILTERED_OUT: n_filtered_out,
                                 TOT: tot}
    return result
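# Toy illustration of the selection rule above, with made-up frequencies:
# a SNP is removed only when it is both too heterozygous and has too many
# unusually deep calls; either condition alone keeps the SNP.
import numpy

obs_het = numpy.array([0.9, 0.9, 0.1, 0.5])
freq_high_dp = numpy.array([0.8, 0.1, 0.8, 0.2])
max_obs_het, max_high_dp_freq = 0.7, 0.5
to_remove = numpy.logical_and(obs_het > max_obs_het,
                              freq_high_dp > max_high_dp_freq)
print(numpy.logical_not(to_remove))  # -> [False  True  True  True]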
def _calc_sort_order_by_chrom(variations):
    chrom = variations['/variations/chrom']
    if is_dataset(chrom):
        chrom = chrom[:]
    pos = variations['/variations/pos']
    chrom_names = numpy.sort(numpy.unique(chrom))
    for chrom_name in chrom_names:
        mask = chrom == chrom_name
        snps_in_chrom_idx = numpy.where(mask)[0]
        pos_chrom = pos[mask]
        sorted_idx = numpy.lexsort((pos_chrom,), axis=0)
        sorted_snps_in_chrom_idx = snps_in_chrom_idx[sorted_idx]
        yield sorted_snps_in_chrom_idx
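# Minimal sketch of the per-chromosome ordering above, with made-up data:
# chromosomes are visited in sorted order, and positions are sorted within
# each chromosome.
import numpy

chrom = numpy.array([b'chr2', b'chr1', b'chr1', b'chr2'])
pos = numpy.array([500, 300, 100, 200])
sorted_chunks = []
for chrom_name in numpy.sort(numpy.unique(chrom)):
    idx = numpy.where(chrom == chrom_name)[0]
    sorted_chunks.append(idx[numpy.lexsort((pos[idx],))])
print(numpy.concatenate(sorted_chunks))  # -> [2 1 3 0]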
def _iterate_vars(variations):
    kept_fields = [CHROM_FIELD, POS_FIELD]
    optional_fields = [REF_FIELD, ALT_FIELD, GT_FIELD, QUAL_FIELD, DP_FIELD]
    for field in optional_fields:
        if field in variations.keys():
            kept_fields.append(field)

    for chunk in variations.iterate_chunks(kept_fields=kept_fields):
        chunk_keys = chunk.keys()
        vars_chrom = chunk[CHROM_FIELD]
        vars_pos = chunk[POS_FIELD]
        vars_ref = chunk[REF_FIELD] if REF_FIELD in chunk_keys else None
        vars_alt = chunk[ALT_FIELD] if ALT_FIELD in chunk_keys else None
        vars_qual = chunk[QUAL_FIELD] if QUAL_FIELD in chunk_keys else None
        vars_dp = chunk[DP_FIELD] if DP_FIELD in chunk_keys else None
        vars_gts = chunk[GT_FIELD] if GT_FIELD in chunk_keys else None

        if is_dataset(vars_chrom):
            vars_chrom = vars_chrom[:]
            vars_pos = vars_pos[:]
            if vars_ref is not None:
                vars_ref = vars_ref[:]
            if vars_alt is not None:
                vars_alt = vars_alt[:]
            if vars_qual is not None:
                vars_qual = vars_qual[:]
            if vars_dp is not None:
                vars_dp = vars_dp[:]
            if vars_gts is not None:
                vars_gts = vars_gts[:]

        for var_idx in range(chunk.num_variations):
            chrom = vars_chrom[var_idx]
            pos = vars_pos[var_idx]
            ref = None if vars_ref is None else vars_ref[var_idx]
            if vars_alt is None:
                alts = None
            else:
                alts = vars_alt[var_idx]
                alts = [alt for alt in alts if alt != MISSING_BYTE]
                if not alts:
                    alts = None
            qual = None if vars_qual is None else vars_qual[var_idx]
            gts = None if vars_gts is None else vars_gts[var_idx]
            var_ = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alts,
                    'qual': qual, 'gts': gts}
            if vars_dp is not None:
                var_['dp'] = vars_dp[var_idx]
            yield var_
def gts_as_mat012(self):
    '''It transforms the GT matrix into 0 (major allele homo), 1 (het),
    2 (other homo)'''
    gts = self[GT_FIELD]
    counts = counts_by_row(gts, missing_value=MISSING_INT)
    if counts is None:
        return numpy.full((gts.shape[0], gts.shape[1]),
                          fill_value=MISSING_INT)
    major_alleles = numpy.argmax(counts, axis=1)
    if is_dataset(gts):
        gts = gts[:]
    gts012 = numpy.sum(gts != major_alleles[:, None, None], axis=2)
    gts012[numpy.any(gts == MISSING_INT, axis=2)] = MISSING_INT
    return gts012
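# Toy walk-through of the gts_as_mat012 transform above, with made-up
# genotypes: for each call, count how many alleles differ from that SNP's
# major allele; calls with any missing allele stay missing.
import numpy

MISSING_INT = -1  # assumed missing-genotype sentinel
gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, -1]]])  # 1 SNP, 4 samples
major_alleles = numpy.array([0])  # allele 0 is the most frequent here
gts012 = numpy.sum(gts != major_alleles[:, None, None], axis=2)
gts012[numpy.any(gts == MISSING_INT, axis=2)] = MISSING_INT
print(gts012)  # -> [[ 0  1  2 -1]]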
def _create_matrix_from_matrix(self, path, matrix):
    result = _dset_metadata_from_matrix(matrix)
    shape, dtype, chunks, maxshape, fillvalue = result
    try:
        dset = self._create_matrix(path, shape=shape, dtype=dtype,
                                   chunks=chunks, maxshape=maxshape,
                                   fillvalue=fillvalue)
        new_matrix = dset
    except TypeError:
        array = self._create_matrix(path, shape=shape, dtype=dtype,
                                    fillvalue=fillvalue)
        new_matrix = array
    if is_dataset(matrix):
        array = matrix[:]
    else:
        array = matrix
    new_matrix[:] = array
    return new_matrix
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        is_bool = numpy.issubdtype(dtype, numpy.dtype(bool))
    except AttributeError:
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        sample_cols = [idx in sample_cols for idx in range(len(samples))]
    if 'shape' not in dir(sample_cols):
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            flt_data = matrix[:, sample_cols]
            filtered_vars[path] = flt_data
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    kept_samples = [samples[idx] for idx, keep in enumerate(sample_cols)
                    if keep]
    filtered_vars.samples = kept_samples
    return filtered_vars
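# Sketch of the column selection above, with a made-up genotype matrix:
# 'calls' matrices keep samples along axis 1, so a boolean mask on that
# axis drops the unwanted samples from every per-call field.
import numpy

gts = numpy.arange(12).reshape(2, 3, 2)  # 2 SNPs, 3 samples, ploidy 2
sample_cols = numpy.array([True, False, True])
print(gts[:, sample_cols].shape)  # -> (2, 2, 2)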
def _load_matrix(variations, path):
    matrix = variations[path]
    if is_dataset(matrix):
        matrix = matrix[:]
    return matrix