Example #1
    def _select_mono(self, chunk):
        keep_monomorphic = self.keep_monomorphic

        gts = chunk[GT_FIELD]
        if is_dataset(gts):
            gts = gts[:]

        shape = gts.shape

        # count how many different alleles there are per row by adding
        # a row-specific imaginary part to each genotype; numpy.unique
        # then only collapses equal values that belong to the same row
        weight = 1j * numpy.arange(0, shape[0])
        weight = numpy.repeat(weight, shape[1] * shape[2]).reshape(shape)
        b = gts + weight
        _, ind = numpy.unique(b, return_index=True)
        b = numpy.zeros_like(gts)
        c = numpy.ones_like(gts)
        numpy.put(b, ind, c.flat[ind])
        c = numpy.sum(b, axis=(2, 1))

        # rows containing a missing call (-1) counted it as one extra
        # allele, so subtract one from those rows
        rows_with_missing = numpy.any(gts == -1, axis=(1, 2))
        c -= rows_with_missing

        if keep_monomorphic:
            selected_rows = (c <= 2)
        else:
            selected_rows = (c == 2)
        return selected_rows
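
The imaginary-weight trick above deserves a standalone look: adding a row-specific imaginary part to every element makes equal values from different rows distinct, so a single numpy.unique call counts distinct values per row without a Python loop. A minimal sketch (illustrative only, not part of the original code):

import numpy

gts = numpy.array([[[0, 0], [0, 0]],    # row 0: one distinct allele
                   [[0, 1], [1, 1]]])   # row 1: two distinct alleles
shape = gts.shape

# give each row its own imaginary offset so unique() cannot merge
# equal values that live in different rows
weight = 1j * numpy.arange(shape[0])
weight = numpy.repeat(weight, shape[1] * shape[2]).reshape(shape)
_, first_idx = numpy.unique(gts + weight, return_index=True)

# mark the first occurrence of each (row, value) pair, then sum per row
marks = numpy.zeros(shape, dtype=int)
marks.flat[first_idx] = 1
print(marks.sum(axis=(1, 2)))  # -> [1 2]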
Example #2
def _row_value_counter(mat, value, ratio=False, by_chunk=False):
    ndims = len(mat.shape)
    if ndims == 1:
        raise ValueError('The matrix has to have at least 2 dimensions')
    elif ndims == 2:
        axes = 1
    else:
        axes = tuple(range(1, ndims))

    if by_chunk:
        chunks = iterate_matrix_chunks(mat)
        result = numpy.zeros(mat.shape[0])
        start = 0
        for chunk in chunks:
            chunk_result = _row_value_counter_array(chunk, value, axes)
            end = start + chunk_result.shape[0]
            result[start:end] = chunk_result
            start = end
    else:
        if is_dataset(mat):
            mat = mat[...]
        result = _row_value_counter_array(mat, value, axes)

    if ratio:
        num_items_per_row = reduce(operator.mul, mat.shape[1:], 1)
        result = result / num_items_per_row
    return result
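
Stripped of the chunking and dataset handling, _row_value_counter is a per-row count of matching cells over all trailing axes. A self-contained sketch of that core (hypothetical helper name, plain numpy only):

import numpy

def count_value_per_row(mat, value, ratio=False):
    # collapse every axis except the first: one count per row
    axes = tuple(range(1, mat.ndim))
    counts = (mat == value).sum(axis=axes)
    if ratio:
        counts = counts / numpy.prod(mat.shape[1:])
    return counts

gts = numpy.array([[[0, 0], [0, -1]],
                   [[1, -1], [-1, -1]]])
print(count_value_per_row(gts, -1))              # -> [1 3]
print(count_value_per_row(gts, -1, ratio=True))  # -> [0.25 0.75]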
Example #3
    def _calc_stat(self, variations):
        stat = variations['/variations/qual']
        if is_dataset(stat):
            stat = stat[:]
        if numpy.issubdtype(stat.dtype, numpy.floating):
            # cap infinities at the largest finite value for this dtype
            stat[numpy.isinf(stat)] = numpy.finfo(stat.dtype).max
        return stat
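
Clamping infinities to the dtype's largest finite value keeps later histograms and comparisons well defined. The idiom in isolation:

import numpy

qual = numpy.array([20.0, numpy.inf, 35.5])
qual[numpy.isinf(qual)] = numpy.finfo(qual.dtype).max
print(qual)  # inf replaced by ~1.8e308, the float64 maximum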
Example #4
    def __call__(self, variations):

        gts = variations[GT_FIELD][:]
        mat_to_check = variations[self.field_path]
        if is_dataset(mat_to_check):
            mat_to_check = mat_to_check[:]

        # set every call whose checked value falls below the threshold
        # to missing
        gts[mat_to_check < self.min] = MISSING_INT

        result = {}
        if self.do_filtering:
            copied_vars = variations.get_chunk(slice(None, None),
                                               ignored_fields=[GT_FIELD])
            copied_vars[GT_FIELD] = gts

            result[FLT_VARS] = copied_vars

        if self.do_histogram:
            counts, edges = histogram(mat_to_check, n_bins=self.n_bins,
                                      range_=self.range)
            result[COUNTS] = counts
            result[EDGES] = edges

        return result
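
The core of the filter is plain boolean masking: wherever the checked matrix falls below the threshold, the corresponding genotype call becomes missing. A toy illustration (the shapes and the -1 missing code follow the examples above; the arrays are made up):

import numpy

MISSING_INT = -1
gts = numpy.array([[[0, 1], [1, 1]],     # 2 SNPs x 2 samples x ploidy 2
                   [[0, 0], [0, 1]]])
depths = numpy.array([[9, 30],           # one value per call
                      [50, 4]])

# a (2, 2) mask indexes the (2, 2, 2) GT matrix per SNP and sample
gts[depths < 10] = MISSING_INT
print(gts[0, 0], gts[1, 1])  # -> [-1 -1] [-1 -1]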
Example #7
    def __call__(self, variations):

        gts = variations[GT_FIELD][:]
        mat_to_check = variations[self.field_path]

        if is_dataset(mat_to_check):
            mat_to_check = mat_to_check[:]

        gts[mat_to_check < self.min] = MISSING_INT

        ignore_fields_to_copy = [GT_FIELD]
        if self.query_field_to_missing:
            mat_to_check[mat_to_check < self.min] = MISSING_INT
            ignore_fields_to_copy.append(self.field_path)

        result = {}
        if self.do_filtering:
            copied_vars = variations.get_chunk(
                slice(None, None), ignored_fields=ignore_fields_to_copy)
            copied_vars[GT_FIELD] = gts
            if self.query_field_to_missing:
                copied_vars[self.field_path] = mat_to_check

            result[FLT_VARS] = copied_vars

        if self.do_histogram:
            counts, edges = histogram(mat_to_check,
                                      n_bins=self.n_bins,
                                      range_=self.range)
            result[COUNTS] = counts
            result[EDGES] = edges

        return result
Example #9
    def _calc_stat(self, variations):

        vars_for_stat = self._filter_samples_for_stats(variations)

        assert len(vars_for_stat.samples) == self.sample_dp_means.shape[0]

        dps = vars_for_stat[DP_FIELD]
        if dps.shape[0] == 0:
            # No SNPs
            raise ValueError('No SNPs to filter')
        if is_dataset(dps):
            dps = dps[:]

        num_no_miss_calls = numpy.sum(dps > 0, axis=1)

        high_dp_calls = dps > self._too_high_dps
        het_calls = call_is_het(vars_for_stat[GT_FIELD])
        het_and_high_dp_calls = numpy.logical_and(high_dp_calls, het_calls)

        num_high_dp_and_het_calls = numpy.sum(het_and_high_dp_calls, axis=1)

        with numpy.errstate(all='ignore'):
            # This is the stat
            freq_high_dp_and_het_calls = (num_high_dp_and_het_calls /
                                          num_no_miss_calls)

        return freq_high_dp_and_het_calls
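
The statistic is a ratio of two per-SNP counts, and numpy.errstate hides the 0/0 warning for SNPs without any called genotype. On a toy depth matrix:

import numpy

dps = numpy.array([[12, 0, 7],    # depth per call, one row per SNP
                   [0, 0, 0]])
num_no_miss = numpy.sum(dps > 0, axis=1)   # -> [2 0]
num_high = numpy.sum(dps > 10, axis=1)     # -> [1 0]

with numpy.errstate(all='ignore'):
    freq = num_high / num_no_miss
print(freq)  # -> [0.5 nan]; the SNP with no calls yields nan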
Example #10
    def __call__(self, variations):

        vars_for_stat = self._filter_samples_for_stats(variations)

        assert len(vars_for_stat.samples) == self.sample_dp_means.shape[0]

        dps = vars_for_stat[DP_FIELD]
        if is_dataset(dps):
            dps = dps[:]
        num_no_miss_calls = numpy.sum(dps > 0, axis=1)

        high_dp_calls = dps > self._too_high_dps

        num_high_dp_calls = numpy.sum(high_dp_calls, axis=1)

        with numpy.errstate(all='ignore'):
            # This is the stat
            freq_high_dp = num_high_dp_calls / num_no_miss_calls

        result = {}

        if self.do_histogram:
            counts, edges = histogram(freq_high_dp,
                                      n_bins=self.n_bins,
                                      range_=self.range)
            result[COUNTS] = counts
            result[EDGES] = edges

        if self.do_filtering or self.report_selection:
            het_call = call_is_het(vars_for_stat[GT_FIELD])
            with numpy.errstate(all='ignore'):
                obs_het = numpy.sum(het_call, axis=1) / num_no_miss_calls
                too_much_het = numpy.greater(obs_het, self.max_obs_het)
                snps_too_high = numpy.greater(freq_high_dp,
                                              self.max_high_dp_freq)
            to_remove = numpy.logical_and(too_much_het, snps_too_high)
            selected_snps = numpy.logical_not(to_remove)

        if self.report_selection:
            result[SELECTED_VARS] = selected_snps

        if self.do_filtering:
            flt_vars = variations.get_chunk(selected_snps)

            n_kept = numpy.count_nonzero(selected_snps)
            tot = selected_snps.shape[0]
            n_filtered_out = tot - n_kept

            result[FLT_VARS] = flt_vars
            result[FLT_STATS] = {
                N_KEPT: n_kept,
                N_FILTERED_OUT: n_filtered_out,
                TOT: tot
            }

        return result
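
The selection logic combines the two per-SNP conditions: a SNP is removed only when it is both too heterozygous and too rich in high-depth calls. In isolation:

import numpy

too_much_het = numpy.array([True, True, False])
snps_too_high = numpy.array([True, False, False])
to_remove = numpy.logical_and(too_much_het, snps_too_high)
selected = numpy.logical_not(to_remove)
print(selected)  # -> [False  True  True]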
Example #12
def _iterate_vars(variations):
    kept_fields = [CHROM_FIELD, POS_FIELD]

    optional_fields = [REF_FIELD, ALT_FIELD, GT_FIELD, QUAL_FIELD]
    for field in optional_fields:
        if field in variations.keys():
            kept_fields.append(field)

    for chunk in variations.iterate_chunks(kept_fields=kept_fields):
        chunk_keys = chunk.keys()
        vars_chrom = chunk[CHROM_FIELD]
        vars_pos = chunk[POS_FIELD]
        vars_ref = chunk[REF_FIELD] if REF_FIELD in chunk_keys else None
        vars_alt = chunk[ALT_FIELD] if ALT_FIELD in chunk_keys else None
        vars_qual = chunk[QUAL_FIELD] if QUAL_FIELD in chunk_keys else None
        vars_gts = chunk[GT_FIELD] if GT_FIELD in chunk_keys else None

        if is_dataset(vars_chrom):
            vars_chrom = vars_chrom[:]
            vars_pos = vars_pos[:]
            if vars_ref is not None:
                vars_ref = vars_ref[:]
            if vars_alt is not None:
                vars_alt = vars_alt[:]
            if vars_qual is not None:
                vars_qual = vars_qual[:]
            if vars_gts is not None:
                vars_gts = vars_gts[:]

        for var_idx in range(chunk.num_variations):
            chrom = vars_chrom[var_idx]
            pos = vars_pos[var_idx]
            ref = None if vars_ref is None else vars_ref[var_idx]
            if vars_alt is None:
                alts = None
            else:
                alts = vars_alt[var_idx]
                alts = [alt for alt in alts if alt != MISSING_BYTE]
                if not alts:
                    alts = None
            qual = None if vars_qual is None else vars_qual[var_idx]
            gts = None if vars_gts is None else vars_gts[var_idx]

            var_ = {
                'chrom': chrom,
                'pos': pos,
                'ref': ref,
                'alt': alts,
                'qual': qual,
                'gts': gts
            }
            yield var_
Example #13
def _calc_sort_order_by_chrom(variations):
    chrom = variations['/variations/chrom']
    if is_dataset(chrom):
        chrom = chrom[:]
    pos = variations['/variations/pos']
    chrom_names = numpy.sort(numpy.unique(chrom))
    for chrom_name in chrom_names:
        mask = chrom == chrom_name
        snps_in_chrom_idx = numpy.where(mask)[0]
        pos_chrom = pos[mask]
        sorted_idx = numpy.lexsort((pos_chrom,), axis=0)
        sorted_snps_in_chrom_idx = snps_in_chrom_idx[sorted_idx]
        yield sorted_snps_in_chrom_idx
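
Sorting happens per chromosome: chromosomes are visited in sorted order, and within each one the variant indices are reordered by position. For a single key, numpy.lexsort behaves like numpy.argsort, so the idiom reduces to:

import numpy

chrom = numpy.array([b'2', b'1', b'1', b'2'])
pos = numpy.array([300, 40, 10, 5])

order = []
for chrom_name in numpy.sort(numpy.unique(chrom)):
    idx = numpy.where(chrom == chrom_name)[0]
    order.append(idx[numpy.argsort(pos[idx])])
print(numpy.concatenate(order))  # -> [2 1 3 0]: chrom 1 by pos, then chrom 2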
Example #15
def _iterate_vars(variations):
    kept_fields = [CHROM_FIELD, POS_FIELD]

    optional_fields = [REF_FIELD, ALT_FIELD, GT_FIELD, QUAL_FIELD, DP_FIELD]
    for field in optional_fields:
        if field in variations.keys():
            kept_fields.append(field)

    for chunk in variations.iterate_chunks(kept_fields=kept_fields):
        chunk_keys = chunk.keys()
        vars_chrom = chunk[CHROM_FIELD]
        vars_pos = chunk[POS_FIELD]
        vars_ref = chunk[REF_FIELD] if REF_FIELD in chunk_keys else None
        vars_alt = chunk[ALT_FIELD] if ALT_FIELD in chunk_keys else None
        vars_qual = chunk[QUAL_FIELD] if QUAL_FIELD in chunk_keys else None
        vars_dp = chunk[DP_FIELD] if DP_FIELD in chunk_keys else None
        vars_gts = chunk[GT_FIELD] if GT_FIELD in chunk_keys else None

        if is_dataset(vars_chrom):
            vars_chrom = vars_chrom[:]
            vars_pos = vars_pos[:]
            if vars_ref is not None:
                vars_ref = vars_ref[:]
            if vars_alt is not None:
                vars_alt = vars_alt[:]
            if vars_qual is not None:
                vars_qual = vars_qual[:]
            if vars_dp is not None:
                vars_dp = vars_dp[:]
            if vars_gts is not None:
                vars_gts = vars_gts[:]

        for var_idx in range(chunk.num_variations):
            chrom = vars_chrom[var_idx]
            pos = vars_pos[var_idx]
            ref = None if vars_ref is None else vars_ref[var_idx]
            if vars_alt is None:
                alts = None
            else:
                alts = vars_alt[var_idx]
                alts = [alt for alt in alts if alt != MISSING_BYTE]
                if not alts:
                    alts = None
            qual = None if vars_qual is None else vars_qual[var_idx]
            gts = None if vars_gts is None else vars_gts[var_idx]

            var_ = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alts,
                    'qual': qual, 'gts': gts}
            if vars_dp is not None:
                var_['dp'] = vars_dp[var_idx]
            yield var_
Example #16
    def gts_as_mat012(self):
        '''It transforms the GT matrix into 0 (major allele homozygote),
        1 (het), 2 (other homozygote)'''
        gts = self[GT_FIELD]
        counts = counts_by_row(gts, missing_value=MISSING_INT)
        if counts is None:
            return numpy.full((gts.shape[0], gts.shape[1]),
                              fill_value=MISSING_INT)

        major_alleles = numpy.argmax(counts, axis=1)
        if is_dataset(gts):
            gts = gts[:]
        gts012 = numpy.sum(gts != major_alleles[:, None, None], axis=2)
        gts012[numpy.any(gts == MISSING_INT, axis=2)] = MISSING_INT
        return gts012
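
The broadcasting in gts != major_alleles[:, None, None] compares each call's alleles against that row's major allele; summing over the ploidy axis yields 0 (both alleles major), 1 (het) or 2 (no major allele). A toy run:

import numpy

MISSING_INT = -1
gts = numpy.array([[[0, 0], [0, 1], [1, 1], [-1, 0]]])  # 1 SNP x 4 samples
major_alleles = numpy.array([0])   # allele 0 is the most frequent

gts012 = numpy.sum(gts != major_alleles[:, None, None], axis=2)
gts012[numpy.any(gts == MISSING_INT, axis=2)] = MISSING_INT
print(gts012)  # -> [[ 0  1  2 -1]]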
Example #18
    def _create_matrix_from_matrix(self, path, matrix):

        result = _dset_metadata_from_matrix(matrix)
        shape, dtype, chunks, maxshape, fillvalue = result
        try:
            dset = self._create_matrix(path, shape=shape,
                                       dtype=dtype,
                                       chunks=chunks,
                                       maxshape=maxshape,
                                       fillvalue=fillvalue)
            new_matrix = dset
        except TypeError:
            # the in-memory backend rejects the HDF5-only arguments
            # (chunks, maxshape), so retry without them
            array = self._create_matrix(path, shape=shape, dtype=dtype,
                                        fillvalue=fillvalue)
            new_matrix = array
        if is_dataset(matrix):
            array = matrix[:]
        else:
            array = matrix
        new_matrix[:] = array
        return new_matrix
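
The try/except accommodates two backends: an HDF5-backed store accepts chunking arguments, while an in-memory store raises TypeError on them. A rough sketch of the HDF5 side using h5py directly (the backend classes themselves are not part of this excerpt):

import h5py
import numpy

gts = numpy.zeros((100, 10, 2), dtype='int8')
with h5py.File('vars.h5', 'w') as h5:
    dset = h5.create_dataset('/calls/GT', shape=gts.shape,
                             dtype=gts.dtype, chunks=(50, 10, 2),
                             maxshape=(None, 10, 2), fillvalue=-1)
    dset[:] = gts   # same bulk copy as new_matrix[:] = array above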
Example #20
def _filter_samples_by_index(variations,
                             sample_cols,
                             filtered_vars=None,
                             reverse=False):
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        is_bool = numpy.issubdtype(dtype, numpy.dtype(bool))
    except AttributeError:
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if not hasattr(sample_cols, 'shape'):
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            flt_data = matrix[:, sample_cols]
            filtered_vars[path] = flt_data
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    kept_samples = [
        samples[idx] for idx, keep in enumerate(sample_cols) if keep
    ]
    filtered_vars.samples = kept_samples
    return filtered_vars
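
Whatever the caller passes (sample indices or booleans), the function normalizes sample_cols to a boolean mask so that matrix[:, sample_cols] selects columns in every '/calls/...' matrix. The normalization and selection on their own:

import numpy

samples = ['s0', 's1', 's2', 's3']
sample_cols = [1, 3]   # indices in, boolean mask out
mask = numpy.array([idx in sample_cols for idx in range(len(samples))])

calls = numpy.arange(8).reshape(2, 4)   # 2 SNPs x 4 samples
print(calls[:, mask])                   # -> [[1 3] [5 7]]
print([s for s, keep in zip(samples, mask) if keep])  # -> ['s1', 's3']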
Example #24
def _load_matrix(variations, path):
    matrix = variations[path]
    if is_dataset(matrix):
        matrix = matrix[:]
    return matrix
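
Every example above relies on the same helper: is_dataset decides whether a matrix still lives on disk and must be loaded with [:] before fancy numpy indexing. The helper is not shown in these excerpts; assuming an h5py-backed store, a minimal version could look like this (a sketch, not necessarily the project's actual implementation):

import h5py

def is_dataset(mat):
    # h5py datasets are on disk; slicing with [:] copies them into memory
    return isinstance(mat, h5py.Dataset)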