Ejemplo n.º 1
0
    def _calc_stat(self, variations):
        """Return the frequency of calls that are both high-depth and het.

        The samples are first narrowed with
        ``self._filter_samples_for_stats``.  Along axis 1 of the depth
        matrix, the number of calls whose depth exceeds
        ``self._too_high_dps`` and that ``call_is_het`` flags as
        heterozygous is divided by the number of calls with depth > 0.

        Raises:
            ValueError: if the depth matrix has no rows.
        """
        stat_vars = self._filter_samples_for_stats(variations)

        # One entry of sample_dp_means is expected per retained sample.
        assert len(stat_vars.samples) == self.sample_dp_means.shape[0]

        depths = stat_vars[DP_FIELD]
        if depths.shape[0] == 0:
            # No SNPs
            raise ValueError('No SNPs to filter')
        if is_dataset(depths):
            # Materialise the dataset with a full slice before doing the
            # element-wise comparisons below.
            depths = depths[:]

        called_per_row = numpy.sum(depths > 0, axis=1)

        too_deep = depths > self._too_high_dps
        is_het = call_is_het(stat_vars[GT_FIELD])
        deep_het_per_row = numpy.sum(numpy.logical_and(too_deep, is_het),
                                     axis=1)

        # Rows with no call of depth > 0 divide by zero; the numpy warning
        # is silenced on purpose.
        with numpy.errstate(all='ignore'):
            # This is the stat
            return deep_het_per_row / called_per_row
Ejemplo n.º 2
0
def _calc_sample_missing_rates(variations, chunk_size, min_called_rate,
                               max_het):

    if chunk_size is None:
        chunks = [variations]
    else:
        chunks = variations.iterate_chunks(kept_fields=[GT_FIELD],
                                           chunk_size=chunk_size)
    missing = None
    het_counts = None
    for chunk in chunks:
        chunk_missing = calc_called_gt(chunk, rates=False, axis=0)
        if min_called_rate is not None:
            if missing is None:
                missing = chunk_missing
            else:
                missing += chunk_missing

        if max_het is not None:
            is_het = call_is_het(chunk[GT_FIELD])
            chunk_het_counts = numpy.sum(is_het, axis=0)
            if het_counts is None:
                het_counts = chunk_het_counts
            else:
                het_counts += chunk_het_counts

    res = {}
    if min_called_rate is not None:
        rates = missing / variations.num_variations
        res['missing_rates'] = rates
    if max_het is not None:
        obs_hets = het_counts / variations.num_variations
        res['obs_hets'] = obs_hets

    return res
Ejemplo n.º 3
0
    def __call__(self, variations):
        """Compute the high-depth call frequency and optionally filter on it.

        The depth matrix of the samples retained by
        ``self._filter_samples_for_stats`` is compared against
        ``self._too_high_dps``.  Along axis 1, the number of calls above
        that threshold is divided by the number of calls with depth > 0.

        Depending on the instance flags the returned dict holds:

        * ``COUNTS`` / ``EDGES``: histogram of that frequency
          (``self.do_histogram``).
        * ``SELECTED_VARS``: boolean mask of the entries that are kept
          (``self.report_selection``).
        * ``FLT_VARS`` / ``FLT_STATS``: the chunk of ``variations`` picked
          by the mask plus the kept / filtered-out / total counts
          (``self.do_filtering``).

        An entry is removed only when BOTH its observed heterozygosity is
        greater than ``self.max_obs_het`` and its high-depth frequency is
        greater than ``self.max_high_dp_freq``.
        """
        stat_vars = self._filter_samples_for_stats(variations)

        # One entry of sample_dp_means is expected per retained sample.
        assert len(stat_vars.samples) == self.sample_dp_means.shape[0]

        depths = stat_vars[DP_FIELD]
        if is_dataset(depths):
            depths = depths[:]

        called_per_row = numpy.sum(depths > 0, axis=1)
        deep_per_row = numpy.sum(depths > self._too_high_dps, axis=1)

        # Rows with no call of depth > 0 divide by zero; the warning is
        # silenced on purpose.
        with numpy.errstate(all='ignore'):
            # This is the stat
            freq_high_dp = deep_per_row / called_per_row

        result = {}

        if self.do_histogram:
            hist_counts, bin_edges = histogram(freq_high_dp,
                                               n_bins=self.n_bins,
                                               range_=self.range)
            result[COUNTS] = hist_counts
            result[EDGES] = bin_edges

        if self.do_filtering or self.report_selection:
            is_het = call_is_het(stat_vars[GT_FIELD])
            with numpy.errstate(all='ignore'):
                obs_het = numpy.sum(is_het, axis=1) / called_per_row
                het_above_max = numpy.greater(obs_het, self.max_obs_het)
                freq_above_max = numpy.greater(freq_high_dp,
                                               self.max_high_dp_freq)
            # De Morgan: keep an entry unless both thresholds are exceeded.
            # The "greater" masks are negated (instead of testing "<=") so
            # that NaN values still end up selected.
            selected_snps = numpy.logical_or(
                numpy.logical_not(het_above_max),
                numpy.logical_not(freq_above_max))

        if self.report_selection:
            result[SELECTED_VARS] = selected_snps

        if self.do_filtering:
            result[FLT_VARS] = variations.get_chunk(selected_snps)

            total = selected_snps.shape[0]
            kept = numpy.count_nonzero(selected_snps)
            result[FLT_STATS] = {
                N_KEPT: kept,
                N_FILTERED_OUT: total - kept,
                TOT: total,
            }

        return result
Ejemplo n.º 4
0
    def __call__(self, variations):
        """Run the high-depth / heterozygosity filter on ``variations``.

        Steps:

        1. Narrow the samples with ``self._filter_samples_for_stats`` and
           read their depth matrix.
        2. Along axis 1, divide the count of calls whose depth exceeds
           ``self._too_high_dps`` by the count of calls with depth > 0.
        3. If ``self.do_histogram``, store the histogram of that frequency
           under ``COUNTS`` and ``EDGES``.
        4. If ``self.do_filtering`` or ``self.report_selection``, build a
           boolean mask that rejects an entry only when its observed
           heterozygosity is greater than ``self.max_obs_het`` AND its
           frequency is greater than ``self.max_high_dp_freq``.
        5. Report the mask under ``SELECTED_VARS`` and/or the filtered
           chunk and its counts under ``FLT_VARS`` / ``FLT_STATS``.

        Returns:
            dict with the entries enabled by the instance flags.
        """
        kept_sample_vars = self._filter_samples_for_stats(variations)

        # One entry of sample_dp_means is expected per retained sample.
        n_samples = len(kept_sample_vars.samples)
        assert n_samples == self.sample_dp_means.shape[0]

        dp_matrix = kept_sample_vars[DP_FIELD]
        if is_dataset(dp_matrix):
            dp_matrix = dp_matrix[:]

        has_depth = dp_matrix > 0
        n_called = numpy.sum(has_depth, axis=1)

        over_limit = dp_matrix > self._too_high_dps
        n_over_limit = numpy.sum(over_limit, axis=1)

        # Division by zero (no call with depth > 0) is tolerated silently.
        with numpy.errstate(all='ignore'):
            # This is the stat
            freq_high_dp = n_over_limit / n_called

        result = {}

        if self.do_histogram:
            histo = histogram(freq_high_dp, n_bins=self.n_bins,
                              range_=self.range)
            counts, edges = histo
            result[COUNTS] = counts
            result[EDGES] = edges

        if self.do_filtering or self.report_selection:
            het_mask = call_is_het(kept_sample_vars[GT_FIELD])
            with numpy.errstate(all='ignore'):
                n_het = numpy.sum(het_mask, axis=1)
                obs_het = n_het / n_called
                het_too_high = numpy.greater(obs_het, self.max_obs_het)
                freq_too_high = numpy.greater(freq_high_dp,
                                              self.max_high_dp_freq)
            # Only entries failing both criteria are discarded.
            rejected = numpy.logical_and(het_too_high, freq_too_high)
            selected_snps = numpy.logical_not(rejected)

        if self.report_selection:
            result[SELECTED_VARS] = selected_snps

        if self.do_filtering:
            flt_vars = variations.get_chunk(selected_snps)

            tot = selected_snps.shape[0]
            n_kept = numpy.count_nonzero(selected_snps)

            flt_stats = {N_KEPT: n_kept,
                         N_FILTERED_OUT: tot - n_kept,
                         TOT: tot}
            result[FLT_VARS] = flt_vars
            result[FLT_STATS] = flt_stats

        return result
Ejemplo n.º 5
0
    def _calc_stat(self, variations):
        """Compute the frequency of heterozygous calls with too high depth.

        After narrowing the samples with
        ``self._filter_samples_for_stats``, a call counts when its depth is
        above ``self._too_high_dps`` and ``call_is_het`` marks it as
        heterozygous.  The count along axis 1 is divided by the number of
        calls with depth > 0 on the same axis.

        Returns:
            The resulting frequencies, one value per entry along axis 0.
        """
        kept_sample_vars = self._filter_samples_for_stats(variations)

        # One entry of sample_dp_means is expected per retained sample.
        n_samples = len(kept_sample_vars.samples)
        assert n_samples == self.sample_dp_means.shape[0]

        dp_matrix = kept_sample_vars[DP_FIELD]
        if is_dataset(dp_matrix):
            # Pull the dataset in with a full slice before comparing.
            dp_matrix = dp_matrix[:]

        n_called = numpy.sum(dp_matrix > 0, axis=1)

        over_limit = dp_matrix > self._too_high_dps
        het_mask = call_is_het(kept_sample_vars[GT_FIELD])
        over_limit_and_het = numpy.logical_and(over_limit, het_mask)
        n_over_limit_and_het = numpy.sum(over_limit_and_het, axis=1)

        # Division by zero (no call with depth > 0) is tolerated silently.
        with numpy.errstate(all='ignore'):
            # This is the stat
            stat = n_over_limit_and_het / n_called

        return stat