Ejemplo n.º 1
0
def summarize_variations(in_zarr_path,
                         out_dir_path,
                         draw_missin_rate=True,
                         draw_mac=True,
                         draw_maf=True,
                         draw_obs_het=True,
                         min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                         min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                         num_bins=DEF_NUM_BINS,
                         silence_runtime_warnings=True):
    """Plot histograms of per-variation statistics and write a text header.

    The variations are loaded from ``in_zarr_path`` with ``load_zarr``. For
    every enabled statistic a histogram is built with ``va.histogram``; all
    the histograms are then resolved with a single ``compute`` call and each
    one is plotted (log scale) into ``<out_dir_path>/<statistic>.png``.
    Finally ``<out_dir_path>/stats.txt`` is written with the dataset name and
    the number of variations and samples.

    Args:
        in_zarr_path: Location of the zarr store; its ``.name`` attribute is
            used in the report (presumably a ``pathlib.Path`` -- confirm
            against callers).
        out_dir_path: Output directory; must support ``/`` joining and
            ``.open()`` on the result.
        draw_missin_rate: Include the called-genotype rate histogram
            (stored under the ``'called'`` key).
        draw_mac: Include the ``calc_mac`` histogram (``'mac'``).
        draw_maf: Include the ``calc_maf_by_gt`` histogram (``'maf'``).
        draw_obs_het: Include the ``calc_obs_het`` histogram
            (``'obs_heterocigosity'``).
        min_call_dp_for_het_call: Forwarded to ``calc_obs_het``.
        min_num_genotypes: Forwarded to the MAC, MAF and observed
            heterozygosity calculations.
        num_bins: Number of bins used for every histogram.
        silence_runtime_warnings: Forwarded to ``compute``.
    """
    variations = load_zarr(in_zarr_path)
    max_alleles = variations[ALT_FIELD].shape[1]
    num_variations = variations.num_variations
    num_samples = variations.num_samples

    def _histogram(values, upper_limit):
        # Every statistic is binned over [0, upper_limit] with num_bins bins.
        counts, edges = va.histogram(values,
                                     n_bins=num_bins,
                                     limits=(0, upper_limit))
        return {'counts': counts, 'edges': edges}

    # Insertion order matters: it is the order in which plots are written.
    pending = {}

    if draw_missin_rate:
        called_rates = calc_called_gt(variations, rates=True)
        pending['called'] = _histogram(called_rates, 1)

    if draw_mac:
        macs = calc_mac(variations, max_alleles, min_num_genotypes)
        pending['mac'] = _histogram(macs, num_samples)

    if draw_maf:
        mafs = calc_maf_by_gt(variations, max_alleles, min_num_genotypes)
        pending['maf'] = _histogram(mafs, 1)

    if draw_obs_het:
        obs_hets = calc_obs_het(
            variations,
            min_num_genotypes=min_num_genotypes,
            min_call_dp_for_het_call=min_call_dp_for_het_call)
        pending['obs_heterocigosity'] = _histogram(obs_hets, 1)

    computed = compute(pending,
                       silence_runtime_warnings=silence_runtime_warnings)

    for kind, histogram in computed.items():
        plot_path = out_dir_path / f'{kind}.png'
        with plot_path.open('wb') as out_fhand:
            plot_histogram(histogram['counts'],
                           histogram['edges'],
                           out_fhand,
                           log_scale=True)

    dataset_name = in_zarr_path.name
    report_lines = [
        f'STATS FOR: {dataset_name}\n',
        # Underline as long as the title line above.
        '-----------' + '-' * len(dataset_name) + '\n',
        f'Num. variations: {num_variations}\n',
        f'Num. samples: {num_samples}\n',
        '\n',
    ]
    with (out_dir_path / 'stats.txt').open('w') as fhand:
        fhand.writelines(report_lines)
Ejemplo n.º 2
0
def remove_low_call_rate_samples(variations,
                                 min_call_rate,
                                 rates=True,
                                 filter_id='sample_call_rate',
                                 calc_histogram=False,
                                 n_bins=DEF_NUM_BINS,
                                 limits=None):
    """Drop the samples whose amount of called genotypes is too low.

    Args:
        variations: Variations object to filter.
        min_call_rate: Minimum value a sample must reach to be kept. It is a
            rate when ``rates`` is true and a genotype count otherwise.
        rates: Whether the missing genotypes are requested from
            ``calc_missing_gt_per_sample`` as rates or as counts.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        calc_histogram: If true, a histogram of the per-sample called values
            is added to the filter stats.
        n_bins: Number of histogram bins.
        limits: Optional ``(min, max)`` range for the histogram. When it is
            ``None`` the range is ``(0, 1)`` for rates and
            ``(0, variations.num_variations)`` for counts.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the filter stats (``FLT_STATS``).
    """
    num_missing_gts = calc_missing_gt_per_sample(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
    else:
        num_called = utils_array.get_shape_item(variations.gt,
                                                0) - num_missing_gts

    selected_samples = num_called >= min_call_rate
    filtered_variations = keep_samples_with_mask(variations,
                                                 selected_samples)[FLT_VARS]

    num_selected_samples = va.count_nonzero(selected_samples)
    num_filtered_samples = va.count_nonzero(va.logical_not(selected_samples))

    flt_stats = {
        N_SAMPLES_KEPT: num_selected_samples,
        N_SAMPLES_FILTERED_OUT: num_filtered_samples
    }

    if calc_histogram:
        if limits is None:
            # num_variations is already a count, so it must not be wrapped
            # in len(); the previous len() call raised TypeError here.
            limits = (0, 1) if rates else (0, variations.num_variations)
        counts, bin_edges = va.histogram(num_called,
                                         n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats[HIST_RANGE] = [min_call_rate]
    return {
        FLT_VARS: filtered_variations,
        FLT_ID: filter_id,
        FLT_STATS: flt_stats
    }
Ejemplo n.º 3
0
def filter_by_mac(variations,
                  max_alleles,
                  max_allowable_mac=None,
                  min_allowable_mac=None,
                  filter_id='filter_by_mac',
                  min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                  calc_histogram=False,
                  n_bins=DEF_NUM_BINS,
                  limits=None):
    """Select variations using the value returned by ``calc_mac``.

    Args:
        variations: Variations object to filter.
        max_alleles: Forwarded to ``calc_mac``.
        max_allowable_mac: Upper threshold handed to ``_select_vars``;
            ``None`` means no upper threshold is recorded.
        min_allowable_mac: Lower threshold handed to ``_select_vars``;
            ``None`` means no lower threshold is recorded.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        min_num_genotypes: Forwarded to ``calc_mac``.
        calc_histogram: If true, a histogram of the MAC values and the
            thresholds in use are added to the filter stats.
        n_bins: Number of histogram bins.
        limits: Optional histogram range; defaults to
            ``(0, variations.num_samples)``.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the filter stats (``FLT_STATS``).
    """
    macs = calc_mac(variations,
                    max_alleles=max_alleles,
                    min_num_genotypes=min_num_genotypes)

    selection = _select_vars(variations, macs, min_allowable_mac,
                             max_allowable_mac)
    flt_stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, variations.num_samples) if limits is None else limits
        counts, bin_edges = va.histogram(macs,
                                         n_bins=n_bins,
                                         limits=hist_range)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        # Only the thresholds that were actually given, lower one first.
        flt_stats['limits'] = [
            threshold
            for threshold in (min_allowable_mac, max_allowable_mac)
            if threshold is not None
        ]

    return {
        FLT_VARS: selection[FLT_VARS],
        FLT_ID: filter_id,
        FLT_STATS: flt_stats
    }
Ejemplo n.º 4
0
def remove_low_call_rate_vars(variations,
                              min_call_rate,
                              rates=True,
                              filter_id='call_rate',
                              calc_histogram=False,
                              n_bins=DEF_NUM_BINS,
                              limits=None):
    """Drop the variations whose amount of called genotypes is too low.

    Args:
        variations: Variations object to filter.
        min_call_rate: Minimum value a variation must reach to be kept. It
            is a rate when ``rates`` is true and a genotype count otherwise.
        rates: Whether the missing genotypes are requested from
            ``calc_missing_gt`` as rates or as counts.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        calc_histogram: If true, a histogram of the per-variation called
            values is added to the filter stats.
        n_bins: Number of histogram bins.
        limits: Optional ``(min, max)`` range for the histogram. When it is
            ``None`` the range is ``(0, 1)`` for rates and
            ``(0, variations.num_samples)`` for counts.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the filter stats (``FLT_STATS``).
    """
    num_missing_gts = calc_missing_gt(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
    else:
        num_called = variations.gt.shape[1] - num_missing_gts

    selected_vars = num_called >= min_call_rate
    filtered_variations = variations.get_vars(selected_vars)

    num_selected_vars = va.count_nonzero(selected_vars)
    num_filtered = va.count_nonzero(va.logical_not(selected_vars))

    flt_stats = {N_KEPT: num_selected_vars, N_FILTERED_OUT: num_filtered}

    if calc_histogram:
        if limits is None:
            # num_samples is already a count, so it must not be wrapped in
            # len(); the previous len() call raised TypeError here.
            limits = (0, 1) if rates else (0, variations.num_samples)
        counts, bin_edges = va.histogram(num_called,
                                         n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats['limits'] = [min_call_rate]

    return {
        FLT_VARS: filtered_variations,
        FLT_ID: filter_id,
        FLT_STATS: flt_stats
    }
Ejemplo n.º 5
0
    def test_calc_maf_by_gt2(self):
        """Check the 5-bin histogram of the MAFs of the test zarr dataset."""
        variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)

        counts, edges = va.histogram(mafs, n_bins=5, limits=(0, 1))
        pending = {'counts': counts, 'edges': edges}
        computed = compute(pending, silence_runtime_warnings=True)

        expected_counts = [0, 0, 4, 2, 0]
        expected_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
        self.assertTrue(np.all(computed['counts'] == expected_counts))
        # Bin edges are floats, so they are compared with a tolerance.
        self.assertTrue(
            np.all(np.isclose(computed['edges'], expected_edges)))
Ejemplo n.º 6
0
def filter_by_obs_heterocigosis(
        variations,
        max_allowable_het=None,
        min_allowable_het=None,
        min_call_dp_for_het_call=None,
        max_call_dp_for_het_call=None,
        filter_id='obs_het',
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        calc_histogram=False,
        n_bins=DEF_NUM_BINS,
        limits=None):
    """Select variations using the value returned by ``calc_obs_het``.

    Args:
        variations: Variations object to filter.
        max_allowable_het: Upper threshold handed to ``_select_vars``;
            ``None`` means no upper threshold is recorded.
        min_allowable_het: Lower threshold handed to ``_select_vars``;
            ``None`` means no lower threshold is recorded.
        min_call_dp_for_het_call: Forwarded to ``calc_obs_het``.
        max_call_dp_for_het_call: Forwarded to ``calc_obs_het``.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        min_num_genotypes: Forwarded to ``calc_obs_het``.
        calc_histogram: If true, a histogram of the computed values and the
            thresholds in use are added to the filter stats.
        n_bins: Number of histogram bins.
        limits: Optional histogram range; defaults to ``(0, 1)``.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the filter stats (``FLT_STATS``).
    """
    obs_het = calc_obs_het(variations,
                           min_num_genotypes=min_num_genotypes,
                           min_call_dp_for_het_call=min_call_dp_for_het_call,
                           max_call_dp_for_het_call=max_call_dp_for_het_call)

    selection = _select_vars(variations,
                             obs_het,
                             min_allowable=min_allowable_het,
                             max_allowable=max_allowable_het)
    flt_stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, 1) if limits is None else limits
        counts, bin_edges = va.histogram(obs_het,
                                         n_bins=n_bins,
                                         limits=hist_range)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        # Only the thresholds that were actually given, lower one first.
        flt_stats['limits'] = [
            threshold
            for threshold in (min_allowable_het, max_allowable_het)
            if threshold is not None
        ]

    return {
        FLT_VARS: selection[FLT_VARS],
        FLT_ID: filter_id,
        FLT_STATS: flt_stats
    }