def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its Manhattan plot.

    Two PNG files are written into data_dir: 'snps_density.png' (histogram
    of the densities, log-scaled y axis) and 'snps_density_manhattan.png'
    (density along the genome). When write_bg is true the densities are
    also written to 'snp_density.bg' as a bedgraph track.

    Args:
        variations: variations container handed to calc_snp_density and
            _load_matrix.
        window_size: window, in bp, handed to calc_snp_density and used in
            the plot labels.
        data_dir: directory in which the output files are created.
        write_bg: whether the bedgraph file is also written.
    """
    import numpy

    # The densities are consumed three times below (histogram, Manhattan
    # plot and bedgraph). They are materialised once so that a lazily
    # produced sequence is not exhausted after its first use.
    density = numpy.array(list(calc_snp_density(variations, window_size)))

    # Density distribution
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    distrib_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # PNG is binary data, so the handle is opened in binary mode, and the
    # with block guarantees that it is closed (and flushed) after plotting.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=distrib_params)

    # Manhattan plot for SNP density
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    title = 'SNP density along the genome'
    ylabel = 'SNP per {} bp'.format(window_size)
    manhattan_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': [ylabel], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=manhattan_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format (plain text, hence text mode)
    if write_bg:
        description = 'SNP number in {} bp around'.format(window_size)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens = PositionalStatsCalculator(chrom, pos, density)
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def filter_variation_density(in_vars, max_density, window, out_vars=None,
                             chunk_size=SNPS_PER_CHUNK, n_bins=DEF_NUM_BINS,
                             range_=None, do_histogram=None):
    """Filter variations by SNP density and/or histogram that density.

    The density of every variation is obtained from calc_snp_density.
    Variations whose density is lower than or equal to max_density are
    kept.

    Args:
        in_vars: variations to process.
        max_density: highest density a variation may have to be kept.
        window: window handed to calc_snp_density.
        out_vars: container that receives the kept variations through
            put_chunks. When None no filtering is done.
        chunk_size: number of variations per chunk. When None all of
            in_vars is processed as a single chunk.
        n_bins: number of bins of the histogram.
        range_: range of the histogram. When None and a histogram is
            required it is computed with _calc_range_for_var_density.
        do_histogram: whether a histogram is computed; the final value is
            resolved by _check_if_histogram_is_required.

    Returns:
        The result of _get_result_if_empty_vars when it is not None.
        Otherwise a dict with FLT_STATS (N_KEPT, N_FILTERED_OUT and TOT
        counts) when filtering was done, and EDGES and COUNTS when the
        histogram was computed.

    Raises:
        RuntimeError: if the histogram bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # Hand the precomputed result back instead of discarding it.
        return res

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        # NOTE(review): chunk_size may be None here; confirm that
        # group_in_packets (used by the range helper) copes with it.
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    # iter() makes sure that every islice below continues where the
    # previous chunk stopped, even if a re-iterable sequence is returned.
    stats = iter(calc_snp_density(in_vars, window))
    edges, counts = None, None

    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])
            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk,
                                                n_bins=n_bins, range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                counts += this_counts
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)

    res = {}
    if do_filtering:
        # Derived once from the totals: accumulating tot - n_kept on every
        # chunk would count the earlier chunks again.
        res[FLT_STATS] = {N_KEPT: n_kept,
                          N_FILTERED_OUT: tot - n_kept,
                          TOT: tot}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    return res
def _calc_range_for_var_density(variations, window, chunk_size):
    """Return the (lowest, highest) SNP density found in the variations.

    The densities given by calc_snp_density are traversed in the groups
    that group_in_packets builds from them using chunk_size. If no group
    is produced the result is (None, None).
    """
    lowest = highest = None
    densities = calc_snp_density(variations, window)
    for packet in group_in_packets(densities, chunk_size):
        # Pack the group into an unsigned int array before scanning it
        packet = array.array('I', packet)

        packet_low = min(packet)
        if lowest is None or packet_low < lowest:
            lowest = packet_low

        packet_high = max(packet)
        if highest is None or packet_high > highest:
            highest = packet_high
    return lowest, highest
def test_calc_snp_density(self):
    """Check calc_snp_density on HDF5, in-memory and dict-based inputs."""
    # The HDF5 file and its in-memory copy must give the same densities
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks())
    density_h5 = list(calc_snp_density(hdf5, 1000))
    density_array = list(calc_snp_density(snps, 1000))
    assert density_array == density_h5

    def density_for(chroms, positions):
        # Densities, with an 11 bp window, for a dict-based set of SNPs
        variations = {'/variations/chrom': numpy.array(chroms),
                      '/variations/pos': numpy.array(positions)}
        return list(calc_snp_density(variations, 11))

    positions = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]

    # All the SNPs in one chromosome
    dens_var = density_for(['ch'] * 14, positions)
    assert dens_var == [6, 7, 7, 7, 7, 7, 6, 1, 1, 1, 1, 1, 2, 2]

    # Three chromosomes: windows do not span chromosome boundaries
    chroms = ['ch'] * 3 + ['c2'] * 10 + ['c3']
    dens_var = density_for(chroms, positions)
    assert dens_var == [3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]

    # Several chromosomes holding just one SNP each
    chroms = ['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3']
    dens_var = density_for(chroms, positions)
    assert dens_var == [1, 1, 1, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]

    # No SNPs at all
    assert density_for([], []) == []

    # Just one SNP
    assert density_for([1], [1]) == [1]
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and its Manhattan plot.

    Two PNG files are written into data_dir: 'snps_density.png' (histogram
    of the densities, log-scaled y axis) and 'snps_density_manhattan.png'
    (density along the genome). When write_bg is true the densities are
    also written to 'snp_density.bg' as a bedgraph track.

    Args:
        variations: variations container handed to calc_snp_density and
            _load_matrix.
        window_size: window, in bp, handed to calc_snp_density and used in
            the plot labels.
        data_dir: directory in which the output files are created.
        write_bg: whether the bedgraph file is also written.
    """
    import numpy

    # The densities are consumed three times below (histogram, Manhattan
    # plot and bedgraph). They are materialised once so that a lazily
    # produced sequence is not exhausted after its first use.
    density = numpy.array(list(calc_snp_density(variations, window_size)))

    # Density distribution
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    distrib_params = {
        'set_xlabel': {'args': ['SNP density'], 'kwargs': {}},
        'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
        'set_yscale': {'args': ['log'], 'kwargs': {}},
    }
    # PNG is binary data, so the handle is opened in binary mode, and the
    # with block guarantees that it is closed (and flushed) after plotting.
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=distrib_params)

    # Manhattan plot for SNP density
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    title = 'SNP density along the genome'
    ylabel = 'SNP per {} bp'.format(window_size)
    manhattan_params = {
        'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
        'set_ylabel': {'args': [ylabel], 'kwargs': {}},
        'set_title': {'args': [title], 'kwargs': {}},
    }
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=manhattan_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format (plain text, hence text mode)
    if write_bg:
        description = 'SNP number in {} bp around'.format(window_size)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens = PositionalStatsCalculator(chrom, pos, density)
            pos_dens.write(bg_fhand, 'snp_density', description,
                           track_type='bedgraph')
def filter_variation_density(in_vars, max_density, window, out_vars=None,
                             chunk_size=SNPS_PER_CHUNK, n_bins=DEF_NUM_BINS,
                             range_=None, do_histogram=None):
    """Filter variations by SNP density and/or histogram that density.

    The density of every variation is obtained from calc_snp_density.
    Variations whose density is lower than or equal to max_density are
    kept.

    Args:
        in_vars: variations to process.
        max_density: highest density a variation may have to be kept.
        window: window handed to calc_snp_density.
        out_vars: container that receives the kept variations through
            put_chunks. When None no filtering is done.
        chunk_size: number of variations per chunk. When None all of
            in_vars is processed as a single chunk.
        n_bins: number of bins of the histogram.
        range_: range of the histogram. When None and a histogram is
            required it is computed with _calc_range_for_var_density.
        do_histogram: whether a histogram is computed; the final value is
            resolved by _check_if_histogram_is_required.

    Returns:
        The result of _get_result_if_empty_vars when it is not None.
        Otherwise a dict with FLT_STATS (N_KEPT, N_FILTERED_OUT and TOT
        counts) when filtering was done, and EDGES and COUNTS when the
        histogram was computed.

    Raises:
        RuntimeError: if the histogram bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # Hand the precomputed result back instead of discarding it.
        return res

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        # NOTE(review): chunk_size may be None here; confirm that
        # group_in_packets (used by the range helper) copes with it.
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    # iter() makes sure that every islice below continues where the
    # previous chunk stopped, even if a re-iterable sequence is returned.
    stats = iter(calc_snp_density(in_vars, window))
    edges, counts = None, None

    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])
            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk,
                                                n_bins=n_bins, range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                counts += this_counts
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)

    res = {}
    if do_filtering:
        # Derived once from the totals: accumulating tot - n_kept on every
        # chunk would count the earlier chunks again.
        res[FLT_STATS] = {N_KEPT: n_kept,
                          N_FILTERED_OUT: tot - n_kept,
                          TOT: tot}
    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts
    return res