Esempio n. 1
0
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density distribution and a Manhattan plot of the density.

    Two PNG files are written into ``data_dir``: 'snps_density.png' (histogram
    of the densities in 20 bins, log-scaled y axis) and
    'snps_density_manhattan.png'. When ``write_bg`` is true the densities are
    also written to 'snp_density.bg' as a bedgraph track.

    Args:
        variations: object handed to calc_snp_density and _load_matrix.
        window_size: window size, in bp, passed to calc_snp_density.
        data_dir: directory in which the output files are created.
        write_bg: when True, also write the bedgraph file.
    """
    def _label(text):
        # mpl_params entry: positional args plus (empty) keyword args
        return {'args': [text], 'kwargs': {}}

    # Calculate and plot variations density distribution
    # NOTE(review): if calc_snp_density returns a one-shot iterator,
    # histogram() would exhaust it before the Manhattan plot and the bedgraph
    # writer get to use it -- confirm and materialise it if needed.
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    distrib_params = {'set_xlabel': _label('SNP density'),
                      'set_ylabel': _label('SNP number'),
                      'set_title': _label(title),
                      'set_yscale': _label('log')}
    # PNG data is binary, so the handle is opened in 'wb'; the with-block
    # guarantees it is closed (the original leaked it).
    with open(join(data_dir, 'snps_density.png'), 'wb') as fhand:
        plot_distrib(density_distrib, bins, fhand=fhand, color='c',
                     mpl_params=distrib_params)

    # Manhattan plot for SNP density
    title = 'SNP density along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    manhattan_params = {
        'set_xlabel': _label('Chromosome'),
        'set_ylabel': _label('SNP per {} bp'.format(window_size)),
        'set_title': _label(title)}
    with open(join(data_dir, 'snps_density_manhattan.png'), 'wb') as fhand:
        manhattan_plot(chrom, pos, density, mpl_params=manhattan_params,
                       fhand=fhand, figsize=(15, 7.5), ylim=1)

    # Save in bedgraph format (a text format, hence mode 'w')
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand, 'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
Esempio n. 2
0
def filter_variation_density(in_vars, max_density, window, out_vars=None,
                             chunk_size=SNPS_PER_CHUNK, n_bins=DEF_NUM_BINS,
                             range_=None, do_histogram=None):
    """Filter variations by SNP density and/or histogram the densities.

    Variations whose density (as yielded by calc_snp_density for ``window``)
    is less than or equal to ``max_density`` are copied into ``out_vars``.

    Args:
        in_vars: input variations; must provide ``iterate_chunks``,
            ``get_chunk`` and ``num_variations``.
        max_density: highest density a variation may have to be kept.
        window: window size passed to calc_snp_density.
        out_vars: destination for the kept variations. When None no filtering
            is done and no filter stats are returned.
        chunk_size: number of variations per chunk; None processes ``in_vars``
            as a single chunk.
        n_bins: number of histogram bins.
        range_: histogram range; computed from the data when None and a
            histogram is required.
        do_histogram: passed to _check_if_histogram_is_required together with
            ``n_bins`` and ``range_`` to decide whether to build a histogram.

    Returns:
        A dict holding FLT_STATS (N_KEPT, N_FILTERED_OUT, TOT) when filtering
        was done and EDGES / COUNTS when a histogram was built. For empty
        input, whatever _get_result_if_empty_vars produced.

    Raises:
        RuntimeError: if the histogram bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # Return the result prepared for empty input instead of discarding it
        return res

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    stats = calc_snp_density(in_vars, window)
    edges, counts = None, None

    # The original had these two branches swapped.
    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        # Take from the density stream as many values as the chunk has rows
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])

            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk, n_bins=n_bins,
                                                range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                # Validate the bins before accumulating into them
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)
                counts += this_counts

    res = {}
    if do_filtering:
        # n_kept and tot are running totals, so the number filtered out is
        # derived once here; adding their difference on every chunk (as the
        # original did) over-counts when there is more than one chunk.
        res[FLT_STATS] = {N_KEPT: n_kept, N_FILTERED_OUT: tot - n_kept,
                          TOT: tot}

    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts

    return res
Esempio n. 3
0
def _calc_range_for_var_density(variations, window, chunk_size):
    """Return the (min, max) SNP density found in ``variations``.

    The densities yielded by calc_snp_density are consumed packet by packet
    through group_in_packets, so the whole series is not loaded at once.
    Returns (None, None) when no packet is produced.
    """
    lowest = highest = None
    densities = calc_snp_density(variations, window)
    for packet in group_in_packets(densities, chunk_size):
        # Unsigned-int array: compact and rejects anything that is not a
        # non-negative integer
        packet = array.array('I', packet)
        packet_low = min(packet)
        packet_high = max(packet)
        if lowest is None or packet_low < lowest:
            lowest = packet_low
        if highest is None or packet_high > highest:
            highest = packet_high
    return lowest, highest
Esempio n. 4
0
def _calc_range_for_var_density(variations, window, chunk_size):
    """Compute the smallest and largest SNP density over all variations.

    Densities come from calc_snp_density and are scanned in packets produced
    by group_in_packets. The result is a (min, max) tuple, or (None, None) if
    there were no packets to scan.
    """
    overall_min, overall_max = None, None
    density_stream = calc_snp_density(variations, window)
    for values in group_in_packets(density_stream, chunk_size):
        # Pack the packet into an unsigned-int array before scanning it
        values = array.array('I', values)
        smallest, largest = min(values), max(values)
        if overall_min is None or smallest < overall_min:
            overall_min = smallest
        if overall_max is None or largest > overall_max:
            overall_max = largest
    return overall_min, overall_max
Esempio n. 5
0
    def test_calc_snp_density(self):
        """Check calc_snp_density on HDF5, in-memory and plain-dict inputs."""
        # The HDF5 file and an in-memory copy of it must give equal densities
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks())
        from_h5 = list(calc_snp_density(h5_vars, 1000))
        from_mem = list(calc_snp_density(mem_vars, 1000))
        assert from_mem == from_h5

        positions = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]
        # Each case: (chromosomes, positions, expected densities, window 11)
        cases = [
            # a single chromosome
            (['ch'] * 14, positions,
             [6, 7, 7, 7, 7, 7, 6, 1, 1, 1, 1, 1, 2, 2]),
            # three chromosomes
            (['ch'] * 3 + ['c2'] * 10 + ['c3'], positions,
             [3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]),
            # several chromosomes holding just one variation each
            (['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3'], positions,
             [1, 1, 1, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]),
            # no variations at all
            ([], [], []),
            # a single variation
            ([1], [1], [1]),
        ]
        for chroms, poss, expected in cases:
            var = {'/variations/chrom': numpy.array(chroms),
                   '/variations/pos': numpy.array(poss)}
            dens_var = list(calc_snp_density(var, 11))
            assert dens_var == expected
    def test_calc_snp_density(self):
        """Verify calc_snp_density for file-backed, array and dict inputs."""
        def density_of(chroms, poss):
            # Densities for a plain dict of chrom/pos arrays, window of 11
            var = {'/variations/chrom': numpy.array(chroms),
                   '/variations/pos': numpy.array(poss)}
            return list(calc_snp_density(var, 11))

        # Densities read from HDF5 and from an in-memory copy must agree
        on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(on_disk.iterate_chunks())
        disk_density = list(calc_snp_density(on_disk, 1000))
        memory_density = list(calc_snp_density(in_memory, 1000))
        assert memory_density == disk_density

        pos = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]

        # Every variation on the same chromosome
        one_chrom = ['ch'] * 14
        assert density_of(one_chrom, pos) == [6, 7, 7, 7, 7, 7, 6,
                                              1, 1, 1, 1, 1, 2, 2]

        # Variations spread over three chromosomes
        three_chroms = ['ch'] * 3 + ['c2'] * 10 + ['c3']
        assert density_of(three_chroms, pos) == [3, 3, 3, 4, 4, 4, 4,
                                                 1, 1, 1, 1, 1, 1, 1]

        # Chromosomes that hold a single variation each
        singletons = ['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3']
        assert density_of(singletons, pos) == [1, 1, 1, 4, 4, 4, 4,
                                               1, 1, 1, 1, 1, 1, 1]

        # Empty input yields no densities
        assert density_of([], []) == []

        # A lone variation counts only itself
        assert density_of([1], [1]) == [1]
Esempio n. 7
0
def plot_snp_dens_distrib(variations, window_size, data_dir, write_bg=False):
    """Plot the SNP density histogram and its Manhattan plot.

    Writes 'snps_density.png' (20-bin histogram of the densities with a
    log-scaled y axis) and 'snps_density_manhattan.png' into ``data_dir``.
    If ``write_bg`` is true, 'snp_density.bg' is written as well, as a
    bedgraph track.

    Args:
        variations: object handed to calc_snp_density and _load_matrix.
        window_size: window size, in bp, passed to calc_snp_density.
        data_dir: directory in which the output files are created.
        write_bg: when True, also write the bedgraph file.
    """
    # Calculate and plot variations density distribution
    # NOTE(review): if calc_snp_density returns a one-shot iterator,
    # histogram() would exhaust it before the Manhattan plot and the bedgraph
    # writer get to use it -- confirm and materialise it if needed.
    density = calc_snp_density(variations, window_size)
    density_distrib, bins = histogram(density, 20)
    fpath = join(data_dir, 'snps_density.png')
    title = 'SNP density distribution per {} bp windows'.format(window_size)
    # PNG data is binary, so open in 'wb'; the with-block closes the handle,
    # which the original never did.
    with open(fpath, 'wb') as fhand:
        plot_distrib(density_distrib,
                     bins,
                     fhand=fhand,
                     color='c',
                     mpl_params={
                         'set_xlabel': {
                             'args': ['SNP density'],
                             'kwargs': {}
                         },
                         'set_ylabel': {
                             'args': ['SNP number'],
                             'kwargs': {}
                         },
                         'set_title': {
                             'args': [title],
                             'kwargs': {}
                         },
                         'set_yscale': {
                             'args': ['log'],
                             'kwargs': {}
                         }
                     })

    # Manhattan plot for SNP density
    fpath = join(data_dir, 'snps_density_manhattan.png')
    title = 'SNP density along the genome'
    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    with open(fpath, 'wb') as fhand:
        manhattan_plot(chrom,
                       pos,
                       density,
                       mpl_params={
                           'set_xlabel': {
                               'args': ['Chromosome'],
                               'kwargs': {}
                           },
                           'set_ylabel': {
                               'args': ['SNP per {} bp'.format(window_size)],
                               'kwargs': {}
                           },
                           'set_title': {
                               'args': [title],
                               'kwargs': {}
                           }
                       },
                       fhand=fhand,
                       figsize=(15, 7.5),
                       ylim=1)

    # Save in bedgraph format (a text format, hence mode 'w')
    if write_bg:
        pos_dens = PositionalStatsCalculator(chrom, pos, density)
        with open(join(data_dir, 'snp_density.bg'), 'w') as bg_fhand:
            pos_dens.write(bg_fhand,
                           'snp_density',
                           'SNP number in {} bp around'.format(window_size),
                           track_type='bedgraph')
Esempio n. 8
0
def filter_variation_density(in_vars,
                             max_density,
                             window,
                             out_vars=None,
                             chunk_size=SNPS_PER_CHUNK,
                             n_bins=DEF_NUM_BINS,
                             range_=None,
                             do_histogram=None):
    """Keep variations with low enough SNP density; optionally histogram it.

    A variation is kept (written to ``out_vars``) when the density yielded
    for it by calc_snp_density with ``window`` is at most ``max_density``.

    Args:
        in_vars: input variations; must provide ``iterate_chunks``,
            ``get_chunk`` and ``num_variations``.
        max_density: highest density a variation may have to be kept.
        window: window size passed to calc_snp_density.
        out_vars: destination for the kept variations. When None no filtering
            is done and no filter stats are returned.
        chunk_size: number of variations per chunk; None processes ``in_vars``
            as a single chunk.
        n_bins: number of histogram bins.
        range_: histogram range; computed from the data when None and a
            histogram is required.
        do_histogram: passed to _check_if_histogram_is_required together with
            ``n_bins`` and ``range_`` to decide whether to build a histogram.

    Returns:
        A dict holding FLT_STATS (N_KEPT, N_FILTERED_OUT, TOT) when filtering
        was done and EDGES / COUNTS when a histogram was built. For empty
        input, whatever _get_result_if_empty_vars produced.

    Raises:
        RuntimeError: if the histogram bin edges differ between chunks.
    """
    do_histogram = _check_if_histogram_is_required(do_histogram, n_bins,
                                                   range_)

    res = _get_result_if_empty_vars(in_vars, do_histogram)
    if res is not None:
        # Return the result prepared for empty input instead of discarding it
        return res

    do_filtering = out_vars is not None

    if do_histogram and range_ is None:
        range_ = _calc_range_for_var_density(in_vars, window, chunk_size)

    stats = calc_snp_density(in_vars, window)
    edges, counts = None, None

    # The original had these two branches swapped.
    if chunk_size is None:
        chunks = [in_vars]
    else:
        chunks = in_vars.iterate_chunks(chunk_size=chunk_size)

    n_kept, tot = 0, 0
    for chunk in chunks:
        # Take from the density stream as many values as the chunk has rows
        stats_for_chunk = itertools.islice(stats, chunk.num_variations)
        stats_for_chunk = numpy.array(array.array('I', stats_for_chunk))

        if do_filtering:
            selected_rows = stats_for_chunk <= max_density
            out_vars.put_chunks([chunk.get_chunk(selected_rows)])

            n_kept += numpy.count_nonzero(selected_rows)
            tot += selected_rows.shape[0]

        if do_histogram:
            this_counts, this_edges = histogram(stats_for_chunk,
                                                n_bins=n_bins,
                                                range_=range_)
            if edges is None:
                edges = this_edges
                counts = this_counts
            else:
                # Validate the bins before accumulating into them
                if not numpy.allclose(edges, this_edges):
                    msg = 'Bin edges do not match in a chunk iteration'
                    raise RuntimeError(msg)
                counts += this_counts

    res = {}
    if do_filtering:
        # n_kept and tot are running totals, so the number filtered out is
        # derived once here; adding their difference on every chunk (as the
        # original did) over-counts when there is more than one chunk.
        res[FLT_STATS] = {
            N_KEPT: n_kept,
            N_FILTERED_OUT: tot - n_kept,
            TOT: tot
        }

    if do_histogram:
        res[EDGES] = edges
        res[COUNTS] = counts

    return res