Example #1
0
 def test_ld_random_pairs_from_different_chroms(self):
     """Sample 100 random inter-chromosome SNP pairs and expect 100 LD results.

     The SNPs come from the 5000-15000 slice of the tomato HDF5 fixture and
     are filtered so that every remaining SNP has a MAF below 0.95.
     """
     h5_path = join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.h5')
     h5_vars = VariationsH5(h5_path, mode='r')
     snps = h5_vars.get_chunk(slice(5000, 15000))

     # SNPs whose MAF is NaN get a MAF of 1 so that the "< 0.95" filter
     # below discards them together with the high-MAF ones.
     maf_values = calc_maf(snps, min_num_genotypes=10, chunk_size=None)
     maf_values[numpy.isnan(maf_values)] = 1
     snps = snps.get_chunk(maf_values < 0.95)

     ld_results = list(calc_ld_random_pairs_from_different_chroms(snps, 100))
     assert len(ld_results) == 100
    def test_calc_maf_distrib(self):
        """Check the 10-bin MAF histogram for in-memory and HDF5 variations."""
        # Both data sets are expected to produce the same bin edges.
        expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                          0.95, 1.]

        # Small hand-built genotype array held in memory.
        genotypes = numpy.array([[[0], [0], [0], [0]],
                                 [[0], [0], [1], [1]],
                                 [[0], [0], [0], [1]],
                                 [[-1], [-1], [-1], [-1]]])
        in_memory = VariationsArrays()
        in_memory['/calls/GT'] = genotypes
        counts, edges = histogram(calc_maf(in_memory, min_num_genotypes=1),
                                  n_bins=10)
        assert numpy.allclose(edges, expected_edges)
        assert numpy.allclose(counts, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

        # Same check against the ril.hdf5 fixture.
        from_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        counts, edges = histogram(calc_maf(from_h5, min_num_genotypes=1),
                                  n_bins=10)
        assert numpy.allclose(edges, expected_edges)
        assert numpy.allclose(counts,
                              [53, 75, 74, 70, 69, 129, 73, 74, 49, 277])
Example #3
0
    def test_calc_maf_distrib(self):
        """Verify the MAF histogram (10 bins) on a toy array and on ril.hdf5."""
        # The expected bin edges are shared by both checks below.
        edges_wanted = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                        0.95, 1.]

        # Toy genotypes stored in an in-memory container.
        toy_gts = numpy.array([[[0], [0], [0], [0]],
                               [[0], [0], [1], [1]],
                               [[0], [0], [0], [1]],
                               [[-1], [-1], [-1], [-1]]])
        toy_vars = VariationsArrays()
        toy_vars['/calls/GT'] = toy_gts
        toy_mafs = calc_maf(toy_vars, min_num_genotypes=1)
        freq, edges = histogram(toy_mafs, n_bins=10)
        assert numpy.allclose(edges, edges_wanted)
        assert numpy.allclose(freq, [1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

        # HDF5 fixture.
        ril_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        ril_mafs = calc_maf(ril_vars, min_num_genotypes=1)
        freq, edges = histogram(ril_mafs, n_bins=10)
        assert numpy.allclose(edges, edges_wanted)
        assert numpy.allclose(freq,
                              [53, 72, 77, 66, 73, 129, 74, 73, 49, 277])
Example #4
0
    def test_maf(self):
        """Exercise calc_maf and calc_mac on empty, hand-built and HDF5 data."""
        # Empty genotype array: both statistics must come back as empty 1-D
        # arrays, with the default chunk size and with chunk_size=None.
        gts = numpy.array([])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        mafs = calc_maf(varis, chunk_size=None)
        assert mafs.shape == (0,)
        mafs = calc_maf(varis)
        assert mafs.shape == (0,)

        macs = calc_mac(varis, chunk_size=None)
        assert macs.shape == (0,)
        macs = calc_mac(varis)
        assert macs.shape == (0,)

        # Hand-built genotypes; the last row holds only -1 calls and is
        # expected to produce NaN for both statistics.
        gts = numpy.array([[[0], [0], [0], [0]], [[0], [0], [1], [1]],
                           [[0], [0], [0], [1]], [[-1], [-1], [-1], [-1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        mafs = calc_maf(varis, min_num_genotypes=1)
        # numpy.nan is used instead of the numpy.NaN alias, which was removed
        # in NumPy 2.0.
        assert numpy.allclose(mafs, numpy.array([1., 0.5, 0.75, numpy.nan]),
                              equal_nan=True)

        macs = calc_mac(varis, min_num_genotypes=1)
        assert numpy.allclose(macs, numpy.array([4, 2, 3, numpy.nan]),
                              equal_nan=True)

        # HDF5 fixture: every non-NaN MAF must lie in [0.5, 1].
        varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mafs = calc_maf(varis)
        maf_is_valid = numpy.logical_not(numpy.isnan(mafs))
        assert numpy.all(mafs[maf_is_valid] >= 0.5)
        assert numpy.all(mafs[maf_is_valid] <= 1)
        assert mafs.shape == (943,)

        macs = calc_mac(varis)
        # assert macs.shape == (943,)
        # The MAC bounds are derived from the second dimension of the
        # genotype matrix: between half of it and all of it.
        min_mac = varis['/calls/GT'].shape[1] / 2
        max_mac = varis['/calls/GT'].shape[1]
        # NOTE(review): the MAC values are selected with the MAF NaN mask, as
        # in the original test; confirm that this is intended.
        assert numpy.all(macs[maf_is_valid] >= min_mac)
        assert numpy.all(macs[maf_is_valid] <= max_mac)
    def test_maf(self):
        """Check calc_maf / calc_mac results for empty, toy and HDF5 inputs."""
        # An empty genotype array must give empty 1-D results, both with
        # chunk_size=None and with the default chunk size.
        gts = numpy.array([])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        mafs = calc_maf(varis, chunk_size=None)
        assert mafs.shape == (0,)
        mafs = calc_maf(varis)
        assert mafs.shape == (0,)

        macs = calc_mac(varis, chunk_size=None)
        assert macs.shape == (0,)
        macs = calc_mac(varis)
        assert macs.shape == (0,)

        # Toy genotypes; the all -1 last row is expected to give NaN.
        gts = numpy.array([[[0], [0], [0], [0]], [[0], [0], [1], [1]],
                           [[0], [0], [0], [1]], [[-1], [-1], [-1], [-1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        # The numpy.NaN alias no longer exists in NumPy >= 2.0, so the
        # canonical numpy.nan spelling is used for the expected values.
        expected_mafs = numpy.array([1., 0.5, 0.75, numpy.nan])
        expected_macs = numpy.array([4, 2, 3, numpy.nan])

        mafs = calc_maf(varis, min_num_genotypes=1)
        assert numpy.allclose(mafs, expected_mafs, equal_nan=True)

        macs = calc_mac(varis, min_num_genotypes=1)
        assert numpy.allclose(macs, expected_macs, equal_nan=True)

        # HDF5 fixture: non-NaN MAFs are bounded by 0.5 and 1.
        varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mafs = calc_maf(varis)
        not_nan = numpy.logical_not(numpy.isnan(mafs))
        assert numpy.all(mafs[not_nan] >= 0.5)
        assert numpy.all(mafs[not_nan] <= 1)
        assert mafs.shape == (943,)

        macs = calc_mac(varis)
        # assert macs.shape == (943,)
        # MAC bounds come from the second dimension of the genotype matrix.
        min_mac = varis['/calls/GT'].shape[1] / 2
        max_mac = varis['/calls/GT'].shape[1]
        # NOTE(review): the mask is built from mafs, not macs, exactly as the
        # original test did; verify that this is deliberate.
        assert numpy.all(macs[not_nan] >= min_mac)
        assert numpy.all(macs[not_nan] <= max_mac)
Example #6
0
def _calc_ld_between_chunks(chunk_pair, min_num_gts=10, max_maf=0.95):
    """Yield the pairwise LD results between the two chunks of a pair.

    ``chunk_pair`` is a mapping with the keys ``'chunk1'`` and ``'chunk2'``.
    This is a generator that yields exactly one item: a ``zip`` whose
    elements are ``(ld, physical_dist, (chrom1, pos1, chrom2, pos2))``, one
    per combination of a variation in chunk1 with a variation in chunk2.
    ``physical_dist`` is NaN when the two variations are on different
    chromosomes.

    Raises RuntimeError (once iteration starts) if any MAF in either chunk
    is NaN or greater than ``max_maf``.
    """
    first, second = chunk_pair['chunk1'], chunk_pair['chunk2']

    # Both MAF vectors are computed up front and then validated in order.
    chunk_mafs = [
        calc_maf(chunk, min_num_genotypes=min_num_gts, chunk_size=None)
        for chunk in (first, second)
    ]
    if any(numpy.any(numpy.isnan(maf)) or numpy.any(maf > max_maf)
           for maf in chunk_mafs):
        raise RuntimeError(
            'Not enough genotypes or MAF below allowed maximum, Rogers Huff '
            'calculations known to go wrong for very high maf')

    ld_matrix = calc_rogers_huff_r(first.gts_as_mat012,
                                   second.gts_as_mat012,
                                   min_num_gts=min_num_gts)

    # Expand the positions into two (n1, n2) grids so that cell [i, j]
    # pairs variation i of chunk1 with variation j of chunk2.
    pos1 = first[POS_FIELD]
    pos2 = second[POS_FIELD]
    pos_shape = (pos1.size, pos2.size)
    pos1_grid = numpy.repeat(pos1, pos2.size).reshape(pos_shape)
    pos2_grid = numpy.tile(pos2, pos1.size).reshape(pos_shape)
    distances = numpy.abs(pos1_grid - pos2_grid).astype(float)
    assert ld_matrix.shape == distances.shape

    # Same expansion for the chromosomes.
    chrom1 = first[CHROM_FIELD]
    chrom2 = second[CHROM_FIELD]
    chrom_shape = (chrom1.size, chrom2.size)
    chrom1_grid = numpy.repeat(chrom1, chrom2.size).reshape(chrom_shape)
    chrom2_grid = numpy.tile(chrom2, chrom1.size).reshape(chrom_shape)

    # A physical distance makes no sense across chromosomes.
    distances[chrom1_grid != chrom2_grid] = numpy.nan

    pair_coords = list(
        zip(chrom1_grid.flat, pos1_grid.flat, chrom2_grid.flat,
            pos2_grid.flat))

    yield zip(ld_matrix.flat, distances.flat, pair_coords)
def plot_maf(variations, data_dir, chunk_size=SNPS_PER_CHUNK, window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT, write_bg=False,
             calc_genome_wise=False):
    """Plot the MAF distribution and, optionally, the MAF along the genome.

    Always writes ``mafs.png`` (a 25-bin histogram over the range 0-1) into
    ``data_dir``. When ``calc_genome_wise`` is true it also writes
    ``maf_manhattan.png`` and, if ``write_bg`` is true, the ``maf.bg``
    bedgraph file. When ``window_size`` is not None the Manhattan plot uses
    the windowed statistic returned by ``calc_window_stat()``.

    Every output file is opened in a ``with`` block so that its handle is
    closed (and flushed) even if plotting fails. ``maf.bg`` is only opened
    when it is going to be written, so an existing file is no longer
    truncated when ``write_bg`` is false.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))

    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {'set_xlabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_ylabel': {'args': ['SNP number'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    # NOTE(review): the original text mode ('w') is kept; confirm whether the
    # plotting helpers need a binary handle ('wb') for PNG output.
    with open(join(data_dir, 'mafs.png'), 'w') as fhand:
        plot_distrib(maf_distrib, bins=bins, fhand=fhand, color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom, pos, mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand, 'MAF', 'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {'set_xlabel': {'args': ['Chromosome'], 'kwargs': {}},
                  'set_ylabel': {'args': ['MAF'], 'kwargs': {}},
                  'set_title': {'args': [title], 'kwargs': {}}}
    with open(join(data_dir, 'maf_manhattan.png'), 'w') as fhand:
        manhattan_plot(chrom, pos, mafs, mpl_params=mpl_params,
                       fhand=fhand, figsize=(15, 7.5))
Example #8
0
def calc_ld_random_pairs_from_different_chroms(variations,
                                               num_pairs,
                                               max_maf=0.95,
                                               min_num_gts=10):
    """Yield LD values for random SNP pairs located on different chromosomes.

    Two variation indexes are drawn at random; draws whose chromosomes match
    are discarded and redrawn. For each accepted pair the value returned by
    ``_calc_rogers_huff_r_for_snp_pair`` is computed, and pairs whose value
    is not NaN are yielded as
    ``(chrom1, snp_idx1, chrom2, snp_idx2, r2_ld)``.

    At most ``num_pairs`` pairs are evaluated. Pairs with a NaN result count
    towards that budget, which bounds the loop but means fewer than
    ``num_pairs`` items may be yielded. Nothing is yielded when
    ``num_pairs`` is zero or negative.

    Because this is a generator, the checks below run when iteration starts:

    Raises:
        ValueError: if ``variations`` holds fewer than two chromosomes.
        RuntimeError: if any MAF is NaN or greater than ``max_maf``.
    """
    different_chroms = numpy.unique(variations[CHROM_FIELD])
    if different_chroms.size < 2:
        raise ValueError('Only one chrom in variations')

    mafs = calc_maf(variations, min_num_genotypes=min_num_gts, chunk_size=None)
    if numpy.any(numpy.isnan(mafs)) or numpy.any(mafs > max_maf):
        msg = 'Not enough genotypes or MAF below allowed maximum, Rogers Huff calculations known to go wrong for very high maf'
        raise RuntimeError(msg)

    chroms = variations[CHROM_FIELD]
    gts = variations[GT_FIELD]

    num_variations = variations.num_variations

    # The budget is checked before each draw. The previous "> num_pairs"
    # check after the increment evaluated num_pairs + 1 pairs, and one pair
    # even when num_pairs was 0.
    pairs_computed = 0
    while pairs_computed < num_pairs:
        snp_idx1 = random.randrange(num_variations)
        snp_idx2 = random.randrange(num_variations)
        chrom1 = chroms[snp_idx1]
        chrom2 = chroms[snp_idx2]
        if chrom1 == chrom2:
            # Same chromosome: redraw without spending the budget.
            continue

        r2_ld = _calc_rogers_huff_r_for_snp_pair(gts[snp_idx1],
                                                 gts[snp_idx2],
                                                 min_num_gts=min_num_gts)
        if not math.isnan(r2_ld):
            yield chrom1, snp_idx1, chrom2, snp_idx2, r2_ld

        pairs_computed += 1
Example #9
0
def plot_maf(variations,
             data_dir,
             chunk_size=SNPS_PER_CHUNK,
             window_size=None,
             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
             write_bg=False,
             calc_genome_wise=False):
    """Plot the MAF histogram and, on request, the genome-wide MAF.

    ``mafs.png`` (25 bins over the range 0-1) is always written to
    ``data_dir``. With ``calc_genome_wise`` set, ``maf_manhattan.png`` is
    written too, preceded by the ``maf.bg`` bedgraph when ``write_bg`` is
    set. If ``window_size`` is not None, the Manhattan plot is drawn from the
    result of ``calc_window_stat()``.

    All output files are handled with ``with`` blocks so their handles are
    always closed. ``maf.bg`` is opened only when ``write_bg`` is true, which
    avoids creating or truncating it when nothing is written.
    """
    # Calculate and plot MAF distribution
    mafs = calc_maf(variations, min_num_genotypes, chunk_size)
    maf_distrib, bins = histogram(mafs, n_bins=25, range_=(0, 1))

    title = 'Maximum allele frequency (MAF) distribution'
    mpl_params = {
        'set_xlabel': {
            'args': ['MAF'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['SNP number'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    # NOTE(review): text mode ('w') is kept from the original; confirm
    # whether the plotting helpers require a binary handle for PNG data.
    with open(join(data_dir, 'mafs.png'), 'w') as fhand:
        plot_distrib(maf_distrib,
                     bins=bins,
                     fhand=fhand,
                     color='c',
                     mpl_params=mpl_params)

    if not calc_genome_wise:
        return

    chrom = _load_matrix(variations, CHROM_FIELD)
    pos = _load_matrix(variations, POS_FIELD)
    pos_maf = PositionalStatsCalculator(chrom,
                                        pos,
                                        mafs,
                                        window_size=window_size,
                                        step=window_size)

    # Write bedgraph file
    if write_bg:
        with open(join(data_dir, 'maf.bg'), 'w') as bg_fhand:
            pos_maf.write(bg_fhand,
                          'MAF',
                          'Maximum allele frequency',
                          track_type='bedgraph')
    if window_size is not None:
        pos_maf = pos_maf.calc_window_stat()

    # Manhattan plot for MAF along genome
    title = 'Max Allele Freq (MAF) along the genome'
    chrom, pos, mafs = pos_maf.chrom, pos_maf.pos, pos_maf.stat
    mpl_params = {
        'set_xlabel': {
            'args': ['Chromosome'],
            'kwargs': {}
        },
        'set_ylabel': {
            'args': ['MAF'],
            'kwargs': {}
        },
        'set_title': {
            'args': [title],
            'kwargs': {}
        }
    }
    with open(join(data_dir, 'maf_manhattan.png'), 'w') as fhand:
        manhattan_plot(chrom,
                       pos,
                       mafs,
                       mpl_params=mpl_params,
                       fhand=fhand,
                       figsize=(15, 7.5))
Example #10
0
 def _calc_stat(self, variations):
     """Return calc_maf() for ``variations`` using this object's threshold.

     ``self.min_num_genotypes`` is forwarded as the minimum genotype count
     and ``chunk_size`` is fixed to None.
     """
     maf_kwargs = {'min_num_genotypes': self.min_num_genotypes,
                   'chunk_size': None}
     return calc_maf(variations, **maf_kwargs)
Example #11
0
 def _calc_stat(self, variations):
     """Compute the MAF statistic for the given variations.

     Delegates to calc_maf with ``chunk_size=None`` and the instance's
     ``min_num_genotypes`` setting.
     """
     min_gts = self.min_num_genotypes
     stat = calc_maf(variations, min_num_genotypes=min_gts, chunk_size=None)
     return stat