Beispiel #1
0
def pi(c,
       chrsize,
       ac_subpops,
       pos,
       pop2color,
       plot=False,
       blenw=1000,
       nwindow=100):
    """Compute nucleotide diversity for every subpopulation.

    For each entry of ``ac_subpops`` only the sites that are segregating and
    whose highest observed allele index is 1 are kept.  Diversity is then
    computed twice with ``allel.windowed_diversity``: once in blocks of
    ``blenw`` (the per-block values are handed to ``jackknife``) and once in
    windows of ``int(chrsize / nwindow)`` spanning ``1..chrsize``.

    Parameters
    ----------
    c
        Forwarded unchanged to ``div_plot`` (presumably the chromosome
        label -- TODO confirm).
    chrsize
        Stop coordinate of the windowed scan; also used to derive the
        window length.
    ac_subpops
        Mapping from subpopulation name to an allele counts array.
    pos
        Variant positions; indexed with the boolean site filter.
    pop2color
        Forwarded unchanged to ``div_plot``.
    plot
        When true, ``div_plot`` is called with the collected results.
    blenw
        Window size used for the blocks given to ``jackknife``.
    nwindow
        Number of windows the range ``1..chrsize`` is divided into.

    Returns
    -------
    dict
        ``{name: (pi_m, pi_se, (scan[0], scan[1]))}`` where ``pi_m`` and
        ``pi_se`` are the first two values returned by ``jackknife`` and
        ``scan`` is the result of the chromosome-wide windowed call.
    """
    window_len = int(chrsize / nwindow)
    results = {}
    for popname, counts in ac_subpops.items():
        # Keep segregating sites whose largest allele index is exactly 1.
        keep = counts.is_segregating() & (counts.max_allele() == 1)
        print('PI : retaining', np.count_nonzero(keep), 'SNPs')
        kept_pos = pos[keep]
        # Only the first two allele columns are used after filtering.
        two_allele_counts = counts.compress(keep, axis=0)[:, :2]
        ac = allel.AlleleCountsArray(two_allele_counts)

        # Block-wise values feed the jackknife estimate.
        block_pi = allel.windowed_diversity(kept_pos, ac, size=blenw)[0]
        pi_m, pi_se, *_ = jackknife(block_pi)

        # Chromosome-wide scan in equally sized windows.
        scan = allel.windowed_diversity(kept_pos,
                                        ac,
                                        size=window_len,
                                        start=1,
                                        stop=chrsize)
        results[popname] = (pi_m, pi_se, (scan[0], scan[1]))

    if plot:
        div_plot(results, pop2color, list(ac_subpops.keys()), c, chrsize,
                 "pi")
    return results
Beispiel #2
0
def population_statistics(synthetic_population_code, synthetic_genotypes, reference_genotypes, synthetic_positions, reference_positions, reference_samples, classification_map, window_size=2e5):
    """Plot windowed diversity and Tajima's D for synthetic vs. reference data.

    The reference genotypes are restricted to the columns whose sample has a
    ``population`` entry in ``classification_map`` equal to
    ``synthetic_population_code``.  Two figures are saved under
    ``FIGURES_DIR``: ``<code>.pi.png`` and ``<code>.tajima_d.png``.

    Parameters
    ----------
    synthetic_population_code
        Population label; used for filtering, legend labels and file names.
    synthetic_genotypes, reference_genotypes
        Genotype data wrapped in ``allel.GenotypeArray``; the reference data
        is indexed by sample along its second axis.
    synthetic_positions, reference_positions
        Variant positions matching the respective genotypes.
    reference_samples
        Sample identifiers, looked up in ``classification_map.loc``.
    classification_map
        Table indexed by sample with a ``population`` column.
    window_size
        Window size in base pairs; converted to ``int`` before use.

    Returns
    -------
    None
    """
    window_size = int(window_size)
    code = synthetic_population_code

    def save_comparison(title, ylabel, synthetic_values, reference_values, filename_template):
        # One figure: synthetic and reference series over 1-based window index.
        plt.title(title)
        plt.plot(np.arange(1, len(synthetic_values) + 1), synthetic_values, label='Synthetic {}'.format(code))
        plt.plot(np.arange(1, len(reference_values) + 1), reference_values, label='{}'.format(code))
        plt.xlabel('Windows ({}kb)'.format(window_size // 1000))
        plt.ylabel(ylabel)
        plt.legend()
        plt.savefig(os.path.join(FIGURES_DIR, filename_template.format(code)))
        plt.close(plt.gcf())

    # Select the reference samples that belong to the requested population.
    labels = np.array([classification_map.loc[sample]['population'] for sample in reference_samples])
    in_population = labels == code
    population_reference = reference_genotypes[:, in_population]

    synthetic_ac = allel.GenotypeArray(synthetic_genotypes).count_alleles()
    reference_ac = allel.GenotypeArray(population_reference).count_alleles()

    synthetic_pi = allel.windowed_diversity(synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_pi = allel.windowed_diversity(reference_positions, reference_ac, size=window_size)[0]
    save_comparison('Nucleotide Diversity Sliding Window Analysis',
                    'Nucleotide Diversity (π)',
                    synthetic_pi,
                    reference_pi,
                    '{}.pi.png')

    synthetic_d = allel.windowed_tajima_d(synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_d = allel.windowed_tajima_d(reference_positions, reference_ac, size=window_size)[0]
    save_comparison('Tajima\'s D Sliding Window Analysis',
                    'Tajima\'s D',
                    synthetic_d,
                    reference_d,
                    '{}.tajima_d.png')
Beispiel #3
0
 def test_masked_windowed_diversity(self):
     """Masked 10 bp windows must equal every other unmasked 5 bp window."""
     # Four haplotypes give six pairwise comparisons per site.
     haplotypes = allel.HaplotypeArray([
         [0, 0, 0, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 1],
         [0, 1, 1, 1],
         [1, 1, 1, 1],
         [0, 0, 1, 2],
         [0, 1, 1, 2],
         [0, 1, -1, -1],
         [-1, -1, -1, -1],
     ])
     counts = haplotypes.count_alleles()
     # Per-site mean pairwise diversity:
     #   [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
     positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])
     # Accessibility pattern: five True then five False, repeated three times.
     accessible = np.tile(np.repeat(np.array([True, False]), 5), 3)
     span = dict(start=1, stop=31)

     # Reference: unmasked windows of size 5, keeping every other one.
     unmasked = allel.windowed_diversity(positions, counts, size=5, **span)
     expect = unmasked[0][::2]

     # Under test: windows of size 10 whose second half is masked out.
     masked = allel.windowed_diversity(positions,
                                       counts,
                                       size=10,
                                       is_accessible=accessible,
                                       **span)
     actual = masked[0]

     assert_array_almost_equal(expect, actual)
Beispiel #4
0
def pi_window(pos, gt, win_size, length_bp):
    """Summarise windowed nucleotide diversity by its mean and spread.

    ``gt`` and ``pos`` are first passed through ``get_seg`` (presumably to
    keep segregating sites only -- TODO confirm against its definition).
    Diversity is then computed with ``allel.windowed_diversity`` in windows
    of ``win_size`` running from position 1 to ``length_bp``.

    Parameters
    ----------
    pos
        Variant positions, handed to ``get_seg`` together with ``gt``.
    gt
        Genotype data; the object returned for it by ``get_seg`` must
        provide ``count_alleles()``.
    win_size
        Window size passed to ``allel.windowed_diversity``.
    length_bp
        Stop coordinate of the windowed scan.

    Returns
    -------
    pi_mean
        ``np.nanmean`` of the per-window diversity values.
    pi_std
        ``np.nanstd`` of the per-window diversity values.

    """
    seg_gt, seg_pos = get_seg(gt, pos)
    seg_counts = seg_gt.count_alleles()
    windowed = allel.windowed_diversity(seg_pos,
                                        seg_counts,
                                        size=win_size,
                                        start=1,
                                        stop=length_bp)
    # Only the per-window diversity (first result) is summarised.
    per_window_pi = windowed[0]
    return np.nanmean(per_window_pi), np.nanstd(per_window_pi)
Beispiel #5
0
def pi_fx(pos, gt, win_size, length_bp):
    """Return per-window nucleotide diversity for a single population.

    Allele counts are taken from ``gt.count_alleles()`` and passed to
    ``allel.windowed_diversity`` with windows of ``win_size`` running from
    position 1 to ``length_bp``.  Only the first element of that result
    (the diversity values) is returned.
    """
    diversity, *_ = allel.windowed_diversity(pos,
                                             gt.count_alleles(),
                                             size=win_size,
                                             start=1,
                                             stop=length_bp)
    return diversity
Beispiel #6
0
 def test_fully_masked_windowed_diversty(self):
     """The first window has no accessible positions and must be NaN."""
     counts = allel.AlleleCountsArray(
         np.array([[5, 5], [5, 5], [1, 9], [1, 9]]))
     positions = np.array([1, 2, 3, 4])
     # Positions 1 and 2 (the whole first window of size 2) are inaccessible.
     accessible = np.array([False, False, True, True])
     result = allel.windowed_diversity(positions,
                                       counts,
                                       size=2,
                                       start=1,
                                       stop=5,
                                       is_accessible=accessible)
     first_window_pi = result[0][0]
     self.assertTrue(np.isnan(first_window_pi))
Beispiel #7
0
    def test_windowed_diversity(self):
        """Compare windowed diversity with hand-computed per-window values."""
        # Four haplotypes give six pairwise comparisons per site.
        haplotypes = HaplotypeArray([
            [0, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 1],
            [0, 1, 1, 1],
            [1, 1, 1, 1],
            [0, 0, 1, 2],
            [0, 1, 1, 2],
            [0, 1, -1, -1],
            [-1, -1, -1, -1],
        ])
        counts = haplotypes.count_alleles()
        # Per-site mean pairwise diversity:
        #   [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
        positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])
        # Per-window sums of the site values divided by the window width:
        # 7/6 over 10 bases, 13/6 over 10 bases, and 1 over 11 bases.
        expected = [(7 / 6) / 10, (13 / 6) / 10, 1 / 11]

        result = allel.windowed_diversity(positions,
                                          counts,
                                          size=10,
                                          start=1,
                                          stop=31)
        observed = result[0]
        assert_array_almost_equal(expected, observed)
Beispiel #8
0
def pltPi(chromlist, window_size=10):
    """Plot windowed nucleotide diversity along each chromosome.

    For every label in ``chromlist`` the genotypes and variant positions are
    read from ``PNG.phased.autosomal.recode.<c>.h5``.  Diversity is computed
    with ``allel.windowed_diversity`` and drawn against the window midpoints,
    and the figure is saved as ``PNG.<c>.pi.pdf``.

    The earlier version plotted an undefined ``h12_pos`` array and labelled
    the output as H12; the computed diversity is now what gets plotted and
    the axis label and file name say so.

    Parameters
    ----------
    chromlist : iterable
        Chromosome labels used to build the input and output file names.
    window_size : int, optional
        Window size handed to ``allel.windowed_diversity``.  Defaults to 10,
        the value that used to be hard-coded.

    Returns
    -------
    None
    """
    for c in chromlist:
        # NOTE: X-chromosome data lives in "PNG.phased.X.recode.<c>.h5".
        # The context manager closes the HDF5 file even if reading fails.
        with h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                       mode='r') as callset:
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            pos = allel.SortedIndex(callset["variants/POS"][:])
            # Everything that reads from the callset stays inside the block.
            acc = g.count_alleles()
            pi_values, windows, _, _ = allel.windowed_diversity(
                pos, acc, size=window_size)

        # One x coordinate per window: the midpoint of its (start, stop) pair.
        midpoints = windows.mean(axis=1)
        plt.plot(midpoints, pi_values)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("pi")
        plt.savefig("PNG.{}.pi.pdf".format(c))
        plt.clf()
Beispiel #9
0
                    biallelic = get_biallelic(
                        args.z, chrom,
                        np.concatenate((loc_samples, loc2_samples), axis=0))
                    ac = ac.compress(biallelic, axis=0)[:, :2]
                    ac2 = ac2.compress(biallelic, axis=0)[:, :2]
                    pos = pos.get_mask_selection(biallelic)
            else:
                biallelic = get_biallelic(args.z, chrom, loc_samples)
                ac = ac.compress(biallelic, axis=0)[:, :2]
                pos = pos.get_mask_selection(biallelic)

            if 'pi' in args.s:
                pi, windows, n_bases, counts = allel.windowed_diversity(
                    pos,
                    ac,
                    size=winsize,
                    start=start,
                    stop=stop,
                    step=int(winsize / 2))
                new_dat = format_results(stat=pi,
                                         stat_name="pi",
                                         chrom=chrom,
                                         windows=windows,
                                         nvar=counts,
                                         pop=pop)
                df_list.append(new_dat)
            if 'thetaW' in args.s:
                thetaW, windows, n_bases, counts = allel.windowed_watterson_theta(
                    pos,
                    ac,
                    size=winsize,
## Data preprocessing ##
# Per-variant boolean mask: True where the count of heterozygous calls is zero.
non_het_variants = gt.count_het(axis=1) == 0
gt_clean = gt[
    non_het_variants
]  ## filter out sites with heterozygous calls, because TB is haploid
gt_clean = gt_clean.haploidify_samples()  ## Convert to haploid calls

# The same mask is applied to the positions so they stay aligned with gt_clean.
pos = callset["variants/POS"][non_het_variants]  ## The retained variant positions

#######Plot 1: nucleotide diversity#########
allele_counts = gt_clean.count_alleles()

# Window size is 1/100 of the span between the first and last retained position.
window_size = int((pos[-1] - pos[0]) / 100)  # We want about 100 windows
# The scan is bounded by the first and last retained positions; only the
# diversity values and the window bounds are used by the plot below.
pi, windows, n_bases, n_counts = allel.windowed_diversity(
    pos, allele_counts, size=window_size, start=pos[0], stop=pos[-1]
)

plot_windowed_pi(pi, windows)

######Plot 2: site frequency spectrum########
## Filtering : only keep biallelic variants (at most two alleles segregate in the set of samples)
max_2_alleles = [sum(row != 0) <= 2 for row in allele_counts]
filtered_ac = allele_counts[max_2_alleles]
bi_counts = allel.AlleleCountsArray(np.ndarray((filtered_ac.shape[0], 2), dtype=int))
index = 0
for row in filtered_ac:
    picked = [i for i in row if i != 0]
    assert len(picked) <= 2
    if len(picked) == 1:
        picked = picked + [0]
Beispiel #11
0
def win_pi_sims(path, neut_mut, n_pops, n_sims, T, win_size, L, N):
    """Compute windowed pi, Tajima's D, dxy and Fst over a grid of simulations.

    For every time point in ``T`` and every simulation index below ``n_sims``
    exactly one tree-sequence file matching
    ``<path><T>N_sim_<i>_RAND_*[0-9]_overlaid.trees`` is loaded and
    simplified.  Allele counts per population come from ``ac_from_ts``.
    Diversity and Tajima's D are computed per population, divergence and
    Hudson's Fst per pair of populations, all in windows of ``win_size``
    from position 1 to ``L``.  The four result arrays are pickled to
    ``<path><foname>_pis.pkl``, ``_tajd.pkl``, ``_div.pkl`` and ``_fst.pkl``.

    Parameters
    ----------
    path : str
        Prefix for all input and output files.  It is joined by plain string
        concatenation and its last character is dropped to derive the base
        file name, so it is expected to end with a path separator.
    neut_mut
        Not used by this function.
    n_pops : int
        Number of populations.
    n_sims : int
        Number of simulations per time point.
    T : sequence
        Time points; each value appears in the input file name.
    win_size : int
        Window size for all statistics.
    L : int
        Stop coordinate of the scans.  ``int(L / win_size)`` windows are
        allocated per series, so each windowed result must have that length.
    N
        Forwarded to ``ac_from_ts``.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the file pattern for a (time point, simulation) pair does not
        match exactly one file.
    """
    foname = os.path.basename(path[:-1])
    print(("Base filename:" + foname), flush=True)

    combs = list(itertools.combinations(np.arange(n_pops), 2))
    n_windows = int(L / win_size)
    pis = np.zeros((len(T), n_sims, n_pops, n_windows))
    div = np.zeros((len(T), n_sims, len(combs), n_windows))
    fst = np.zeros((len(T), n_sims, len(combs), n_windows))
    tajd = np.zeros((len(T), n_sims, n_pops, n_windows))

    # Every statistic uses the same window layout.
    window_kwargs = dict(size=win_size, start=1, stop=L)

    for t, time_point in enumerate(T):
        for i in range(n_sims):
            files = glob(path + str(time_point) + "N_sim_" + str(i) +
                         "_RAND_*[0-9]_overlaid.trees")
            print(files)
            # Raised explicitly so the check survives ``python -O``; the
            # exception type and message match the former assert.
            if len(files) != 1:
                raise AssertionError(
                    str(len(files)) + " file(s) found with glob T: " +
                    str(time_point) + " sim:" + str(i))
            filename = files[0]
            print(filename)
            ts = pyslim.load(filename).simplify()

            s1 = timer()
            acs, pos = ac_from_ts(ts, n_pops, N)
            for j in range(n_pops):
                # Only the statistic itself (first result) is stored.
                pis[t, i, j, :] = allel.windowed_diversity(
                    pos, acs[j], **window_kwargs)[0]
                tajd[t, i, j, :] = allel.windowed_tajima_d(
                    pos, acs[j], **window_kwargs)[0]
            s2 = timer()
            print(("Calculating windowed Pi/TajD... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))),
                  flush=True)

            s1 = timer()
            for k, (pop_a, pop_b) in enumerate(combs):
                div[t, i, k, :] = allel.windowed_divergence(
                    pos, acs[pop_a], acs[pop_b], **window_kwargs)[0]
                fst[t, i, k, :] = allel.windowed_hudson_fst(
                    pos, acs[pop_a], acs[pop_b], **window_kwargs)[0]
            s2 = timer()
            print(("Calculating windowed Dxy and Fst... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))),
                  flush=True)

    s1 = timer()
    print((pis.shape), flush=True)
    print((tajd.shape), flush=True)
    print((div.shape), flush=True)
    # Context managers close each file even if pickling fails part-way.
    for suffix, stats in (('_pis.pkl', pis), ('_tajd.pkl', tajd),
                          ('_div.pkl', div), ('_fst.pkl', fst)):
        with open(path + foname + suffix, 'wb') as output:
            pickle.dump(stats, output)

    s2 = timer()
    print(("Saving stats and plots to file... Time elapsed (min):" +
           str(round((s2 - s1) / 60, 3))),
          flush=True)