Example #1
0
    def test_windowed_tajima_d(self):
        """Exercise ``windowed_tajima_d`` on five sites spaced 10 positions apart.

        Covers four situations: windows with a known value, windows holding
        too few sites, windows holding too few segregating sites, and the
        explicit ``min_sites`` override.
        """
        from allel import windowed_tajima_d

        positions = np.array([1, 11, 21, 31, 41])
        wide_windows = dict(size=25, step=10)

        # Every site segregates here, so all three windows yield a value.
        counts = AlleleCountsArray([[1, 3], [2, 2], [3, 1], [1, 3], [2, 2]])
        observed, _, _ = windowed_tajima_d(positions, counts, **wide_windows)
        assert_array_almost_equal(np.array([0.168] * 3), observed, decimal=3)

        # Narrower windows hold too few sites: four windows, all NaN.
        observed, _, _ = windowed_tajima_d(positions, counts, size=15, step=10)
        assert 4 == len(observed)
        assert np.all(np.isnan(observed))

        # Two non-segregating sites leave too few segregating sites per
        # window, so the result is NaN everywhere again.
        counts = AlleleCountsArray([[4, 0], [2, 2], [3, 1], [4, 0], [2, 2]])
        observed, _, _ = windowed_tajima_d(positions, counts, **wide_windows)
        assert 3 == len(observed)
        assert np.all(np.isnan(observed))

        # Callers can still force a value by lowering min_sites themselves.
        observed, _, _ = windowed_tajima_d(positions,
                                           counts,
                                           min_sites=2,
                                           **wide_windows)
        assert_array_almost_equal(np.array([0.592] * 3), observed, decimal=3)
Example #2
0
def tajd(c,
         chrsize,
         ac_subpops,
         pos,
         pop2color,
         plot=False,
         blenw=1000,
         nwindow=100):
    """Compute Tajima's D for every subpopulation in ``ac_subpops``.

    For each subpopulation the variants are restricted to segregating sites
    whose highest observed allele index is 1, and only the first two
    allele-count columns are kept. Tajima's D is then computed twice with
    ``allel.windowed_tajima_d``: once in windows of ``blenw`` positions
    (summarised with ``jackknife``) and once in windows of
    ``int(chrsize / nwindow)`` positions spanning 1..``chrsize``.

    Parameters
    ----------
    c :
        Forwarded to ``div_plot``; only used when ``plot`` is true.
    chrsize : int
        Used as ``stop`` for the chromosome-wide windows and, divided by
        ``nwindow``, as their size.
    ac_subpops : mapping
        Subpopulation key -> allele counts array.
    pos : array
        Variant positions; indexed with the boolean site filter.
    pop2color :
        Forwarded to ``div_plot``; only used when ``plot`` is true.
    plot : bool, optional
        When true, call ``div_plot`` with the collected results.
    blenw : int, optional
        Window size for the values handed to ``jackknife``.
    nwindow : int, optional
        Divisor applied to ``chrsize`` to get the chromosome-wide window size.

    Returns
    -------
    dict
        Subpopulation key -> ``(d_m, d_se, (values, windows))``, where
        ``d_m`` and ``d_se`` are the first two items returned by
        ``jackknife`` and ``values``/``windows`` are the first two items
        returned by the chromosome-wide ``windowed_tajima_d`` call.
    """
    results = {}
    chrom_window = int(chrsize / nwindow)
    for pop_name, pop_counts in ac_subpops.items():
        keep = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        print('TajD : retaining', np.count_nonzero(keep), 'SNPs')
        kept_pos = pos[keep]
        first_two_alleles = pop_counts.compress(keep, axis=0)[:, :2]
        kept_counts = allel.AlleleCountsArray(first_two_alleles)

        # Fixed-size blocks, summarised through the jackknife helper.
        block_stats = allel.windowed_tajima_d(kept_pos, kept_counts, size=blenw)
        jack_mean, jack_se, *_ = jackknife(block_stats[0])

        # Position-based windows across the whole chromosome. (A
        # variant-count-based alternative, allel.moving_tajima_d, is not used.)
        chrom_stats = allel.windowed_tajima_d(kept_pos,
                                              kept_counts,
                                              size=chrom_window,
                                              start=1,
                                              stop=chrsize)
        results[pop_name] = (jack_mean, jack_se,
                             (chrom_stats[0], chrom_stats[1]))

    if plot:
        div_plot(results, pop2color, list(ac_subpops.keys()), c, chrsize,
                 "Tajima's D")
    return results
Example #3
0
def population_statistics(synthetic_population_code,
                          synthetic_genotypes,
                          reference_genotypes,
                          synthetic_positions,
                          reference_positions,
                          reference_samples,
                          classification_map,
                          window_size=2e5):
    """Plot windowed diversity and Tajima's D for synthetic vs. reference data.

    The reference genotypes are reduced to the samples whose ``'population'``
    entry in ``classification_map`` equals ``synthetic_population_code``.
    Nucleotide diversity and Tajima's D are then computed per window for both
    data sets and written as ``<code>.pi.png`` and ``<code>.tajima_d.png``
    inside ``FIGURES_DIR``.

    Parameters
    ----------
    synthetic_population_code :
        Population label; selects reference samples and names the figures.
    synthetic_genotypes, reference_genotypes :
        Genotype data accepted by ``allel.GenotypeArray``; the second axis of
        ``reference_genotypes`` is indexed per reference sample.
    synthetic_positions, reference_positions :
        Variant positions for the respective genotype data.
    reference_samples : iterable
        Sample identifiers used as row labels into ``classification_map``.
    classification_map :
        Object supporting ``.loc[sample]['population']``.
    window_size : number, optional
        Window size; truncated to ``int`` before use.

    Returns
    -------
    None
    """
    window_size = int(window_size)

    # Select the reference samples labelled with the requested population.
    sample_labels = np.array([
        classification_map.loc[sample]['population']
        for sample in reference_samples
    ])
    in_population = sample_labels == synthetic_population_code
    matched_reference = reference_genotypes[:, in_population]

    synthetic_ac = allel.GenotypeArray(synthetic_genotypes).count_alleles()
    reference_ac = allel.GenotypeArray(matched_reference).count_alleles()

    def _save_comparison(title, y_label, synthetic_values, reference_values,
                         filename_template):
        # Draw both series against 1-based window indices and save the figure.
        plt.title(title)
        plt.plot(np.arange(1, len(synthetic_values) + 1),
                 synthetic_values,
                 label='Synthetic {}'.format(synthetic_population_code))
        plt.plot(np.arange(1, len(reference_values) + 1),
                 reference_values,
                 label='{}'.format(synthetic_population_code))
        plt.xlabel('Windows ({}kb)'.format(window_size // 1000))
        plt.ylabel(y_label)
        plt.legend()
        plt.savefig(os.path.join(
            FIGURES_DIR, filename_template.format(synthetic_population_code)))
        plt.close(plt.gcf())

    synthetic_pi = allel.windowed_diversity(
        synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_pi = allel.windowed_diversity(
        reference_positions, reference_ac, size=window_size)[0]
    _save_comparison('Nucleotide Diversity Sliding Window Analysis',
                     'Nucleotide Diversity (π)',
                     synthetic_pi,
                     reference_pi,
                     '{}.pi.png')

    synthetic_d = allel.windowed_tajima_d(
        synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_d = allel.windowed_tajima_d(
        reference_positions, reference_ac, size=window_size)[0]
    _save_comparison("Tajima's D Sliding Window Analysis",
                     "Tajima's D",
                     synthetic_d,
                     reference_d,
                     '{}.tajima_d.png')
Example #4
0
def tajimaD(pos, gt, win_size, length_bp):
    """Summarise windowed Tajima's D as a mean and a standard deviation.

    The genotypes and positions are first passed through ``get_seg``
    (presumably keeping segregating sites only -- confirm against its
    definition). Allele counts of the result feed
    ``allel.windowed_tajima_d`` with windows of ``win_size`` positions
    covering 1..``length_bp``.

    Parameters
    ----------
    pos : array
        Variant positions belonging to ``gt``.
    gt :
        Genotypes; the object ``get_seg`` returns for them must provide
        ``count_alleles()``.
    win_size : int
        Window size, passed as ``size`` to ``allel.windowed_tajima_d``.
    length_bp : int
        Passed as ``stop``; the first window starts at position 1.

    Returns
    -------
    tajd_mean : float
        Mean of the per-window values, ignoring NaN.
    tajd_std : float
        Standard deviation of the per-window values, ignoring NaN.

    """
    seg_gt, seg_pos = get_seg(gt, pos)
    # Only the first returned item (the per-window statistic) is needed.
    window_d = allel.windowed_tajima_d(
        seg_pos,
        seg_gt.count_alleles(),
        size=win_size,
        start=1,
        stop=length_bp,
    )[0]
    return np.nanmean(window_d), np.nanstd(window_d)
Example #5
0
         size=winsize,
         start=start,
         stop=stop,
         step=int(winsize / 2))
     new_dat = format_results(stat=thetaW,
                              stat_name="thetaW",
                              chrom=chrom,
                              windows=windows,
                              nvar=counts,
                              pop=pop)
     df_list.append(new_dat)
 if 'tajD' in args.s:
     tajD, windows, counts = allel.windowed_tajima_d(
         pos,
         ac,
         size=winsize,
         start=start,
         stop=stop,
         step=int(winsize / 2))
     new_dat = format_results(stat=tajD,
                              stat_name="tajD",
                              chrom=chrom,
                              windows=windows,
                              nvar=counts,
                              pop=pop)
     df_list.append(new_dat)
 if 'dxy' in args.s and args.p2 != "None":
     dxy, windows, n_bases, counts = allel.windowed_divergence(
         pos,
         ac,
         ac2,
Example #6
0
def win_pi_sims(path, neut_mut, n_pops, n_sims, T, win_size, L, N):
    """Compute windowed pi, Tajima's D, dxy and Hudson's Fst for many sims.

    For every time point in ``T`` and every simulation index below
    ``n_sims``, exactly one tree-sequence file matching
    ``<path><T>N_sim_<i>_RAND_*[0-9]_overlaid.trees`` is loaded and
    simplified. Allele counts per population come from ``ac_from_ts``.
    Diversity and Tajima's D are computed per population; divergence and
    Hudson's Fst per pair of populations. The four result arrays are pickled
    to ``<path><basename>_pis.pkl``, ``_tajd.pkl``, ``_div.pkl`` and
    ``_fst.pkl``.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated verbatim with file names and
        its last character is dropped before taking the basename, so it is
        expected to end with a path separator.
    neut_mut :
        Not used by this function.
    n_pops : int
        Number of populations; also forwarded to ``ac_from_ts``.
    n_sims : int
        Number of simulation replicates per time point.
    T : sequence
        Time points; each is converted with ``str`` to build the file name.
    win_size : int
        Window size passed to the windowed statistics.
    L : int
        Passed as ``stop`` to the windowed statistics. The result arrays
        hold ``int(L / win_size)`` windows.
        NOTE(review): the assignments into the result arrays assume each
        windowed call returns exactly that many windows -- presumably ``L``
        is a multiple of ``win_size``; confirm with callers.
    N :
        Forwarded to ``ac_from_ts``.

    Raises
    ------
    AssertionError
        If the glob for a (time point, simulation) pair does not match
        exactly one file.
    """
    foname = os.path.basename(path[:-1])
    print(("Base filename:" + foname), flush=True)
    combs = list(itertools.combinations(np.arange(n_pops), 2))
    n_windows = int(L / win_size)
    pis = np.zeros((len(T), n_sims, n_pops, n_windows))
    div = np.zeros((len(T), n_sims, len(combs), n_windows))
    fst = np.zeros((len(T), n_sims, len(combs), n_windows))
    tajd = np.zeros((len(T), n_sims, n_pops, n_windows))
    # Every windowed statistic below uses the same window layout.
    window_kwargs = dict(size=win_size, start=1, stop=L)

    for t, t_val in enumerate(T):
        for i in range(n_sims):
            files = glob(path + str(t_val) + "N_sim_" + str(i) +
                         "_RAND_*[0-9]_overlaid.trees")
            print(files)
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped under ``python -O``; the exception type is kept
            # for callers that already catch AssertionError.
            if len(files) != 1:
                raise AssertionError(
                    str(len(files)) + " file(s) found with glob T: " +
                    str(t_val) + " sim:" + str(i))
            filename = files[0]
            print(filename)
            ts = pyslim.load(filename).simplify()

            s1 = timer()
            acs, pos = ac_from_ts(ts, n_pops, N)
            for j in range(n_pops):
                pi, _, _, _ = allel.windowed_diversity(
                    pos, acs[j], **window_kwargs)
                pis[t, i, j, :] = pi
                D, _, _ = allel.windowed_tajima_d(
                    pos, acs[j], **window_kwargs)
                tajd[t, i, j, :] = D
            s2 = timer()
            print(("Calculating windowed Pi/TajD... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))),
                  flush=True)

            s1 = timer()
            for k, (pop_a, pop_b) in enumerate(combs):
                dxy, _, _, _ = allel.windowed_divergence(
                    pos, acs[pop_a], acs[pop_b], **window_kwargs)
                div[t, i, k, :] = dxy
                fstat, _, _ = allel.windowed_hudson_fst(
                    pos, acs[pop_a], acs[pop_b], **window_kwargs)
                fst[t, i, k, :] = fstat
            s2 = timer()
            print(("Calculating windowed Dxy and Fst... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))),
                  flush=True)

    s1 = timer()
    print((pis.shape), flush=True)
    print((tajd.shape), flush=True)
    print((div.shape), flush=True)
    # Context managers guarantee each file is closed even if dumping fails.
    for suffix, values in (('_pis.pkl', pis), ('_tajd.pkl', tajd),
                           ('_div.pkl', div), ('_fst.pkl', fst)):
        with open(path + foname + suffix, 'wb') as output:
            pickle.dump(values, output)

    # Debugging aid, disabled as in the original code. It indexes time
    # point 9, so it needs at least ten entries in ``T`` when switched on.
    plot_landscape = False
    if plot_landscape:
        plt.subplot(2, 1, 1)
        plt.plot(np.transpose(pis[0, 0, :]), "-")
        plt.title('0N after split')
        plt.ylabel('Pi')
        plt.subplot(2, 1, 2)
        plt.plot(np.transpose(pis[9, 0, :]), "-")
        plt.title('10N after split')
        plt.xlabel('Window')
        plt.ylabel('Pi')
        plt.tight_layout()
        plt.savefig(path + foname + '_landscape.pdf')
        plt.close()

    s2 = timer()
    print(("Saving stats and plots to file... Time elapsed (min):" +
           str(round((s2 - s1) / 60, 3))),
          flush=True)