Esempio n. 1
0
    def __call__(self, pop):
        """Record windowed diversity statistics for the current generation.

        Statistics are recorded on every 50th call (``self.counter`` is a
        multiple of 50) and whenever ``pop.generation`` equals ``self.final``.
        ``self.counter`` is incremented on every call.

        When recording, 400 chromosomes are sampled (``chr_sampled``), the
        first element of the sample is converted to a ``SimData`` table, the
        table is cut into non-overlapping windows (step length equals window
        size), and for each window pi, the number of singletons and Tajima's D
        are computed and rounded to 3 decimals.  One row per statistic is
        appended to ``self.pi``, ``self.singleton`` and ``self.tajimasD``;
        each row starts with the scaled generation followed by the per-window
        values.

        :param pop: population being simulated; only ``pop.generation`` is
            read here directly, the object itself is passed to the sampler.
        """
        if self.counter % 50 == 0 or pop.generation == self.final:
            # Generation offset from ``set_gen`` in units of ``Nstart``, kept
            # as a 1-element array so it can be prepended to each row below.
            actual_gen = np.around(
                [(pop.generation - self.set_gen) / self.Nstart], decimals=3)

            # sample chromosomes from the population
            # NOTE(review): the trailing ``True`` is passed through to
            # fwdpy11's sampler unchanged; its meaning is not visible here.
            chr_sampled = 400
            samp = fp11.sampling.sample_separate(
                self.__rng, pop, chr_sampled, True)
            neutral_sample = polyt.SimData(
                [str2byte(mut, 'utf-8') for mut in samp[0]])

            # split into windows
            window_len = 1 / self.val_per_window
            w = Windows(neutral_sample,
                        window_size=window_len,
                        step_len=window_len,
                        starting_pos=self.beginning,
                        ending_pos=self.nwindows)

            # calculate summaries: build the PolySIM object once per window
            # and reuse it for all three statistics.
            per_window = [PolySIM(w[i]) for i in range(len(w))]
            window_pi = np.around(
                [ps.thetapi() for ps in per_window], decimals=3)
            window_singleton = np.around(
                [ps.numsingletons() for ps in per_window], decimals=3)
            window_tajimasD = np.around(
                [ps.tajimasd() for ps in per_window], decimals=3)

            # add data to output
            self.pi.append(np.append(actual_gen, window_pi))
            self.singleton.append(np.append(actual_gen, window_singleton))
            self.tajimasD.append(np.append(actual_gen, window_tajimasD))
        self.counter += 1
Esempio n. 2
0
def get_summstats(pop, repid, nsam, temp):
    """
    The 'genome scan' bit: per-window summary statistics for one replicate.

    ``nsam`` individuals are drawn without replacement, their neutral and
    selected mutations are merged and sorted by the first field of each entry,
    and every locus is cut into windows of size 1.0 with step 1.0 between
    its boundaries.  For each window the function records Tajima's D, H',
    pi, the Garud statistics H1/H12/H2H1, the mean raw nSL, and the
    standardized nSL score with the largest absolute value over all
    frequency bins.

    :param pop: population; ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read.
    :param repid: replicate identifier, copied into every output row.
    :param nsam: number of individuals to sample.  The nSL frequency bins
        span ``0 .. 2 * nsam`` in steps of 5.
    :param temp: structured array used as a scratch buffer.  Row ``i`` is
        overwritten with the record for window ``i`` and a copy of the whole
        buffer is appended to the result after each locus.
        NOTE(review): rows past the number of windows at a locus are not
        cleared, so this presumably relies on ``len(temp)`` matching the
        window count of every locus -- confirm with the caller.
    :return: structured array of dtype ``temp.dtype`` holding the
        concatenated per-locus copies of ``temp``.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)

    # Bin edges depend only on the sample size, so build them once.  They
    # are derived from the ``nsam`` argument rather than a module global so
    # the binning always matches the sample actually drawn.
    count_bin_edges = np.arange(0, 2 * nsam, 5)

    # Collect per-locus chunks and concatenate once at the end.  The leading
    # empty array keeps the dtype and the empty-input result unchanged.
    chunks = [np.array([], dtype=temp.dtype)]
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        w = Windows(SimData(neut), 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            ps = PolySIM(w[i])
            gs = garudStats(w[i])

            # Keep only finite scores whose third field exceeds 3
            # (presumably the derived allele count -- confirm against the
            # nSLiHS return format).
            nSL = np.array([
                rec for rec in nSLiHS(w[i])
                if np.isfinite(rec[0]) and rec[2] > 3
            ])
            mean_nSL = np.nan
            max_nSL = np.nan
            if len(nSL) > 0:
                mean_nSL = nSL[:, 0].mean()
                bins = np.digitize(nSL[:, 2], count_bin_edges)
                for b in set(bins):
                    binscores = nSL[:, 0][bins == b]
                    if len(binscores) > 1:
                        bmean = binscores.mean()
                        bin_sd = binscores.std()
                        if bin_sd > 0.0:
                            # Standardize within the bin, then keep the
                            # signed score that is largest in magnitude
                            # (first one on ties).
                            binscores = (binscores - bmean) / bin_sd
                            bmax = binscores[np.argmax(np.abs(binscores))]
                            if (np.isnan(max_nSL)
                                    or np.abs(bmax) > np.abs(max_nSL)):
                                max_nSL = bmax
            temp[i] = np.array(
                [(locus, int(i), repid, pop.generation, ps.tajimasd(),
                  ps.hprime(), ps.thetapi(), gs['H1'], gs['H12'], gs['H2H1'],
                  mean_nSL, max_nSL)],
                dtype=temp.dtype)
        chunks.append(temp.copy())
    return np.concatenate(chunks)
Esempio n. 3
0
def get_outlier_nSL(pop, repid, nsam):
    """
    The 'genome scan' bit: fraction of outlier nSL scores per window.

    ``nsam`` individuals are drawn without replacement and only the neutral
    mutations of each locus are used.  Every locus is cut into windows of
    size 1.0 with step 1.0 between its boundaries.  Within each window the
    raw nSL scores are binned by their third field in steps of 5, scores are
    standardized within each bin that has more than one score and a
    positive, finite standard deviation, and the fractions of standardized
    scores with absolute value >= 2 and >= 3 are recorded.

    :param pop: population; ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read.
    :param repid: replicate identifier, copied into every record.
    :param nsam: number of individuals to sample.  The frequency bins span
        ``0 .. 2 * nsam``.
    :return: list with one ``Datum(repid, generation, locus, window,
        frac_ge_2, frac_ge_3)`` per window.  Both fractions are 0 when no
        score could be standardized.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)

    # Bin edges depend only on the sample size, so build them once, from the
    # ``nsam`` argument rather than a module global.
    count_bin_edges = np.arange(0, 2 * nsam, 5)

    rv = []
    for locus_index, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        # Selected mutations are deliberately left out of this scan.
        neut, _ = si
        w = Windows(SimData(neut), 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            # Keep only finite scores whose third field exceeds 3
            # (presumably the derived allele count -- confirm against the
            # nSLiHS return format).
            nSL = np.array([
                rec for rec in nSLiHS(w[i])
                if np.isfinite(rec[0]) and rec[2] > 3
            ])
            nbig2 = 0
            nbig3 = 0
            nscores = 0
            if len(nSL) > 0:
                bins = np.digitize(nSL[:, 2], count_bin_edges)
                for b in set(bins):
                    binscores = nSL[:, 0][bins == b]
                    if len(binscores) > 1:
                        bmean = binscores.mean()
                        bin_sd = binscores.std()
                        if bin_sd > 0.0 and np.isfinite(bin_sd):
                            abs_z = np.abs((binscores - bmean) / bin_sd)
                            nscores += len(abs_z)
                            nbig2 += int(np.count_nonzero(abs_z >= 2.0))
                            nbig3 += int(np.count_nonzero(abs_z >= 3.0))
            if nscores > 0:
                rv.append(
                    Datum(repid, pop.generation, locus_index, i,
                          nbig2 / nscores, nbig3 / nscores))
            else:
                rv.append(Datum(repid, pop.generation, locus_index, i, 0, 0))
    return rv
Esempio n. 4
0
def get_omega_max_per_window(pop, repid, nsam):
    """
    The 'genome scan' bit: omega_max for every window of every locus.

    ``nsam`` individuals are drawn without replacement; for each locus the
    neutral and selected mutations are merged, sorted by the first field of
    each entry, and cut into windows of size 1.0 with step 1.0 between the
    locus boundaries.

    :param pop: population; ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read.
    :param repid: replicate identifier, copied into every record.
    :param nsam: number of individuals to sample.
    :return: list with one ``Datum(repid, generation, locus, window, value)``
        per window, where ``value`` is element 0 of ``omega_max``'s result.
    """
    sampled = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, sampled)
    records = []
    for locus_index, (mutations, bounds) in enumerate(
            zip(per_locus, pop.locus_boundaries)):
        neutral, selected = mutations
        neutral.extend(selected)
        merged = sorted(neutral, key=lambda site: site[0])
        left, right = bounds[0], bounds[1]
        windows = Windows(SimData(merged), 1.0, 1.0, left, right)
        for window_index in range(len(windows)):
            omega = omega_max(windows[window_index])
            records.append(
                Datum(repid, pop.generation, locus_index, window_index,
                      omega[0]))
    return records
Esempio n. 5
0
def get_summstats(pop, nsam, generation):
    """
    The 'genome scan' bit: Tajima's D, H' and Garud statistics per window.

    ``nsam`` individuals are drawn without replacement; for each locus the
    neutral and selected mutations are merged, sorted by the first field of
    each entry, and cut into windows of size 1.0 with step 1.0 between the
    locus boundaries.

    :param pop: population; ``pop.N`` and ``pop.locus_boundaries`` are read.
    :param nsam: number of individuals to sample.
    :param generation: generation label stored in every record.
    :return: list with one ``GenomeScanDataPoint(generation, locus, window,
        tajimasd, hprime, H1, H12, H2H1)`` per window.
    """
    sampled = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, sampled)
    points = []
    for locus, (mutations, bounds) in enumerate(
            zip(per_locus, pop.locus_boundaries)):
        neutral, selected = mutations
        neutral.extend(selected)
        merged = sorted(neutral, key=lambda site: site[0])
        windows = Windows(SimData(merged), 1.0, 1.0, bounds[0], bounds[1])
        for window_index in range(len(windows)):
            current = windows[window_index]
            poly = PolySIM(current)
            garud = garudStats(current)
            points.append(
                GenomeScanDataPoint(generation, locus, window_index,
                                    poly.tajimasd(), poly.hprime(),
                                    garud['H1'], garud['H12'],
                                    garud['H2H1']))
    return points
Esempio n. 6
0
def get_summstats(pop, repid, nsam):
    """
    The 'genome scan' bit: mean standardized nSL per interior window.

    The procedure is:

    1. Get raw nSL for all windows at all loci.
    2. For each locus, collect the values from the first and last window
       to use as the reference for normalizing.
    3. For every other window, standardize its scores against the
       reference mean/sd of the matching frequency bin and record the
       mean z-score.

    :param pop: population; ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read.
    :param repid: replicate identifier, copied into every record.
    :param nsam: number of individuals to sample.  The frequency bins span
        ``0 .. 2 * nsam`` in steps of 10.
    :return: list with one ``DataRecord(generation, repid, locus, window,
        mean_z)`` per interior window that had at least one usable score.
        ``mean_z`` is NaN when none of the window's bins has reference
        statistics.
    :raises AssertionError: if a locus' ``SimData.size()`` is not
        ``2 * nsam``.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)

    # Bin edges depend only on the sample size, so build them once, from the
    # ``nsam`` argument rather than a module global.
    count_bin_edges = np.arange(0, 2 * nsam, 10)

    reference_daf = []
    reference_values = []
    temp = []
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        sd = SimData(neut)
        assert sd.size() == 2 * nsam, "sample size error"
        w = Windows(sd, 1.0, 1.0, bi[0], bi[1])
        last_window = len(w) - 1
        for i in range(len(w)):
            # Filter out non-finite values and entries whose third field is
            # not greater than 3 (presumably the derived allele count --
            # confirm against the nSLiHS return format).
            nSL = np.array([
                rec for rec in nSLiHS(w[i])
                if np.isfinite(rec[0]) and rec[2] > 3
            ])
            if len(nSL) == 0:
                continue
            if i == 0 or i == last_window:
                reference_values.extend(nSL[:, 0].tolist())
                reference_daf.extend(nSL[:, 2].tolist())
            else:
                temp.append(TempRecord(locus, i, nSL))

    # bin the reference data
    rdaf_bins = np.digitize(np.array(reference_daf), count_bin_edges)
    rstats = np.array(reference_values)

    # Per-bin (mean, sd) of the reference scores; bins whose sd is zero or
    # non-finite cannot be used for standardizing and are skipped.
    mean_sd = {}
    for b in set(rdaf_bins):
        in_bin = rstats[rdaf_bins == b]
        m = in_bin.mean()
        sdev = in_bin.std()
        if np.isfinite(sdev) and sdev > 0.0:
            mean_sd[b] = (m, sdev)

    rv = []
    # package up the data
    for t in temp:
        tb = np.digitize(t.values[:, 2], count_bin_edges)
        zscores = []
        for b in set(tb):
            if b in mean_sd:
                m, sdev = mean_sd[b]
                zscores.append((t.values[:, 0][tb == b] - m) / sdev)
        # No usable bin: report NaN explicitly instead of taking the mean of
        # an empty array, which yields NaN but also emits a RuntimeWarning.
        mz = np.concatenate(zscores).mean() if zscores else np.nan
        rv.append(DataRecord(pop.generation, repid, t.locus, t.window, mz))
    return rv
Esempio n. 7
0
# In[30]:

##Manual windowing: split the region [0, 3) into windows of width 0.1.
##Window edges are derived from an integer window index instead of
##repeatedly adding 0.1 to a float: accumulated rounding error would shift
##the edges and can change how many windows the loop produces.
window_size = 0.1
region_end = 3
nwindows = int(round(region_end / window_size))
for i in samples:
    windows = []
    for k in range(nwindows):
        start = k * window_size
        stop = (k + 1) * window_size
        ##We will only look at neutral mutations, which are element 0 of each sample
        window = [j[0] for j in i[0] if start <= j[0] < stop]
        windows.append(window)
    ##We now have a full set of windows that we can do something with
    print(len(windows))  ##There should be 30, and many will be empty

# ### Using [pylibseq](https://github.com/molpopgen/pylibseq)

# In[31]:

from libsequence.windows import Windows
from libsequence.polytable import simData
for i in samples:
    ##pylibseq/libsequence cannot consume our list of tuples directly,
    ##so wrap element 0 of the sample in a simData object first.
    neutral = simData(i[0])
    ##Same layout as the manual version: width 0.1, step 0.1, from 0 to 3.
    windows = Windows(neutral, 0.1, 0.1, 0, 3)
    ##Now, you can analyze the windows, etc.
    n_windows = len(windows)
    print(n_windows)

# Well, the pylibseq version is clearly more compact.  Of course, you can/should abstract the pure Python version into a standalone function.
#
# Why would you ever use the manual version?  It can save you memory.  The pylibseq version constructs an iterable list of windows, meaning that there is an object allocated for each window.  For the manual version above, we grew a list of objects, but we could just as easily have processed each window as it was built and let it go out of scope.