Example #1
0
def process_replicate(argtuple):
    """
    Compute summary statistics for every pickled population stored in
    one member of a tar archive.

    The member is extracted into the current working directory, read as
    an lzma-compressed stream of pickled ``(rep, pop)`` tuples, and
    deleted again before returning (also when an error occurs).

    :param argtuple: A ``(seed, args, infile)`` tuple.  ``seed`` seeds
        numpy's global RNG, ``args`` must provide ``tarfile`` (path of
        the archive) and ``nsam`` (number of individuals to sample),
        and ``infile`` is the name of the archive member to process.
    :return: A list of dicts with keys ``rep``, ``locus``,
        ``generation``, ``stat`` and ``value``; one dict per
        population, locus and statistic.
    :raises KeyError: If ``infile`` is not a member of the archive.
    """
    seed, args, infile = argtuple
    np.random.seed(seed)
    # Close the archive as soon as the member has been extracted.
    with tarfile.open(args.tarfile, 'r') as tf:
        tf.extract(tf.getmember(infile))
    rv = []
    statnames = ['thetapi', 'tajimasd', 'hprime']
    try:
        with lzma.open(infile, 'rb') as f:
            while True:
                try:
                    # NOTE(security): pickle must only be used on
                    # archives produced by trusted simulation runs.
                    rep, pop = pickle.load(f)
                except EOFError:
                    # End of the pickle stream.  Any other error is a
                    # real problem and is allowed to propagate instead
                    # of silently truncating the results.
                    break
                ind = np.random.choice(pop.N, args.nsam, replace=False)
                s = sample_separate(pop, ind)
                # Only loci 0, 5, and len(s)-1 can have
                # any neutral variants
                for locus in [0, 5, len(s) - 1]:
                    ps = PolySIM(SimData(s[locus][0]))
                    for stat in statnames:
                        rv.append({'rep': rep,
                                   'locus': locus,
                                   'generation': pop.generation,
                                   'stat': stat,
                                   'value': getattr(ps, stat)()})
    finally:
        # Always clean up the extracted temporary file.
        os.remove(infile)
    return rv
def get_summstats(pop, repid, nsam, temp):
    """
    The 'genome scan' bit.

    Samples ``nsam`` individuals from ``pop`` and, for every window of
    every locus, records Tajima's D, H', theta-pi, the Garud
    statistics and the mean raw nSL plus the most extreme binned nSL
    z-score.

    :param pop: Population; must provide ``N``, ``locus_boundaries``
        and ``generation``.
    :param repid: Replicate identifier stored in every row.
    :param nsam: Number of individuals to sample.
    :param temp: Preallocated structured numpy array used as the
        per-locus row buffer.  Row ``i`` is overwritten with window
        ``i`` and the *whole* buffer is appended to the result after
        each locus.
    :return: A numpy array of dtype ``temp.dtype`` holding the rows of
        all loci.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    rv = np.array([], dtype=temp.dtype)
    # Bin edges for field 2 of the nSL records (presumably the derived
    # allele count -- TODO confirm).  Built from the ``nsam`` argument
    # rather than a module-level ``args`` so the function does not
    # depend on hidden global state.
    count_bins = np.arange(0, 2 * nsam, 5)
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        # Merge selected variants into the neutral ones, ordered by the
        # first field of each record.
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        w = Windows(SimData(neut), 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            window = w[i]
            ps = PolySIM(window)
            gs = garudStats(window)

            # Keep finite scores whose field 2 exceeds 3.
            nSL = np.array([
                x for x in nSLiHS(window) if np.isfinite(x[0]) and x[2] > 3
            ])
            mean_nSL = np.nan
            max_nSL = np.nan
            if len(nSL) > 0:
                scores = nSL[:, 0]
                mean_nSL = scores.mean()
                bins = np.digitize(nSL[:, 2], count_bins)
                # np.unique gives a deterministic (sorted) bin order.
                for b in np.unique(bins):
                    binscores = scores[bins == b]
                    if len(binscores) > 1:
                        bin_sd = binscores.std()
                        if bin_sd > 0.0:
                            z = (binscores - binscores.mean()) / bin_sd
                            # First z-score with the largest magnitude.
                            bmax = z[np.argmax(np.abs(z))]
                            if (np.isnan(max_nSL)
                                    or np.abs(bmax) > np.abs(max_nSL)):
                                max_nSL = bmax
            temp[i] = np.array(
                [(locus, int(i), repid, pop.generation, ps.tajimasd(),
                  ps.hprime(), ps.thetapi(), gs['H1'], gs['H12'], gs['H2H1'],
                  mean_nSL, max_nSL)],
                dtype=temp.dtype)
        # NOTE(review): the entire buffer is appended, so this assumes
        # len(temp) equals the number of windows per locus -- confirm
        # against the caller.
        rv = np.concatenate([rv, temp.copy()])
    return rv
def get_outlier_nSL(pop, repid, nsam):
    """
    The 'genome scan' bit.

    Samples ``nsam`` individuals from ``pop`` and, for every window of
    every locus, reports the fraction of binned nSL z-scores whose
    absolute value is at least 2 and at least 3.  Only the neutral
    variants of each locus are used.

    :param pop: Population; must provide ``N``, ``locus_boundaries``
        and ``generation``.
    :param repid: Replicate identifier stored in every record.
    :param nsam: Number of individuals to sample.
    :return: A list with one ``Datum(repid, generation, locus, window,
        frac_ge_2, frac_ge_3)`` per window; both fractions are 0 when
        no z-score could be computed for the window.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    rv = []
    # Bin edges for field 2 of the nSL records (presumably the derived
    # allele count -- TODO confirm).  Built from the ``nsam`` argument
    # rather than a module-level ``args`` so the function does not
    # depend on hidden global state.
    count_bins = np.arange(0, 2 * nsam, 5)
    for locus_index, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, _sel = si
        w = Windows(SimData(neut), 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            # Keep finite scores whose field 2 exceeds 3.
            nSL = np.array([
                X for X in nSLiHS(w[i]) if np.isfinite(X[0]) and X[2] > 3
            ])
            nbig2 = 0
            nbig3 = 0
            nscores = 0
            if len(nSL) > 0:
                scores = nSL[:, 0]
                bins = np.digitize(nSL[:, 2], count_bins)
                for b in np.unique(bins):
                    binscores = scores[bins == b]
                    if len(binscores) > 1:
                        bin_sd = binscores.std()
                        if bin_sd > 0.0 and np.isfinite(bin_sd):
                            z = np.abs(
                                (binscores - binscores.mean()) / bin_sd)
                            nscores += len(z)
                            nbig2 += int(np.count_nonzero(z >= 2.0))
                            nbig3 += int(np.count_nonzero(z >= 3.0))
            if nscores > 0:
                rv.append(
                    Datum(repid, pop.generation, locus_index, i,
                          nbig2 / nscores, nbig3 / nscores))
            else:
                rv.append(Datum(repid, pop.generation, locus_index, i, 0, 0))
    return rv
Example #4
0
def get_omega_max_per_window(pop, repid, nsam):
    """
    The 'genome scan' bit.

    Draws ``nsam`` individuals from ``pop`` without replacement and
    evaluates ``omega_max`` in every window of every locus.

    :param pop: Population; must provide ``N``, ``locus_boundaries``
        and ``generation``.
    :param repid: Replicate identifier stored in every record.
    :param nsam: Number of individuals to sample.
    :return: A list with one ``Datum(repid, generation, locus, window,
        value)`` per window, where ``value`` is element 0 of the
        ``omega_max`` result.
    """
    sampled = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, sampled)
    records = []
    loci = zip(per_locus, pop.locus_boundaries)
    for locus_index, (variants, bounds) in enumerate(loci):
        neutral, selected = variants
        # Pool selected with neutral variants, ordered by first field.
        neutral.extend(selected)
        pooled = sorted(neutral, key=lambda site: site[0])
        windows = Windows(SimData(pooled), 1.0, 1.0, bounds[0], bounds[1])
        for window_index in range(len(windows)):
            omega = omega_max(windows[window_index])
            records.append(
                Datum(repid, pop.generation, locus_index, window_index,
                      omega[0]))
    return records
Example #5
0
def get_summstats(pop, nsam, generation):
    """
    The 'genome scan' bit.

    Draws ``nsam`` individuals from ``pop`` without replacement and
    computes Tajima's D, H' and the Garud statistics in every window
    of every locus.

    :param pop: Population; must provide ``N`` and
        ``locus_boundaries``.
    :param nsam: Number of individuals to sample.
    :param generation: Generation label stored in every record.
    :return: A list with one ``GenomeScanDataPoint(generation, locus,
        window, tajimasd, hprime, H1, H12, H2H1)`` per window.
    """
    sampled = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, sampled)
    points = []
    loci = zip(per_locus, pop.locus_boundaries)
    for locus_id, (variants, bounds) in enumerate(loci):
        neutral, selected = variants
        # Pool selected with neutral variants, ordered by first field.
        neutral.extend(selected)
        pooled = sorted(neutral, key=lambda site: site[0])
        windows = Windows(SimData(pooled), 1.0, 1.0, bounds[0], bounds[1])
        for window_id in range(len(windows)):
            poly = PolySIM(windows[window_id])
            garud = garudStats(windows[window_id])
            point = GenomeScanDataPoint(
                generation, locus_id, window_id,
                poly.tajimasd(), poly.hprime(),
                garud['H1'], garud['H12'], garud['H2H1'])
            points.append(point)
    return points
Example #6
0
def get_summstats(pop, repid, nsam):
    """
    The 'genome scan' bit.

    The procedure here is:

    1. Get raw nSL for all windows at all loci.
    2. For each locus, collect values from the first/last window
       to use for normalizing.
    3. For all other windows per locus, get the mean z-score.

    :param pop: Population; must provide ``N``, ``locus_boundaries``
        and ``generation``.
    :param repid: Replicate identifier stored in every record.
    :param nsam: Number of individuals to sample.
    :return: A list of ``DataRecord(generation, repid, locus, window,
        mean_z)``, one per non-reference window that has at least one
        usable nSL value.  ``mean_z`` is NaN when none of the window's
        bins has reference statistics.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    # Bin edges for field 2 of the nSL records, shared by the reference
    # and the test windows.  Built from the ``nsam`` argument rather
    # than a module-level ``args`` so the function does not depend on
    # hidden global state.
    count_bins = np.arange(0, 2 * nsam, 10)

    reference_daf = []
    reference_values = []
    temp = []
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        sd = SimData(neut)
        assert sd.size() == 2 * nsam, "sample size error"
        w = Windows(sd, 1.0, 1.0, bi[0], bi[1])
        last_window = len(w) - 1
        for i in range(len(w)):
            # Filter out non-finite values and values whose field 2
            # (the derived allele count, per the original comment) is
            # not greater than 3.
            nSL = np.array([
                x for x in nSLiHS(w[i]) if np.isfinite(x[0]) and x[2] > 3
            ])
            if len(nSL) == 0:
                continue
            if i == 0 or i == last_window:
                reference_values.extend(nSL[:, 0].tolist())
                reference_daf.extend(nSL[:, 2].tolist())
            else:
                temp.append(TempRecord(locus, i, nSL))

    # bin the reference data
    rdaf_bins = np.digitize(np.array(reference_daf), count_bins)
    rstats = np.array(reference_values)

    # bin -> (mean, standard deviation) of the reference scores; bins
    # with a zero or non-finite deviation cannot be used to normalize.
    mean_sd = {}
    for b in np.unique(rdaf_bins):
        in_bin = rstats[rdaf_bins == b]
        sdev = in_bin.std()
        if np.isfinite(sdev) and sdev > 0.0:
            mean_sd[b] = (in_bin.mean(), sdev)

    rv = []
    # package up the data
    for t in temp:
        scores = t.values[:, 0]
        tb = np.digitize(t.values[:, 2], count_bins)
        zscores = [(scores[tb == b] - mean_sd[b][0]) / mean_sd[b][1]
                   for b in np.unique(tb) if b in mean_sd]
        # Taking the mean of an empty array yields NaN together with a
        # RuntimeWarning; produce the NaN explicitly instead.
        mz = np.concatenate(zscores).mean() if zscores else np.nan
        rv.append(DataRecord(pop.generation, repid, t.locus, t.window, mz))
    return rv