Beispiel #1
0
def simulate(out_path,
             species,
             model,
             genetic_map,
             seed,
             chrmStr,
             sample_size=20,
             population=0,
             ld_thresh=1.0,
             max_workers=1):
    """Simulate one chromosome with msprime, save it and plot its SFS.

    The tree sequence is dumped to ``out_path`` and the site frequency
    spectrum (zero bin dropped) is plotted to ``out_path + ".sfs.pdf"``.
    When ``ld_thresh`` is below 1.0 the result of
    ``unlinked(ts, ld_thresh, max_workers)`` is pickled to
    ``out_path + ".r2Mask.p"`` and a second spectrum, computed from the
    haplotype rows it selects, is added to the plot.

    Parameters
    ----------
    out_path : str
        Output path of the tree sequence; also the prefix of the mask and
        plot files.
    species
        Object exposing ``genome.chromosomes[chrmStr]``.
    model
        Object whose ``asdict()`` result is passed as keyword arguments to
        ``msp.simulate``.
    genetic_map
        Object whose ``name`` selects the recombination map of the
        chromosome.
    seed
        Random seed handed to ``msp.simulate``.
    chrmStr
        Key of the chromosome to simulate.
    sample_size : int
        Number of samples, all taken at time 0 from ``population``.
    population : int
        Population the samples are drawn from.
    ld_thresh : float
        Threshold passed to ``unlinked``; values >= 1.0 skip the masking.
    max_workers : int
        Passed through to ``unlinked``.
    """
    mask_path = out_path + ".r2Mask.p"
    sfs_path = out_path + ".sfs.pdf"
    chrom = species.genome.chromosomes[chrmStr]
    samples = [msp.Sample(population=population, time=0)] * sample_size
    print("Simulating...")
    ts = msp.simulate(samples=samples,
                      recombination_map=chrom.recombination_map(
                          genetic_map.name),
                      mutation_rate=chrom.default_mutation_rate,
                      random_seed=seed,
                      **model.asdict())
    ts.dump(out_path)
    haps = allel.HaplotypeArray(ts.genotype_matrix())
    # First spectrum: all sites, without the zero-count bin.
    SFSs = [allel.sfs(haps.count_alleles()[:, 1])[1:]]
    print("Simulation finished!")
    if ld_thresh < 1.0:
        ul = unlinked(ts, ld_thresh, max_workers)
        # Context manager so the mask is flushed and the handle closed even
        # if pickling fails.
        with open(mask_path, "wb") as mask_file:
            pickle.dump(ul, mask_file)
        # Second spectrum: only the rows selected by the mask.
        SFSs.append(allel.sfs(haps[ul, :].count_alleles()[:, 1])[1:])
    plot_sfs(SFSs, sfs_path)
Beispiel #2
0
 def test_sfs(self):
     """allel.sfs gives the same spectrum for a list and for integer arrays."""
     derived_counts = [0, 1, 2, 1]
     expected = [1, 2, 1]
     # plain Python list input
     aeq(expected, allel.sfs(derived_counts))
     # the same counts as NumPy arrays of several integer dtypes
     for code in ('u2', 'i2', 'u8', 'i8'):
         typed_counts = np.asarray(derived_counts, dtype=code)
         aeq(expected, allel.sfs(typed_counts))
Beispiel #3
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1, mask_file=None):
        """
        Converts the specified tskit tree sequences to text files used by
        stairway plot.

        For every tree sequence a plot with two spectra (all sites, and
        sites outside the mask) is written to ``<path>.sfs.pdf``. The
        derived allele counts of the unmasked sites are pooled over all
        tree sequences, once as observed and ``num_bootstraps`` times
        resampled with replacement over sites.

        :param ts_path: iterable of tree sequence file paths (strings).
        :param num_bootstraps: number of bootstrap replicates written in
            addition to the observed spectrum.
        :param mask_file: optional tab-separated file without header.
            Column 0 is compared with the tree sequence file name up to its
            first ".", columns 1 and 2 are the interval bounds of the masked
            regions. The summed interval lengths are subtracted from the
            total sequence length.
        :return: list of the written files, ``self.workdir / "sfs_<i>.txt"``,
            index 0 being the observed spectrum.
        """
        derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0

        # The mask table is the same for every tree sequence: read it once.
        mask_table = None
        if mask_file:
            mask_table = pd.read_csv(mask_file, sep="\t", header=None)

        for ts_p in ts_path:
            ts = tskit.load(ts_p)
            total_length += ts.sequence_length
            num_samples = ts.num_samples
            haps = ts.genotype_matrix()

            # One flag per site: the genotype matrix and ts.variants() are
            # both indexed by site, so the mask must have num_sites entries
            # (the mutation count can differ from the site count).
            masked = np.full(ts.num_sites, False)
            if mask_table is not None:
                chrom = ts_p.split("/")[-1].split(".")[0]
                sub = mask_table[mask_table[0] == chrom]
                mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
                snp_locs = [int(x.site.position) for x in ts.variants()]
                # IntervalIndex.contains is elementwise over the intervals in
                # current pandas; np.any collapses it to "inside any
                # interval" and also accepts the scalar older versions gave.
                in_mask = [bool(np.any(mask_ints.contains(x)))
                           for x in snp_locs]
                masked = np.logical_or(masked, in_mask)
                total_length -= np.sum(mask_ints.length)

            retain = np.logical_not(masked)
            # unmasked SFS first, then the SFS of the retained sites
            all_counts = allel.HaplotypeArray(haps).count_alleles()
            allele_counts = allel.HaplotypeArray(haps[retain, :]).count_alleles()
            SFSs = [
                allel.sfs(all_counts[:, 1])[1:],
                allel.sfs(allele_counts[:, 1])[1:],
            ]
            sfs_path = ts_p + ".sfs.pdf"
            plots.plot_sfs(SFSs, sfs_path)

            # Bootstrap allele counts
            derived_counts_all[0].extend(allele_counts[:, 1])
            nsites = np.shape(allele_counts)[0]
            for j in range(1, num_bootstraps + 1):
                bootset = np.random.choice(np.arange(0, nsites, 1), nsites, replace=True)
                bootac = allele_counts[bootset, :]
                derived_counts_all[j].extend(bootac[:, 1])

        # Get the SFS minus the 0 bin and write output
        stairway_files = []
        for index, derived_counts in enumerate(derived_counts_all):
            sfs = allel.sfs(derived_counts)[1:]
            filename = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, sfs, filename)
            stairway_files.append(filename)

        return stairway_files
Beispiel #4
0
def test_sfs():
    """Check allel.sfs on list and array input, with and without explicit n."""
    derived_counts = [0, 1, 2, 1]

    # n left to allel.sfs: list input first, then several integer dtypes
    inferred = [1, 2, 1]
    assert_array_equal(inferred, allel.sfs(derived_counts))
    for code in ('u2', 'i2', 'u8', 'i8'):
        typed_counts = np.asarray(derived_counts, dtype=code)
        assert_array_equal(inferred, allel.sfs(typed_counts))

    # explicitly provide number of chromosomes
    assert_array_equal([1, 2, 1, 0], allel.sfs(derived_counts, n=3))

    # n=1 with these counts is expected to raise
    with pytest.raises(ValueError):
        allel.sfs(derived_counts, n=1)
Beispiel #5
0
def asfs_stats(gt, pos, fold):
    """Return the normalised allele frequency spectrum.

    ``gt`` and ``pos`` are handed to ``get_seg``; the spectrum is computed
    from the first value it returns, and the second is ignored here. The
    counts are divided by their sum, so the result sums to 1.

    Future implementations will utilize the breakpoints from msprime tree
    object to find unlinked positions.

    Parameters
    ----------
    gt
        Genotype data accepted by ``get_seg`` (presumably a scikit-allel
        array -- TODO confirm against ``get_seg``).
    pos
        Positions accepted by ``get_seg``.
    fold : bool
        If True, use the folded spectrum without its first bin; otherwise
        use the spectrum of allele 1 without its first and last bins.

    Returns
    -------
    sfs
        Spectrum counts divided by their total.

    """
    # TODO: random sample OR use msprime breakpoint to reduce linkage
    seg_gt, _ = get_seg(gt, pos)
    counts = seg_gt.count_alleles()
    # size of the second axis is what allel receives as ``n``
    n = seg_gt.shape[1]
    if fold:
        spectrum = allel.sfs_folded(counts, n)[1:]
    else:
        spectrum = allel.sfs(counts[:, 1], n)[1:-1]
    return spectrum / np.sum(spectrum)
Beispiel #6
0
def asfsStatsSeg(gt, pops, chrm, rand=True, plot=False):
    """Aggregate SFS: fraction of singletons and doubletons per population.

    For each entry of ``pops`` the genotypes are restricted to those
    columns and to segregating rows, a set of rows is drawn at random
    (at most 100000 when ``rand`` is true, otherwise all of them), and the
    spectrum of allele 1 is computed on the drawn rows.

    Returns a tuple ``(aSFS1, aSFS2)`` of lists holding, per population,
    bin 1 and bin 2 of the spectrum divided by the spectrum total.
    ``chrm`` is not used in this function.
    """
    print("asfs")
    singleton_fracs = []
    doubleton_fracs = []
    max_snps = 100000  # number of SNPs to choose randomly
    for columns in pops:
        pop_gt = gt.take(columns, axis=1)
        is_seg = pop_gt.count_alleles().is_segregating()
        seg_gt = pop_gt.compress(is_seg)
        n_seg = seg_gt.shape[0]
        # random snps
        if rand:
            try:
                picked = np.random.choice(n_seg, max_snps, replace=False)
            except ValueError:
                # the draw of max_snps rows was rejected: take every row
                picked = np.random.choice(n_seg, n_seg, replace=False)
        else:
            picked = np.random.choice(n_seg, n_seg, replace=False)
        picked.sort()
        spectrum = allel.sfs(seg_gt.take(picked, axis=0).count_alleles()[:, 1])
        print(spectrum)
        if plot:
            _, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_sfs(spectrum, ax=ax)
        total = np.sum(spectrum)
        singleton_fracs.append(spectrum[1] / total)
        doubleton_fracs.append(spectrum[2] / total)
    return (singleton_fracs, doubleton_fracs)
Beispiel #7
0
def sfs_plot(c, ac_subpops, save=True, fold=True, scale=True):
    """Compute and plot one site frequency spectrum per population.

    For every key of ``ac_subpops`` the allele counts are restricted to
    rows that are segregating and whose highest allele index is 1. The
    spectrum is folded and/or scaled according to ``fold`` and ``scale``.
    All spectra are drawn on one axis, and the figure is saved as
    ``ScaledSFS-<c>.pdf`` when ``save`` is true.

    Notes
    -----
    * Counts should be filtered on segregating sites if only a subset of
      populations is used.
    * NOTE(review): the plot always goes through
      ``plot_sfs_folded_scaled`` and the title always reads "Scaled
      folded", whatever ``fold`` and ``scale`` are -- confirm this is
      intended.

    Returns
    -------
    dict
        Population key -> computed spectrum.
    """
    sfsdict = {}
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.despine(ax=ax, offset=10)
    for pop in ac_subpops.keys():
        pop_counts = ac_subpops[pop]
        keep = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        print('SFS : retaining', np.count_nonzero(keep), 'SNPs')
        ac1 = allel.AlleleCountsArray(pop_counts.compress(keep, axis=0))
        if fold:
            # folded spectra take the full allele counts array
            compute = allel.sfs_folded_scaled if scale else allel.sfs_folded
            spectrum = compute(ac1)
        else:
            # unfolded spectra take the counts of allele 1 only
            compute = allel.sfs_scaled if scale else allel.sfs
            spectrum = compute(ac1[:, 1])
        sfsdict[pop] = spectrum
        allel.stats.plot_sfs_folded_scaled(spectrum, ax=ax, label=pop,
                                           n=ac1.sum(axis=1).max())
    ax.legend()
    ax.set_title('{} Scaled folded site frequency spectra'.format(c))
    ax.set_xlabel('minor allele frequency')
    if save:
        fig.savefig("ScaledSFS-{}.pdf".format(c), bbox_inches='tight')
    return sfsdict
def site_frequency_spectrum(genotypes: np.ndarray, population: str=None) -> np.ndarray:
    """Plot the site frequency spectrum and return it normalised to sum 1.

    Every axis of ``genotypes`` after the first is flattened and summed to
    give one allele count per row. The product of those trailing axis
    sizes is passed to ``allel.sfs`` as the number of chromosomes.

    The figure is saved in ``FIGURES_DIR`` as ``<population>.sfs.png``, with
    spaces in the name replaced by underscores, or as ``sfs.png`` when
    ``population`` is None. The current figure is cleared afterwards.

    Parameters
    ----------
    genotypes : np.ndarray
        Array whose first axis indexes sites.
    population : str, optional
        Name used for the plot title and the file name.

    Returns
    -------
    np.ndarray
        Spectrum divided by its sum.
    """
    allele_counts = genotypes.reshape(genotypes.shape[0], -1).sum(1)
    # np.prod replaces np.product, which was removed in NumPy 2.0; int()
    # keeps the chromosome count an integer even for an empty shape tuple.
    n_chromosomes = int(np.prod(genotypes.shape[1:]))
    sfs = allel.sfs(allele_counts, n_chromosomes)
    if population is not None:
        plt.title('{} site frequency spectrum'.format(population))
        file_name = '{}.sfs.png'.format(population.replace(' ', '_'))
    else:
        # population is optional: fall back to a generic file name instead
        # of failing on None.replace.
        file_name = 'sfs.png'
    ax = plt.gca()
    allel.plot_sfs(sfs, ax=ax)
    plt.savefig(os.path.join(FIGURES_DIR, file_name))
    plt.clf()
    return sfs / sfs.sum()
Beispiel #9
0
def sfs(directory, vcffile):
    """Create a site frequency spectrum figure from a VCF file.

    Reads ``directory + vcffile + ".vcf"``, counts alleles over all
    genotype calls and plots the spectrum of allele 1. The figure is saved
    to ``directory + "/" + vcffile + "_sfs.jpg"``.

    Parameters
    ----------
    directory : str
        Directory prefix of the VCF file. It is concatenated directly with
        ``vcffile`` for reading, so it needs a trailing separator.
    vcffile : str
        VCF file name without the ``.vcf`` extension.
    """
    callset = allel.read_vcf(directory + vcffile + ".vcf")

    gt = allel.GenotypeArray(callset['calldata/GT'])
    ac = gt.count_alleles()[:]

    derived = ac[:, 1]

    sfslist = allel.sfs(derived)

    # Bin k of the spectrum holds the variants with k copies of allele 1,
    # so the x values start at 0. Starting at 1 shifted every point by one.
    k_values = list(range(len(sfslist)))

    plt.plot(k_values, list(sfslist))
    plt.xlabel("K value")
    plt.ylabel("Number of variants")
    plt.savefig(directory + "/" + vcffile + "_sfs.jpg")
Beispiel #10
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1):
        """
        Converts the specified tskit tree sequences to text files used by
        stairway plot.

        If a pickled mask ``<path>.unlinkedMask.p`` exists next to a tree
        sequence, only the genotype rows it selects are counted. Derived
        allele counts are pooled over all tree sequences, once as observed
        and ``num_bootstraps`` times resampled with replacement over sites.

        :param ts_path: iterable of tree sequence file paths (strings).
        :param num_bootstraps: number of bootstrap replicates written in
            addition to the observed spectrum.
        :return: list of the written files, ``self.workdir / "sfs_<i>.txt"``,
            index 0 being the observed spectrum.
        """
        derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0
        for ts_p in ts_path:
            ts = tskit.load(ts_p)
            total_length += ts.sequence_length
            num_samples = ts.num_samples
            haps = ts.genotype_matrix()

            # Mask high-ld sites and return genotypes
            mask_path = ts_p + ".unlinkedMask.p"
            if os.path.exists(mask_path):
                # NOTE: unpickling can execute arbitrary code; only mask
                # files produced by this pipeline should be placed here.
                # The context manager closes the handle, which the bare
                # open() call never did.
                with open(mask_path, "rb") as mask_file:
                    ul = pickle.load(mask_file)
                haps = haps[ul, :]
            allele_counts = allel.HaplotypeArray(haps).count_alleles()

            # Bootstrap allele counts
            derived_counts_all[0].extend(allele_counts[:, 1])
            nsites = np.shape(allele_counts)[0]
            for j in range(1, num_bootstraps + 1):
                bootset = np.random.choice(np.arange(0, nsites, 1),
                                           nsites,
                                           replace=True)
                bootac = allele_counts[bootset, :]
                derived_counts_all[j].extend(bootac[:, 1])

        # Get the SFS minus the 0 bin and write output
        stairway_files = []
        for index, derived_counts in enumerate(derived_counts_all):
            sfs = allel.sfs(derived_counts)[1:]
            filename = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, sfs, filename)
            stairway_files.append(filename)

        return stairway_files
Beispiel #11
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1):
        """
        Write stairway plot input files for the given tskit tree sequences.

        Haplotypes of each tree sequence are paired into genotypes with
        ploidy 2 and their alleles counted. Counts of allele 1 are pooled
        over all tree sequences, once as observed and ``num_bootstraps``
        times resampled with replacement over sites. One file
        ``self.workdir / "sfs_<i>.txt"`` is written per pooled set (index 0
        is the observed one) and the list of those files is returned.
        """
        derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0

        for ts_file in ts_path:
            tree_seq = tskit.load(ts_file)
            total_length += tree_seq.sequence_length
            num_samples = tree_seq.num_samples

            # count alleles on genotypes built from the haplotype matrix
            hap_array = allel.HaplotypeArray(tree_seq.genotype_matrix())
            counts = hap_array.to_genotypes(ploidy=2).count_alleles()
            derived_counts_all[0].extend(counts[:, 1])

            # one resampling over sites per bootstrap slot
            n_rows = np.shape(counts)[0]
            for slot in range(1, num_bootstraps + 1):
                draw = np.random.choice(np.arange(0, n_rows, 1),
                                        n_rows,
                                        replace=True)
                resampled = counts[draw, :]
                derived_counts_all[slot].extend(resampled[:, 1])

        # spectrum without the zero bin, one output file per pooled set
        stairway_files = []
        for index, derived in enumerate(derived_counts_all):
            spectrum = allel.sfs(derived)[1:]
            target = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, spectrum, target)
            stairway_files.append(target)

        return stairway_files
def sfs(haplotype, ac, nindiv=None, folded=False):
    """
    Compute sfs for SNP matrix

    Parameters
    ----------
    haplotype
        SNP matrix; only ``haplotype.shape[1]`` is read, and only when
        ``nindiv`` is not given.
    ac
        Allele counts. The folded spectrum uses the whole array, the
        unfolded one uses ``ac.T[1]``.
    nindiv : int, optional
        Number of frequency classes; defaults to ``haplotype.shape[1]``.
    folded : bool
        If True compute the folded spectrum.

    Returns
    -------
    pandas.DataFrame
        One row per ``N_indiv`` in ``1 .. nindiv - 1`` with the columns
        ``N_indiv``, ``count_SNP`` (number of SNPs), ``i_xi`` (scaled
        spectrum) and ``freq_indiv`` (``N_indiv / nindiv``). Classes
        missing from the spectrum are filled with 0.
    """
    # Identity test: "== None" would compare elementwise on array-likes.
    if nindiv is None:
        nindiv = haplotype.shape[1]
    tmp_df = pd.DataFrame({"N_indiv": range(1, nindiv)})

    # Only the spectrum functions differ between the two modes.
    if folded:
        counts = allel.sfs_folded(ac)
        scaled = allel.sfs_folded_scaled(ac)
    else:
        derived = ac.T[1]
        counts = allel.sfs(derived)
        scaled = allel.sfs_scaled(derived)

    df_sfs = pd.DataFrame(counts, columns=["count_SNP"])
    df_sfs["i_xi"] = scaled
    df_sfs.index.name = "N_indiv"
    df_sfs.reset_index(inplace=True)
    # The right merge keeps exactly the classes 1 .. nindiv - 1.
    # NOTE(review): the integer cast applies to every column, so any
    # fractional i_xi value is truncated -- confirm this is intended.
    df_sfs = df_sfs.merge(tmp_df, on="N_indiv",
                          how="right").fillna(0).astype(int)

    df_sfs["freq_indiv"] = df_sfs.N_indiv / nindiv
    return df_sfs
Beispiel #13
0
                    type=float,
                    help="mutation rate (per base per generation)")
parser.add_argument('generation_time', type=float, help="generation time")
# type=bool would call bool() on the raw string, so every non-empty value
# (including "F") parsed as True. Parse the text of the flag instead.
parser.add_argument('plot',
                    type=lambda text: text.strip().lower() in
                    ("t", "true", "1", "y", "yes"),
                    help='plot Ne ~ t ? T/F')
args = parser.parse_args()

np.random.seed(args.seed)

#read in tree sequence
ts = msp.load(args.infile)

#count alleles, return the SFS minus the 0% bin
haps = np.array(ts.genotype_matrix())
genotypes = allel.HaplotypeArray(haps).to_genotypes(ploidy=2)
allele_counts = genotypes.count_alleles()
sfs = allel.sfs(allele_counts[:, 1])
sfs = sfs[1:]

#tmp directory for input files
# Argument list without a shell, so the output directory name is never
# interpreted as shell syntax; -p tolerates an already existing directory.
subprocess.run(["mkdir", "-p", join(args.outdir, "infiles")])

#write stairwayplot input
infile_stem = splitext(basename(args.infile))[0]
# The context manager flushes and closes the file, which the bare open()
# call never did.
with open(join(args.outdir, "infiles", infile_stem) + "_strwyplt.txt",
          "w") as out:
    #order is name,n_samples,sequence_length,lowest_sfs_bin,highest_sfs_bin
    header = ["msp", str(ts.num_samples), str(int(ts.sequence_length)),
              str(1), str(ts.num_samples - 1)]
    out.write("\t".join(header) + "\n")
    # every bin is followed by a tab, as before
    out.write("".join(str(int(x)) + "\t" for x in sfs))
Beispiel #14
0
# Simulated data: keep the rows labelled "YRI" and count allele 1 per column.
simYRI = simYRI[simdf['pop'] == "YRI"]
simYRI_ac_all = np.array(np.apply_along_axis(sum, 0, simYRI), dtype="i")
# the other allele's count is the remainder out of 100
tmp = np.array(100 - simYRI_ac_all, dtype="i")
simYRI_ac_all = allel.AlleleCountsArray(np.vstack((tmp, simYRI_ac_all)).T)

# Generated data: same steps on the rows of bingen labelled "YRI" in pred.
genYRI = bingen[pred['pop'] == "YRI"]
genYRI_ac_all = np.array(np.apply_along_axis(sum, 0, genYRI), dtype="i")
tmp = np.array(100 - genYRI_ac_all, dtype="i")
genYRI_ac_all = allel.AlleleCountsArray(np.vstack((tmp, genYRI_ac_all)).T)

# Spectra of allele 1 for the three data sets, written side by side.
realsfs = allel.sfs(YRI_ac_all[:, 1])
gensfs = allel.sfs(genYRI_ac_all[:, 1])
simsfs = allel.sfs(simYRI_ac_all[:, 1])
sfs = pd.DataFrame({
    'real': realsfs,
    'VAE': gensfs,
    'simulation': simsfs,
    'bin': np.arange(0, len(realsfs)),
})
sfs.to_csv("out/1kg/1kg_sfs.csv", index=False)

################### LD decay ####################
# histogram of the positions in 100 bins
plt.hist(pos, bins=100)[2]
# bounds of the masked region
maskstart = 4.4e7
maskstop = 4.45e7

#get LD and pairwise distance for a subset of 1000 SNPs