Example 1
def hdfst(c, csize, ac_subpops, pos, plot, blenw=10000, nwindow=100):
    """ Hudson FST
    """
    fstdict = {}
    acdict = {}
    posdict = {}
    for x, y in combinations(ac_subpops.keys(), 2):
        acu = ac_subpops[x] + ac_subpops[y]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print("{} retaining {} SNPs".format("{}-{}".format(x, y),
                                            np.count_nonzero(flt)))
        posflt = pos[flt]
        ac1 = allel.AlleleCountsArray(ac_subpops[x].compress(flt,
                                                             axis=0)[:, :2])
        ac2 = allel.AlleleCountsArray(ac_subpops[y].compress(flt,
                                                             axis=0)[:, :2])
        num, dem = allel.stats.hudson_fst(ac1, ac2)
        snp_fst = num / dem
        fst_hd, se_hd = fstchromHD(ac1, ac2, blenw)
        windlen = int(csize / nwindow)
        fst_windowed = fstwindHD(ac1, ac2, posflt, windlen)
        fstdict["{}-{}".format(x, y)] = (snp_fst, fst_hd, se_hd, fst_windowed)
        posdict["{}-{}".format(x, y)] = posflt
        acdict["{}-{}".format(x, y)] = (ac1, ac2)
    if plot:
        plot_fst(fstdict, list(ac_subpops.keys()), c, csize)
    return (fstdict, acdict, posdict)
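Example 1 depends on project helpers (fstchromHD, fstwindHD, plot_fst) that are not shown. As a hedged, self-contained sketch of the same Hudson FST calculations using only scikit-allel built-ins (synthetic allele counts and window sizes chosen purely for illustration):

import numpy as np
import allel

# toy biallelic allele counts for two populations (rows = variants)
ac1 = allel.AlleleCountsArray([[6, 0], [4, 2], [1, 5], [5, 1]])
ac2 = allel.AlleleCountsArray([[2, 4], [0, 6], [5, 1], [3, 3]])
pos = np.array([100, 250, 400, 900])

# per-variant Hudson FST from the numerator/denominator components
num, den = allel.hudson_fst(ac1, ac2)
snp_fst = num / den

# chromosome-wide average with a block-jackknife standard error
fst, se, vb, vj = allel.average_hudson_fst(ac1, ac2, blen=2)

# windowed estimate in fixed-size windows along the chromosome
fst_windowed, windows, counts = allel.windowed_hudson_fst(pos, ac1, ac2, size=500)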
Example 2
def pairDxy(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=10000, nwindow=100):
    """Calculates DXY
    """
    dxydict = {}
    windlen = int(chrsize / nwindow)
    for x, y in combinations(ac_subpops.keys(), 2):
        # segregating only ?
        acu = ac_subpops[x] + ac_subpops[y]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print("{} retaining {} SNPs".format("{}-{}".format(x, y),
                                            np.count_nonzero(flt)))
        posflt = pos[flt]
        ac1 = allel.AlleleCountsArray(ac_subpops[x].compress(flt,
                                                             axis=0)[:, :2])
        ac2 = allel.AlleleCountsArray(ac_subpops[y].compress(flt,
                                                             axis=0)[:, :2])
        # all sites
#        ac1 = ac_subpops[x]
#        ac2 = ac_subpops[y]
#        posflt = pos
        # whole chrom
        dxy = allel.windowed_divergence(posflt, ac1, ac2, size=blenw,
                                        start=1, stop=chrsize)
        dxy_m, dxy_se, *f = jackknife(dxy[0])
        dxy_windowed = allel.windowed_divergence(posflt, ac1, ac2,
                                                 size=windlen, start=1,
                                                 stop=chrsize)
        dxy4plot = (dxy_windowed[0], dxy_windowed[1])
        dxydict["{}-{}".format(x, y)] = (dxy_m, dxy_se, dxy4plot)
    if plot:
        plot_dxy(dxydict, pop2color, list(ac_subpops.keys()), c, chrsize)
    return(dxydict)
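The core of Example 2 is allel.windowed_divergence; jackknife and plot_dxy are project helpers not shown here. A minimal sketch of the divergence call itself, on made-up positions and counts:

import numpy as np
import allel

ac1 = allel.AlleleCountsArray([[4, 0], [3, 1], [1, 3], [0, 4]])
ac2 = allel.AlleleCountsArray([[0, 4], [2, 2], [4, 0], [1, 3]])
pos = np.array([10, 20, 35, 70])

# mean pairwise divergence (dxy) in non-overlapping 50 bp windows
dxy, windows, n_bases, counts = allel.windowed_divergence(
    pos, ac1, ac2, size=50, start=1, stop=100)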
Example 3
def theta(c,
          chrsize,
          ac_subpops,
          pos,
          pop2color,
          plot=False,
          blenw=10000,
          nwindow=100):
    """
    """
    thetadict = {}
    windlen = int(chrsize / nwindow)
    for x in ac_subpops.keys():
        acu = ac_subpops[x]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('Theta : retaining', np.count_nonzero(flt), 'SNPs')
        posflt = pos[flt]
        ac = allel.AlleleCountsArray(ac_subpops[x].compress(flt,
                                                            axis=0)[:, :2])
        # theta
        theta = allel.windowed_watterson_theta(posflt, ac, size=blenw)
        t_m, t_se, *t = jackknife(theta[0])
        theta_windowed = allel.windowed_watterson_theta(posflt,
                                                        ac,
                                                        size=windlen,
                                                        start=1,
                                                        stop=chrsize)
        thetadict[x] = (t_m, t_se, (theta_windowed[0], theta_windowed[1]))
    if plot:
        div_plot(thetadict, pop2color, list(ac_subpops.keys()), c, chrsize,
                 "theta")
    return (thetadict)
Example 4
def sfs_plot(c, ac_subpops, save=True, fold=True, scale=True):
    """
    note: should filter on segregating if only using subset of pops
    note: only biallelic if >1 allele
    is_biallelic_01 = ac_seg['all'].is_biallelic_01()[:]
    ac1 = ac_seg['BFM'].compress(is_biallelic_01, axis=0)[:, :2]
    ac2 = ac_seg['AOM'].compress(is_biallelic_01, axis=0)[:, :2]
    """
    sfsdict = {}
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.despine(ax=ax, offset=10)
    for pop in ac_subpops.keys():
        acu = ac_subpops[pop]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('SFS : retaining', np.count_nonzero(flt), 'SNPs')
        # ac1 = allel.AlleleCountsArray(ac_subpops[pop].compress(flt, axis=0)[:, :2])
        ac1 = allel.AlleleCountsArray(ac_subpops[pop].compress(flt, axis=0))
        if fold and scale:
            sfs = allel.sfs_folded_scaled(ac1)
        elif fold and not scale:
            sfs = allel.sfs_folded(ac1)
        elif not fold and not scale:
            sfs = allel.sfs(ac1[:, 1])
        elif not fold and scale:
            sfs = allel.sfs_scaled(ac1[:, 1])
        sfsdict[pop] = sfs
        allel.stats.plot_sfs_folded_scaled(sfsdict[pop], ax=ax, label=pop,
                                           n=ac1.sum(axis=1).max())
    ax.legend()
    ax.set_title('{} Scaled folded site frequency spectra'.format(c))
    ax.set_xlabel('minor allele frequency')
    if save:
        fig.savefig("ScaledSFS-{}.pdf".format(c), bbox_inches='tight')
    return(sfsdict)
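Note that the two unfolded branches above pass derived allele counts (ac1[:, 1]) rather than the full counts array, which is what allel.sfs and allel.sfs_scaled expect. A minimal sketch of the folded, scaled case on its own, with toy counts:

import allel
import matplotlib.pyplot as plt

ac = allel.AlleleCountsArray([[8, 2], [5, 5], [9, 1], [3, 7], [6, 4]])

# folded SFS, scaled by minor allele count
sfs = allel.sfs_folded_scaled(ac)

fig, ax = plt.subplots()
allel.plot_sfs_folded_scaled(sfs, ax=ax, label='toy pop', n=ac.sum(axis=1).max())
ax.legend()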
Example 5
def ld_decay(c, chrsize, ac_subpops, popdict, pop2color, var, min_maf=0.1,
             xmax=6000):
    """
    """
    lddict = {}
    for x in ac_subpops.keys():
        acu = ac_subpops[x]
        pos = var.pos
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        pos = pos[flt]
        ac = allel.AlleleCountsArray(ac_subpops[x].compress(flt, axis=0)[:, :2])
        gt = var.gt.compress(flt, axis=0)[:, popdict[x]]
        af = ac.to_frequencies()
        flt = (af[:, :2].min(axis=1) > min_maf)
        pos = pos[flt]
        gt = gt.compress(flt, axis=0)
        gn = gt.to_n_alt()
        print("calc r2...")
        r = allel.stats.rogers_huff_r(gn) ** 2
        print("calc pdist...")
        dist = pdiff(pos)
        xmax_diff = dist <= xmax
        r2 = r[xmax_diff]
        diff = dist[xmax_diff]
        lddict[x] = (diff, r2)
    plot_lddecay(c, lddict, xmax)
    return(lddict)
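pdiff in Example 5 is a project helper; it presumably returns pairwise distances between positions in the same condensed (upper-triangle) order that allel.rogers_huff_r uses. A hedged sketch of that pairing using scipy.spatial.distance.pdist instead:

import numpy as np
import allel
from scipy.spatial.distance import pdist

# genotypes coded as number of alternate alleles (variants x samples)
gn = np.array([[0, 1, 2, 0],
               [0, 1, 2, 1],
               [2, 1, 0, 0],
               [1, 0, 2, 2]], dtype='i1')
pos = np.array([100, 400, 1500, 5200])

# pairwise r**2 between variants, as a condensed upper-triangle vector
r2 = allel.rogers_huff_r(gn) ** 2

# pairwise physical distances in the same condensed order
dist = pdist(pos[:, None])

# keep only pairs within xmax bp of each other
xmax = 6000
keep = dist <= xmax
r2, dist = r2[keep], dist[keep]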
Example 6
def pi(c,
       chrsize,
       ac_subpops,
       pos,
       pop2color,
       plot=False,
       blenw=1000,
       nwindow=100):
    """
    """
    pidict = {}
    windlen = int(chrsize / nwindow)
    for x in ac_subpops.keys():
        acu = ac_subpops[x]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('PI : retaining', np.count_nonzero(flt), 'SNPs')
        posflt = pos[flt]
        ac = allel.AlleleCountsArray(ac_subpops[x].compress(flt,
                                                            axis=0)[:, :2])
        # pi
        pi = allel.windowed_diversity(posflt, ac, size=blenw)
        pi_m, pi_se, *f = jackknife(pi[0])
        pi_windowed = allel.windowed_diversity(posflt,
                                               ac,
                                               size=windlen,
                                               start=1,
                                               stop=chrsize)
        pidict[x] = (pi_m, pi_se, (pi_windowed[0], pi_windowed[1]))

    if plot:
        div_plot(pidict, pop2color, list(ac_subpops.keys()), c, chrsize, "pi")
    return (pidict)
Example 7
def tajd(c,
         chrsize,
         ac_subpops,
         pos,
         pop2color,
         plot=False,
         blenw=1000,
         nwindow=100):
    """
    """
    tajddict = {}
    windlen = int(chrsize / nwindow)
    for x in ac_subpops.keys():
        acu = ac_subpops[x]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('TajD : retaining', np.count_nonzero(flt), 'SNPs')
        posflt = pos[flt]
        ac = allel.AlleleCountsArray(ac_subpops[x].compress(flt,
                                                            axis=0)[:, :2])
        # tajd
        tajd = allel.windowed_tajima_d(posflt, ac, size=blenw)
        d_m, d_se, *d = jackknife(tajd[0])
        tajd_windowed = allel.windowed_tajima_d(posflt,
                                                ac,
                                                size=windlen,
                                                start=1,
                                                stop=chrsize)
        # moving window of variants rather than based
        #        tajd_sizevars = allel.moving_tajima_d(ac, size=size)
        tajddict[x] = (d_m, d_se, (tajd_windowed[0], tajd_windowed[1]))
    if plot:
        div_plot(tajddict, pop2color, list(ac_subpops.keys()), c, chrsize,
                 "Tajima's D")
    return (tajddict)
Example 8
def calcWCfst_bj_knife(ac, pairs, gtvars, idx1, idx2, blen):
    acu = al.AlleleCountsArray(ac[pairs[0]][:] + ac[pairs[1]][:])
    is_seg = acu.is_segregating() & (acu.max_allele() == 1)
    gtmp = gtvars.compress(is_seg, axis=0)
    # scafbp, getScafBp and idx come from the enclosing module scope
    segSitesPos = scafbp[getScafBp(idx, is_seg)]
    # Weir & Cockerham's Fst averaged over blocks, with block-jackknife SE
    fst, se, vb, vj = al.average_weir_cockerham_fst(gtmp, subpops=[idx1, idx2],
                                                    blen=blen, max_allele=1)
    return fst, se, vb, pairs, np.count_nonzero(is_seg), segSitesPos, is_seg
Example 9
def filter_and_convert_genotypes(genotypes,
                                 sites_boolean=None,
                                 samples_boolean=None,
                                 max_alleles=2,
                                 min_count=3,
                                 variance_threshold=0.15):
    '''Filter a genotype array based on booleans of sites
    and samples to include.

    Further filter genotypes based on allele count data.

    Return a set of alternate allele counts ready for PCA,
    and the allele counts filter.
    '''

    if max_alleles <= 0:
        raise ValueError("Max alleles must be greater than 0")

    if sites_boolean is not None:

        if not len(sites_boolean) == genotypes.shape[0]:
            raise ValueError("Length of sites filter "
                             "does not match length of genotypes")

    if samples_boolean is not None:

        if not len(samples_boolean) == genotypes.shape[1]:
            raise ValueError("Length of samples filter "
                             "does not match length of genotypes")

    if sites_boolean is not None and samples_boolean is None:

        genotypes_subset = genotypes.subset(sel0=sites_boolean)

    elif samples_boolean is not None and sites_boolean is None:

        genotypes_subset = genotypes.subset(sel1=samples_boolean)

    elif sites_boolean is not None and samples_boolean is not None:

        genotypes_subset = genotypes.subset(sel0=sites_boolean,
                                            sel1=samples_boolean)

    else:

        raise ValueError("Either a samples or a sites filter must be passed")

    allele_counts = allel.AlleleCountsArray(genotypes_subset.count_alleles())

    allele_counts_boolean = (
        (allele_counts.max_allele() <= max_alleles - 1)
        & (allele_counts[:, :2].min(axis=1) > min_count)
        & (allele_counts.to_frequencies()[:, 1] > variance_threshold))

    num_alt_alleles = genotypes_subset.subset(
        allele_counts_boolean).to_n_alt()[:]

    return num_alt_alleles, allele_counts_boolean
Example 10
def calcWCfst_per_site(ac, pairs, gtvars, idx1, idx2):
    acu = al.AlleleCountsArray(ac[pairs[0]][:] + ac[pairs[1]][:])
    is_seg = acu.is_segregating() & (acu.max_allele() == 1)
    gtmp = gtvars.compress(is_seg, axis=0)
    segSitesPos = scafbp[getScafBp(idx, is_seg)]
    # Weir & Cockerham's
    a, b, c = al.weir_cockerham_fst(gtmp, subpops=[ idx1, idx2 ], max_allele=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        snp_fst = (a / (a + b + c))[:,0]
    return pairs, np.count_nonzero(is_seg), snp_fst, segSitesPos, is_seg
Example 11
 def allelify(self):
     """
     Updates genotypes and allele counts arrays to scikit-allel wrappers
     """
     self.genotypes = {
         key: allel.GenotypeArray(value)
         for key, value in self.genotypes.items()
     }  # Numpy -> allel
     self.allele_counts = {
         key: allel.AlleleCountsArray(value)
         for key, value in self.allele_counts.items()
     }
Example 12
 def test_fully_masked_windowed_diversity(self):
     ac = allel.AlleleCountsArray(np.array([[5, 5], [5, 5], [1, 9], [1, 9]]))
     pos = np.array([1, 2, 3, 4])
     mask = np.array([False, False, True, True])
     pi, _, _, _ = allel.windowed_diversity(pos,
                                            ac,
                                            size=2,
                                            start=1,
                                            stop=5,
                                            is_accessible=mask)
     self.assertTrue(np.isnan(pi[0]))
Example 13
def filterGT(callset, pops, outgroup):
    """Count patterns from VCF
    """
    gt = allel.GenotypeArray(callset['calldata/GT'])
    if outgroup:
        # filter on outgroup pop
        acs = gt[:, outgroup].count_alleles(max_allele=1)
        flt = acs.is_segregating()
    else:
        # filter without using outgroup using sampled pops
        subpops = {"popA": pops[0], "popB": pops[1]}
        acs = gt.count_alleles_subpops(subpops, max_allele=1)
        acu = allel.AlleleCountsArray(acs["popA"][:] + acs["popB"][:])
        flt = acu.is_segregating()
    # remove non-segregating sites
    gt = gt.compress(flt, axis=0)
    return (gt)
Example 14
    def get_allele_counts(self, genomes):
        """
        Generate an allele count array for a collection of genomes

        Parameters
            genomes : ndarray, shape (nsnps, ngenomes)
                Array encoding a set of sequenced parasite
                genomes.

        Returns
            ac : AlleleCountArray, shape (nsnps, nalleles)
                Allele counts for every loci in `genomes`.

        """
        nsnps, ngenomes = genomes.shape
        ac = np.zeros((nsnps, ngenomes), np.int16)  # the maximum possible size
        for i in np.arange(nsnps):
            counts = np.unique(genomes[i], return_counts=True)[1]
            n = len(counts)
            ac[i, :n] = counts
        ac = ac[:, ac.sum(0) > 0]  # remove columns with no alleles
        return allel.AlleleCountsArray(ac)
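A quick illustration of what the method in Example 14 produces, with a toy haploid genomes array (note the loop fills counts left to right, so it implicitly assumes the alleles seen at each SNP are numbered contiguously from 0):

import numpy as np
import allel

# rows = SNPs, columns = genomes, values = allele indices
genomes = np.array([[0, 0, 1, 1],
                    [0, 1, 1, 2],
                    [0, 0, 0, 0]])

nsnps, ngenomes = genomes.shape
ac = np.zeros((nsnps, ngenomes), np.int16)
for i in range(nsnps):
    counts = np.unique(genomes[i], return_counts=True)[1]
    ac[i, :len(counts)] = counts
ac = allel.AlleleCountsArray(ac[:, ac.sum(0) > 0])
# ac is now:
# [[2 2 0]
#  [1 2 1]
#  [4 0 0]]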
Example 15
def main(args):

    ## Step 0: get null model for SNP calling
    null_loc = os.path.dirname(
        __file__) + '/helper_files/combined_null1000000.txt'
    null_model = generate_snp_model(null_loc)
    P2C = {'A': 0, 'C': 1, 'T': 2, 'G': 3}
    C2P = {0: 'A', 1: 'C', 2: 'T', 3: 'G'}

    ## Step 1: build new counts table from all objects
    s_final = SNPprofile()
    s_final.filename = args.output
    i = 0
    counts_per_block = {}
    s1 = SNPprofile()
    print("loading " + args.input[0])
    s1.load(args.input[0])

    s_final.scaffold_list = s1.scaffold_list
    s_final.counts_table = copy.deepcopy(s1.counts_table)

    s2 = SNPprofile()
    print("loading " + args.input[1])
    s2.load(args.input[1])

    for scaf in s2.scaffold_list:
        if scaf not in s_final.scaffold_list:
            sys.exit(
                "Error: scaffold " + scaf + " in " + fn +
                " not found in initial file. Your inStrain objects were probably not run on the same FASTA."
            )

    scaf_counter = 0
    for scaf in s2.counts_table:
        s_final.counts_table[scaf_counter] += scaf
        scaf_counter += 1
    i += 1

    # Step 2: call all SNPs for new object
    allele_counts_total = {}
    allele_counts1 = {}
    allele_counts2 = {}
    snp_table = defaultdict(list)
    scaf_counter = 0

    for scaf in tqdm(s_final.counts_table, desc='Calling new SNVs...'):
        pos_counter = 0
        for counts in scaf:
            snp = call_snv_site(counts,
                                min_cov=5,
                                min_freq=0.05,
                                model=null_model)

            if snp:  # means that there was coverage at this position
                if snp != -1:  # means this is a SNP
                    # calculate varBase
                    snp, varbase = major_minor_allele(counts)

                    snp_table['scaffold'].append(
                        s_final.scaffold_list[scaf_counter])
                    snp_table['position'].append(pos_counter)
                    snp_table['varBase'].append(snp)
                    snp_table['conBase'].append(varbase)
                    allele_counts_total[s_final.scaffold_list[scaf_counter] +
                                        ":" + str(pos_counter)] = (
                                            s_final.counts_table[scaf_counter]
                                            [pos_counter])
                    allele_counts1[s_final.scaffold_list[scaf_counter] + ":" +
                                   str(pos_counter)] = (
                                       s1.counts_table[scaf_counter]
                                       [pos_counter])
                    allele_counts2[s_final.scaffold_list[scaf_counter] + ":" +
                                   str(pos_counter)] = (
                                       s2.counts_table[scaf_counter]
                                       [pos_counter])
            pos_counter += 1  # 0 based positions!!
        scaf_counter += 1

    # Step 3: Save new FST_SNP table to disk.
    SNPTable = pd.DataFrame(snp_table)

    FstTable = defaultdict(list)
    for gene in tqdm(create_gene_index(args.gene_file),
                     desc="calculating fst"):
        snps = SNPTable[(SNPTable.scaffold == gene['scaf'])
                        & (SNPTable.position >= gene['start']) &
                        (SNPTable.position <= gene['end'])]
        snp_list = []
        for index, row in snps.iterrows():
            snp_list.append(row['scaffold'] + ":" + str(row['position']))

        # only continue if there are at least 3 snps in this gene
        if len(snp_list) >= 3:
            allele_counts_1 = []
            allele_counts_2 = []
            for snp in snp_list:
                allele_counts_1.append(allele_counts1[snp])
                allele_counts_2.append(allele_counts2[snp])

            allel1 = allel.AlleleCountsArray(allele_counts_1)
            allel2 = allel.AlleleCountsArray(allele_counts_2)
            fst_h = allel.moving_hudson_fst(
                allel1, allel2,
                size=len(snp_list))[0]  #allel.moving_hudson_fst(a1,a2, size=3)
            nd_1 = np.sum(allel.mean_pairwise_difference(allel1)) / (
                1 + gene['end'] - gene['start'])
            nd_2 = np.sum(allel.mean_pairwise_difference(allel2)) / (
                1 + gene['end'] - gene['start'])

            FstTable['gene'].append(gene['name'])
            FstTable['snp_num'].append(len(snp_list))
            FstTable['fst'].append(fst_h)
            FstTable['pi_1'].append(nd_1)
            FstTable['pi_2'].append(nd_2)
            FstTable['cov_1'].append(np.mean(np.sum(allele_counts_1, axis=1)))
            FstTable['cov_2'].append(np.mean(np.sum(allele_counts_2, axis=1)))

    FstTable = pd.DataFrame(FstTable)
    print(np.mean(FstTable['fst']))
    FstTable.to_csv(args.output + '.Fst.tsv', index=False, sep='\t')
Example 16
if isHaploidVcfGenoArray(genos):
    sys.stderr.write("Detected haploid input. Converting into diploid individuals (combining haplotypes in order).\n")
    genos = diploidizeGenotypeArray(genos)

alleleCounts = genos.count_alleles()

#remove all but mono/biallelic unmasked sites
isBiallelic = alleleCounts.is_biallelic()
for i in range(len(isBiallelic)):
    if not (isBiallelic[i] and calledGenoFracAtSite(genos[i]) >= unmaskedGenoFracCutoff):
        unmasked[positions[i]-1] = False
snpIndicesToKeep = [i for i in range(len(positions)) if unmasked[positions[i]-1]]
genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
positions = [positions[i] for i in snpIndicesToKeep]
alleleCounts = allel.AlleleCountsArray([[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep])

statNames = ["pi", "thetaW", "tajD", "distVar","distSkew","distKurt","nDiplos","diplo_H1","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"]

subWinBounds = getSubWinBounds(chrLen, subWinSize)

header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
statHeader = "chrom start end".split()
for statName in statNames:
    statHeader.append(statName)
    for i in range(numSubWins):
        header.append("%s_win%d" %(statName, i))
statHeader = "\t".join(statHeader)
header = "\t".join(header)
outFile=open(outfn,'w')
outFile.write(header+"\n")
Example 17
pos = callset["variants/POS"][non_het_variants]  ## The retained variant positions

#######Plot 1: nucleotide diversity#########
allele_counts = gt_clean.count_alleles()

window_size = int((pos[-1] - pos[0]) / 100)  # We want about 100 windows
pi, windows, n_bases, n_counts = allel.windowed_diversity(
    pos, allele_counts, size=window_size, start=pos[0], stop=pos[-1]
)

plot_windowed_pi(pi, windows)

######Plot 2: site frequency spectrum########
## Filtering : only keep biallelic variants (at most two alleles segregate in the set of samples)
max_2_alleles = [sum(row != 0) <= 2 for row in allele_counts]
filtered_ac = allele_counts[max_2_alleles]
bi_counts = allel.AlleleCountsArray(np.ndarray((filtered_ac.shape[0], 2), dtype=int))
index = 0
for row in filtered_ac:
    picked = [i for i in row if i != 0]
    assert len(picked) <= 2
    if len(picked) == 1:
        picked = picked + [0]
    elif len(picked) == 0:
        picked = [0, 0]
    bi_counts[index] = picked
    index += 1

plot_sfs(bi_counts)
Example 18
# extract all variants that are homozygous for all samples
homo = np.all(homo, axis=1)
gt_homo = gt_del.subset(homo)

#print(np.count_nonzero(gt_homo), 'structural variants homozygous for all individuals')

# get all variant rows for gt_homo
variants_pass = variants_pass[homo]



###  find all variants at which the subpop/ sample Genotypes are segregating

ac1 = gt_homo.count_alleles(subpop=subpop1)
ac2 = gt_homo.count_alleles(subpop=subpop2)
acu = allel.AlleleCountsArray(ac1 + ac2)


flt = acu.is_segregating()
gt_seg = gt_homo[flt]

print(np.count_nonzero(flt), 'positions are homozygous and segregating')

# get all variant data for segregating positions (flt=True)
variants_pass = variants_pass[flt]
variants_pass



###   calc hudson Fst
print('calculating Hudson Fst for each position')
Example 19
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
ax3.scatter(simdf['PC1'], simdf['PC2'], c=pd.factorize(simdf['pop'])[0])
ax3.set_title("simulated")
ax3.set_xlabel("PC1")
ax3.set_ylabel("PC2")
fig.tight_layout()
#fig.savefig('fig/PCA_decoder_comp_mpl.pdf',bbox_inches='tight')

########################### site frequency spectrum #############################
realYRI = dc * 2
realYRI = realYRI[pred['pop'] == "YRI"]
YRI_ac_all = np.apply_along_axis(sum, 0, realYRI)
YRI_ac_all = np.array(YRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in YRI_ac_all], dtype="i")
YRI_ac_all = allel.AlleleCountsArray(np.transpose(np.vstack(
    (tmp, YRI_ac_all))))

simYRI = np.transpose(sim_dc)
simYRI = simYRI[simdf['pop'] == "YRI"]
simYRI_ac_all = np.apply_along_axis(sum, 0, simYRI)
simYRI_ac_all = np.array(simYRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in simYRI_ac_all], dtype="i")
simYRI_ac_all = allel.AlleleCountsArray(
    np.transpose(np.vstack((tmp, simYRI_ac_all))))

genYRI = bingen
genYRI = genYRI[pred['pop'] == "YRI"]
genYRI_ac_all = np.apply_along_axis(sum, 0, genYRI)
genYRI_ac_all = np.array(genYRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in genYRI_ac_all], dtype="i")
genYRI_ac_all = allel.AlleleCountsArray(
    np.transpose(np.vstack((tmp, genYRI_ac_all))))
Example 20
sampleIndicesToKeep = [
    x for x in range(len(samples))
    if sampleToPop.get(samples[x], "popNotFound!") == targetPop
]

genos = genos.subset(sel1=sampleIndicesToKeep)
alleleCounts = genos.count_alleles()
isBiallelic = alleleCounts.is_biallelic()
for i in range(len(isBiallelic)):
    if not isBiallelic[i]:
        unmasked[positions[i] - 1] = False
snpIndicesToKeep = [
    x for x in range(len(positions)) if unmasked[positions[x] - 1]
]
genos = genos.subset(sel0=snpIndicesToKeep)
haps = genos.to_haplotypes()
alleleCounts = allel.AlleleCountsArray(
    [alleleCounts[x] for x in snpIndicesToKeep])
mapping = [mapping[x] for x in snpIndicesToKeep]
alleleCounts = alleleCounts.map_alleles(mapping)
positions = [positions[x] for x in snpIndicesToKeep]

statNames = [
    "pi", "thetaW", "tajD", "thetaH", "fayWuH", "HapCount", "H1", "H12",
    "H2/H1", "ZnS", "Omega", "iHSMean", "iHSMax", "iHSOutFrac", "nSLMean",
    "nSLMax", "nSLOutFrac", "distVar", "distSkew", "distKurt"
]
for i in ["HAF", "HAFunique", "phi", "kappa", "SFS", "SAFE"]:
    for j in [
            "Mean", "Median", "Mode", "Lower95%", "Lower50%", "Upper50%",
            "Upper95%", "Max", "Var", "SD", "Skew", "Kurt"
    ]:
        statNames.append("%s-%s" % (i, j))
Example 21
########

## Weir & Cockerham's Fst for each locus

a, b, c = al.weir_cockerham_fst(gtseg, list(subpops.values())[1:])

# estimate theta (a.k.a. Fst) for each variant & allele directly:
fst = a / (a + b + c)




# compare Hudson's and Weir & Cockerham's per locus Fst:

# only take variants that are segregating between the two pops
acu = al.AlleleCountsArray(ac_subpops['S'][:] + ac_subpops['N'][:])
flt = acu.is_segregating() & (acu.max_allele() == 1)
print('retaining', np.count_nonzero(flt), 'SNPs')

ac1 = al.AlleleCountsArray(ac_subpops['S'].compress(flt, axis=0)[:, :2])
ac2 = al.AlleleCountsArray(ac_subpops['N'].compress(flt, axis=0)[:, :2])

genotype = gtsub.compress(flt, axis=0)
#genotype


pop1_idx = subpops['S']
pop2_idx = subpops['N']
a, b, c = al.weir_cockerham_fst(genotype, subpops=[pop1_idx, pop2_idx], max_allele=1)
snp_fst_wc = (a / (a + b + c))[:, 0]
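The comparison in Example 21 relies on objects (gtseg, gtsub, ac_subpops, subpops) defined earlier in its script. A self-contained sketch of the same Hudson vs. Weir & Cockerham per-variant comparison on synthetic genotypes:

import numpy as np
import allel as al

# synthetic diploid genotypes: 4 variants x 4 samples x 2 alleles
g = al.GenotypeArray([[[0, 0], [0, 1], [1, 1], [1, 1]],
                      [[0, 1], [0, 0], [1, 1], [0, 1]],
                      [[0, 0], [0, 0], [0, 1], [1, 1]],
                      [[1, 1], [0, 1], [0, 0], [0, 0]]])
pop1_idx, pop2_idx = [0, 1], [2, 3]

# Hudson's per-variant FST from population allele counts
ac1 = g.count_alleles(subpop=pop1_idx, max_allele=1)
ac2 = g.count_alleles(subpop=pop2_idx, max_allele=1)
num, den = al.hudson_fst(ac1, ac2)
snp_fst_hudson = num / den

# Weir & Cockerham's per-variant FST from the variance components
a, b, c = al.weir_cockerham_fst(g, subpops=[pop1_idx, pop2_idx], max_allele=1)
with np.errstate(divide='ignore', invalid='ignore'):
    snp_fst_wc = (a / (a + b + c))[:, 0]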
Example 22
 unmaskedSnpIndices = [
     i for i in range(len(positionArray)) if unmasked[positionArray[i] - 1]
 ]
 if len(unmaskedSnpIndices) == 0:
     for statName in statNames:
         statVals[statName].append([])
     for subWinIndex in range(numSubWins):
         for statName in statNames:
             fvTools.appendStatValsForMonomorphic(statName, statVals,
                                                  instanceIndex,
                                                  subWinIndex)
 else:
     positionArrayUnmaskedOnly = [
         positionArray[i] for i in unmaskedSnpIndices
     ]
     ac = genos.count_alleles()
     alleleCountsUnmaskedOnly = allel.AlleleCountsArray(
         np.array([ac[i] for i in unmaskedSnpIndices]))
     sampleSizes = [sum(x) for x in alleleCountsUnmaskedOnly]
     assert len(set(sampleSizes)) == 1 and sampleSizes[0] == sampleSize
     if pMisPol > 0:
         alleleCountsUnmaskedOnly = fvTools.misPolarizeAlleleCounts(
             alleleCountsUnmaskedOnly, pMisPol)
     # dafs = alleleCountsUnmaskedOnly[:,1]/float(sampleSizes[0])
     unmaskedHaps = haps.subset(sel0=unmaskedSnpIndices)
     unmaskedGenos = genos.subset(sel0=unmaskedSnpIndices)
     precomputedStats = {}
     for statName in statNames:
         statVals[statName].append([])
     for subWinIndex in range(numSubWins):
         subWinStart, subWinEnd = subWinBounds[subWinIndex]
         unmaskedFrac = unmasked[subWinStart -
                                 1:subWinEnd].count(True) / float(subWinLen)
Example 23
# 64 sites: the 4-row pattern [[2, 0], [1, 1], [1, 1], [0, 2]] repeated 16 times
archaic_allele_counts = allel.AlleleCountsArray(
    [[2, 0], [1, 1], [1, 1], [0, 2]] * 16)
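As a usage note (not part of the original snippet), an allele counts array like this feeds directly into the allele-count statistics shown in earlier examples, for instance:

import numpy as np
import allel

# per-variant mean pairwise difference for the counts built above
mpd = allel.mean_pairwise_difference(archaic_allele_counts)
print(np.sum(mpd))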