def hdfst(c, csize, ac_subpops, pos, plot, blenw=10000, nwindow=100):
    """Compute Hudson's FST for every pair of subpopulations.

    For each pair, the summed allele counts are used to keep only sites that
    are segregating and whose highest observed allele index is 1.  FST is then
    computed per SNP, for the whole chromosome (``fstchromHD``) and in
    windows (``fstwindHD``).

    Parameters
    ----------
    c : chromosome identifier, forwarded to ``plot_fst``.
    csize : chromosome length; the window length is ``int(csize / nwindow)``.
    ac_subpops : mapping of population name -> allele counts array.
    pos : variant positions, indexable by the boolean site filter.
    plot : if truthy, ``plot_fst`` is called with the collected results.
    blenw : block length handed to ``fstchromHD``.
    nwindow : number of windows the chromosome is divided into.

    Returns
    -------
    tuple
        ``(fstdict, acdict, posdict)``, each keyed by ``"popX-popY"``:
        ``fstdict`` holds ``(snp_fst, fst_hd, se_hd, fst_windowed)``,
        ``acdict`` holds ``(ac1, ac2)`` and ``posdict`` the retained positions.
    """
    fstdict, acdict, posdict = {}, {}, {}
    for pop_a, pop_b in combinations(ac_subpops.keys(), 2):
        pair = "{}-{}".format(pop_a, pop_b)
        combined = ac_subpops[pop_a] + ac_subpops[pop_b]
        keep = combined.is_segregating() & (combined.max_allele() == 1)
        print("{} retaining {} SNPs".format(pair, np.count_nonzero(keep)))
        kept_pos = pos[keep]
        # Only the first two allele columns are kept after filtering.
        ac1 = allel.AlleleCountsArray(
            ac_subpops[pop_a].compress(keep, axis=0)[:, :2])
        ac2 = allel.AlleleCountsArray(
            ac_subpops[pop_b].compress(keep, axis=0)[:, :2])
        numerator, denominator = allel.stats.hudson_fst(ac1, ac2)
        per_snp = numerator / denominator
        fst_hd, se_hd = fstchromHD(ac1, ac2, blenw)
        window_len = int(csize / nwindow)
        windowed = fstwindHD(ac1, ac2, kept_pos, window_len)
        fstdict[pair] = (per_snp, fst_hd, se_hd, windowed)
        posdict[pair] = kept_pos
        acdict[pair] = (ac1, ac2)
    if plot:
        plot_fst(fstdict, list(ac_subpops.keys()), c, csize)
    return (fstdict, acdict, posdict)
def pairDxy(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=10000, nwindow=100):
    """Calculate DXY (pairwise divergence) for every pair of subpopulations.

    Sites are restricted to those that segregate in the summed allele counts
    of the pair and whose highest observed allele index is 1.

    Parameters
    ----------
    c : chromosome identifier, forwarded to ``plot_dxy``.
    chrsize : chromosome length; used as window ``stop`` and to derive the
        plotting window length ``int(chrsize / nwindow)``.
    ac_subpops : mapping of population name -> allele counts array.
    pos : variant positions, indexable by the boolean site filter.
    pop2color : forwarded to ``plot_dxy``.
    plot : if truthy, ``plot_dxy`` is called with the collected results.
    blenw : window size of the blocks fed to ``jackknife``.
    nwindow : number of windows used for the plotted track.

    Returns
    -------
    dict
        ``"popX-popY"`` -> ``(dxy_mean, dxy_se, (values, windows))``.
    """
    dxydict = {}
    window_len = int(chrsize / nwindow)
    for pop_a, pop_b in combinations(ac_subpops.keys(), 2):
        pair = "{}-{}".format(pop_a, pop_b)
        # Segregating biallelic sites only (the all-sites alternative would
        # use the unfiltered counts and positions instead).
        combined = ac_subpops[pop_a] + ac_subpops[pop_b]
        keep = combined.is_segregating() & (combined.max_allele() == 1)
        print("{} retaining {} SNPs".format(pair, np.count_nonzero(keep)))
        kept_pos = pos[keep]
        ac1 = allel.AlleleCountsArray(
            ac_subpops[pop_a].compress(keep, axis=0)[:, :2])
        ac2 = allel.AlleleCountsArray(
            ac_subpops[pop_b].compress(keep, axis=0)[:, :2])
        # Whole chromosome: block values summarised by the jackknife.
        block_dxy = allel.windowed_divergence(
            kept_pos, ac1, ac2, size=blenw, start=1, stop=chrsize)
        dxy_m, dxy_se, *_ = jackknife(block_dxy[0])
        # Coarser windows kept for plotting: (values, window bounds).
        plot_dxy_windows = allel.windowed_divergence(
            kept_pos, ac1, ac2, size=window_len, start=1, stop=chrsize)
        dxydict[pair] = (dxy_m, dxy_se,
                         (plot_dxy_windows[0], plot_dxy_windows[1]))
    if plot:
        plot_dxy(dxydict, pop2color, list(ac_subpops.keys()), c, chrsize)
    return dxydict
def theta(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=10000, nwindow=100):
    """Calculate Watterson's theta for each subpopulation.

    For each population only sites that are segregating and whose highest
    observed allele index is 1 are used.

    Parameters
    ----------
    c : chromosome identifier, forwarded to ``div_plot``.
    chrsize : chromosome length; window ``stop`` for the plotted track and
        basis of the window length ``int(chrsize / nwindow)``.
    ac_subpops : mapping of population name -> allele counts array.
    pos : variant positions, indexable by the boolean site filter.
    pop2color : forwarded to ``div_plot``.
    plot : if truthy, ``div_plot`` is called with the collected results.
    blenw : window size of the blocks fed to ``jackknife``.
    nwindow : number of windows used for the plotted track.

    Returns
    -------
    dict
        population -> ``(theta_mean, theta_se, (values, windows))``.
    """
    thetadict = {}
    window_len = int(chrsize / nwindow)
    for pop in ac_subpops.keys():
        pop_counts = ac_subpops[pop]
        keep = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        print('Theta : retaining', np.count_nonzero(keep), 'SNPs')
        kept_pos = pos[keep]
        ac = allel.AlleleCountsArray(
            ac_subpops[pop].compress(keep, axis=0)[:, :2])
        # Block-wise estimates summarised by the jackknife.
        block_theta = allel.windowed_watterson_theta(kept_pos, ac, size=blenw)
        t_m, t_se, *_ = jackknife(block_theta[0])
        # Coarser windows spanning the chromosome, kept for plotting.
        plot_theta = allel.windowed_watterson_theta(
            kept_pos, ac, size=window_len, start=1, stop=chrsize)
        thetadict[pop] = (t_m, t_se, (plot_theta[0], plot_theta[1]))
    if plot:
        div_plot(thetadict, pop2color, list(ac_subpops.keys()), c, chrsize, "theta")
    return thetadict
def sfs_plot(c, ac_subpops, save=True, fold=True, scale=True):
    """Compute and plot the site frequency spectrum of each subpopulation.

    Sites are filtered per population to those that are segregating and whose
    highest observed allele index is 1.

    Notes
    -----
    * When only a subset of populations is analysed, counts should already be
      filtered to sites segregating in that subset.
    * Only biallelic sites are meaningful here.

    Parameters
    ----------
    c : chromosome identifier used in the title and output file name.
    ac_subpops : mapping of population name -> allele counts array.
    save : if truthy, write the figure to ``ScaledSFS-<c>.pdf``.
    fold : compute the folded spectrum when truthy, otherwise the unfolded
        spectrum of the allele in column 1.
    scale : compute the scaled spectrum when truthy.

    Returns
    -------
    dict
        population -> spectrum array.
    """
    fold, scale = bool(fold), bool(scale)
    # Choose the estimator together with the matching plotting routine so
    # the figure always shows the kind of spectrum that was computed
    # (previously the folded+scaled plotter and title were used for every
    # combination of ``fold`` / ``scale``).
    if fold:
        if scale:
            compute, draw = allel.sfs_folded_scaled, allel.stats.plot_sfs_folded_scaled
            kind = 'Scaled folded'
        else:
            compute, draw = allel.sfs_folded, allel.stats.plot_sfs_folded
            kind = 'Folded'
    else:
        if scale:
            compute, draw = allel.sfs_scaled, allel.stats.plot_sfs_scaled
            kind = 'Scaled unfolded'
        else:
            compute, draw = allel.sfs, allel.stats.plot_sfs
            kind = 'Unfolded'

    sfsdict = {}
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.despine(ax=ax, offset=10)
    for pop in ac_subpops.keys():
        acu = ac_subpops[pop]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('SFS : retaining', np.count_nonzero(flt), 'SNPs')
        ac1 = allel.AlleleCountsArray(ac_subpops[pop].compress(flt, axis=0))
        # Folded estimators take the full counts array; unfolded ones take
        # the counts of the allele in column 1.
        sfs = compute(ac1) if fold else compute(ac1[:, 1])
        sfsdict[pop] = sfs
        draw(sfs, ax=ax, label=pop, n=ac1.sum(axis=1).max())
    ax.legend()
    ax.set_title('{} {} site frequency spectra'.format(c, kind))
    ax.set_xlabel('minor allele frequency' if fold
                  else 'derived allele frequency')
    if save:
        # File name kept unchanged for backward compatibility.
        fig.savefig("ScaledSFS-{}.pdf".format(c), bbox_inches='tight')
    return sfsdict
def ld_decay(c, chrsize, ac_subpops, popdict, pop2color, var, min_maf=0.1, xmax=6000):
    """Compute r^2 against pairwise distance for each subpopulation.

    Per population, sites are first restricted to segregating sites whose
    highest observed allele index is 1, then to sites where the smaller of
    the two allele frequencies exceeds ``min_maf``.  Squared Rogers-Huff r is
    computed on the alternate-allele counts and paired with the distances
    returned by ``pdiff``; only pairs with distance ``<= xmax`` are kept.

    Parameters
    ----------
    c : chromosome identifier, forwarded to ``plot_lddecay``.
    chrsize : unused here.
    ac_subpops : mapping of population name -> allele counts array.
    popdict : mapping of population name -> sample indices into ``var.gt``.
    pop2color : unused here.
    var : object exposing ``pos`` (positions) and ``gt`` (genotypes).
    min_maf : strict lower bound on the minor allele frequency.
    xmax : maximum pairwise distance retained.

    Returns
    -------
    dict
        population -> ``(distances, r2)``.
    """
    lddict = {}
    for pop in ac_subpops.keys():
        pop_counts = ac_subpops[pop]
        positions = var.pos
        segregating = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        positions = positions[segregating]
        ac = allel.AlleleCountsArray(
            ac_subpops[pop].compress(segregating, axis=0)[:, :2])
        gt = var.gt.compress(segregating, axis=0)[:, popdict[pop]]
        # Second pass: drop sites whose minor allele is too rare.
        freqs = ac.to_frequencies()
        common = (freqs[:, :2].min(axis=1) > min_maf)
        positions = positions[common]
        gt = gt.compress(common, axis=0)
        n_alt = gt.to_n_alt()
        print("calc r2...")
        r_squared = allel.stats.rogers_huff_r(n_alt) ** 2
        print("calc pdist...")
        distances = pdiff(positions)
        within_range = distances <= xmax
        lddict[pop] = (distances[within_range], r_squared[within_range])
    plot_lddecay(c, lddict, xmax)
    return lddict
def pi(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=1000, nwindow=100):
    """Calculate nucleotide diversity (pi) for each subpopulation.

    For each population only sites that are segregating and whose highest
    observed allele index is 1 are used.

    Parameters
    ----------
    c : chromosome identifier, forwarded to ``div_plot``.
    chrsize : chromosome length; window ``stop`` for the plotted track and
        basis of the window length ``int(chrsize / nwindow)``.
    ac_subpops : mapping of population name -> allele counts array.
    pos : variant positions, indexable by the boolean site filter.
    pop2color : forwarded to ``div_plot``.
    plot : if truthy, ``div_plot`` is called with the collected results.
    blenw : window size of the blocks fed to ``jackknife``.
    nwindow : number of windows used for the plotted track.

    Returns
    -------
    dict
        population -> ``(pi_mean, pi_se, (values, windows))``.
    """
    pidict = {}
    window_len = int(chrsize / nwindow)
    for pop in ac_subpops.keys():
        pop_counts = ac_subpops[pop]
        keep = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        print('PI : retaining', np.count_nonzero(keep), 'SNPs')
        kept_pos = pos[keep]
        ac = allel.AlleleCountsArray(
            ac_subpops[pop].compress(keep, axis=0)[:, :2])
        # Block-wise estimates summarised by the jackknife.
        block_pi = allel.windowed_diversity(kept_pos, ac, size=blenw)
        pi_m, pi_se, *_ = jackknife(block_pi[0])
        # Coarser windows spanning the chromosome, kept for plotting.
        plot_pi = allel.windowed_diversity(
            kept_pos, ac, size=window_len, start=1, stop=chrsize)
        pidict[pop] = (pi_m, pi_se, (plot_pi[0], plot_pi[1]))
    if plot:
        div_plot(pidict, pop2color, list(ac_subpops.keys()), c, chrsize, "pi")
    return pidict
def tajd(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=1000, nwindow=100):
    """Calculate Tajima's D for each subpopulation.

    For each population only sites that are segregating and whose highest
    observed allele index is 1 are used.  Windows are defined in base pairs
    (not as a moving window over a fixed number of variants).

    Parameters
    ----------
    c : chromosome identifier, forwarded to ``div_plot``.
    chrsize : chromosome length; window ``stop`` for the plotted track and
        basis of the window length ``int(chrsize / nwindow)``.
    ac_subpops : mapping of population name -> allele counts array.
    pos : variant positions, indexable by the boolean site filter.
    pop2color : forwarded to ``div_plot``.
    plot : if truthy, ``div_plot`` is called with the collected results.
    blenw : window size of the blocks fed to ``jackknife``.
    nwindow : number of windows used for the plotted track.

    Returns
    -------
    dict
        population -> ``(d_mean, d_se, (values, windows))``.
    """
    tajddict = {}
    window_len = int(chrsize / nwindow)
    for pop in ac_subpops.keys():
        pop_counts = ac_subpops[pop]
        keep = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        print('TajD : retaining', np.count_nonzero(keep), 'SNPs')
        kept_pos = pos[keep]
        ac = allel.AlleleCountsArray(
            ac_subpops[pop].compress(keep, axis=0)[:, :2])
        # Block-wise estimates summarised by the jackknife.
        block_d = allel.windowed_tajima_d(kept_pos, ac, size=blenw)
        d_m, d_se, *_ = jackknife(block_d[0])
        # Coarser windows spanning the chromosome, kept for plotting.
        plot_d = allel.windowed_tajima_d(
            kept_pos, ac, size=window_len, start=1, stop=chrsize)
        tajddict[pop] = (d_m, d_se, (plot_d[0], plot_d[1]))
    if plot:
        div_plot(tajddict, pop2color, list(ac_subpops.keys()), c, chrsize, "Tajima's D")
    return tajddict
def calcWCfst_bj_knife(ac, pairs, gtvars, idx1, idx2, blen):
    """Average Weir & Cockerham FST for one population pair, with block-jackknife SE.

    Sites are restricted to those segregating in the summed allele counts of
    the two populations and whose highest observed allele index is 1.

    Parameters
    ----------
    ac : mapping of population name -> allele counts.
    pairs : two population names; ``pairs[0]`` and ``pairs[1]`` index ``ac``.
    gtvars : genotype array supporting ``compress(mask, axis=0)``.
    idx1, idx2 : sample indices of the two populations in ``gtvars``.
    blen : block length (number of variants) for the jackknife.

    Returns
    -------
    tuple
        ``(fst, se, vb, pairs, n_segregating, segSitesPos, is_seg)``.
    """
    acu = al.AlleleCountsArray(ac[pairs[0]][:] + ac[pairs[1]][:])
    is_seg = acu.is_segregating() & (acu.max_allele() == 1)
    gtmp = gtvars.compress(is_seg, axis=0)
    # NOTE(review): ``idx`` and ``scafbp`` are not parameters of this
    # function; they are presumably module-level globals - confirm.
    segSitesPos = scafbp[getScafBp(idx, is_seg)]
    # Weir & Cockerham's estimator on the filtered genotypes.  The original
    # passed the undefined name ``gtseg_vars`` here; the filtered array
    # computed above is ``gtmp`` (as used by ``calcWCfst_per_site``).
    fst, se, vb, vj = al.stats.fst.average_weir_cockerham_fst(
        gtmp, subpops=[idx1, idx2], blen=blen, max_allele=1)
    return fst, se, vb, pairs, np.count_nonzero(is_seg), segSitesPos, is_seg
def filter_and_convert_genotypes(genotypes, sites_boolean=None, samples_boolean=None, max_alleles=2, min_count=3, variance_threshold=0.15):
    """Filter a genotype array and convert it to alternate-allele counts.

    The array is first subset by the supplied site and/or sample selections,
    then filtered on allele counts computed from that subset.  A site is kept
    when all of the following hold:

    * its highest observed allele index is ``<= max_alleles - 1``;
    * the smaller of the first two allele counts is ``> min_count``;
    * the frequency of allele 1 is ``> variance_threshold``.

    Parameters
    ----------
    genotypes : genotype array exposing ``shape`` and ``subset``.
    sites_boolean : optional selection over the first axis (sites).
    samples_boolean : optional selection over the second axis (samples).
    max_alleles : maximum number of alleles allowed at a site; must be > 0.
    min_count : strict lower bound on the first two allele counts.
    variance_threshold : strict lower bound on the allele-1 frequency.

    Returns
    -------
    tuple
        ``(num_alt_alleles, allele_counts_boolean)`` - the alternate-allele
        count matrix of the retained sites (ready for PCA) and the boolean
        allele-count filter over the subset's sites.

    Raises
    ------
    ValueError
        If ``max_alleles`` is not positive, a selection's length does not
        match the corresponding axis, or no selection is supplied.
    """
    if not max_alleles > 0:
        raise ValueError("Max alleles must be greater than 0 ")

    # The messages below used backslash continuations inside the string
    # literal, which embedded the next line's indentation in the text.
    if sites_boolean is not None and len(sites_boolean) != genotypes.shape[0]:
        raise ValueError(
            "Length of sites filter does not match length of genotypes")
    if samples_boolean is not None and len(samples_boolean) != genotypes.shape[1]:
        raise ValueError(
            "Length of samples filter does not match length of genotypes")

    # Pass only the selections that were actually supplied.
    selection = {}
    if sites_boolean is not None:
        selection["sel0"] = sites_boolean
    if samples_boolean is not None:
        selection["sel1"] = samples_boolean
    if not selection:
        raise ValueError("Either a samples or a sites filter must be passed")
    genotypes_subset = genotypes.subset(**selection)

    allele_counts = allel.AlleleCountsArray(genotypes_subset.count_alleles())
    allele_counts_boolean = (
        (allele_counts.max_allele() <= max_alleles - 1)
        & (allele_counts[:, :2].min(axis=1) > min_count)
        & (allele_counts.to_frequencies()[:, 1] > variance_threshold)
    )
    num_alt_alleles = genotypes_subset.subset(
        allele_counts_boolean).to_n_alt()[:]
    return num_alt_alleles, allele_counts_boolean
def calcWCfst_per_site(ac, pairs, gtvars, idx1, idx2):
    """Per-site Weir & Cockerham FST for one population pair.

    Sites are restricted to those segregating in the summed allele counts of
    the two populations and whose highest observed allele index is 1.

    Parameters
    ----------
    ac : mapping of population name -> allele counts.
    pairs : two population names; ``pairs[0]`` and ``pairs[1]`` index ``ac``.
    gtvars : genotype array supporting ``compress(mask, axis=0)``.
    idx1, idx2 : sample indices of the two populations in ``gtvars``.

    Returns
    -------
    tuple
        ``(pairs, n_segregating, snp_fst, segSitesPos, is_seg)``.
    """
    combined = al.AlleleCountsArray(ac[pairs[0]][:] + ac[pairs[1]][:])
    seg_mask = combined.is_segregating() & (combined.max_allele() == 1)
    seg_genotypes = gtvars.compress(seg_mask, axis=0)
    # NOTE(review): ``idx`` and ``scafbp`` are not parameters of this
    # function; they are presumably module-level globals - confirm.
    seg_positions = scafbp[getScafBp(idx, seg_mask)]
    # Variance components of Weir & Cockerham's estimator.
    a, b, c = al.weir_cockerham_fst(
        seg_genotypes, subpops=[idx1, idx2], max_allele=1)
    # Sites with a zero denominator yield nan/inf; silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        per_site = (a / (a + b + c))[:, 0]
    n_seg = np.count_nonzero(seg_mask)
    return pairs, n_seg, per_site, seg_positions, seg_mask
def allelify(self):
    """Wrap stored genotypes and allele counts in scikit-allel array types.

    Every value of ``self.genotypes`` is replaced by an
    ``allel.GenotypeArray`` and every value of ``self.allele_counts`` by an
    ``allel.AlleleCountsArray``; keys are preserved.
    """
    wrapped_genotypes = {}
    for name, raw in self.genotypes.items():
        wrapped_genotypes[name] = allel.GenotypeArray(raw)
    self.genotypes = wrapped_genotypes

    wrapped_counts = {}
    for name, raw in self.allele_counts.items():
        wrapped_counts[name] = allel.AlleleCountsArray(raw)
    self.allele_counts = wrapped_counts
def test_fully_masked_windowed_diversty(self):
    """The first window's diversity must be NaN for the given accessibility mask."""
    counts = allel.AlleleCountsArray(
        np.array([[5, 5], [5, 5], [1, 9], [1, 9]]))
    positions = np.array([1, 2, 3, 4])
    # The first two entries of the mask are False.
    accessible = np.array([False, False, True, True])
    result = allel.windowed_diversity(
        positions, counts, size=2, start=1, stop=5, is_accessible=accessible)
    pi, _, _, _ = result
    first_window = pi[0]
    self.assertTrue(np.isnan(first_window))
def filterGT(callset, pops, outgroup):
    """Return the genotypes of ``callset`` restricted to segregating sites.

    Parameters
    ----------
    callset : mapping providing ``'calldata/GT'``.
    pops : sequence whose first two items are the sample indices of the two
        sampled populations; only used when ``outgroup`` is falsy.
    outgroup : sample indices of the outgroup; when truthy, segregation is
        assessed within the outgroup samples only.

    Returns
    -------
    allel.GenotypeArray
        Genotypes at the sites that pass the segregation filter.
    """
    gt = allel.GenotypeArray(callset['calldata/GT'])
    if not outgroup:
        # No outgroup: a site must segregate in the two sampled
        # populations taken together.
        per_pop = gt.count_alleles_subpops(
            {"popA": pops[0], "popB": pops[1]}, max_allele=1)
        pooled = allel.AlleleCountsArray(per_pop["popA"][:] + per_pop["popB"][:])
        seg_mask = pooled.is_segregating()
    else:
        # Outgroup given: a site must segregate within the outgroup.
        outgroup_counts = gt[:, outgroup].count_alleles(max_allele=1)
        seg_mask = outgroup_counts.is_segregating()
    # Drop the non-segregating sites.
    return gt.compress(seg_mask, axis=0)
def get_allele_counts(self, genomes):
    """
    Generate an allele count array for a collection of genomes.

    Parameters
        genomes : ndarray, shape (nsnps, ngenomes)
            Array encoding a set of sequenced parasite genomes.
    Returns
        ac : AlleleCountArray, shape (nsnps, nalleles)
            Allele counts for every locus in `genomes`.

    Note: for each locus the counts of its distinct values are written left
    to right in the sorted order returned by ``np.unique``, so a column
    corresponds to the rank of a value among those observed at that locus.
    """
    nsnps, ngenomes = genomes.shape
    # A locus can hold at most ``ngenomes`` distinct values.
    table = np.zeros((nsnps, ngenomes), np.int16)
    for locus in range(nsnps):
        _, tallies = np.unique(genomes[locus], return_counts=True)
        table[locus, :len(tallies)] = tallies
    # Drop the columns no locus ever used.
    used_columns = table.sum(0) > 0
    return allel.AlleleCountsArray(table[:, used_columns])
def main(args):
    """Merge two SNP profiles, re-call SNVs and write per-gene FST statistics.

    Steps:

    0. Build the null model used for SNV calling from the bundled
       ``helper_files/combined_null1000000.txt``.
    1. Load the two profiles named in ``args.input`` and sum their per-scaffold
       count tables.
    2. Call SNVs on the summed counts and remember, per SNV, the counts of
       each individual profile.
    3. For every gene from ``args.gene_file`` holding at least three SNVs,
       compute Hudson's FST, per-profile diversity and mean coverage, then
       write the table to ``<args.output>.Fst.tsv``.

    Parameters
    ----------
    args : parsed arguments exposing ``input`` (two profile paths),
        ``output`` (output prefix) and ``gene_file``.

    The process exits via ``sys.exit`` when the second profile contains a
    scaffold that is absent from the first.
    """
    # Step 0: get null model for SNP calling.
    null_loc = os.path.join(os.path.dirname(__file__), 'helper_files',
                            'combined_null1000000.txt')
    null_model = generate_snp_model(null_loc)

    # Step 1: build new counts table from all objects.
    s_final = SNPprofile()
    s_final.filename = args.output

    s1 = SNPprofile()
    print("loading " + args.input[0])
    s1.load(args.input[0])
    s_final.scaffold_list = s1.scaffold_list
    s_final.counts_table = copy.deepcopy(s1.counts_table)

    s2 = SNPprofile()
    print("loading " + args.input[1])
    s2.load(args.input[1])

    known_scaffolds = set(s_final.scaffold_list)
    for scaf in s2.scaffold_list:
        if scaf not in known_scaffolds:
            # The original message referenced an undefined name ``fn``,
            # turning this exit path into a NameError.
            sys.exit(
                "Error: scaffold " + scaf + " in " + args.input[1] +
                " not found in initial file. Your inStrain objects were probably not run on the same FASTA."
            )

    # NOTE(review): the tables are summed by index, which assumes both
    # profiles list their scaffolds in the same order - confirm.
    for scaf_idx, scaf_counts in enumerate(s2.counts_table):
        s_final.counts_table[scaf_idx] += scaf_counts

    # Step 2: call all SNPs for new object.
    allele_counts1 = {}
    allele_counts2 = {}
    snp_table = defaultdict(list)
    for scaf_idx, scaf in enumerate(
            tqdm(s_final.counts_table, desc='Calling new SNVs...')):
        for pos, counts in enumerate(scaf):  # 0 based positions!!
            call = call_snv_site(counts, min_cov=5, min_freq=0.05,
                                 model=null_model)
            # Falsy: no coverage at this position; -1: covered, not a SNP.
            if not call or call == -1:
                continue
            # NOTE(review): the first value returned by major_minor_allele
            # is stored as 'varBase' and the second as 'conBase', exactly
            # as before - confirm this is the intended order.
            first_base, second_base = major_minor_allele(counts)
            scaf_name = s_final.scaffold_list[scaf_idx]
            key = scaf_name + ":" + str(pos)
            snp_table['scaffold'].append(scaf_name)
            snp_table['position'].append(pos)
            snp_table['varBase'].append(first_base)
            snp_table['conBase'].append(second_base)
            allele_counts1[key] = s1.counts_table[scaf_idx][pos]
            allele_counts2[key] = s2.counts_table[scaf_idx][pos]

    # Step 3: save new FST_SNP table to disk.
    SNPTable = pd.DataFrame(snp_table)
    FstTable = defaultdict(list)
    for gene in tqdm(create_gene_index(args.gene_file),
                     desc="calculating fst"):
        snps = SNPTable[(SNPTable.scaffold == gene['scaf'])
                        & (SNPTable.position >= gene['start'])
                        & (SNPTable.position <= gene['end'])]
        snp_list = [
            scaffold + ":" + str(position)
            for scaffold, position in zip(snps['scaffold'], snps['position'])
        ]
        # Only continue if there are at least 3 snps in this gene.
        if len(snp_list) < 3:
            continue

        allele_counts_1 = [allele_counts1[snp] for snp in snp_list]
        allele_counts_2 = [allele_counts2[snp] for snp in snp_list]
        allel1 = allel.AlleleCountsArray(allele_counts_1)
        allel2 = allel.AlleleCountsArray(allele_counts_2)
        # A single window spanning every SNP of the gene.
        fst_h = allel.moving_hudson_fst(
            allel1, allel2, size=len(snp_list))[0]
        gene_length = 1 + gene['end'] - gene['start']
        nd_1 = np.sum(allel.mean_pairwise_difference(allel1)) / gene_length
        nd_2 = np.sum(allel.mean_pairwise_difference(allel2)) / gene_length

        FstTable['gene'].append(gene['name'])
        FstTable['snp_num'].append(len(snp_list))
        FstTable['fst'].append(fst_h)
        FstTable['pi_1'].append(nd_1)
        FstTable['pi_2'].append(nd_2)
        FstTable['cov_1'].append(np.mean(np.sum(allele_counts_1, axis=1)))
        FstTable['cov_2'].append(np.mean(np.sum(allele_counts_2, axis=1)))

    FstTable = pd.DataFrame(FstTable)
    print(np.mean(FstTable['fst']))
    FstTable.to_csv(args.output + '.Fst.tsv', index=False, sep='\t')
# Haploid input is converted to diploid genotypes before any statistics
# are computed.
if isHaploidVcfGenoArray(genos):
    sys.stderr.write("Detected haploid input. Converting into diploid individuals (combining haplotypes in order).\n")
    genos = diploidizeGenotypeArray(genos)
alleleCounts = genos.count_alleles()
# Remove all but mono/biallelic unmasked sites: a site is masked out when it
# is not flagged by is_biallelic() or when its fraction of called genotypes
# is below unmaskedGenoFracCutoff.
isBiallelic = alleleCounts.is_biallelic()
for i in range(len(isBiallelic)):
    if not (isBiallelic[i] and calledGenoFracAtSite(genos[i]) >= unmaskedGenoFracCutoff):
        # positions are 1-based, the mask is indexed 0-based
        unmasked[positions[i]-1] = False
# Keep only the SNPs whose position is still unmasked.
snpIndicesToKeep = [i for i in range(len(positions)) if unmasked[positions[i]-1]]
genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
positions = [positions[i] for i in snpIndicesToKeep]
# Collapse to two columns: the count of allele 0 and the largest count among
# the remaining alleles.
alleleCounts = allel.AlleleCountsArray([[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep])
statNames = ["pi", "thetaW", "tajD", "distVar","distSkew","distKurt","nDiplos","diplo_H1","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"]
subWinBounds = getSubWinBounds(chrLen, subWinSize)
# Two headers are built: ``header`` has one column per (statistic,
# sub-window) pair, ``statHeader`` one column per statistic.
header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
statHeader = "chrom start end".split()
for statName in statNames:
    statHeader.append(statName)
    for i in range(numSubWins):
        header.append("%s_win%d" %(statName, i))
statHeader = "\t".join(statHeader)
header = "\t".join(header)
# NOTE(review): the file is opened without a context manager; it is
# presumably closed later, outside this excerpt - confirm.
outFile=open(outfn,'w')
outFile.write(header+"\n")
pos = callset["variants/POS"][non_het_variants] ## The retained variant positions #######Plot 1: nucleotide diversity######### allele_counts = gt_clean.count_alleles() window_size = int((pos[-1] - pos[0]) / 100) # We want about 100 windows pi, windows, n_bases, n_counts = allel.windowed_diversity( pos, allele_counts, size=window_size, start=pos[0], stop=pos[-1] ) plot_windowed_pi(pi, windows) ######Plot 2: site frequency spectrum######## ## Filtering : only keep biallelic variants (at most two alleles segregate in the set of samples) max_2_alleles = [sum(row != 0) <= 2 for row in allele_counts] filtered_ac = allele_counts[max_2_alleles] bi_counts = allel.AlleleCountsArray(np.ndarray((filtered_ac.shape[0], 2), dtype=int)) index = 0 for row in filtered_ac: picked = [i for i in row if i != 0] assert len(picked) <= 2 if len(picked) == 1: picked = picked + [0] elif len(picked) == 0: picked = [0, 0] bi_counts[index] = picked index += 1 plot_sfs(bi_counts)
# Extract all variants that are homozygous for all samples.
# NOTE(review): the identifier appeared garbled as ``h**o`` (not a valid
# assignment target); it is restored to ``homo`` to match ``gt_homo`` -
# confirm that the array defined earlier in the file uses the same name.
homo = np.all(homo, axis=1)
gt_homo = gt_del.subset(homo)
#print(np.count_nonzero(gt_homo), 'structural variants homozygous for all individuals')
# Get all variant rows for gt_homo.
variants_pass = variants_pass[homo]
### Find all variants at which the subpop / sample genotypes are segregating.
ac1 = gt_homo.count_alleles(subpop=subpop1)
ac2 = gt_homo.count_alleles(subpop=subpop2)
acu = allel.AlleleCountsArray(ac1 + ac2)
flt = acu.is_segregating()
gt_seg = gt_homo[flt]
print(np.count_nonzero(flt), 'positions are homoyzgous and segregating')
# Get all variant data for segregating positions (flt=True).
variants_pass = variants_pass[flt]
variants_pass
### Calculate Hudson's Fst.
print('calculating Hudson Fst for each position')
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
# Third panel: PCA of the simulated samples, coloured by population.
ax3.scatter(simdf['PC1'], simdf['PC2'], c=pd.factorize(simdf['pop'])[0])
ax3.set_title("simulated")
ax3.set_xlabel("PC1")
ax3.set_ylabel("PC2")
fig.tight_layout()
#fig.savefig('fig/PCA_decoder_comp_mpl.pdf',bbox_inches='tight')
########################### site frequency spectrum #############################
# For each data source the rows labelled "YRI" are selected, summed per
# column, and paired with (100 - sum) to form a two-column allele counts
# array.
# NOTE(review): the constant 100 is presumably the number of sampled
# chromosomes in YRI - confirm.
realYRI = dc * 2
realYRI = realYRI[pred['pop'] == "YRI"]
YRI_ac_all = np.apply_along_axis(sum, 0, realYRI)
YRI_ac_all = np.array(YRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in YRI_ac_all], dtype="i")
YRI_ac_all = allel.AlleleCountsArray(np.transpose(np.vstack( (tmp, YRI_ac_all))))
# Same construction for the simulated data (transposed first).
simYRI = np.transpose(sim_dc)
simYRI = simYRI[simdf['pop'] == "YRI"]
simYRI_ac_all = np.apply_along_axis(sum, 0, simYRI)
simYRI_ac_all = np.array(simYRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in simYRI_ac_all], dtype="i")
simYRI_ac_all = allel.AlleleCountsArray( np.transpose(np.vstack((tmp, simYRI_ac_all))))
# Same construction for ``bingen``.
genYRI = bingen
genYRI = genYRI[pred['pop'] == "YRI"]
genYRI_ac_all = np.apply_along_axis(sum, 0, genYRI)
genYRI_ac_all = np.array(genYRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in genYRI_ac_all], dtype="i")
genYRI_ac_all = allel.AlleleCountsArray(
# Tail of a list comprehension whose opening lies before this excerpt; it
# selects the indices of the samples mapped to ``targetPop`` (samples missing
# from ``sampleToPop`` get the placeholder "popNotFound!").
    x for x in range(len(samples)) if sampleToPop.get(samples[x], "popNotFound!") == targetPop
]
genos = genos.subset(sel1=sampleIndicesToKeep)
alleleCounts = genos.count_alleles()
# Mask every position whose site is not flagged by is_biallelic().
isBiallelic = alleleCounts.is_biallelic()
for i in range(len(isBiallelic)):
    if not isBiallelic[i]:
        # positions are 1-based, the mask is indexed 0-based
        unmasked[positions[i] - 1] = False
# Keep only the SNPs whose position is still unmasked.
snpIndicesToKeep = [
    x for x in range(len(positions)) if unmasked[positions[x] - 1]
]
genos = genos.subset(sel0=snpIndicesToKeep)
haps = genos.to_haplotypes()
alleleCounts = allel.AlleleCountsArray(
    [alleleCounts[x] for x in snpIndicesToKeep])
# Re-order the allele columns of each retained site according to ``mapping``.
mapping = [mapping[x] for x in snpIndicesToKeep]
alleleCounts = alleleCounts.map_alleles(mapping)
positions = [positions[x] for x in snpIndicesToKeep]
statNames = [
    "pi", "thetaW", "tajD", "thetaH", "fayWuH", "HapCount", "H1", "H12", "H2/H1", "ZnS", "Omega", "iHSMean", "iHSMax", "iHSOutFrac", "nSLMean", "nSLMax", "nSLOutFrac", "distVar", "distSkew", "distKurt"
]
# Add one "<stat>-<summary>" name for every combination below.
for i in ["HAF", "HAFunique", "phi", "kappa", "SFS", "SAFE"]:
    for j in [
        "Mean", "Median", "Mode", "Lower95%", "Lower50%", "Upper50%", "Upper95%", "Max", "Var", "SD", "Skew", "Kurt"
    ]:
        statNames.append("%s-%s" % (i, j))
######## ## Weir & Cockerham's Fst pfor each locus a, b, c, = al.weir_cockerham_fst(gtseg, list(subpops.values())[1:]) # estimate theta (a.k.a. Fst) for each variant & allele directly: fst = a / (a + b + c) # compare Hudson's and Weir & Cockerham's per locus Fst: # only take variants that are segregating between the two pops acu = al.AlleleCountsArray(ac_subpops['S'][:] + ac_subpops['N'][:]) flt = acu.is_segregating() & (acu.max_allele() == 1) print('retaining', np.count_nonzero(flt), 'SNPs') ac1 = al.AlleleCountsArray(ac_subpops['S'].compress(flt, axis=0)[:, :2]) ac2 = al.AlleleCountsArray(ac_subpops['N'].compress(flt, axis=0)[:, :2]) genotype = gtsub.compress(flt, axis=0) #genotype pop1_idx = subpops['S'] pop2_idx = subpops['N'] a, b, c = al.weir_cockerham_fst(genotype, subpops=[pop1_idx, pop2_idx], max_allele=1) snp_fst_wc = (a / (a + b + c))[:, 0]
# Tail of a list comprehension whose opening lies before this excerpt; it
# selects the indices of the SNPs whose (1-based) position is unmasked.
    i for i in range(len(positionArray)) if unmasked[positionArray[i] - 1]
]
if len(unmaskedSnpIndices) == 0:
    # No usable SNP: start a new (empty) entry per statistic and fill every
    # sub-window with the values used for monomorphic data.
    for statName in statNames:
        statVals[statName].append([])
    for subWinIndex in range(numSubWins):
        for statName in statNames:
            fvTools.appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex)
else:
    positionArrayUnmaskedOnly = [
        positionArray[i] for i in unmaskedSnpIndices
    ]
    ac = genos.count_alleles()
    alleleCountsUnmaskedOnly = allel.AlleleCountsArray(
        np.array([ac[i] for i in unmaskedSnpIndices]))
    # Every retained site must have the same total count, equal to
    # sampleSize.
    sampleSizes = [sum(x) for x in alleleCountsUnmaskedOnly]
    assert len(set(sampleSizes)) == 1 and sampleSizes[0] == sampleSize
    if pMisPol > 0:
        alleleCountsUnmaskedOnly = fvTools.misPolarizeAlleleCounts(
            alleleCountsUnmaskedOnly, pMisPol)
    # dafs = alleleCountsUnmaskedOnly[:,1]/float(sampleSizes[0])
    unmaskedHaps = haps.subset(sel0=unmaskedSnpIndices)
    unmaskedGenos = genos.subset(sel0=unmaskedSnpIndices)
    precomputedStats = {}
    # Start a new (empty) entry per statistic for this instance.
    for statName in statNames:
        statVals[statName].append([])
    for subWinIndex in range(numSubWins):
        subWinStart, subWinEnd = subWinBounds[subWinIndex]
        # Fraction of the sub-window's positions that are unmasked
        # (bounds are 1-based and inclusive).
        unmaskedFrac = unmasked[subWinStart - 1:subWinEnd].count(True) / float(subWinLen)
# Allele counts fixture: the four-row pattern [2, 0], [1, 1], [1, 1], [0, 2]
# repeated; every row sums to 2.
archaic_allele_counts = allel.AlleleCountsArray([
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2],
    [2, 0], [1, 1], [1, 1], [0, 2]
])