def calc_fst_persite(gt_array_fst, fst_pop_indicies, fst_type):
    """Compute per-site FST with scikit-allel.

    Parameters
    ----------
    gt_array_fst
        Genotype array accepted by ``allel.weir_cockerham_fst`` and exposing
        ``count_alleles(subpop=...)``.
    fst_pop_indicies
        Sequence of sample-index collections, one per subpopulation. The
        Hudson estimator only uses the first two entries.
    fst_type
        ``"wc"`` for Weir & Cockerham (1984) or ``"hudson"`` for Hudson (1992).

    Returns
    -------
    Array with one FST value per site, or ``None`` when ``fst_type`` is
    neither ``"wc"`` nor ``"hudson"``.
    """
    if fst_type == "wc":
        # WC84: a, b, c are the variance components returned by scikit-allel;
        # they are summed along axis 1 before forming the per-site ratio.
        a, b, c = allel.weir_cockerham_fst(gt_array_fst,
                                           subpops=fst_pop_indicies)
        a_site = np.sum(a, axis=1)
        b_site = np.sum(b, axis=1)
        c_site = np.sum(c, axis=1)
        return a_site / (a_site + b_site + c_site)

    if fst_type == "hudson":
        # Hudson92: allele counts per population, then numerator/denominator
        # per site as in the scikit-allel documentation.
        counts_first = gt_array_fst.count_alleles(subpop=fst_pop_indicies[0])
        counts_second = gt_array_fst.count_alleles(subpop=fst_pop_indicies[1])
        numerator, denominator = allel.hudson_fst(counts_first, counts_second)
        return numerator / denominator

    # Unsupported estimator name: nothing is computed.
    return None
def get_fst(s1, s2):
    """Print the Hudson FST between two populations.

    ``s1`` and ``s2`` must expose ``count_alleles()``. The estimate is the sum
    of the per-site numerators divided by the sum of the per-site
    denominators; its absolute value is printed rounded to two decimals.
    Nothing is returned.
    """
    counts_first = s1.count_alleles()
    counts_second = s2.count_alleles()
    numerator, denominator = allel.hudson_fst(counts_first, counts_second)
    fst = np.sum(numerator) / np.sum(denominator)
    print(f'The F_st value between the two populations is {round(np.abs(fst),2)}')
    return
def print_pi(self, tree_sequence, indices, populations):
    """Write per-population diversity and pairwise Hudson FST to the 'pi' writer.

    Does nothing when ``self.pi_needed()`` is false. Otherwise writes a header
    line followed by one data line to ``self.writers['pi']``:

    * for each of AF, EU, AS: the summed mean pairwise difference divided by
      the sequence length;
    * for each of AF-EU, AF-AS, EU-AS: sum of Hudson FST numerators divided
      by the sum of denominators.

    Parameters
    ----------
    tree_sequence
        Object providing ``get_sequence_length()`` and ``genotype_matrix()``.
    indices
        Per-haplotype population index, aligned with the columns of the
        genotype matrix.
    populations
        Mapping from population name ('AF', 'EU', 'AS') to population index.
    """
    if not self.pi_needed():
        return

    writer = self.writers['pi']
    # Fixed population order so columns do not depend on dictionary ordering.
    pops = 'AF EU AS'.split()
    pairs = (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS'))
    indices = np.array(indices)

    writer.write('\t'.join(pops) + '\t')
    writer.write('AF-EU\tAF-AS\tEU-AS\n')

    length = tree_sequence.get_sequence_length()
    haplotypes = tree_sequence.genotype_matrix()

    # Allele counts are needed for both the diversity and the FST columns;
    # build them once per population instead of once per use.
    allele_counts = {
        pop: allel.HaplotypeArray(
            haplotypes[:, indices == populations[pop]]).count_alleles()
        for pop in pops
    }

    for pop in pops:
        mpd = allel.mean_pairwise_difference(allele_counts[pop])
        writer.write(f'{mpd.sum()/length:.5}\t')

    for first, second in pairs:
        num, den = allel.hudson_fst(allele_counts[first],
                                    allele_counts[second])
        writer.write(f'{num.sum() / den.sum():.5}\t')

    # The data line keeps its trailing tab before the newline, as before.
    writer.write('\n')
def main(vcffile, pop1, pop2, binwidth, stepsize, outprefix):
    """Compute per-site and sliding-window Hudson Fst between two populations.

    Fst is computed with ``allel.hudson_fst``, i.e. the method of Hudson
    (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    vcffile
        Path of the VCF file, read with ``allel.read_vcf``.
    pop1, pop2
        Paths of text files listing one sample name per line.
    binwidth
        Window width, in the same units as the VCF positions.
    stepsize
        Step between successive window offsets.
    outprefix
        Prefix of the output files.

    Side effects
    ------------
    Prints the genome-wide mean Fst (sum of numerators / sum of denominators)
    and writes ``<outprefix>_persite.tsv.gz`` and
    ``<outprefix>_meanFst.tsv.gz``.
    """
    # Read the sample lists; sets give O(1) membership tests below and the
    # context managers make sure the files are closed.
    with open(pop1) as handle:
        pop1_samples = {line.strip() for line in handle}
    with open(pop2) as handle:
        pop2_samples = {line.strip() for line in handle}

    callset = allel.read_vcf(vcffile)
    allsamples = callset['samples']
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])

    # Select every site in the VCF.
    # NOTE(review): the mask is one element longer than the number of
    # variants; left unchanged because getAC is not visible here - confirm.
    variant_selection = np.full((genotypes.shape[0] + 1), True)

    sample_selection = [x in pop1_samples for x in allsamples]
    ac1 = getAC(genotypes, variant_selection, sample_selection)
    sample_selection = [x in pop2_samples for x in allsamples]
    ac2 = getAC(genotypes, variant_selection, sample_selection)

    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    meanFst = np.sum(num) / np.sum(den)
    print('meanFst: %s' % meanFst)

    df = pd.DataFrame({
        'chrom': callset['variants/CHROM'],
        'pos': callset['variants/POS'],
        'hudson_Fst': fst,
    })
    df.to_csv(f'{outprefix}_persite.tsv.gz', sep='\t', index=False,
              na_rep='nan', compression='gzip')

    # Numerator/denominator are only needed for the window averages, so they
    # are added after the per-site table has been written.
    df['num'] = num
    df['den'] = den

    # Sliding bins: one set of non-overlapping windows per offset.
    windows = []
    for offset in range(0, binwidth, stepsize):
        df['bin_index'] = ((df['pos'].values - 1) - offset) // binwidth
        for (bin_chrom, bin_index), gdf in df.groupby(by=['chrom', 'bin_index']):
            start = bin_index * binwidth + offset + 1
            # Skip the leading windows that are shorter than binwidth.
            # Positions are 1-based, so a window starting at 0 is truncated
            # as well (it occurs when offset == binwidth - 1).
            if start < 1:
                continue
            end = start + binwidth - 1
            sum_num = gdf['num'].sum()
            sum_den = gdf['den'].sum()
            window_fst = sum_num / sum_den if sum_den > 0 else np.nan
            windows.append([bin_chrom, start, end, gdf.shape[0], window_fst])

    bdf = pd.DataFrame(
        windows,
        columns=['chrom', 'start', 'end', 'n_snp', 'meanFst'],
    ).sort_values(by=['chrom', 'start'])
    bdf.to_csv(f'{outprefix}_meanFst.tsv.gz', index=False,
               compression='gzip', sep='\t', float_format='%.3f')
def calc_fst(gt_array_fst, fst_pop_indicies, fst_type):
    """Compute a multi-site FST and its summed components with scikit-allel.

    Parameters
    ----------
    gt_array_fst
        Genotype array accepted by ``allel.weir_cockerham_fst`` and exposing
        ``count_alleles(subpop=...)``; ``len()`` of it is reported as the
        number of sites.
    fst_pop_indicies
        Sequence of sample-index collections, one per subpopulation. The
        Hudson estimator only uses the first two entries.
    fst_type
        ``"wc"`` for Weir & Cockerham (1984) or ``"hudson"`` for Hudson (1992).

    Returns
    -------
    tuple ``(fst, a, b, c, n_sites)``
        For ``"wc"``, ``a``, ``b``, ``c`` are the NaN-ignoring sums of the
        variance components and ``fst = a / (a + b + c)``.
        For ``"hudson"``, the same layout is reused: ``a`` is the summed
        numerator, ``b`` the summed denominator and ``c`` a zero placeholder.
        ``fst`` is the string ``"NA"`` when the denominator is not positive.
        ``None`` is returned for any other ``fst_type``.
    """
    # WC 84
    if fst_type == "wc":
        a, b, c = allel.weir_cockerham_fst(gt_array_fst,
                                           subpops=fst_pop_indicies)

        # Variance component sums, as plain Python floats.
        a = np.nansum(a).tolist()
        b = np.nansum(b).tolist()
        c = np.nansum(c).tolist()
        n_sites = len(gt_array_fst)

        total = a + b + c
        fst = a / total if total > 0 else "NA"
        return (fst, a, b, c, n_sites)

    # Hudson 92
    if fst_type == "hudson":
        # Following the scikit-allel docs: allele counts for each population.
        ac1 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[0])
        ac2 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[1])

        # Hudson FST has two components (numerator & denominator).
        num, den = allel.hudson_fst(ac1, ac2)
        c = 0  # for compatibility with aggregation code for WC 84

        num = np.nansum(num).tolist()
        den = np.nansum(den).tolist()
        n_sites = len(gt_array_fst)

        # Guard on the denominator itself. Testing (num + den) would report
        # "NA" for valid negative estimates and would not protect the
        # division when den is zero.
        fst = num / den if den > 0 else "NA"

        # Same abc format as WC84: 'a' is the numerator, 'b' the denominator
        # and 'c' a zero placeholder.
        return (fst, num, den, c, n_sites)

    # Unsupported estimator name: nothing is computed.
    return None
def test_Fst__Hudson(sample_size):
    """Check the windowed Hudson Fst against scikit-allel's ``hudson_fst``.

    scikit-allel can only calculate Fst for pairs of cohorts (populations),
    so exactly two cohorts are used, and all variants are placed in a single
    window so one value can be compared.
    """
    tree_seq = simulate_ts(sample_size)
    dataset, _ = add_cohorts(ts_to_dataset(tree_seq), tree_seq, 2)

    # A window as wide as the dataset gives a single window.
    dataset = window_by_variant(dataset, size=dataset.dims["variants"])
    dataset = Fst(dataset, estimator="Hudson")
    observed = dataset.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # Reference value from scikit-allel, using the same cohort allele counts.
    cohort_counts = dataset.cohort_allele_count.values
    num, den = hudson_fst(cohort_counts[:, 0, :], cohort_counts[:, 1, :])
    expected = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(observed, expected)
def compute_fst(raw):
    """Return the average Hudson FST between the two halves of the samples.

    https://scikit-allel.readthedocs.io/en/stable/stats/fst.html

    ``raw`` is expected to be already transposed so that axis 0 indexes
    variants and axis 1 indexes samples. A trailing axis of length 1 is added
    before building the ``GenotypeArray``. The first ``nsam // 2`` samples
    form one population and the remaining samples the other. The result is
    the sum of per-site numerators over the sum of per-site denominators.
    """
    n_samples = raw.shape[1]
    half = n_samples // 2

    genotypes = allel.GenotypeArray(np.expand_dims(raw, axis=2).astype('i'))

    # Allele counts for each half of the samples.
    counts_first = genotypes.count_alleles(subpop=range(half))
    counts_second = genotypes.count_alleles(subpop=range(half, n_samples))

    numerator, denominator = allel.hudson_fst(counts_first, counts_second)
    return np.sum(numerator) / np.sum(denominator)
def calc_fst(mseqs):
    """Compute pairwise Hudson FST statistics between all groups in ``mseqs``.

    Parameters
    ----------
    mseqs
        Mapping keyed by group; passed to ``count_allele`` to obtain
        per-group allele counts.

    Returns
    -------
    (FST_mat, groups)
        ``groups`` is the list of keys of ``mseqs``. ``FST_mat`` is a square
        array indexed like ``groups``: for ``i < j`` the upper triangle
        ``[i, j]`` holds the NaN-ignoring mean of the per-site FST and the
        lower triangle ``[j, i]`` holds its NaN-ignoring standard deviation.
        The diagonal stays zero. Each pair is also reported through ``cout``.
    """
    groups = list(mseqs.keys())
    len_grp = len(groups)
    FST_mat = np.zeros((len_grp, len_grp))
    allele_counts = count_allele(mseqs)

    for i, j in itertools.combinations(range(len_grp), 2):
        ac1 = allele_counts[groups[i]]
        ac2 = allele_counts[groups[j]]

        # Sites with a zero denominator give NaN/inf; silence the warnings
        # for the division and compute the per-site ratio only once.
        with np.errstate(divide='ignore', invalid='ignore'):
            num, den = allel.hudson_fst(ac1, ac2)
            site_fst = num / den

        # Mean of per-site ratios (not np.sum(num) / np.sum(den)).
        FST_mat[i, j] = np.nanmean(site_fst)
        FST_mat[j, i] = np.nanstd(site_fst)
        cout('%5.4f +- %5.4f : %s <> %s' %
             (FST_mat[i, j], FST_mat[j, i], groups[i], groups[j]))

    return FST_mat, groups
def test_Fst__Hudson(sample_size):
    """Check the windowed Hudson Fst against scikit-allel's ``hudson_fst``.

    scikit-allel can only calculate Fst for pairs of cohorts (populations),
    so exactly two cohorts are used, and all variants are placed in a single
    window so one value can be compared.
    """
    cohort_count = 2
    tree_seq = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42)
    dataset = ts_to_dataset(tree_seq)  # type: ignore[no-untyped-call]
    dataset, _ = add_cohorts(dataset, tree_seq, cohort_count)  # type: ignore[no-untyped-call]

    # A window as wide as the dataset gives a single window.
    dataset = window(dataset, size=dataset.dims["variants"])
    dataset = Fst(dataset, estimator="Hudson")
    observed = dataset.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # Reference value from scikit-allel, using the same cohort allele counts.
    cohort_counts = dataset.cohort_allele_count.values
    num, den = hudson_fst(cohort_counts[:, 0, :], cohort_counts[:, 1, :])
    expected = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(observed, expected)
def fst(p1, pos, gt, quants):
    """Summarise per-SNP Hudson FST over sites segregating in both populations.

    Allele counts for the two populations come from
    ``get_ac_seg(p1, pos, gt)``. Only sites that are segregating in both
    populations are kept, and ``allel.hudson_fst`` numerator / denominator is
    evaluated per SNP.

    Parameters
    ----------
    p1, pos, gt
        Forwarded unchanged to ``get_ac_seg``.
    quants
        Sequence of quantiles. A negative first element requests the mean
        instead of quantiles.

    Returns
    -------
    list or ndarray
        ``[nanmean]`` of the per-SNP values when ``quants[0] < 0``, otherwise
        ``np.nanquantile`` of the per-SNP values at ``quants``.
    """
    # The third value returned by get_ac_seg is not used here.
    ac1, ac2, pos_s = get_ac_seg(p1, pos, gt)

    # Keep sites segregating in both populations.
    both_segregating = ac1.is_segregating() & ac2.is_segregating()
    ac1_kept = ac1.compress(both_segregating, axis=0)
    ac2_kept = ac2.compress(both_segregating, axis=0)

    numerator, denominator = allel.hudson_fst(ac1_kept, ac2_kept)
    per_snp = numerator / denominator

    if quants[0] < 0:
        return [np.nanmean(per_snp)]
    return np.nanquantile(per_snp, quants)
def calc_site_fst(mseqs, nan_to_zero=False):
    """Compute per-site Hudson FST for every pair of groups in ``mseqs``.

    Parameters
    ----------
    mseqs
        Mapping keyed by group; passed to ``count_allele`` to obtain
        per-group allele counts.
    nan_to_zero
        When true, the per-site values are passed through
        ``np.nan_to_num`` before being returned.

    Returns
    -------
    list of ``(label, fst)`` tuples, one per pair of groups in key order,
    where ``label`` is ``'<group_i> <> <group_j>'`` and ``fst`` is the
    per-site array.
    """
    groups = list(mseqs.keys())
    FST_sites = []
    allele_counts = count_allele(mseqs)

    for first, second in itertools.combinations(groups, 2):
        # Sites with a zero denominator give NaN/inf; silence the warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            num, den = allel.hudson_fst(allele_counts[first],
                                        allele_counts[second])
            site_fst = num / den

        if nan_to_zero:
            site_fst = np.nan_to_num(site_fst)

        FST_sites.append(('%s <> %s' % (first, second), site_fst))

    return FST_sites
def select(self, haplotypes, groups, haplotest, k=None):
    """Select SNP positions that separate the populations of the guide tree.

    For every ``(level, pop1, pop2)`` split yielded by
    ``traverse(self.guide_tree)``, per-site Hudson FST between the two sides
    is computed and the ``k`` top-ranked sites are kept. When the best FST
    is below ``self.min_fst``, alternative SNP sets are searched instead
    (``FixSNPSelectorLK`` over a growing number of sites, then
    ``self.select_2``); if none is found the split is skipped.

    Parameters
    ----------
    haplotypes
        Array indexable by a boolean mask along its first axis.
    groups
        Group label of each row of ``haplotypes``.
    haplotest
        Not used by this method.
    k
        Redundancy parameter: number of sites kept per split. ``None`` and
        ``0`` are treated as ``1``.

    Returns
    -------
    tuple ``(L, None, {})`` where ``L`` is the sorted array of unique
    selected positions.
    """
    # We use k for redundancy parameters.
    if k == 0 or k is None:
        k = 1

    candidate_L = []  # [ (pos, rank, no_actual_pops) ]

    # We traverse through the tree.
    for (level, pop1, pop2) in traverse(self.guide_tree):

        n_pops = len(pop1) + len(pop2)
        haplotypes1 = haplotypes[np.isin(groups, pop1)]
        haplotypes2 = haplotypes[np.isin(groups, pop2)]

        # Small populations are only reported; processing continues.
        if len(haplotypes1) < 4:
            cerr('[I - insufficient population size for %s]' % pop1)
        if len(haplotypes2) < 4:
            cerr('[I - insufficient population size for %s]' % pop2)

        # Convert haplotypes to allele counts.
        ac1 = count_allele(haplotypes1)
        ac2 = count_allele(haplotypes2)

        num, den = allel.hudson_fst(ac1, ac2)

        # NOTE: the line below avoids warning (invalid value in true_divide)
        # when den == 0, which should be perfectly ok for FST calculation.
        den[den == 0] = -1
        fst = num / den

        # Check for sites reaching the "ultimate" FST threshold.
        ultimate_fst_pos = np.nonzero(fst >= self.ultimate_fst)[0]
        if len(ultimate_fst_pos) > 0:
            self.log('FST: %3.2f at %s for pop %s <> %s' %
                     (self.ultimate_fst, str(ultimate_fst_pos), pop1, pop2))

        if len(ultimate_fst_pos) > k:
            # More ultimate sites than needed: rank them by priority when
            # available, otherwise in random order.
            if self.priority is not None:
                ultimate_priority = self.priority[ultimate_fst_pos]
                sortidx = ultimate_fst_pos[np.argsort(ultimate_priority)]
            else:
                np.random.shuffle(ultimate_fst_pos)
                sortidx = ultimate_fst_pos
        else:
            # NOTE(review): NaN values in fst are not zeroed here, and
            # np.argsort places NaN last - confirm that is intended.
            sortidx = np.argsort(fst)

        # Get the k top-ranked sites.
        highest_fst_pos = sortidx[-k:]
        highest_fst_val = fst[highest_fst_pos]

        # Check suitability of SNPs.
        snplist, F = None, -1
        if highest_fst_val.max() < self.min_fst:

            if self.max_leaf_snp > k:
                X_train = np.append(haplotypes1, haplotypes2, axis=0)
                y_train = np.array([1] * len(haplotypes1) +
                                   [2] * len(haplotypes2))

                best_iteration = (-1, None)
                for i in range(k, self.max_leaf_snp):
                    # NOTE(review): this slice leaves out the last
                    # (top-ranked) entry of sortidx, unlike
                    # highest_fst_pos above - confirm that is intended.
                    features = sortidx[-(i + 1):-1]

                    model = FixSNPSelectorLK('dummy', snpindex=features)
                    lk_predictions, snplist, _, params = model.fit_and_predict(
                        X_train, y_train, X_train, len(features))
                    scores = calculate_scores(y_train, lk_predictions)

                    F = scores.loc[scores['REG'] == 'MIN', 'MCC'].values[0]
                    if best_iteration[0] < F:
                        best_iteration = (F, snplist)

                snplist, F = best_iteration[1], best_iteration[0]

            # Second approach; keep whichever scores higher.
            snplist_2, F_2 = self.select_2(haplotypes1, haplotypes2)
            if F_2 > F:
                snplist, F = snplist_2, F_2

            if snplist is not None:
                self.log('F: %5.4f SNP: %d for pop %s <> %s => %s' %
                         (F, len(snplist), pop1, pop2, snplist))
                for p in snplist:
                    candidate_L.append((p, level, n_pops))
                continue

            # TODO: 2nd approach: find 2 SNPs with highest r^2(st)
            # eg r^2 subpopulation vs r^2 total population

            # If snplist is None, just provide warning notice and skip
            # this node!
            else:
                self.log('low FST = %5.4f < %5.4f for %s vs %s; skipping' %
                         (highest_fst_val.max(), self.min_fst, pop1, pop2))
                continue

        # Append to candidate_L.
        for p in highest_fst_pos:
            candidate_L.append((p, level, n_pops))
        self.log('FST: %s SNP: %d for pop %s <> %s => %s' %
                 (str(highest_fst_val), len(highest_fst_pos), pop1, pop2,
                  str(highest_fst_pos)))

    # Process candidate_L: unique positions in ascending order.
    L = np.unique(np.array(sorted([x[0] for x in candidate_L])))

    # Return SNP positions.
    return (L, None, {})
# Keep only the genotype rows selected by the boolean filter `flt`.
gt_seg = gt_homo[flt]
print(np.count_nonzero(flt), 'positions are homoyzgous and segregating')

# get all variant data for segregating positions (flt=True)
variants_pass = variants_pass[flt]
# Bare expression: has no effect when run as a script (it only displays the
# value in an interactive session).
variants_pass

### calc hudson Fst
print('calculating Hudson Fst for each position')

# Allele counts for each subpopulation, then the per-site numerator and
# denominator of the Hudson estimator.
ac1 = gt_seg.count_alleles(subpop=subpop1)
ac2 = gt_seg.count_alleles(subpop=subpop2)
num, den = allel.hudson_fst(ac1, ac2)

# Replace NaN in both components by zero before dividing.
# NOTE(review): where the denominator ends up as 0 the division still gives
# NaN or inf - confirm this is the intended handling.
num_fix = np.nan_to_num(num)
den_fix = np.nan_to_num(den)
fst = num_fix / den_fix

### append Fst to variants
# Fst array to dataframe
fst_df = pd.DataFrame(fst, columns=['Fst_hudson'])
# set index of Fst to index of variants_pass
fst_df.index = variants_pass.index
def select(self, haplotypes, groups, haplotest, k=None):
    """Select SNP positions that separate the populations of the guide tree.

    For every ``(level, pop1, pop2)`` split yielded by
    ``traverse(self.guide_tree)``, per-site Hudson FST between the two sides
    is computed and the ``k`` sites with the highest FST are kept. When the
    best FST is below ``self.min_fst``, ``self.select_2`` is tried first and
    its SNPs are used instead if it returns any.

    Parameters
    ----------
    haplotypes
        Array indexable by a boolean mask along its first axis.
    groups
        Group label of each row of ``haplotypes``.
    haplotest
        Not used by this method.
    k
        Redundancy parameter: number of sites kept per split. ``None`` and
        ``0`` are treated as ``1``.

    Returns
    -------
    tuple ``(L, None, {})`` where ``L`` is the sorted array of unique
    selected positions.
    """
    # We use k for redundancy parameters.
    if k == 0 or k is None:
        k = 1

    candidate_L = []  # [ (pos, rank, no_actual_pops) ]

    # We traverse through the tree.
    for (level, pop1, pop2) in traverse(self.guide_tree):

        n_pops = len(pop1) + len(pop2)
        haplotypes1 = haplotypes[np.isin(groups, pop1)]
        haplotypes2 = haplotypes[np.isin(groups, pop2)]

        # Small populations are only reported; processing continues. The
        # reported size is that of the subpopulation itself.
        if len(haplotypes1) < 4:
            cerr('[I - insufficient population size for %s -> %d]'
                 % (pop1, len(haplotypes1)))
        if len(haplotypes2) < 4:
            cerr('[I - insufficient population size for %s -> %d]'
                 % (pop2, len(haplotypes2)))

        # Convert haplotypes to allele counts.
        ac1 = count_allele(haplotypes1)
        ac2 = count_allele(haplotypes2)

        num, den = allel.hudson_fst(ac1, ac2)

        # den == 0 is perfectly ok for the FST calculation: the resulting
        # NaN values are replaced by 0, so the warning is silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            fst = num / den
        fst[np.isnan(fst)] = 0
        sortidx = np.argsort(fst)

        # Get the k highest FST sites. argsort is ascending, so these are
        # the last k entries (a [-(k+1):-1] slice would skip the maximum).
        highest_fst_pos = sortidx[-k:]
        highest_fst_val = fst[highest_fst_pos]

        # Check suitability of SNPs.
        if highest_fst_val.max() < self.min_fst:

            snplist, F = self.select_2(haplotypes1, haplotypes2)
            if snplist:
                self.log('F: %5.4f SNP: %d for pop %s <> %s'
                         % (F, len(snplist), pop1, pop2))

                for p in snplist:
                    candidate_L.append((p, level, n_pops))
                continue

            # 2nd approach: find 2 SNPs with highest r^2(st)
            # eg r^2 subpopulation vs r^2 total population
            else:
                # The low-FST sites are still used below.
                self.log('low FST = %5.4f for %s vs %s'
                         % (highest_fst_val.max(), pop1, pop2))

        # Append to candidate_L.
        for p in highest_fst_pos:
            candidate_L.append((p, level, n_pops))

    # Process candidate_L: unique positions in ascending order.
    L = np.unique(np.array(sorted([x[0] for x in candidate_L])))

    # Return SNP positions.
    return (L, None, {})
def traditional_stats(data):
    """
    Calculates lots of (mostly) traditional statistics, that are summaries
    of the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function). The
        fields read here are ``allele_counts`` and ``genotypes`` (both keyed
        by population name) and ``positions``.

    Returns
    ---------
    Nested dictionary of statistics: ``stats[statistic][key]`` where ``key``
    is a population name for one-way and three-way statistics and a
    ``"popA_popB"`` comparison name for two-way statistics.
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]

    stats = {
        "sfs_mean": {},
        "diversity": {},
        "wattersons_theta": {},
        "tajimas_d": {},
        "observed_heterozygosity": {},
        "expected_heterozygosity": {},
        "segregating_sites": {},
        "monomorphic_sites": {},
        "roh_mean": {},
        "roh_iqr": {},
        "r2": {},
        "f3": {},
        "divergence": {},
        "fst": {},
        "f2": {},
    }

    for pop in pop_names:
        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(data.allele_counts[pop])
        stats["diversity"][pop] = allel.sequence_diversity(
            data.positions, data.allele_counts[pop])
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, data.allele_counts[pop])
        stats["tajimas_d"][pop] = allel.tajima_d(data.allele_counts[pop],
                                                 data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            data.allele_counts[pop].to_frequencies(), ploidy=2).mean()
        # Stored per population; assigning to stats["segregating_sites"]
        # directly would replace the dictionary with a single number.
        stats["segregating_sites"][pop] = data.allele_counts[
            pop].count_segregating()

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = data.allele_counts[
                pop].count_non_segregating()

            # Three way statistics: this population against the two other
            # real populations.
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(data.allele_counts[pop],
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in ["domestic_wild", "domestic_captive", "wild_captive"]:
        p = comparison.split("_")
        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, data.allele_counts[p[0]],
            data.allele_counts[p[1]])

        num, den = allel.hudson_fst(data.allele_counts[p[0]],
                                    data.allele_counts[p[1]])
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(
            data.allele_counts[p[0]], data.allele_counts[p[1]]).mean()

    return stats