def test_Fst__windowed(sample_size, n_cohorts, chunks):
    """Compare windowed ``Fst`` output with tskit (Nei) and scikit-allel (Hudson).

    A tree sequence is simulated, converted to a dataset, split into
    ``n_cohorts`` cohorts and windowed into fixed-size variant windows. The
    "Nei" estimator is checked against ``ts.Fst`` evaluated on matching
    windows, and the "Hudson" estimator against ``allel.moving_hudson_fst``.
    """
    variants_per_window = 25
    # Values can be close to zero, and the default value of atol isn't
    # appropriate for this.
    tolerance = 1e-8
    cohort_pairs = list(itertools.combinations(range(n_cohorts), 2))

    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=variants_per_window)

    # --- tskit reference (Nei estimator) ---
    observed = Fst(ds, estimator="Nei")["stat_Fst"].values

    # Use the variant positions as breakpoints so that every tskit window
    # holds a fixed number of variants.
    site_positions = ts.tables.sites.position
    interior_breaks = site_positions[::variants_per_window][1:]
    breakpoints = np.concatenate(([0], interior_breaks, [ts.sequence_length]))

    # Only off-diagonal entries are filled in; the diagonal stays NaN.
    expected = np.full(
        (len(breakpoints) - 1, n_cohorts, n_cohorts), np.nan
    )
    for a, b in cohort_pairs:
        pair_fst = ts.Fst(
            [subsets[a], subsets[b]], windows=breakpoints, span_normalise=False
        )
        expected[:, a, b] = pair_fst
        expected[:, b, a] = pair_fst

    np.testing.assert_allclose(observed, expected, atol=tolerance)

    # --- scikit-allel reference (Hudson estimator) ---
    hudson_ds = Fst(ds, estimator="Hudson")
    cohort_counts = hudson_ds.cohort_allele_count.values
    for a, b in cohort_pairs:
        ours = (
            hudson_ds["stat_Fst"]
            .sel(cohorts_0=f"co_{a}", cohorts_1=f"co_{b}")
            .values
        )
        theirs = allel.moving_hudson_fst(
            cohort_counts[:, a, :],
            cohort_counts[:, b, :],
            size=variants_per_window,
        )
        # scikit-allel has final window missing
        np.testing.assert_allclose(ours[:-1], theirs, atol=tolerance)
chrom=chrom, samples=metadata, numbers=numbers, ploidy=ploidy, qualflt=qualflt, missingfltprop=missingprop) #### Fst in windows #### for sus, res in comparisons: name = sus + "_" + res cohortText = f"{sus} v {res}" print(f"Calculating Fst values in sliding windows for {name}\n") for wname, size, step in zip(windownames, windowsizes, windowsteps): FstArray = allel.moving_hudson_fst(acsubpops[sus], acsubpops[res], size=size, step=step) midpoint = allel.moving_statistic(pos, np.median, size=size, step=step) cohortNoSpaceText = name + "." + wname rnaseqpop.plotWindowed( statName="Fst", cohortText=cohortText, cohortNoSpaceText=cohortNoSpaceText, values=FstArray, midpoints=midpoint, colour='dodgerblue', prefix="results/variantAnalysis/selection/fst",
def main(args):
    """Pool two SNP profiles, call SNVs on the pooled counts, write per-gene Fst.

    Args:
        args: Parsed command-line namespace. The attributes read here are:
            ``input``  - indexable with at least two profile paths; ``input[0]``
                         and ``input[1]`` are loaded with ``SNPprofile.load``.
            ``output`` - output prefix; results go to ``<output>.Fst.tsv``.
            ``gene_file`` - passed to ``create_gene_index`` to obtain genes
                         (each a mapping with ``scaf``, ``start``, ``end``
                         and ``name`` keys).

    Side effects:
        Prints progress messages and the mean Fst, and writes a tab-separated
        table with columns gene, snp_num, fst, pi_1, pi_2, cov_1, cov_2.
        Exits via ``sys.exit`` if the second profile contains a scaffold that
        the first one does not.
    """
    # Step 0: get null model for SNP calling.
    # os.path.join keeps the path relative when __file__ has no directory
    # component (plain concatenation would yield "/helper_files/...").
    null_loc = os.path.join(
        os.path.dirname(__file__), 'helper_files', 'combined_null1000000.txt')
    null_model = generate_snp_model(null_loc)

    # Step 1: build new counts table from all objects
    s_final = SNPprofile()
    s_final.filename = args.output

    s1 = SNPprofile()
    print("loading " + args.input[0])
    s1.load(args.input[0])
    s_final.scaffold_list = s1.scaffold_list
    # Deep copy so that pooling below does not modify s1's own counts.
    s_final.counts_table = copy.deepcopy(s1.counts_table)

    s2 = SNPprofile()
    print("loading " + args.input[1])
    s2.load(args.input[1])

    known_scaffolds = set(s_final.scaffold_list)
    for scaf in s2.scaffold_list:
        if scaf not in known_scaffolds:
            sys.exit(
                "Error: scaffold " + scaf + " in " + args.input[1] +
                " not found in initial file. Your inStrain objects were probably not run on the same FASTA."
            )

    # NOTE(review): counts are pooled by index, which presumes both profiles
    # list their scaffolds in the same order - confirm against SNPprofile.
    for scaf_idx, scaf_counts in enumerate(s2.counts_table):
        s_final.counts_table[scaf_idx] += scaf_counts

    # Step 2: call all SNPs for new object
    allele_counts1 = {}
    allele_counts2 = {}
    snp_table = defaultdict(list)
    for scaf_idx, scaf_counts in enumerate(
            tqdm(s_final.counts_table, desc='Calling new SNVs...')):
        for pos, counts in enumerate(scaf_counts):  # 0 based positions!!
            snp = call_snv_site(counts,
                                min_cov=5,
                                min_freq=0.05,
                                model=null_model)
            # Truthy means there was coverage at this position;
            # -1 means the covered site is not a SNP.
            if snp and snp != -1:
                scaffold = s_final.scaffold_list[scaf_idx]
                first_base, second_base = major_minor_allele(counts)
                snp_table['scaffold'].append(scaffold)
                snp_table['position'].append(pos)
                # NOTE(review): the first value returned by
                # major_minor_allele goes to 'varBase' and the second to
                # 'conBase'; this may be swapped - verify against the helper.
                snp_table['varBase'].append(first_base)
                snp_table['conBase'].append(second_base)

                site_key = f"{scaffold}:{pos}"
                allele_counts1[site_key] = s1.counts_table[scaf_idx][pos]
                allele_counts2[site_key] = s2.counts_table[scaf_idx][pos]

    # Step 3: Save new FST_SNP table to disk.
    # Explicit columns keep the frame usable even when no SNP was called.
    SNPTable = pd.DataFrame(
        snp_table, columns=['scaffold', 'position', 'varBase', 'conBase'])

    fst_rows = defaultdict(list)
    for gene in tqdm(create_gene_index(args.gene_file),
                     desc="calculating fst"):
        snps = SNPTable[(SNPTable.scaffold == gene['scaf'])
                        & (SNPTable.position >= gene['start'])
                        & (SNPTable.position <= gene['end'])]
        snp_list = [
            f"{scaffold}:{position}"
            for scaffold, position in zip(snps['scaffold'], snps['position'])
        ]

        # only continue if there are at least 3 snps in this gene
        if len(snp_list) < 3:
            continue

        allele_counts_1 = [allele_counts1[snp] for snp in snp_list]
        allele_counts_2 = [allele_counts2[snp] for snp in snp_list]
        allel1 = allel.AlleleCountsArray(allele_counts_1)
        allel2 = allel.AlleleCountsArray(allele_counts_2)

        # A single window spanning every SNP in the gene gives one Fst value.
        fst_h = allel.moving_hudson_fst(allel1, allel2,
                                        size=len(snp_list))[0]

        # Summed pairwise difference, normalised by the gene length.
        gene_length = 1 + gene['end'] - gene['start']
        nd_1 = np.sum(allel.mean_pairwise_difference(allel1)) / gene_length
        nd_2 = np.sum(allel.mean_pairwise_difference(allel2)) / gene_length

        fst_rows['gene'].append(gene['name'])
        fst_rows['snp_num'].append(len(snp_list))
        fst_rows['fst'].append(fst_h)
        fst_rows['pi_1'].append(nd_1)
        fst_rows['pi_2'].append(nd_2)
        fst_rows['cov_1'].append(np.mean(np.sum(allele_counts_1, axis=1)))
        fst_rows['cov_2'].append(np.mean(np.sum(allele_counts_2, axis=1)))

    # Explicit columns so that an empty result still has an 'fst' column and
    # a header-only file is written instead of raising KeyError.
    FstTable = pd.DataFrame(
        fst_rows,
        columns=['gene', 'snp_num', 'fst', 'pi_1', 'pi_2', 'cov_1', 'cov_2'])
    print(np.mean(FstTable['fst']))
    FstTable.to_csv(args.output + '.Fst.tsv', index=False, sep='\t')