def test_mean_pairwise_diversity(self):
    # start with simplest case, two haplotypes, one pairwise comparison
    h = HaplotypeArray([[0, 0],
                        [1, 1],
                        [0, 1],
                        [1, 2],
                        [0, -1],
                        [-1, -1]])
    ac = h.count_alleles()
    expect = [0, 0, 1, 1, -1, -1]
    actual = allel.mean_pairwise_difference(ac, fill=-1)
    aeq(expect, actual)

    # four haplotypes, six pairwise comparisons
    h = HaplotypeArray([[0, 0, 0, 0],
                        [0, 0, 0, 1],
                        [0, 0, 1, 1],
                        [0, 1, 1, 1],
                        [1, 1, 1, 1],
                        [0, 0, 1, 2],
                        [0, 1, 1, 2],
                        [0, 1, -1, -1],
                        [-1, -1, -1, -1]])
    ac = h.count_alleles()
    expect = [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
    actual = allel.mean_pairwise_difference(ac, fill=-1)
    assert_array_almost_equal(expect, actual)
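# A minimal standalone sketch (not part of the original test suite) showing
# where the expected values above come from: for a biallelic site with allele
# counts (3, 1) across n=4 haplotypes there are C(4,2)=6 pairs, of which
# 3*1=3 differ, hence the expected 3/6; counts (2, 2) give 2*2=4 differing
# pairs, hence 4/6.
import allel
import numpy as np

ac = allel.AlleleCountsArray([[3, 1], [2, 2]])
np.testing.assert_allclose(allel.mean_pairwise_difference(ac), [3 / 6, 4 / 6])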
def test_diversity__windowed(sample_size):
    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(
        ds, ts, cohort_key_names=["cohorts"])  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    ds = diversity(ds)
    div = ds["stat_diversity"].sel(cohorts="co_0").compute()

    # Calculate diversity using tskit windows
    # Find the variant positions so we can have windows with a fixed number of variants
    positions = ts.tables.sites.position
    windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length]))
    ts_div = ts.diversity(windows=windows, span_normalise=False)
    np.testing.assert_allclose(div, ts_div)

    # Calculate diversity using scikit-allel moving_statistic
    # (Don't use windowed_diversity, since it treats the last window differently)
    ds = count_variant_alleles(ts_to_dataset(ts))  # type: ignore[no-untyped-call]
    ac = ds["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference(ac, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=25)
    np.testing.assert_allclose(div[:-1], ska_div)  # scikit-allel has final window missing
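# A standalone sketch of the identity this test relies on, assuming msprime is
# available (`simulate_ts`, `add_cohorts`, and `window` above are sgkit test
# helpers not shown here): tskit's span-unnormalised site diversity equals the
# sum over variants of scikit-allel's per-variant mean pairwise differences.
import allel
import msprime
import numpy as np

ts = msprime.sim_ancestry(10, sequence_length=10_000, random_seed=1)
ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=1)
ac = allel.HaplotypeArray(ts.genotype_matrix()).count_alleles()
ska = allel.mean_pairwise_difference(ac, fill=0).sum()
np.testing.assert_allclose(ska, ts.diversity(span_normalise=False))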
def print_pi(self, tree_sequence, indices, populations):
    if not self.pi_needed():
        return
    writer = self.writers['pi']
    # fix the population order explicitly instead of relying on dictionary key order
    pops = 'AF EU AS'.split()
    indices = np.array(indices)
    writer.write('\t'.join(pops) + '\t')
    writer.write('AF-EU\tAF-AS\tEU-AS\n')
    length = tree_sequence.get_sequence_length()
    haplotypes = tree_sequence.genotype_matrix()

    # per-population nucleotide diversity (pi): sum of per-site mean pairwise
    # differences, normalised by sequence length
    for pop in pops:
        mpd = allel.mean_pairwise_difference(
            allel.HaplotypeArray(
                haplotypes[:, indices == populations[pop]]).count_alleles())
        writer.write(f'{mpd.sum() / length:.5}\t')

    # pairwise Hudson Fst as a ratio of sums over variants
    for pair in (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS')):
        count1 = allel.HaplotypeArray(
            haplotypes[:, indices == populations[pair[0]]]).count_alleles()
        count2 = allel.HaplotypeArray(
            haplotypes[:, indices == populations[pair[1]]]).count_alleles()
        num, den = allel.hudson_fst(count1, count2)
        writer.write(f'{num.sum() / den.sum():.5}\t')
    writer.write('\n')
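# Hedged sketch (toy data, not from the simulation above) of the Hudson Fst
# pattern used in the pairwise loop: hudson_fst returns per-variant numerator
# and denominator arrays, and the genome-wide estimate is taken as the ratio
# of sums rather than the mean of per-variant ratios.
import allel
import numpy as np

rng = np.random.default_rng(0)
h = allel.HaplotypeArray(rng.integers(0, 2, size=(100, 40)))
ac1 = h.count_alleles(subpop=list(range(0, 20)))
ac2 = h.count_alleles(subpop=list(range(20, 40)))
num, den = allel.hudson_fst(ac1, ac2)
fst = np.sum(num) / np.sum(den)
print(f'{fst:.5}')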
def removeMissingStats(region, region_length):
    """
    Calculates haplotype frequencies, haplotypic diversity, and nucleotide
    diversity for a given region of a VCF. Subjects with missing data for
    one or more variants are removed before calculation.

    Args:
        region: a HaplotypeArray covering coordinates of interest.
        region_length: length of the region in base pairs, used to
            normalise nucleotide diversity.

    Returns:
        List containing:
        - Number of haplotypes with no missing variant data in this region;
          only these haplotypes are used in further calculations.
        - Haplotypic diversity for the region (0 if only one haplotype remains).
        - Nucleotide diversity (pi) per base pair.
        - List of haplotype frequencies.
    """
    # drop any haplotype (column) containing a missing call (-1)
    keep_subject = np.ones(region.shape[1], dtype=bool)
    for i in range(region.shape[1]):
        if -1 in region[:, i]:
            keep_subject[i] = False
    region_complete = region.compress(condition=keep_subject, axis=1)

    # calculate haplotype frequencies
    freqs = region_complete.distinct_frequencies()
    nh = region_complete.n_haplotypes
    if nh == 1:
        hap_div = 0
    else:
        hap_div = allel.haplotype_diversity(region_complete)

    # calculate nucleotide diversity on the non-missing haplotypes only
    ac = region_complete.count_alleles()
    diffs = allel.mean_pairwise_difference(ac, fill=0)
    pi = np.sum(diffs) / region_length
    return [nh, hap_div, pi, freqs]
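# Hedged illustration (toy data) of the three statistics computed above on a
# small complete (no-missing) haplotype array.
import allel
import numpy as np

h = allel.HaplotypeArray([[0, 0, 1, 1],
                          [0, 1, 1, 1],
                          [0, 0, 0, 1]])
print(h.distinct_frequencies())      # frequency of each distinct haplotype (column)
print(allel.haplotype_diversity(h))  # unbiased estimate of the chance two sampled haplotypes differ
ac = h.count_alleles()
print(np.sum(allel.mean_pairwise_difference(ac, fill=0)) / 3)  # pi per bp over a 3 bp region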
def main(args):
    ## Step 0: get null model for SNP calling
    null_loc = os.path.dirname(__file__) + '/helper_files/combined_null1000000.txt'
    null_model = generate_snp_model(null_loc)
    P2C = {'A': 0, 'C': 1, 'T': 2, 'G': 3}
    C2P = {0: 'A', 1: 'C', 2: 'T', 3: 'G'}

    ## Step 1: build new counts table from both objects
    s_final = SNPprofile()
    s_final.filename = args.output

    s1 = SNPprofile()
    print("loading " + args.input[0])
    s1.load(args.input[0])
    s_final.scaffold_list = s1.scaffold_list
    s_final.counts_table = copy.deepcopy(s1.counts_table)

    s2 = SNPprofile()
    print("loading " + args.input[1])
    s2.load(args.input[1])
    for scaf in s2.scaffold_list:
        if scaf not in s_final.scaffold_list:
            sys.exit("Error: scaffold " + scaf + " in " + args.input[1] +
                     " not found in initial file. Your inStrain objects were "
                     "probably not run on the same FASTA.")
    for scaf_counter, scaf in enumerate(s2.counts_table):
        s_final.counts_table[scaf_counter] += scaf

    ## Step 2: call all SNPs for the new object
    allele_counts_total = {}
    allele_counts1 = {}
    allele_counts2 = {}
    snp_table = defaultdict(list)
    for scaf_counter, scaf in enumerate(tqdm(s_final.counts_table,
                                             desc='Calling new SNVs...')):
        for pos_counter, counts in enumerate(scaf):  # 0-based positions!
            snp = call_snv_site(counts, min_cov=5, min_freq=0.05,
                                model=null_model)
            if snp:  # means that there was coverage at this position
                if snp != -1:  # means this is a SNP
                    # calculate varBase
                    snp, varbase = major_minor_allele(counts)
                    key = (s_final.scaffold_list[scaf_counter] + ":" +
                           str(pos_counter))
                    snp_table['scaffold'].append(
                        s_final.scaffold_list[scaf_counter])
                    snp_table['position'].append(pos_counter)
                    snp_table['varBase'].append(snp)
                    snp_table['conBase'].append(varbase)
                    allele_counts_total[key] = \
                        s_final.counts_table[scaf_counter][pos_counter]
                    allele_counts1[key] = s1.counts_table[scaf_counter][pos_counter]
                    allele_counts2[key] = s2.counts_table[scaf_counter][pos_counter]

    ## Step 3: calculate per-gene Fst and save the new Fst/SNP table to disk
    SNPTable = pd.DataFrame(snp_table)
    FstTable = defaultdict(list)
    for gene in tqdm(create_gene_index(args.gene_file), desc="calculating fst"):
        snps = SNPTable[(SNPTable.scaffold == gene['scaf'])
                        & (SNPTable.position >= gene['start'])
                        & (SNPTable.position <= gene['end'])]
        snp_list = [row['scaffold'] + ":" + str(row['position'])
                    for index, row in snps.iterrows()]

        # only continue if there are at least 3 SNPs in this gene
        if len(snp_list) >= 3:
            allele_counts_1 = [allele_counts1[snp] for snp in snp_list]
            allele_counts_2 = [allele_counts2[snp] for snp in snp_list]
            allel1 = allel.AlleleCountsArray(allele_counts_1)
            allel2 = allel.AlleleCountsArray(allele_counts_2)
            # Hudson Fst in a single window spanning all SNPs in the gene
            fst_h = allel.moving_hudson_fst(allel1, allel2, size=len(snp_list))[0]
            # nucleotide diversity per sample, normalised by gene length
            gene_length = 1 + gene['end'] - gene['start']
            nd_1 = np.sum(allel.mean_pairwise_difference(allel1)) / gene_length
            nd_2 = np.sum(allel.mean_pairwise_difference(allel2)) / gene_length
            FstTable['gene'].append(gene['name'])
            FstTable['snp_num'].append(len(snp_list))
            FstTable['fst'].append(fst_h)
            FstTable['pi_1'].append(nd_1)
            FstTable['pi_2'].append(nd_2)
            FstTable['cov_1'].append(np.mean(np.sum(allele_counts_1, axis=1)))
            FstTable['cov_2'].append(np.mean(np.sum(allele_counts_2, axis=1)))

    FstTable = pd.DataFrame(FstTable)
    print(np.mean(FstTable['fst']))
    FstTable.to_csv(args.output + '.Fst.tsv', index=False, sep='\t')
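# Hedged sketch (toy counts) of the per-gene Fst call above: with
# size=len(snp_list), moving_hudson_fst computes a single window covering all
# SNPs in the gene, which is equivalent to Hudson's ratio-of-sums estimator
# over exactly those variants.
import allel
import numpy as np

ac1 = allel.AlleleCountsArray([[10, 2], [6, 6], [9, 3]])
ac2 = allel.AlleleCountsArray([[2, 10], [5, 7], [1, 11]])
fst = allel.moving_hudson_fst(ac1, ac2, size=len(ac1))[0]
num, den = allel.hudson_fst(ac1, ac2)
np.testing.assert_allclose(fst, np.sum(num) / np.sum(den))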
def print_pi(self, tree_sequence, indices, populations):
    if not self.pi_needed():
        return
    writer = self.writers['pi']
    # fix the population order explicitly instead of relying on dictionary key order
    pops = 'AF EU AS'.split()
    indices = np.array(indices)
    writer.write('\t'.join(pops) + '\t')
    writer.write('AF-EU\tAF-AS\tEU-AS\n')
    length = tree_sequence.get_sequence_length()
    haplotypes = tree_sequence.genotype_matrix()

    # combined diploid genotype array across all three populations
    ga_comb = allel.HaplotypeArray(
        haplotypes[:, indices == populations['AF']]).to_genotypes(
            ploidy=2).concatenate([
                allel.HaplotypeArray(
                    haplotypes[:, indices == populations['EU']]).to_genotypes(
                        ploidy=2),
                allel.HaplotypeArray(
                    haplotypes[:, indices == populations['AS']]).to_genotypes(
                        ploidy=2)
            ], 1)
    keep_alleles = ga_comb.count_alleles().is_biallelic_01(
        min_mac=int(0.05 * ga_comb.n_samples))

    # Calculate pi
    for pop in pops:
        ## Create a diploid genotype array from tree_sequence haplotype data
        ## for this population and count alleles
        counts = allel.HaplotypeArray(
            haplotypes[:, indices == populations[pop]]).to_genotypes(
                ploidy=2).count_alleles()
        ## keep variants with maf > 5% and < 95%
        maf = counts.values[:, 1] / counts.values.sum(axis=1)
        counts = counts[np.logical_and(maf > 0.05, maf < 0.95)]
        ## Calculate mean_pairwise_difference over the retained common variants
        mpd = allel.mean_pairwise_difference(counts)
        writer.write(f'{mpd.sum() / counts.shape[0]:.5}\t')

    # Calculate Fst
    for pair in (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS')):
        num1 = sum(indices == populations[pair[0]]) // 2
        num2 = sum(indices == populations[pair[1]]) // 2
        ## Subpopulation sample indices into the combined genotype array
        subpops = [list(range(0, num1)), list(range(num1, num1 + num2))]
        ga = allel.HaplotypeArray(
            haplotypes[:, np.logical_or(indices == populations[pair[0]],
                                        indices == populations[pair[1]])
                       ]).to_genotypes(ploidy=2)
        counts = ga.count_alleles()
        maf = counts.values[:, 1] / counts.values.sum(axis=1)
        ## Calculate mean Fst based on combined genotype data
        a, b, c = allel.weir_cockerham_fst(
            ga[np.logical_and(maf > 0.05, maf < 0.95)], subpops)
        fst = np.mean(np.sum(a, axis=1) /
                      (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        writer.write(f'{fst:.5}\t')
    writer.write('\n')
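# Hedged sketch (toy genotypes) of the Weir & Cockerham estimator used above:
# weir_cockerham_fst returns three variance components per variant and allele
# (a: among populations, b: among individuals within populations, c: within
# individuals). The loop above averages per-variant ratios; the ratio-of-sums
# form shown here is the overall estimate given in the scikit-allel docs.
import allel
import numpy as np

g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
                         [[0, 1], [0, 1], [0, 1], [0, 1]],
                         [[0, 0], [0, 1], [1, 1], [0, 1]]])
subpops = [[0, 1], [2, 3]]  # sample indices for the two populations
a, b, c = allel.weir_cockerham_fst(g, subpops)
fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
print(f'{fst:.5}')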
def RecombinationRepper(pooled_args):
    # provide r_rate, model_function, reps, samples
    r_rate, model_function, reps, samples = pooled_args
    mean_Fst_dists = []
    var_Fst_dists = []
    mean_SE_dists = []
    mean_SE_dists_shuf = []
    tree_counts = []
    mean_Dxy_dists = []
    mean_Tajima_dists = []
    mean_diversity_dists = []
    mean_H12_dists = []
    for t in range(reps):
        print(t)
        new_tree = migration_simulation_2patch(r_rate)
        # count the marginal trees in this tree sequence
        count = 0
        for r in new_tree.trees():
            count += 1
        tree_counts.append(count)
        new_tree_dist_Fst = []
        new_tree_dist_SE = []
        new_tree_dist_SE_shuf = []
        new_tree_dist_Dxy = []
        new_tree_dist_diversity = []
        new_tree_dist_Tajima = []
        new_tree_dist_H12 = []
        for i in range(samples):  # repeat `samples` times
            # add mutations to the tree, redrawing until at least one variant appears
            muts = 0
            while muts == 0:
                mutated_tree = msprime.mutate(new_tree, 1.25e-7)
                muts = len([v for v in mutated_tree.variants()])

            # get the genotype matrix, ready for scikit-allel
            msprime_genotype_matrix = mutated_tree.genotype_matrix()

            # convert msprime's haplotype matrix into genotypes by merging
            # adjacent chromosomes
            haplotype_array = allel.HaplotypeArray(msprime_genotype_matrix)
            genotype_array = haplotype_array.to_genotypes(ploidy=2)
            shuffled_genotypes = shuffle(genotype_array, random_state=0)

            ac1 = haplotype_array.count_alleles(subpop=list(range(0, 100)))
            ac2 = haplotype_array.count_alleles(subpop=list(range(100, 200)))

            ## Calculate Tajima's D in deme 1
            Tajimas_D = allel.tajima_d(ac1)

            ## Calculate Dxy per bp over the 10 kb region
            dxy = sum(allel.mean_pairwise_difference_between(ac1, ac2)) / 10000.

            ## Calculate Garud's H statistics on the haplotypes of the first
            ## 400 SNPs from deme 1
            hapslice = haplotype_array[:400, 0:100]
            H_vector = allel.garud_h(hapslice)

            ## Calculate diversity (pi) per bp in deme 1
            pi = sum(allel.mean_pairwise_difference(ac1)) / 10000.

            subpopulations = [list(range(0, 50)), list(range(50, 100))]
            mean_fst = allel.average_weir_cockerham_fst(
                genotype_array, blen=100, subpops=subpopulations)
            mean_fst_shuf = allel.average_weir_cockerham_fst(
                shuffled_genotypes, blen=100, subpops=subpopulations)

            new_tree_dist_Fst.append(mean_fst[0])
            new_tree_dist_SE.append(mean_fst[1])
            new_tree_dist_SE_shuf.append(mean_fst_shuf[1])
            new_tree_dist_Tajima.append(Tajimas_D)
            new_tree_dist_Dxy.append(dxy)
            new_tree_dist_H12.append(H_vector[1])
            new_tree_dist_diversity.append(pi)

        mean_Fst_dists.append(np.mean(new_tree_dist_Fst))
        # standard deviation of the per-sample Fst estimates
        var_Fst_dists.append(np.sqrt(np.var(new_tree_dist_Fst)))
        mean_SE_dists.append(np.mean(new_tree_dist_SE))
        mean_SE_dists_shuf.append(np.mean(new_tree_dist_SE_shuf))
        mean_Dxy_dists.append(np.mean(new_tree_dist_Dxy))
        mean_Tajima_dists.append(np.mean(new_tree_dist_Tajima))
        mean_H12_dists.append(np.mean(new_tree_dist_H12))
        mean_diversity_dists.append(np.mean(new_tree_dist_diversity))

    return [r_rate, mean_Fst_dists, mean_SE_dists, mean_SE_dists_shuf,
            var_Fst_dists, tree_counts, mean_Dxy_dists, mean_Tajima_dists,
            mean_diversity_dists, mean_H12_dists]
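# Hedged note (toy haplotypes) on the Garud H statistics used above: garud_h
# returns the tuple (H1, H12, H123, H2/H1), so H_vector[1] in the loop above
# is H12, the statistic sensitive to both hard and soft sweeps.
import allel
import numpy as np

rng = np.random.default_rng(0)
h = allel.HaplotypeArray(rng.integers(0, 2, size=(400, 100)))
h1, h12, h123, h2_h1 = allel.garud_h(h)
print(h12)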