def print_pi(self, tree_sequence, indices, populations):
    """Write per-population diversity and pairwise Hudson Fst to the 'pi' writer.

    Does nothing when ``self.pi_needed()`` is false. Otherwise writes a header
    line followed by one data line: the summed mean pairwise difference divided
    by the sequence length for AF, EU and AS, then ``num.sum() / den.sum()``
    from ``allel.hudson_fst`` for AF-EU, AF-AS and EU-AS.

    Args:
        tree_sequence: object providing ``get_sequence_length()`` and
            ``genotype_matrix()``.
        indices: per-sample population index; converted to a numpy array and
            compared against ``populations[label]`` to select columns.
        populations: mapping from the labels 'AF', 'EU', 'AS' to the
            population index used in ``indices``.
    """
    if not self.pi_needed():
        return

    writer = self.writers['pi']
    # A fixed label order keeps the output columns stable instead of relying
    # on the iteration order of ``populations``.
    pops = 'AF EU AS'.split()
    pop_pairs = (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS'))
    indices = np.array(indices)

    writer.write('\t'.join(pops) + '\t')
    writer.write('AF-EU\tAF-AS\tEU-AS\n')

    length = tree_sequence.get_sequence_length()
    haplotypes = tree_sequence.genotype_matrix()

    # Count alleles once per population; the diversity and the Fst columns
    # both reuse these counts.
    counts = {
        pop: allel.HaplotypeArray(
            haplotypes[:, indices == populations[pop]]).count_alleles()
        for pop in pops
    }

    for pop in pops:
        mpd = allel.mean_pairwise_difference(counts[pop])
        writer.write(f'{mpd.sum()/length:.5}\t')

    for first, second in pop_pairs:
        num, den = allel.hudson_fst(counts[first], counts[second])
        writer.write(f'{num.sum() / den.sum():.5}\t')
    writer.write('\n')
def ts_to_dadi_sfs(ts_path, out_path, out_path_nonvariant, sample_size=20,
                   mask_file=None):
    '''
    Build a two-population joint SFS from a saved tree sequence and write it
    in dadi format.

    Two files are written: ``out_path`` holds the spectrum as counted, and
    ``out_path_nonvariant`` holds the same spectrum with the [0, 0] entry set
    to the (masked) sequence length minus the number of counted sites.

    ts_path: path of the tree sequence file; the text before the first "."
        of its basename is used as the chromosome name when masking.
    sample_size: number of leading haplotype columns assigned to the first
        population; the remaining columns form the second population.
    mask_file: optional headerless tab-separated file whose columns 0, 1, 2
        are chromosome, interval start and interval end. Sites inside an
        interval for this chromosome are dropped and the interval lengths are
        subtracted from the total length.
    '''
    ts = tskit.load(ts_path)
    haps = ts.genotype_matrix()
    total_length = ts.sequence_length

    # One flag per row of the genotype matrix (True = masked). Sizing from the
    # matrix itself keeps the boolean index valid for ``haps``.
    masked = np.zeros(haps.shape[0], dtype=bool)
    if mask_file:
        mask_table = pd.read_csv(mask_file, sep="\t", header=None)
        chrom = ts_path.split("/")[-1].split(".")[0]
        sub = mask_table[mask_table[0] == chrom]
        mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
        snp_locs = [int(x.site.position) for x in ts.variants()]
        # IntervalIndex.contains may answer per interval; np.any collapses
        # that (or a plain bool) to "inside at least one interval".
        in_mask = [bool(np.any(mask_ints.contains(x))) for x in snp_locs]
        masked = np.logical_or(masked, in_mask)
        total_length -= np.sum(mask_ints.length)
    retain = np.logical_not(masked)
    haps_pops_joint = np.array(haps[retain, :])

    # Split the haplotypes into the two populations based on sample_size.
    haps_pop0_joint = haps_pops_joint[:, :sample_size]
    haps_pop1_joint = haps_pops_joint[:, sample_size:]
    allele_counts_pop0_joint = allel.HaplotypeArray(
        haps_pop0_joint).to_genotypes(ploidy=2).count_alleles()
    allele_counts_pop1_joint = allel.HaplotypeArray(
        haps_pop1_joint).to_genotypes(ploidy=2).count_alleles()

    sfs_joint = allel.joint_sfs(allele_counts_pop0_joint[:, 1],
                                allele_counts_pop1_joint[:, 1])
    num_sites = np.sum(sfs_joint)
    sfs_joint = dadi.Spectrum(sfs_joint)
    sfs_joint.to_file(out_path)
    # The [0, 0] entry carries the number of non-variant sites.
    sfs_joint[0, 0] = total_length - num_sites
    sfs_joint.to_file(out_path_nonvariant)
def ts_to_stairway(self, ts_path, num_bootstraps=1, mask_file=None):
    """
    Converts the specified tskit tree sequences to text files used by
    stairway plot.

    For every path in ``ts_path`` the derived allele counts of the retained
    sites are appended to the observed data set (index 0) and to
    ``num_bootstraps`` data sets resampled over sites with replacement. An
    SFS plot comparing unmasked and masked spectra is written next to each
    tree sequence as ``<path>.sfs.pdf``.

    ts_path: iterable of tree sequence file paths; the text before the first
        "." of each basename is used as the chromosome name when masking.
    mask_file: optional headerless tab-separated file whose columns 0, 1, 2
        are chromosome, interval start and interval end.

    Returns the list of written files ``self.workdir / "sfs_<k>.txt"``.
    """
    derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
    total_length = 0
    num_samples = 0

    # Parse the mask once rather than once per tree sequence.
    mask_table = None
    if mask_file:
        mask_table = pd.read_csv(mask_file, sep="\t", header=None)

    for ts_p in ts_path:
        ts = tskit.load(ts_p)
        total_length += ts.sequence_length
        num_samples = ts.num_samples
        haps = ts.genotype_matrix()

        # One flag per row of the genotype matrix (True = masked).
        masked = np.zeros(haps.shape[0], dtype=bool)
        if mask_table is not None:
            chrom = ts_p.split("/")[-1].split(".")[0]
            sub = mask_table[mask_table[0] == chrom]
            mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
            snp_locs = [int(x.site.position) for x in ts.variants()]
            # IntervalIndex.contains may answer per interval; np.any
            # collapses that (or a plain bool) to a single flag per site.
            in_mask = [bool(np.any(mask_ints.contains(x))) for x in snp_locs]
            masked = np.logical_or(masked, in_mask)
            total_length -= np.sum(mask_ints.length)
        retain = np.logical_not(masked)

        # Unmasked SFS first, then the SFS of the retained sites.
        SFSs = [allel.sfs(allel.HaplotypeArray(haps).count_alleles()[:, 1])[1:]]
        allele_counts = allel.HaplotypeArray(haps[retain, :]).count_alleles()
        SFSs.append(allel.sfs(allele_counts[:, 1])[1:])
        sfs_path = ts_p + ".sfs.pdf"
        plots.plot_sfs(SFSs, sfs_path)

        # Observed counts go to slot 0, bootstrap replicates to slots 1..n.
        derived_counts_all[0].extend(allele_counts[:, 1])
        nsites = np.shape(allele_counts)[0]
        for j in range(1, num_bootstraps + 1):
            bootset = np.random.choice(np.arange(0, nsites, 1), nsites,
                                       replace=True)
            bootac = allele_counts[bootset, :]
            derived_counts_all[j].extend(bootac[:, 1])

    # Get the SFS minus the 0 bin and write output.
    stairway_files = []
    for l, counts in enumerate(derived_counts_all):
        sfs = allel.sfs(counts)[1:]
        filename = self.workdir / "sfs_{}.txt".format(l)
        write_stairway_sfs(total_length, num_samples, sfs, filename)
        stairway_files.append(filename)
    return stairway_files
def msp2sf2(tree_sequence, npops):
    """
    Write, for each population index ``i`` in ``range(npops)``, two
    tab-separated files in the current directory:

    * ``<i>.Neutral.sf2in`` with columns position, x (derived allele count),
      n (number of samples in the population) and folded (always 0);
    * ``<i>.Neutral.sf2inrecomb`` with columns position and rate.

    Only sites whose derived allele count in the population is positive are
    written. The rate is 0 for the first site (row 0) and
    ``position / 850000.0`` otherwise.

    NOTE(review): the file names suggest SweepFinder2 input; the meaning of
    the 850000.0 divisor is not visible here - confirm.
    NOTE(review): the recombination rows are written only for sites that also
    go to the frequency file (nested under ``dac > 0``); verify against the
    intended layout.

    Returns None.
    """
    pix = [tree_sequence.get_samples(pop) for pop in range(npops)]

    # Build the haplotype matrix and collect site positions in a single pass.
    muts = tree_sequence.get_num_mutations()
    sample_size = tree_sequence.get_sample_size()
    V = np.zeros((muts, sample_size), dtype=np.int8)
    positions = []
    for variant in tree_sequence.variants():
        V[variant.index] = variant.genotypes
        positions.append(int(variant.position))
    gt = allel.HaplotypeArray(V)
    pos = allel.SortedIndex(positions)

    for i, p in enumerate(pix):
        # Derived allele counts restricted to this population's samples.
        ac = gt[:, p].count_alleles()[:, 1]
        # Both files are managed by the with-statement so neither handle
        # leaks if a write fails.
        with open("{}.Neutral.sf2inrecomb".format(i), 'w') as d, \
                open("{}.Neutral.sf2in".format(i), 'w') as f:
            d.write("position\trate\n")
            f.write("position\tx\tn\tfolded\n")
            for r, dac in enumerate(ac):
                if dac > 0:
                    f.write("{}\t{}\t{}\t0\n".format(pos[r], dac, len(p)))
                    if r != 0:
                        d.write("{}\t{}\n".format(pos[r], pos[r] / 850000.0))
                    else:
                        d.write("{}\t{}\n".format(pos[r], 0))
    return None
def test_masked_windowed_diversity(self):
    """Windows of size 10 with their second half masked must give the same
    diversity as every other window of size 5."""
    # four haplotypes, 6 pairwise comparison
    haplotypes = allel.HaplotypeArray([[0, 0, 0, 0],
                                       [0, 0, 0, 1],
                                       [0, 0, 1, 1],
                                       [0, 1, 1, 1],
                                       [1, 1, 1, 1],
                                       [0, 0, 1, 2],
                                       [0, 1, 1, 2],
                                       [0, 1, -1, -1],
                                       [-1, -1, -1, -1]])
    counts = haplotypes.count_alleles()
    positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])

    # Accessibility mask: five accessible positions followed by five
    # inaccessible ones, repeated three times.
    one_window = np.repeat(np.array([True, False]), 5)
    accessible = np.tile(one_window, 3)

    # Reference: size-5 windows, keeping only every other one.
    reference, _, _, _ = allel.windowed_diversity(
        positions, counts, size=5, start=1, stop=31)
    reference = reference[::2]

    # Under test: size-10 windows whose last half is masked out.
    observed, _, _, _ = allel.windowed_diversity(
        positions, counts, size=10, start=1, stop=31,
        is_accessible=accessible)

    assert_array_almost_equal(reference, observed)
def simulate(out_path, species, model, genetic_map, seed, chrmStr,
             sample_size=20, population=0, ld_thresh=1.0, max_workers=1):
    """Simulate one chromosome, save the tree sequence and plot its SFS.

    The tree sequence is dumped to ``out_path`` and the SFS plot is written
    to ``out_path + ".sfs.pdf"``. When ``ld_thresh`` is below 1.0 the result
    of ``unlinked(ts, ld_thresh, max_workers)`` is pickled to
    ``out_path + ".r2Mask.p"`` and a second SFS, restricted to the sites it
    selects, is added to the plot.

    Args:
        out_path: path (string) of the tree sequence file; also the prefix of
            the other output files.
        species: object exposing ``genome.chromosomes``.
        model: demographic model; ``model.asdict()`` is forwarded to
            ``msp.simulate``.
        genetic_map: object whose ``name`` selects the recombination map.
        seed: random seed for the simulation.
        chrmStr: key of the chromosome in ``species.genome.chromosomes``.
        sample_size: number of samples drawn at time 0.
        population: population index the samples are drawn from.
        ld_thresh: threshold passed to ``unlinked``; values >= 1.0 skip it.
        max_workers: forwarded to ``unlinked``.
    """
    mask_path = out_path + ".r2Mask.p"
    sfs_path = out_path + ".sfs.pdf"
    chrom = species.genome.chromosomes[chrmStr]
    samples = [msp.Sample(population=population, time=0)] * sample_size

    print("Simulating...")
    ts = msp.simulate(
        samples=samples,
        recombination_map=chrom.recombination_map(genetic_map.name),
        mutation_rate=chrom.default_mutation_rate,
        random_seed=seed,
        **model.asdict())
    ts.dump(out_path)

    haps = allel.HaplotypeArray(ts.genotype_matrix())
    SFSs = [allel.sfs(haps.count_alleles()[:, 1])[1:]]
    print("Simulation finished!")

    if ld_thresh < 1.0:
        ul = unlinked(ts, ld_thresh, max_workers)
        # Close the file deterministically so the pickle is fully flushed
        # before anything else tries to read it.
        with open(mask_path, "wb") as mask_file:
            pickle.dump(ul, mask_file)
        SFSs.append(allel.sfs(haps[ul, :].count_alleles()[:, 1])[1:])
    plot_sfs(SFSs, sfs_path)
def extract_haplotype_array(haparray, haplotype_pos, core_left, core_right,
                            flank, n_samples=1142):
    """Return the haplotypes flanking a core region on both sides.

    Args:
        haparray: 3-D array indexed as [variant, sample, allele]; the last
            axis is flattened assuming two alleles per sample.
        haplotype_pos: sorted position index providing ``locate_range``.
        core_left: left boundary of the core region.
        core_right: right boundary of the core region.
        flank: width of the window taken on each side of the core.
        n_samples: number of leading samples to keep (default 1142, the
            value previously hard-coded).

    Returns:
        ``(pos_left, haps_left, pos_right, haps_right)`` where the ``pos_*``
        entries are the positions inside each window and the ``haps_*``
        entries are ``allel.HaplotypeArray`` objects with ``2 * n_samples``
        columns.
    """
    def _window(start, stop):
        # Positions and flattened haplotypes for one flanking window.
        loc = haplotype_pos.locate_range(start, stop)
        haps_3d = haparray[loc, :n_samples, :]
        haps = allel.HaplotypeArray(
            haps_3d.reshape(haps_3d.shape[0], n_samples * 2))
        return haplotype_pos[loc], haps

    # Right of the core first, then left, as in the returned tuple's pairs.
    pos_right, haps_right = _window(core_right, core_right + flank)
    pos_left, haps_left = _window(core_left - flank, core_left)
    return (pos_left, haps_left, pos_right, haps_right)
def __init__(self, input_path: str, genotypes: allel.GenotypeArray,
             variants: allel.VariantTable, chrom: str, sample_list: list,
             parent_sample: str):
    """Store the constructor arguments and allocate placeholder haplotypes.

    The zygosity starts as ``Zygosity.UNDEFINED``. Both haplotype attributes
    are created from ``np.empty`` arrays of shape (n_variants, n_samples) and
    dtype 'i1', so their contents are uninitialised until they are replaced.
    """
    # Inputs, stored as given.
    self.__input_path: str = input_path
    self.__genotypes: allel.GenotypeArray = genotypes
    self.__variants: allel.VariantTable = variants
    self.__chrom: str = chrom
    self.__sample_list: list = sample_list
    self.__parent_sample: str = parent_sample

    # Derived state.
    self.__zygosity: Zygosity = Zygosity.UNDEFINED

    # Placeholder haplotype arrays sized after the genotype array.
    placeholder_shape = (self.__genotypes.n_variants,
                         self.__genotypes.n_samples)
    self.__parent_haplotypes: allel.HaplotypeArray = allel.HaplotypeArray(
        np.empty(placeholder_shape, dtype='i1'))
    self.__parent_n_progeny_haplotypes: allel.HaplotypeArray = allel.HaplotypeArray(
        np.empty(placeholder_shape, dtype='i1'))
def ts_to_stairway(self, ts_path, num_bootstraps=1):
    """
    Converts the specified tskit tree sequences to text files used by
    stairway plot.

    For every path in ``ts_path`` the derived allele counts are appended to
    the observed data set (index 0) and to ``num_bootstraps`` data sets
    resampled over sites with replacement. If a pickled site selection named
    ``<path>.unlinkedMask.p`` exists, only the rows it selects are counted.

    Returns the list of written files ``self.workdir / "sfs_<k>.txt"``.
    """
    derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
    total_length = 0
    num_samples = 0

    for ts_p in ts_path:
        ts = tskit.load(ts_p)
        total_length += ts.sequence_length
        num_samples = ts.num_samples
        haps = ts.genotype_matrix()

        # Mask high-ld sites when a mask was stored next to the tree sequence.
        mask_path = ts_p + ".unlinkedMask.p"
        if os.path.exists(mask_path):
            # NOTE: pickle.load executes arbitrary code from the file; only
            # use mask files produced by a trusted run of this pipeline.
            with open(mask_path, "rb") as mask_file:
                ul = pickle.load(mask_file)
            allele_counts = allel.HaplotypeArray(haps[ul, :]).count_alleles()
        else:
            allele_counts = allel.HaplotypeArray(haps).count_alleles()

        # Observed counts go to slot 0, bootstrap replicates to slots 1..n.
        derived_counts_all[0].extend(allele_counts[:, 1])
        nsites = np.shape(allele_counts)[0]
        for j in range(1, num_bootstraps + 1):
            bootset = np.random.choice(np.arange(0, nsites, 1), nsites,
                                       replace=True)
            bootac = allele_counts[bootset, :]
            derived_counts_all[j].extend(bootac[:, 1])

    # Get the SFS minus the 0 bin and write output.
    stairway_files = []
    for l, counts in enumerate(derived_counts_all):
        sfs = allel.sfs(counts)[1:]
        filename = self.workdir / "sfs_{}.txt".format(l)
        write_stairway_sfs(total_length, num_samples, sfs, filename)
        stairway_files.append(filename)
    return stairway_files
def jsfs(self, fold=False):
    """Collect ``afs.jsfs_stats`` results for every pair of populations.

    Populations are the entries of ``self.stats["pop_config"]``; each pair's
    column indices are concatenated and the first population's size is
    passed along so the callee can split them again.
    """
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)

    collected = []
    for first, second in combinations(self.stats["pop_config"], 2):
        pair_haps = haplotypes.take(first + second, axis=1)
        collected += afs.jsfs_stats(len(first), pair_haps, positions, fold)
    return collected
def calc_heterozygous_haplotypes(
        self) -> (allel.HaplotypeArray, allel.HaplotypeArray):
    """
    Build the parent and parent-plus-progeny haplotypes, using BOTH alleles
    of every progeny sample (the genotypes are assumed heterozygous).

    Results are stored on the instance rather than returned:
    parent haplotypes in self.__parent_haplotypes, parent plus progeny
    haplotypes in self.__parent_n_progeny_haplotypes. Finally
    self.__set_heterozygous() is called.

    NOTE(review): the signature advertises a tuple, but the body has no
    return statement.
    """
    genotypes = self.genotypes

    # The parent is the sample at index 0.
    parent_haps = genotypes[:, 0].to_haplotypes()

    # First and second allele of every sample, as separate haplotype arrays.
    first_alleles = allel.HaplotypeArray(genotypes[:, :, 0])
    second_alleles = allel.HaplotypeArray(genotypes[:, :, 1])

    # Start from a copy of the parent and append the progeny column by column.
    combined = allel.HaplotypeArray(parent_haps, copy=True)

    every_variant = np.repeat(True, genotypes.n_variants)
    one_sample = np.repeat(False, genotypes.n_samples)

    # Index 0 (the parent) is already in ``combined``.
    for progeny in range(1, genotypes.n_samples):
        # Select exactly this sample, take both alleles, then reset the mask.
        one_sample[progeny] = True
        first_col = first_alleles.subset(every_variant, one_sample)[:]
        second_col = second_alleles.subset(every_variant, one_sample)[:]
        one_sample[progeny] = False

        combined = combined.concatenate(first_col, axis=1)
        combined = combined.concatenate(second_col, axis=1)

    self.__parent_haplotypes = parent_haps
    self.__parent_n_progeny_haplotypes = combined
    self.__set_heterozygous()
def sfs(self, fold=False):
    """Collect ``afs.asfs_stats`` results for every population.

    NOTE: the ``fold`` argument is ignored; the value used is always
    ``self.stats["sfs_fold"]``.
    """
    fold = self.stats["sfs_fold"]
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)

    collected = []
    for members in self.stats["pop_config"]:
        pop_haps = haplotypes.take(members, axis=1)
        collected += afs.asfs_stats(pop_haps, positions, fold)
    return collected
def Fst_IBD(trees):
    """Compute Weir-Cockerham Fst overall and for subpopulation pairs.

    The tree sequence at ``trees`` is loaded, mutations are added at rate
    1e-8, and subpopulations are taken as consecutive blocks of 100 genotype
    columns. The first output row (distance "-1") is the Fst among 10
    randomly chosen subpopulations; the remaining rows pair a random
    subpopulation ``row`` with ``row + 14 * i`` for i = 1..13.

    Returns a list of ``[replicate, distance, fst]`` string triples, where
    the replicate label is the part of the file name before the first "_".
    """
    ts = pyslim.load(trees)
    mutated_tree = msprime.mutate(ts, 1e-8)

    # Haplotype matrix -> diploid genotypes for scikit-allel.
    hap_matrix = mutated_tree.genotype_matrix()
    genotype_array = allel.HaplotypeArray(hap_matrix).to_genotypes(ploidy=2)
    print(genotype_array.shape)

    ## Calculate Diversity
    pi = mutated_tree.diversity(windows=[
        0, 1e6, 2e6, 3e6, 4e6, 5e6, 6e6, 7e6, 8e6, 9e6, 10e6, 10e6 + 1
    ])

    ## Calculate Tajima's D
    TD = allel.tajima_d(genotype_array.count_alleles())
    print(TD)

    # Pairs of subpopulation indices: a random start and every 14th after it.
    row = np.random.choice(13)
    pairs = [[row, row + (14 * step)] for step in range(14)]

    # Consecutive blocks of 100 column indices, one block per subpopulation.
    subpopulations = [list(range(start, start + 100))
                      for start in range(0, genotype_array.shape[1], 100)]
    subpop_table = np.array(subpopulations)

    chosen = np.random.choice(len(subpopulations), 10, replace=False)
    overall_fst = allel.average_weir_cockerham_fst(
        genotype_array, blen=10000, subpops=subpop_table[chosen])

    rep = trees.split("/")[-1].split("_")[0]
    output = [[str(rep), str(int(-1)), str(overall_fst[0])]]

    for pair in pairs:
        print(pair)
        dist = (pair[1] - pair[0]) / 14
        if dist == 0:
            # A subpopulation paired with itself carries no information.
            continue
        pair_fst = allel.average_weir_cockerham_fst(
            genotype_array, blen=1000, subpops=subpop_table[pair])
        output.append([str(rep), str(int(dist)), str(pair_fst[0])])
    return (output)
def ac_from_ts(ts, n_pops, N):
    '''
    Return ``(acs, pos)`` for a tree sequence: ``acs`` is a list with one
    allele-count array per subpopulation and ``pos`` is the array of site
    positions. Subpopulation ``i`` consists of the diploid sample indices
    ``i * N`` up to (not including) ``(i + 1) * N``.
    '''
    haplotypes = allel.HaplotypeArray(ts.genotype_matrix())
    diploids = haplotypes.to_genotypes(ploidy=2)

    acs = [
        diploids.count_alleles(
            subpop=list(np.arange(pop * N, (pop + 1) * N)))
        for pop in range(n_pops)
    ]
    pos = np.array([site.position for site in ts.sites()])
    return (acs, pos)
def tajd(self):
    """Return the two ``popstats.tajimaD`` values for every population,
    flattened into one list in population order."""
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    window = self.stats["win_size1"]
    seq_length = self.stats["length_bp"]

    collected = []
    for members in self.stats["pop_config"]:
        pop_haps = haplotypes.take(members, axis=1)
        value, value_std = popstats.tajimaD(positions, pop_haps, window,
                                            seq_length)
        collected.append(value)
        collected.append(value_std)
    return collected
def genotypes(tree_seq):
    """Returns sampled genotypes in scikit allel genotypes format.

    The nodes come from ``get_sampled_nodes(tree_seq)``, flattened into one
    array; haplotype columns follow that order and are then grouped into
    diploid calls.
    """
    sampled = np.concatenate(get_sampled_nodes(tree_seq)).flatten()

    calls = np.empty((tree_seq.num_mutations, len(sampled)), dtype=np.int8)
    # Column order of each variant corresponds to ``sampled``.
    for row, variant in enumerate(tree_seq.variants(samples=sampled)):
        calls[row, :] = variant.genotypes

    return allel.HaplotypeArray(calls).to_genotypes(ploidy=2)
def msprime_to_dadi_simulation(path, seed, org, chrom, sample_size=20):
    '''
    Simulate two populations with msprime and save their joint SFS in dadi
    format at ``path``.

    org: "<species>_<model>" string; everything before the last "_" names an
        attribute of ``stdpopsim`` and the last part names the model class
        found on that attribute.
    chrom: key of the chromosome in the species' genome.
    sample_size: number of samples drawn from each of populations 0 and 1.
    '''
    *species_parts, model_name = org.split('_')
    species = getattr(stdpopsim, '_'.join(species_parts))
    chrom = species.genome.chromosomes[chrom]
    model = getattr(species, model_name)()

    pop0_samples = [msprime.Sample(population=0, time=0)] * sample_size
    pop1_samples = [msprime.Sample(population=1, time=0)] * sample_size
    ts_pops_joint = msprime.simulate(
        samples=pop0_samples + pop1_samples,
        recombination_map=chrom.recombination_map(),
        mutation_rate=chrom.default_mutation_rate,
        random_seed=seed,
        **model.asdict())

    haps_pops_joint = np.array(ts_pops_joint.genotype_matrix())

    # The first ``sample_size`` columns belong to population 0, the rest to
    # population 1.
    per_pop_counts = []
    for pop_haps in (haps_pops_joint[:, :sample_size],
                     haps_pops_joint[:, sample_size:]):
        pop_genotypes = allel.HaplotypeArray(pop_haps).to_genotypes(ploidy=2)
        per_pop_counts.append(pop_genotypes.count_alleles())

    sfs_joint = allel.joint_sfs(per_pop_counts[0][:, 1],
                                per_pop_counts[1][:, 1])
    dadi.Spectrum(sfs_joint).to_file(path)
def delta_tajD(self):
    """Collect ``pwpopstats.d_tajD`` results for every pair of populations,
    flattened into one list in pair order."""
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    window = self.stats["win_size1"]
    seq_length = self.stats["length_bp"]
    quantiles = self.stats["pw_quants"]

    collected = []
    for first, second in combinations(self.stats["pop_config"], 2):
        pair_haps = haplotypes.take(first + second, axis=1)
        collected += pwpopstats.d_tajD(len(first), positions, pair_haps,
                                       window, seq_length, quantiles)
    return collected
def ddRank12(self):
    """Collect ``pwpopstats.ddRank1_2`` results for every pair of
    populations, flattened into one list in pair order."""
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    quantiles = self.stats["pw_quants"]
    window = self.stats["win_size2"]
    seq_length = self.stats["length_bp"]

    collected = []
    for first, second in combinations(self.stats["pop_config"], 2):
        pair_haps = haplotypes.take(first + second, axis=1)
        # The callee's result is a list such as [dd1, dd2].
        collected += pwpopstats.ddRank1_2(len(first), positions, pair_haps,
                                          window, seq_length, quantiles)
    return collected
def FST(self):
    """Collect ``pwpopstats.fst`` results for every pair of populations.

    When a pair's result cannot be iterated (``extend`` raises TypeError),
    one NaN per entry of ``self.stats["pw_quants"]`` is recorded instead.
    """
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    quantiles = self.stats["pw_quants"]

    collected = []
    for first, second in combinations(self.stats["pop_config"], 2):
        pair_haps = haplotypes.take(first + second, axis=1)
        values = pwpopstats.fst(len(first), positions, pair_haps, quantiles)
        try:
            collected.extend(values)
        except TypeError:
            # Keep the output length stable with NaN placeholders.
            collected.extend([np.nan] * len(quantiles))
    return collected
def dmin(self):
    """Summarise ``pwpopstats.dmin`` for every pair of populations.

    If the first entry of ``self.stats["pw_quants"]`` is negative the
    NaN-ignoring mean is recorded; otherwise the NaN-ignoring quantiles at
    ``pw_quants`` are recorded.
    """
    haplotypes = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    quantiles = self.stats["pw_quants"]
    window = self.stats["win_size2"]
    seq_length = self.stats["length_bp"]

    collected = []
    for first, second in combinations(self.stats["pop_config"], 2):
        pair_haps = haplotypes.take(first + second, axis=1)
        values = pwpopstats.dmin(len(first), positions, pair_haps, window,
                                 seq_length)
        summary = ([np.nanmean(values)] if quantiles[0] < 0
                   else np.nanquantile(values, quantiles))
        collected.extend(summary)
    return collected
def pop_sample_ac(geno_mat, n_pops=8, pop_size=10):
    """Stack per-population allele counts into one array.

    Population ``k`` consists of haplotype columns ``k * pop_size`` up to
    (not including) ``(k + 1) * pop_size``.

    Args:
        geno_mat: haplotype matrix accepted by ``allel.HaplotypeArray``.
        n_pops: number of populations (default 8, the value previously
            hard-coded).
        pop_size: haplotype columns per population (default 10, the value
            previously hard-coded).

    Returns:
        The per-population allele-count arrays stacked along a new first
        axis (one frame per population, covering all SNPs).
    """
    haplo_arr = allel.HaplotypeArray(geno_mat)
    per_pop_counts = [
        haplo_arr[:, start:start + pop_size].count_alleles()
        for start in range(0, n_pops * pop_size, pop_size)
    ]
    # Frames = population allele counts for all SNPs.
    return np.stack(per_pop_counts, axis=0)
def geno2genediv(args):
    """Write per-region gene diversity statistics to ``args.outfile``.

    For every region from the genotype parser the encoded haplotypes are
    treated as alleles of a single site. ``He`` is one minus the sum of
    squared allele frequencies over all samples; the same quantity is
    computed per group, and ``dHe`` is ``He`` minus the group-size-weighted
    mean of the per-group values. ``FST`` is ``dHe / He``.

    Output columns: CHROM, POS, REGION, N_SNP, N_HAPLO, FST, dHe, He, MEAN,
    MEDIAN, MAX, MIN, followed by one column per group (sorted by key).
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    # Loop-invariant: total number of grouped samples.
    n_grouped = sum(len(x) for x in groups.values())

    # The with-statement guarantees the output is flushed and closed even if
    # a region fails part-way through.
    with open(args.outfile, 'wt') as outfile:
        outfile.write(
            'CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tFST\tdHe\tHe\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n'
            % '\t'.join(group_keys))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set(region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            haploarray = allel.HaplotypeArray([enc_haplos])

            cerr('I: calculating %d - %s' % (idx, region.name))

            # calculate total He first
            He = 1 - np.sum(haploarray.count_alleles().to_frequencies()**2)

            # calculate He per population, He_p
            values = []
            pHe = 0
            for g in group_keys:
                he_p = 1 - np.sum(
                    haploarray.count_alleles(
                        subpop=groups[g]).to_frequencies()**2)
                pHe += he_p * len(groups[g])
                values.append(he_p)
            dHe = He - pHe / n_grouped
            FST = dHe / He

            params = (FST, dHe, He, np.mean(values), np.median(values),
                      np.max(values), np.min(values))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' % (
                region.P[0][0], region.P[0][1], region.name,
                len(region.P), len(haplotypes),
                '\t'.join('%5.4f' % x for x in params),
                '\t'.join('%5.4f' % x for x in values)))
def geno2genediv(args):
    """Write per-region, per-group Hudson Fst components to ``args.outfile``.

    For every region from the genotype parser the encoded haplotypes are
    treated as alleles of a single site, and ``allel.stats.hudson_fst`` is
    evaluated between each group and all samples outside that group.

    Output columns: CHROM, POS, REGION, N_SNP, N_HAPLO, MEAN, MEDIAN, MAX,
    MIN, followed by one column per group (sorted by key).

    NOTE(review): the value recorded per group is the denominator returned
    by ``hudson_fst``, not ``num / den`` - confirm this is intended.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    # The with-statement guarantees the output is flushed and closed even if
    # a region fails part-way through.
    with open(args.outfile, 'wt') as outfile:
        outfile.write(
            'CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n'
            % '\t'.join(group_keys))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set(region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            # Encoded haplotypes are expected to be 0..k-1 for k distinct ones.
            assert len(haplotypes) == max(enc_haplos) + 1
            haploarray = allel.HaplotypeArray([enc_haplos])

            cerr('I: calculating %d - %s' % (idx, region.name))

            value = []
            for g in group_keys:
                ac_g = haploarray.count_alleles(subpop=groups[g])
                # Everything that is not in group g.
                ac_ng = haploarray.count_alleles(
                    subpop=list(lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                value.append(den)

            params = (np.mean(value), np.median(value), np.max(value),
                      np.min(value))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' % (
                region.P[0][0], region.P[0][1], region.name,
                len(region.P), len(haplotypes),
                '\t'.join('%5.4f' % x for x in params),
                '\t'.join('%5.4f' % x for x in value)))
def load_genotypes():
    """Load genotypes and sample names from the source named in ``args``.

    Exactly one source is used, checked in this order:

    * ``args.zarr``: a zarr group with 'calldata/GT' and 'samples';
    * ``args.vcf``: a VCF file read with ``allel.read_vcf``;
    * ``args.matrix``: a tab-separated table with a 'sampleID' column and one
      column per site holding the count (0, 1 or 2) of the non-reference
      allele for each sample.

    Returns:
        ``(genotypes, samples)``: an ``allel.GenotypeArray`` and the array
        of sample names.

    Raises:
        ValueError: if the matrix holds a value other than 0, 1 or 2, or if
            none of the three sources is set.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    elif args.matrix is not None:
        gmat = pd.read_csv(args.matrix, sep="\t")
        samples = np.array(gmat['sampleID'])
        gmat = gmat.drop(labels="sampleID", axis=1)
        gmat = np.array(gmat, dtype="int8")
        if ((gmat < 0) | (gmat > 2)).any():
            # Other values cannot be split into two haplotypes and would
            # misalign the rows.
            raise ValueError(
                "genotype matrix entries must be 0, 1 or 2")
        # Split every count into two haplotypes: a count of 1 becomes (1, 0)
        # and a count of 2 becomes (1, 1). Rows alternate between the first
        # and second haplotype of each sample.
        hmat = np.empty((2 * gmat.shape[0], gmat.shape[1]), dtype=int)
        hmat[0::2] = gmat >= 1
        hmat[1::2] = gmat == 2
        genotypes = allel.HaplotypeArray(
            np.transpose(hmat)).to_genotypes(ploidy=2)
    else:
        raise ValueError(
            "one of args.zarr, args.vcf or args.matrix must be set")
    return genotypes, samples
def ts_to_stairway(self, ts_path, num_bootstraps=1):
    """
    Converts the specified tskit tree sequences to text files used by
    stairway plot.

    The derived allele counts of every tree sequence in ``ts_path`` are
    pooled into the observed data set (index 0) and into ``num_bootstraps``
    data sets resampled over sites with replacement. One file
    ``self.workdir / "sfs_<k>.txt"`` is written per data set and the list of
    those paths is returned.
    """
    derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
    total_length = 0
    num_samples = 0

    for ts_p in ts_path:
        ts = tskit.load(ts_p)
        total_length += ts.sequence_length
        num_samples = ts.num_samples

        # Count alleles on diploid genotypes built from the haplotypes.
        diploids = allel.HaplotypeArray(
            ts.genotype_matrix()).to_genotypes(ploidy=2)
        allele_counts = diploids.count_alleles()
        derived_counts_all[0].extend(allele_counts[:, 1])

        # Bootstrap over sites: one resample per replicate slot.
        nsites = np.shape(allele_counts)[0]
        for replicate in derived_counts_all[1:]:
            bootset = np.random.choice(np.arange(0, nsites, 1), nsites,
                                       replace=True)
            replicate.extend(allele_counts[bootset, :][:, 1])

    # SFS minus the 0 bin, one output file per data set.
    stairway_files = []
    for l, counts in enumerate(derived_counts_all):
        sfs = allel.sfs(counts)[1:]
        filename = self.workdir / "sfs_{}.txt".format(l)
        write_stairway_sfs(total_length, num_samples, sfs, filename)
        stairway_files.append(filename)
    return stairway_files
def calc_homozygous_haplotypes(self):
    """
    Build the parent and parent-plus-progeny haplotypes, using ONLY the
    first allele of every progeny sample (the genotypes are assumed
    homozygous).

    Results are stored on the instance: parent haplotypes in
    self.__parent_haplotypes, parent plus progeny haplotypes in
    self.__parent_n_progeny_haplotypes. Finally self.__set_homozygous() is
    called.
    """
    # The parent is the sample at index 0.
    parent_haps = self.genotypes[:, 0].to_haplotypes()

    # First ("left") allele of all remaining samples, treated as progeny.
    progeny_haps = allel.HaplotypeArray(self.genotypes[:, 1:, 0])

    # Parent columns first, progeny columns after them.
    combined = parent_haps.concatenate(progeny_haps, axis=1)

    self.__parent_haplotypes = parent_haps
    self.__parent_n_progeny_haplotypes = combined
    self.__set_homozygous()
def extract_haplotype_array(haparray, genarray, haplotype_pos, genotype_pos,
                            male_indices, core_left, core_right, flank,
                            n_samples=1058):
    """Return combined haplotypes flanking a core region on both sides.

    For each side, the haplotypes of the first ``n_samples`` samples of
    ``haparray`` are joined with the first allele of the ``male_indices``
    samples of ``genarray``, restricted to positions present in the
    haplotype table. Loci with any missing value (-1) are then dropped.

    Args:
        haparray: 3-D array indexed as [variant, sample, allele]; the last
            axis is flattened assuming two alleles per sample.
        genarray: 3-D genotype array indexed as [variant, sample, allele].
        haplotype_pos: sorted position index for ``haparray``.
        genotype_pos: sorted position index for ``genarray``.
        male_indices: sample indices taken from ``genarray``.
        core_left: left boundary of the core region.
        core_right: right boundary of the core region.
        flank: width of the window taken on each side of the core.
        n_samples: number of leading samples taken from ``haparray``
            (default 1058, the value previously hard-coded).

    Returns:
        ``(pos_left, genhaps_left, pos_right, genhaps_right)`` with missing
        loci removed from both the positions and the haplotypes.
    """
    def _window(start, stop):
        # Haplotypes of the window, flattened to one column per chromosome.
        haps_loc = haplotype_pos.locate_range(start, stop)
        haps_3d = haparray[haps_loc, :n_samples, :]
        haps = allel.HaplotypeArray(
            haps_3d.reshape(haps_3d.shape[0], n_samples * 2))
        pos = haplotype_pos[haps_loc]

        # Some genotype loci are absent from the haplotype table. Slicing the
        # range first and then filtering by position is much quicker than
        # filtering the whole table directly.
        gen_loc = genotype_pos.locate_range(start, stop)
        gen_all_3d = genarray[gen_loc, :, :]
        gen_pos = genotype_pos[gen_loc]
        gen_loc_inhap = gen_pos.locate_keys(pos)
        gen_inhap_3d = gen_all_3d[gen_loc_inhap, :, :]
        gen = allel.HaplotypeArray(gen_inhap_3d[:, male_indices, 0])

        # Combine the haplotypes and genotypes.
        combined = allel.HaplotypeArray(np.concatenate((haps, gen), 1))

        # Exclude loci with missing values (these come from the genotype
        # table). The row-wise reduction also copes with an empty window.
        complete = ~np.any(np.asarray(combined) == -1, axis=1)
        return pos[complete], combined[complete, :]

    pos_right, genhaps_right = _window(core_right, core_right + flank)
    pos_left, genhaps_left = _window(core_left - flank, core_left)
    return (pos_left, genhaps_left, pos_right, genhaps_right)
def load_genotypes():
    """Load haplotypes and per-haplotype labels from ``args``.

    Genotypes are read from ``args.zarr`` when it is set, otherwise from
    ``args.vcf``. The genotype array is split into its two alleles and laid
    out as all first alleles in sample order followed by all second alleles
    in sample order. Labels are the sample names suffixed with "_h0" and
    "_h1" in the same order.

    Returns:
        ``(haps, samples)``: an ``allel.HaplotypeArray`` and the array of
        haplotype labels.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    else:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        gt = vcf['calldata/GT']
        genotypes = allel.GenotypeArray(gt)
        samples = vcf['samples']

    # Shared by both sources so the zarr path also yields haplotypes.
    # Note the order: all hap0 in order of samples, then all hap1 in order
    # of samples.
    hap0 = genotypes[:, :, 0]
    hap1 = genotypes[:, :, 1]
    haps = allel.HaplotypeArray(np.concatenate((hap0, hap1), axis=1))

    s0 = [x + "_h0" for x in samples]
    s1 = [x + "_h1" for x in samples]
    samples = np.concatenate((s0, s1), axis=0)
    return haps, samples
def summary_stats(tree_sequence_file):
    """Return ``(pi, fst)`` for a saved tree sequence.

    Mutations are added at rate 1e-7. ``pi`` is the diversity in the first
    of the windows [0, 1e6, 1e6 + 3000]; ``fst`` is the first element of the
    average Weir-Cockerham Fst result among 10 randomly chosen
    subpopulations, where subpopulations are consecutive blocks of 100
    genotype columns.
    """
    ts = pyslim.load(tree_sequence_file)
    mutated_tree = msprime.mutate(ts, 1e-7)

    # Haplotype matrix -> diploid genotypes for scikit-allel.
    hap_matrix = mutated_tree.genotype_matrix()
    genotype_array = allel.HaplotypeArray(hap_matrix).to_genotypes(ploidy=2)

    ## Calculate Diversity
    pi = mutated_tree.diversity(windows=[0, 1e6, 1e6 + 3000])

    # Consecutive blocks of 100 column indices, one block per subpopulation.
    subpopulations = [list(range(start, start + 100))
                      for start in range(0, genotype_array.shape[1], 100)]
    chosen = np.random.choice(len(subpopulations), 10, replace=False)
    subpops = np.array(subpopulations)[chosen]

    mean_fst = allel.average_weir_cockerham_fst(genotype_array, blen=10000,
                                                subpops=subpops)
    return (pi[0], mean_fst[0])