def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a SNP map file plus two per-population genotype text files.

    The map file is first read as tab-delimited; if that does not yield the
    six expected columns it is re-read as space-delimited.  If building the
    variant table indexed on ``POS`` raises ``ValueError`` the table is sorted
    by ``CHROM`` then ``POS`` and the genotype rows are permuted the same way
    so that row *i* of each genotype array still describes row *i* of the map.

    Parameters
    ----------
    mapfn
        Path of the header-less map file with columns
        ID, CHROM, GDist, POS, REF, ALT.
    pop_a_fn, pop_b_fn
        Paths of the genotype matrices read with ``np.loadtxt`` as int8; each
        row is reshaped to ``(-1, 2)``.

    Returns
    -------
    tuple
        ``(geno1, geno2, pos, gdist)``: two ``GenotypeChunkedArray`` objects,
        a ``SortedIndex`` of positions and the ``GDist`` column.

    Raises
    ------
    AssertionError
        If the position index contains NaN values.
    """
    columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]
    tbl = pd.read_csv(mapfn, sep="\t", header=None, engine="c")
    try:
        tbl.columns = columns
    except ValueError:
        logger.info("File not tab delimited as expected- trying with spaces")
        tbl = pd.read_csv(mapfn, sep=" ", header=None, engine="c",
                          names=columns)

    # Row permutation applied to the map table, if any; None means the
    # file order was kept.
    row_order = None
    try:
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")
    except ValueError:
        logger.warning(
            "Possible SNPs file is not sorted. Attempting to sort. This is likely to be inefficient"
        )
        tbl = tbl.sort_values(["CHROM", "POS"])
        # The frame still carries its original 0..n-1 row labels, so the
        # sorted index is exactly the permutation to apply to the genotypes.
        row_order = tbl.index.to_numpy()
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    d1 = np.loadtxt(pop_a_fn, dtype="int8")
    d2 = np.loadtxt(pop_b_fn, dtype="int8")
    if row_order is not None:
        # Keep genotype rows aligned with the re-sorted variants.
        d1 = d1[row_order]
        d2 = d2[row_order]
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    pos = allel.SortedIndex(vartbl.POS[:])
    assert np.isnan(pos).sum() == 0, "nans values are not supported"
    return geno1, geno2, pos, vartbl.GDist[:]
def load_arrays_noncoding_and_centromeres(local_path, _set, chrom, coding_reg_df, sitefilter='gamb_colu', filter_centro=True):
    """
    Read a genotype array and restrict it to noncoding, non-centromeric sites.

    The site filter stored under ``sitefilter`` (e.g. arabiensis ``arab`` or
    gambiae/coluzzii ``gamb_colu``) is applied first, then the optional
    centromere cut, then every position falling inside a coding range of
    ``coding_reg_df`` is removed.

    Parameters
    ----------
    local_path : root directory of the zarr data.
    _set : sample-set name used in the genotype path.
    chrom : chromosome arm; one of 2L, 2R, 3L, 3R, X when ``filter_centro``
        is ``True``.
    coding_reg_df : object exposing ``start`` and ``end`` of coding ranges.
    sitefilter : name of the site-filter directory.
    filter_centro : the centromere cut is applied only when this is exactly
        ``True``.

    Returns
    -------
    tuple
        ``(geno, positions)`` after all filters.

    Raises
    ------
    ValueError
        If ``filter_centro`` is ``True`` and ``chrom`` has no boundary defined.
    """
    # Side of the boundary that is kept for each arm.  Boundaries were chosen
    # by eye based on ag1000g phase1 paper fig1.
    non_centromeric = {
        '2L': ('above', 3000000),
        '2R': ('below', 57000000),
        '3L': ('above', 2000000),
        '3R': ('below', 50000000),
        'X': ('below', 21000000),
    }
    apply_centro = filter_centro is True
    if apply_centro and chrom not in non_centromeric:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(
            f"no centromere boundary defined for chromosome {chrom!r}; "
            f"expected one of {sorted(non_centromeric)} or filter_centro=False")

    Ag_array = zarr.open_array(
        f"{local_path}/snp_genotypes/all/{_set}/{chrom}/calldata/GT/",
        mode='r')
    filters = zarr.open(
        f"{local_path}/site_filters/dt_20200416/{sitefilter}/{chrom}/variants/filter_pass",
        mode="r")
    positions = zarr.open_array(
        f"{local_path}/snp_genotypes/all/sites/{chrom}/variants/POS/",
        mode='r')

    # Load the site filter once and reuse it for positions and genotypes.
    site_pass = filters[:]
    positions = positions[:][site_pass]
    geno = allel.GenotypeDaskArray(Ag_array)[site_pass]

    if apply_centro:
        side, bound = non_centromeric[chrom]
        keep = positions > bound if side == 'above' else positions < bound
        positions = allel.SortedIndex(positions[keep])
    else:
        positions = allel.SortedIndex(positions)

    # Boolean array marking positions that fall in a coding range.
    coding = positions.locate_ranges(coding_reg_df.start,
                                     coding_reg_df.end,
                                     strict=False)

    if apply_centro:
        geno = geno.compress(keep, axis=0)
    # We want the noncoding sites, hence the inverted mask.
    geno = geno.compress(~coding, axis=0)
    positions = positions[~coding]
    return (geno, positions)
def plotvars(chrm, callset, window_size=100000, title=None, saved=True):
    """Plot variant density in fixed-size windows along one chromosome.

    Parameters
    ----------
    chrm : chromosome name (bytes are decoded as UTF-8, anything without a
        ``decode`` method is used unchanged).
    callset : mapping providing ``variants/CHROM`` and ``variants/POS``.
    window_size : window width in bp.
    title : plot title; the chromosome name is used when falsy.
    saved : when truthy the figure is written to ``<chrm>.vars.pdf``.
    """
    try:
        chrm = chrm.decode("utf-8")
    except AttributeError:
        pass

    on_chrom = np.where(callset['variants/CHROM'][:] == chrm)
    varpos = allel.SortedIndex(callset['variants/POS'][:][on_chrom])

    # Window edges and their midpoints (the x coordinates).
    edges = np.arange(0, varpos.max(), window_size)
    midpoints = (edges[1:] + edges[:-1]) / 2
    # Variants per window, scaled to a per-bp density.
    counts = np.histogram(varpos, bins=edges)[0]
    density = counts / window_size

    fig, ax = plt.subplots(figsize=(12, 3))
    sns.despine(ax=ax, offset=10)
    ax.plot(midpoints, density)
    ax.set_xlabel('Chromosome position (bp)')
    ax.set_ylabel('Variant density (bp$^{-1}$)')
    ax.set_title(title if title else chrm)
    if saved:
        fig.savefig("{}.vars.pdf".format(chrm), bbox_inches='tight')
def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2, sites_list_chunk):
    """Read one window of ``args.vcf`` and keep biallelic SNPs and invariants.

    Returns
    -------
    tuple
        ``(callset_is_none, gt_array, pos_array)``.  The flag is ``True`` and
        both arrays are ``None`` when the VCF holds no sites for the window
        or when no site is left after filtering.
    """
    # Region string for the current window, e.g. "chr1:100-200".
    window_region = chromosome + ":" + "-".join(
        (str(window_pos_1), str(window_pos_2)))

    wanted_fields = [
        'CHROM', 'POS', 'calldata/GT', 'variants/is_snp', 'variants/numalt'
    ]
    callset = allel.read_vcf(args.vcf,
                             region=window_region,
                             fields=wanted_fields)

    # No sites at all for this range: report that to the caller
    # (compute_summary_stats uses it to account for fully missing sites).
    if callset is None:
        return True, None, None

    gt_array = allel.GenotypeArray(
        allel.GenotypeDaskArray(callset['calldata/GT']))
    pos_array = allel.SortedIndex(callset['variants/POS'])

    # Biallelic SNPs (is_snp with one ALT) or invariant sites (no ALT).
    biallelic_snp = np.logical_and(callset['variants/is_snp'][:] == 1,
                                   callset['variants/numalt'][:] == 1)
    invariant = callset['variants/numalt'][:] == 0
    snp_invar_mask = np.logical_or(biallelic_snp, invariant)

    # Drop every row that is neither, then re-wrap as a genotype array.
    rejected_rows = np.where(np.invert(snp_invar_mask))
    gt_array = allel.GenotypeArray(np.delete(gt_array, rejected_rows, axis=0))
    pos_array = pos_array[snp_invar_mask]

    # Optional restriction to a list of target sites.
    if sites_list_chunk is not None:
        gt_array = mask_non_target_sites(gt_array, pos_array,
                                         sites_list_chunk)

    # Everything may have been removed by the masks above.
    if len(gt_array) == 0:
        return True, None, None

    return False, gt_array, pos_array
def misspos(chrm, callset, pc, samples, window_size=10000, title=None, saved=False):
    """Plot mean per-site missingness in fixed-size windows of a chromosome.

    Windows start at ``0, window_size, 2*window_size, ...`` below the largest
    position; window *k* covers positions in ``[start_k, start_{k+1})`` and
    the final window runs to the last variant.  The windowed means, divided
    by ``samples``, are handed to ``ap.plotmiss``.

    Parameters
    ----------
    chrm : chromosome name compared against ``variants/CHROM``.
    callset : mapping providing ``variants/CHROM`` and ``variants/POS``.
    pc : per-variant values (presumably missing-call counts; confirm with the
        caller) indexed like the callset variants.
    samples : divisor applied to every windowed mean.
    window_size : window width in bp.
    title, saved : forwarded unchanged to ``ap.plotmiss``.

    Notes
    -----
    Earlier revisions dropped the last variant of every window (an exclusive
    slice end was additionally decremented) and reported 0 for the final
    window because a blanket ``except`` swallowed the out-of-range lookup.
    Windows without variants yield NaN.
    """
    chrom_mask = np.where(callset['variants/CHROM'][:] == chrm)
    varpos = allel.SortedIndex(callset['variants/POS'][:][chrom_mask])
    miss_site = pc[:][chrom_mask]

    # Window start coordinates double as the x axis.
    x = np.arange(0, varpos.max(), window_size)

    # Index of the first variant at or beyond each window start; the end of
    # a window is the start index of the next one (exclusive).
    starts = np.searchsorted(np.asarray(varpos), x, side='left')
    stops = np.append(starts[1:], len(varpos))

    y = np.array([
        np.mean(miss_site[lo:hi]) if hi > lo else np.nan
        for lo, hi in zip(starts, stops)
    ])
    ap.plotmiss(x, y / samples, title, chrm, saved)
def msp2sf2(tree_sequence, npops):
    """Write per-population ``.sf2in`` / ``.sf2inrecomb`` files.

    For population index ``i`` two tab-separated files are written to the
    working directory: ``{i}.Neutral.sf2in`` (position, derived-allele count,
    sample count, folded flag 0) and ``{i}.Neutral.sf2inrecomb`` (position,
    rate).  Only sites with a derived-allele count above zero in that
    population are written.

    Parameters
    ----------
    tree_sequence : object exposing ``get_samples``, ``get_num_mutations``,
        ``get_sample_size`` and ``variants``.
    npops : number of populations; population ids ``0..npops-1`` are used.

    Returns
    -------
    None
    """
    pix = [tree_sequence.get_samples(pop) for pop in range(npops)]

    # Haplotype matrix and positions, gathered in a single pass over variants.
    muts = tree_sequence.get_num_mutations()
    sample_size = tree_sequence.get_sample_size()
    V = np.zeros((muts, sample_size), dtype=np.int8)
    positions = []
    for variant in tree_sequence.variants():
        V[variant.index] = variant.genotypes
        positions.append(int(variant.position))
    gt = allel.HaplotypeArray(V)
    pos = allel.SortedIndex(positions)

    for i, p in enumerate(pix):
        # max_allele=1 guarantees a derived-allele column even when this
        # population carries no derived allele at any site.
        ac = gt[:, p].count_alleles(max_allele=1)[:, 1]
        # Both handles are closed even if a write fails part-way.
        with open("{}.Neutral.sf2inrecomb".format(i), 'w') as d, \
                open("{}.Neutral.sf2in".format(i), 'w') as f:
            d.write("position\trate\n")
            f.write("position\tx\tn\tfolded\n")
            for r, dac in enumerate(ac):
                if dac > 0:
                    f.write("{}\t{}\t{}\t0\n".format(pos[r], dac, len(p)))
                    # The first variant row gets rate 0; later rows use
                    # position / 850000.0.
                    if r != 0:
                        d.write("{}\t{}\n".format(pos[r],
                                                  pos[r] / 850000.0))
                    else:
                        d.write("{}\t{}\n".format(pos[r], 0))
    return (None)
def countPatternDFOIL(callset, sample_ix, outgroup):
    """Build bit-packed site patterns for every four-sample combination.

    Parameters
    ----------
    callset : mapping providing ``calldata/GT`` and ``variants/POS``.
    sample_ix : four groups of sample indices; one sample is drawn from each.
    outgroup : sample index of the outgroup.

    Returns
    -------
    dict
        Maps the 1-based combination number to ``(positions, patterns)``
        where ``patterns`` is ``np.packbits`` of the hom-alt calls of the
        four samples followed by the outgroup.
    """
    print("counting patterns in file...")
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])

    # remove any sites where outgroup is ./. or 0/1, and non-biallelic sites
    usable = (genotypes[:, outgroup].is_hom()
              & genotypes.count_alleles().is_biallelic())
    genotypes = genotypes.compress(usable, axis=0)
    positions = positions[usable]

    g1, g2, g3, g4 = sample_ix
    combos = list(product(g1, g2, g3, g4))
    print("total number of combinations: {}".format(len(combos)))

    windict = {}
    for permute, (i, j, k, m) in enumerate(combos, start=1):
        print("permutation number {}".format(permute))
        subset = genotypes.take([i, j, k, m, outgroup], axis=1)
        # keep only sites where all five samples are homozygous
        all_hom = subset.is_hom().all(axis=1)
        subset = subset.compress(all_hom, axis=0)
        pattern_array = np.packbits(subset.is_hom_alt(), axis=1)
        windict[permute] = (positions[all_hom], pattern_array)
    return (windict)
def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
    """Load genotypes for two sample sets from one chromosome of a zarr store.

    Parameters
    ----------
    zarr_fn : path of the zarr store; the group named ``chrom`` is used.
    chrom : chromosome group name inside the store.
    s1, s2 : passed to ``get_sample_ids`` to obtain the two lists of ids.
    gdistkey : optional name of a dataset under ``variants`` holding genetic
        distances.

    Returns
    -------
    tuple
        ``(g1, g2, pos, gdist)``; ``gdist`` is ``None`` when ``gdistkey`` is
        not given.

    Raises
    ------
    ValueError
        If a requested sample id is not present in the store.
    """
    import zarr

    samples1 = get_sample_ids(s1)
    samples2 = get_sample_ids(s2)

    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]

    # Sample ids are stored as bytes; decode before matching.
    samples_x = zfh["samples"][:]
    sample_name = [sid.decode() for sid in samples_x.tolist()]

    idx1 = np.array([sample_name.index(sid) for sid in samples1])
    idx2 = np.array([sample_name.index(sid) for sid in samples2])

    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])
    pos = allel.SortedIndex(zfh["variants"]["POS"][:])

    if gdistkey is not None:
        # Read from the zarr group opened above (this previously referenced
        # an undefined HDF5 handle and raised NameError).
        gdist = zfh["variants"][gdistkey][:]
    else:
        gdist = None

    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
def plth12(chromlist):
    """Plot Garud's H12 along each chromosome in ``chromlist``.

    For every entry ``c`` the file ``PNG.phased.X.recode.{c}.h5`` is read and
    ``PNG.{c}.H12.pdf`` is written.  Each window's H12 value is repeated for
    the variants of that window; variants after the last complete window get
    the last window's value.

    NOTE(review): ``window_size`` is not defined in this function and is
    expected to exist at module scope - confirm, or pass it in explicitly.
    NOTE(review): the autosomal input was named
    ``PNG.phased.autosomal.recode.{c}.h5``.
    """
    for c in chromlist:
        # The context manager releases the HDF5 handle for every chromosome.
        with h5py.File("PNG.phased.X.recode.{}.h5".format(c),
                       mode='r') as callset:
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            h = g.to_haplotypes()
            pos = allel.SortedIndex(callset["variants/POS"][:])
            # H12 is the second statistic returned by moving_garud_h.
            h12 = allel.moving_garud_h(h, window_size)[1]

        # Expand window values to one value per variant.  Assumes windows
        # are consecutive, non-overlapping runs of window_size variants
        # (moving_garud_h with its default step - TODO confirm).  The former
        # position-scanning loop indexed pos[end] past the end whenever the
        # variant count was an exact multiple of window_size.
        h12_pos = [value for value in h12 for _ in range(window_size)]
        n_tail = len(pos) - len(h12_pos)
        if n_tail > 0:
            fill = h12[-1] if len(h12) else float("nan")
            h12_pos.extend([fill] * n_tail)

        plt.plot(pos, h12_pos)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("H12")
        plt.savefig("PNG.{}.H12.pdf".format(c))
        plt.clf()
def calculate_overlap(chrom_overlap_regions, window, modern_haplotype_id, informative_site_positions):
    """Measure how much of ``window`` is covered by one sample's regions.

    Parameters
    ----------
    chrom_overlap_regions : DataFrame with ``sample``, ``start`` and ``end``
        columns.
    window : object exposing ``start`` and ``end``.
    modern_haplotype_id : value matched against the ``sample`` column.
    informative_site_positions : sorted positions of informative sites.

    Returns
    -------
    tuple
        ``(overlapping_bp, n_sites)``: the summed overlap between the window
        and each of the sample's regions touching it, and the number of
        informative sites inside those regions.  ``(0, 0)`` when there are
        no regions for the sample.
    """
    overlapping_bp = 0
    overlapping_informative_sites = list()

    if chrom_overlap_regions.empty:
        return (overlapping_bp, len(overlapping_informative_sites))

    sample_regions = chrom_overlap_regions[
        chrom_overlap_regions['sample'] == modern_haplotype_id]
    if sample_regions.empty:
        return (overlapping_bp, len(overlapping_informative_sites))

    # Lazy %-style arguments: the frames are only rendered when DEBUG is on.
    logging.debug("Overlap regions in chrom:\n%s", chrom_overlap_regions)
    logging.debug("Window: %s", window.start)

    # Build the mask from the already-filtered frame so that mask and frame
    # share one index (the unfiltered frame was used before, which forces
    # pandas to re-align the boolean key).
    touches_window = ((sample_regions['start'] <= window.end)
                      & (sample_regions['end'] >= window.start))
    overlapping_regions = sample_regions[touches_window]
    logging.debug("Overlapping regions for window:\n%s", overlapping_regions)

    for _, region in overlapping_regions.iterrows():
        logging.debug(region)
        # Clip each region to the window before measuring it.
        overlapping_bp += (min(region['end'], window.end)
                           - max(region['start'], window.start))

    logging.debug("Informative site positions: %s",
                  informative_site_positions)
    informative_site_index = allel.SortedIndex(informative_site_positions)
    overlapping_informative_sites = informative_site_index.intersect_ranges(
        starts=overlapping_regions['start'],
        stops=overlapping_regions['end'])

    return (overlapping_bp, len(overlapping_informative_sites))
def ld_prune(gn, pos, size=500, step=200, threshold=.1, n_iter=5):
    """Repeatedly drop variants flagged as linked by ``allel.locate_unlinked``.

    Parameters
    ----------
    gn : array-like
        Per-variant data passed to ``allel.locate_unlinked``; variants are on
        axis 0 and the object must support ``compress``.
    pos : array-like
        Variant positions, indexed in step with ``gn``.
    size : int, optional
        Window size forwarded to ``locate_unlinked``. The default is 500.
    step : int, optional
        Window step forwarded to ``locate_unlinked``. The default is 200.
    threshold : float, optional
        Threshold forwarded to ``locate_unlinked``. The default is .1.
    n_iter : int, optional
        Number of pruning passes. The default is 5.

    Returns
    -------
    allel.SortedIndex
        Positions of the retained variants.
    gn : array-like
        ``gn`` restricted to the retained variants.
    """
    for iteration in range(1, n_iter + 1):
        unlinked = allel.locate_unlinked(gn,
                                         size=size,
                                         step=step,
                                         threshold=threshold)
        kept = np.count_nonzero(unlinked)
        dropped = gn.shape[0] - kept
        print(f"iteration {iteration} retaining {kept} removing {dropped} variants")
        gn = gn.compress(unlinked, axis=0)
        pos = pos[unlinked]
    return allel.SortedIndex(pos), gn
def calculate_switch_distances(windows, switch_array, marker_pos, hz_pos, rohz, gaps):
    """Accumulate per-window marker distance, marker, error and het counts.

    Markers inside each window are split into segments at the midpoints of
    ``gaps``; every segment contributes separately to the four totals.

    Parameters
    ----------
    windows : 2-D array whose rows are ``(start, stop)``.
    switch_array : passed to ``ph.switch.derive_position_switch_array``.
    marker_pos : sorted marker positions; must all occur in ``hz_pos``.
    hz_pos : sorted positions searched with ``np.searchsorted``.
    rohz : forwarded to ``calc_marker_dist``.
    gaps : 2-D array; the mean over axis 1 gives each gap's midpoint.

    Returns
    -------
    numpy.ndarray
        Four stacked rows: marker distance, marker count, error count and
        het count, one column per window.
    """
    marker_pos = allel.SortedIndex(marker_pos)
    gap_mp = np.mean(gaps, axis=1)

    assert np.in1d(marker_pos, hz_pos).all(), "all markers are subset of hets"

    n_windows = windows.shape[0]
    marker_dist = np.zeros(n_windows, dtype="float")
    marker_count = np.zeros(n_windows, dtype="int")
    error_count = np.zeros(n_windows, dtype="int")
    hz_count = np.zeros(n_windows, dtype="int")

    pos_sw = ph.switch.derive_position_switch_array(switch_array)
    pos_errors = np.take(marker_pos, pos_sw[:-1].cumsum())

    for w, (start, stop) in enumerate(windows):
        # TODO (from the original author): this part still needs changing so
        # that A) an error immediately after a gap is not counted and
        # B) it does not count towards distance.
        try:
            ix = marker_pos.locate_range(start, stop)
        except KeyError:
            # No markers in this window: all four totals stay at zero.
            continue

        win_markers = marker_pos[ix]
        n_markers = win_markers.size

        # Where would each gap midpoint fall among this window's markers?
        gap_ix = np.searchsorted(win_markers, gap_mp)
        # Only gaps strictly inside the marker run split it.
        interior = (gap_ix < n_markers) & (gap_ix > 0)
        gap_pos = np.unique(np.compress(interior, gap_ix))

        # Segment boundaries: 0, each interior gap, and the marker count.
        cuts = np.concatenate([[0], gap_pos, [n_markers]])
        assert cuts.size >= 2

        for p, q in zip(cuts[:-1], cuts[1:]):
            segment = win_markers[p:q]
            first, last = win_markers[p], win_markers[q - 1]

            # hets between the first and last marker of the segment
            counthets = (np.searchsorted(hz_pos, last)
                         - np.searchsorted(hz_pos, first))

            error_count[w] += np.sum(evaluate_markers(segment, pos_errors))
            marker_dist[w] += calc_marker_dist(segment, rohz)
            # just one marker is not informative.
            marker_count[w] += (q - p - 1)
            hz_count[w] += counthets

    return np.vstack([marker_dist, marker_count, error_count, hz_count])
def load_hdf5_data(hdf5_fn, chrom, s1, s2):
    """Load genotypes for two sample lists from an HDF5 callset.

    Parameters
    ----------
    hdf5_fn : path of the HDF5 file, opened read-only.
    chrom : accepted but not used by this function.
    s1, s2 : sample ids of the two groups; each must occur in the file's
        ``samples`` dataset.

    Returns
    -------
    tuple
        ``(g1, g2, pos)``: genotypes of each group and the position index.
    """
    callset = h5py.File(hdf5_fn, mode='r')

    # Stored ids are bytes; decode once for matching.
    sample_name = [raw.decode() for raw in callset['samples'][:].tolist()]

    def _columns(wanted):
        return np.array([sample_name.index(sid) for sid in wanted])

    idx1 = _columns(s1)
    idx2 = _columns(s2)

    g = allel.GenotypeChunkedArray(callset["calldata/GT"])
    pos = allel.SortedIndex(callset["variants/POS"][:])

    first_group = g.take(idx1, axis=1)
    second_group = g.take(idx2, axis=1)
    return first_group, second_group, pos
def __main__():
    """Count filtered European variants per LD-block region of a chromosome.

    Reads the region BED file and the EUR sample panel for ``--chr``, opens
    the matching genotype zarr store, keeps biallelic sites whose two allele
    counts both exceed one among the EUR samples, and writes the per-region
    variant counts to ``data/1000G_eur_chr{chr}_region_variant_counts.tsv``.
    """
    parser = arg.ArgumentParser()
    parser.add_argument('--chr', dest='chrom')
    args = parser.parse_args()

    # read in extra data
    bed = pd.read_csv(
        '/psych/ripke/vasa/reference_data/ldetect-data/EUR/fourier_ls-chr{}.bed'
        .format(args.chrom),
        sep=r'\s+')  # raw string: '\s' is an invalid escape in a plain literal
    eur_samples = pd.read_csv(
        '/psych/ripke/1000Genomes_reference/1KG_Oct14/1000GP_Phase3_sr_0517d/integrated_call_samples_v3.20130502.ALL.panel.fam.EUR',
        sep='\t',
        names=['fid', 'iid', 'mid', 'pid', 'sex', 'pheno'],
        header=None)

    # read in genotype data
    zarr_path = '/psych/ripke/vasa/reference_data/1000G/loc.ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.zarr'.format(
        args.chrom)
    callset = zarr.open_group(zarr_path, mode='r')
    pos = ska.SortedIndex(callset['variants/POS'])
    callset_samples = list(callset['samples'][:])
    eur_samples['callset_index'] = [
        callset_samples.index(s) for s in eur_samples['iid']
    ]
    gt = callset['calldata/GT']
    gt_da = ska.GenotypeDaskArray(gt)

    print('Subsetting to europeans')
    eur_da = gt_da.take(eur_samples['callset_index'].values, axis=1)
    eur_ac = eur_da.count_alleles()

    print('Filtering european singletons and invariants')
    # highest observed allele is 1 and both allele counts are above one
    flt = (eur_ac.max_allele() == 1) & (eur_ac[:, :2].min(axis=1) > 1)
    flt_mask = flt.compute()
    flt_da = eur_da.compress(flt_mask, axis=0).compute()
    # update variant index
    pos = pos[flt_mask]

    print('Counting region window sizes: ')
    bed['num_variants'] = np.nan
    n_regions = bed.shape[0]
    for i, region in bed.iterrows():
        print('\t{} of {}'.format(i, n_regions))
        try:
            loc_region = pos.locate_range(region['start'], region['stop'])
        except KeyError:
            # locate_range raises KeyError when nothing falls in the range;
            # record an empty region instead of aborting the whole run.
            bed.loc[i, ['num_variants']] = 0
            continue
        bed.loc[i, ['num_variants']] = flt_da[loc_region, :, :].n_variants

    bed.to_csv('data/1000G_eur_chr{}_region_variant_counts.tsv'.format(
        args.chrom),
               sep='\t')
def jsfs(self, fold=False):
    """Collect ``afs.jsfs_stats`` for every pair of populations.

    ``self.haparr`` is transposed before being wrapped as a haplotype array;
    populations come from ``self.stats["pop_config"]``.

    Returns
    -------
    list
        The values returned for each pair, concatenated in pair order.
    """
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    collected = []
    for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
        # columns of both populations, first population first
        pair_haps = haps.take(pop_a + pop_b, axis=1)
        collected.extend(
            afs.jsfs_stats(len(pop_a), pair_haps, positions, fold))
    return collected
def sfs(self, fold=False):
    """Collect ``afs.asfs_stats`` for every population.

    The ``fold`` argument is ignored: the value of
    ``self.stats["sfs_fold"]`` is always used instead.

    Returns
    -------
    list
        The values returned for each population, concatenated in order.
    """
    fold = self.stats["sfs_fold"]
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    collected = []
    for pop in self.stats["pop_config"]:
        pop_haps = haps.take(pop, axis=1)
        collected.extend(afs.asfs_stats(pop_haps, positions, fold))
    return collected
def filterGT(callset, outgroup):
    """Keep only the variants that segregate within the outgroup samples.

    Parameters
    ----------
    callset : mapping providing ``calldata/GT`` and ``variants/POS``.
    outgroup : column selector for the outgroup samples.

    Returns
    -------
    tuple
        ``(gt, pos)`` restricted to the retained variants.
    """
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])

    # needs to be segregating in the outgroup
    outgroup_counts = genotypes[:, outgroup].count_alleles(max_allele=1)
    segregating = outgroup_counts.is_segregating()

    return (genotypes.compress(segregating, axis=0), positions[segregating])
def load_vcf_wrapper(path, seqid, samples):
    """Read positions and genotypes for one region of a tabix-indexed VCF.

    Parameters
    ----------
    path : VCF path.
    seqid : region string handed to ``allel.read_vcf``.
    samples : samples to load.

    Returns
    -------
    tuple
        ``(positions, genotypes)`` as ``SortedIndex`` and ``GenotypeArray``.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    callset = allel.read_vcf(path,
                             region=seqid,
                             fields=wanted_fields,
                             tabix="tabix",
                             samples=samples)
    positions = allel.SortedIndex(callset["variants/POS"])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return positions, genotypes
def tajd(self):
    """Collect the two ``popstats.tajimaD`` outputs for every population.

    Window size and sequence length are read from ``self.stats``
    (``win_size1`` and ``length_bp``).

    Returns
    -------
    list
        For each population, the two returned values in order.
    """
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    win_size = self.stats["win_size1"]
    length_bp = self.stats["length_bp"]
    collected = []
    for pop in self.stats["pop_config"]:
        pop_haps = haps.take(pop, axis=1)
        value, spread = popstats.tajimaD(positions, pop_haps, win_size,
                                         length_bp)
        collected += [value, spread]
    return collected
def delta_tajD(self):
    """Collect ``pwpopstats.d_tajD`` results for every pair of populations.

    Uses ``win_size1``, ``length_bp`` and ``pw_quants`` from ``self.stats``.

    Returns
    -------
    list
        The values returned for each pair, concatenated in pair order.
    """
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    win_size = self.stats["win_size1"]
    length_bp = self.stats["length_bp"]
    quants = self.stats["pw_quants"]
    collected = []
    for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
        pair_haps = haps.take(pop_a + pop_b, axis=1)
        collected.extend(
            pwpopstats.d_tajD(len(pop_a), positions, pair_haps, win_size,
                              length_bp, quants))
    return collected
def ddRank12(self):
    """Collect ``pwpopstats.ddRank1_2`` results for every population pair.

    Uses ``pw_quants``, ``win_size2`` and ``length_bp`` from ``self.stats``.

    Returns
    -------
    list
        The values returned for each pair, concatenated in pair order.
    """
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    quants = self.stats["pw_quants"]
    win_size = self.stats["win_size2"]
    length_bp = self.stats["length_bp"]
    collected = []
    for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
        pair_haps = haps.take(pop_a + pop_b, axis=1)
        ranks = pwpopstats.ddRank1_2(len(pop_a), positions, pair_haps,
                                     win_size, length_bp, quants)
        # 2 values returned as list [dd1, dd2]
        collected.extend(ranks)
    return collected
def getSNPHistogram(callset, winSize):
    """Count variants in fixed-width windows.

    Parameters
    ----------
    callset : mapping providing ``variants/POS``.
    winSize : window width.

    Returns
    -------
    list
        ``[x, y]`` where ``x`` holds the window midpoints and ``y`` the raw
        (not size-normalised) variant count of each window.
    """
    positions = allel.SortedIndex(callset['variants/POS'])
    edges = np.arange(0, positions.max(), winSize)
    # midpoints of consecutive edges serve as the x coordinate
    midpoints = (edges[1:] + edges[:-1]) / 2
    counts = np.histogram(positions, bins=edges)[0]
    return [midpoints, counts]
def whatsnpisit(locs, chrom, inaccessible=False, missense=True, provide_region=False):
    """
    Return the snpEff ``ANN`` entries of the variants located at ``locs``.

    Useful for the RNA_seq variant calling pipeline.

    Parameters
    ----------
    locs : sorted positions to look up on ``chrom``.
    chrom : chromosome name used in the zarr path and HDF5 group.
    inaccessible : when exactly ``False`` the PASS-only call set and its
        annotation file are used; any other value selects the unfiltered
        call set and the ``all_snp_eff`` annotations.
    missense, provide_region : accepted but currently unused.

    Returns
    -------
    array
        The ``ANN`` values for the positions shared with ``locs``.
    """
    if inaccessible is False:
        pos_path = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/POS"
        callset_fn = '/home/sanj/ag1000g/data/snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.pass.h5'
    else:
        pos_path = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/variants/POS"
        callset_fn = '/home/sanj/ag1000g/data/all_snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.h5'

    positions = allel.SortedIndex(zarr.open_array(pos_path, mode='r')[:])

    # The annotations are copied into memory, so the HDF5 handle can be
    # released right away (it was previously left open).
    with h5py.File(callset_fn, mode='r') as callset:
        snp_eff = callset[chrom]['variants']['ANN'][:]

    # Boolean mask over the call-set positions that also occur in locs.
    positions_bool, _ = positions.locate_intersection(locs)
    return (snp_eff[positions_bool])
def get_callables_sites(callset, chrom):
    '''
    Flag the variants of ``chrom`` that lie in callable regions.

    Input:
        - callset : Zarr object giving access to ``{chrom}/variants/POS``
        - chrom : chromosome identifier, also passed to ``get_callable``
    Output:
        - boolean np array of shape (# SNPs, ), True where the position
          falls inside one of the callable regions
    '''
    callable_regions = get_callable(chrom)
    region_starts = callable_regions[:, 0]
    region_stops = callable_regions[:, 1]
    positions = allel.SortedIndex(callset['{}/variants/POS'.format(chrom)])
    return positions.locate_ranges(starts=region_starts,
                                   stops=region_stops,
                                   strict=False)
def FST(self):
    """Collect ``pwpopstats.fst`` results for every pair of populations.

    When the result for a pair cannot be iterated (``TypeError`` on extend),
    one NaN per entry of ``self.stats["pw_quants"]`` is appended instead.

    Returns
    -------
    list
        The values for each pair, concatenated in pair order.
    """
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    quants = self.stats["pw_quants"]
    collected = []
    for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
        pair_haps = haps.take(pop_a + pop_b, axis=1)
        pair_fst = pwpopstats.fst(len(pop_a), positions, pair_haps, quants)
        try:
            collected.extend(pair_fst)
        except TypeError:
            # non-iterable result: pad with NaN placeholders
            collected.extend([np.nan] * len(quants))
    return collected
def locate_intersection(positions_a, lengths_a, positions_b, lengths_b):
    """Locate shared positions between two concatenated position arrays.

    Both arrays are walked in consecutive chunks whose sizes are given by
    ``lengths_a`` / ``lengths_b`` (paired up in order); positions are
    intersected chunk against chunk.

    Returns
    -------
    tuple
        ``(loc_a, loc_b)``: boolean arrays shaped like the inputs, True where
        a position also occurs in the matching chunk of the other array.
    """
    log("Computing position overlap")
    loc_a = np.zeros(positions_a.shape, dtype=bool)
    loc_b = np.zeros(positions_b.shape, dtype=bool)

    offset_a = offset_b = 0
    for n_a, n_b in zip(lengths_a, lengths_b):
        span_a = slice(offset_a, offset_a + n_a)
        span_b = slice(offset_b, offset_b + n_b)

        chunk_a = allel.SortedIndex(positions_a[span_a])
        chunk_b = allel.SortedIndex(positions_b[span_b])
        hits_a, hits_b = chunk_a.locate_intersection(chunk_b)

        loc_a[span_a] = hits_a
        loc_b[span_b] = hits_b

        offset_a += n_a
        offset_b += n_b
    return loc_a, loc_b
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a space-delimited SNP map and two genotype text files.

    Parameters
    ----------
    mapfn : path of the map file; columns are named
        ID, CHROM, GDist, POS, REF, ALT.
    pop_a_fn, pop_b_fn : paths of the int8 genotype matrices; every row is
        reshaped to ``(-1, 2)``.

    Returns
    -------
    tuple
        ``(geno1, geno2, pos, gdist)``.
    """
    column_names = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]
    tbl = pd.read_csv(mapfn, sep=" ", names=column_names)
    vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    def _read_genotypes(fn):
        flat = np.loadtxt(fn, dtype="int8")
        return allel.GenotypeChunkedArray(flat.reshape((flat.shape[0], -1, 2)))

    geno1 = _read_genotypes(pop_a_fn)
    geno2 = _read_genotypes(pop_b_fn)

    positions = allel.SortedIndex(vartbl.POS[:])
    return geno1, geno2, positions, vartbl.GDist[:]
def dmin(self):
    """Summarise ``pwpopstats.dmin`` for every pair of populations.

    When the first entry of ``self.stats["pw_quants"]`` is negative the
    NaN-ignoring mean is reported for a pair; otherwise the NaN-ignoring
    quantiles at ``pw_quants`` are reported.

    Returns
    -------
    list
        The summaries of each pair, concatenated in pair order.
    """
    haps = allel.HaplotypeArray(self.haparr.T)
    positions = allel.SortedIndex(self.pos)
    quants = self.stats["pw_quants"]
    win_size = self.stats["win_size2"]
    length_bp = self.stats["length_bp"]
    collected = []
    for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
        pair_haps = haps.take(pop_a + pop_b, axis=1)
        pair_dmin = pwpopstats.dmin(len(pop_a), positions, pair_haps,
                                    win_size, length_bp)
        summary = ([np.nanmean(pair_dmin)] if quants[0] < 0
                   else np.nanquantile(pair_dmin, quants))
        collected.extend(summary)
    return collected
def load_vcf_wrapper(path, seqid, samples, samples_path):
    """Read positions and genotypes for one region of a tabix-indexed VCF.

    Parameters
    ----------
    path : VCF path.
    seqid : region string handed to ``allel.read_vcf``.
    samples : samples to load.
    samples_path : only used in the error message, to name where the sample
        list came from.

    Returns
    -------
    tuple
        ``(positions, genotypes)`` as ``SortedIndex`` and ``GenotypeArray``.

    Raises
    ------
    AssertionError
        If the loaded callset has no ``samples`` entry.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    callset = allel.read_vcf(path,
                             region=seqid,
                             fields=wanted_fields,
                             tabix="tabix",
                             samples=samples)

    has_samples = "samples" in callset.keys()
    assert has_samples, "None of the samples provided in {0!r} are found in {1!r}".format(
        samples_path, path)

    positions = allel.SortedIndex(callset["variants/POS"])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return positions, genotypes
def countPattern(callset, sample_ix, outgroup):
    """Tally site patterns and t1/t2 for every three-sample combination.

    Parameters
    ----------
    callset : mapping providing ``calldata/GT`` and ``variants/POS``.
    sample_ix : three groups of sample indices, e.g.
        ``[[1,2,3,4,5],[6,7,8,9],[12,14,15,16]]``; one sample is drawn from
        each group.
    outgroup : sample index of the outgroup.

    Returns
    -------
    tuple
        ``(t1t2dict, windict)``: lists of t1 and t2 values under the keys
        ``"t1"`` / ``"t2"``, and a dict mapping the 1-based combination
        number to ``(positions, bit-packed patterns)``.
    """
    print("counting patterns in file...")
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])

    # remove any sites where outgroup is ./. or 0/1, and non-biallelic sites
    usable = (genotypes[:, outgroup].is_hom()
              & genotypes.count_alleles().is_biallelic())
    genotypes = genotypes.compress(usable, axis=0)
    positions = positions[usable]

    # Packed-byte codes of the two complementary forms of each pattern
    # (four booleans in the high bits, T = hom-alt).
    pattern_codes = (
        (0, 240),    # AAAA: FFFF + TTTT
        (112, 128),  # BAAA: FTTT + TFFF
        (176, 64),   # ABAA: TFTT + FTFF
        (208, 32),   # AABA: TTFT + FFTF
        (48, 192),   # BBAA: FFTT + TTFF
        (144, 96),   # ABBA: TFFT + FTTF
        (80, 160),   # BABA: FTFT + TFTF
        (224, 16),   # BBBA: FFFT + TTTF
    )

    t1t2dict = defaultdict(list)
    windict = {}
    g1, g2, g3 = sample_ix
    trios = list(product(g1, g2, g3))
    print("total number of combinations: {}".format(len(trios)))

    for permute, (i, j, k) in enumerate(trios, start=1):
        print("permutation number {}".format(permute))
        subset = genotypes.take([i, j, k, outgroup], axis=1)
        # keep only sites where all four samples are homozygous
        all_hom = subset.is_hom().all(axis=1)
        subset = subset.compress(all_hom, axis=0)
        pos_sub = positions[all_hom]
        pattern_array = np.packbits(subset.is_hom_alt(), axis=1)

        # occurrences of each packed pattern value
        codes, occurrences = np.unique(pattern_array, return_counts=True)
        tally = dict(zip(codes, occurrences))

        # total counts in the order AAAA, BAAA, ABAA, AABA, BBAA, ABBA,
        # BABA, BBBA
        totals = [tally.get(a, 0) + tally.get(b, 0) for a, b in pattern_codes]

        # t1t2 calc
        t1, t2 = calct1t2(*totals)
        t1t2dict["t1"].append(t1)
        t1t2dict["t2"].append(t2)

        # windows
        windict[permute] = (pos_sub, pattern_array)
    return (t1t2dict, windict)