def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2,
                              sites_list_chunk):
    """Read one window of the VCF and keep biallelic SNPs and invariant sites.

    Returns a ``(callset_is_none, gt_array, pos_array)`` tuple. The flag is
    True (and both arrays are None) when the VCF has no records in the window
    or when no site is left after filtering; compute_summary_stats uses it to
    report completely missing windows.
    """
    # region string for the current window, "<chrom>:<start>-<end>"
    region = chromosome + ":" + str(window_pos_1) + "-" + str(window_pos_2)

    callset = allel.read_vcf(args.vcf,
                             region=region,
                             fields=[
                                 'CHROM', 'POS', 'calldata/GT',
                                 'variants/is_snp', 'variants/numalt'
                             ])

    # read_vcf yields None when the VCF holds nothing for this range
    if callset is None:
        return True, None, None

    genotypes = allel.GenotypeArray(
        allel.GenotypeDaskArray(callset['calldata/GT']))
    positions = allel.SortedIndex(callset['variants/POS'])

    # sites to keep: biallelic SNPs (one ALT allele) or invariant (no ALT)
    is_biallelic_snp = np.logical_and(callset['variants/is_snp'][:] == 1,
                                      callset['variants/numalt'][:] == 1)
    keep = np.logical_or(is_biallelic_snp,
                         callset['variants/numalt'][:] == 0)

    # drop every genotype row that is not flagged, then re-wrap the result
    rejected_rows = np.where(np.invert(keep))
    genotypes = allel.GenotypeArray(
        np.delete(genotypes, rejected_rows, axis=0))
    positions = positions[keep]

    # optional restriction to a user supplied list of target sites
    if sites_list_chunk is not None:
        genotypes = mask_non_target_sites(genotypes, positions,
                                          sites_list_chunk)

    # filtering may have removed every site: report as an empty callset
    if len(genotypes) == 0:
        return True, None, None

    return False, genotypes, positions
def get_genotype_array_concat(callsets,
                              genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Build one genotype array from several callsets, stacked along axis 0.

    A single callset is delegated to ``get_genotype_array``. Otherwise the
    genotype data of every callset is concatenated and wrapped in the
    scikit-allel class selected by ``genotype_array_type``.

    Raises:
        ValueError: ``genotype_array_type`` is not one of the known options
            (only checked when more than one callset is given).
    """
    if len(callsets) == 1:
        # nothing to concatenate
        return get_genotype_array(callset=callsets[0],
                                  genotype_array_type=genotype_array_type)

    wants_dask = genotype_array_type == config.GENOTYPE_ARRAY_DASK

    pieces = []
    for current in callsets:
        data = get_callset_genotype_data(current)
        if wants_dask:
            # wrap the underlying array in a dask array with the same chunks
            data = da.from_array(data, chunks=data.chunks)
        pieces.append(data)

    if wants_dask:
        return allel.GenotypeDaskArray(da.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        return allel.GenotypeChunkedArray(np.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        return allel.GenotypeArray(np.concatenate(pieces, axis=0))

    raise ValueError(
        'Error: Invalid option specified for genotype_array_type.')
def get_singletons(zarr_folder, chrom, samples, start=-9, stop=-9):
    """Collect singleton positions per sample for one chromosome.

    Args:
        zarr_folder: path of the zarr group holding the callset.
        chrom: chromosome key inside the group; converted with ``str`` for
            every lookup so that e.g. ``1`` and ``"1"`` behave the same.
        samples: sample indices to keep (columns of the genotype array).
        start, stop: window bounds; the sentinel ``-9`` means "use the
            smallest / largest position stored for the chromosome".

    Returns:
        ``(ind_dict, gt, ids, ref, alt, pos, start, stop)`` where
        ``ind_dict`` maps a sample column index (within ``samples``) to the
        list of positions at which it carries a heterozygous call among the
        sites where allele 1 is a singleton, ``gt`` is the in-memory genotype
        array of the selected samples and ``ids``/``ref``/``alt``/``pos`` are
        the variant arrays as stored in the group.
    """
    callset = zarr.open_group(zarr_folder, mode='r')
    chrom_group = callset[str(chrom)]
    variants = chrom_group['variants']
    pos = variants['POS']
    ref = variants['REF']
    alt = variants['ALT']
    ids = variants['ID']

    gt = allel.GenotypeDaskArray(chrom_group['calldata']['GT'])
    gt = gt.take(samples, axis=1).compute()  # subset to samples of interest
    ac = gt.count_alleles()

    if start == -9 or stop == -9:
        # load the positions once instead of iterating the stored array
        pos_values = pos[:]
        if start == -9:
            start = pos_values.min()
        if stop == -9:
            stop = pos_values.max()

    flt = ac.is_singleton(1)
    pos2 = pos.get_mask_selection(flt)
    gf = gt.compress(flt, axis=0)

    # position -> column index of the heterozygous sample
    sing_dict = {p: i for p, i in zip(pos2, np.where(gf.is_het())[1])}

    # invert to sample -> [positions]
    ind_dict = {}
    for position, sample_ix in sing_dict.items():
        ind_dict.setdefault(sample_ix, []).append(position)

    return ind_dict, gt, ids, ref, alt, pos, start, stop
def __main__():
    """Count filtered European variants per LD block for one chromosome.

    Reads the ldetect block definitions and the EUR sample panel, keeps the
    variants whose highest observed allele is 1 and whose reference and
    first-alternate counts are both above 1 in the European subset, and
    writes the number of retained variants per block to
    ``data/1000G_eur_chr<chrom>_region_variant_counts.tsv``.
    """
    parser = arg.ArgumentParser()
    parser.add_argument('--chr', dest='chrom')
    args = parser.parse_args()

    # read in extra data
    bed = pd.read_csv(
        '/psych/ripke/vasa/reference_data/ldetect-data/EUR/fourier_ls-chr{}.bed'
        .format(args.chrom),
        sep=r'\s+')  # raw string: '\s' is not a valid str escape
    eur_samples = pd.read_csv(
        '/psych/ripke/1000Genomes_reference/1KG_Oct14/1000GP_Phase3_sr_0517d/integrated_call_samples_v3.20130502.ALL.panel.fam.EUR',
        sep='\t',
        names=['fid', 'iid', 'mid', 'pid', 'sex', 'pheno'],
        header=None)

    # read in genotype data
    zarr_path = '/psych/ripke/vasa/reference_data/1000G/loc.ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.zarr'.format(
        args.chrom)
    callset = zarr.open_group(zarr_path, mode='r')
    pos = ska.SortedIndex(callset['variants/POS'])
    callset_samples = list(callset['samples'][:])
    eur_samples['callset_index'] = [
        callset_samples.index(s) for s in eur_samples['iid']
    ]
    gt = callset['calldata/GT']
    gt_da = ska.GenotypeDaskArray(gt)

    print('Subsetting to europeans')
    eur_da = gt_da.take(eur_samples['callset_index'].values, axis=1)
    eur_ac = eur_da.count_alleles()

    print('Filtering european singletons and invariants')
    flt = (eur_ac.max_allele() == 1) & (eur_ac[:, :2].min(axis=1) > 1)
    flt_mask = flt.compute()

    # update variant index; the genotype matrix itself is not needed because
    # the number of variants in a region equals the length of the located
    # slice of the filtered position index
    pos = pos[flt_mask]

    print('Counting region window sizes: ')
    bed['num_variants'] = np.nan
    for i, region in bed.iterrows():
        print('\t{} of {}'.format(i, bed.shape[0]))
        try:
            loc_region = pos.locate_range(region['start'], region['stop'])
        except KeyError:
            # locate_range raises when no position falls inside the range
            n_variants = 0
        else:
            n_variants = loc_region.stop - loc_region.start
        bed.loc[i, 'num_variants'] = n_variants

    bed.to_csv('data/1000G_eur_chr{}_region_variant_counts.tsv'.format(
        args.chrom),
               sep='\t')
def get_genotype_array(callset,
                       genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Wrap a callset's genotype data in the requested scikit-allel class.

    Returns None when ``genotype_array_type`` matches none of the known
    options.
    """
    gtz = get_callset_genotype_data(callset)

    # checked in order; the first matching option decides the wrapper
    wrappers = (
        (config.GENOTYPE_ARRAY_NORMAL, allel.GenotypeArray),
        (config.GENOTYPE_ARRAY_DASK, allel.GenotypeDaskArray),
        (config.GENOTYPE_ARRAY_CHUNKED, allel.GenotypeChunkedArray),
    )
    for option, wrapper in wrappers:
        if genotype_array_type == option:
            return wrapper(gtz)
    return None
def get_biallelic(zarr_folder, chrom, samples):
    """Return a per-variant flag: biallelic for alleles 0/1 among ``samples``.

    The genotypes of chromosome ``chrom`` are read from the zarr group at
    ``zarr_folder``, restricted to the given sample indices and loaded into
    memory before the allele counts are taken.
    """
    store = zarr.open_group(zarr_folder, mode='r')
    genotypes = allel.GenotypeDaskArray(store[str(chrom)]['calldata']['GT'])
    # keep only the samples of interest and materialise the result
    selected = genotypes.take(samples, axis=1).compute()
    allele_counts = selected.count_alleles()
    return allele_counts.is_biallelic_01()[:]
def load_arrays_noncoding_and_centromeres(local_path,
                                          _set,
                                          chrom,
                                          coding_reg_df,
                                          sitefilter='gamb_colu',
                                          filter_centro=True):
    """
    Read a genotype array and restrict it to noncoding, noncentromeric sites.

    The site filter depends on whether the samples are arabiensis (arab) or
    gambiae/coluzzii (gamb_colu).

    Args:
        local_path: root directory of the data release.
        _set: sample set name (sub-directory of ``snp_genotypes/all``).
        chrom: chromosome arm; with ``filter_centro=True`` it must be one of
            '2L', '2R', '3L', '3R' or 'X'.
        coding_reg_df: table with ``start`` and ``end`` columns describing
            the coding regions to exclude.
        sitefilter: name of the site filter to apply.
        filter_centro: when exactly ``True``, also drop the centromeric end
            of the arm.

    Returns:
        ``(geno, positions)``: the filtered GenotypeDaskArray and the
        matching SortedIndex of positions.

    Raises:
        ValueError: ``filter_centro`` is True and ``chrom`` has no known
            centromere boundary.
    """
    # Boundaries chosen by eye from the ag1000g phase1 paper, fig. 1.
    # Positions above (resp. below) the threshold are kept.
    keep_above = {'2L': 3000000, '3L': 2000000}
    keep_below = {'2R': 57000000, '3R': 50000000, 'X': 21000000}

    if filter_centro is True and chrom not in keep_above and chrom not in keep_below:
        # previously this fell through and failed later on an unbound local
        raise ValueError(
            f"no centromere boundary defined for chromosome {chrom!r}")

    Ag_array = zarr.open_array(
        f"{local_path}/snp_genotypes/all/{_set}/{chrom}/calldata/GT/",
        mode='r')
    filters = zarr.open(
        f"{local_path}/site_filters/dt_20200416/{sitefilter}/{chrom}/variants/filter_pass",
        mode="r")
    positions = zarr.open_array(
        f"{local_path}/snp_genotypes/all/sites/{chrom}/variants/POS/",
        mode='r')

    # read the pass filter once and reuse it for positions and genotypes
    pass_mask = filters[:]
    positions = positions[:][pass_mask]
    geno = allel.GenotypeDaskArray(Ag_array)
    geno = geno[pass_mask]

    if filter_centro is True:
        if chrom in keep_above:
            noncentromeric = (positions > keep_above[chrom])
        else:
            noncentromeric = (positions < keep_below[chrom])
        positions = allel.SortedIndex(positions[noncentromeric])
        # remove centromeric regions of low recombination
        geno = geno.compress(noncentromeric, axis=0)
    else:
        positions = allel.SortedIndex(positions)

    # boolean array flagging positions that fall inside a coding region
    coding = positions.locate_ranges(coding_reg_df.start,
                                     coding_reg_df.end,
                                     strict=False)

    # we want noncoding regions, hence the inverted mask
    geno = geno.compress(~coding, axis=0)
    positions = positions[~coding]
    return (geno, positions)
def gtstats(calls, pop2color, n_variants):
    """Plot per-sample heterozygosity, missingness and mean depth.

    Counts are taken from ``calls['calldata/GT']`` and depth from
    ``calls['calldata/DP']``; the het and missing counts are divided by
    ``n_variants`` before plotting. Returns the number of missing calls per
    variant (count along axis 1).
    """
    genotypes = allel.GenotypeDaskArray(calls['calldata/GT'])

    missing_per_sample = genotypes.count_missing(axis=0)[:].compute()
    missing_per_variant = genotypes.count_missing(axis=1)[:].compute()
    het_per_sample = genotypes.count_het(axis=0)[:].compute()

    depth = calls['calldata/DP']
    mean_depth = np.mean(depth[:, :], axis=0)

    panels = (
        (het_per_sample / n_variants, 'Heterozygous'),
        (missing_per_sample / n_variants, 'Missing'),
        (mean_depth, 'Depth'),
    )
    for values, title in panels:
        ap.plotstats(values, title, pop2color)

    return missing_per_variant
def get_ACdata(zarr_folder, chrom, samples, start=-9, stop=-9):
    """Load allele counts and positions for one chromosome.

    Args:
        zarr_folder: path of the zarr group holding the callset.
        chrom: chromosome key inside the group; converted with ``str`` for
            every lookup so that e.g. ``1`` and ``"1"`` behave the same.
        samples: sample indices to keep (columns of the genotype array).
        start, stop: window bounds; the sentinel ``-9`` means "use the
            smallest / largest position stored for the chromosome".

    Returns:
        ``(ac, pos, start, stop)``: allele counts of the selected samples,
        the stored position array, and the resolved bounds.
    """
    callset = zarr.open_group(zarr_folder, mode='r')
    chrom_group = callset[str(chrom)]
    pos = chrom_group['variants']['POS']

    gt = allel.GenotypeDaskArray(chrom_group['calldata']['GT'])
    gt = gt.take(samples, axis=1).compute()  # subset to samples of interest
    ac = gt.count_alleles()

    if start == -9 or stop == -9:
        # load the positions once instead of iterating the stored array
        pos_values = pos[:]
        if start == -9:
            start = pos_values.min()
        if stop == -9:
            stop = pos_values.max()

    return ac, pos, start, stop
def get_genotype_data(callset):
    """Return the callset's genotypes as a GenotypeDaskArray, or None.

    The genotype array is looked up under ``callset['calldata']`` as ``GT``
    first and ``genotype`` second. None is returned when ``calldata`` is
    absent or contains neither name.
    """
    if 'calldata' not in callset:
        return None

    for candidate in ('GT', 'genotype'):
        if candidate in callset['calldata']:
            return allel.GenotypeDaskArray(callset['calldata'][candidate])

    return None
def test_ld(self):
    ''' unit test for ldshrink

    Compares the LD estimate for the first four variants of the reference
    genotype file with the top-left 4x4 corner of the stored reference LD
    matrix.
    '''
    input_hdf = "/home/nwknoblauch/Dropbox/Repos/LD_dask/test_data/reference_genotype.h5"
    test_R_file = "test_data/reference_ld.txt"
    m = 85
    Ne = 11490.672741
    cutoff = 0.001

    true_r = np.loadtxt(test_R_file, delimiter="\t")

    # the context manager closes the HDF5 handle even if the test fails;
    # everything that reads from the file stays inside the block
    with h5.File(input_hdf, mode='r') as callset:
        ref_geno = allel.GenotypeDaskArray(callset['calldata/GT'])
        vt = allel.VariantChunkedTable(callset['variants'])
        map_data = vt['MAP']
        geno_ac = ref_geno.to_n_alt().T.compute()

        sub_X = geno_ac[:, :4]
        sub_map = map_data[:4]
        est_r = lddask.ld.ldshrink(sub_X, sub_map, m, Ne, cutoff)

        assert (np.allclose(true_r[:4, :4], est_r))
def load_calldata_by_sampleset(self,
                               seq_id,
                               sampleset,
                               field="GT",
                               mask=None):
    """Load one calldata field for a sample set, or for a list of them.

    A list of sample sets is loaded set by set (without mask) and joined
    along axis 1; the mask, if given, is applied afterwards along axis 0.
    For ``field == "GT"`` the result is wrapped in a GenotypeDaskArray.

    Raises:
        ValueError: ``sampleset`` is neither a string nor a list.
        AssertionError: ``mask`` is given but is not a dask array.
    """
    if isinstance(sampleset, list):
        per_set = [
            self.load_calldata_by_sampleset(seq_id, name, field=field,
                                            mask=None)
            for name in sampleset
        ]
        arr = da.concatenate(per_set, axis=1)
    elif isinstance(sampleset, str):
        path = self.release_dir / "snp_genotypes" / "all" / sampleset
        print(path)
        # need to open as mapping if this is on cloud storage
        store = self.gcs.get_mapper(path.as_posix())
        group = zarr.Group(store)
        arr = da.from_zarr(group[f"{seq_id}/calldata/{field}"])
    else:
        raise ValueError(
            "sampleset must be a string, or a list of strings")

    if mask is not None:
        assert isinstance(mask, da.core.Array), "mask must be a dask_array"
        arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

    return allel.GenotypeDaskArray(arr) if field == "GT" else arr
def collect_metrics(self, dataset, metadataObj, indels_flag):
    """Compute the per-sequence summary metrics and store them on ``self``.

    Reads the ``variants`` and ``calldata`` groups of ``dataset[self.seq_id]``
    and fills the missingness, numalt, variant-type (only when
    ``indels_flag`` is set and non-SNP variants exist), genotype-count,
    depth, SNP-density and biallelic-SNP attributes.
    """
    seq_group = dataset[self.seq_id]
    variants = seq_group['variants']
    calldata = seq_group['calldata']

    pos = variants['POS'][:]  # numpy.ndarray
    gts = allel.GenotypeDaskArray(calldata['GT'])
    acs = gts.count_alleles(max_allele=3).compute()  # AlleleCountsArray
    # len(REF) == 1 && len(ALT) == 1 (excludes SNP+other)
    is_snp = np.array(variants['is_snp'])
    is_var = acs.is_variant()  # AlternateCall >= 1

    # missingness of SNP/nonSNP vars/invars
    self.missingness_counter_by_label = get_missingness_counter_by_label(
        gts, is_var, is_snp)

    # number of ALT alleles at SNPs
    numalts = variants['numalt'][:]
    self.snp_numalt_counter = get_snp_numalt_counter(numalts, is_snp)

    # variant types, only when requested and non-SNP variants are present
    if indels_flag and np.any(~is_snp & is_var):
        ref_lengths = np.frompyfunc(len, 1, 1)(variants['REF'][:])
        self.variant_type_counter_by_idx = get_variant_counter_by_idx(
            ref_lengths, pos, is_snp)

    # genotype classes per sample
    self.sample_counts_by_gt = get_sample_counts_by_gt(
        gts, metadataObj, is_snp)

    # per-sample depth at SNPs
    snp_depths = calldata['DP'][:][is_snp]
    self.sample_snp_dp_counter_by_sample_id = get_sample_snp_dp_counter_by_sample_id(
        snp_depths, metadataObj)

    # SNP density over windows
    counts, windows = allel.windowed_count(pos[:][is_snp],
                                           size=WINDOWSIZE,
                                           start=1,
                                           stop=self.seq_length)
    self.snp_densities = np.round(counts / WINDOWSIZE, 4)

    # biallelic SNPs
    is_biallelic_snp = (acs.is_biallelic()[:] & is_snp)
    singleton_mask = (acs.max_allele() == 1) & acs.is_singleton(1)
    self.biallelic_snp_singletons_count = np.count_nonzero(singleton_mask)

    # TS/TV: REF+ALT pairs as fixed-width byte strings
    ref_alleles = np.array(variants['REF'][:][is_biallelic_snp],
                           dtype="|S2")
    alt_alleles = np.array(variants['ALT'][:, 0][is_biallelic_snp],
                           dtype="|S2")
    site_depth = variants['DP'][:][is_biallelic_snp]
    site_qual = variants['QUAL'][:][is_biallelic_snp]
    biallelic_acs = acs[is_biallelic_snp]

    self.biallelic_snps_mutations = np.char.add(ref_alleles, alt_alleles)
    self.biallelic_snps_QUAL_DPs = site_qual / site_depth
    self.biallelic_snps_count = len(biallelic_acs)

    biallelic_gts = gts[is_biallelic_snp].compute()
    # max_allele=1, otherwise error in allel.stats.sf._check_ac_n()
    acs_by_pop = biallelic_gts.count_alleles_subpops(
        metadataObj.sample_ids_by_pop_id, max_allele=1)

    seg_counts = {}
    for pop_id, pop_acs in acs_by_pop.items():
        seg_counts[pop_id] = pop_acs.count_segregating()
    self.biallelic_snps_seg_count_by_pop_id = seg_counts

    # sites segregating in the 'all' population
    is_segregating = acs_by_pop['all'].is_segregating()[:]
    segregating_acs = {}
    for pop_id in metadataObj.pop_ids_order:
        segregating_acs[pop_id] = acs_by_pop[pop_id][is_segregating]
    self.segregating_biallelic_snp_acs_by_pop_id = segregating_acs
ac, ac2, size=winsize, start=start, stop=stop, step=int(winsize / 2)) new_dat = format_results(stat=Fst_Pat, stat_name="Fst_Pat", chrom=chrom, windows=windows, nvar=counts, pop=pop) df_list.append(new_dat) if 'r2' in args.s: #pdb.set_trace() ct = allel.GenotypeDaskArray( callset[str(chrom)]['calldata']['GT']) ct = ct.take(loc_samples, axis=1).compute() ct = ct.compress(biallelic, axis=0) ct = ct.to_n_alt(fill=-1) r2, windows, counts = allel.windowed_r_squared( pos, ct, size=winsize, start=start, stop=stop, step=int(winsize / 2), fill=-9) new_dat = format_results(stat=r2, stat_name="r2", chrom=chrom, windows=windows,
mapping = polarize_map(callset, ancestral_sequence) # calculating mutation rate in windows - should probably be set up as a function, but I will start out as without gt_outgroup = gt.compress(callables*biallelic*ancestral, axis = 0).take(outgroup, axis = 1) ps_outgroup = allel.SortedIndex(callset['variants/POS']).compress(callables*biallelic*ancestral) gt_outgroup_allele_count = gt_outgroup.count_alleles() polymorphic_loci = (gt_outgroup_allele_count[:, 0] != 0)*(gt_outgroup_allele_count[:, 1] != 0) mutrate(ps_outgroup, polymorphic_loci, chrom_number) # boolean numpy array encoding if a position in the genome (after filtering and polarizying) for the outgroup individuals # is variant (more than 0 alleles of type "1", in this case, derived) or not variant_loci_outgroup = (allel.GenotypeDaskArray(callset['/calldata/GT']) .map_alleles(mapping) .compress(callables*biallelic*ancestral, axis = 0) .take(outgroup, axis = 1) .count_alleles() .is_variant() .compute()) #Genotype Array polarized and with SNPs filtered for the ingroup individuals gt_ingroup = (allel.GenotypeDaskArray(callset['/calldata/GT']) .map_alleles(mapping) .compress(callables*biallelic*ancestral, axis = 0) .take(ingroup, axis = 1) .compute()) #Write the observation file per ind obs(ps, gt_ingroup, variant_loci_outgroup, chrom_number, ingroup_names, dir_name)
samples_HGDP = list(callset_HGDP["{}/samples".format(chrom)][:]) samples_1KGP = list(callset_1KGP["{}/samples".format(chrom)][:]) #outgroup and ingroup individuals index outgroup_index_HGDP = get_outgroup_index_HGDP(samples_HGDP) outgroup_index_1KGP = get_outgroup_index_1KGP(samples_1KGP) ingroup_index = get_ingroup_index(samples_HGDP, ingroup_names) #mapping array to polarize SNPS mapping = polarize_map(callset_HGDP, chrom) #boolean numpy array encoding if a position in the genome (after filtering and polarizying) for the outgroup individuals #is variant (more than 0 alleles of type "1", in this case, derived) or not variant_loci_outgroup_HGDP = (allel.GenotypeDaskArray(callset_HGDP[ '{}/calldata/GT'.format(chrom)]).map_alleles(mapping).compress( callables * biallelic * ancestral, axis=0).take(outgroup_index_HGDP, axis=1).count_alleles().is_variant().compute()) #boolean numpy array encoding for each position in the VCF of 1KGP if it appears also in the HGDP data intersect_loci_1KGP, intersect_loci_HGDP = allel.SortedIndex( callset_1KGP["{}/variants/POS".format(chrom)]).locate_intersection(ps) #boolean numpy array encoding for each position in the VCF of 1KGP if it appears also in the HGDP data variant_loci_outgroup_1KGP = (allel.GenotypeDaskArray( callset_1KGP['{}/calldata/GT'.format(chrom)]).compress( intersect_loci_1KGP, axis=0).take(outgroup_index_1KGP, axis=1).count_alleles().is_variant().compute()) variant_loci_outgroup_HGDP[intersect_loci_HGDP] += variant_loci_outgroup_1KGP
species = samples_df.groupby('sp_sex').indices species_ix = {k: list(v) for k, v in species.items()} unknown_subpops = ['GM_F', 'GW_F', 'KE_F'] main_species = ['An. coluzzii_F', 'An. gambiae_F'] def compute_divergence(allele_freqs): sum_alt = allele_freqs.sum(axis=0) return (sum_alt[1:].sum()) window_size = 100000 vref_dxy_by_window = dict() xpop_dxy_by_window = dict() for chrom in chroms: print('\nChromosome ' + chrom) gt = allel.GenotypeDaskArray(phase2_ar1.callset_pass[chrom]['calldata/genotype']) ac = gt.count_alleles_subpops(subpops_ix) ac_species = gt.count_alleles_subpops(species_ix) pos = allel.SortedIndex(phase2_ar1.callset_pass[chrom]['variants/POS']) accessibility = phase2_ar1.accessibility[chrom]['is_accessible'] eqa = allel.equally_accessible_windows(accessibility, window_size) # Use the middle of the window as the index window_middle = np.sum(eqa, axis=1)/2 vref_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int), columns=list(subpops.keys()) + list(species.keys())) xpop_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int)) # Calculate distance from the reference in each sub-population for pop in subpops.keys():
def LoadRegion(
        callset,
        meta,  # minimally a dataframe with sample names as index
        region,
        min_FMTDP=0,
        filter_snp=False,
        filter_biallelic=False,
        max_missing_proportion=None,
        group_col=None,  # column name in meta used to identify groups for group_max_missing_proportion
        group_max_missing_proportion=None):
    """Load and filter the genotypes of one region for the samples in meta.

    Args:
        callset: mapping of chromosome name to a group providing 'samples',
            'variants/POS', 'calldata/GT' and, depending on the filters,
            'calldata/DP' and 'variants/TYPE'.
        meta: dataframe indexed by sample name. It is modified in place:
            the columns 'callset_idx' (index in the full callset) and 'idx'
            (index in the returned genotype array) are added.
        region: region string understood by ``str2range``.
        min_FMTDP: genotype calls with FMT:DP below this value are set to
            missing (disabled when 0).
        filter_snp: keep only loci whose TYPE entries are all 'snp' or ''.
        filter_biallelic: keep only loci with exactly two observed alleles.
        max_missing_proportion: maximum fraction of missing calls per locus
            over all samples.
        group_col, group_max_missing_proportion: when both are given, the
            missing-call limit is applied within every group of
            ``meta[group_col]`` and a locus must pass in all groups.

    Returns:
        ``(genotypes, positions, meta)``. Genotypes and positions are empty
        lists when the chromosome or the region holds no loci. The returned
        meta is in the same order and has the same length as the sample axis
        of the genotype array.
    """
    # determine the index in the full callset for all the samples in meta
    callset_all_sample_ids = list(list(callset.values())[0]['samples'])
    meta['callset_idx'] = [callset_all_sample_ids.index(x) for x in meta.index]
    meta['idx'] = np.arange(meta.shape[0])  # index in the genotype array

    ch, start, stop = str2range(region)
    print("Region:", region, '->', (ch, start, stop))

    pos = allel.SortedIndex(callset[ch]['variants/POS'])
    if pos.shape[0] == 0:  # return empty if nothing for chrom
        return [], [], meta

    # create the slice; locate_range raises KeyError for an empty range
    try:
        sl = pos.locate_range(start, stop)
    except KeyError:
        pos = []
    else:
        pos = pos[sl]
    if len(pos) == 0:  # no loci in slice
        return [], [], meta

    # load combined set of both groups
    sample_idxs = meta['callset_idx'].values
    g = allel.GenotypeDaskArray(callset[ch]['calldata/GT'])[sl].take(
        sample_idxs, axis=1)
    g = g.compute()  # need to convert GenotypeDaskArray to GenotypeArray

    ## Filtering
    flt = np.ones(g.shape[0], dtype=bool)
    print('total number of loci =', flt.shape[0])

    # filter genotypes on FMT:DP
    if min_FMTDP > 0:
        genoflt_FMTDP = callset[ch]['calldata/DP'][sl].take(sample_idxs,
                                                            axis=1) < min_FMTDP
        g[genoflt_FMTDP] = [-1, -1]
        tmp_num_calls = g.shape[0] * g.shape[1]
        tmp = np.count_nonzero(genoflt_FMTDP)
        print('{} genotype calls of {} ({:02.2f}%) fail FMT:DP filter'.format(
            tmp, tmp_num_calls, 100 * tmp / float(tmp_num_calls)))

    if filter_snp:
        variant_types = callset[ch]['variants/TYPE'][sl]
        flt_snp = np.all(np.logical_or(variant_types == 'snp',
                                       variant_types == ''),
                         axis=1)
        flt = flt & flt_snp
        print('=', np.count_nonzero(flt), 'passing previous filters & SNP')

    if filter_biallelic:
        ac = g.count_alleles()
        flt_biallelic = ac.allelism() == 2
        flt = flt & flt_biallelic
        print('=', np.count_nonzero(flt),
              'passing previous filters & biallelic')

    # filter max_missing (genotype calls)
    if max_missing_proportion is not None:
        max_missing = int(np.floor(g.shape[1] * max_missing_proportion))
        flt_max_missing = g.is_missing().sum(axis=1) <= max_missing
        tmp = np.count_nonzero(flt_max_missing)
        print("max missing proportion {} of {} is {}".format(
            max_missing_proportion, g.shape[1], max_missing))
        print("max missing passing loci = {} ({:2.2f}%)".format(
            tmp, 100 * tmp / flt_max_missing.shape[0]))
        flt = flt & flt_max_missing
        print('=', np.count_nonzero(flt),
              'passing previous filters & max_missing')

    if group_max_missing_proportion is not None and group_col is not None:
        gmmflt = np.ones(g.shape[0], dtype=bool)
        for grp in meta[group_col].unique():
            grp_meta = meta[meta[group_col] == grp]
            max_missing = int(
                np.floor(grp_meta.shape[0] * group_max_missing_proportion))
            print("### Group max missing filter:", grp)
            print(grp_meta['idx'])
            print("N =", grp_meta.shape[0])
            print("max missing =", max_missing)
            print("loci in =", flt.shape[0])
            f = g[:, grp_meta['idx']].is_missing().sum(axis=1) <= max_missing
            tmp = np.count_nonzero(f)
            print("passing loci = {} ({:2.2f}%)".format(
                tmp, 100 * tmp / f.shape[0]))
            gmmflt = gmmflt & f
        # summary over all groups (the original referenced the undefined
        # name `mmflt` here, which raised NameError)
        tmp = np.count_nonzero(gmmflt)
        print("passing all max missing filters {:d} of {:d} ({:.2f}%)".format(
            tmp, gmmflt.shape[0], 100 * tmp / gmmflt.shape[0]))
        flt = flt & gmmflt
        print('=', np.count_nonzero(flt),
              'passing previous filters & max_missing')

    # apply combined filter
    tmp = np.count_nonzero(flt)
    print("Passing all all filters {:d} of {:d} ({:.2f}%)".format(
        tmp, flt.shape[0], 100 * tmp / flt.shape[0]))

    return g.compress(flt, axis=0), pos.compress(flt, axis=0), meta
x = allele_depth.compute() n_disc = x[np.arange(0, x.shape[0], dtype=int), index] ad = x.sum(axis=1) return ad - n_disc, ad chrom = snakemake.wildcards.chrom phase2_callset_pass = zarr.open_group(snakemake.input.phase2_callset, mode="r") pass_pos = allel.SortedIndex(phase2_callset_pass[chrom]["variants/POS"]) x_callset = h5py.File(snakemake.input.cross_callset, mode="r") xdf = pd.read_table(snakemake.input.metadata, index_col=0) x_pos = allel.SortedIndex(x_callset[chrom]['variants/POS']) x_gt = allel.GenotypeDaskArray(x_callset[chrom]['calldata/genotype']) x_ad = x_callset[chrom]['calldata/AD'] x_ad = da.from_array(x_ad, chunks=x_ad.chunks) call_class = ("HOMREF", "HET", "HOMALT") columns = pd.MultiIndex.from_product( (("ALL", "PASS"), call_class, ("SUCCESS", "N"))) # take sample names from the hdf5 file sample_list = x_callset[chrom]["samples"][:].astype("U8").tolist() # Drop samples that were not included in sequencing. xdf = xdf.set_index("ox_code").reindex(sample_list).reset_index() xdf = xdf.loc[xdf.cross.notna()] xids = xdf.cross.unique()
#Meta data for the sample present in the zarr data structure - Kasper has removed some of the samples. samples_list = list(callset['chr1/samples'][:]) meta_data_samples = meta_data.loc[meta_data.PGDP_ID.isin(samples_list)].copy() samples_callset_index = [ samples_list.index(s) for s in meta_data_samples.PGDP_ID ] meta_data_samples['callset_index'] = samples_callset_index def het_counting(gt): return gt.count_het() gt_zarr = callset["{}/calldata/GT".format(chrom)] pos = callset["{}/variants/POS".format(chrom)] gt = allel.GenotypeDaskArray(gt_zarr) df_list = [] for i, row in meta_data_samples.iterrows(): df = pd.DataFrame() individual = (gt.take([row.callset_index], axis=1)) nnz, windows, counts = allel.windowed_statistic(pos, individual, statistic=het_counting, size=window_size) df["het"] = nnz if i % 10 == 0: print(i) window_numbering = [] df.insert(0, column="chr", value=chrom) window_numbering.extend(range(len(nnz))) df.insert(1, column="window", value=window_numbering)
def main(args=None): if args is None: args = sys.argv[1:] # the ascii help image help_image = "█▀▀█ ░▀░ █░█ █░░█\n" "█░░█ ▀█▀ ▄▀▄ █▄▄█\n" "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n" help_text = 'pixy: sensible estimates of pi and dxy from a VCF' version_text = 'version 0.95.0' # initialize arguments parser = argparse.ArgumentParser( description=help_image + help_text + '\n' + version_text, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('--version', action='version', version=version_text) parser.add_argument( '--stats', nargs='+', choices=['pi', 'dxy', 'fst'], help= 'Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)', required=True) parser.add_argument('--vcf', type=str, nargs='?', help='Path to the input VCF', required=True) parser.add_argument('--zarr_path', type=str, nargs='?', help='Folder in which to build the Zarr array(s)', required=True) parser.add_argument( '--reuse_zarr', choices=['yes', 'no'], default='no', help='Use existing Zarr array(s) (saves time if re-running)') parser.add_argument('--populations', type=str, nargs='?', help='Path to the populations file', required=True) parser.add_argument( '--window_size', type=int, nargs='?', help='Window size in base pairs over which to calculate pi/dxy') parser.add_argument( '--chromosomes', type=str, nargs='?', default='all', help= 'A single-quoted, comma separated list of chromosome(s) (e.g. \'X,1,2\')', required=False) parser.add_argument( '--interval_start', type=str, nargs='?', help= 'The start of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.' ) parser.add_argument( '--interval_end', type=str, nargs='?', help= 'The end of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.' ) parser.add_argument( '--variant_filter_expression', type=str, nargs='?', help= 'A single-quoted, comma separated list of genotype filters (e.g. 
\'DP>=10,GQ>=20\') to apply to SNPs', required=False) parser.add_argument( '--invariant_filter_expression', type=str, nargs='?', help= 'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,RGQ>=20\') to apply to invariant sites', required=False) parser.add_argument( '--outfile_prefix', type=str, nargs='?', default='./pixy_output', help='Path and prefix for the output file, e.g. path/to/outfile') parser.add_argument( '--bypass_filtration', choices=['yes', 'no'], default='no', help= 'Bypass all variant filtration (for data lacking FORMAT fields, use with caution)' ) parser.add_argument( '--bypass_invariant_check', choices=['yes', 'no'], default='no', help= 'Allow computation of stats without invariant sites, will result in wildly incorrect estimates most of the time. Use with extreme caution.' ) parser.add_argument( '--fst_maf_filter', default=0.05, type=float, nargs='?', help= 'Minor allele frequency filter for FST calculations, with value 0.0-1.0 (default 0.05).' ) # ag1000g test data # args = parser.parse_args('--stats fst --vcf data/vcf/multi_chr.vcf.gz --zarr_path data/vcf/multi --window_size 10000 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --outfile_prefix output/pixy_out'.split()) # filter test data # args = parser.parse_args('--stats pi --vcf data/vcf/filter_test.vcf.gz --zarr_path data/vcf/filter_test --window_size 3 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --fst_maf_filter 0.05 --outfile_prefix output/pixy_out'.split()) # catch arguments from the command line args = parser.parse_args() # CHECK FOR TABIX # (disabled until we implement site level and BED support) #tabix_path = shutil.which("tabix") #if tabix_path is None: # warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. 
install tabix with "conda install -c bioconda tabix"') #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None: # raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"') # VALIDATE ARGUMENTS print("[pixy] pixy " + version_text) print( "[pixy] Validating VCF and input parameters (this may take some time)..." ) # expand all file paths args.vcf = os.path.expanduser(args.vcf) args.zarr_path = os.path.expanduser(args.zarr_path) args.populations = os.path.expanduser(args.populations) args.outfile_prefix = os.path.expanduser(args.outfile_prefix) # CHECK FOR EXISTANCE OF VCF AND POPFILES if os.path.exists(args.vcf) is not True: raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) + ' does not exist') if os.path.exists(args.populations) is not True: raise Exception('[pixy] ERROR: The specified populations file ' + str(args.populations) + ' does not exist') # VALIDATE FILTER EXPRESSIONS # get vcf header info vcf_headers = allel.read_vcf_headers(args.vcf) # skip invariant check if only asking for FST if len(args.stats) == 1 and (args.stats[0] == 'fst'): args.bypass_invariant_check = "yes" # if we are bypassing the invariant check, spoof in a invariant filter if args.bypass_invariant_check == "yes": args.invariant_filter_expression = "DP>=0" if args.bypass_filtration == 'no' and ( args.variant_filter_expression is None or args.invariant_filter_expression is None): raise Exception( '[pixy] ERROR: One or more filter expression is missing. 
Provide two filter expressions, or set --bypass_filtration to \'yes\'' ) if args.bypass_filtration == 'no': # get the list of format fields and requested filter fields format_fields = vcf_headers.formats.keys() filter_fields = list() for x in args.variant_filter_expression.split(","): filter_fields.append(re.sub("[^A-Za-z]+", "", x)) for x in args.invariant_filter_expression.split(","): filter_fields.append(re.sub("[^A-Za-z]+", "", x)) missing = list(set(filter_fields) - set(format_fields)) if len(missing) > 0: raise Exception( '[pixy] ERROR: the following genotype filters were requested but not occur in the VCF: ', missing) else: print( "[pixy] WARNING: --bypass_filtration is set to \'yes\', genotype filtration will be not be performed." ) # VALIDATE THE VCF # check if the vcf is zipped if re.search(".gz", args.vcf): cat_prog = "gunzip -c " else: cat_prog = "cat " # check if the vcf contains any invariant sites # a very basic check: just looks for at least one invariant site in the alt field if args.bypass_invariant_check == 'no': alt_list = subprocess.check_output( cat_prog + args.vcf + " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq", shell=True).decode("utf-8").split() if "." not in alt_list: raise Exception( '[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.' ) else: if not (len(args.stats) == 1 and (args.stats[0] == 'fst')): print( "[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\', which assumes that your VCF contains invariant sites. Lack of invariant sites will result in incorrect estimates." 
) # check if requested chromosomes exist in vcf # defaults to all the chromosomes contained in the VCF (first data column) if args.chromosomes == 'all': chrom_list = subprocess.check_output( cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split() chrom_all = chrom_list if args.chromosomes == 'all': chrom_list = subprocess.check_output( cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split() chrom_all = chrom_list else: chrom_list = list(args.chromosomes.split(",")) chrom_all = subprocess.check_output( cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split() missing = list(set(chrom_list) - set(chrom_all)) if len(missing) > 0: raise Exception( '[pixy] ERROR: the following chromosomes were requested but not occur in the VCF: ', missing) # INTERVALS # check if intervals are correctly specified if args.interval_start is not None and args.interval_end is None: raise Exception( '[pixy] ERROR: Both --interval_start and --interval_end must be specified' ) if args.interval_start is None and args.interval_end is not None: raise Exception( '[pixy] ERROR: Both --interval_start and --interval_end must be specified' ) if args.interval_start is not None and args.interval_end is not None and len( chrom_list) > 1: raise Exception( '[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.' 
) # SAMPLES # check if requested samples exist in vcf # - parse + validate the population file # - format is IND POP (tab separated) # - throws an error if individuals are missing from VCF # read in the list of samples/populations poppanel = pandas.read_csv(args.populations, sep='\t', usecols=[0, 1], names=['ID', 'Population']) poppanel.head() # get a list of samples from the callset samples_list = vcf_headers.samples # make sure every indiv in the pop file is in the VCF callset IDs = list(poppanel['ID']) missing = list(set(IDs) - set(samples_list)) # find the samples in the callset index by matching up the order of samples between the population file and the callset # also check if there are invalid samples in the popfile try: samples_callset_index = [samples_list.index(s) for s in poppanel['ID']] except ValueError as e: raise Exception( '[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ', missing) from e else: poppanel['callset_index'] = samples_callset_index # use the popindices dictionary to keep track of the indices for each population popindices = {} popnames = poppanel.Population.unique() for name in popnames: popindices[name] = poppanel[poppanel.Population == name].callset_index.values print("[pixy] Preparing for calculation of summary statistics: " + ','.join(map(str, args.stats))) print("[pixy] Data set contains " + str(len(popnames)) + " population(s), " + str(len(chrom_list)) + " chromosome(s), and " + str(len(IDs)) + " sample(s)") # initialize and remove any previous output files if os.path.exists(re.sub(r"[^\/]+$", "", args.outfile_prefix)) is not True: os.mkdir(re.sub(r"[^\/]+$", "", args.outfile_prefix)) # initialize the output files for writing if 'pi' in args.stats: pi_file = str(args.outfile_prefix) + "_pi.txt" if os.path.exists(pi_file): os.remove(pi_file) outfile = open(pi_file, 'a') outfile.write("pop" + "\t" + "chromosome" + "\t" + "window_pos_1" + "\t" + "window_pos_2" + "\t" + "avg_pi" + "\t" + 
"no_sites" + "\t" + "count_diffs" + "\t" + "count_comparisons" + "\t" + "count_missing" + "\n") outfile.close() if 'dxy' in args.stats: dxy_file = str(args.outfile_prefix) + "_dxy.txt" if os.path.exists(dxy_file): os.remove(dxy_file) outfile = open(dxy_file, 'a') outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" + "window_pos_1" + "\t" + "window_pos_2" + "\t" + "avg_dxy" + "\t" + "no_sites" + "\t" + "count_diffs" + "\t" + "count_comparisons" + "\t" + "count_missing" + "\n") outfile.close() if 'fst' in args.stats: fst_file = str(args.outfile_prefix) + "_fst.txt" if os.path.exists(fst_file): os.remove(fst_file) outfile = open(fst_file, 'a') outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" + "window_pos_1" + "\t" + "window_pos_2" + "\t" + "avg_wc_fst" + "\t" + "no_snps" + "\n") outfile.close() # initialize the folder structure for the zarr array if os.path.exists(args.zarr_path) is not True: pathlib.Path(args.zarr_path).mkdir(parents=True, exist_ok=True) # main loop for computing summary stats # time the calculations start_time = time.time() print("[pixy] Started calculations at " + time.strftime("%H:%M:%S", time.localtime(start_time))) for chromosome in chrom_list: # Zarr array conversion # the chromosome specific zarr path zarr_path = args.zarr_path + "/" + chromosome # determine the fields that will be included # TBD: just reading all fields currently # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)] # build region string (if using an interval) if args.interval_start is not None: targ_region = chromosome + ":" + str( args.interval_start) + "-" + str(args.interval_end) else: targ_region = chromosome # allow for resuse of previously calculated zarr arrays if args.reuse_zarr == 'yes' and os.path.exists(zarr_path): print( "[pixy] If a zarr array exists, it will be reused for chromosome " + chromosome + "...") elif args.reuse_zarr == 'no' or os.path.exists(zarr_path) is not True: 
print("[pixy] Building zarr array for chromosome " + chromosome + "...") warnings.filterwarnings("ignore") allel.vcf_to_zarr(args.vcf, zarr_path, region=targ_region, fields='*', overwrite=True) warnings.resetwarnings() print("[pixy] Calculating statistics for chromosome " + targ_region + "...") # open the zarr callset = zarr.open_group(zarr_path, mode='r') # parse the filtration expression and build the boolean filter array # define an operator dictionary for parsing the operator strings ops = { "<": operator.lt, "<=": operator.le, ">": operator.gt, ">=": operator.ge, "==": operator.eq } # determine the complete list of available calldata fields usable for filtration calldata_fields = sorted(callset['/calldata/'].array_keys()) # check if bypassing filtration, otherwise filter if args.bypass_filtration == 'no': # VARIANT SITE FILTERS var_filters = [] # iterate over each requested variant filter for x in args.variant_filter_expression.split(","): stat = re.sub("[^A-Za-z]+", "", x) value = int(re.sub("[^0-9]+", "", x)) compare = re.sub("[A-Za-z0-9]+", "", x) # check if the requested filter/format exists in the VCF try: stat_index = calldata_fields.index(stat) except ValueError as e: raise Exception( "[pixy] ERROR: The requested filter \'" + stat + "\' is not annotated in the input VCF FORMAT field" ) from e else: if type(var_filters) is list: var_filters = ops[compare](callset['/calldata/' + stat][:], value) elif type(var_filters) is not list: var_filters = np.logical_and( var_filters, ops[compare](callset['/calldata/' + stat][:], value)) # create a mask for variants only # is snp is a site level (1d) array # np.tile below creates a column of "is_snp" once for each sample # (i.e. 
makes it the same dimensions as the genotype table) is_snp = np.array([callset['/variants/is_snp'][:].flatten() ]).transpose() snp_mask = np.tile(is_snp, (1, var_filters.shape[1])) # force only variant sites (snps, remember we ignore indels) to be included in the filter var_filters = np.logical_and(var_filters, snp_mask) # INVARIANT SITE FILTERS invar_filters = [] for x in args.invariant_filter_expression.split(","): stat = re.sub("[^A-Za-z]+", "", x) value = int(re.sub("[^0-9]+", "", x)) compare = re.sub("[A-Za-z0-9]+", "", x) # check if the requested filter/format exists in the VCF try: stat_index = calldata_fields.index(stat) except ValueError as e: raise Exception( "[pixy] ERROR: The requested filter \'" + stat + "\' is not annotated in the input VCF") from e else: if type(invar_filters) is list: invar_filters = ops[compare](callset['/calldata/' + stat][:], value) elif type(var_filters) is not list: invar_filters = np.logical_and( invar_filters, ops[compare](callset['/calldata/' + stat][:], value)) # create a mask for invariant sites by inverting the snp filter # join that to the invariant sites filter invar_filters = np.logical_and(invar_filters, np.invert(snp_mask)) # join the variant and invariant filter masks (logical OR) filters = np.logical_or(invar_filters, var_filters) # applying the filter to the data # all the filters are in a boolean array ('filters' above) # first, recode the gt matrix as a Dask array (saves memory) -> packed # create a packed genotype array # this is a array with dims snps x samples # genotypes are represented by single byte codes # critically, as the same dims as the filters array below gt_array = allel.GenotypeArray( allel.GenotypeDaskArray(callset['/calldata/GT'])).to_packed() # apply filters # only if not bypassing filtration if args.bypass_filtration == 'no': # set all genotypes that fail filters (the inversion of the array) # to 'missing', 239 = -1 (i.e. 
def tallyRegion(gt_region):
    """Tally within-population differences (pi) across every site of a region.

    Each element yielded by iterating ``gt_region`` is flattened to a 1-D
    vector of allele calls and handed to ``compareGTs``, which returns the
    per-site ``(differences, comparisons, missing)`` counts. The per-site
    counts are summed over the whole region.

    Parameters
    ----------
    gt_region : iterable of array-like
        One entry per site; each entry must provide ``.flatten()``.
        Presumably a genotype array of shape (sites, samples, ploidy) that
        has already been subset to one population -- confirm against caller.

    Returns
    -------
    tuple
        ``(avg_pi, total_diffs, total_comps, total_missing)``. ``avg_pi`` is
        ``total_diffs / total_comps``; when no comparisons were possible
        (empty region, or nothing usable at any site) it is reported as the
        integer 0 rather than as a missing value.
    """
    # one (diffs, comps, missing) triple per site, in site order
    per_site = [compareGTs(site.flatten()) for site in gt_region]

    # column-wise sums of the per-site triples
    diffs_sum = sum(counts[0] for counts in per_site)
    comps_sum = sum(counts[1] for counts in per_site)
    missing_sum = sum(counts[2] for counts in per_site)

    # guard against division by zero when no comparisons were possible
    avg_pi = diffs_sum / comps_sum if comps_sum > 0 else 0

    return (avg_pi, diffs_sum, comps_sum, missing_sum)
def dxyTallyRegion(pop1_gt_region, pop2_gt_region):
    """Tally between-population differences (dxy) across every site of a region.

    Sites are paired by index: site ``i`` of ``pop1_gt_region`` is compared
    with site ``i`` of ``pop2_gt_region``. Each site is flattened to a 1-D
    vector of allele calls and the pair is handed to ``dxyCompareGTs``; the
    per-site counts are summed over the region.

    Parameters
    ----------
    pop1_gt_region, pop2_gt_region : indexable sequences of array-like
        One entry per site, each providing ``.flatten()``. The number of
        sites is taken from ``pop1_gt_region``; the two regions are expected
        to cover the same sites (the visible caller slices both from the
        same window). Presumably shaped (sites, samples, ploidy) -- confirm.

    Returns
    -------
    tuple
        ``(avg_dxy, total_diffs, total_comps, total_missing)``. ``avg_dxy``
        is ``total_diffs / total_comps``, or the integer 0 when no
        comparisons were possible.
    """
    total_diffs = 0
    total_comps = 0
    total_missing = 0

    # index-based pairing is kept deliberately: the site count is driven by
    # population 1, exactly as callers have always experienced it
    for site_idx in range(len(pop1_gt_region)):
        vec1 = pop1_gt_region[site_idx].flatten()
        vec2 = pop2_gt_region[site_idx].flatten()

        site_diffs, site_comps, site_missing = dxyCompareGTs(vec1, vec2)
        total_diffs += site_diffs
        total_comps += site_comps
        total_missing += site_missing

    # guard against division by zero when no comparisons were possible
    avg_dxy = total_diffs / total_comps if total_comps > 0 else 0

    return (avg_dxy, total_diffs, total_comps, total_missing)


def compareGTs(vec):
    """Count pairwise allele differences within a single site (used for pi).

    Parameters
    ----------
    vec : sized iterable of allele calls
        Only calls equal to 0 or 1 are used; anything else is ignored and
        counted as missing.

    Returns
    -------
    tuple of int
        ``(diffs, comps, missing)`` where ``diffs`` is the number of 0/1
        allele pairs that differ, ``comps`` is the number of distinct pairs
        among the usable calls, and ``missing`` is the number of calls that
        are neither 0 nor 1.
    """
    allele_counts = Counter(vec)
    n_ref = allele_counts[0]
    n_alt = allele_counts[1]
    n_called = n_ref + n_alt

    # anything that's not 1 or 0 is ignored and counted as missing
    missing = len(vec) - n_called

    # every (0, 1) pairing is one difference
    diffs = n_ref * n_alt

    # "n_called choose 2" in exact integer arithmetic; avoids the float
    # round-trip (and truncation) of a floating-point binomial
    comps = n_called * (n_called - 1) // 2

    return (diffs, comps, missing)


def dxyCompareGTs(vec1, vec2):
    """Count allele differences between two populations at one site (for dxy).

    Parameters
    ----------
    vec1, vec2 : sized iterables of allele calls
        Calls for population 1 and population 2 at the same site. Only calls
        equal to 0 or 1 are used; anything else is ignored and counted as
        missing.

    Returns
    -------
    tuple of int
        ``(diffs, comps, missing)`` where ``diffs`` is the number of
        between-population allele pairs that differ, ``comps`` is the number
        of between-population pairs of usable calls, and ``missing`` is the
        number of unusable calls summed over both populations.
    """
    counts1 = Counter(vec1)
    counts2 = Counter(vec2)

    pop1_zeros, pop1_ones = counts1[0], counts1[1]
    pop2_zeros, pop2_ones = counts2[0], counts2[1]

    called1 = pop1_zeros + pop1_ones
    called2 = pop2_zeros + pop2_ones

    # anything that's not 1 or 0 is ignored and counted as missing
    missing = (len(vec1) + len(vec2)) - (called1 + called2)

    # a difference is a 0 in one population paired with a 1 in the other
    diffs = (pop1_zeros * pop2_ones) + (pop1_ones * pop2_zeros)

    # every usable call in pop 1 is compared with every usable call in pop 2
    comps = called1 * called2

    return (diffs, comps, missing)
end (" + str(interval_end) + ")") from e # catch misspecified intervals # TBD: harmonize this with the new interval method for the zarr array if (interval_end > max(pos_array)): print( "[pixy] WARNING: The specified interval end (" + str(interval_end) + ") exceeds the last position of the chromosome and has been substituted with " + str(max(pos_array))) interval_end = max(pos_array) if (interval_start < min(pos_array)): print( "[pixy] WARNING: The specified interval start (" + str(interval_start) + ") begins before the first position of the chromosome and has been substituted with " + str(min(pos_array))) interval_start = min(pos_array) if ((interval_end - interval_start + 1) < window_size): print( "[pixy] WARNING: The requested interval or total number of sites in the VCF (" + str(interval_start) + "-" + str(interval_end) + ") is smaller than the requested window size (" + str(window_size) + ")") # PI: # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS # Compute pi over a chosen interval and window size if (args.populations is not None) and ('pi' in args.stats): # open the pi output file for writing outfile = open(pi_file, 'a') for pop in popnames: # window size: window_size = args.window_size # initialize window_pos_2 window_pos_2 = (interval_start + window_size) - 1 # loop over populations and windows, compute stats and write to file for window_pos_1 in range(interval_start, interval_end, window_size): # if the window has no sites, assign all NAs, # otherwise calculate pi if len(pos_array[(pos_array > window_pos_1) & (pos_array < window_pos_2)]) == 0: avg_pi, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0 else: # pull out the genotypes for the window loc_region = pos_array.locate_range( window_pos_1, window_pos_2) gt_region1 = gt_array[loc_region] no_sites = len(gt_region1) # subset the window for the individuals in each population gt_pop = gt_region1.take(popindices[pop], axis=1) avg_pi, total_diffs, total_comps, total_missing = 
tallyRegion( gt_pop) outfile.write( str(pop) + "\t" + str(chromosome) + "\t" + str(window_pos_1) + "\t" + str(window_pos_2) + "\t" + str(avg_pi) + "\t" + str(no_sites) + "\t" + str(total_diffs) + "\t" + str(total_comps) + "\t" + str(total_missing) + "\n") window_pos_2 += window_size if window_pos_2 > interval_end: window_pos_2 = interval_end # close output file and print complete message outfile.close() print("[pixy] Pi calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_pi.txt") # DXY: # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS if (args.populations is not None) and ('dxy' in args.stats): # create a list of all pairwise comparisons between populations in the popfile dxy_pop_list = list(combinations(popnames, 2)) # open the dxy output file for writing outfile = open(dxy_file, 'a') # interate over all population pairs and compute dxy for pop_pair in dxy_pop_list: pop1 = pop_pair[0] pop2 = pop_pair[1] # window size: window_size = args.window_size # initialize window_pos_2 window_pos_2 = (interval_start + window_size) - 1 # perform the dxy calculation for all windows in the range for window_pos_1 in range(interval_start, interval_end, window_size): if len(pos_array[(pos_array > window_pos_1) & (pos_array < window_pos_2)]) == 0: avg_dxy, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0 else: loc_region = pos_array.locate_range( window_pos_1, window_pos_2) gt_region1 = gt_array[loc_region] no_sites = len(gt_region1) # use the popGTs dictionary to keep track of this region's GTs for each population popGTs = {} for name in pop_pair: gt_pop = gt_region1.take(popindices[name], axis=1) popGTs[name] = gt_pop pop1_gt_region1 = popGTs[pop1] pop2_gt_region1 = popGTs[pop2] avg_dxy, total_diffs, total_comps, total_missing = dxyTallyRegion( pop1_gt_region1, pop2_gt_region1) outfile.write( str(pop1) + "\t" + str(pop2) + "\t" + str(chromosome) + "\t" + str(window_pos_1) + "\t" + str(window_pos_2) + 
"\t" + str(avg_dxy) + "\t" + str(no_sites) + "\t" + str(total_diffs) + "\t" + str(total_comps) + "\t" + str(total_missing) + "\n") window_pos_2 += window_size if window_pos_2 > interval_end: window_pos_2 = interval_end outfile.close() print("[pixy] Dxy calculations chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_dxy.txt") # FST: # WEIR AND COCKERHAMS FST # This is just a plain wrapper for the scikit-allel fst function if (args.populations is not None) and ('fst' in args.stats): # open the fst output file for writing outfile = open(fst_file, 'a') # determine all the possible population pairings pop_names = list(popindices.keys()) fst_pop_list = list(combinations(pop_names, 2)) #calculate maf allele_counts = gt_array.count_alleles() allele_freqs = allele_counts.to_frequencies() maf_array = allele_freqs[:, 1] > args.fst_maf_filter # apply the maf filter to the genotype array] gt_array_fst = gt_array[maf_array] gt_array_fst = allel.GenotypeArray(gt_array_fst) # apply the maf filter to the position array pos_array_fst = pos_array[maf_array] # for each pair, compute fst for pop_pair in fst_pop_list: # the indices for the individuals in each population fst_pop_indicies = [ popindices[pop_pair[0]].tolist(), popindices[pop_pair[1]].tolist() ] # compute FST # windowed_weir_cockerham_fst seems to generate (spurious?) 
warnings about div/0, so suppressing warnings # (this assumes that the scikit-allel function is working as intended) np.seterr(divide='ignore', invalid='ignore') a, b, c = allel.windowed_weir_cockerham_fst( pos_array_fst, gt_array_fst, subpops=fst_pop_indicies, size=args.window_size, start=interval_start, stop=interval_end) for fst, wind, snps in zip(a, b, c): outfile.write( str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" + str(chromosome) + "\t" + str(wind[0]) + "\t" + str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) + "\n") outfile.close() print("[pixy] Fst calculations chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_fst.txt") print("\n[pixy] All calculations complete at " + time.strftime("%H:%M:%S", time.localtime(start_time))) end_time = (time.time() - start_time) print("[pixy] Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(end_time)))