def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a variant map file and two genotype text matrices.

    The map file is read as a headerless table with the columns
    ID, CHROM, GDist, POS, REF, ALT. It is parsed as tab-delimited first;
    if that does not yield six columns it is re-read as space-delimited.
    If building the POS-indexed variant table fails with ``ValueError``,
    the table is sorted by (CHROM, POS) and the build is retried.

    Args:
        mapfn: Path to the map file.
        pop_a_fn: Path to the first genotype matrix (read with
            ``np.loadtxt`` as int8).
        pop_b_fn: Path to the second genotype matrix.

    Returns:
        Tuple ``(geno1, geno2, pos, gdist)``: two
        ``allel.GenotypeChunkedArray`` objects, an ``allel.SortedIndex`` of
        the POS column and the values of the GDist column.

    Raises:
        AssertionError: If any position is NaN.
    """
    columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]

    tbl = pd.read_csv(mapfn, sep="\t", header=None, engine="c")
    try:
        # A wrong delimiter leaves a single column, so this assignment fails.
        tbl.columns = columns
    except ValueError:
        logger.info("File not tab delimited as expected- trying with spaces")
        tbl = pd.read_csv(mapfn, sep=" ", header=None, engine="c",
                          names=columns)

    try:
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")
    except ValueError:
        logger.warning(
            "Possible SNPs file is not sorted. Attempting to sort. This is likely to be inefficient"
        )
        tbl = tbl.sort_values(["CHROM", "POS"])
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt
    # returns a 1-D array and shape[0] would be the column count.
    # NOTE(review): assumes one row per variant and two consecutive columns
    # per sample - confirm against the file format.
    d1 = np.loadtxt(pop_a_fn, dtype="int8", ndmin=2)
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))
    d2 = np.loadtxt(pop_b_fn, dtype="int8", ndmin=2)
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    pos = allel.SortedIndex(vartbl.POS[:])
    assert np.isnan(pos).sum() == 0, "nans values are not supported"
    # Return the index that was just validated instead of rebuilding it.
    return geno1, geno2, pos, vartbl.GDist[:]
def getVariants(vcfFileName):
    """Convert a VCF to HDF5 and index its SNPs by position.

    Returns:
        Tuple ``(callset, variants, variantSNPs)``: the open ``h5py.File``,
        the POS-indexed ``allel.VariantChunkedTable`` built from its
        ``variants`` group, and a dict mapping each position to the string
        ``"<REF>-><first ALT>"``.
    """
    # Need to convert to H5 format to use VariantChunkedTable
    convertVCFToH5(vcfFileName)

    # The HDF5 name is everything before the first '.' plus '.h5'.
    # NOTE(review): this truncates paths containing other dots; it must stay
    # in step with whatever convertVCFToH5 writes - confirm.
    h5FileName = vcfFileName.split('.')[0] + '.h5'
    callset = h5py.File(h5FileName, mode='r')

    variants = allel.VariantChunkedTable(callset['variants'], index='POS')
    positions = variants['POS']
    ref_alleles = variants['REF']
    first_alts = variants['ALT'][:, 0]

    # make a new format of variants: pos:A->T etc.
    variantSNPs = {}
    for row, (ref, alt) in enumerate(zip(ref_alleles, first_alts)):
        variantSNPs[positions[row]] = ref + '->' + alt

    return callset, variants, variantSNPs
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a space-delimited variant map and two genotype text matrices.

    Args:
        mapfn: Path to the map file; read with the column names
            ID, CHROM, GDist, POS, REF, ALT.
        pop_a_fn: Path to the first genotype matrix (read with
            ``np.loadtxt`` as int8).
        pop_b_fn: Path to the second genotype matrix.

    Returns:
        Tuple ``(geno1, geno2, pos, gdist)``: two
        ``allel.GenotypeChunkedArray`` objects, an ``allel.SortedIndex`` of
        the POS column and the values of the GDist column.
    """
    tbl = pd.read_csv(mapfn, sep=" ",
                      names=["ID", "CHROM", "GDist", "POS", "REF", "ALT"])
    vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt
    # returns a 1-D array and shape[0] would be the column count.
    # NOTE(review): assumes one row per variant and two consecutive columns
    # per sample - confirm against the file format.
    d1 = np.loadtxt(pop_a_fn, dtype="int8", ndmin=2)
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))
    d2 = np.loadtxt(pop_b_fn, dtype="int8", ndmin=2)
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    return geno1, geno2, allel.SortedIndex(vartbl.POS[:]), vartbl.GDist[:]
def test_ld(self):
    '''Unit test for ldshrink.

    Runs ``lddask.ld.ldshrink`` on the first four variants of the reference
    genotype file and compares the result with the top-left 4x4 corner of
    the stored reference LD matrix.
    '''
    input_hdf = "/home/nwknoblauch/Dropbox/Repos/LD_dask/test_data/reference_genotype.h5"
    test_R_file = "test_data/reference_ld.txt"
    m = 85
    Ne = 11490.672741
    cutoff = 0.001

    # Materialise everything needed while the file is open so the handle is
    # released even when the assertion below fails.
    with h5.File(input_hdf, mode='r') as callset:
        ref_geno = allel.GenotypeDaskArray(callset['calldata/GT'])
        vt = allel.VariantChunkedTable(callset['variants'])
        sub_map = vt['MAP'][:4]
        geno_ac = ref_geno.to_n_alt().T.compute()

    sub_X = geno_ac[:, :4]
    est_r = lddask.ld.ldshrink(sub_X, sub_map, m, Ne, cutoff)
    true_r = np.loadtxt(test_R_file, delimiter="\t")
    assert (np.allclose(true_r[:4, :4], est_r))
def process_SNP_data(selected_chromosome):
    """Write a windowed SNP-density track into row 1 of the score array.

    Reads the SNP positions of ``selected_chromosome`` from the Ag1000G
    phase 2 HDF5 file, marks them in a 0/1 array as long as row 0 of
    ``root['<chrom>/score']``, and stores the centred rolling fraction of
    marked positions (window ``WINDOW_SIZE``, incomplete windows set to 0)
    in row 1 of the same array.

    Relies on the module-level names ``root`` and ``WINDOW_SIZE``.

    Args:
        selected_chromosome: Chromosome name; used as the key in ``root``,
            in the HDF5 file name and as the group name inside that file.
    """
    score_key = '{}/score'.format(selected_chromosome)
    z = root[score_key][0, :]

    # Load Ag1000g variation data; the positions are copied into memory so
    # the file can be closed straight away.
    with h5py.File(
            'ag1000g/variation/ag1000g.phase2.ar1.pass.{}.h5'.format(
                selected_chromosome),
            mode='r') as data_ag1000g:
        variants = allel.VariantChunkedTable(
            data_ag1000g[selected_chromosome]['variants'],
            names=['POS'],
            index='POS')
        # SNP data is 1-based
        snp_positions = variants['POS'][:] - 1

    pos_array = np.zeros(len(z))
    pos_array[snp_positions] = 1

    # Built-in rolling sum instead of a Python lambda per window: same
    # result (sums of 0/1 values are exact) without per-window overhead.
    window_sum = pd.DataFrame(pos_array).rolling(WINDOW_SIZE,
                                                 center=True).sum()
    pos_roll = (window_sum / WINDOW_SIZE).fillna(0)
    root[score_key][1, :] = np.array(pos_roll)
def import_data(filepath, chrom_name, names=None):
    '''Open an HDF5 callset and return its variant table and genotypes.

    Args:
        filepath: Path to the HDF5 file.
        chrom_name: Name of the top-level group holding the ``variants``
            and ``calldata/genotype`` datasets.
        names: Variant columns to expose. Defaults to
            ``['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles']``.

    Returns:
        Tuple ``(variants, genotypes)``: an ``allel.VariantChunkedTable``
        indexed on POS and an ``allel.GenotypeChunkedArray``. Both are
        built on the open file, which is therefore left open on success.

    Raises:
        ValueError: If the two objects differ in their first dimension.
    '''
    ##to-do: check that h5py file is well-formed
    if names is None:
        # Built per call so callers can never mutate a shared default list.
        names = ['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles']

    callset = h5py.File(filepath, mode='r')
    variants = allel.VariantChunkedTable(callset[chrom_name]['variants'],
                                         names=names,
                                         index='POS')
    genotypes = allel.GenotypeChunkedArray(
        callset[chrom_name]['calldata']['genotype'])

    if genotypes.shape[0] != variants.shape[0]:
        # Nothing usable is returned on this path, so release the handle.
        callset.close()
        raise ValueError("Genotypes and variant table must contain the "
                         "same number of positions")
    return variants, genotypes
ax.set_title('Depth of Coverage per individual') ax.tick_params( axis='x', # changes apply to the x-axis which='both', # both major and minor ticks are affected bottom=False, # ticks along the bottom edge are off top=False, # ticks along the top edge are off labelbottom=True) ax.set_xticklabels(ids.id, rotation= 40, ha= 'right', fontsize= 8) ax.errorbar(np.arange(len(dpMu)), y= dpMu, yerr=dpSd, fmt= 'none', ecolor= 'grey') plt.tight_layout() plotDP(dpMu, dpSd, ids) ## create VariantChunkedTable object variants = al.VariantChunkedTable(subs['variants']) #, index= 'CHROM') ## count the number of variants per scaffold scafs, scaf_counts = np.unique(variants['CHROM'], return_counts= True) scafdf = pd.DataFrame({'scaffold': scafs, 'nVariables': scaf_counts}) ## barplot of number of variants per scaffold fig, ax = plt.subplots(figsize= (14,4)) ax.plot(np.arange(len(scafdf['nVariables'])), scafdf['nVariables']) ax.set_xlabel('scaffolds') ax.set_ylabel('count') ax.set_title('Number of variants per scaffold')
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: white background with ticks, notebook-sized fonts.
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py
import allel
print('scikit-allel', allel.__version__)

# Open the HDF5 callset read-only; the handle stays open for the script.
callset_fn = '9Moose_joint_Filter_B_NC_037355.1.h5'
callset = h5py.File(callset_fn, mode='r')
print(callset)

# Expose the listed columns of the 'variants' group as a POS-indexed table.
variants = allel.VariantChunkedTable(
    callset['variants'],
    names=['POS', 'REF', 'ALT', 'FILTER', 'DP', 'MQ', 'QD'],
    index='POS')
print(variants)

# Read the whole POS column into memory.
pos = variants['POS'][:]
print(pos)


# Windowed variant-density helper; only its window setup is visible in
# this excerpt.
def plot_windowed_variant_density(pos, window_size, title=None):
    # setup windows
    bins = np.arange(0, pos.max(), window_size)
    # use window midpoints as x coordinate
    x = (bins[1:] + bins[:-1]) / 2
# Plot styling: white background with ticks, notebook-sized fonts.
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py
import allel
print('scikit-allel', allel.__version__)

# Open the Ag1000G phase 1 HDF5 callset read-only; the handle stays open.
callset_fn = '/u/home/c/ckyriazi/kirk-bigdata/Ag1000G_data_hdf5/ag1000g.phase1.ar3.h5'
callset = h5py.File(callset_fn, mode='r')
# Bare expression: displays the object in a notebook, no effect in a script.
callset

# Expose the listed variant columns of one chromosome as a POS-indexed table.
chrom = '3L'
variants = allel.VariantChunkedTable(
    callset[chrom]['variants'],
    names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles'],
    index='POS')
print(variants)

# Read the whole POS column into memory.
pos = variants['POS'][:]
print(pos)


# Windowed variant-density helper; only its window setup is visible in
# this excerpt.
def plot_windowed_variant_density(pos, window_size, title=None):
    # setup windows
    bins = np.arange(0, pos.max(), window_size)
    # use window midpoints as x coordinate
    x = (bins[1:] + bins[:-1]) / 2
def get_haplos(pops,
               chrom,
               p1,
               p2,
               samples,
               inaccessible=False,
               geno=False,
               biallelic=False,
               zarrpath=None):
    """
    Returns a haplotype array or genotype array for the region and
    populations requested

    Args:
        pops: Population names; samples whose ``population`` is in this
            collection are kept.
        chrom: Chromosome name, used to build the default zarr paths.
        p1, p2: Inclusive start and end position of the region.
        samples: Sample metadata with a ``population`` column.
            NOTE(review): presumably one row per genotype column, in the
            same order - confirm.
        inaccessible: With the default paths, ``False`` selects the
            ``.pass`` dataset and anything else the full dataset. Ignored
            when ``zarrpath`` is given.
        geno: If ``True`` return the genotype array instead of haplotypes.
        biallelic: If ``True`` keep only sites flagged by
            ``is_biallelic_01`` in the haplotypes and positions.
        zarrpath: Directory containing ``calldata/GT`` and ``variants``.
            ``None`` (or ``False``) selects the built-in default paths.

    Returns:
        ``(pop_geno, pop_bool, sweep_region, positions)`` when ``geno`` is
        ``True``, otherwise ``(pop_haplo, pop_bool, sweep_region,
        positions)``.
    """
    print(
        '---------------------- retrieving haplotypes -----------------------')

    # The default for zarrpath is None, so treat None like the legacy False
    # sentinel; otherwise the default call would open "None/calldata/GT/".
    if zarrpath is None or zarrpath is False:
        if inaccessible is False:
            data_root = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
        else:
            data_root = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
    else:
        data_root = zarrpath

    # Open Zarrs, genotype and variant data
    Ag_array = zarr.open_array(f"{data_root}/calldata/GT/", mode='r')
    Ag_store = zarr.open_group(f"{data_root}/variants/", mode='r')

    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD'], index='POS')
    # Only POS is used below, so only that column is loaded into memory.
    all_pos = variants['POS'][:]
    positions = allel.SortedIndex(all_pos).intersect_range(p1, p2)

    # focus on haplotype region
    sweep_region = (all_pos >= p1) & (all_pos <= p2)
    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    print(
        f'------------------------------- {pops} ------------------------------------'
    )
    # Restrict genotypeArray to population and make HapArray
    pop_bool = samples.population.isin(pops)
    pop_geno = ag_geno.compress(pop_bool, axis=1)
    pop_haplo = pop_geno.to_haplotypes()
    print("HaplotypeArray constructed")

    if biallelic is True:
        ac = pop_geno.count_alleles()
        bi_al = ac.is_biallelic_01()
        pop_haplo = pop_haplo.compress(bi_al, axis=0)
        positions = positions[bi_al]

    if geno is True:
        # NOTE(review): pop_geno is not reduced by the biallelic filter while
        # positions is, so with geno=True and biallelic=True their lengths
        # can differ - confirm whether callers rely on this.
        return (pop_geno, pop_bool, sweep_region, positions)
    return (pop_haplo, pop_bool, sweep_region, positions)
def _haplotype_sequence(hap_calls, ref_alleles, alt_alleles):
    """Join one haplotype's allele calls into a sequence string.

    A call of 1 takes the first ALT, a call of 2 the second ALT, and every
    other call takes REF.
    """
    # astype(object) copies, and avoids truncating alleles to REF's width.
    bases = ref_alleles.astype(object)
    first_alt = hap_calls == 1
    bases[first_alt] = alt_alleles[first_alt, 0]
    second_alt = hap_calls == 2
    if second_alt.any():
        # Only touch the second ALT column when a call actually needs it.
        bases[second_alt] = alt_alleles[second_alt, 1]
    return "".join(bases)


def multiple_alignment(pops, chrom, p1, p2, samples, hap_only=False):
    """
    Returns a multiple sequence alignment FASTA for a region, given
    populations, chromosome and locations. Useful for constructing
    phylogenetic trees (in IQTREE, e.g)

    Currently not bi-allelic which may be incorrect

    Writes ``haplotypes/{chrom}/{chrom}_{p1}_{p2}.fasta`` and
    ``haplotypes/{chrom}/{chrom}_{p1}_{p2}.metadata``; the directory is not
    created here.

    Args:
        pops: Iterable of population names, processed in order.
        chrom: Chromosome name, used in the zarr and output paths.
        p1, p2: Inclusive start and end position of the region.
        samples: Sample metadata with the columns ``ox_code``,
            ``population``, ``country``, ``region`` and ``m_s``. It is not
            modified. NOTE(review): presumably one row per genotype column,
            in the same order - confirm.
        hap_only: Currently unused.

    Returns:
        Tuple ``(multi_fastas, all_haps)``: a DataFrame with one ``hap``
        label and one ``seq`` per haplotype, and the same table joined
        column-wise to the sample metadata repeated once per haplotype.
    """
    print(
        '---------------------- multiple sequence alignment -----------------------'
    )

    # Open Zarrs, genotype and variant data
    Ag_array = zarr.open_array(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/",
        mode='r')
    Ag_store = zarr.open_group(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/",
        mode='r')
    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD'], index='POS')

    # focus on haplotype region (POS is read once and reused)
    pos = variants['POS'][:]
    sweep_region = (pos >= p1) & (pos <= p2)
    variants_in_region = variants.compress(sweep_region, axis=0)
    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    # Pull REF/ALT for the region into memory once, so the per-haplotype
    # work does not go back to chunked storage for every allele.
    ref_alleles = variants_in_region['REF'][:].astype(str)
    alt_alleles = variants_in_region['ALT'][:].astype(str)

    # clean metadata; assign() returns a new frame, leaving the caller's
    # DataFrame untouched.
    species_map = {'M': 'coluzzii', 'S': 'gambiae'}
    samples = samples.assign(species=samples['m_s'].map(species_map))
    samples = samples[[
        'ox_code', 'population', 'country', 'species', 'region'
    ]]

    fasta_frames = []
    sample_frames = []
    n_haplotypes = 0

    for pop in pops:
        print(
            f'------------------------------- {pop} ------------------------------------'
        )
        # Restrict genotypeArray to population and make HapArray
        pop_bool = samples.population == pop
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()
        print("HaplotypeArray constructed")

        print(f"Extracting variants and writing to Pandas Dataframe")
        hap_matrix = np.asarray(pop_haplo[:])
        # One sequence per haplotype column.
        sequences = [
            _haplotype_sequence(hap_matrix[:, col], ref_alleles, alt_alleles)
            for col in range(hap_matrix.shape[1])
        ]
        fasta_frames.append(
            pd.DataFrame({
                "hap": np.nan,
                "seq": sequences
            },
                         columns=["hap", "seq"]))

        n_haplotypes += len(sequences)
        print(n_haplotypes, "Haplotypes complete")
        sample_frames.append(samples[pop_bool])

    # Join the dfs of different pops (pd.concat replaces DataFrame.append,
    # which was removed in pandas 2.0).
    if fasta_frames:
        multi_fastas = pd.concat(fasta_frames, ignore_index=True)
        all_samples = pd.concat(sample_frames)
    else:
        multi_fastas = pd.DataFrame(columns=["hap", "seq"])
        all_samples = samples.iloc[0:0]

    # Label haplotypes positionally, two consecutive rows per sample, which
    # is the same correspondence the metadata table below uses. Assigning
    # the Series directly would align on index labels and mislabel rows.
    # NOTE(review): both haplotypes of a sample share one header; confirm
    # that downstream tools accept duplicate sequence names.
    labels = '>' + all_samples['population'].astype(
        str) + '_' + all_samples['ox_code'].astype(str)
    multi_fastas['hap'] = np.repeat(labels.values, 2)

    #write to csv with \n sep to make FASTA file
    multi_fastas.to_csv(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.fasta",
                        sep="\n",
                        index=False,
                        header=False)
    print('Multiple alignment FASTA written')

    #remove > and join with metadata for each pop, useful for plotting phylo trees
    multi_fastas['hap'] = multi_fastas['hap'].str.strip('>')
    all_haps = pd.DataFrame(np.repeat(all_samples.values, 2, axis=0))
    all_haps.columns = all_samples.columns
    all_haps = pd.concat([multi_fastas.reset_index(drop=True), all_haps],
                         axis=1)
    all_haps.to_csv(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.metadata",
                    sep="\t",
                    index=False,
                    header=True)
    return (multi_fastas, all_haps)
# Report the number of samples, the distinct values of the p2_popc column
# and the sample count per population.
print("Data:")
print("* Samples = ", p2_samples.shape[0])
print("* Populations = ", set(p2_samples[p2_popc]))
print(p2_samples.groupby(("population")).size())

# Variants and genotypes:

# In[5]:

# declare objects with variant data
# NOTE(review): zarr.open is called without a mode; if the installed zarr
# defaults to read/write, consider mode="r" - confirm nothing later writes.
p2_callset = zarr.open(p2_callset_fn)

# variants of genotypes: POS/REF/ALT of this chromosome, indexed on POS
print("Variants...")
p2_callset_var = p2_callset[chrom]["variants"]
p2_genvars = allel.VariantChunkedTable(p2_callset_var,
                                       names=["POS", "REF", "ALT"],
                                       index="POS")
print(p2_genvars.shape)

# genotype data, restricted on the second axis to p2_samples_bool
print("Genotypes...")
p2_callset_gen = p2_callset[chrom]["calldata"]["genotype"]
p2_genotyp = allel.GenotypeChunkedArray(p2_callset_gen)
p2_genotyp = p2_genotyp.subset(sel1=p2_samples_bool)
print(p2_genotyp.shape)

# #### Outgroups
#
# Loads one outgroup, removes indels (duplicated variant positions) and
# subsets phase2 to include only the variants present in this outgroup.
# Then loads the outgroup genotypes and subsets them to remove indels and
# match phase2. Then loads the second outgroup and performs the same task.
# Thus, at each iteration, fewer and fewer variants remain (hopefully not
# too many are lost; the worst offenders are `chri` and `epir`).

# In[6]:
oc_popdict["all"] = oc_popdict["all"] + oc_popdict[popi] # report print("Data:") print("* Samples = ", oc_samples.shape[0]) print("* Populations = ", set(oc_samples[oc_popc])) print(oc_samples.groupby(("population")).size()) # Phased variants and genotypes: # declare objects with variant data oc_hapcall = h5py.File(oc_hapcall_fn) # variants of genotypes print("Variants phased...") oc_hapcall_var = oc_hapcall[chrom]["variants"] oc_hapvars = allel.VariantChunkedTable(oc_hapcall_var,names=["POS","REF","ALT"],index="POS") print(oc_hapvars.shape) # genotype data print("Genotypes phased...") oc_hapcall_hap = oc_hapcall[chrom]["calldata"]["genotype"] oc_haploty = allel.GenotypeChunkedArray(oc_hapcall_hap) oc_haploty = oc_haploty.subset(sel1=oc_samples_bool) print(oc_haploty.shape) # Effects: oc_effcall = zarr.open(oc_effcall_fn) oc_effvars = allel.VariantChunkedTable(oc_effcall["variants"],names=[ "POS","REF","ALT","ANN_HGVS_p","ANN_HGVS_c", "ANN_Annotation","ANN_AA_pos","ANN_CDS_pos", "ANN_Feature_ID","ANN_Gene_ID","ANN_Gene_Name"
### Save to hdf5 #import sys #allel.vcf_to_hdf5(ftlx,'FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1_Shrt.h5', # fields='*', alt_number=4,transformers=allel.ANNTransformer(),log=sys.stdout, vlen=False) # In[18]: ### HDF5 from VCF database ftlxh5 = 'tracks/WGS/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1_Shrt.h5' # read HDF5 file csh = h5py.File(ftlxh5, mode='r') var_tb = allel.VariantChunkedTable( csh['variants'], names=[ 'CHROM', 'POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'ANN_AA_length', 'ANN_Allele', 'ANN_Annotation', 'ANN_Annotation_Impact', 'ANN_Feature_ID', 'ANN_Feature_Type', 'ANN_Gene_ID', 'ANN_Gene_Name', 'ANN_Rank', 'ANN_Transcript_BioType', 'numalt' ]) # In[19]: #a,b,c=plt.hist(var_tb['DP'][:], bins=10) #csh['variants/REF'] # ## Now we can work with filters # In[20]: #fltr_expr = '(QD > 5) & (MQ > 40) & (DP > 1500) & (DP < 3000)' fltr_expr = "ANN_Feature_Type==b'transcript'"
action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") (options, args) = parser.parse_args() zarr_path = options.zarr_filename chromID = options.chromID samples_fn = options.sample_metadata output_fp = options.output_fp # load variants callset = zarr.open_group(zarr_path, mode='r') variants = allel.VariantChunkedTable( callset[chromID]['variants'], names=['POS', 'REF', 'ALT', 'AN', 'AC', 'numalt'], index='POS') filter_expression = '(AN >= 800)' variant_selection = variants.eval(filter_expression)[:] pos = variants['POS'][:] variants_pass_pos = pos.compress(variant_selection) genotypes = allel.GenotypeChunkedArray(callset[chromID + '/calldata/GT']) genotypes_subset = genotypes.subset(variant_selection, ) samples = pd.read_csv(samples_fn) # paired groups filtering paired_groupIDs_list = samples.GroupID.value_counts() == 2 # D-R pairs