def getContigStats(vcf, window_size, outfile, subpops):
    """Compute per-window summary statistics for a VCF and write them as TSV.

    The VCF is read with scikit-allel and split into windows with the file's
    ``getSubWinBounds`` / ``getSnpIndicesInSubWins`` helpers. One row per
    window is written to ``outfile``. Windows holding fewer than 10 SNPs get
    a row reporting ``0`` sites and ``NA`` for every statistic.

    Args:
        vcf: Path of the VCF file to read.
        window_size: Window length handed to the windowing helpers and to
            ``windowed_df``.
        outfile: Path of the tab-separated output file (overwritten).
        subpops: Mapping of population name to sample indices; the keys
            ``'rufus'`` and ``'sasin'`` are required.
    """
    # load data
    callset = allel.read_vcf(vcf)
    snps = allel.GenotypeArray(callset['calldata/GT'])
    positions = callset['variants/POS']
    # every output row is labelled with the contig of the first record
    chrom = callset['variants/CHROM'][0]

    # get window bounds
    max_pos = max(positions)
    window_bounds = getSubWinBounds(window_size, max_pos)
    window_bound_indices = getSnpIndicesInSubWins(window_bounds, positions)
    nwindows = max_pos // window_size - 1

    # A single handle is kept open for the header and all rows. The previous
    # code rebound ``outfile`` to the file object and then re-opened
    # ``str(outfile)``, which appended rows to a file named after the object's
    # repr instead of the requested path.
    with open(str(outfile), 'w') as handle:
        handle.write('chrom\tchromStart\tchromEnd\tnumSites\tfst\ttajD1\ttajD2\tthetaW1\tthetaW2\tdxy_bw\tpi\tdfd\n')

        # loop over windows and write summary stats to file
        for i in range(nwindows):
            snp_idx = window_bound_indices[i]
            win_start = str(window_bounds[i][0])
            win_end = str(window_bounds[i][1])

            if len(snp_idx) < 10:  # too few SNPs in the window
                row = [chrom, win_start, win_end, str(0)] + ["NA"] * 8
            else:
                window_snp_positions = positions[snp_idx]
                window_snps = snps.subset(snp_idx)
                window_ac_all = window_snps.count_alleles()
                window_ac_subpop = window_snps.count_alleles_subpops(subpops=subpops)
                ac_rufus = window_ac_subpop['rufus']
                ac_sasin = window_ac_subpop['sasin']

                # summary stats
                a, b, c = allel.stats.fst.weir_cockerham_fst(
                    window_snps, [subpops['rufus'], subpops['sasin']])
                fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
                tajD1 = allel.stats.diversity.tajima_d(ac_rufus)
                tajD2 = allel.stats.diversity.tajima_d(ac_sasin)
                thetaW1 = allel.stats.diversity.watterson_theta(window_snp_positions, ac_rufus)
                thetaW2 = allel.stats.diversity.watterson_theta(window_snp_positions, ac_sasin)
                dxy_bw = allel.stats.diversity.sequence_divergence(
                    window_snp_positions, ac_rufus, ac_sasin)
                pi = allel.stats.diversity.sequence_diversity(window_snp_positions, window_ac_all)
                # first element of the first array returned by windowed_df
                dfd = allel.stats.diversity.windowed_df(
                    window_snp_positions, ac_rufus, ac_sasin, size=window_size)[0][0]

                stats = (fst, tajD1, tajD2, thetaW1, thetaW2, dxy_bw, pi, dfd)
                row = [chrom, win_start, win_end, str(window_snps.shape[0])]
                row += [str(round(value, 6)) for value in stats]

            handle.write('\t'.join(row) + '\n')
def read_vcf_founderliab(path):
    """Read a whole VCF and return only the founder dosage matrix.

    Only ``calldata/GT`` is loaded. Genotypes are turned into alternate-allele
    counts with ``to_n_alt()`` and the result is transposed before returning.
    """
    callset = allel.read_vcf(path, fields=['calldata/GT'])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return genotypes.to_n_alt().T
def read_vcf_allel(file_vcf):
    '''
    Read a VCF with scikit-allel and summarise its variants in a pandas df.

    Returns ``(geno, summary, samples)``: the transposed alternate-allele
    count matrix, a DataFrame with one row per retained variant, and the
    sample names. Returns three empty dicts when the VCF yields nothing.
    '''
    print(file_vcf)
    vcf_ori = allel.read_vcf(file_vcf)

    if not vcf_ori:
        print('empty vcf.')
        return {}, {}, {}

    print(vcf_ori.keys())

    ### get genotype array
    geno = vcf_ori['calldata/GT']
    ref_col = vcf_ori['variants/REF']
    alt_col = vcf_ori['variants/ALT']
    n_rows = geno.shape[0]

    # rows that carry a second ALT allele
    multi_alt_rows = [row for row in range(n_rows) if alt_col[row][1]]
    # rows whose REF and first ALT are both a single character
    single_base_rows = [
        row for row in range(n_rows)
        if len(ref_col[row]) == 1 and len(alt_col[row][0]) == 1
    ]

    ## eliminate +1 segregating mutations: allele codes above 1 become 0.
    for row in multi_alt_rows:
        calls = geno[row]
        calls[calls > 1] = 0
        geno[row] = calls

    geno = allel.GenotypeArray(geno).to_n_alt().T
    n_variants = geno.shape[1]

    ## setup summary
    column_names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    filter_labels = ['.', 'PASS']

    first_alts = [alt_col[x][0] for x in range(n_variants)]
    pass_flags = [
        filter_labels[int(vcf_ori['variants/FILTER_PASS'][x])]
        for x in range(n_variants)
    ]

    summary = np.array([
        vcf_ori['variants/CHROM'],
        vcf_ori['variants/POS'],
        vcf_ori['variants/ID'],
        ref_col,
        first_alts,
        vcf_ori['variants/QUAL'],
        pass_flags,
    ]).T

    # Subset to the single-base rows only when at least one exists.
    if len(single_base_rows):
        print('mutliple ref loci: {}'.format(n_variants - len(single_base_rows)))
        geno = geno[:, single_base_rows]
        summary = summary[single_base_rows, :]

    summary = pd.DataFrame(summary, columns=column_names)
    return geno, summary, vcf_ori['samples']
def __get_variants_from_vcf(cls, vcf: str) -> Optional[Dict[str, Any]]:
    """Read ``vcf`` with the class's ``FIELD_NAMES`` and an ANN transformer.

    Returns:
        The dictionary produced by ``allel.read_vcf``; ``None`` precisely when
        the filtered VCF file has no variants.

    Raises:
        FileNotFoundError: If reading fails with an ``IOError``; the original
            error is kept as ``__cause__``.
    """
    try:
        return allel.read_vcf(vcf,
                              fields=cls.FIELD_NAMES,
                              transformers=allel.ANNTransformer())
    except IOError as err:
        # Chain the cause so the underlying I/O failure stays in the traceback.
        raise FileNotFoundError("File " + vcf + " not found or cannot be opened.") from err
def get_ann_from_output_snpeff(temp_out_name):
    """Return the ANN allele and annotation columns of a VCF as one DataFrame.

    The ``ANN`` field is read through ``allel.ANNTransformer`` with up to
    ``num_ann_max`` entries per variant. The allele columns come first, then
    the annotation columns, and the columns are renumbered from 0.
    """
    callset = allel.read_vcf(
        temp_out_name,
        fields='ANN',
        transformers=allel.ANNTransformer(),
        numbers={'ANN': num_ann_max},
    )
    alleles = pd.DataFrame(data=callset['variants/ANN_Allele'])
    annotations = pd.DataFrame(data=callset['variants/ANN_Annotation'])
    combined = pd.concat((alleles, annotations), axis=1)
    combined.columns = range(0, combined.shape[1])
    return combined
def main(vcffile, pop1, pop2, binwidth, stepsize, outprefix):
    """
    Compute Fst between pop1 and pop2
    using the method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Writes ``{outprefix}_persite.tsv.gz`` (per-site Fst) and
    ``{outprefix}_meanFst.tsv.gz`` (ratio-of-sums Fst in sliding bins of
    ``binwidth`` bp offset by multiples of ``stepsize``).

    Args:
        vcffile: Path of the VCF file.
        pop1, pop2: Paths of text files listing one sample ID per line.
        binwidth: Bin width in base pairs.
        stepsize: Offset step between successive bin grids.
        outprefix: Prefix of both output files.
    """
    # Read the sample lists with context managers so the handles are closed;
    # sets make the per-sample membership tests below O(1).
    with open(pop1) as handle:
        pop1_ids = {line.strip() for line in handle}
    with open(pop2) as handle:
        pop2_ids = {line.strip() for line in handle}

    callset = allel.read_vcf(vcffile)
    allsamples = callset['samples']
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])

    # Select every site in the VCF.
    # NOTE(review): the mask is one element longer than the number of
    # variants; kept as in the original -- confirm against getAC.
    variant_selection = np.full((genotypes.shape[0] + 1), True)

    sample_selection = [x in pop1_ids for x in allsamples]
    ac1 = getAC(genotypes, variant_selection, sample_selection)
    sample_selection = [x in pop2_ids for x in allsamples]
    ac2 = getAC(genotypes, variant_selection, sample_selection)

    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    meanFst = np.sum(num) / np.sum(den)
    print('meanFst: %s' % meanFst)

    df = pd.DataFrame({
        'chrom': callset['variants/CHROM'],
        'pos': callset['variants/POS'],
        'hudson_Fst': fst,
    })
    df.to_csv(f'{outprefix}_persite.tsv.gz', sep='\t', index=False,
              na_rep='nan', compression='gzip')
    df['num'] = num
    df['den'] = den

    # sliding bins
    bdf = []
    for offset in range(0, binwidth, stepsize):
        df['bin_index'] = ((df['pos'].values - 1) - offset) // binwidth
        for (bin_chrom, bin_index), gdf in df.groupby(by=['chrom', 'bin_index']):
            start = bin_index * binwidth + offset + 1
            if start < 0:
                # skip the leading bins that are shorter than a full bin
                continue
            end = start + binwidth - 1
            sum_num = gdf['num'].sum()
            sum_den = gdf['den'].sum()
            bin_fst = sum_num / sum_den if sum_den > 0 else np.nan
            bdf.append([bin_chrom, start, end, gdf.shape[0], bin_fst])

    bdf = pd.DataFrame(
        bdf, columns=['chrom', 'start', 'end', 'n_snp', 'meanFst']
    ).sort_values(by=['chrom', 'start'])
    bdf.to_csv(f'{outprefix}_meanFst.tsv.gz', index=False, compression='gzip',
               sep='\t', float_format='%.3f')
def import_data(callset_path):
    '''Read the VCF, loading only the fields listed below.'''
    wanted_fields = [
        'samples',
        'calldata/GT',
        'variants/CHROM',
        'variants/FILTER',
        'variants/POS',
        'variants/REF',
        'variants/ALT',
    ]
    return allel.read_vcf(callset_path, fields=wanted_fields)
def _load_calldata(self):
    """Load samples and genotypes from ``self.data`` and filter the sites.

    Stores the VCF sample order in ``self.samples_vcforder`` and the filtered
    genotype array in ``self.genotypes``.
    """
    callset = allel.read_vcf(self.data, fields=["samples", "GT"])
    self.samples_vcforder = callset["samples"]

    genotypes = allel.GenotypeArray(callset['calldata/GT'])

    ## Remove multi-allelic snps and biallelic singletons:
    ## keep sites whose highest observed allele is 1 and whose REF and
    ## first-ALT counts are both greater than 1.
    counts = genotypes.count_alleles()
    highest_is_first_alt = counts.max_allele() == 1
    both_seen_twice = counts[:, :2].min(axis=1) > 1
    keep = highest_is_first_alt & both_seen_twice

    self.genotypes = genotypes.compress(keep, axis=0)
def vcf2ped( args ):
    """ Write <outprefix>.ped and <outprefix>.map from a VCF and a group
    (meta) file, suitable for isoRelate.

    PED allele coding per genotype: missing (-1/-1) -> 0 0, homozygous for
    allele 1 -> 2 2, any other homozygous call -> 1 1, and any call with two
    different alleles -> 1 2.
    """

    # open group file
    group_parser = grpparser.GroupParser( args )

    # open VCF file
    cerr('[I: reading VCF...]')
    t0 = time.monotonic()
    vcfset = allel.read_vcf(
        args.infile,
        fields = ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'])
    cerr('[I: read %s site, %s samples in %d secs]' % (
        len(vcfset['variants/CHROM']), len(vcfset['samples']),
        time.monotonic() - t0))

    # assign groups
    samples = vcfset['samples']
    group_parser.assign_groups(samples)
    groups = group_parser.group_keys

    # write to PED
    genotypes = vcfset['calldata/GT']
    with open(args.outprefix + '.ped', 'w') as ped:
        for idx, sample in enumerate(samples):
            ped.write('%s\t%s\t0\t0\t1\t0\t' % (groups[idx], sample))

            codes = []
            for allele_1, allele_2 in genotypes[:, idx]:
                if allele_1 != allele_2:
                    codes += [1, 2]
                elif allele_1 == -1:
                    codes += [0, 0]
                elif allele_1 == 1:
                    codes += [2, 2]
                else:
                    codes += [1, 1]

            ped.write('\t'.join(map(str, codes)))
            ped.write('\n')

    # write to MAP
    with open(args.outprefix + '.map', 'w') as mapfile:
        prev_pos = 0
        prev_chrom = None
        for chrom, pos in zip(vcfset['variants/CHROM'], vcfset['variants/POS']):
            # restart the distance reference at the start of each chromosome
            if chrom != prev_chrom:
                prev_chrom = chrom
                prev_pos = 0
            dist = (pos - prev_pos) * 1e-6
            prev_pos = pos
            mapfile.write('%s\t%s:%d\t%8.6f\t%d\n' % (chrom, chrom, pos, dist, pos))
def sample_genotype_array(self, sample, part):
    """Get a genotype array for the specified individual.

    If the per-sample VCF is not available locally it is first produced with
    ``bcftools view``. The file is then read and the first sample column is
    returned as a ``pandas.Series`` indexed by variant position.

    Raises:
        subprocess.CalledProcessError: If ``bcftools`` exits with a non-zero
            status.
    """
    vcf_path = self._sample_genotype_path(sample, part)
    if not self._check_local(vcf_path):
        # NOTE(review): the command is a shell string built from interpolated
        # values; it is only safe while sample names and paths are trusted.
        cmd = f'bcftools view -s {sample} -v snps -m2 -M2 -Oz -o {vcf_path} {self._chr_path} {self._query(part)}'
        # stdout is discarded rather than piped: an unread PIPE can block the
        # child once the OS pipe buffer fills. check=True surfaces a failed
        # extraction here instead of as a confusing read error below.
        subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, check=True)
    gt = allel.read_vcf(vcf_path, fields=['GT', 'POS'])
    return pd.Series(allel.GenotypeArray(gt['calldata/GT'])[:, 0],
                     index=gt['variants/POS'])
def vcf2npy(vcffile, samples):
    """Read the selected samples of a VCF into a haplotype matrix.

    The result has two consecutive rows per sample (allele index 0, then
    allele index 1 of ``calldata/GT``) and one column per variant, as uint8.
    """
    gt = allel.read_vcf(vcffile, samples=samples)['calldata/GT']
    first, second = gt[:, :, 0], gt[:, :, 1]
    n_variants, n_samples = first.shape

    # Filled through a default (float) buffer and cast at the end.
    mat_haplo = np.empty((2 * n_samples, n_variants))
    for offset, haplotypes in enumerate((first, second)):
        mat_haplo[offset::2] = haplotypes.T
    return mat_haplo.astype(np.uint8)
def load_vcf_wrapper(path, seqid, samples):
    """Load positions and genotypes of ``samples`` for region ``seqid``.

    Returns:
        ``(p, g)``: an ``allel.SortedIndex`` of variant positions and an
        ``allel.GenotypeArray`` of the genotype calls.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    callset = allel.read_vcf(
        path,
        region=seqid,
        fields=wanted_fields,
        tabix="tabix",
        samples=samples,
    )
    positions = allel.SortedIndex(callset["variants/POS"])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return positions, genotypes
def main() -> None:
    """Compute the joint site frequency spectrum of two VCF files.

    Command line: ``<vcf1> <vcf2> <population1> <population2>``. Both files
    are reduced to biallelic variants at shared positions, missing calls
    (negative genotype codes) are set to 0, and the result is passed to
    ``joint_site_frequency_spectrum`` together with the population labels.
    """
    # The population labels are required by the final call; previously they
    # were only bound when exactly four arguments were given, so any other
    # invocation failed with a NameError after all the loading work.
    if len(sys.argv) != 5:
        sys.exit('usage: {} <vcf1> <vcf2> <population1> <population2>'.format(sys.argv[0]))
    file_path1, file_path2, population1, population2 = sys.argv[1:5]

    callset1 = allel.read_vcf(file_path1, fields=VCF_FIELDS)
    callset2 = allel.read_vcf(file_path2, fields=VCF_FIELDS)
    genotypes1, positions1, _, _ = biallelic_variant_filter(callset1)
    genotypes2, positions2, _, _ = biallelic_variant_filter(callset2)

    # keep only the positions present in both files
    shared_position_indices1, shared_position_indices2 = joint_position_indices(positions1, positions2)
    genotypes1 = genotypes1[shared_position_indices1]
    genotypes2 = genotypes2[shared_position_indices2]

    # treat missing calls as reference
    genotypes1[genotypes1 < 0] = 0
    genotypes2[genotypes2 < 0] = 0

    joint_site_frequency_spectrum(genotypes1, genotypes2, population1, population2)
def mutect2(inputs, normal_name) -> dict:
    """Collect the normal sample's genotype at every SNP of several VCFs.

    Records flagged ``FILTER_artifact_in_normal`` are skipped. The genotype is
    spelled as two characters picked from ``REF + first ALT`` by the normal
    sample's two allele indices.
    NOTE(review): a negative (missing) index therefore selects the ALT base --
    confirm this is intended.

    Returns:
        ``{chrom: {pos: {"gt": str, "num_pass": int}}}``. ``num_pass`` sums
        the PASS flags over the inputs; ``gt`` is replaced only when the
        stored genotype is homozygous and the new one is heterozygous.
    """
    # chrom: pos: normal_genotype
    chrom_pos_gt = dict()
    cnt = 0
    cnt_het_hom = 0

    for ifile in inputs:
        in_vcf = allel.read_vcf(ifile, fields='*')
        idx_normal = np.argwhere(in_vcf["samples"] == normal_name)[0][0]
        is_snp = in_vcf["variants/is_snp"]

        columns = ("variants/CHROM", "variants/POS", "variants/REF",
                   "variants/ALT", "calldata/GT", "variants/FILTER_PASS",
                   "variants/FILTER_artifact_in_normal")
        records = zip(*(in_vcf[key][is_snp] for key in columns))

        for chrom, pos, ref, alt, gt, is_pass, is_artifact in records:
            if is_artifact:
                continue

            chrom = str(chrom)
            pos = int(pos)
            num_pass = int(is_pass)
            ref_alt = ref + alt[0]
            first, second = gt[idx_normal][0], gt[idx_normal][1]
            normal = ref_alt[first] + ref_alt[second]

            if first != 0 or second != 0:
                cnt_het_hom += 1

            per_chrom = chrom_pos_gt.setdefault(chrom, {})
            seen = per_chrom.get(pos)
            if seen is None:
                per_chrom[pos] = {"gt": normal, "num_pass": num_pass}
                continue

            seen["num_pass"] += num_pass
            stored_is_hom = seen["gt"][0] == seen["gt"][1]
            if stored_is_hom and normal[0] != normal[1]:
                cnt += 1
                seen["gt"] = normal

    print(f"Disagreement on normal: {cnt} times.")
    print(f"Not ref: {cnt_het_hom} times.")
    return chrom_pos_gt
def fit_em_smm(
    variants_vcf: str,
    n_iterations: int,
    K: int,
    seed: int,
    logsum_approx: bool,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Initialise and run the EM loop on the haplotypes of a VCF.

    Args:
        variants_vcf: Path of the VCF file.
        n_iterations: Number of iterations passed to ``_em_loop``.
        K: Number of groups.
        seed: Seed for the random initialisation.
        logsum_approx: Passed through to ``_em_loop``.

    Returns:
        Whatever ``_em_loop`` returns.
    """
    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=['POS', 'REF', 'ALT'])
        .drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )

    genotypes = allel.read_vcf(variants_vcf, fields=['calldata/GT'])['calldata/GT']
    # scikit-allel reads missing values as -1
    genotypes = np.where(genotypes == -1, 0, genotypes)

    # one row per haplotype: all first alleles, then all second alleles
    haplo_1 = genotypes[:, :, 0]
    haplo_2 = genotypes[:, :, 1]
    haplos = np.hstack((haplo_1, haplo_2)).T

    # number of variants per position, plus 1 because a reference allele is
    # always present
    n_variants_pos = variants.groupby('POS').count()['REF'].values + 1
    max_n_variants = n_variants_pos.max()
    n_loci = len(variants['POS'].unique())
    n_samples = haplos.shape[0]

    haplos = _encode_haplotypes(variants['POS'].values, haplos, n_samples, n_loci)

    # em initialization
    rng = np.random.default_rng(seed)
    group_e_ini = rng.random(size=(n_samples, K))
    group_e = group_e_ini / np.sum(group_e_ini, axis=1, keepdims=True)
    # Uniform probability vector over the K groups. The length was previously
    # hard-coded to 6, which only summed to 1 when K == 6.
    group_probs = np.full(K, 1 / K)
    variant_ini = rng.random(size=(K, n_loci, max_n_variants))
    variant_probs = variant_ini / np.sum(variant_ini, axis=2, keepdims=True)
    # TODO: add step filtering this to correct number of variants

    return _em_loop(
        n_iterations,
        K,
        n_samples,
        n_loci,
        n_variants_pos,
        group_e,
        group_probs,
        variant_probs,
        haplos,
        logsum_approx,
    )
def load_genotypes():
    """Load genotypes and sample names from ``args.zarr`` or ``args.vcf``.

    The zarr store takes precedence when both are set.

    Returns:
        ``(genotypes, samples)``: an ``allel.GenotypeArray`` and the samples.

    Raises:
        ValueError: If neither ``args.zarr`` nor ``args.vcf`` is set.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError("no genotype input: args.zarr and args.vcf are both None")
    return genotypes, samples
def generate_vcf_classes(vcfs):
    """Parse each VCF and annotate its callset with file path and headers.

    VCFs for which ``allel.read_vcf`` returns nothing are skipped. Each
    remaining callset dict gets its ``samples`` upper-cased and two extra
    keys: ``FILE`` (the source path) and ``header`` (the parsed headers).

    Args:
        vcfs: Iterable of VCF paths.

    Returns:
        List of annotated callset dicts, in input order.
    """
    print("Parsing VCFs")
    parsed_vcf_bodies = []
    for path in vcfs:
        body = allel.read_vcf(path, fields="*")
        if not body:
            # Skip empty VCFs together with their path. The path list used to
            # stay unfiltered, so every callset after an empty VCF received
            # the wrong FILE and header.
            continue
        body.update(samples=numpy.char.upper(body['samples'].tolist()))
        body.update(FILE=path)
        body.update(header=allel.read_vcf_headers(path))
        parsed_vcf_bodies.append(body)
    return parsed_vcf_bodies
def vcf2npy( vcfpath ):
    """Convert a .vcf file to a numpy haplotype matrix of allele codes.

    Rows alternate between the two alleles of each sample; columns are the
    variants. Adapted from
    https://github.com/bcm-uga/Loter/blob/master/python-package/Local_Ancestry_Example.ipynb
    """
    genotypes = allel.read_vcf(vcfpath)['calldata/GT']
    hap_a = genotypes[:, :, 0]
    hap_b = genotypes[:, :, 1]
    n_variants, n_samples = hap_a.shape

    # default-dtype buffer, cast to uint8 on return
    stacked = np.empty((2 * n_samples, n_variants))
    stacked[0::2] = hap_a.T
    stacked[1::2] = hap_b.T
    return stacked.astype(np.uint8)
def vcf2npy(vcffile, samples):
    """Read the selected samples of a VCF into a haplotype matrix.

    Returns:
        ``(mat_haplo, samples, chrom, pos)``: a uint8 matrix with two
        consecutive rows per sample and one column per variant, the sample
        names returned by the reader, and the CHROM and POS columns.
    """
    wanted_fields = ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT']
    callset = allel.read_vcf(vcffile, samples=samples, fields=wanted_fields)

    gt = callset['calldata/GT']
    n_variants, n_samples = gt[:, :, 0].shape

    # default-dtype buffer, cast to uint8 on return
    mat_haplo = np.empty((2 * n_samples, n_variants))
    for allele_idx in (0, 1):
        mat_haplo[allele_idx::2] = gt[:, :, allele_idx].T

    return (mat_haplo.astype(np.uint8), callset['samples'],
            callset['variants/CHROM'], callset['variants/POS'])
def test_few_chrom_small_region_num(self):
    """BED creation on the few-chromosome VCF with 500 kb regions.

    Checks that the returned chromosomes match the number of distinct CHROM
    values in the VCF and that 16 entries appear in the output directory.
    """
    self.data_holder.filename = VCF_DATA_FEW_CHR
    self.region_len = 500000

    chromosomes = create_bed_files_and_extract_chromosomes(
        data_holder=self.data_holder,
        output_dir=self.output_dir,
        region_len=self.region_len,
    )

    callset = allel.read_vcf(self.data_holder.filename)
    distinct_chroms = set(callset['variants/CHROM'])
    self.assertEqual(len(distinct_chroms), len(chromosomes))

    written = listdir(self.output_dir)
    self.assertEqual(len(written), 16)
def loadvcf(vcfFile):
    """Load a VCF call set with scikit-allel, caching it as HDF5.

    The cache path is ``vcfFile`` with a trailing ``.vcf`` replaced by
    ``.h5`` (or with ``.h5`` appended when the name does not end in ``.vcf``).

    Returns:
        An open read-only ``h5py.File`` when the cache already exists;
        otherwise the dict returned by ``allel.read_vcf``, after the cache
        has been written for the next call.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)
    # str.strip("vcf") removes the characters v/c/f from BOTH ends, not the
    # suffix (e.g. "calls.vcf" became "alls."), so replace the suffix instead.
    if vcfFile.endswith(".vcf"):
        h5 = vcfFile[:-len("vcf")] + "h5"
    else:
        h5 = vcfFile + ".h5"
    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return callset
def loadvcf(vcfFile):
    """Load a VCF call set with scikit-allel, caching it as HDF5.

    The cache path is ``vcfFile`` with a trailing ``.vcf`` replaced by
    ``.h5`` (or with ``.h5`` appended when the name does not end in ``.vcf``).

    Returns:
        An open read-only ``h5py.File`` when the cache already exists;
        otherwise the dict returned by ``allel.read_vcf``, after the cache
        has been written for the next call.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)
    # str.strip("vcf") removes the characters v/c/f from BOTH ends, not the
    # suffix (e.g. "calls.vcf" became "alls."), so replace the suffix instead.
    if vcfFile.endswith(".vcf"):
        h5 = vcfFile[:-len("vcf")] + "h5"
    else:
        h5 = vcfFile + ".h5"
    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return callset
def simulate(reference_file_bcf, sample_file, idx_to_pop_map, genetic_map_file,
             generations, num_out, sim_output_path, chm, use_phase_shift):
    """Run rfmix/simulate for each generation and save the results as .npy.

    For every ``gen`` in ``generations`` the simulator is run on the founders
    written to ``sim_output_path``. Its ``.query.vcf`` output is saved as
    ``gen_<gen>/mat_vcf_2d.npy`` and its ``.result`` map, translated through
    ``idx_to_pop_map``, as ``gen_<gen>/mat_map.npy``.

    If the simulator fails with ``-c <chm>`` it is retried once with
    ``-c chr<chm>``; a failure of the retry propagates.
    """
    # path to RFMix/simulate binary
    rfmix_sim_path = "./Admixture/simulate"
    # NOTE: If running from a different directory than XGMIX/, this needs to
    # be updated.

    # assume everyone in the sample map is founder
    print("Creating founders.bcf.gz for {}".format(sample_file))
    write_founders_bcf(reference_file_bcf, sample_file, sim_output_path)

    rfmix_sim_cmd = rfmix_sim_path + " -f %s/founders.bcf.gz -m %s/founders.map -g %s -o %s --growth-rate=1.5 --maximum-size=2000 --n-output=%d -c %s -G %d -p %f --random-seed=%d %s"
    dephase_flag = "--dephase" if use_phase_shift else ""

    for gen in generations:

        print("-"*80)
        print("Simulation for generation {} from {}".format(gen, sample_file))

        # generation simulation output path
        gen_path = join_paths(sim_output_path, "gen_"+str(gen))
        out_basename = gen_path+'/admix'

        # Simulating the individuals via rfmix simulation
        print("simulating ...")
        rfmix_sim = rfmix_sim_cmd % (sim_output_path, sim_output_path, genetic_map_file,
                                     out_basename, num_out, chm, gen, 0.0, 123, dephase_flag)
        try:
            run_shell_cmd(rfmix_sim, verb=True)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed and retried.
            print("something went wrong using rfmix/simulate ...", end=" ")
            print("trying -c chr"+chm+" insted of -c "+chm+" ...")
            rfmix_sim = rfmix_sim_cmd % (sim_output_path, sim_output_path, genetic_map_file,
                                         out_basename, num_out, "chr"+chm, gen, 0.0, 123, dephase_flag)
            run_shell_cmd(rfmix_sim, verb=True)

        # reading .vcf output of simulation and converting to npy matricies
        print('reading .vcf output of simulation and converting to npy matricies ...')
        vcf_data = allel.read_vcf(out_basename+".query.vcf")
        chm_len, nout, _ = vcf_data["calldata/GT"].shape
        # flatten the last two genotype axes, then put one flattened column per row
        mat_vcf_2d = vcf_data["calldata/GT"].reshape(chm_len, nout*2).T
        np.save(gen_path+"/mat_vcf_2d.npy", mat_vcf_2d)

        # reading .map output of simulation and converting to npy matricies
        print('reading .map output of simulation and converting to npy matricies ...')
        map_path = out_basename + ".result"
        matrix = sample_map_to_matrix(map_path)

        # Finally map them to original labels (which can be further mapped to coordinates) and saving
        matrix = np.vectorize(idx_to_pop_map.get)(matrix)
        np.save(gen_path+"/mat_map.npy", matrix)

    print("-"*80)
    print("Finishing up ...")
def import_data(callset_path):
    '''Read the VCF with field numbers derived from ``args.ploidy``.'''
    wanted_fields = [
        'samples',
        'calldata/GT',
        'variants/CHROM',
        'variants/FILTER',
        'variants/POS',
        'variants/REF',
        'variants/ALT',
    ]
    field_numbers = get_numbers_dict(args.ploidy)
    return allel.read_vcf(callset_path, numbers=field_numbers, fields=wanted_fields)
def load_data(self, genome_file, panel_file, population_file):
    '''
    Load the three input files and derive the population arrays.

    Sets ``genome_file`` (VCF callset), ``panel_file`` and
    ``population_file`` (tab-separated tables), ``panel_pop`` (stripped
    ``pop`` column), ``df_code`` (non-null population codes) and ``df_desc``
    (non-null population descriptions without the last entry).
    '''
    self.genome_file = allel.read_vcf(genome_file)
    self.panel_file = pd.read_csv(panel_file, sep='\t')
    self.population_file = pd.read_csv(population_file, sep="\t")

    #Basic Processing
    panel = self.panel_file
    panel['pop'] = panel['pop'].str.strip()
    self.panel_pop = panel['pop'].values

    populations = self.population_file
    codes = populations["Population Code"].dropna()
    self.df_code = codes.values
    descriptions = populations["Population Description"].dropna()
    self.df_desc = descriptions.values[:-1]
def readVcf(inFile, logDebug):
    """Read the first sample of a VCF and derive per-SNP genotype weights.

    Args:
        inFile: Path of the VCF file.
        logDebug: When false, ``sys.stderr`` is silenced while the VCF is read.

    Returns:
        ``(DPmean, snpCHR, snpPOS, snpGT, snpWEI)``: mean depth (or ``"NA"``
        when DP is absent), chromosome, position and genotype string of each
        retained SNP, and a weight array computed from PL when available and
        from the parsed genotype otherwise.
    """
    log.info("reading the VCF file")
    ## We read only one sample from the VCF file
    if logDebug:
        vcf = allel.read_vcf(inFile, samples=[0], fields='*')
    else:
        import sys
        try:
            from StringIO import StringIO  # Python 2 module name
        except ImportError:
            from io import StringIO
        # Restore stderr in a finally block so a failing read cannot leave
        # the process with a silenced stderr, and restore the previous
        # stream rather than unconditionally resetting to sys.__stderr__.
        saved_stderr = sys.stderr
        sys.stderr = StringIO()
        try:
            vcf = allel.read_vcf(inFile, samples=[0], fields='*')
        finally:
            sys.stderr = saved_stderr

    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
    try:
        snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
    except AttributeError:
        snpmatch.die("input VCF file doesnt have required GT field")

    # drop SNPs whose genotype is missing, then re-extract the genotypes
    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
    snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]

    if 'calldata/PL' in vcf:
        # weights are exp(PL / -10)
        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
        snpWEI = snpWEI / (-10)
        snpWEI = np.exp(snpWEI)
    else:
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))  ## for homozygous and heterozygous
        snpWEI[np.where(snpBinary != 0), 0] = 0
        snpWEI[np.where(snpBinary != 1), 2] = 0
        snpWEI[np.where(snpBinary != 2), 1] = 0

    snpCHR = snpCHR[snpsREQ]
    if 'calldata/DP' in vcf:
        DPmean = np.mean(vcf['calldata/DP'][snpsREQ, 0])
    else:
        DPmean = "NA"
    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
def read_haplotype_matrix_from_vcf(filepath):
    """
    Read the vcf file into a haplotype matrix (numpy);
    everything is assumed to be phased.

    The result keeps the genotype dtype and has two consecutive rows per
    sample (first allele, then second allele) and one column per variant.
    """
    genotypes = allel.read_vcf(filepath)['calldata/GT']
    first = genotypes[:, :, 0].T
    second = genotypes[:, :, 1].T

    n_samples, n_variants = first.shape
    haps = np.empty((n_samples * 2, n_variants), dtype=first.dtype)
    for offset, haplotypes in enumerate((first, second)):
        haps[offset::2] = haplotypes
    return haps
def process_vcf(vcf_file):
    """Read a VCF into a float haplotype matrix plus ID and position lists.

    Returns:
        ``(gt_matrix, rs_IDs, ind_IDs, positions)``: a float32 matrix with one
        row per variant and one column per sample allele (negative codes
        replaced by NaN), the integer part of each variant ID after its first
        two characters, per-allele labels ``<sample>_A`` / ``<sample>_B``,
        and the variant positions as a list.
    """
    callset = allel.read_vcf(vcf_file)

    calls = callset['calldata/GT']
    n_variants, n_samples, ploidy = calls.shape
    gt_matrix = calls.reshape(n_variants, n_samples * ploidy).astype(np.float32)
    np.place(gt_matrix, gt_matrix < 0, np.nan)

    # strip the two-character prefix of each ID and keep the number
    rs_IDs = [int(variant_id[2:]) for variant_id in callset['variants/ID']]

    ind_IDs = np.array([
        sample + suffix
        for sample in callset['samples']
        for suffix in ('_A', '_B')
    ])

    positions = callset['variants/POS'].tolist()
    return gt_matrix, rs_IDs, ind_IDs, positions
def load_vcf_wrapper(path, seqid, samples, samples_path):
    """Load positions and genotypes of ``samples`` for region ``seqid``.

    Args:
        path: VCF path (read through tabix).
        seqid: Region passed to ``allel.read_vcf``.
        samples: Sample names to load.
        samples_path: File the sample names came from; used in error text only.

    Returns:
        ``(p, g)``: an ``allel.SortedIndex`` of variant positions and an
        ``allel.GenotypeArray`` of the genotype calls.

    Raises:
        ValueError: If the reader returns no data for the region.
        AssertionError: If the callset carries no ``samples`` entry.
    """
    callset = allel.read_vcf(
        path,
        region=seqid,
        fields=['variants/POS', 'calldata/GT', 'samples'],
        tabix="tabix",
        samples=samples)
    if callset is None:
        # read_vcf yields None when nothing is read; fail with a clear message
        # instead of an AttributeError on ``.keys()``.
        raise ValueError("No variants found in {0!r} for region {1!r}".format(path, seqid))
    if "samples" not in callset:
        # Raised explicitly (same type and message as the former assert) so
        # the check is not stripped when Python runs with -O.
        raise AssertionError(
            "None of the samples provided in {0!r} are found in {1!r}".format(
                samples_path, path))
    p = allel.SortedIndex(callset["variants/POS"])
    g = allel.GenotypeArray(callset['calldata/GT'])
    return p, g
def load_vcf_wrapper(path, seqid, samples, samples_path):
    """Load positions and genotypes of ``samples`` for region ``seqid``.

    Args:
        path: VCF path (read through tabix).
        seqid: Region passed to ``allel.read_vcf``.
        samples: Sample names to load.
        samples_path: File the sample names came from; used in error text only.

    Returns:
        ``(p, g)``: an ``allel.SortedIndex`` of variant positions and an
        ``allel.GenotypeArray`` of the genotype calls.

    Raises:
        ValueError: If the reader returns no data for the region.
        AssertionError: If the callset carries no ``samples`` entry.
    """
    callset = allel.read_vcf(path,
                             region=seqid,
                             fields=['variants/POS', 'calldata/GT', 'samples'],
                             tabix="tabix",
                             samples=samples)
    if callset is None:
        # read_vcf yields None when nothing is read; fail with a clear message
        # instead of an AttributeError on ``.keys()``.
        raise ValueError("No variants found in {0!r} for region {1!r}".format(path, seqid))
    if "samples" not in callset:
        # Raised explicitly (same type and message as the former assert) so
        # the check is not stripped when Python runs with -O.
        raise AssertionError(
            "None of the samples provided in {0!r} are found in {1!r}".format(
                samples_path, path))
    p = allel.SortedIndex(callset["variants/POS"])
    g = allel.GenotypeArray(callset['calldata/GT'])
    return p, g
def main(vcffile: str = typer.Argument(..., help="总vcf文件"),
         groupfile: str = typer.Argument(..., help='样本分群信息,两列,一列vcf中的样本ID,一列对应的群体ID。不需要包含vcf中的全部个体。'),
         outfile: str = typer.Argument(..., help='输出文件名(.ac.tsv.gz)'),
         region: str = typer.Option(None, help='只使用指定区域chrom:start-end或者chrom')):
    """Convert a VCF to the TreeMix input format, optionally restricted to a region and to the listed samples.

    For every group in ``groupfile`` the REF and first-ALT allele counts of
    that group's samples are written as ``"<ref>,<alt>"`` per site, one
    space-separated column per group, gzip-compressed.

    Raises:
        ValueError: If the VCF yields no variants for the region, or none of
            a group's samples is present in the VCF.
    """
    group2samples = load_group(groupfile)
    df = {}
    for group, samples in group2samples.items():
        # Restrict the read to this group's samples. Without ``samples=`` every
        # group received the allele counts of ALL samples in the VCF and the
        # missing-sample check compared against the full sample list.
        callset = allel.read_vcf(vcffile,
                                 fields=['calldata/GT', 'samples'],
                                 numbers={'ALT': 1},
                                 region=region,
                                 samples=list(samples))
        if callset is None:
            raise ValueError(f'no variants found in {vcffile} (region={region})')

        samples_found = callset.get('samples', [])
        samples_missed = set(samples) - set(samples_found)
        if samples_missed:
            print(f'{len(samples_missed)} / {len(samples)} samples missed in {group}:')
            print(','.join(samples_missed))
        if len(samples_found) == 0:
            raise ValueError(f'none of the samples of group {group} are present in {vcffile}')

        # max_allele=1 always yields exactly the REF and first-ALT columns,
        # even when no ALT allele is observed among these samples.
        counts = allel.GenotypeArray(callset['calldata/GT']).count_alleles(max_allele=1)
        df[group] = pd.DataFrame(counts).apply(
            lambda x: ','.join([str(x[0]), str(x[1])]), axis=1).values

    df = pd.DataFrame(df)
    df.to_csv(outfile, sep=' ', index=False, compression='gzip')
def load_vcf2array(vcffile, region: None, chrom2sites: None, samples, outsamples: None):
    """Load biallelic genotypes from a VCF, optionally polarised by an outgroup.

    Args:
        vcffile: Path of the VCF file.
        region: Region string (``chrom`` or ``chrom:start-end``) or None.
        chrom2sites: Optional mapping chromosome -> positions to keep.
            Requires ``region``, whose chromosome part selects the entry.
        samples: Samples to load.
        outsamples: Optional outgroup samples. At sites where they carry more
            REF (0) than ALT (>0) alleles, codes 0 and 1 are swapped for all
            samples; missing calls (-1) are untouched.

    Returns:
        ``(gt_array, samples, pos_array)``: the genotype array with axes
        (site, sample, allele), the sample names returned by the reader, and
        the positions of the retained sites.

    Raises:
        ValueError: If ``chrom2sites`` is given without ``region``.
    """
    callset = allel.read_vcf(vcffile, region=region, samples=samples,
                             fields=['samples', 'calldata/GT', 'variants/POS'])
    gt_array = callset['calldata/GT']  # three-dimensional array
    samples_all = callset['samples']
    pos_array = callset['variants/POS']

    # keep only the sites listed in the site file
    if chrom2sites:
        # The chromosome is only needed here; deriving it unconditionally
        # crashed on ``region=None`` even when no site filter was requested.
        if region is None:
            raise ValueError('region is required when chrom2sites is given')
        chrom = region.split(':')[0]
        wanted_sites = chrom2sites[chrom]
        selection_requiredsites = np.array(
            [x in wanted_sites for x in pos_array], dtype=bool)
        gt_array = gt_array[selection_requiredsites, :, :]
        pos_array = pos_array[selection_requiredsites]
        print(
            f'{np.sum(selection_requiredsites)} remained according to the sitefile.'
        )

    # Keep biallelic sites only: multi-allelic heterozygotes cannot be told
    # apart later, e.g. 0/2 and 1/1 both sum to 2.
    selection_biallelic = np.max(np.max(gt_array, axis=2), axis=1) < 2
    gt_array = gt_array[selection_biallelic, :, :]
    pos_array = pos_array[selection_biallelic]
    n_sites, n_samples, n_hap = gt_array.shape
    print(f'{n_sites} biallelic sites were remained.')

    # make the most frequent allele among the outgroup samples the ALT allele
    if outsamples:
        selection_outsamples = select_samples(samples_all, outsamples)
        print(f'{np.sum(selection_outsamples)} outgroup samples in vcf file.')
        gt_array_out = gt_array[:, selection_outsamples, :].reshape(
            n_sites, np.sum(selection_outsamples) * n_hap)
        # more REF (0) alleles than non-REF alleles, missing (-1) not counted
        selection_swtich = np.sum(gt_array_out == 0, axis=1) > np.sum(
            gt_array_out > 0, axis=1)
        print(f'Swtich REF and ALT in {np.sum(selection_swtich)} sites.')
        assert gt_array.min() >= -1
        assert gt_array.max() <= 1
        # swap 0 <-> 1 through the temporary code 9
        gt_swtich = gt_array[selection_swtich, :, :]
        gt_swtich[gt_swtich == 1] = 9
        gt_swtich[gt_swtich == 0] = 1
        gt_swtich[gt_swtich == 9] = 0
        gt_array[selection_swtich, :, :] = gt_swtich

    return gt_array, samples_all, pos_array
# Dump the genotype array of a VCF file to a text file.
# Usage: <script> <in.vcf> <out.txt>

# import scikit-allel
import allel
import sys

# Sys arguments
in_vcf = sys.argv[1]
out_file = sys.argv[2]

callset = allel.read_vcf(in_vcf)

# available keys in vcf file (result is not used)
sorted(callset.keys())

# reference alleles from the vcf file (result is not used)
callset['variants/REF']

# raw genotype calls from the vcf file (result is not used)
callset['calldata/GT']

# genotype information in array form
gt = allel.GenotypeArray(callset['calldata/GT'])

# write the string form of the genotype array to the output file
with open(out_file, 'w') as fh_out:
    print(gt, end='', file=fh_out)