def readVcf(inFile, logDebug):
    """Read a VCF file and return depth-filtered SNP calls with genotype weights.

    The file is parsed twice with vcfnp (``variants`` and ``calldata_2d``,
    both uncached).  Only column 0 of the call data is used, i.e. the code
    presumably expects a single-sample VCF -- confirm against callers.

    :param inFile: path of the VCF file handed to vcfnp.
    :param logDebug: when truthy, whatever the parser writes to ``sys.stderr``
        stays visible; otherwise ``sys.stderr`` is pointed at an in-memory
        buffer for the duration of the parse.
    :return: tuple ``(DPmean, snpCHR, snpPOS, snpGT, snpWEI)`` where

        * ``DPmean`` is the mean of the call-data ``DP`` rows selected by
          ``DP > 0``,
        * ``snpCHR`` holds the chromosome names of the kept records cast to
          ``int8`` (lower-cased, with ``"chr"`` removed),
        * ``snpPOS`` holds their positions,
        * ``snpGT`` holds their ``GT`` values for column 0,
        * ``snpWEI`` is ``exp(PL / -10)`` when a ``PL`` field exists, else a
          0/1 matrix with three columns derived from ``parseGT(snpGT)``.

    A record is kept when column 0 is called, ``QUAL > 30``,
    ``0 < DP < 4 * DPmean`` (``DP`` taken from the variants table) and the
    normalised chromosome name consists of digits only.

    If the call data has no ``GT`` attribute, ``die`` is called with an
    error message.
    """
    log.info("reading the VCF file")

    # Remember the stream that is active right now so it can be put back even
    # if vcfnp raises; otherwise a parse failure would leave stderr pointing
    # at the throw-away buffer and hide every later message.
    saved_stderr = sys.stderr
    if not logDebug:
        sys.stderr = StringIO.StringIO()
    try:
        vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
        vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
    finally:
        sys.stderr = saved_stderr

    # Depth ceiling is four times the mean of the positive-depth rows.
    DPmean = np.mean(vcfD.DP[np.where(vcfD.DP > 0)[0]])
    DPthres = DPmean * 4

    # Normalise chromosome names ("Chr1", "chr1", "CHR1" -> "1") so that the
    # digit test below can drop everything that is not a numbered chromosome.
    snpCHROM = np.char.replace(
        np.core.defchararray.lower(vcf.CHROM), "chr", "")
    snpsREQ = np.where(
        (vcfD.is_called[:, 0])
        & (vcf.QUAL > 30)
        & (vcf.DP > 0)
        & (vcf.DP < DPthres)
        & (np.char.isdigit(snpCHROM))
    )[0]

    snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
    snpPOS = np.array(vcf.POS[snpsREQ])

    try:
        snpGT = np.array(vcfD.GT[snpsREQ, 0])
    except AttributeError:
        die("input VCF file doesnt have required GT field")

    try:
        # Turn the PL values into weights: exp(PL / -10).
        snpPL = vcfD.PL[snpsREQ, 0]
        snpWEI = np.exp(np.copy(snpPL).astype(float) / (-10))
    except AttributeError:
        # No PL field: fall back to hard 0/1 weights built from the genotype
        # code returned by parseGT.  Code 0 keeps column 0, code 1 keeps
        # column 2 and code 2 keeps column 1 (homozygous and heterozygous
        # classes -- exact meaning of each code is defined by parseGT).
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))
        snpWEI[np.where(snpBinary != 0), 0] = 0
        snpWEI[np.where(snpBinary != 1), 2] = 0
        snpWEI[np.where(snpBinary != 2), 1] = 0

    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
def run_vcfnp_calls_all(chrom='Pf3D7_01_v3'):
    """Run ``vcfnp.calldata_2d`` with caching enabled for one chromosome.

    The VCF path comes from ``OUTGROUP_VCF_FORMAT % chrom`` and the cache
    directory from ``FULL_NPY_FORMAT % chrom``.  Only the ``AD`` field is
    requested, with an arity of 7.  The parsed array itself is discarded;
    the call is made for its caching side effect.

    :param chrom: chromosome name substituted into both format strings.
    :return: always ``0``.
    """
    source_vcf = OUTGROUP_VCF_FORMAT % chrom
    vcfnp.calldata_2d(
        vcf_fn=source_vcf,
        progress=10000,
        fields=['AD'],
        arities={'AD': 7},
        cache=True,
        cachedir=FULL_NPY_FORMAT % chrom,
    )
    return 0
def run_vcfnp(sample='7G8', file_prefix='WG'):
    """Parse one validation-sample VCF with vcfnp and return both tables.

    The VCF path is ``INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT %
    (file_prefix, sample)``.  Both vcfnp calls run with ``cache=True``.
    After parsing, the sample name, the largest ``num_alleles`` value and
    the longest ``REF`` entry are printed.

    :param sample: sample identifier substituted into the path format.
    :param file_prefix: file prefix substituted into the path format.
    :return: tuple ``(v, c)`` -- the ``vcfnp.variants`` result and the
        ``vcfnp.calldata_2d`` result (fields ``GT`` and ``AD``).
    """
    vcf_fn = INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT % (file_prefix, sample)

    # NOTE: the original literal listed 'VariantType' twice ('a40' and then
    # 'a60'); in a dict display the last entry wins, so only the effective
    # 'a60' is kept here.
    variant_dtypes = {
        'REF': 'a400',
        'ALT': 'a400',
        'RegionType': 'a25',
        'VariantType': 'a60',
        'RU': 'a40',
        'set': 'a40',
        'SNPEFF_AMINO_ACID_CHANGE': 'a20',
        'SNPEFF_CODON_CHANGE': 'a20',
        'SNPEFF_EFFECT': 'a33',
        'SNPEFF_EXON_ID': 'a2',
        'SNPEFF_FUNCTIONAL_CLASS': 'a8',
        'SNPEFF_GENE_BIOTYPE': 'a14',
        'SNPEFF_GENE_NAME': 'a20',
        'SNPEFF_IMPACT': 'a8',
        'SNPEFF_TRANSCRIPT_ID': 'a20',
        'culprit': 'a14',
    }
    variant_arities = {
        'ALT': 2,
        'AF': 2,
        'AC': 2,
        'MLEAF': 2,
        'MLEAC': 2,
        'RPA': 3,
    }

    v = vcfnp.variants(
        vcf_fn=vcf_fn,
        progress=10000,
        arities=variant_arities,
        dtypes=variant_dtypes,
        cache=True,
    )
    c = vcfnp.calldata_2d(
        vcf_fn=vcf_fn,
        progress=10000,
        fields=['GT', 'AD'],
        arities={'AD': 3},
        cache=True,
    )

    # Quick sanity report: are the fixed-width dtypes/arities above wide enough?
    print(sample, max(v['num_alleles']), max([len(x) for x in v['REF']]))
    return (v, c)
def work(self):
    """Plot histograms of sorted allele frequencies and save the figure.

    Reads the VCF at ``self.input().path`` with vcfnp, keeps records whose
    ``ALT`` is not ``b'<NON_REF>'`` and whose ``DP`` exceeds
    ``self.DP_thresh``, divides the descending-sorted ``AD`` counts by
    ``DP`` and histograms each resulting column together with the remainder
    ``1 - sum(freqs)``.  Zero entries are replaced by NaN before plotting.
    The figure is titled with ``self.library`` and written to
    ``self.output().path``.
    """
    import vcfnp
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    site_table = vcfnp.variants(self.input().path)
    call_table = vcfnp.calldata_2d(self.input().path)

    # Record filter: a real ALT allele and enough depth.
    has_real_alt = site_table['ALT'] != b'<NON_REF>'
    deep_enough = site_table['DP'] > self.DP_thresh
    keep = np.logical_and(has_real_alt, deep_enough)

    # Index 0 on the second axis -- presumably the first (only) sample.
    allele_depths = call_table['AD'][keep][:, 0, :]
    # Sort ascending per record, then reverse so the largest count is first.
    ranked_depths = np.sort(allele_depths, axis=1)[:, ::-1]
    allele_freqs = ranked_depths / call_table['DP'][keep]
    remainder = 1 - np.sum(allele_freqs, axis=1)

    frame = pd.DataFrame(
        np.hstack((allele_freqs, remainder.reshape((-1, 1)))))
    # Zeros become NaN so they are left out of the histograms.
    frame[frame == 0] = float('nan')
    frame.hist(sharex=True, sharey=True, range=(0, 1), bins=20)

    figure = plt.gcf()
    figure.suptitle(self.library, fontsize=20)
    figure.text(0.5, 0.04, 'Allele frequency', ha='center')
    figure.text(0.02, 0.5, 'Counts', va='center', rotation='vertical')
    figure.savefig(self.output().path)
def test_caching():
    """Check that each loader writes a cache file matching its return value.

    For ``variants``, ``calldata`` and ``calldata_2d`` in turn: remove any
    existing cache file for ``fixture/sample.vcf.gz``, load with
    ``cache=True`` and assert that the array returned equals the array
    stored at the cache path.
    """
    vcf_fn = "fixture/sample.vcf.gz"
    loaders = (
        ("variants", variants),
        ("calldata", calldata),
        ("calldata_2d", calldata_2d),
    )
    for array_type, loader in loaders:
        cache_fn = vcfnp._mk_cache_fn(vcf_fn, array_type=array_type)
        # Start from a clean slate so the loader has to write the cache.
        if os.path.exists(cache_fn):
            os.remove(cache_fn)
        returned = loader(vcf_fn, cache=True, verbose=True)
        stored = np.load(cache_fn)
        assert np.all(returned == stored)
def vcf2snp(filename, missing=3, cache=True):
    """
    Return a SNP matrix built from a VCF file.

    The VCF file is read with the vcfnp package.  Only records in which
    every allele of every call is -1 (missing), 0 or 1 are kept, i.e.
    records with at most two alleles.  Each remaining call is encoded as
    the sum of its allele indices; a call with at least one missing allele
    is encoded as ``missing``.

    The result is the transpose of the call table: its first axis is the
    second axis of ``genotype`` (presumably samples) and its second axis
    the kept records -- confirm the axis meaning against vcfnp.

    :param filename: The path of the VCF file
    :param missing: How to annotate missing data
    :param cache: Use the vcfnp cache
    :type filename: string
    :type missing: np.int8
    :type cache: boolean
    :return: The genotype matrix containing the SNPs
    :rtype: np.array of np.int8

    :Example:

    >>> G = vcf2snp('file.vcf')

    .. warning:: This function is not maintained.
    """
    calls = vcfnp.calldata_2d(filename, cache=cache).view(np.recarray)
    G = calls.genotype

    # Keep only the records whose calls are all built from -1, 0 and 1
    # (0/0, 0/1, 1/0, 1/1 and missing data).
    allele_ok = np.logical_and(G >= -1, G <= 1)
    keep = np.all(allele_ok, axis=(1, 2))
    G = G[keep, :]

    # A call counts as missing as soon as ONE allele is -1.  Requiring all
    # alleles to be -1 would let a half-missing call such as ./1 sum to 0
    # and be reported as a valid genotype.
    mask_missing = np.any(G == -1, axis=2)

    G = np.sum(G.T, axis=0, dtype=np.int8)
    G[mask_missing.T] = missing
    return G
# Script fragment: finish declaring the command-line options, parse them,
# load the HDF5 genotype data and the input VCF, then select the records
# that pass the quality/depth filter below.
# NOTE(review): options.hdf5File and options.vcfFile are read below but are
# not declared in this fragment -- presumably added by earlier add_option
# calls; confirm.
inOptions.add_option("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file", type="string")
inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string")
inOptions.add_option("-r", "--refScore", dest="refScore", help="Output for refined score", type="string")
(options, args) = inOptions.parse_args()
# Log everything from DEBUG upwards as "LEVEL:timestamp: message".
logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=logging.DEBUG)
GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File)
# Disabled: loading of the second (accession) HDF5 file and the line count.
#GenotypeData_acc = genotype.load_hdf5_genotype_data(options.hdf5accFile)
#num_lines = len(GenotypeData.accessions)
logging.info("Reading the VCF file")
# Parse the VCF twice with vcfnp (cached): per-record fields and per-call
# fields, both viewed as recarrays so fields can be read as attributes.
vcf = vcfnp.variants(options.vcfFile, cache=True).view(numpy.recarray)
vcfD = vcfnp.calldata_2d(options.vcfFile, cache=True).view(numpy.recarray)
## Doubtful: should the depth threshold be based on the mean alone, or on mean + std?
# Disabled alternative threshold: mean + standard deviation of the positive depths.
#DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) + numpy.std(vcf.DP[numpy.where(vcf.DP > 0)[0]])
# Active threshold: four times the mean of the positive record depths.
DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) * 4
print "Threshold for depth is set at: ", DPthres
# Disabled variant of the filter without the upper depth bound.
#snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0))[0]
# Indices of records where column 0 is called, QUAL > 30 and 0 < DP < DPthres.
snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0]
# Strip the literal "Chr" (case-sensitive) and cast to int8.
# NOTE(review): the cast presumably fails for any kept record whose
# chromosome name is not "Chr<number>" -- confirm the inputs.
snpCHR = numpy.array(numpy.chararray.replace(vcf.CHROM[snpsREQ], "Chr", "")).astype("int8")
snpPOS = numpy.array(vcf.POS[snpsREQ])
snpGT = vcfD.GT[snpsREQ, 0]   ## since one sample
# PL and DP values of the kept records (column 0 of the call data for PL).
snpPL = vcfD.PL[snpsREQ, 0]
snpDP = vcf.DP[snpsREQ]
# Script fragment: dump the `variants` array as a tab-separated table, then
# start writing an HDF5 file with a variant index and per-call depth and
# allele datasets.  `variants`, `flatten`, `names_from_dtype` and
# `flatten_numpy_line` are defined outside this fragment.
out_file = 'panoptes_ready_vcf_data/datatables/variants/data'
with open(out_file, 'w') as f:
    # Header row: the flattened field names of the variants dtype.
    f.write('\t'.join(flatten(names_from_dtype(variants.dtype))))
    f.write('\n')
    # One tab-separated row per variant record.
    for line in variants:
        f.write('\t'.join(map(str, flatten_numpy_line(line))))
        f.write('\n')
import h5py
out = h5py.File('data.hdf5', 'w')
variants_out = out.create_dataset("variant_index", variants.shape, 'S20', maxshape=variants.shape, compression='gzip', fletcher32=False, shuffle=False)
# Index key per variant: "<CHROM>_<POS zero-padded to 10 characters>".
# NOTE(review): the dataset dtype is 'S20', so a CHROM longer than 9
# characters produces a key that does not fit -- confirm the CHROM lengths.
for i in xrange(len(variants)):
    variants_out[i] = variants['CHROM'][i] + '_' + str(variants['POS'][i]).zfill(10)
print 'Parsing genotypes'
# Load only the DP and GT call fields from the hard-coded VCF.
c = vcfnp.calldata_2d('ag1000g.phase1.AR1.Y_unplaced.PASS.vcf.gz', fields=['DP', 'GT'])
depth = c['DP']
genotypes = c['GT']
# Create the three output datasets with szip compression first; if that
# raises ValueError, create the same datasets with gzip instead.
# (Presumably ValueError signals that szip is unavailable in this HDF5
# build -- confirm.)
try:
    depth_out = out.create_dataset("total_depth", depth.shape, depth.dtype, maxshape=depth.shape, compression='szip', fletcher32=False, shuffle=False)
    first_allele = out.create_dataset("first_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='szip', fletcher32=False, shuffle=False)
    second_allele = out.create_dataset("second_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='szip', fletcher32=False, shuffle=False)
except ValueError:
    depth_out = out.create_dataset("total_depth", depth.shape, depth.dtype, maxshape=depth.shape, compression='gzip', fletcher32=False, shuffle=False)
    first_allele = out.create_dataset("first_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='gzip', fletcher32=False, shuffle=False)
    second_allele = out.create_dataset("second_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='gzip', fletcher32=False, shuffle=False)
# Parse each "a/b" genotype string into its two allele parts.
# NOTE(review): the split assumes exactly one '/' per genotype; a phased
# "a|b" value would not unpack -- confirm the input.
for i in xrange(genotypes.shape[0]):
    for j in xrange(genotypes.shape[1]):
        a,b = genotypes[i,j].split('/')
def _show_histogram(figure_number, values, title, x_label):
    """Draw one histogram on its own figure, show it, return (figure, axes)."""
    figure = plt.figure(figure_number)
    axes = figure.add_subplot(111)
    axes.hist(values)
    axes.set_title(title)
    axes.set_xlabel(x_label)
    plt.show()
    return figure, axes


# ---- per-variant fields -------------------------------------------------
v = vcfnp.variants(filename, cache=True).view(np.recarray)

# report a few simple variant metrics
snp_total = np.count_nonzero(v.is_snp)
print('found %s variants (%s SNPs)' % (v.size, snp_total))
qual_mean = np.mean(v.QUAL)
qual_std = np.std(v.QUAL)
print('QUAL mean (std): %s (%s)' % (qual_mean, qual_std))

# histogram of variant depth
fig, ax = _show_histogram(1, v.DP, 'DP histogram', 'DP')

# ---- per-sample call fields ---------------------------------------------
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)

# report a few simple genotype metrics
count_phased = np.count_nonzero(c.is_phased)
# a call is "variant" when any of its allele indices is above zero
count_variant = np.count_nonzero(np.any(c.genotype > 0, axis=2))
count_missing = np.count_nonzero(~c.is_called)
call_summary = (c.flatten().size, count_phased, count_variant, count_missing)
print('calls (phased, variant, missing): %s (%s, %s, %s)' % call_summary)

# histogram of genotype quality
fig, ax = _show_histogram(2, c.GQ.flatten(), 'GQ histogram', 'GQ')
def readVcf(inFile):
    """Load a VCF with vcfnp (cached) and return both tables as recarrays.

    :param inFile: path of the VCF file.
    :return: tuple ``(variants, calldata_2d)``, each viewed as
        ``np.recarray``.
    """
    variant_table = vcfnp.variants(inFile, cache=True)
    variant_table = variant_table.view(np.recarray)
    call_table = vcfnp.calldata_2d(inFile, cache=True)
    call_table = call_table.view(np.recarray)
    return variant_table, call_table
import numpy as np
import vcfnp
import os

# Load the per-call data of the VCF (cached) and collapse every call to the
# sum of its allele indices; the result is the transpose of the call table.
filename = "comt.chr06.snp.full.final.vcf"
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)
G = c.genotype.T.sum(axis=0)
# A sum of -2 is recoded as 3, which the counts below treat as missing.
G[G == -2] = 3


# In[7]:

n, m = G.shape
nb_data = n * m
nb_missing_data = np.count_nonzero(G == 3)
nb_0, nb_1, nb_2 = [np.count_nonzero(G == code) for code in (0, 1, 2)]

# Print the total, the missing count and the count of each genotype code.
for template, value in (
    ("nb data : {0}", nb_data),
    ("nb missing data : {0}", nb_missing_data),
    ("nb 0 : {0}", nb_0),
    ("nb 1 : {0}", nb_1),
    ("nb 2 : {0}", nb_2),
):
    print(template.format(value))


# In[8]:

# Fraction of missing calls (shown as the cell output in the notebook).
float(nb_missing_data) / nb_data


# In[9]: