def test_condition():
    """Check that a boolean ``condition`` array restricts the rows loaded."""
    vcf_path = 'fixture/sample.vcf'

    # Unfiltered load: nine variants are expected from the fixture.
    all_variants = variants(vcf_path)
    eq_(9, len(all_variants))

    # The PASS flags serve as the condition for both loaders; five rows
    # are expected to remain in each result.
    filtered_calls = calldata(vcf_path,
                              condition=all_variants['FILTER']['PASS'])
    eq_(5, len(filtered_calls))

    filtered_variants = variants(vcf_path,
                                 condition=all_variants['FILTER']['PASS'])
    eq_(5, len(filtered_variants))
def readVcf(inFile, logDebug):
    """Read a VCF and return filtered SNP positions, genotypes and weights.

    Only the first sample column (index 0) of the call data is used.  A
    record is kept when that sample is called, ``QUAL > 30``, the variant
    ``DP`` is greater than 0 and below four times ``DPmean``, and the
    chromosome name -- lower-cased, with ``"chr"`` removed -- consists of
    digits only.

    Parameters
    ----------
    inFile
        VCF path handed to ``vcfnp.variants`` and ``vcfnp.calldata_2d``.
    logDebug
        When false, anything written to ``sys.stderr`` while vcfnp loads
        the file is captured in a throw-away buffer.

    Returns
    -------
    tuple
        ``(DPmean, snpCHR, snpPOS, snpGT, snpWEI)`` where ``DPmean`` is the
        mean of the call-data ``DP`` rows selected by ``DP > 0``,
        ``snpCHR``/``snpPOS``/``snpGT`` describe the retained records, and
        ``snpWEI`` is ``exp(PL / -10)`` when a ``PL`` field exists, or
        otherwise a 0/1 matrix with three columns derived from
        ``parseGT(snpGT)``.
    """
    log.info("reading the VCF file")

    # Remember whatever stream is installed now so it can be put back even
    # if vcfnp raises; the caller may already have redirected stderr.
    saved_stderr = sys.stderr
    if not logDebug:
        sys.stderr = StringIO.StringIO()
    try:
        vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
        vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
    finally:
        sys.stderr = saved_stderr

    # The depth cut-off is four times the mean; scaling by 4 is exact in
    # floating point, so computing the mean first gives the same values.
    DPmean = np.mean(vcfD.DP[np.where(vcfD.DP > 0)[0]])
    DPthres = DPmean * 4

    # Normalise chromosome names ("Chr1", "chr1", "1" all become "1").
    snpCHROM = np.char.replace(np.char.lower(vcf.CHROM), "chr", "")
    snpsREQ = np.where(
        (vcfD.is_called[:, 0])
        & (vcf.QUAL > 30)
        & (vcf.DP > 0)
        & (vcf.DP < DPthres)
        & (np.char.isdigit(snpCHROM))
    )[0]

    # NOTE(review): int8 cannot hold chromosome numbers above 127 -- confirm
    # that the expected inputs stay within that range.
    snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
    snpPOS = np.array(vcf.POS[snpsREQ])

    try:
        snpGT = np.array(vcfD.GT[snpsREQ, 0])
    except AttributeError:
        die("input VCF file doesnt have required GT field")

    try:
        snpPL = vcfD.PL[snpsREQ, 0]
    except AttributeError:
        # No PL field: fall back to a 0/1 matrix built from the genotype
        # code.  Code 0 keeps column 0, code 1 keeps column 2 and code 2
        # keeps column 1; every other entry is zeroed.
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))
        snpWEI[np.where(snpBinary != 0), 0] = 0
        snpWEI[np.where(snpBinary != 1), 2] = 0
        snpWEI[np.where(snpBinary != 2), 1] = 0
    else:
        # astype() already returns a fresh array, so no explicit copy.
        snpWEI = np.exp(snpPL.astype(float) / (-10))

    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
def test_condition():
    """Loading with ``condition=`` should keep only the flagged records."""
    fixture = "fixture/sample.vcf"

    unfiltered = variants(fixture)
    eq_(9, len(unfiltered))

    # Same PASS mask, applied first to the call data ...
    calls = calldata(fixture, condition=unfiltered["FILTER"]["PASS"])
    eq_(5, len(calls))

    # ... and then to the variants themselves.
    kept = variants(fixture, condition=unfiltered["FILTER"]["PASS"])
    eq_(5, len(kept))
def test_missing_info_definition():
    """Exercise INFO fields that the VCF header does not declare."""
    vcf_path = 'fixture/test14.vcf'

    # DP has no header definition, so by default it is loaded as a string.
    untyped = variants(vcf_path, fields=['DP'])
    eq_('14', untyped[2]['DP'])

    # Supplying the VCF type explicitly yields an integer instead.
    typed = variants(vcf_path, fields=['DP'], vcf_types={'DP': 'Integer'})
    eq_(14, typed[2]['DP'])

    # A field that is not present at all gets the string missing value.
    absent = variants(vcf_path, fields=['FOO'])
    eq_('.', absent[2]['FOO'])
def test_condition():
    """Check ``condition=`` filtering for calldata, info and variants."""
    vcf_path = 'fixture/sample.vcf'

    everything = variants(vcf_path)
    eq_(9, len(everything))

    # Each loader receives the PASS flags and is expected to return the
    # same five records.
    passing_calls = calldata(vcf_path,
                             condition=everything['FILTER']['PASS'])
    eq_(5, len(passing_calls))

    passing_info = info(vcf_path, condition=everything['FILTER']['PASS'])
    eq_(5, len(passing_info))

    passing_variants = variants(vcf_path,
                                condition=everything['FILTER']['PASS'])
    eq_(5, len(passing_variants))
def test_missing_info_definition():
    """INFO fields missing from the header should still be loadable."""
    fixture = "fixture/test14.vcf"
    row = 2

    # Without a header definition DP comes back as a string.
    as_string = variants(fixture, fields=["DP"])
    eq_("14", as_string[row]["DP"])

    # An explicit vcf_types entry turns it into an integer.
    as_integer = variants(fixture, fields=["DP"],
                          vcf_types={"DP": "Integer"})
    eq_(14, as_integer[row]["DP"])

    # A field that never occurs yields the empty string.
    never_present = variants(fixture, fields=["FOO"])
    eq_("", never_present[row]["FOO"])
def test_variants_transformers():
    """Check the parsed EFF sub-fields, with explicit and default settings."""
    # Sub-field names in the order they are checked for every variant.
    columns = (
        "Effect",
        "Effect_Impact",
        "Functional_Class",
        "Codon_Change",
        "Amino_Acid_Change",
        "Amino_Acid_Length",
        "Gene_Name",
        "Transcript_BioType",
        "Gene_Coding",
        "Transcript_ID",
        "Exon",
    )
    # One tuple of expected values per variant, aligned with ``columns``.
    expected_rows = (
        ("STOP_GAINED", "HIGH", "NONSENSE", "Cag/Tag", "Q236*", 749,
         "NOC2L", ".", 1, "NM_015658", -1),
        ("NON_SYNONYMOUS_CODING", "MODERATE", "MISSENSE", "gTt/gGt",
         "V155G", -1, "PF3D7_0108900", ".", -1, "rna_PF3D7_0108900-1", 1),
        (".", ".", ".", ".", ".", -1, ".", ".", -1, ".", -1),
    )

    def _test(V):
        for index, expected in enumerate(expected_rows):
            for column, value in zip(columns, expected):
                eq_(value, V["EFF"][column][index])

    # First with the EFF dtype, arity and transformer spelled out.
    explicit = variants(
        "fixture/test12.vcf",
        dtypes={"EFF": EFF_DEFAULT_DTYPE},
        arities={"EFF": 1},
        transformers={"EFF": eff_default_transformer()},
    )
    _test(explicit)

    # Then relying on the defaults, which should give the same result.
    implicit = variants("fixture/test12.vcf")
    _test(implicit)
def test_error_handling():
    """Unusable inputs should raise the expected exception type."""
    cases = (
        # a directory rather than a file
        ('.', ValueError),
        # a path that does not exist
        ('doesnotexist', ValueError),
        # a file that is nothing like a VCF (no header etc.)
        ('fixture/test48a.vcf', RuntimeError),
    )
    for vcf_fn, expected_error in cases:
        with assert_raises(expected_error):
            vcfnp.variants(vcf_fn)
def test_svlen():
    """Load the svlen fixture with an explicit arity of 2.

    No assertions are active; the test only checks that loading succeeds.
    """
    # Disabled expectations for the default-arity load:
    #   V = variants('fixture/test13.vcf').view(np.recarray)
    #   assert hasattr(V, 'svlen')
    #   eq_(0, V.svlen[0])
    #   eq_(1, V.svlen[1])
    #   eq_(-1, V.svlen[2])
    #   eq_(3, V.svlen[3])
    #   eq_(3, V.svlen[4])
    loaded = variants("fixture/test13.vcf", arities={"svlen": 2})
    V = loaded.view(np.recarray)
def test_variants():
    """Spot-check the fixed fields of the sample VCF loaded by ``variants``."""
    a = variants('fixture/sample.vcf', arities={'ALT': 2})

    # Parenthesised so the line parses on Python 3 as well as Python 2; with
    # a single argument the output is the same on both.
    print(repr(a))

    eq_(9, len(a))

    # Fixed fields
    eq_('19', a[0]['CHROM'])
    eq_(111, a[0]['POS'])
    eq_('rs6054257', a[2]['ID'])
    eq_('A', a[0]['REF'])
    eq_('ATG', a[8]['ALT'][1])
    eq_(10.0, a[1]['QUAL'])

    # FILTER is a nested record with one boolean per filter name
    eq_(True, a[2]['FILTER']['PASS'])
    eq_(False, a[3]['FILTER']['PASS'])
    eq_(True, a[3]['FILTER']['q10'])

    # Computed fields
    eq_(2, a[0]['num_alleles'])
    eq_(False, a[5]['is_snp'])
def build_arrays(vcf_fn, region, samples=None, force=False):
    """Build the ``variants`` and ``info`` ``.npy`` arrays for one region.

    Each array is written next to the VCF as
    ``<vcf_fn>.<region>.<kind>.npy``, where ``-`` and ``:`` in the region
    are replaced by ``_``.  (The previous ``replace('-:', '_')`` call only
    matched the two-character sequence ``-:``, so regions such as
    ``20:100-200`` were left unchanged.)

    Parameters
    ----------
    vcf_fn
        Path of the VCF file to read; also the prefix of the output names.
    region
        Region string passed through to vcfnp.
    samples
        Accepted for interface compatibility; not used here.
    force
        Rebuild an array even when its ``.npy`` file already exists.
    """
    prefix = vcf_fn + '.' + region.replace('-', '_').replace(':', '_')

    _build_if_missing(
        prefix + '.variants.npy',
        force,
        lambda: vcfnp.variants(vcf_fn, region=region, progress=20000),
    )
    _build_if_missing(
        prefix + '.info.npy',
        force,
        lambda: vcfnp.info(
            vcf_fn,
            region=region,
            progress=20000,
            fields=['NS', 'UQ', 'CODING', 'DP', 'AD'],
            vcf_types={'DP': 'Integer', 'AD': 'Integer'},
            arities={'AD': 2},
        ),
    )


def _build_if_missing(array_fn, force, load):
    """Save ``load()`` to ``array_fn`` unless it exists and ``force`` is false."""
    # sys.stderr.write works on both Python 2 and 3, unlike ``print >>``.
    if force or not os.path.exists(array_fn):
        sys.stderr.write('building %s\n' % array_fn)
        np.save(array_fn, load())
    else:
        sys.stderr.write('skipping %s\n' % array_fn)
def run_vcfnp(sample='7G8', file_prefix='WG'):
    """Load the variants and call data of one validation-sample VCF.

    The VCF path is built from ``INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT``
    with ``(file_prefix, sample)``.  Both arrays are loaded with
    ``cache=True``.

    Returns
    -------
    tuple
        ``(v, c)``: the ``vcfnp.variants`` array and the
        ``vcfnp.calldata_2d`` array (fields ``GT`` and ``AD``).
    """
    vcf_fn = INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT % (file_prefix, sample)

    variant_arities = {
        'ALT': 2,
        'AF': 2,
        'AC': 2,
        'MLEAF': 2,
        'MLEAC': 2,
        'RPA': 3,
    }
    # 'VariantType' used to be listed twice ('a40', then 'a60'); the later
    # entry silently won, so only the effective value is kept.
    variant_dtypes = {
        'REF': 'a400',
        'ALT': 'a400',
        'RegionType': 'a25',
        'VariantType': 'a60',
        'RU': 'a40',
        'set': 'a40',
        'SNPEFF_AMINO_ACID_CHANGE': 'a20',
        'SNPEFF_CODON_CHANGE': 'a20',
        'SNPEFF_EFFECT': 'a33',
        'SNPEFF_EXON_ID': 'a2',
        'SNPEFF_FUNCTIONAL_CLASS': 'a8',
        'SNPEFF_GENE_BIOTYPE': 'a14',
        'SNPEFF_GENE_NAME': 'a20',
        'SNPEFF_IMPACT': 'a8',
        'SNPEFF_TRANSCRIPT_ID': 'a20',
        'culprit': 'a14',
    }

    v = vcfnp.variants(
        vcf_fn=vcf_fn,
        progress=10000,
        arities=variant_arities,
        dtypes=variant_dtypes,
        cache=True,
    )
    c = vcfnp.calldata_2d(
        vcf_fn=vcf_fn,
        progress=10000,
        fields=['GT', 'AD'],
        arities={'AD': 3},
        cache=True,
    )

    # Report the largest allele count and the longest REF string, which
    # indicate whether the arities and string widths above are sufficient.
    print(sample, max(v['num_alleles']), max(len(x) for x in v['REF']))
    return v, c
def test_variants_transformers():
    """Check the EFF sub-fields produced by the default EFF transformer."""
    V = variants('fixture/test12.vcf',
                 dtypes={'EFF': EFF_DEFAULT_DTYPE},
                 arities={'EFF': 1},
                 transformers={'EFF': eff_default_transformer()})

    # Sub-field names in the order they are checked for every variant.
    subfields = [
        'Effect', 'Effect_Impact', 'Functional_Class', 'Codon_Change',
        'Amino_Acid_Change', 'Amino_Acid_Length', 'Gene_Name',
        'Transcript_BioType', 'Gene_Coding', 'Transcript_ID', 'Exon',
    ]
    # Expected values for variants 0, 1 and 2, aligned with ``subfields``.
    expectations = [
        ['STOP_GAINED', 'HIGH', 'NONSENSE', 'Cag/Tag', 'Q236*', 749,
         'NOC2L', '.', 1, 'NM_015658', -1],
        ['NON_SYNONYMOUS_CODING', 'MODERATE', 'MISSENSE', 'gTt/gGt',
         'V155G', -1, 'PF3D7_0108900', '.', -1, 'rna_PF3D7_0108900-1', 1],
        ['.', '.', '.', '.', '.', -1, '.', '.', -1, '.', -1],
    ]

    for row, wanted in enumerate(expectations):
        for name, value in zip(subfields, wanted):
            eq_(value, V['EFF'][name][row])
def test_variants():
    """Spot-check fixed, computed and INFO fields of the sample VCF."""
    a = variants("fixture/sample.vcf", arities={"ALT": 2, "AC": 2})

    # Parenthesised so the line parses on Python 3 as well as Python 2; with
    # a single argument the output is the same on both.
    print(repr(a))

    eq_(9, len(a))

    # Fixed fields
    eq_("19", a[0]["CHROM"])
    eq_(111, a[0]["POS"])
    eq_("rs6054257", a[2]["ID"])
    eq_("A", a[0]["REF"])
    eq_("ATG", a[8]["ALT"][1])
    eq_(10.0, a[1]["QUAL"])

    # FILTER is a nested record with one boolean per filter name
    eq_(True, a[2]["FILTER"]["PASS"])
    eq_(False, a[3]["FILTER"]["PASS"])
    eq_(True, a[3]["FILTER"]["q10"])

    # Computed fields
    eq_(2, a[0]["num_alleles"])
    eq_(False, a[5]["is_snp"])

    # INFO fields
    eq_(3, a[2]["NS"])
    eq_(0.5, a[2]["AF"])
    eq_(True, a[2]["DB"])
    eq_((3, 1), tuple(a[6]["AC"]))
def work(self):
    """Plot histograms of per-site allele frequencies and save the figure.

    Sites are kept when ``ALT`` differs from ``b'<NON_REF>'`` and the
    variant ``DP`` exceeds ``self.DP_thresh``.  The VCF is read from
    ``self.input().path`` and the figure is written to
    ``self.output().path``.
    """
    import vcfnp
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    site_table = vcfnp.variants(self.input().path)
    call_table = vcfnp.calldata_2d(self.input().path)

    # Boolean mask of the sites to include.
    is_real_alt = site_table['ALT'] != b'<NON_REF>'
    is_deep_enough = site_table['DP'] > self.DP_thresh
    keep = np.logical_and(is_real_alt, is_deep_enough)

    # AD values of the first sample, sorted per site in descending order.
    first_sample_ad = call_table['AD'][keep][:, 0, :]
    ranked = np.sort(first_sample_ad, axis=1)[:, ::-1]
    fractions = ranked / call_table['DP'][keep]

    # Whatever is not accounted for by the listed counts gets its own column.
    remainder = 1 - np.sum(fractions, axis=1)
    table = pd.DataFrame(np.hstack((fractions, remainder.reshape((-1, 1)))))

    # Zero frequencies are blanked out so they are not drawn.
    table[table == 0] = float('nan')
    table.hist(sharex=True, sharey=True, range=(0, 1), bins=20)

    figure = plt.gcf()
    figure.suptitle(self.library, fontsize=20)
    figure.text(0.5, 0.04, 'Allele frequency', ha='center')
    figure.text(0.02, 0.5, 'Counts', va='center', rotation='vertical')
    figure.savefig(self.output().path)
def test_caching():
    """Arrays loaded with ``cache=True`` must equal the saved cache file."""
    vcf_fn = "fixture/sample.vcf.gz"
    loaders = (
        ("variants", variants),
        ("calldata", calldata),
        ("calldata_2d", calldata_2d),
    )

    for array_type, load in loaders:
        cache_fn = vcfnp._mk_cache_fn(vcf_fn, array_type=array_type)

        # Start from a clean slate so the loader has to write the cache.
        if os.path.exists(cache_fn):
            os.remove(cache_fn)

        fresh = load(vcf_fn, cache=True, verbose=True)
        cached = np.load(cache_fn)
        assert np.all(fresh == cached)
def run_vcfnp_all(chrom='Pf3D7_01_v3'):
    """Load (and cache) the variants array of one chromosome's outgroup VCF.

    The VCF path comes from ``OUTGROUP_VCF_FORMAT % chrom`` and the cache
    directory from ``FULL_NPY_FORMAT % chrom``.  The loaded array is not
    returned; the call is made with ``cache=True``.

    Returns
    -------
    int
        Always ``0``.
    """
    vcf_fn = OUTGROUP_VCF_FORMAT % chrom

    arities = {
        'ALT': 6,
        'AF': 6,
        'AC': 6,
        'MLEAF': 6,
        'MLEAC': 6,
        'RPA': 7,
    }
    # REF and ALT widths are deliberately left to vcfnp here
    # (previously commented out as 'a100').
    # 'VariantType' used to be listed twice ('a40', then 'a60'); the later
    # entry silently won, so only the effective value is kept.
    dtypes = {
        'RegionType': 'a25',
        'VariantType': 'a60',
        'RU': 'a40',
        'set': 'a40',
        'SNPEFF_AMINO_ACID_CHANGE': 'a20',
        'SNPEFF_CODON_CHANGE': 'a20',
        'SNPEFF_EFFECT': 'a33',
        'SNPEFF_EXON_ID': 'a2',
        'SNPEFF_FUNCTIONAL_CLASS': 'a8',
        'SNPEFF_GENE_BIOTYPE': 'a14',
        'SNPEFF_GENE_NAME': 'a20',
        'SNPEFF_IMPACT': 'a8',
        'SNPEFF_TRANSCRIPT_ID': 'a20',
        'culprit': 'a14',
    }

    # The result is discarded: the array is not needed by the caller.
    vcfnp.variants(
        vcf_fn=vcf_fn,
        progress=10000,
        arities=arities,
        dtypes=dtypes,
        cache=True,
        cachedir=FULL_NPY_FORMAT % chrom,
    )
    return 0
def run_vcfnp(
    annotated_vcf_fn='/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/consensus_alignment/nucmer/7G8.3d7coordinates.annoatated.vcf'
):
    """Load the variants array of an annotated VCF, using the vcfnp cache.

    Parameters
    ----------
    annotated_vcf_fn
        Path of the VCF to load.

    Returns
    -------
    The array returned by ``vcfnp.variants``.
    """
    arities = {
        'ALT': 2,
        'AF': 2,
        'AC': 2,
        'MLEAF': 2,
        'MLEAC': 2,
        'RPA': 3,
    }
    # 'VariantType' used to be listed twice ('a40', then 'a60'); the later
    # entry silently won, so only the effective value is kept.
    dtypes = {
        'REF': 'a400',
        'ALT': 'a400',
        'RegionType': 'a25',
        'VariantType': 'a60',
        'RU': 'a40',
        'set': 'a40',
        'SNPEFF_AMINO_ACID_CHANGE': 'a20',
        'SNPEFF_CODON_CHANGE': 'a20',
        'SNPEFF_EFFECT': 'a33',
        'SNPEFF_EXON_ID': 'a2',
        'SNPEFF_FUNCTIONAL_CLASS': 'a8',
        'SNPEFF_GENE_BIOTYPE': 'a14',
        'SNPEFF_GENE_NAME': 'a20',
        'SNPEFF_IMPACT': 'a8',
        'SNPEFF_TRANSCRIPT_ID': 'a20',
        'culprit': 'a14',
    }

    v = vcfnp.variants(
        vcf_fn=annotated_vcf_fn,
        progress=10000,
        arities=arities,
        dtypes=dtypes,
        cache=True,
    )
    return v
def test_error_handling():
    """Unusable inputs should raise the expected exception type."""

    def _expect(error, loader, vcf_fn):
        with assert_raises(error):
            loader(vcf_fn)

    # a directory rather than a file
    _expect(ValueError, vcfnp.variants, '.')

    # a path that does not exist
    _expect(ValueError, vcfnp.variants, 'doesnotexist')

    # a file that is nothing like a VCF (no header etc.)
    _expect(RuntimeError, vcfnp.variants, 'fixture/test48a.vcf')

    # a file with more sample columns than the header row declares
    _expect(RuntimeError, vcfnp.calldata, 'fixture/test48b.vcf')
from __future__ import print_function, division import numpy as np import matplotlib.pyplot as plt import vcfnp vcfnp.__version__ filename = 'fixture/sample.vcf' # load data from fixed fields (including INFO) v = vcfnp.variants(filename, cache=True).view(np.recarray) # print some simple variant metrics print('found %s variants (%s SNPs)' % (v.size, np.count_nonzero(v.is_snp))) print('QUAL mean (std): %s (%s)' % (np.mean(v.QUAL), np.std(v.QUAL))) # plot a histogram of variant depth fig = plt.figure(1) ax = fig.add_subplot(111) ax.hist(v.DP) ax.set_title('DP histogram') ax.set_xlabel('DP') plt.show() # load data from sample columns c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray) # print some simple genotype metrics count_phased = np.count_nonzero(c.is_phased) count_variant = np.count_nonzero(np.any(c.genotype > 0, axis=2)) count_missing = np.count_nonzero(~c.is_called) print('calls (phased, variant, missing): %s (%s, %s, %s)'
def readVcf(inFile):
    """Load a VCF's variants and 2-D call data as record arrays.

    Both arrays are loaded with ``cache=True`` and viewed as
    ``np.recarray``.

    Returns
    -------
    tuple
        ``(variants, calldata_2d)`` record arrays.
    """

    def _as_recarray(loader):
        return loader(inFile, cache=True).view(np.recarray)

    return (_as_recarray(vcfnp.variants), _as_recarray(vcfnp.calldata_2d))
def test_override_vcf_types():
    """A ``vcf_types`` entry should change the type a field is loaded as."""
    fixture = "fixture/test4.vcf"
    field = "MQ0FractionTest"

    # With the default type the value at row 2 loads as 0.
    default_typed = variants(fixture)
    eq_(0, default_typed[field][2])

    # Declared as Float, the same value is read as roughly 0.03.
    float_typed = variants(fixture, vcf_types={field: "Float"})
    assert_almost_equal(0.03, float_typed[field][2])
def test_variants_slice():
    """``slice=(0, None, 2)`` should keep every second record."""
    vcf_path = "fixture/sample.vcf.gz"
    marker_id = "rs6054257"

    # Unsliced, the marker record sits at index 2 ...
    everything = variants(vcf_path)
    eq_(marker_id, everything["ID"][2])

    # ... and with a step of two it moves to index 1.
    every_other = variants(vcf_path, slice=(0, None, 2))
    eq_(marker_id, every_other["ID"][1])
def test_variants_exclude_fields():
    """Fields listed in ``exclude_fields`` must be absent from the dtype."""
    excluded = ["ID", "FILTER"]
    loaded = variants("fixture/sample.vcf", exclude_fields=excluded)

    # A field that was not excluded is still there.
    assert "CHROM" in loaded.dtype.names
    for name in excluded:
        assert name not in loaded.dtype.names
def test_variants_count():
    """``count=3`` should limit the result to three records."""
    limit = 3
    limited = variants("fixture/sample.vcf", count=limit)
    eq_(3, len(limited))
from compiler.ast import flatten import numpy def mkdir_p(path): try: os.makedirs(path) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise vcf_file = "ag1000g.phase1.AR1.Y_unplaced.PASS.vcf.gz" print 'Parsing variants' variants = vcfnp.variants(vcf_file) #Recursivly get the names of the columns def names_from_dtype(dtype, path=''): if dtype.names: #dtypes don't support iter.... yes. I know. dtype_as_list = [dtype[i] for i in xrange(len(dtype))] return [names_from_dtype(inner_dtype, path+'.'+name if path else name) for name, inner_dtype in zip(dtype.names, dtype_as_list)] elif dtype.kind == 'V': #a vector with no names! We will have to just number them... return [path+'.'+str(i) if path else str(i) for i in range(dtype.shape[0])] else: return path def flatten_numpy_line(line): for entry in line:
def profile():
    """Load variants from the VCF named on the command line.

    ``sys.argv[1]`` is the VCF path and ``sys.argv[2]`` the ``count``
    passed to ``vcfnp.variants``.  The loaded array is discarded.
    """
    vcf_fn = sys.argv[1]
    count = int(sys.argv[2])
    vcfnp.variants(vcf_fn, count=count)
def test_variants_slice():
    """Check that a stepped ``slice`` shifts records to the expected index."""
    fixture = 'fixture/sample.vcf.gz'

    # (extra keyword arguments, index at which the marker ID is expected)
    scenarios = [
        ({}, 2),
        ({'slice': (0, None, 2)}, 1),
    ]
    for extra_kwargs, index in scenarios:
        loaded = variants(fixture, **extra_kwargs)
        eq_('rs6054257', loaded['ID'][index])
def test_explicit_pass_definition():
    """Loading a VCF with an explicit PASS FILTER definition must not fail."""
    fixture = "fixture/test16.vcf"
    V = variants(fixture)
def test_duplicate_field_definitions():
    """Duplicate field definitions should be tolerated by both loaders."""
    fixture = "fixture/test10.vcf"

    # Neither call should raise; a message on stderr is expected instead.
    V = variants(fixture)
    C = calldata(fixture)
def profile():
    """Load variants, then the call data restricted to PASS records.

    ``sys.argv[1]`` is the VCF path and ``sys.argv[2]`` the ``count``
    passed to ``vcfnp.variants``.  Both results are discarded.
    """
    vcf_fn = sys.argv[1]
    limit = int(sys.argv[2])
    loaded = vcfnp.variants(vcf_fn, count=limit)

    # Re-read the path from argv, as the original does, before filtering.
    passing = loaded['FILTER']['PASS']
    vcfnp.calldata(sys.argv[1], condition=passing)
def test_variants_flatten_filter():
    """``flatten_filter=True`` should expose ``FILTER_<name>`` fields."""
    flat = variants("fixture/sample.vcf", flatten_filter=True)

    # (expected flag, record index, flattened field name)
    checks = (
        (True, 2, "FILTER_PASS"),
        (False, 3, "FILTER_PASS"),
        (True, 3, "FILTER_q10"),
    )
    for expected, row, field in checks:
        eq_(expected, flat[row][field])
def test_variants_region():
    """Restricting the load to region "20" should yield six records."""
    fixture = "fixture/sample.vcf.gz"
    in_region = variants(fixture, region="20")
    eq_(6, len(in_region))
inOptions.add_option("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file", type="string") inOptions.add_option("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file", type="string") inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string") inOptions.add_option("-r", "--refScore", dest="refScore", help="Output for refined score", type="string") (options, args) = inOptions.parse_args() logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=logging.DEBUG) GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File) #GenotypeData_acc = genotype.load_hdf5_genotype_data(options.hdf5accFile) #num_lines = len(GenotypeData.accessions) logging.info("Reading the VCF file") vcf = vcfnp.variants(options.vcfFile, cache=True).view(numpy.recarray) vcfD = vcfnp.calldata_2d(options.vcfFile, cache=True).view(numpy.recarray) ## Doubtful .... whether there should be a threshold based on just mean of std #DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) + numpy.std(vcf.DP[numpy.where(vcf.DP > 0)[0]]) DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) * 4 print "Threshold for depth is set at: ", DPthres #snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0))[0] snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0] snpCHR = numpy.array(numpy.chararray.replace(vcf.CHROM[snpsREQ], "Chr", "")).astype("int8") snpPOS = numpy.array(vcf.POS[snpsREQ]) snpGT = vcfD.GT[snpsREQ, 0] ## since one sample snpPL = vcfD.PL[snpsREQ, 0] snpDP = vcf.DP[snpsREQ]
def test_variants_region_empty():
    """Regions without any records should produce empty arrays."""
    fixture = "fixture/sample.vcf.gz"

    # First a whole-chromosome region, then a positional range.
    for empty_region in ("18", "19:113-200"):
        loaded = variants(fixture, region=empty_region)
        eq_(0, len(loaded))