Ejemplo n.º 1
0
def readVcf(inFile, logDebug):
    """Read a VCF file and extract filtered SNP positions, genotypes and weights.

    Only the first sample column (index 0) of the call data is used.

    :param inFile: path of the VCF file handed to ``vcfnp``
    :param logDebug: when false, ``sys.stderr`` is temporarily replaced by an
        in-memory buffer while ``vcfnp`` parses the file (presumably to hide
        its progress output); when true, stderr is left alone
    :return: tuple ``(DPmean, snpCHR, snpPOS, snpGT, snpWEI)`` where
        ``DPmean`` is the mean of the call-data DP rows that contain a
        positive depth, ``snpCHR``/``snpPOS``/``snpGT`` describe the SNPs that
        pass the filter, and ``snpWEI`` holds ``exp(-PL / 10)`` when a PL
        field exists, otherwise a 0/1 indicator matrix built from
        ``parseGT(snpGT)``
    """
    log.info("reading the VCF file")
    # Remember whatever stream is currently installed so it can be put back
    # afterwards, including when vcfnp raises part-way through parsing.
    saved_stderr = sys.stderr
    if not logDebug:
        sys.stderr = StringIO.StringIO()
    try:
        vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
        vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
    finally:
        sys.stderr = saved_stderr

    # Depth ceiling: four times the mean depth of rows with positive DP.
    DPmean = np.mean(vcfD.DP[np.where(vcfD.DP > 0)[0]])
    DPthres = DPmean * 4

    # Lower-case and strip "chr" so that names such as "Chr1"/"chr1" become
    # "1"; anything that is not purely numeric afterwards is filtered out.
    snpCHROM = np.char.replace(np.char.lower(vcf.CHROM), "chr", "")
    snpsREQ = np.where((vcfD.is_called[:, 0]) & (vcf.QUAL > 30) & (vcf.DP > 0)
                       & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0]
    snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
    snpPOS = np.array(vcf.POS[snpsREQ])

    try:
        snpGT = np.array(vcfD.GT[snpsREQ, 0])
    except AttributeError:
        die("input VCF file doesnt have required GT field")

    try:
        snpPL = vcfD.PL[snpsREQ, 0]
    except AttributeError:
        # No PL field: fall back to hard 0/1 weights derived from the
        # genotype codes returned by parseGT (defined elsewhere).
        # NOTE(review): codes 0, 1, 2 map to columns 0, 2, 1 respectively --
        # presumably hom-ref, hom-alt, het in PL column order; confirm.
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))
        snpWEI[np.where(snpBinary != 0), 0] = 0
        snpWEI[np.where(snpBinary != 1), 2] = 0
        snpWEI[np.where(snpBinary != 2), 1] = 0
    else:
        # astype() already returns a fresh float array, so snpPL is untouched.
        snpWEI = np.exp(snpPL.astype(float) / (-10))
    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
def run_vcfnp_calls_all(chrom='Pf3D7_01_v3'):
    """Run ``vcfnp.calldata_2d`` on the outgroup VCF of one chromosome.

    The VCF path comes from ``OUTGROUP_VCF_FORMAT % chrom`` and the cache
    directory from ``FULL_NPY_FORMAT % chrom``. Only the ``AD`` field is
    requested (arity 7) with caching switched on. The resulting array is
    discarded -- presumably the call is made for its on-disk cache.

    :param chrom: chromosome name substituted into both format strings
    :return: always ``0``
    """
    source_vcf = OUTGROUP_VCF_FORMAT % chrom
    call_options = dict(
        vcf_fn=source_vcf,
        progress=10000,
        fields=['AD'],
        arities={'AD': 7},
        cache=True,
        cachedir=FULL_NPY_FORMAT % chrom,
    )
    vcfnp.calldata_2d(**call_options)
    return 0
def run_vcfnp(sample='7G8', file_prefix='WG'):
    """Load variants and call data for one validation sample with caching.

    The VCF path is built from
    ``INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT % (file_prefix, sample)``.
    After loading, a one-line summary is printed: the sample name, the
    largest ``num_alleles`` value and the longest ``REF`` entry.

    :param sample: sample name substituted into the VCF path
    :param file_prefix: file prefix substituted into the VCF path
    :return: tuple ``(v, c)`` -- the ``vcfnp.variants`` array and the
        ``vcfnp.calldata_2d`` array (fields ``GT`` and ``AD``)
    """
    vcf_fn = INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT % (file_prefix, sample)
    v = vcfnp.variants(
        vcf_fn=vcf_fn,
        progress=10000,
        arities={
            'ALT': 2,
            'AF': 2,
            'AC': 2,
            'MLEAF': 2,
            'MLEAC': 2,
            'RPA': 3,
        },
        dtypes={
            'REF': 'a400',
            'ALT': 'a400',
            'RegionType': 'a25',
            # 'VariantType' used to be listed twice ('a40', then 'a60'); a
            # dict literal keeps the last value, so 'a60' is what was in use.
            'VariantType': 'a60',
            'RU': 'a40',
            'set': 'a40',
            'SNPEFF_AMINO_ACID_CHANGE': 'a20',
            'SNPEFF_CODON_CHANGE': 'a20',
            'SNPEFF_EFFECT': 'a33',
            'SNPEFF_EXON_ID': 'a2',
            'SNPEFF_FUNCTIONAL_CLASS': 'a8',
            'SNPEFF_GENE_BIOTYPE': 'a14',
            'SNPEFF_GENE_NAME': 'a20',
            'SNPEFF_IMPACT': 'a8',
            'SNPEFF_TRANSCRIPT_ID': 'a20',
            'culprit': 'a14',
        },
        cache=True
    )
    c = vcfnp.calldata_2d(
        vcf_fn=vcf_fn,
        progress=10000,
        fields=['GT', 'AD'],
        arities={'AD': 3},
        cache=True,
    )
    print(sample, max(v['num_alleles']), max(len(x) for x in v['REF']))
    return (v, c)
Ejemplo n.º 4
0
    def work(self):
        """Plot histograms of sorted allele frequencies and save the figure.

        Reads the VCF at ``self.input().path``, keeps records whose ``ALT`` is
        not ``b'<NON_REF>'`` and whose ``DP`` exceeds ``self.DP_thresh``,
        converts the allele depths at index 0 of the second axis (presumably
        the first sample) into frequencies sorted from high to low, appends
        the remainder ``1 - sum(freqs)`` as an extra column, and writes the
        histograms to ``self.output().path``.
        """
        import vcfnp
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt

        variants = vcfnp.variants(self.input().path)
        calldata_2d = vcfnp.calldata_2d(self.input().path)

        is_real_alt = variants['ALT'] != b'<NON_REF>'
        is_deep_enough = variants['DP'] > self.DP_thresh
        keep = np.logical_and(is_real_alt, is_deep_enough)

        # Sort each row ascending, then reverse so the largest depth is first.
        allele_depths = calldata_2d['AD'][keep][:, 0, :]
        counts = np.sort(allele_depths, axis=1)[:, ::-1]
        freqs = counts / calldata_2d['DP'][keep]
        remainder = 1 - freqs.sum(axis=1)

        table = np.hstack((freqs, remainder.reshape((-1, 1))))
        df = pd.DataFrame(table)
        # Exact zeros become NaN so they do not land in the first bin.
        df[df == 0] = float('nan')

        df.hist(sharex=True, sharey=True, range=(0, 1), bins=20)
        fig = plt.gcf()
        fig.suptitle(self.library, fontsize=20)
        fig.text(0.5, 0.04, 'Allele frequency', ha='center')
        fig.text(0.02, 0.5, 'Counts', va='center', rotation='vertical')
        fig.savefig(self.output().path)
Ejemplo n.º 5
0
def test_caching():
    """Check that each loader's ``cache=True`` output matches its cache file.

    For ``variants``, ``calldata`` and ``calldata_2d`` in turn: remove any
    existing cache file, load the fixture with caching enabled, then reload
    the cache file with ``np.load`` and require the two arrays to be equal.
    """
    vcf_fn = "fixture/sample.vcf.gz"

    def _check_roundtrip(array_type, loader):
        # Start from a clean slate so the loader has to write the cache.
        cache_fn = vcfnp._mk_cache_fn(vcf_fn, array_type=array_type)
        if os.path.exists(cache_fn):
            os.remove(cache_fn)
        loaded = loader(vcf_fn, cache=True, verbose=True)
        from_cache = np.load(cache_fn)
        assert np.all(loaded == from_cache)

    _check_roundtrip("variants", variants)
    _check_roundtrip("calldata", calldata)
    _check_roundtrip("calldata_2d", calldata_2d)
Ejemplo n.º 6
0
def vcf2snp(filename, missing=3, cache=True):
    """
    Build a SNP genotype matrix from a VCF file.

    The call data is loaded with the vcfnp package. A variant is kept only
    when every allele of every call lies in the range -1..1, i.e. the calls
    are 0/0, 0/1, 1/0, 1/1 or missing. The allele axis is then summed away and
    calls whose alleles are all -1 are replaced by ``missing``. Because the
    genotype array is transposed before summing, the axes of the result are
    in reverse order compared with the filtered ``genotype`` array.

    :param filename: The path of the VCF file
    :param missing: Value written for calls whose alleles are all missing
    :param cache: Passed through to ``vcfnp.calldata_2d``
    :type filename: string
    :type missing: np.int8
    :type cache: boolean
    :return: The genotype matrix containing the SNPs
    :rtype: np.array of np.int8

    :Example:

    >>> G = vcf2snp('file.vcf')

    .. warning:: This function is not maintained.
    """

    calls = vcfnp.calldata_2d(filename, cache=cache).view(np.recarray)
    G = calls.genotype

    ## keep a variant only if all of its alleles, for all calls, are in -1..1
    allele_ok = (G >= -1) & (G <= 1)
    keep = allele_ok.all(axis=2).all(axis=1)
    G = G[keep]

    ## calls whose alleles are all -1; computed before the allele axis is summed
    fully_missing = (G == -1).all(axis=2)
    G = np.sum(G.T, axis=0, dtype=np.int8)

    G[fully_missing.T] = missing

    return G
# Script fragment: finish declaring the command-line options, load the
# genotype matrix and the VCF, then select the SNPs used downstream.
# NOTE(review): inOptions and the options that populate options.hdf5File /
# options.vcfFile are declared above this fragment -- confirm there.
inOptions.add_option("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file", type="string")
inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string")
inOptions.add_option("-r", "--refScore", dest="refScore", help="Output for refined score", type="string")

(options, args) = inOptions.parse_args()

logging.basicConfig(format='%(levelname)s:%(asctime)s:  %(message)s', level=logging.DEBUG)

GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File)
#GenotypeData_acc = genotype.load_hdf5_genotype_data(options.hdf5accFile)
#num_lines = len(GenotypeData.accessions)


logging.info("Reading the VCF file")
# Variant-level fields and per-sample call data, both viewed as record arrays
# so that fields can be read as attributes (vcf.DP, vcfD.GT, ...).
vcf = vcfnp.variants(options.vcfFile, cache=True).view(numpy.recarray)
vcfD = vcfnp.calldata_2d(options.vcfFile, cache=True).view(numpy.recarray)


## Doubtful: unclear whether the depth threshold should be based on the mean
## alone or on mean + std (the alternative is kept commented out below).
#DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) + numpy.std(vcf.DP[numpy.where(vcf.DP > 0)[0]])
# Depth ceiling: four times the mean of the positive variant-level DP values.
DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) * 4
print "Threshold for depth is set at: ", DPthres

#snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0))[0]
# Keep sites that are called in column 0, have QUAL > 30 and 0 < DP < DPthres.
snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0]
# Strip the "Chr" prefix and cast to int8; this assumes every remaining
# chromosome name is numeric -- TODO confirm for the input VCFs.
snpCHR = numpy.array(numpy.chararray.replace(vcf.CHROM[snpsREQ], "Chr", "")).astype("int8")
snpPOS = numpy.array(vcf.POS[snpsREQ])
snpGT = vcfD.GT[snpsREQ, 0]   ## column 0 only, since there is one sample
snpPL = vcfD.PL[snpsREQ, 0]
snpDP = vcf.DP[snpsREQ]
Ejemplo n.º 8
0
# Script fragment: dump the `variants` array as a tab-separated table, then
# start writing an HDF5 file with a variant index and per-call datasets.
# NOTE(review): `variants`, flatten, names_from_dtype and flatten_numpy_line
# are defined outside this fragment.
out_file = 'panoptes_ready_vcf_data/datatables/variants/data'
with open(out_file, 'w') as f:
    # Header row derived from the dtype, then one row per variant record.
    f.write('\t'.join(flatten(names_from_dtype(variants.dtype))))
    f.write('\n')
    for line in variants:
        f.write('\t'.join(map(str, flatten_numpy_line(line))))
        f.write('\n')

import h5py
out = h5py.File('data.hdf5', 'w')
# One fixed-width 'S20' key per variant: CHROM + '_' + POS zero-padded to 10.
# NOTE(review): keys longer than 20 bytes (CHROM names over 9 characters) may
# not fit the 'S20' dtype -- confirm against the real CHROM values.
variants_out = out.create_dataset("variant_index", variants.shape, 'S20', maxshape=variants.shape, compression='gzip', fletcher32=False, shuffle=False)
for i in xrange(len(variants)):
    variants_out[i] = variants['CHROM'][i] + '_' + str(variants['POS'][i]).zfill(10)

print 'Parsing genotypes'
c = vcfnp.calldata_2d('ag1000g.phase1.AR1.Y_unplaced.PASS.vcf.gz', fields=['DP', 'GT'])
depth = c['DP']
genotypes = c['GT']
# Try szip compression first and fall back to gzip when h5py raises
# ValueError (presumably because szip is unavailable in this HDF5 build).
# NOTE(review): if an szip create_dataset succeeds before a later one fails,
# the fallback re-creates a dataset name that already exists.
try:
    depth_out = out.create_dataset("total_depth", depth.shape, depth.dtype, maxshape=depth.shape, compression='szip', fletcher32=False, shuffle=False)
    first_allele = out.create_dataset("first_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='szip', fletcher32=False, shuffle=False)
    second_allele = out.create_dataset("second_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='szip', fletcher32=False, shuffle=False)
except ValueError:
    depth_out = out.create_dataset("total_depth", depth.shape, depth.dtype, maxshape=depth.shape, compression='gzip', fletcher32=False, shuffle=False)
    first_allele = out.create_dataset("first_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='gzip', fletcher32=False, shuffle=False)
    second_allele = out.create_dataset("second_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='gzip', fletcher32=False, shuffle=False)

# Split each "a/b" genotype string into its two allele strings.
# NOTE(review): a and b are not used within the visible part of this snippet.
for i in xrange(genotypes.shape[0]):
    for j in xrange(genotypes.shape[1]):
        a,b = genotypes[i,j].split('/')
Ejemplo n.º 9
0
def _show_histogram(figure_number, values, title, xlabel):
    """Draw one histogram in its own numbered figure, show it, return (fig, ax)."""
    figure = plt.figure(figure_number)
    axes = figure.add_subplot(111)
    axes.hist(values)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    plt.show()
    return figure, axes


# variant-level fields, viewed as a record array for attribute access
v = vcfnp.variants(filename, cache=True).view(np.recarray)

# report some simple variant metrics
snp_count = np.count_nonzero(v.is_snp)
print('found %s variants (%s SNPs)' % (v.size, snp_count))
qual_mean = np.mean(v.QUAL)
qual_std = np.std(v.QUAL)
print('QUAL mean (std): %s (%s)' % (qual_mean, qual_std))

# histogram of variant depth (figure 1)
fig, ax = _show_histogram(1, v.DP, 'DP histogram', 'DP')

# per-sample call data
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)

# report some simple genotype metrics
count_phased = np.count_nonzero(c.is_phased)
has_variant_allele = np.any(c.genotype > 0, axis=2)
count_variant = np.count_nonzero(has_variant_allele)
count_missing = np.count_nonzero(~c.is_called)
call_counts = (c.size, count_phased, count_variant, count_missing)
print('calls (phased, variant, missing): %s (%s, %s, %s)'
    % call_counts)

# histogram of genotype quality (figure 2)
fig, ax = _show_histogram(2, c.GQ.flatten(), 'GQ histogram', 'GQ')
Ejemplo n.º 10
0
def readVcf(inFile):
  """Load *inFile* with vcfnp (caching on) and return both arrays as recarrays.

  :param inFile: path of the VCF file
  :return: tuple ``(bvcf, bvcfD)`` -- the ``vcfnp.variants`` array and the
      ``vcfnp.calldata_2d`` array, each viewed as ``np.recarray``
  """
  def _as_recarray(reader):
    return reader(inFile, cache=True).view(np.recarray)

  bvcf = _as_recarray(vcfnp.variants)
  bvcfD = _as_recarray(vcfnp.calldata_2d)
  return (bvcf, bvcfD)
Ejemplo n.º 11
0
# Script fragment: dump the `variants` array as a tab-separated table, then
# start writing an HDF5 file with a variant index and per-call datasets.
# NOTE(review): `variants`, flatten, names_from_dtype and flatten_numpy_line
# are defined outside this fragment.
out_file = 'panoptes_ready_vcf_data/datatables/variants/data'
with open(out_file, 'w') as f:
    # Header row derived from the dtype, then one row per variant record.
    f.write('\t'.join(flatten(names_from_dtype(variants.dtype))))
    f.write('\n')
    for line in variants:
        f.write('\t'.join(map(str, flatten_numpy_line(line))))
        f.write('\n')

import h5py
out = h5py.File('data.hdf5', 'w')
# One fixed-width 'S20' key per variant: CHROM + '_' + POS zero-padded to 10.
# NOTE(review): keys longer than 20 bytes (CHROM names over 9 characters) may
# not fit the 'S20' dtype -- confirm against the real CHROM values.
variants_out = out.create_dataset("variant_index", variants.shape, 'S20', maxshape=variants.shape, compression='gzip', fletcher32=False, shuffle=False)
for i in xrange(len(variants)):
    variants_out[i] = variants['CHROM'][i] + '_' + str(variants['POS'][i]).zfill(10)

print 'Parsing genotypes'
c = vcfnp.calldata_2d('ag1000g.phase1.AR1.Y_unplaced.PASS.vcf.gz', fields=['DP', 'GT'])
depth = c['DP']
genotypes = c['GT']
# Try szip compression first and fall back to gzip when h5py raises
# ValueError (presumably because szip is unavailable in this HDF5 build).
# NOTE(review): if an szip create_dataset succeeds before a later one fails,
# the fallback re-creates a dataset name that already exists.
try:
    depth_out = out.create_dataset("total_depth", depth.shape, depth.dtype, maxshape=depth.shape, compression='szip', fletcher32=False, shuffle=False)
    first_allele = out.create_dataset("first_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='szip', fletcher32=False, shuffle=False)
    second_allele = out.create_dataset("second_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='szip', fletcher32=False, shuffle=False)
except ValueError:
    depth_out = out.create_dataset("total_depth", depth.shape, depth.dtype, maxshape=depth.shape, compression='gzip', fletcher32=False, shuffle=False)
    first_allele = out.create_dataset("first_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='gzip', fletcher32=False, shuffle=False)
    second_allele = out.create_dataset("second_allele", genotypes.shape, 'i1', maxshape=genotypes.shape, compression='gzip', fletcher32=False, shuffle=False)

# Split each "a/b" genotype string into its two allele strings.
# NOTE(review): a and b are not used within the visible part of this snippet.
for i in xrange(genotypes.shape[0]):
    for j in xrange(genotypes.shape[1]):
        a,b = genotypes[i,j].split('/')
Ejemplo n.º 12
0
import numpy as np
import vcfnp
import os

filename = "comt.chr06.snp.full.final.vcf"
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)

# Transpose the genotype array and sum over its first axis; a total of -2
# (presumably both alleles missing) is recoded as 3.
G = c.genotype.T.sum(axis=0)
G[G == -2] = 3

# In[7]:

n, m = G.shape
nb_data = n * m
# Count the cells holding each genotype code: 3 (missing), then 0, 1 and 2.
nb_missing_data, nb_0, nb_1, nb_2 = (
    np.count_nonzero(G == code) for code in (3, 0, 1, 2)
)

print("nb data : {0}".format(nb_data))
print("nb missing data : {0}".format(nb_missing_data))
print("nb 0 : {0}".format(nb_0))
print("nb 1 : {0}".format(nb_1))
print("nb 2 : {0}".format(nb_2))

# In[8]:

# Fraction of missing cells (only displayed when run as a notebook cell).
float(nb_missing_data) / nb_data

# In[9]: