Esempio n. 1
0
def test_condition():
    # Filtering on the PASS flag is expected to keep 5 of the 9 records,
    # for the call data and for the variants array alike.
    vcf_path = 'fixture/sample.vcf'
    unfiltered = variants(vcf_path)
    eq_(9, len(unfiltered))
    for load in (calldata, variants):
        filtered = load(vcf_path, condition=unfiltered['FILTER']['PASS'])
        eq_(5, len(filtered))
Esempio n. 2
0
def readVcf(inFile, logDebug):
    """Read a VCF and return filtered SNP coordinates, genotypes and weights.

    Parameters
    ----------
    inFile
        Path of the VCF file handed to ``vcfnp``.
    logDebug
        When false, ``sys.stderr`` is pointed at an in-memory buffer while
        ``vcfnp`` parses the file (presumably to hide its progress output).

    Returns
    -------
    tuple
        ``(DPmean, snpCHR, snpPOS, snpGT, snpWEI)`` where ``DPmean`` is the
        mean of the positive call-data ``DP`` values, ``snpCHR``/``snpPOS``/
        ``snpGT`` describe the records that passed the filters, and
        ``snpWEI`` is ``exp(PL / -10)`` when a ``PL`` field exists, otherwise
        a 0/1 matrix with three columns derived from ``parseGT(snpGT)``.

    ``die`` is called when the call data has no ``GT`` field.
    """
    log.info("reading the VCF file")
    # Restore the caller's stream in a ``finally`` so that a parsing error
    # cannot leave stderr swallowed for the rest of the process.
    saved_stderr = sys.stderr
    if not logDebug:
        sys.stderr = StringIO.StringIO()
    try:
        vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
        vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
    finally:
        sys.stderr = saved_stderr
    # Depth ceiling: four times the mean of the positive DP values.
    DPthres = np.mean(vcfD.DP[np.where(vcfD.DP > 0)[0]]) * 4
    DPmean = DPthres / 4
    # Lower-case the names and strip "chr" so that e.g. "Chr1" and "chr1"
    # both become "1"; np.char.lower is the public spelling of
    # np.core.defchararray.lower.
    snpCHROM = np.char.replace(np.char.lower(vcf.CHROM), "chr", "")
    # Keep records that are called in the first sample, have QUAL > 30, a
    # depth strictly between 0 and the ceiling, and a purely numeric
    # chromosome name.
    snpsREQ = np.where((vcfD.is_called[:, 0]) & (vcf.QUAL > 30) & (vcf.DP > 0)
                       & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0]
    snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
    snpPOS = np.array(vcf.POS[snpsREQ])
    try:
        snpGT = np.array(vcfD.GT[snpsREQ, 0])
    except AttributeError:
        die("input VCF file doesnt have required GT field")
    try:
        # Preferred path: turn the PL values into weights exp(-PL / 10).
        snpPL = vcfD.PL[snpsREQ, 0]
        snpWEI = np.copy(snpPL)
        snpWEI = snpWEI.astype(float)
        snpWEI = snpWEI / (-10)
        snpWEI = np.exp(snpWEI)
    except AttributeError:
        # No PL field: fall back to hard 0/1 weights from the genotype code
        # returned by parseGT. Column 0 stays 1 only where the code is 0,
        # column 2 only where it is 1, and column 1 only where it is 2.
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))  # one column per hom/het class
        snpWEI[np.where(snpBinary != 0), 0] = 0
        snpWEI[np.where(snpBinary != 1), 2] = 0
        snpWEI[np.where(snpBinary != 2), 1] = 0
    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
Esempio n. 3
0
def test_condition():
    source = "fixture/sample.vcf"
    everything = variants(source)
    eq_(9, len(everything))

    # the PASS flags of the unfiltered load drive both conditional loads,
    # each of which is expected to return 5 records
    for loader in (calldata, variants):
        kept = loader(source, condition=everything["FILTER"]["PASS"])
        eq_(5, len(kept))
Esempio n. 4
0
def test_missing_info_definition():
    vcf_path = 'fixture/test14.vcf'
    row = 2

    # DP is not declared as an INFO field in the header, so by default it
    # is expected to come back as a string
    undeclared = variants(vcf_path, fields=['DP'])
    eq_('14', undeclared[row]['DP'])

    # with an explicit type override the same value is expected as an int
    typed = variants(vcf_path, fields=['DP'], vcf_types={'DP': 'Integer'})
    eq_(14, typed[row]['DP'])

    # a field that is not present at all yields the default missing value
    # for a string field
    absent = variants(vcf_path, fields=['FOO'])
    eq_('.', absent[row]['FOO'])
Esempio n. 5
0
def test_condition():
    # The PASS condition is expected to keep 5 of the 9 records whichever
    # loader (call data, info, variants) it is applied to.
    vcf_path = 'fixture/sample.vcf'
    v = variants(vcf_path)
    eq_(9, len(v))
    for load in (calldata, info, variants):
        subset = load(vcf_path, condition=v['FILTER']['PASS'])
        eq_(5, len(subset))
Esempio n. 6
0
def test_missing_info_definition():
    vcf_path = "fixture/test14.vcf"
    row = 2

    # the header does not declare DP as an INFO field: string by default
    untyped = variants(vcf_path, fields=["DP"])
    eq_("14", untyped[row]["DP"])

    # overriding the VCF type is expected to give an integer instead
    as_integer = variants(vcf_path, fields=["DP"], vcf_types={"DP": "Integer"})
    eq_(14, as_integer[row]["DP"])

    # a field missing from the file altogether: empty string is the
    # default missing value for a string field
    missing = variants(vcf_path, fields=["FOO"])
    eq_("", missing[row]["FOO"])
Esempio n. 7
0
def test_variants_transformers():
    # EFF sub-fields, in the order in which they are checked for each record
    eff_fields = (
        "Effect",
        "Effect_Impact",
        "Functional_Class",
        "Codon_Change",
        "Amino_Acid_Change",
        "Amino_Acid_Length",
        "Gene_Name",
        "Transcript_BioType",
        "Gene_Coding",
        "Transcript_ID",
        "Exon",
    )
    # expected values for records 0, 1 and 2, aligned with eff_fields
    expected_records = (
        ("STOP_GAINED", "HIGH", "NONSENSE", "Cag/Tag", "Q236*", 749,
         "NOC2L", ".", 1, "NM_015658", -1),
        ("NON_SYNONYMOUS_CODING", "MODERATE", "MISSENSE", "gTt/gGt", "V155G",
         -1, "PF3D7_0108900", ".", -1, "rna_PF3D7_0108900-1", 1),
        (".", ".", ".", ".", ".", -1, ".", ".", -1, ".", -1),
    )

    def _test(V):
        for index, expected in enumerate(expected_records):
            for field, value in zip(eff_fields, expected):
                eq_(value, V["EFF"][field][index])

    # explicit dtype, arity and transformer for EFF
    explicit = variants(
        "fixture/test12.vcf",
        dtypes={"EFF": EFF_DEFAULT_DTYPE},
        arities={"EFF": 1},
        transformers={"EFF": eff_default_transformer()},
    )
    _test(explicit)

    # test EFF is included in defaults
    defaults = variants("fixture/test12.vcf")
    _test(defaults)
Esempio n. 8
0
def test_error_handling():
    # (path, exception expected from vcfnp.variants), checked in order
    cases = (
        ('.', ValueError),                      # a directory
        ('doesnotexist', ValueError),           # a file that doesn't exist
        ('fixture/test48a.vcf', RuntimeError),  # nothing like a VCF (no header etc.)
    )
    for vcf_fn, expected_error in cases:
        with assert_raises(expected_error):
            vcfnp.variants(vcf_fn)
Esempio n. 9
0
def test_svlen():
    # Only checks that loading with an explicit arity of 2 for 'svlen' does
    # not raise. The per-record checks are currently disabled:
    # V = variants('fixture/test13.vcf').view(np.recarray)
    # assert hasattr(V, 'svlen')
    # eq_(0, V.svlen[0])
    # eq_(1, V.svlen[1])
    # eq_(-1, V.svlen[2])
    # eq_(3, V.svlen[3])
    # eq_(3, V.svlen[4])
    loaded = variants("fixture/test13.vcf", arities={"svlen": 2})
    V = loaded.view(np.recarray)
Esempio n. 10
0
def test_variants():
    # Load the sample file with an arity of 2 for ALT.
    a = variants('fixture/sample.vcf', arities={'ALT': 2})
    # Parenthesised so the line also parses on Python 3; with a single
    # argument the Python 2 print statement emits exactly the same text.
    print(repr(a))
    eq_(9, len(a))

    # fixed fields
    eq_('19', a[0]['CHROM'])
    eq_(111, a[0]['POS'])
    eq_('rs6054257', a[2]['ID'])
    eq_('A', a[0]['REF'])
    eq_('ATG', a[8]['ALT'][1])
    eq_(10.0, a[1]['QUAL'])

    # one boolean per FILTER value
    eq_(True, a[2]['FILTER']['PASS'])
    eq_(False, a[3]['FILTER']['PASS'])
    eq_(True, a[3]['FILTER']['q10'])

    # computed fields
    eq_(2, a[0]['num_alleles'])
    eq_(False, a[5]['is_snp'])
def build_arrays(vcf_fn, region, samples=None, force=False):
    """Build and save the variants and info arrays for one region of a VCF.

    Each array is written with ``np.save`` next to the VCF, as
    ``<vcf_fn>.<region>.variants.npy`` and ``<vcf_fn>.<region>.info.npy``.
    An array whose file already exists is skipped unless ``force`` is true.
    A 'building'/'skipping' line is written to stderr for each file.

    Parameters
    ----------
    vcf_fn
        Path of the VCF file to read; also the prefix of the output files.
    region
        Region string passed to vcfnp and embedded in the output file names.
    samples
        Accepted but not used by this function.
    force
        Rebuild the arrays even if the output files exist.
    """
    # NOTE(review): this only replaces the literal two-character sequence
    # '-:'. It looks as if '-' and ':' were each meant to become '_', but
    # changing it would rename the output files, so it is left as is.
    region_tag = region.replace('-:', '_')

    # sys.stderr.write is used instead of ``print >>sys.stderr`` because the
    # latter raises TypeError on Python 3; the emitted text is unchanged.
    variants_array_fn = vcf_fn + '.' + region_tag + '.variants.npy'
    if force or not os.path.exists(variants_array_fn):
        sys.stderr.write('building %s\n' % variants_array_fn)
        V = vcfnp.variants(vcf_fn, region=region, progress=20000)
        np.save(variants_array_fn, V)
    else:
        sys.stderr.write('skipping %s\n' % variants_array_fn)

    info_array_fn = vcf_fn + '.' + region_tag + '.info.npy'
    if force or not os.path.exists(info_array_fn):
        sys.stderr.write('building %s\n' % info_array_fn)
        I = vcfnp.info(
            vcf_fn,
            region=region,
            progress=20000,
            fields=['NS', 'UQ', 'CODING', 'DP', 'AD'],
            vcf_types={'DP': 'Integer', 'AD': 'Integer'},
            arities={'AD': 2},
        )
        np.save(info_array_fn, I)
    else:
        sys.stderr.write('skipping %s\n' % info_array_fn)
def run_vcfnp(sample='7G8', file_prefix='WG'):
    """Load the variants and 2-D call data arrays for one sample's VCF.

    The VCF path is built from ``INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT``
    with ``(file_prefix, sample)``. Both loads are done with ``cache=True``.
    A summary line (sample, maximum ``num_alleles``, longest ``REF``) is
    printed.

    Returns
    -------
    tuple
        ``(v, c)``: the array from ``vcfnp.variants`` and the array from
        ``vcfnp.calldata_2d`` (fields ``GT`` and ``AD``).
    """
    vcf_fn = INDIVIDAL_VALIDATION_SAMPLES_VCF_FORMAT % (file_prefix, sample)
    v = vcfnp.variants(
        vcf_fn=vcf_fn,
        progress=10000,
        arities={
            'ALT': 2,
            'AF': 2,
            'AC': 2,
            'MLEAF': 2,
            'MLEAC': 2,
            'RPA': 3,
        },
        dtypes={
            'REF': 'a400',
            'ALT': 'a400',
            'RegionType': 'a25',
            'RU': 'a40',
            'set': 'a40',
            'SNPEFF_AMINO_ACID_CHANGE': 'a20',
            'SNPEFF_CODON_CHANGE': 'a20',
            'SNPEFF_EFFECT': 'a33',
            'SNPEFF_EXON_ID': 'a2',
            'SNPEFF_FUNCTIONAL_CLASS': 'a8',
            'SNPEFF_GENE_BIOTYPE': 'a14',
            'SNPEFF_GENE_NAME': 'a20',
            'SNPEFF_IMPACT': 'a8',
            'SNPEFF_TRANSCRIPT_ID': 'a20',
            # 'VariantType' used to be listed twice ('a40', then 'a60'). A
            # dict literal keeps the last value, so 'a60' was the one in
            # effect; the dead 'a40' entry has been removed.
            'VariantType': 'a60',
            'culprit': 'a14',
        },
        cache=True
    )
    c = vcfnp.calldata_2d(
        vcf_fn=vcf_fn,
        progress=10000,
        fields=['GT', 'AD'],
        arities={'AD': 3},
        cache=True,
    )
    print(sample, max(v['num_alleles']), max([len(x) for x in v['REF']]))
    return v, c
Esempio n. 13
0
def test_variants_transformers():
    loaded = variants('fixture/test12.vcf',
                      dtypes={'EFF': EFF_DEFAULT_DTYPE},
                      arities={'EFF': 1},
                      transformers={'EFF': eff_default_transformer()})

    # EFF sub-fields, in the order in which they are checked for each record
    eff_fields = (
        'Effect',
        'Effect_Impact',
        'Functional_Class',
        'Codon_Change',
        'Amino_Acid_Change',
        'Amino_Acid_Length',
        'Gene_Name',
        'Transcript_BioType',
        'Gene_Coding',
        'Transcript_ID',
        'Exon',
    )
    # expected values for records 0, 1 and 2, aligned with eff_fields
    expected_records = (
        ('STOP_GAINED', 'HIGH', 'NONSENSE', 'Cag/Tag', 'Q236*', 749,
         'NOC2L', '.', 1, 'NM_015658', -1),
        ('NON_SYNONYMOUS_CODING', 'MODERATE', 'MISSENSE', 'gTt/gGt', 'V155G',
         -1, 'PF3D7_0108900', '.', -1, 'rna_PF3D7_0108900-1', 1),
        ('.', '.', '.', '.', '.', -1, '.', '.', -1, '.', -1),
    )

    for index, expected in enumerate(expected_records):
        for field, value in zip(eff_fields, expected):
            eq_(value, loaded['EFF'][field][index])
Esempio n. 14
0
def test_variants():
    # Load the sample file with an arity of 2 for both ALT and AC.
    a = variants("fixture/sample.vcf", arities={"ALT": 2, "AC": 2})
    # Parenthesised so the line also parses on Python 3; with a single
    # argument the Python 2 print statement emits exactly the same text.
    print(repr(a))
    eq_(9, len(a))

    # fixed fields
    eq_("19", a[0]["CHROM"])
    eq_(111, a[0]["POS"])
    eq_("rs6054257", a[2]["ID"])
    eq_("A", a[0]["REF"])
    eq_("ATG", a[8]["ALT"][1])
    eq_(10.0, a[1]["QUAL"])
    eq_(True, a[2]["FILTER"]["PASS"])
    eq_(False, a[3]["FILTER"]["PASS"])
    eq_(True, a[3]["FILTER"]["q10"])
    eq_(2, a[0]["num_alleles"])
    eq_(False, a[5]["is_snp"])

    # INFO fields
    eq_(3, a[2]["NS"])
    eq_(0.5, a[2]["AF"])
    eq_(True, a[2]["DB"])
    eq_((3, 1), tuple(a[6]["AC"]))
Esempio n. 15
0
    def work(self):
        """Plot histograms of sorted per-site allele fractions and save them.

        Reads the VCF at ``self.input().path``, keeps records whose ALT is
        not ``<NON_REF>`` and whose DP exceeds ``self.DP_thresh``, and
        writes the resulting figure to ``self.output().path``.
        """
        import vcfnp
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt

        site_fields = vcfnp.variants(self.input().path)
        call_fields = vcfnp.calldata_2d(self.input().path)

        # record mask: a real ALT allele and enough depth
        has_alt = site_fields['ALT'] != b'<NON_REF>'
        deep_enough = site_fields['DP'] > self.DP_thresh
        keep = np.logical_and(has_alt, deep_enough)

        # AD of the first sample, each row sorted in descending order
        first_sample_ad = call_fields['AD'][keep][:, 0, :]
        ranked = np.sort(first_sample_ad, axis=1)[:, ::-1]
        freqs = ranked / call_fields['DP'][keep]
        # whatever fraction the listed counts do not account for
        leftover = 1 - np.sum(freqs, axis=1)

        table = pd.DataFrame(np.hstack((freqs, leftover.reshape((-1, 1)))))
        # zeros become NaN before the histograms are drawn
        table[table == 0] = float('nan')

        table.hist(sharex=True, sharey=True, range=(0, 1), bins=20)
        figure = plt.gcf()
        figure.suptitle(self.library, fontsize=20)
        figure.text(0.5, 0.04, 'Allele frequency', ha='center')
        figure.text(0.02, 0.5, 'Counts', va='center', rotation='vertical')
        figure.savefig(self.output().path)
Esempio n. 16
0
def test_caching():
    vcf_fn = "fixture/sample.vcf.gz"

    # (cache array type, loader) pairs, exercised in this order
    loaders = (
        ("variants", variants),
        ("calldata", calldata),
        ("calldata_2d", calldata_2d),
    )
    for array_type, load in loaders:
        cache_fn = vcfnp._mk_cache_fn(vcf_fn, array_type=array_type)
        # start from a clean slate so the load has to write the cache file
        if os.path.exists(cache_fn):
            os.remove(cache_fn)
        loaded = load(vcf_fn, cache=True, verbose=True)
        from_cache = np.load(cache_fn)
        assert np.all(loaded == from_cache)
def run_vcfnp_all(chrom='Pf3D7_01_v3'):
    """Load (and cache) the variants array of one chromosome's outgroup VCF.

    The VCF path is ``OUTGROUP_VCF_FORMAT % chrom`` and the cache directory
    is ``FULL_NPY_FORMAT % chrom``. The loaded array itself is discarded;
    the function always returns 0.
    """
    vcf_fn = OUTGROUP_VCF_FORMAT % chrom
    v = vcfnp.variants(
        vcf_fn=vcf_fn,
        progress=10000,
        arities={
            'ALT': 6,
            'AF': 6,
            'AC': 6,
            'MLEAF': 6,
            'MLEAC': 6,
            'RPA': 7,
        },
        dtypes={
            # No explicit dtype is given for 'REF' and 'ALT' here (the
            # 'a100' overrides were commented out).
            'RegionType': 'a25',
            'RU': 'a40',
            'set': 'a40',
            'SNPEFF_AMINO_ACID_CHANGE': 'a20',
            'SNPEFF_CODON_CHANGE': 'a20',
            'SNPEFF_EFFECT': 'a33',
            'SNPEFF_EXON_ID': 'a2',
            'SNPEFF_FUNCTIONAL_CLASS': 'a8',
            'SNPEFF_GENE_BIOTYPE': 'a14',
            'SNPEFF_GENE_NAME': 'a20',
            'SNPEFF_IMPACT': 'a8',
            'SNPEFF_TRANSCRIPT_ID': 'a20',
            # 'VariantType' used to be listed twice ('a40', then 'a60'). A
            # dict literal keeps the last value, so 'a60' was the one in
            # effect; the dead 'a40' entry has been removed.
            'VariantType': 'a60',
            'culprit': 'a14',
        },
        cache=True,
        cachedir=FULL_NPY_FORMAT % chrom
    )
    return 0
def run_vcfnp(
    annotated_vcf_fn = '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/consensus_alignment/nucmer/7G8.3d7coordinates.annoatated.vcf'
):
    """Load the variants array of an annotated VCF with ``cache=True``.

    Parameters
    ----------
    annotated_vcf_fn
        Path of the VCF file to load.

    Returns
    -------
    The array returned by ``vcfnp.variants``.
    """
    v = vcfnp.variants(
        vcf_fn=annotated_vcf_fn,
        progress=10000,
        arities={
            'ALT': 2,
            'AF': 2,
            'AC': 2,
            'MLEAF': 2,
            'MLEAC': 2,
            'RPA': 3,
        },
        dtypes={
            'REF': 'a400',
            'ALT': 'a400',
            'RegionType': 'a25',
            'RU': 'a40',
            'set': 'a40',
            'SNPEFF_AMINO_ACID_CHANGE': 'a20',
            'SNPEFF_CODON_CHANGE': 'a20',
            'SNPEFF_EFFECT': 'a33',
            'SNPEFF_EXON_ID': 'a2',
            'SNPEFF_FUNCTIONAL_CLASS': 'a8',
            'SNPEFF_GENE_BIOTYPE': 'a14',
            'SNPEFF_GENE_NAME': 'a20',
            'SNPEFF_IMPACT': 'a8',
            'SNPEFF_TRANSCRIPT_ID': 'a20',
            # 'VariantType' used to be listed twice ('a40', then 'a60'). A
            # dict literal keeps the last value, so 'a60' was the one in
            # effect; the dead 'a40' entry has been removed.
            'VariantType': 'a60',
            'culprit': 'a14',
        },
        cache=True
    )
    return v
Esempio n. 19
0
def test_error_handling():
    # (vcfnp loader name, path, expected exception), checked in order
    cases = (
        # try to open a directory
        ('variants', '.', ValueError),
        # try to open a file that doesn't exist
        ('variants', 'doesnotexist', ValueError),
        # file is nothing like a VCF (has no header etc.)
        ('variants', 'fixture/test48a.vcf', RuntimeError),
        # file has more sample columns than in header row
        ('calldata', 'fixture/test48b.vcf', RuntimeError),
    )
    for loader_name, vcf_fn, expected_error in cases:
        with assert_raises(expected_error):
            getattr(vcfnp, loader_name)(vcf_fn)
Esempio n. 20
0
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
import vcfnp
vcfnp.__version__

# VCF file used by every step of this example
filename = 'fixture/sample.vcf'

# load data from fixed fields (including INFO); viewed as a recarray so the
# fields can be read as attributes (v.QUAL, v.DP, ...)
v = vcfnp.variants(filename, cache=True).view(np.recarray)

# print some simple variant metrics: record count, number of records
# flagged is_snp, and mean / standard deviation of QUAL
print('found %s variants (%s SNPs)' % (v.size, np.count_nonzero(v.is_snp)))
print('QUAL mean (std): %s (%s)' % (np.mean(v.QUAL), np.std(v.QUAL)))

# plot a histogram of variant depth (the DP field) on a single-axes figure
fig = plt.figure(1)
ax = fig.add_subplot(111)
ax.hist(v.DP)
ax.set_title('DP histogram')
ax.set_xlabel('DP')
plt.show()

# load data from sample columns, again viewed as a recarray
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)

# print some simple genotype metrics
# calls flagged is_phased
count_phased = np.count_nonzero(c.is_phased)
# calls with any genotype entry greater than 0 along axis 2
count_variant = np.count_nonzero(np.any(c.genotype > 0, axis=2))
# calls not flagged is_called
count_missing = np.count_nonzero(~c.is_called)
print('calls (phased, variant, missing): %s (%s, %s, %s)'
Esempio n. 21
0
def readVcf(inFile):
  """Load a VCF with vcfnp and return ``(variants, calldata_2d)``.

  Both arrays are loaded with ``cache=True`` and viewed as ``np.recarray``.
  """
  variant_records = vcfnp.variants(inFile, cache=True).view(np.recarray)
  call_records = vcfnp.calldata_2d(inFile, cache=True).view(np.recarray)
  return variant_records, call_records
Esempio n. 22
0
def test_override_vcf_types():
    vcf_path = "fixture/test4.vcf"
    field = "MQ0FractionTest"

    # without an override, row 2 of the field is expected to read as 0
    as_loaded = variants(vcf_path)
    eq_(0, as_loaded[field][2])

    # declared as Float, the same entry is expected to be about 0.03
    as_float = variants(vcf_path, vcf_types={field: "Float"})
    assert_almost_equal(0.03, as_float[field][2])
Esempio n. 23
0
def test_variants_slice():
    vcf_path = "fixture/sample.vcf.gz"
    wanted_id = "rs6054257"

    # in the full load the ID is expected at index 2
    everything = variants(vcf_path)
    eq_(wanted_id, everything["ID"][2])

    # with slice=(0, None, 2) it is expected at index 1
    sliced = variants(vcf_path, slice=(0, None, 2))
    eq_(wanted_id, sliced["ID"][1])
Esempio n. 24
0
def test_variants_exclude_fields():
    loaded = variants("fixture/sample.vcf", exclude_fields=["ID", "FILTER"])
    field_names = loaded.dtype.names
    # a field that was not excluded is still there ...
    assert "CHROM" in field_names
    # ... and each excluded field is gone
    for excluded in ("ID", "FILTER"):
        assert excluded not in field_names
Esempio n. 25
0
def test_variants_count():
    # count=3 is expected to limit the load to three records
    limit = 3
    limited = variants("fixture/sample.vcf", count=limit)
    eq_(3, len(limited))
Esempio n. 26
0
from compiler.ast import flatten
import numpy


def mkdir_p(path):
    """Create ``path`` and any missing parents; do nothing if it exists.

    An ``OSError`` is re-raised unless it reports that ``path`` already
    exists and ``path`` is a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        already_a_directory = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise

# VCF file parsed by this script
vcf_file = "ag1000g.phase1.AR1.Y_unplaced.PASS.vcf.gz"

# Parenthesised so the line also parses on Python 3; with a single argument
# the Python 2 print statement emits exactly the same text.
print('Parsing variants')
# the module-level name `variants` holds the array returned by vcfnp
variants = vcfnp.variants(vcf_file)

# Recursively get the names of the columns
def names_from_dtype(dtype, path=''):
    """Return the dotted column names described by a numpy dtype.

    Parameters
    ----------
    dtype
        The ``numpy.dtype`` to walk.
    path
        Dotted prefix accumulated so far; empty at the top level.

    Returns
    -------
    For a dtype with named fields, a list with one entry per field (itself
    a list when the field is nested or a sub-array). For a sub-array dtype,
    a list numbering the entries along its first axis. Otherwise ``path``
    itself, as a string.
    """
    if dtype.names:
        # Index by field name: this works on Python 2 and 3 alike, unlike
        # the previous xrange-based positional lookup.
        return [names_from_dtype(dtype[name], path + '.' + name if path else name)
                for name in dtype.names]
    if dtype.kind == 'V' and dtype.shape:
        # a vector with no names: number the entries along the first axis
        return [path + '.' + str(i) if path else str(i)
                for i in range(dtype.shape[0])]
    # Scalar types, and void types with neither fields nor a shape (which
    # used to raise IndexError on dtype.shape[0]), are a single column.
    return path

def flatten_numpy_line(line):
    for entry in line:
Esempio n. 27
0
def profile():
    """Load variants from the VCF named by ``sys.argv[1]``.

    ``sys.argv[2]`` is converted to ``int`` and passed as ``count``.
    """
    vcf_path, record_count = sys.argv[1], int(sys.argv[2])
    vcfnp.variants(vcf_path, count=record_count)
Esempio n. 28
0
def test_variants_slice():
    vcf_path = 'fixture/sample.vcf.gz'
    wanted_id = 'rs6054257'

    # full load: the ID is expected at index 2
    unsliced = variants(vcf_path)
    eq_(wanted_id, unsliced['ID'][2])

    # slice=(0, None, 2): the ID is expected at index 1
    strided = variants(vcf_path, slice=(0, None, 2))
    eq_(wanted_id, strided['ID'][1])
Esempio n. 29
0
def test_explicit_pass_definition():
    # The fixture carries an explicit PASS FILTER definition; the test only
    # requires that loading it does not raise.
    vcf_path = "fixture/test16.vcf"
    variants(vcf_path)
Esempio n. 30
0
def profile():
    """Load variants from the VCF named by ``sys.argv[1]``.

    ``sys.argv[2]`` is converted to ``int`` and passed as ``count``.
    """
    vcf_path = sys.argv[1]
    limit = int(sys.argv[2])
    vcfnp.variants(vcf_path, count=limit)
Esempio n. 31
0
def test_duplicate_field_definitions():
    # Neither load should raise; a useful message is printed to stderr
    # instead.
    vcf_path = "fixture/test10.vcf"
    for load in (variants, calldata):
        load(vcf_path)
Esempio n. 32
0
def profile():
    """Load variants, then the call data of the PASS records, from a VCF.

    The VCF path is ``sys.argv[1]``; ``sys.argv[2]`` is converted to ``int``
    and passed as ``count`` to the variants load.
    """
    vcf_path = sys.argv[1]
    limit = int(sys.argv[2])
    loaded = vcfnp.variants(vcf_path, count=limit)
    vcfnp.calldata(vcf_path, condition=loaded['FILTER']['PASS'])
Esempio n. 33
0
def test_variants_flatten_filter():
    flat = variants("fixture/sample.vcf", flatten_filter=True)
    # (expected flag, record index, flattened FILTER column)
    expectations = (
        (True, 2, "FILTER_PASS"),
        (False, 3, "FILTER_PASS"),
        (True, 3, "FILTER_q10"),
    )
    for expected, index, column in expectations:
        eq_(expected, flat[index][column])
Esempio n. 34
0
from compiler.ast import flatten
import numpy


def mkdir_p(path):
    """Create ``path`` with any missing parents, tolerating an existing dir.

    Any ``OSError`` other than "already exists, and is a directory" is
    re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise

# VCF file parsed by this script
vcf_file = "ag1000g.phase1.AR1.Y_unplaced.PASS.vcf.gz"

# Parenthesised so the line also parses on Python 3; with a single argument
# the Python 2 print statement emits exactly the same text.
print('Parsing variants')
# the module-level name `variants` holds the array returned by vcfnp
variants = vcfnp.variants(vcf_file)

# Recursively get the names of the columns
def names_from_dtype(dtype, path=''):
    """Return the dotted column names described by a numpy dtype.

    Parameters
    ----------
    dtype
        The ``numpy.dtype`` to walk.
    path
        Dotted prefix accumulated so far; empty at the top level.

    Returns
    -------
    For a dtype with named fields, a list with one entry per field (itself
    a list when the field is nested or a sub-array). For a sub-array dtype,
    a list numbering the entries along its first axis. Otherwise ``path``
    itself, as a string.
    """
    if dtype.names:
        # Index by field name: this works on Python 2 and 3 alike, unlike
        # the previous xrange-based positional lookup.
        return [names_from_dtype(dtype[name], path + '.' + name if path else name)
                for name in dtype.names]
    if dtype.kind == 'V' and dtype.shape:
        # a vector with no names: number the entries along the first axis
        return [path + '.' + str(i) if path else str(i)
                for i in range(dtype.shape[0])]
    # Scalar types, and void types with neither fields nor a shape (which
    # used to raise IndexError on dtype.shape[0]), are a single column.
    return path

def flatten_numpy_line(line):
    for entry in line:
Esempio n. 35
0
def test_variants_region():
    # region "20" is expected to contain six records
    in_region = variants("fixture/sample.vcf.gz", region="20")
    eq_(6, len(in_region))
# Command-line options for the HDF5 inputs and the two output files.
# NOTE(review): inOptions is created above this fragment; the add_option /
# parse_args calls suggest an optparse parser -- confirm.
# NOTE(review): -d and -e carry the same help text -- confirm this is intended.
inOptions.add_option("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file", type="string")
inOptions.add_option("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file", type="string")
inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string")
inOptions.add_option("-r", "--refScore", dest="refScore", help="Output for refined score", type="string")

(options, args) = inOptions.parse_args()

# log everything from DEBUG upwards, as "LEVEL:timestamp:  message"
logging.basicConfig(format='%(levelname)s:%(asctime)s:  %(message)s', level=logging.DEBUG)

# genotype data from the -d HDF5 file; loading the -e file is disabled
GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File)
#GenotypeData_acc = genotype.load_hdf5_genotype_data(options.hdf5accFile)
#num_lines = len(GenotypeData.accessions)


# fixed fields (vcf) and per-sample call data (vcfD), viewed as recarrays so
# that fields are read as attributes; options.vcfFile is defined by an option
# outside this fragment
logging.info("Reading the VCF file")
vcf = vcfnp.variants(options.vcfFile, cache=True).view(numpy.recarray)
vcfD = vcfnp.calldata_2d(options.vcfFile, cache=True).view(numpy.recarray)


## Doubtful .... whether there should be a threshold based on just mean of std
#DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) + numpy.std(vcf.DP[numpy.where(vcf.DP > 0)[0]])
# depth ceiling: four times the mean of the positive DP values
DPthres = numpy.mean(vcf.DP[numpy.where(vcf.DP > 0)[0]]) * 4
print "Threshold for depth is set at: ", DPthres

# indices of records called in the first sample, with QUAL > 30 and a depth
# strictly between 0 and the ceiling
#snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0))[0]
snpsREQ = numpy.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0]
# chromosome names with "Chr" removed, cast to int8
# NOTE(review): the cast assumes every remaining name is numeric -- confirm
snpCHR = numpy.array(numpy.chararray.replace(vcf.CHROM[snpsREQ], "Chr", "")).astype("int8")
snpPOS = numpy.array(vcf.POS[snpsREQ])
snpGT = vcfD.GT[snpsREQ, 0]   ## column 0 only, since there is one sample
snpPL = vcfD.PL[snpsREQ, 0]
snpDP = vcf.DP[snpsREQ]
Esempio n. 37
0
def test_variants_region_empty():
    vcf_path = "fixture/sample.vcf.gz"
    # neither region is expected to contain any record
    for empty_region in ("18", "19:113-200"):
        in_region = variants(vcf_path, region=empty_region)
        eq_(0, len(in_region))