Example #1
0
 def test_read_variant_vcf(self):
     p = pd.read_csv(P, index_col=0, sep='\t')['binary']
     infile = VariantFile(VCF)
     t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(abs((k - np.zeros(50)).max()), 0.0)
     self.assertEqual(var_name, 'FM211187_16_G_A')
     self.assertEqual(kstrains, [])
     self.assertEqual(nkstrains,
                      sorted(['sample_%d' % x for x in range(1, 51)]))
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)
     # not providing samples
     t = read_variant(infile, p, 'vcf', False, [], False, set(), [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     # providing burden
     burden_regions = deque([])
     load_burden(B, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf', True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k - np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name, 'CDS1')
     self.assertEqual(kstrains, [])
     self.assertEqual(
         nkstrains,
         ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)
     # uncompressed option - no effect
     t = read_variant(infile, p.head(5), 'vcf', False, [], True,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k - np.array([0, 1, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name, 'FM211187_3982_C_A')
     self.assertEqual(kstrains, ['sample_2'])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.2)
     self.assertEqual(missing, 0.0)
     # different type
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'kmers', False, [], True,
                          p.head(5).index, [])
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'Rtab', False, [], False,
                          p.head(5).index, [])
     # read until exhaustion
     while not t[0]:
         t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     self.assertEqual(missing, None)
     # different file
     infile = gzip.open(KMER)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf', False, [], False,
                          p.head(5).index, [])
     infile = open(PRES)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf', False, [], False,
                          p.head(5).index, [])
Example #2
0
 def test_read_variant_rtab(self):
     p = pd.read_csv(P, index_col=0, sep='\t')['binary']
     infile = open(PRES)
     header = infile.readline().rstrip()
     sample_order = header.split()[1:]
     t = read_variant(infile, p, 'Rtab', False, [], False, p.index,
                      sample_order)
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(abs((k - np.ones(50)).max()), 0.0)
     self.assertEqual(var_name, 'COG_1')
     self.assertEqual(kstrains,
                      sorted(['sample_%d' % x for x in range(1, 51)]))
     self.assertEqual(nkstrains, [])
     self.assertEqual(af, 1.0)
     self.assertEqual(missing, 0.0)
     # not providing samples
     with self.assertRaises(ValueError):
         t = read_variant(infile, p, 'Rtab', False, [], False, set(), [])
     with self.assertRaises(ZeroDivisionError):
         t = read_variant(infile, p, 'Rtab', False, [], False, set(),
                          sample_order)
     # providing burden - no effect
     t = read_variant(infile, p.head(5), 'Rtab', True, [], False,
                      p.head(5).index, sample_order)
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k - np.array([1, 1, 1, 1, 1])).max()) < 1E-7)
     self.assertEqual(var_name, 'COG_4')
     self.assertEqual(
         kstrains,
         ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(nkstrains, [])
     self.assertEqual(af, 1.0)
     self.assertEqual(missing, 0.0)
     # uncompressed option - no effect
     t = read_variant(infile, p.head(5), 'Rtab', False, [], True,
                      p.head(5).index, sample_order)
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k - np.array([1, 1, 1, 1, 1])).max()) < 1E-7)
     self.assertEqual(var_name, 'COG_5')
     self.assertEqual(
         kstrains,
         ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(nkstrains, [])
     self.assertEqual(af, 1.0)
     self.assertEqual(missing, 0.0)
     # different type
     with self.assertRaises(IndexError):
         t = read_variant(infile, p.head(5), 'kmers', False, [], True,
                          p.head(5).index, sample_order)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf', False, [], False,
                          p.head(5).index, [])
     # read until exhaustion
     infile = open(PRES)
     header = infile.readline().rstrip()
     while not t[0]:
         t = read_variant(infile, p, 'Rtab', False, [], False, p.index,
                          sample_order)
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     self.assertEqual(missing, None)
     # different file
     infile = gzip.open(KMER)
     with self.assertRaises(ValueError):
         t = read_variant(infile, p.head(5), 'Rtab', False, [], False,
                          p.head(5).index, sample_order)
     infile = gzip.open(VCF)
     with self.assertRaises(ValueError):
         t = read_variant(infile, p.head(5), 'Rtab', False, [], False,
                          p.head(5).index, sample_order)
     # space in OG's names
     infile = open(PRESSPACE)
     header = infile.readline().rstrip()
     sample_order = header.split()[1:]
     t = read_variant(infile, p, 'Rtab', False, [], False, p.index,
                      sample_order)
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(abs((k - np.ones(50)).max()), 0.0)
     self.assertEqual(var_name, 'COG 1')
     self.assertEqual(kstrains,
                      sorted(['sample_%d' % x for x in range(1, 51)]))
     self.assertEqual(nkstrains, [])
     self.assertEqual(af, 1.0)
     self.assertEqual(missing, 0.0)
Example #3
0
 def test_read_variant(self):
     with self.assertRaises(ValueError):
         read_variant(None, None, 'test', None, None, None, None, None)
Example #4
0
 def test_read_variant_kmer(self):
     p = pd.read_csv(P, index_col=0, sep='\t')['binary']
     infile = gzip.open(KMER)
     t = read_variant(infile, p, 'kmers', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(
         abs((k - np.array([
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             1, 0, 0, 0, 0, 0, 0, 0
         ])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
     self.assertEqual(kstrains, ['sample_43'])
     self.assertEqual(
         nkstrains,
         sorted([
             'sample_%d' % x
             for x in list(range(1, 43)) + list(range(44, 51))
         ]))
     self.assertEqual(af, 0.02)
     self.assertEqual(missing, 0.0)
     # not providing samples
     with self.assertRaises(ZeroDivisionError):
         t = read_variant(infile, p, 'kmers', False, [], False, set(), [])
     # providing burden - no effect
     t = read_variant(infile, p.head(5), 'kmers', True, [], False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k - np.array([1, 1, 0, 1, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
     self.assertEqual(kstrains, ['sample_1', 'sample_2', 'sample_4'])
     self.assertEqual(nkstrains, ['sample_3', 'sample_5'])
     self.assertEqual(af, 0.6)
     self.assertEqual(missing, 0.0)
     # uncompressed option - only with python3+
     if sys.version_info[0] >= 3:
         with self.assertRaises(TypeError):
             t = read_variant(infile, p, 'kmers', False, [], True, p.index,
                              [])
     # different type
     with self.assertRaises(ValueError):
         t = read_variant(infile, p.head(5), 'Rtab', False, [], False,
                          p.head(5).index, [])
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf', False, [], False,
                          p.head(5).index, [])
     # read until exhaustion
     while not t[0]:
         t = read_variant(infile, p, 'kmers', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     self.assertEqual(missing, None)
     # different file
     infile = open(PRES)
     with self.assertRaises(IndexError):
         t = read_variant(infile, p.head(5), 'kmers', True, [], True,
                          p.head(5).index, [])
     infile = gzip.open(VCF)
     with self.assertRaises(IndexError):
         t = read_variant(infile, p.head(5), 'kmers', True, [], False,
                          p.head(5).index, [])
Example #5
0
 def test_read_variant_vcf(self):
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCF)
     t = read_variant(infile, p, 'vcf',
                      False, [], False,
                      p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(abs((k - np.zeros(50)).max()), 0.0)
     self.assertEqual(var_name,
                      'FM211187_16_G_A')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      sorted(['sample_%d' % x
                              for x in range(1, 51)]))
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)
     # not providing samples
     t = read_variant(infile, p, 'vcf',
                      False, [], False,
                      set(), [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     # providing burden
     burden_regions = deque([])
     load_burden(B, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'CDS1')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_2', 'sample_3',
                       'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)
     # providing burden
     burden_regions = deque([])
     load_burden(BM, burden_regions)
     # last one has multiple regions
     burden_regions.reverse()
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'CDS3')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_2', 'sample_3',
                       'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)
     # uncompressed option - no effect
     infile = VariantFile(VCF)
     t = read_variant(infile, p.head(5), 'vcf',
                      False, [], True,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'FM211187_16_G_A')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_2', 'sample_3',
                       'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)
     # different type
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'kmers',
                          False, [], True,
                          p.head(5).index, [])
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'Rtab',
                          False, [], False,
                          p.head(5).index, [])
     # read until exhaustion
     while not t[0]:
         t = read_variant(infile, p, 'vcf',
                          False, [], False,
                          p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     self.assertEqual(missing, None)
     # different file
     infile = gzip.open(KMER)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf',
                          False, [], False,
                          p.head(5).index, [])
     infile = open(PRES)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf',
                          False, [], False,
                          p.head(5).index, [])
     # burden with missing genotypes in last read variant
     # issue #90
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCFMISSING)
     burden_regions = deque([])
     load_burden(BMISSING, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([1, 1, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'CDS1')
     self.assertEqual(kstrains,
                      ['sample_1', 'sample_2'])
     self.assertEqual(nkstrains,
                      ['sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.4)
     self.assertEqual(missing, 0)
     # check that missing variants are properly missed
     # issue #120
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCFMISSING)
     variant = next(infile)
     total = 0
     missing = 0
     samples = set()
     for sample, call in variant.samples.items():
         if sample not in p.index:
             continue
         for haplotype in call.get('GT', [None]):
             if haplotype is None or haplotype == '.':
                 missing += 1
             total += 1
             samples.add(sample)
     pysam_missing = missing / float(total)
     infile = VariantFile(VCFMISSING)
     t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(pysam_missing, missing)