def test_invalid_file(self):
    """Checks that reading any invalid VCF content raises ``ValueError``.

    Every content is first read from a file of its own. Afterwards all
    contents are written into one directory and read back through a
    ``*.vcf`` pattern, which must raise as well.
    """
    invalid_file_contents = self._get_invalid_file_contents()

    for content in chain(*invalid_file_contents):
      with TempDir() as tempdir, self.assertRaises(ValueError):
        self._read_records(self._create_temp_vcf_file(content, tempdir))
        # Only reached when the read above did not raise.
        self.fail('Invalid VCF file must throw an exception')
    # Try with multiple files (any one of them will throw an exception).
    with TempDir() as tempdir, self.assertRaises(ValueError):
      for content in chain(*invalid_file_contents):
        self._create_temp_vcf_file(content, tempdir)
      # Read only once every file exists. Reading inside the loop raises on
      # the first file, so the directory would never hold multiple files.
      self._read_records(os.path.join(tempdir.get_path(), '*.vcf'))
Ejemplo n.º 2
0
  def test_allow_malformed_records(self):
    """Bad records are tolerated on request; bad headers still raise."""
    bad_records, bad_headers = self._get_invalid_file_contents()

    # With allow_malformed_records=True these reads must complete normally.
    for record_content in bad_records:
      with TempDir() as tmp:
        vcf_path = self._create_temp_vcf_file(record_content, tmp)
        self._read_records(vcf_path, allow_malformed_records=True)

    # The flag does not suppress the ValueError for invalid headers.
    for header_content in bad_headers:
      with TempDir() as tmp, self.assertRaises(ValueError):
        vcf_path = self._create_temp_vcf_file(header_content, tmp)
        self._read_records(vcf_path, allow_malformed_records=True)
Ejemplo n.º 3
0
 def test_read_reentrant_without_splitting(self):
     """Runs the reentrant-read check on a source built from the sample file."""
     with TempDir() as tmp:
         sample_content = _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES
         vcf_path = self._create_temp_vcf_file(sample_content, tmp)
         # The source is checked as a whole: both trailing slots are None.
         source_info = (VcfSource(vcf_path), None, None)
         source_test_utils.assert_reentrant_reads_succeed(source_info)
Ejemplo n.º 4
0
 def _create_temp_file_and_return_records_with_file_name(
     self, lines, representative_header_lines=None,
     sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH):
   """Writes `lines` to a temporary .vcf file and reads it back.

   Args:
     lines: Passed to `create_temp_file` as the file's lines.
     representative_header_lines: Forwarded to `_read_records`.
     sample_name_encoding: Forwarded to `_read_records`.

   Returns:
     A `(records, file_name)` tuple: the result of `_read_records` and the
     file name returned by `create_temp_file`.
   """
   with TempDir() as tmp:
     vcf_path = tmp.create_temp_file(suffix='.vcf', lines=lines)
     records = self._read_records(
         vcf_path, representative_header_lines, sample_name_encoding)
     return records, vcf_path
Ejemplo n.º 5
0
 def _create_temp_file_and_read_records(
     self, lines, representative_header_lines=None,
     vcf_parser_type=VcfParserType.PYVCF):
   """Writes `lines` to a temporary .vcf file and returns the records read.

   Args:
     lines: Passed to `create_temp_file` as the file's lines.
     representative_header_lines: Forwarded to `_read_records`.
     vcf_parser_type: Forwarded to `_read_records` as its third positional
       argument.

   Returns:
     Whatever `_read_records` returns for the temporary file.
   """
   with TempDir() as tmp:
     vcf_path = tmp.create_temp_file(suffix='.vcf', lines=lines)
     records = self._read_records(
         vcf_path, representative_header_lines, vcf_parser_type)
     return records
Ejemplo n.º 6
0
  def test_allow_malformed_records(self):
    """Reading invalid contents with allow_malformed_records must not raise."""
    bad_contents = self._get_invalid_file_contents()

    # Each content gets a fresh temporary directory and file of its own.
    for bad_content in bad_contents:
      with TempDir() as tmp:
        vcf_path = self._create_temp_vcf_file(bad_content, tmp)
        self._read_records(vcf_path, allow_malformed_records=True)
Ejemplo n.º 7
0
 def test_dynamic_work_rebalancing(self):
   """Checks split-at-fraction exhaustively on the sample file's only split.

   The sample file is split with a desired bundle size of 100000, which is
   expected to yield exactly one split.
   """
   with TempDir() as tempdir:
     file_name = self._create_temp_vcf_file(
         _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
     source = VcfSource(file_name)
     splits = list(source.split(desired_bundle_size=100000))
     # A TestCase assertion is not stripped under `python -O` and reports the
     # actual number of splits on failure.
     self.assertEqual(1, len(splits))
     source_test_utils.assert_split_at_fraction_exhaustive(
         splits[0].source, splits[0].start_position, splits[0].stop_position)
Ejemplo n.º 8
0
 def test_read_reentrant_after_splitting(self):
   """Runs the reentrant-read check on the sample file's only split.

   The sample file is split with a desired bundle size of 100000, which is
   expected to yield exactly one split.
   """
   with TempDir() as tempdir:
     file_name = self._create_temp_vcf_file(
         _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
     source = VcfSource(file_name)
     splits = list(source.split(desired_bundle_size=100000))
     # A TestCase assertion is not stripped under `python -O` and reports the
     # actual number of splits on failure.
     self.assertEqual(1, len(splits))
     source_test_utils.assert_reentrant_reads_succeed(
         (splits[0].source, splits[0].start_position, splits[0].stop_position))
Ejemplo n.º 9
0
 def test_pipeline_read_all_multiple_files(self):
   """Reads two copies of the sample file with ReadAllFromVcf and counts them."""
   with TempDir() as tmp:
     first_vcf = self._create_temp_vcf_file(
         _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tmp)
     second_vcf = self._create_temp_vcf_file(
         _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tmp)

     test_pipeline = TestPipeline()
     file_names = (
         test_pipeline | 'Create' >> beam.Create([first_vcf, second_vcf]))
     records = file_names | 'Read' >> ReadAllFromVcf()

     # Two files with the same text lines, hence twice the line count.
     expected_count = 2 * len(_SAMPLE_TEXT_LINES)
     assert_that(records, asserts.count_equals_to(expected_count))
     test_pipeline.run()
Ejemplo n.º 10
0
 def test_file_pattern_verify_details(self):
   """Reads two files via a `*.vcf` pattern and compares all three variants."""
   expected_variants = [
       _get_sample_variant_1(),
       _get_sample_variant_2(),
       _get_sample_variant_3(),
   ]
   with TempDir() as tmp:
     # One file holds the first sample line, the other holds the other two.
     for body_lines in ([VCF_LINE_1], [VCF_LINE_2, VCF_LINE_3]):
       self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + body_lines, tmp)
     pattern = os.path.join(tmp.get_path(), '*.vcf')
     records = self._read_records(pattern)
     self.assertEqual(3, len(records))
     self._assert_variants_equal(expected_variants, records)
Ejemplo n.º 11
0
 def test_file_pattern_verify_details_nucleus(self):
   """Same pattern read as the default test, using the NUCLEUS parser type."""
   first, first_line = _get_sample_variant_1(is_for_nucleus=True)
   second, second_line = _get_sample_variant_2(is_for_nucleus=True)
   third, third_line = _get_sample_variant_3(is_for_nucleus=True)
   with TempDir() as tmp:
     # One file holds the first line, the other holds the remaining two.
     self._create_temp_vcf_file(_NUCLEUS_HEADER_LINES + [first_line], tmp)
     two_line_content = _NUCLEUS_HEADER_LINES + [second_line, third_line]
     self._create_temp_vcf_file(two_line_content, tmp)

     pattern = os.path.join(tmp.get_path(), '*.vcf')
     records = self._read_records(
         pattern, vcf_parser_type=VcfParserType.NUCLEUS)
     self.assertEqual(3, len(records))
     self._assert_variants_equal([first, second, third], records)
Ejemplo n.º 12
0
  def test_single_file_1_based_verify_details(self):
    """Reads one sample line with 1-based coordinates and checks the variant."""
    expected = _get_sample_variant_1(use_1_based_coordinate=True)
    with TempDir() as tmp:
      vcf_path = tmp.create_temp_file(
          suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
      source = VcfSource(
          vcf_path,
          representative_header_lines=None,
          sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
          use_1_based_coordinate=True)
      records = source_test_utils.read_from_source(source)

    # The assertions run after the temporary directory block has been left.
    self.assertEqual(1, len(records))
    self.assertEqual(expected, records[0])
Ejemplo n.º 13
0
 def test_file_pattern_1_based_verify_details(self):
   """Reads two files via a `*.vcf` pattern with 1-based coordinates."""
   expected_variants = [
       _get_sample_variant_1(use_1_based_coordinate=True),
       _get_sample_variant_2(use_1_based_coordinate=True),
       _get_sample_variant_3(use_1_based_coordinate=True),
   ]
   with TempDir() as tmp:
     # The returned file names are not needed; the files are found by pattern.
     tmp.create_temp_file(
         suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
     tmp.create_temp_file(
         suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_2, VCF_LINE_3])

     source = VcfSource(
         os.path.join(tmp.get_path(), '*.vcf'),
         representative_header_lines=None,
         sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
         use_1_based_coordinate=True)
     records = source_test_utils.read_from_source(source)
     self.assertEqual(3, len(records))
     self._assert_variants_equal(expected_variants, records)
Ejemplo n.º 14
0
 def test_pipeline_read_all_single_file(self):
     """Checks the record count for one sample file with use_read_all=True."""
     with TempDir() as tmp:
         sample_content = _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES
         vcf_path = self._create_temp_vcf_file(sample_content, tmp)
         # One record is expected per sample text line.
         expected_count = len(_SAMPLE_TEXT_LINES)
         self._assert_pipeline_read_files_record_count_equal(
             vcf_path, expected_count, use_read_all=True)
Ejemplo n.º 15
0
 def _create_temp_file_and_read_records(
         self, lines, representative_header_lines=None):
     """Writes `lines` to a temporary .vcf file and returns the records read.

     Args:
         lines: Passed to `create_temp_file` as the file's lines.
         representative_header_lines: Forwarded to `_read_records`.

     Returns:
         Whatever `_read_records` returns for the temporary file.
     """
     with TempDir() as tmp:
         vcf_path = tmp.create_temp_file(suffix='.vcf', lines=lines)
         records = self._read_records(vcf_path, representative_header_lines)
         return records
Ejemplo n.º 16
0
 def _create_temp_file_and_read_records(self, lines):
   """Writes `lines` to a temporary .vcf file and returns the records read.

   Args:
     lines: Passed to `create_temp_file` as the file's lines.

   Returns:
     Whatever `_read_records` returns for the temporary file.
   """
   with TempDir() as tmp:
     vcf_path = tmp.create_temp_file(suffix='.vcf', lines=lines)
     records = self._read_records(vcf_path)
     return records