コード例 #1
0
 def test_read_single_file_large(self):
     """Checks header entry counts read from each bundled VCF test file.

     Each case names a test-data file together with the expected lengths of
     the ``infos`` and ``formats`` attributes of the first item that
     ``VcfHeaderSource`` yields for that file.
     """
     expectations = [
         {'file': 'valid-4.0.vcf', 'num_infos': 6, 'num_formats': 4},
         {'file': 'valid-4.0.vcf.gz', 'num_infos': 6, 'num_formats': 4},
         {'file': 'valid-4.0.vcf.bz2', 'num_infos': 6, 'num_formats': 4},
         {'file': 'valid-4.1-large.vcf', 'num_infos': 21, 'num_formats': 33},
         {'file': 'valid-4.2.vcf', 'num_infos': 8, 'num_formats': 5},
     ]
     for expectation in expectations:
         full_path = testdata_util.get_full_file_path(expectation['file'])
         headers = source_test_utils.read_from_source(
             VcfHeaderSource(full_path))
         first_header = headers[0]
         self.assertEqual(expectation['num_infos'],
                          len(first_header.infos))
         self.assertEqual(expectation['num_formats'],
                          len(first_header.formats))
コード例 #2
0
 def _create_file_and_read_headers(self):
     """Writes ``self.lines`` to a temporary ``.vcf`` file and reads it back.

     Returns:
       The first item obtained by reading the temporary file through
       ``VcfHeaderSource``.
     """
     with temp_dir.TempDir() as scratch:
         vcf_path = scratch.create_temp_file(suffix='.vcf',
                                             lines=self.lines)
         source = VcfHeaderSource(vcf_path)
         return source_test_utils.read_from_source(source)[0]
コード例 #3
0
 def _read_records(self, file_or_pattern, representative_header_lines=None,
                   vcf_parser_type=VcfParserType.PYVCF, **kwargs):
   """Reads everything from a ``VcfSource`` built from the arguments.

   Args:
     file_or_pattern: Passed to ``VcfSource`` as its first argument.
     representative_header_lines: Forwarded to ``VcfSource`` unchanged.
     vcf_parser_type: Forwarded to ``VcfSource`` unchanged.
     **kwargs: Any further keyword arguments for ``VcfSource``.

   Returns:
     Whatever ``source_test_utils.read_from_source`` returns for the source.
   """
   source = VcfSource(
       file_or_pattern,
       representative_header_lines=representative_header_lines,
       vcf_parser_type=vcf_parser_type,
       **kwargs)
   return source_test_utils.read_from_source(source)
コード例 #4
0
 def _read_records(self, file_or_pattern, representative_header_lines=None,
                   sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
                   **kwargs):
   """Reads everything from a ``VcfSource`` built from the arguments.

   Args:
     file_or_pattern: Passed to ``VcfSource`` as its first argument.
     representative_header_lines: Forwarded to ``VcfSource`` unchanged.
     sample_name_encoding: Forwarded to ``VcfSource`` unchanged.
     **kwargs: Any further keyword arguments for ``VcfSource``.

   Returns:
     Whatever ``source_test_utils.read_from_source`` returns for the source.
   """
   source = VcfSource(
       file_or_pattern,
       representative_header_lines=representative_header_lines,
       sample_name_encoding=sample_name_encoding,
       **kwargs)
   return source_test_utils.read_from_source(source)
コード例 #5
0
 def _read_records(self, file_or_pattern, representative_header_lines=None,
                   **kwargs):
     """Reads everything from a ``VcfSource`` built from the arguments.

     Args:
       file_or_pattern: Passed to ``VcfSource`` as its first argument.
       representative_header_lines: Forwarded to ``VcfSource`` unchanged.
       **kwargs: Any further keyword arguments for ``VcfSource``.

     Returns:
       Whatever ``source_test_utils.read_from_source`` returns for the
       source.
     """
     source = VcfSource(
         file_or_pattern,
         representative_header_lines=representative_header_lines,
         **kwargs)
     return source_test_utils.read_from_source(source)
コード例 #6
0
ファイル: avroio_test.py プロジェクト: zhoufek/beam
  def test_corrupted_file(self):
    """Expects a ValueError when the last byte of an Avro file is altered.

    The error raised while reading must match ``expected sync marker``.
    """
    original_path = self._write_data()
    with open(original_path, 'rb') as original:
      payload = bytearray(original.read())

    # The final byte of the file is also the final byte of the last
    # sync_marker, so bumping it (modulo 256) damages that marker.
    # https://avro.apache.org/docs/current/spec.html#Object+Container+Files
    payload[-1] = (payload[-1] + 1) % 256

    with tempfile.NamedTemporaryFile(delete=False,
                                     prefix=tempfile.template) as corrupted:
      corrupted.write(payload)
      corrupted_path = corrupted.name

    corrupted_source = _create_avro_source(corrupted_path)
    with self.assertRaisesRegex(ValueError, r'expected sync marker'):
      source_test_utils.read_from_source(corrupted_source, None, None)
コード例 #7
0
  def test_read_after_splitting_skip_header(self):
    """Compares split and unsplit reads when two header lines are skipped.

    The unsplit read must equal the written data minus its first two
    entries, and concatenating the reads of all splits must equal the
    unsplit read.
    """
    path, written_lines = write_data(100)
    assert len(written_lines) == 100
    whole_source = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                              coders.StrUtf8Coder(), skip_header_lines=2)

    bundles = [
        (bundle.source, bundle.start_position, bundle.stop_position)
        for bundle in whole_source.split(desired_bundle_size=33)
    ]
    # A single bundle would make the comparison below trivial.
    self.assertGreater(len(bundles), 1)

    unsplit_lines = source_test_utils.read_from_source(
        whole_source, None, None)
    lines_from_bundles = []
    for bundle_source, start, stop in bundles:
      lines_from_bundles.extend(
          source_test_utils.read_from_source(bundle_source, start, stop))

    self.assertEqual(written_lines[2:], unsplit_lines)
    self.assertEqual(unsplit_lines, lines_from_bundles)
コード例 #8
0
ファイル: textio_test.py プロジェクト: dpmills/incubator-beam
  def test_read_after_splitting_skip_header(self):
    """Verifies reads over splits agree with one read over the full source.

    Two header lines are skipped, so the full read is expected to equal
    the written data without its first two entries.
    """
    data_path, all_lines = write_data(100)
    assert len(all_lines) == 100
    full_source = TextSource(data_path, 0, CompressionTypes.UNCOMPRESSED,
                             True, coders.StrUtf8Coder(),
                             skip_header_lines=2)

    pieces = []
    for piece in list(full_source.split(desired_bundle_size=33)):
      pieces.append((piece.source, piece.start_position, piece.stop_position))
    # More than one piece is required for the test to be meaningful.
    self.assertGreater(len(pieces), 1)

    lines_from_full = source_test_utils.read_from_source(
        full_source, None, None)
    lines_from_pieces = []
    for piece_info in pieces:
      lines_from_pieces.extend(
          source_test_utils.read_from_source(*piece_info))

    self.assertEqual(all_lines[2:], lines_from_full)
    self.assertEqual(lines_from_full, lines_from_pieces)
コード例 #9
0
ファイル: avroio_test.py プロジェクト: gyamxxx/beam
  def test_corrupted_file(self):
    """Reading an Avro file with a damaged final sync marker must fail.

    Writes a file via ``self._write_data()``, replaces its last byte, stores
    the result in a new temporary file and expects ``ValueError`` mentioning
    'Unexpected sync marker' when that file is read through ``AvroSource``.
    """
    file_name = self._write_data()
    with open(file_name, 'rb') as f:
      data = f.read()

    # Corrupt the last character of the file which is also the last character of
    # the last sync_marker. A bytearray is used so the replacement works
    # whether binary reads give a Python 2 str or Python 3 bytes.
    corrupted_data = bytearray(data)
    corrupted_data[-1] = (
        ord('A') if corrupted_data[-1] == ord('B') else ord('B'))
    with tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template) as f:
      f.write(corrupted_data)
      corrupted_file_name = f.name

    source = AvroSource(corrupted_file_name)
    with self.assertRaises(ValueError) as exn:
      source_test_utils.read_from_source(source, None, None)
    # The message check must sit outside the ``with`` block: statements placed
    # after the raising call inside the block are never executed.
    self.assertIn('Unexpected sync marker', str(exn.exception))
コード例 #10
0
ファイル: avroio_test.py プロジェクト: JavierRoger/beam
  def test_corrupted_file(self):
    """Reading an Avro file with a damaged final sync marker must fail.

    Writes a file via ``self._write_data()``, replaces its last byte, stores
    the result in a new temporary file and expects ``ValueError`` mentioning
    'Unexpected sync marker' when that file is read through ``AvroSource``.
    """
    file_name = self._write_data()
    with open(file_name, 'rb') as f:
      data = f.read()

    # Corrupt the last character of the file which is also the last character of
    # the last sync_marker. A bytearray is used so the replacement works
    # whether binary reads give a Python 2 str or Python 3 bytes.
    corrupted_data = bytearray(data)
    corrupted_data[-1] = (
        ord('A') if corrupted_data[-1] == ord('B') else ord('B'))
    with tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template) as f:
      f.write(corrupted_data)
      corrupted_file_name = f.name

    source = AvroSource(corrupted_file_name)
    with self.assertRaises(ValueError) as exn:
      source_test_utils.read_from_source(source, None, None)
    # The message check must sit outside the ``with`` block: statements placed
    # after the raising call inside the block are never executed.
    self.assertIn('Unexpected sync marker', str(exn.exception))
コード例 #11
0
 def test_read_after_splitting(self):
   """Reads 'valid-4.1-large.vcf' split by split and checks the total.

   The source is split with a desired bundle size of 500, more than one
   split is required, and 9882 records are expected across all splits.
   """
   source = VcfSource(get_full_file_path('valid-4.1-large.vcf'))
   splits = list(source.split(desired_bundle_size=500))
   self.assertGreater(len(splits), 1)
   sources_info = []
   for split in splits:
     sources_info.append(
         (split.source, split.start_position, split.stop_position))
   self.assertGreater(len(sources_info), 1)
   split_records = []
   for split_source, start, stop in sources_info:
     split_records.extend(
         source_test_utils.read_from_source(split_source, start, stop))
   self.assertEqual(9882, len(split_records))
コード例 #12
0
 def test_read_after_splitting(self):
     """Reads 'valid-4.1-large.vcf' split by split and checks the total.

     The source is split with a desired bundle size of 500, more than one
     split is required, and 9882 records are expected across all splits.
     """
     large_vcf = testdata_util.get_full_file_path('valid-4.1-large.vcf')
     splits = list(VcfSource(large_vcf).split(desired_bundle_size=500))
     self.assertGreater(len(splits), 1)
     sources_info = []
     for split in splits:
         sources_info.append(
             (split.source, split.start_position, split.stop_position))
     self.assertGreater(len(sources_info), 1)
     split_records = []
     for split_source, start, stop in sources_info:
         split_records.extend(
             source_test_utils.read_from_source(split_source, start, stop))
     self.assertEqual(9882, len(split_records))
コード例 #13
0
  def test_single_file_1_based_verify_details(self):
    """Reads one VCF line with ``use_1_based_coordinate=True`` and compares.

    The single record read back must equal the sample variant built with
    the same ``use_1_based_coordinate`` setting.
    """
    expected_variant = _get_sample_variant_1(use_1_based_coordinate=True)
    with TempDir() as tempdir:
      vcf_path = tempdir.create_temp_file(
          suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
      source = VcfSource(
          vcf_path,
          representative_header_lines=None,
          sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
          use_1_based_coordinate=True)
      read_data = source_test_utils.read_from_source(source)

    self.assertEqual(1, len(read_data))
    self.assertEqual(expected_variant, read_data[0])
コード例 #14
0
 def test_file_pattern_1_based_verify_details(self):
   """Reads two temp VCF files via a '*.vcf' pattern with 1-based coordinates.

   Three records are expected in total, and they are compared against the
   three sample variants using ``self._assert_variants_equal``.
   """
   expected_variants = [
       _get_sample_variant_1(use_1_based_coordinate=True),
       _get_sample_variant_2(use_1_based_coordinate=True),
       _get_sample_variant_3(use_1_based_coordinate=True),
   ]
   with TempDir() as tempdir:
     # The returned file names are not needed; the files are matched by
     # the glob pattern below.
     tempdir.create_temp_file(
         suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
     tempdir.create_temp_file(
         suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_2, VCF_LINE_3])
     pattern = os.path.join(tempdir.get_path(), '*.vcf')
     source = VcfSource(
         pattern,
         representative_header_lines=None,
         sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
         use_1_based_coordinate=True)
     read_data = source_test_utils.read_from_source(source)
     self.assertEqual(3, len(read_data))
     self._assert_variants_equal(expected_variants, read_data)
コード例 #15
0
  def _run_parquet_test(self, pattern, columns, desired_bundle_size,
                        perform_splitting, expected_result):
    """Reads a parquet source directly or checks its splits.

    Args:
      pattern: Passed to ``_create_parquet_source``.
      columns: Passed to ``_create_parquet_source`` as ``columns``.
      desired_bundle_size: Bundle size used for splitting; must be truthy
        when ``perform_splitting`` is set.
      perform_splitting: If truthy, the splits are compared with the unsplit
        source; otherwise the records read are compared with
        ``expected_result``.
      expected_result: Expected records; only used when not splitting.

    Raises:
      ValueError: If splitting produces fewer than two splits.
    """
    source = _create_parquet_source(pattern, columns=columns)
    if not perform_splitting:
      read_records = source_test_utils.read_from_source(source, None, None)
      self.assertCountEqual(expected_result, read_records)
      return

    assert desired_bundle_size
    sources_info = []
    for split in source.split(desired_bundle_size=desired_bundle_size):
      sources_info.append(
          (split.source, split.start_position, split.stop_position))
    if len(sources_info) < 2:
      raise ValueError('Test is trivial. Please adjust it so that at least '
                       'two splits get generated')

    source_test_utils.assert_sources_equal_reference_source(
        (source, None, None), sources_info)
コード例 #16
0
  def test_read_file_pattern(self):
    """Reads three temp header files through a '*.vcf' pattern.

    Each file gets a different selection of ``self.lines`` plus the last
    line; the result is checked with ``asserts.header_vars_equal`` against
    headers built from the same lines and file names.
    """
    with temp_dir.TempDir() as tempdir:
      last_line = self.lines[-1]
      header_groups = [
          [self.lines[1], last_line],
          [self.lines[2], self.lines[3], last_line],
          [self.lines[4], last_line],
      ]
      file_names = [
          tempdir.create_temp_file(suffix='.vcf', lines=group)
          for group in header_groups
      ]

      pattern = os.path.join(tempdir.get_path(), '*.vcf')
      actual = source_test_utils.read_from_source(VcfHeaderSource(pattern))

      expected = []
      for group, file_name in zip(header_groups, file_names):
        expected.append(
            _get_vcf_header_from_lines(group, file_name=file_name))

      asserts.header_vars_equal(expected)(actual)
コード例 #17
0
 def test_read_single_file_large(self):
     """Checks the estimate read from each bundled VCF test file.

     Each case names a test-data file together with the expected
     ``estimated_variant_count`` (truncated with ``int``) and
     ``size_in_bytes`` of the first item ``VcfEstimateSource`` yields.
     """
     expectations = [
         {'file': 'valid-4.0.vcf', 'variant_count': 4, 'size': 1500},
         {'file': 'valid-4.0.vcf.gz', 'variant_count': 13, 'size': 1454},
         {'file': 'valid-4.0.vcf.bz2', 'variant_count': 14, 'size': 1562},
         {
             'file': 'valid-4.1-large.vcf',
             'variant_count': 14425,
             'size': 832396,
         },
         {
             'file': 'valid-4.1-large.vcf.gz',
             'variant_count': 5498,
             'size': 313430,
         },
         {'file': 'valid-4.2.vcf', 'variant_count': 10, 'size': 3195},
     ]
     for expectation in expectations:
         full_path = testdata_util.get_full_file_path(expectation['file'])
         estimates = source_test_utils.read_from_source(
             VcfEstimateSource(full_path))
         first_estimate = estimates[0]
         self.assertEqual(expectation['variant_count'],
                          int(first_estimate.estimated_variant_count))
         self.assertEqual(expectation['size'], first_estimate.size_in_bytes)
コード例 #18
0
ファイル: avroio_test.py プロジェクト: Chet-Sheng/apache-beam
    def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                       expected_result):
        """Reads an Avro source directly or checks its splits.

        Args:
          pattern: Passed to ``_create_avro_source``.
          desired_bundle_size: Bundle size used for splitting; must be
            truthy when ``perform_splitting`` is set.
          perform_splitting: If truthy, the splits are compared with the
            unsplit source; otherwise the records read are compared with
            ``expected_result`` via ``assertItemsEqual``.
          expected_result: Expected records; only used when not splitting.

        Raises:
          ValueError: If splitting produces fewer than two splits.
        """
        source = _create_avro_source(pattern, use_fastavro=self.use_fastavro)

        if not perform_splitting:
            read_records = source_test_utils.read_from_source(
                source, None, None)
            self.assertItemsEqual(expected_result, read_records)
            return

        assert desired_bundle_size
        splits = list(source.split(desired_bundle_size=desired_bundle_size))
        if len(splits) < 2:
            raise ValueError(
                'Test is trivial. Please adjust it so that at least '
                'two splits get generated')

        sources_info = []
        for split in splits:
            sources_info.append(
                (split.source, split.start_position, split.stop_position))
        source_test_utils.assert_sources_equal_reference_source(
            (source, None, None), sources_info)
コード例 #19
0
    def test_read_file_pattern(self):
        """Reads three temp VCF files through a '*.vcf' pattern.

        Each file combines a slice of ``self.headers``, the last header and
        a slice of ``self.records``; the result is checked with
        ``asserts.header_vars_equal`` against estimates built from the same
        lines and file names.
        """
        with temp_dir.TempDir() as tempdir:
            final_header = self.headers[-1:]
            per_file_lines = [
                self.headers[1:2] + final_header + self.records[:2],
                self.headers[2:4] + final_header + self.records[2:4],
                self.headers[4:5] + final_header + self.records[4:],
            ]
            file_names = [
                tempdir.create_temp_file(suffix='.vcf', lines=file_lines)
                for file_lines in per_file_lines
            ]

            pattern = os.path.join(tempdir.get_path(), '*.vcf')
            actual = source_test_utils.read_from_source(
                VcfEstimateSource(pattern))

            expected = []
            for file_lines, file_name in zip(per_file_lines, file_names):
                expected.append(
                    _get_estimate_from_lines(file_lines,
                                             file_name=file_name))

            asserts.header_vars_equal(expected)(actual)
コード例 #20
0
ファイル: avroio_test.py プロジェクト: JavierRoger/beam
  def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                     expected_result):
    """Reads an ``AvroSource`` directly or checks its splits.

    Args:
      pattern: Passed to ``AvroSource``.
      desired_bundle_size: Bundle size used for splitting; must be truthy
        when ``perform_splitting`` is set.
      perform_splitting: If truthy, the splits are compared with the unsplit
        source; otherwise the records read are compared with
        ``expected_result`` via ``assertItemsEqual``.
      expected_result: Expected records; only used when not splitting.

    Raises:
      ValueError: If splitting produces fewer than two splits.
    """
    source = AvroSource(pattern)

    if not perform_splitting:
      read_records = source_test_utils.read_from_source(source, None, None)
      self.assertItemsEqual(expected_result, read_records)
      return

    assert desired_bundle_size
    splits = list(source.split(desired_bundle_size=desired_bundle_size))
    if len(splits) < 2:
      raise ValueError('Test is trivial. Please adjust it so that at least '
                       'two splits get generated')

    sources_info = []
    for split in splits:
      sources_info.append(
          (split.source, split.start_position, split.stop_position))
    source_test_utils.assert_sources_equal_reference_source(
        (source, None, None), sources_info)
コード例 #21
0
 def test_read_from_source(self):
   """Reads back a source built from 100 generated items and compares.

   The data comes from ``self._create_data(100)`` and is compared with the
   read result via ``assertItemsEqual``.
   """
   expected = self._create_data(100)
   source = self._create_source(expected)
   actual = source_test_utils.read_from_source(source, None, None)
   self.assertItemsEqual(expected, actual)
コード例 #22
0
ファイル: create_test.py プロジェクト: Sil1991/gcpdf-demo
 def check_read(self, values, coder):
     """Asserts that reading a Create source returns ``values``, sorted.

     Args:
       values: Items handed to ``Create._create_source_from_iterable``.
       coder: Coder handed to ``Create._create_source_from_iterable``.
     """
     actual = source_test_utils.read_from_source(
         Create._create_source_from_iterable(values, coder))
     # Both sides are sorted before the comparison.
     expected_sorted = sorted(values)
     self.assertEqual(expected_sorted, sorted(actual))
コード例 #23
0
ファイル: create_test.py プロジェクト: vikkyrk/incubator-beam
 def check_read(self, values, coder):
   """Asserts that reading a Create source returns ``values``, sorted.

   Args:
     values: Items handed to ``Create._create_source_from_iterable``.
     coder: Coder handed to ``Create._create_source_from_iterable``.
   """
   actual = source_test_utils.read_from_source(
       Create._create_source_from_iterable(values, coder))
   # Both sides are sorted before the comparison.
   expected_sorted = sorted(values)
   self.assertEqual(expected_sorted, sorted(actual))
コード例 #24
0
 def test_read_from_source(self):
     """Reads back a source built from 100 generated items and compares.

     The data comes from ``self._create_data(100)`` and is compared with
     the read result via ``assertItemsEqual``.
     """
     expected = self._create_data(100)
     source = self._create_source(expected)
     actual = source_test_utils.read_from_source(source, None, None)
     self.assertItemsEqual(expected, actual)
コード例 #25
0
 def _read_records(self, file_or_pattern, **kwargs):
   """Reads everything from ``VcfSource(file_or_pattern, **kwargs)``.

   Returns:
     Whatever ``source_test_utils.read_from_source`` returns for the source.
   """
   source = VcfSource(file_or_pattern, **kwargs)
   return source_test_utils.read_from_source(source)
コード例 #26
0
ファイル: vcfio_test.py プロジェクト: wscheep/beam
 def _read_records(self, file_or_pattern, **kwargs):
     """Reads everything from ``VcfSource(file_or_pattern, **kwargs)``.

     Returns:
       Whatever ``source_test_utils.read_from_source`` returns for the
       source.
     """
     source = VcfSource(file_or_pattern, **kwargs)
     return source_test_utils.read_from_source(source)