Example #1
  def test_pipeline_read_all_multiple_files_large(self):
    # Read three sample VCF files (v4.0, v4.1 and v4.2) with ReadAllFromVcf
    # and verify the total record count across all of them.
    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [testdata_util.get_full_file_path('valid-4.0.vcf'),
                  testdata_util.get_full_file_path('valid-4.1-large.vcf'),
                  testdata_util.get_full_file_path('valid-4.2.vcf')])
             | 'Read' >> ReadAllFromVcf())
    assert_that(pcoll, asserts.count_equals_to(9900))
    pipeline.run()
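These snippets are methods lifted out of a larger VCF I/O test module, so their imports are not shown. The following is a rough sketch of the setup they rely on: the apache_beam paths are standard Beam testing utilities, while the gcp_variant_transforms module paths are assumptions inferred from how the names are used here.

# Approximate imports for the snippets on this page. The gcp_variant_transforms
# paths below are assumptions; only the apache_beam paths are standard.
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that

from gcp_variant_transforms.beam_io.vcfio import ReadAllFromVcf, ReadFromVcf
from gcp_variant_transforms.testing import asserts
from gcp_variant_transforms.testing import testdata_util
from gcp_variant_transforms.testing.temp_dir import TempDir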
Example #2
  def test_pipeline_read_all_multiple_files(self):
    # Write the same sample header and record lines to two temporary VCF
    # files, read both with ReadAllFromVcf and check the combined count.
    with TempDir() as tempdir:
      file_name_1 = self._create_temp_vcf_file(
          _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
      file_name_2 = self._create_temp_vcf_file(
          _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
      pipeline = TestPipeline()
      pcoll = (pipeline
               | 'Create' >> beam.Create([file_name_1, file_name_2])
               | 'Read' >> ReadAllFromVcf())
      assert_that(pcoll, asserts.count_equals_to(2 * len(_SAMPLE_TEXT_LINES)))
      pipeline.run()
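The module-level fixtures _SAMPLE_HEADER_LINES and _SAMPLE_TEXT_LINES are not shown in this excerpt. Purely to illustrate why the expected count is 2 * len(_SAMPLE_TEXT_LINES) (each file receives every header line plus every record line, so each file contributes len(_SAMPLE_TEXT_LINES) records), here is a hypothetical sketch of what such fixtures could look like:

# Hypothetical fixture values; the real test module's fixtures may differ.
_SAMPLE_HEADER_LINES = [
    '##fileformat=VCFv4.2\n',
    '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples">\n',
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
    '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\n',
]
_SAMPLE_TEXT_LINES = [
    '20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3\tGT\t0|0\n',
    '20\t17330\t.\tT\tA\t3\tq10\tNS=3\tGT\t0|1\n',
]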
Example #3
  def _assert_pipeline_read_files_record_count_equal(
      self, input_pattern, expected_count, use_read_all=False):
    """Helper method for verifying total records read.

    Args:
      input_pattern (str): Input file pattern to read.
      expected_count (int): Expected number of records to be read.
      use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
        instead of ReadFromVcf.
    """
    pipeline = TestPipeline()
    if use_read_all:
      pcoll = (pipeline
               | 'Create' >> beam.Create([input_pattern])
               | 'Read' >> ReadAllFromVcf())
    else:
      pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
    assert_that(pcoll, asserts.count_equals_to(expected_count))
    pipeline.run()
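As a usage illustration, a test built on this helper could exercise both code paths. The file name comes from Example #1, but the expected count below is hypothetical, not a value taken from the test suite:

  # Hypothetical caller of the helper above; the record count is
  # illustrative only.
  def test_read_single_file(self):
    input_pattern = testdata_util.get_full_file_path('valid-4.0.vcf')
    # Classic source-based read via ReadFromVcf.
    self._assert_pipeline_read_files_record_count_equal(input_pattern, 5)
    # Scalable read of the same pattern via ReadAllFromVcf.
    self._assert_pipeline_read_files_record_count_equal(
        input_pattern, 5, use_read_all=True)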