Example #1
  def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.bz2')
      with bz2.BZ2File(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(file_name)
      assert_that(pcoll, equal_to(lines))
      pipeline.run()
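These tests rely on a write_data helper from Beam's textio test module that is not shown on this page; only the second element of its return value is used here. A minimal, hypothetical stand-in consistent with that usage:

import tempfile

def write_data(num_lines):
  # Hypothetical stand-in: generate predictable lines, write them to a
  # throwaway file, and return (file_name, lines) as the tests expect.
  lines = ['line%d' % i for i in range(num_lines)]
  with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
    f.write('\n'.join(lines))
  return f.name, lines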
Example #2
 def test_process_auto(self):
     with TempDir() as temp_dir:
         path = temp_dir.create_temp_file('result.gz')
         _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
         with TestPipeline() as p:
             result = (p
                       | Create([path])
                       | ReadAllFromTFRecord(
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.AUTO))
             assert_that(result, equal_to([b'foo', b'bar']))
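The _write_file_gzip helper and the FOO_BAR_RECORD_BASE64 constant come from Beam's tfrecordio test module and are not shown here. Assuming the constant holds the base64-encoded bytes of a TFRecord file containing the records b'foo' and b'bar' (Example #16 decodes it the same way), a sketch of the helper:

import binascii
import gzip

def _write_file_gzip(path, base64_records):
  # Hypothetical sketch: decode the base64 TFRecord bytes and write them
  # gzip-compressed so CompressionTypes.AUTO or GZIP can read them back.
  with gzip.GzipFile(path, 'wb') as f:
    f.write(binascii.a2b_base64(base64_records))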
Example #3
 def test_process_glob(self):
     with TempDir() as temp_dir:
         self._write_glob(temp_dir, 'result')
         glob = temp_dir.get_path() + os.path.sep + '*result'
         with TestPipeline() as p:
             result = (p
                       | Create([glob])
                       | ReadAllFromTFRecord(
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.AUTO))
             assert_that(result, equal_to([b'foo', b'bar'] * 3))
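_write_glob is another helper omitted from this page. Since the test expects the foo/bar pair three times per glob, a plausible sketch writes three matching files:

def _write_glob(self, temp_dir, suffix):
  # Hypothetical sketch: create three files whose names end with `suffix`,
  # each holding the foo/bar records, so '*<suffix>' matches all three.
  for _ in range(3):
    path = temp_dir.create_temp_file(suffix)
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)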
Example #4
    def test_read_bzip2(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with bz2.BZ2File(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with TestPipeline() as pipeline:
                pcoll = pipeline | 'Read' >> ReadFromText(
                    file_name, compression_type=CompressionTypes.BZIP2)
                assert_that(pcoll, equal_to(lines))
Example #5
    def test_read_auto_gzip(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file(suffix='.gz')

            with gzip.GzipFile(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with TestPipeline() as pipeline:
                pcoll = pipeline | 'Read' >> ReadFromText(file_name)
                assert_that(pcoll, equal_to(lines))
Example #6
  def test_read_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
Example #7
 def test_read_all_gzip(self):
   _, lines = write_data(100)
   with TempDir() as tempdir:
     file_name = tempdir.create_temp_file()
     with gzip.GzipFile(file_name, 'wb') as f:
       f.write('\n'.join(lines).encode('utf-8'))
     with TestPipeline() as pipeline:
       pcoll = (pipeline
                | Create([file_name])
                | 'ReadAll' >> ReadAllFromText(
                    compression_type=CompressionTypes.GZIP))
       assert_that(pcoll, equal_to(lines))
Example #8
 def test_process_gzip(self):
   with TempDir() as temp_dir:
     path = temp_dir.create_temp_file('result')
     _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
       result = (p
                 | ReadFromTFRecord(
                     path,
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.GZIP,
                     validate=True))
       assert_that(result, equal_to([b'foo', b'bar']))
Example #9
 def test_read_reentrant_after_splitting(self):
     with TempDir() as tempdir:
         file_name = self._create_temp_vcf_file(
             _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
         source = VcfSource(file_name)
         splits = [
             split for split in source.split(desired_bundle_size=100000)
         ]
         assert len(splits) == 1
         source_test_utils.assert_reentrant_reads_succeed(
             (splits[0].source, splits[0].start_position,
              splits[0].stop_position))
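_create_temp_vcf_file is defined elsewhere in the VCF test class; Example #29 below uses the same underlying call. A one-line sketch consistent with that example:

def _create_temp_vcf_file(self, lines, tempdir):
  # Sketch based on Example #29: write the header and record lines into a
  # .vcf file inside the TempDir and return its path.
  return tempdir.create_temp_file(suffix='.vcf', lines=lines)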
Example #10
 def test_dynamic_work_rebalancing(self):
     with TempDir() as tempdir:
         file_name = self._create_temp_vcf_file(
             _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
         source = VcfSource(file_name)
         splits = [
             split for split in source.split(desired_bundle_size=100000)
         ]
         assert len(splits) == 1
         source_test_utils.assert_split_at_fraction_exhaustive(
             splits[0].source, splits[0].start_position,
             splits[0].stop_position)
Example #11
 def test_file_pattern_verify_details(self):
   variant_1, vcf_line_1 = self._get_sample_variant_1()
   variant_2, vcf_line_2 = self._get_sample_variant_2()
   variant_3, vcf_line_3 = self._get_sample_variant_3()
   with TempDir() as tempdir:
     self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + [vcf_line_1], tempdir)
     self._create_temp_vcf_file((_SAMPLE_HEADER_LINES +
                                 [vcf_line_2, vcf_line_3]),
                                tempdir)
     read_data = self._read_records(os.path.join(tempdir.get_path(), '*.vcf'))
     self.assertEqual(3, len(read_data))
     self._assert_variants_equal([variant_1, variant_2, variant_3], read_data)
Example #12
  def test_end2end(self):
    with TempDir() as temp_dir:
      file_path_prefix = temp_dir.create_temp_file('result')

      # Generate a TFRecord file.
      with TestPipeline() as p:
        expected_data = [self.create_inputs() for _ in range(0, 10)]
        _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

      # Read the file back and compare.
      with TestPipeline() as p:
        actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
        assert_that(actual_data, equal_to(expected_data))
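create_inputs is also omitted; the test only needs it to produce payloads the default bytes coder can round-trip. A hypothetical stand-in:

import os

def create_inputs(self):
  # Hypothetical: any bytes payload works, since WriteToTFRecord and
  # ReadFromTFRecord default to a bytes coder.
  return os.urandom(10)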
Example #13
    def test_read_deflate(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with open(file_name, 'wb') as f:
                f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

            pipeline = TestPipeline()
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, 0, CompressionTypes.DEFLATE, True,
                coders.StrUtf8Coder())
            assert_that(pcoll, equal_to(lines))
            pipeline.run()
Example #14
  def test_read_gzip_with_skip_lines(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(
          file_name, 0, CompressionTypes.GZIP,
          True, coders.StrUtf8Coder(), skip_header_lines=2)
      assert_that(pcoll, equal_to(lines[2:]))
      pipeline.run()
Example #15
  def test_end2end_read_write_read(self):
    with TempDir() as temp_dir:
      path = temp_dir.create_temp_file('result')
      with TestPipeline() as p:
        # Initial read to validate the pipeline doesn't fail before the file is
        # created.
        _ = p | ReadFromTFRecord(path + '-*', validate=False)
        expected_data = [self.create_inputs() for _ in range(0, 10)]
        _ = p | beam.Create(expected_data) | WriteToTFRecord(
            path, file_name_suffix='.gz')

      # Read the file back and compare.
      with TestPipeline() as p:
        actual_data = p | ReadFromTFRecord(path + '-*', validate=True)
        assert_that(actual_data, equal_to(expected_data))
Example #16
  def test_write_record_multiple(self):
    with TempDir() as temp_dir:
      path = temp_dir.create_temp_file('result')
      record = binascii.a2b_base64(FOO_BAR_RECORD_BASE64)
      sink = _TFRecordSink(
          path,
          coder=coders.BytesCoder(),
          file_name_suffix='',
          num_shards=0,
          shard_name_template=None,
          compression_type=CompressionTypes.UNCOMPRESSED)
      self._write_lines(sink, path, [b'foo', b'bar'])

      with open(path, 'rb') as f:
        self.assertEqual(f.read(), record)
Example #17
  def test_write_record_auto(self):
    with TempDir() as temp_dir:
      file_path_prefix = temp_dir.create_temp_file('result')
      with TestPipeline() as p:
        input_data = [b'foo', b'bar']
        _ = p | beam.Create(input_data) | WriteToTFRecord(
            file_path_prefix, file_name_suffix='.gz')

      actual = []
      file_name = glob.glob(file_path_prefix + '-*.gz')[0]
      for r in tf.python_io.tf_record_iterator(
          file_name, options=tf.python_io.TFRecordOptions(
              tf.python_io.TFRecordCompressionType.GZIP)):
        actual.append(r)
      self.assertEqual(actual, input_data)
Example #18
  def test_process_multiple_globs(self):
    with TempDir() as temp_dir:
      globs = []
      for i in range(3):
        suffix = 'result' + str(i)
        self._write_glob(temp_dir, suffix)
        globs.append(temp_dir.get_path() + os.path.sep + '*' + suffix)

      with TestPipeline() as p:
        result = (p
                  | Create(globs)
                  | ReadAllFromTFRecord(
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.AUTO))
        assert_that(result, equal_to([b'foo', b'bar'] * 9))
Example #19
    def test_read_corrupted_deflate_fails(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with open(file_name, 'wb') as f:
                f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

            with open(file_name, 'wb') as f:
                f.write(b'corrupt')

            with self.assertRaises(Exception):
                with TestPipeline() as pipeline:
                    pcoll = pipeline | 'Read' >> ReadFromText(
                        file_name, 0, CompressionTypes.DEFLATE, True,
                        coders.StrUtf8Coder())
                    assert_that(pcoll, equal_to(lines))
Example #20
    def test_read_corrupted_bzip2_fails(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with bz2.BZ2File(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with open(file_name, 'wb') as f:
                f.write(b'corrupt')

            pipeline = TestPipeline()
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, compression_type=CompressionTypes.BZIP2)
            assert_that(pcoll, equal_to(lines))
            with self.assertRaises(Exception):
                pipeline.run()
Example #21
    def test_read_corrupted_gzip_fails(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with gzip.GzipFile(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with open(file_name, 'wb') as f:
                f.write(b'corrupt')

            pipeline = TestPipeline()
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, 0, CompressionTypes.GZIP, True,
                coders.StrUtf8Coder())
            assert_that(pcoll, equal_to(lines))

            with self.assertRaises(Exception):
                pipeline.run()
Example #22
  def test_end2end_example_proto(self):
    with TempDir() as temp_dir:
      file_path_prefix = temp_dir.create_temp_file('result')

      example = tf.train.Example()
      example.features.feature['int'].int64_list.value.extend(list(range(3)))
      example.features.feature['bytes'].bytes_list.value.extend(
          [b'foo', b'bar'])

      with TestPipeline() as p:
        _ = p | beam.Create([example]) | WriteToTFRecord(
            file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

      # Read the file back and compare.
      with TestPipeline() as p:
        actual_data = (p | ReadFromTFRecord(
            file_path_prefix + '-*',
            coder=beam.coders.ProtoCoder(example.__class__)))
        assert_that(actual_data, equal_to([example]))
Example #23
def test_pubsub_to_gcs():
    PubSubToGCS.run(
        input_topic="unused",  # mocked by TestStream
        output_path="gs://{}/pubsub/{}/output".format(BUCKET, UUID),
        window_size=1,  # 1 minute
        pipeline_args=[
            "--project",
            PROJECT,
            "--temp_location",
            TempDir().get_path(),
        ],
    )

    # Check for output files on GCS.
    gcs_client = beam.io.gcp.gcsio.GcsIO()
    files = gcs_client.list_prefix("gs://{}/pubsub/{}".format(BUCKET, UUID))
    assert len(files) > 0

    # Clean up.
    gcs_client.delete_batch(list(files))
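This integration test assumes module-level constants for the GCP project, bucket, and a unique run id, along these lines (all values hypothetical):

import uuid

PROJECT = "your-gcp-project"  # placeholder project id
BUCKET = "your-gcs-bucket"    # placeholder bucket name
UUID = uuid.uuid4().hex       # unique suffix isolating this run's output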
Example #24
  def test_read_bzip2_concat(self):
    with TempDir() as tempdir:
      bzip2_file_name1 = tempdir.create_temp_file()
      lines = ['a', 'b', 'c']
      with bz2.BZ2File(bzip2_file_name1, 'wb') as dst:
        data = '\n'.join(lines) + '\n'
        dst.write(data.encode('utf-8'))

      bzip2_file_name2 = tempdir.create_temp_file()
      lines = ['p', 'q', 'r']
      with bz2.BZ2File(bzip2_file_name2, 'wb') as dst:
        data = '\n'.join(lines) + '\n'
        dst.write(data.encode('utf-8'))

      bzip2_file_name3 = tempdir.create_temp_file()
      lines = ['x', 'y', 'z']
      with bz2.BZ2File(bzip2_file_name3, 'wb') as dst:
        data = '\n'.join(lines) + '\n'
        dst.write(data.encode('utf-8'))

      final_bzip2_file = tempdir.create_temp_file()
      with open(bzip2_file_name1, 'rb') as src, open(
          final_bzip2_file, 'wb') as dst:
        dst.writelines(src.readlines())

      with open(bzip2_file_name2, 'rb') as src, open(
          final_bzip2_file, 'ab') as dst:
        dst.writelines(src.readlines())

      with open(bzip2_file_name3, 'rb') as src, open(
          final_bzip2_file, 'ab') as dst:
        dst.writelines(src.readlines())

      pipeline = TestPipeline()
      lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
          final_bzip2_file,
          compression_type=beam.io.filesystem.CompressionTypes.BZIP2)

      expected = ['a', 'b', 'c', 'p', 'q', 'r', 'x', 'y', 'z']
      assert_that(lines, equal_to(expected))
      pipeline.run()
Example #25
    def test_read_deflate_concat(self):
        with TempDir() as tempdir:
            deflate_file_name1 = tempdir.create_temp_file()
            lines = ['a', 'b', 'c']
            with open(deflate_file_name1, 'wb') as dst:
                data = '\n'.join(lines) + '\n'
                dst.write(zlib.compress(data.encode('utf-8')))

            deflate_file_name2 = tempdir.create_temp_file()
            lines = ['p', 'q', 'r']
            with open(deflate_file_name2, 'wb') as dst:
                data = '\n'.join(lines) + '\n'
                dst.write(zlib.compress(data.encode('utf-8')))

            deflate_file_name3 = tempdir.create_temp_file()
            lines = ['x', 'y', 'z']
            with open(deflate_file_name3, 'wb') as dst:
                data = '\n'.join(lines) + '\n'
                dst.write(zlib.compress(data.encode('utf-8')))

            final_deflate_file = tempdir.create_temp_file()
            with open(deflate_file_name1, 'rb') as src, \
                    open(final_deflate_file, 'wb') as dst:
                dst.writelines(src.readlines())

            with open(deflate_file_name2, 'rb') as src, \
                    open(final_deflate_file, 'ab') as dst:
                dst.writelines(src.readlines())

            with open(deflate_file_name3, 'rb') as src, \
                    open(final_deflate_file, 'ab') as dst:
                dst.writelines(src.readlines())

            with TestPipeline() as pipeline:
                lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
                    final_deflate_file,
                    compression_type=beam.io.filesystem.CompressionTypes.
                    DEFLATE)

                expected = ['a', 'b', 'c', 'p', 'q', 'r', 'x', 'y', 'z']
                assert_that(lines, equal_to(expected))
Example #26
def test_pubsub_to_gcs():
    PubSubToGCS.run(
        input_topic="unused",  # mocked by TestStream
        output_path=f"gs://{BUCKET}/pubsub/{UUID}/output",
        window_size=1,  # 1 minute
        num_shards=1,
        pipeline_args=[
            "--project",
            PROJECT,
            "--temp_location",
            TempDir().get_path(),
        ],
    )

    # Check for output files on GCS.
    gcs_client = GcsIO()
    files = gcs_client.list_prefix(f"gs://{BUCKET}/pubsub/{UUID}")
    assert len(files) > 0

    # Clean up.
    gcs_client.delete_batch(list(files))
Example #27
  def test_read_gzip_large_after_splitting(self):
    _, lines = write_data(10000)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      source = TextSource(file_name, 0, CompressionTypes.GZIP, True,
                          coders.StrUtf8Coder())
      splits = list(source.split(desired_bundle_size=1000))

      if len(splits) > 1:
        raise ValueError('FileBasedSource generated more than one initial '
                         'split for a compressed file.')

      reference_source_info = (source, None, None)
      sources_info = ([
          (split.source, split.start_position, split.stop_position) for
          split in splits])
      source_test_utils.assert_sources_equal_reference_source(
          reference_source_info, sources_info)
Example #28
  def test_read_gzip_concat(self):
    with TempDir() as tempdir:
      gzip_file_name1 = tempdir.create_temp_file()
      lines = ['a', 'b', 'c']
      with gzip.open(gzip_file_name1, 'wb') as dst:
        data = '\n'.join(lines) + '\n'
        dst.write(data.encode('utf-8'))

      gzip_file_name2 = tempdir.create_temp_file()
      lines = ['p', 'q', 'r']
      with gzip.open(gzip_file_name2, 'wb') as dst:
        data = '\n'.join(lines) + '\n'
        dst.write(data.encode('utf-8'))

      gzip_file_name3 = tempdir.create_temp_file()
      lines = ['x', 'y', 'z']
      with gzip.open(gzip_file_name3, 'wb') as dst:
        data = '\n'.join(lines) + '\n'
        dst.write(data.encode('utf-8'))

      final_gzip_file = tempdir.create_temp_file()
      with open(gzip_file_name1, 'rb') as src, \
           open(final_gzip_file, 'wb') as dst:
        dst.writelines(src.readlines())

      with open(gzip_file_name2, 'rb') as src, \
           open(final_gzip_file, 'ab') as dst:
        dst.writelines(src.readlines())

      with open(gzip_file_name3, 'rb') as src, \
           open(final_gzip_file, 'ab') as dst:
        dst.writelines(src.readlines())

      pipeline = TestPipeline()
      lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
          final_gzip_file,
          compression_type=beam.io.filesystem.CompressionTypes.GZIP)

      expected = ['a', 'b', 'c', 'p', 'q', 'r', 'x', 'y', 'z']
      assert_that(lines, equal_to(expected))
      pipeline.run()
Example #29
 def _create_temp_file_and_read_records(self, lines):
     with TempDir() as tempdir:
         file_name = tempdir.create_temp_file(suffix='.vcf', lines=lines)
         return self._read_records(file_name)
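The companion _read_records helper presumably drains a VcfSource through Beam's source test utilities; a sketch under that assumption:

from apache_beam.io import source_test_utils

def _read_records(self, file_or_pattern):
  # Hypothetical sketch: read every record from the source in one pass.
  return source_test_utils.read_from_source(VcfSource(file_or_pattern))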
Example #30
 def test_read_reentrant_without_splitting(self):
   with TempDir() as tempdir:
     file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES +
                                            _SAMPLE_TEXT_LINES, tempdir)
     source = VcfSource(file_name)
     source_test_utils.assert_reentrant_reads_succeed((source, None, None))