Example #1
0
    def test_write_to_dynamic_destination(self):
        """Round-trip 0..99 through WriteToFiles, split by parity.

        Runs once with the sink passed as the ``fileio.TextSink`` class and
        once as a ``fileio.TextSink()`` instance, then reads the written
        files back and checks the numbers found under each prefix.
        """

        def parity(value):
            # Destination key; elements are strings by the time they get here.
            return "odd" if int(value) % 2 else "even"

        def summarize(readable_file):
            # Pair the text before the first '-' of the file name with the
            # sorted integers stored in that file.
            base_name = os.path.basename(readable_file.metadata.path)
            lines = readable_file.read_utf8().strip().split('\n')
            return base_name.split('-')[0], sorted(map(int, lines))

        # First the sink type itself, then an already-constructed sink object.
        for sink in (fileio.TextSink, fileio.TextSink()):
            out_dir = self._new_tempdir()

            with TestPipeline() as write_pipeline:
                numbers = write_pipeline | "Create" >> beam.Create(range(100))
                as_text = numbers | beam.Map(lambda number: str(number))
                _ = as_text | fileio.WriteToFiles(
                    path=out_dir,
                    destination=parity,
                    sink=sink,
                    file_naming=fileio.destination_prefix_naming("test"))

            with TestPipeline() as read_pipeline:
                matched = read_pipeline | fileio.MatchFiles(
                    FileSystems.join(out_dir, '*'))
                result = matched | fileio.ReadMatches() | beam.Map(summarize)

                odds = list(range(1, 100, 2))
                evens = list(range(0, 100, 2))
                assert_that(
                    result, equal_to([('odd', odds), ('even', evens)]))
Example #2
0
  def test_write_to_different_file_types_some_spilling(self):
    """Write records as CSV or JSON depending on destination, then verify.

    Records are routed by their 'foundation' value: the 'apache' destination
    is written with a CsvSink and every other destination with a JsonSink.
    The write is configured with ``max_writers_per_bundle=1``. Both outputs
    are then read back and compared against SIMPLE_COLLECTION.
    """

    def choose_sink(dest):
      # CSV for the 'apache' destination, JSON for anything else.
      if dest == 'apache':
        return WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
      return WriteFilesTest.JsonSink()

    out_dir = self._new_tempdir()

    with TestPipeline() as write_pipeline:
      records = write_pipeline | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
      _ = records | beam.io.fileio.WriteToFiles(
          path=out_dir,
          destination=lambda record: record['foundation'],
          sink=choose_sink,
          file_naming=fileio.destination_prefix_naming(),
          max_writers_per_bundle=1)

    with TestPipeline() as read_pipeline:
      # JSON output: one decoded object per line of each 'cncf*' file.
      cncf_res = (
          read_pipeline
          | fileio.MatchFiles(FileSystems.join(out_dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      # CSV output: one list of fields per row of each 'apache*' file.
      apache_res = (
          read_pipeline
          | "MatchApache" >> fileio.MatchFiles(
              FileSystems.join(out_dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >> beam.FlatMap(
              lambda rf: csv.reader(_get_file_reader(rf))))

      expected_cncf = [
          row for row in self.SIMPLE_COLLECTION if row['foundation'] == 'cncf'
      ]
      expected_apache = [
          [row['project'], row['foundation']]
          for row in self.SIMPLE_COLLECTION
          if row['foundation'] == 'apache'
      ]

      assert_that(cncf_res, equal_to(expected_cncf), label='verifyCNCF')
      assert_that(apache_res, equal_to(expected_apache), label='verifyApache')
Example #3
0
 def no_colon_file_naming(*args):
     """Return the default destination-prefix file name with every ':'
     replaced by '_'."""
     return fileio.destination_prefix_naming()(*args).replace(':', '_')
Example #4
0
    def test_streaming_different_file_types(self):
        """Stream records through a windowed WriteToFiles and check file names.

        Eight records from SIMPLE_COLLECTION are fed two at a time at
        watermarks 0, 10, 20 and 30, windowed with ``FixedWindows(10)`` and
        written with a CsvSink for the 'apache' destination and a JsonSink
        otherwise. The paths of the produced files are then matched against
        one regex per window for each of the 'cncf' and 'apache' prefixes.
        """
        out_dir = self._new_tempdir()
        records = iter(WriteFilesTest.SIMPLE_COLLECTION)

        # Two records per step, each step preceded by a watermark advance.
        ts = TestStream()
        for watermark in (0, 10, 20, 30):
            ts = ts.advance_watermark_to(watermark).add_elements(
                [next(records), next(records)])
        ts = ts.advance_watermark_to(40)

        def choose_sink(dest):
            # CSV for the 'apache' destination, JSON for anything else.
            if dest == 'apache':
                return WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
            return WriteFilesTest.JsonSink()

        with TestPipeline() as write_pipeline:
            windowed = write_pipeline | ts | beam.WindowInto(FixedWindows(10))
            _ = windowed | beam.io.fileio.WriteToFiles(
                path=out_dir,
                destination=lambda record: record['foundation'],
                sink=choose_sink,
                file_naming=fileio.destination_prefix_naming(),
                max_writers_per_bundle=0,
            )

        # Start/end stamps of the four windows as they appear in file names.
        window_spans = [
            '1970-01-01T00:00:00-1970-01-01T00:00:10',
            '1970-01-01T00:00:10-1970-01-01T00:00:20',
            '1970-01-01T00:00:20-1970-01-01T00:00:30',
            '1970-01-01T00:00:30-1970-01-01T00:00:40',
        ]

        def expected_names(prefix):
            # One path regex per window for the given destination prefix.
            return matches_all([
                stringmatches.matches_regexp(
                    FileSystems.join(
                        out_dir, '{}-{}--.*'.format(prefix, span)))
                for span in window_spans
            ])

        with TestPipeline() as read_pipeline:
            cncf_files = (
                read_pipeline
                | fileio.MatchFiles(FileSystems.join(out_dir, 'cncf*'))
                | "CncfFileNames" >> beam.Map(lambda fm: fm.path))

            apache_files = (
                read_pipeline
                | "MatchApache" >> fileio.MatchFiles(
                    FileSystems.join(out_dir, 'apache*'))
                | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

            assert_that(
                cncf_files, expected_names('cncf'), label='verifyCNCFFiles')

            assert_that(
                apache_files,
                expected_names('apache'),
                label='verifyApacheFiles')
def hash_naming(*args):
    """Name the output file ``<destination>.json``.

    The default destination-prefix name looks like
    ``-1885604661473532601----00000-00001``; only the part before the first
    ``----`` (here ``-1885604661473532601``) is kept, giving
    ``-1885604661473532601.json``.
    """
    default_name = fileio.destination_prefix_naming()(*args)
    destination, _, _ = default_name.partition('----')
    return '{}.json'.format(destination)