Example no. 1
 def test_create_do_with_side_in_memory_write(self):
   elements = ['abc', 'def', 'ghi']
   side_elements = ['x', 'y', 'z']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=3),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, side)]),
               tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                             (False, None))),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(
                   inmemory.InMemorySource(
                       elements=[pickler.dumps(e) for e in side_elements],
                       start_index=None,
                       end_index=None),
                   tag='inmemory')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(
           output_buffer=output_buffer,
           input=(1, 0),
           output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a singleton, so we should see
    # only the first element appended.
   self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
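For contrast with the iterable case in Example no. 3 below, here is a minimal
sketch (plain Python, names hypothetical) of what the DoFn receives under each
side-input view type:

  # Sketch only: SingletonPCollectionView hands the DoFn one value,
  # IterablePCollectionView hands it the whole collection.
  side_elements = ['x', 'y', 'z']

  def fn_singleton(x, side):
    return ['%s:%s' % (x, side)]        # side is a single element, e.g. 'x'

  def fn_iterable(x, side):
    return ['%s:%s' % (x, s) for s in side]

  assert fn_singleton('abc', side_elements[0]) == ['abc:x']
  assert fn_iterable('abc', side_elements) == ['abc:x', 'abc:y', 'abc:z']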
Example no. 2
 def _parse_inmemory_source(specs, codec_specs, unused_context):
   if specs['@type'] == 'InMemorySource':
     # We do not wrap values sent to the service in a Create transform and
     # received here in a WindowedValue wrapper, but the service needs to be
     # sent the wrapped encoding so subsequent GroupByKey operations work
     # correctly.
     #
     # Note: The service may create a dummy empty InMemorySource that is a
     # windowed value when processing a BigQuerySource.  In that case, we do
     # not unwrap this coder.
     # TODO(ccy): investigate if we can make these semantics cleaner.
     coder = get_coder_from_spec(codec_specs)
     if isinstance(coder, coders.WindowedValueCoder):
       coder = coder.wrapped_value_coder
      # Handle the case where 'elements' for an InMemorySource is an empty
      # list.
     if specs['elements']:
       # start_index/end_index could be missing if default behavior should be
       # used. For instance a list with one element will have start_index=0 and
       # end_index=1 by default.
        start_index = (
            None if 'start_index' not in specs
            else int(specs['start_index']['value']))
        end_index = (
            None if 'end_index' not in specs
            else int(specs['end_index']['value']))
       return inmemory.InMemorySource(
           elements=[base64.b64decode(v['value']) for v in specs['elements']],
           coder=coder,
           start_index=start_index, end_index=end_index)
     else:
       return inmemory.InMemorySource(elements=[], coder=coder)
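For reference, a hypothetical spec in the shape this parser consumes; every key
below is one the code above actually reads, and all values are illustrative:

  import base64

  # 'start_index' and 'end_index' are optional; when either is absent the
  # source falls back to its defaults (read everything).
  specs = {
      '@type': 'InMemorySource',
      'elements': [{'value': base64.b64encode('pickled-element')}],
      'start_index': {'value': '0'},
      'end_index': {'value': '1'},
  }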
Example no. 3
 def test_create_do_with_side_text_file_write(self):
   input_path = self.create_temp_file('x\ny\n')
   elements = ['aa', 'bb']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(fileio.TextFileSource(
                   file_path=input_path, start_offset=None, end_offset=None,
                   strip_trailing_newlines=True,
                   coder=coders.StrUtf8Coder()),
                                             tag='textfile')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a collection, so we should see
    # all elements of the side source.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
Example no. 4
 def test_create_do_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               # Start at the last element.
               start_index=2,
               # Go beyond the end to test that case is handled.
               end_index=15),
           output_coders=[coders.ToStringCoder()]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi\n', f.read())
Example no. 5
    def test_create_do_avro_write(self):
        output_path = self.create_temp_file('n/a')
        elements = ['abc', 'def', 'ghi']
        work_item = workitem.BatchWorkItem(None)

        work_item.map_task = make_map_task([
            maptask.WorkerRead(
                inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=2,  # Start at the last element.
                    end_index=3),
                output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=None,
                               output_coders=[self.OUTPUT_CODER]),
            make_text_sink(output_path,
                           input=(1, 0),
                           coder=coders.Base64PickleCoder())
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
Example no. 6
 def test_in_memory_source_updates_progress_many(self):
   source = inmemory.InMemorySource([1, 2, 3, 4, 5], coder=FakeCoder())
   with source.reader() as reader:
     self.assertEqual(0, reader.get_progress().percent_complete)
     i = 0
     for item in reader:
       i += 1
       self.assertEqual(i + 10, item)
       self.assertEqual(float(i) / 5, reader.get_progress().percent_complete)
     self.assertEqual(5, i)
     self.assertEqual(1, reader.get_progress().percent_complete)
Example no. 7
 def test_in_memory_source_updates_progress_one(self):
   source = inmemory.InMemorySource([1], coder=FakeCoder())
   with source.reader() as reader:
     self.assertEqual(0, reader.get_progress().percent_complete)
     i = 0
     for item in reader:
       i += 1
       self.assertEqual(11, item)
       self.assertEqual(1, reader.get_progress().percent_complete)
     self.assertEqual(1, i)
     self.assertEqual(1, reader.get_progress().percent_complete)
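The assertions above pin down what FakeCoder must do: decoding shifts each
value by 10, so a source holding [1, 2, 3, 4, 5] reads back as [11, ..., 15].
A minimal sketch consistent with these tests (the real test fixture may
differ):

  class FakeCoder(object):

    def encode(self, value):
      return value  # elements are stored as-is in these tests

    def decode(self, value):
      return value + 10  # lets tests verify values passed through the coder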
Example no. 8
 def test_in_memory_source_updates_progress_many(self):
   source = inmemory.InMemorySource([1, 2, 3, 4, 5], coder=FakeCoder())
   with source.reader() as reader:
     self.assertEqual(None, reader.get_progress())
     i = 0
     for item in reader:
       self.assertEqual(i, reader.get_progress().position.record_index)
       self.assertEqual(11 + i, item)
       i += 1
     self.assertEqual(5, i)
     self.assertEqual(4, reader.get_progress().position.record_index)
Example no. 9
 def test_in_memory_source_to_flatten(self):
     work = workitem.get_work_items(
         get_in_memory_source_to_flatten_message())
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerRead(inmemory.InMemorySource(
             start_index=1,
             end_index=3,
             elements=[
                 base64.b64decode(v['value']) for v in IN_MEMORY_ELEMENTS
             ],
             coder=CODER),
                            output_coders=[CODER]),
         maptask.WorkerFlatten(inputs=[(0, 0)], output_coders=[CODER])
     ]))
Example no. 10
    def test_create_do_with_collection_side_bigquery_write(self):
        elements = ['aa', 'bb']
        side_elements = ['x', 'y']
        output_buffer = []
        patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
        with mock.patch(target=patch_target) as mock_class:
            # Set up the reader so it will yield the values in 'side_elements'.
            reader_mock = mock_class.return_value
            reader_mock.__enter__.return_value = reader_mock
            # Use a lambda so that multiple readers can be created, each reading the
            # entirety of the side elements.
            reader_mock.__iter__.side_effect = (
                lambda: (x for x in side_elements))

            executor.MapTaskExecutor().execute(
                make_map_task([
                    maptask.WorkerRead(inmemory.InMemorySource(
                        elements=[pickler.dumps(e) for e in elements],
                        start_index=0,
                        end_index=3),
                                       output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                        ptransform.CallableWrapperDoFn(
                            lambda x, side: ['%s:%s' % (x, s) for s in side]),
                        tag_and_type=('bigquery',
                                      pvalue.IterablePCollectionView, ())),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=[
                                   maptask.WorkerSideInputSource(
                                       bigquery.BigQuerySource(
                                           project='project',
                                           dataset='dataset',
                                           table='table',
                                           coder=get_bigquery_source_coder()),
                                       tag='bigquery')
                               ],
                               output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerInMemoryWrite(
                        output_buffer=output_buffer,
                        input=(1, 0),
                        output_coders=(self.OUTPUT_CODER, ))
                ]))
        # The side source was specified as a collection, so we should see
        # all elements of the side source.
        self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'],
                         sorted(output_buffer))
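The lambda assigned to __iter__.side_effect above matters: a single generator
assigned to return_value (as in the singleton case of Example no. 15) is
consumed after one pass, while side_effect builds a fresh generator for every
reader. A standalone mock sketch:

  import mock

  m = mock.MagicMock()
  m.__iter__.return_value = (x for x in [1, 2])
  assert list(m) == [1, 2]
  assert list(m) == []  # the single generator is now exhausted

  m2 = mock.MagicMock()
  m2.__iter__.side_effect = lambda: (x for x in [1, 2])
  assert list(m2) == [1, 2]
  assert list(m2) == [1, 2]  # a fresh generator per iteration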
Example no. 11
 def test_create_do_with_side_avro_file_write(self):
     input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
     input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
     elements = ['aa', 'bb']
     output_buffer = []
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(inmemory.InMemorySource(
                 elements=[pickler.dumps(e) for e in elements],
                 start_index=0,
                 end_index=2),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(
                 serialized_fn=pickle_with_side_inputs(
                     ptransform.CallableWrapperDoFn(
                         lambda x, side: ['%s:%s' % (x, s) for s in side]),
                     tag_and_type=('sometag',
                                   pvalue.IterablePCollectionView, ())),
                 output_tags=['out'],
                 input=(0, 0),
                 # Note that the two side inputs have the same tag. This is quite
                 # common for intermediary PCollections used as side inputs that
                 # are saved as AVRO files. The files will contain the sharded
                 # PCollection.
                 side_inputs=[
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path1,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag'),
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path2,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag')
                 ],
                 output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerInMemoryWrite(
                 output_buffer=output_buffer,
                 input=(1, 0),
                 output_coders=(self.OUTPUT_CODER, ))
         ]))
      # The side source was specified as a collection, so we should see
      # all elements from both shards of the side source.
     self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                      sorted(output_buffer))
Example no. 12
 def test_pgbk(self):
   elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0,
                end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerPartialGroupByKey(
           combine_fn=None,
           input=(0, 0),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))
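Conceptually, the partial group-by-key step buffers values under their key on
the worker before any shuffle; a pure-Python sketch of the grouping the
assertion checks:

  def partial_group_by_key(pairs):
    # Group values under their key, preserving value order per key.
    grouped = {}
    for key, value in pairs:
      grouped.setdefault(key, []).append(value)
    return grouped.items()

  result = partial_group_by_key([('a', 1), ('b', 2), ('a', 3), ('a', 4)])
  assert sorted(result) == [('a', [1, 3, 4]), ('b', [2])]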
Example no. 13
 def test_in_memory_source_to_text_sink(self):
     work = workitem.get_work_items(
         get_in_memory_source_to_text_sink_message())
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerRead(inmemory.InMemorySource(
             start_index=1,
             end_index=3,
             elements=[
                 base64.b64decode(v['value']) for v in IN_MEMORY_ELEMENTS
             ],
             coder=CODER),
                            output_coders=[CODER]),
         maptask.WorkerWrite(fileio.NativeTextFileSink(
             file_path_prefix='gs://somefile',
             append_trailing_newlines=True,
             coder=CODER),
                             input=(0, 0),
                             output_coders=(CODER, ))
     ]))
Example no. 14
 def test_combine(self):
   elements = [('a', [1, 2, 3]), ('b', [10])]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerCombineFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CombineFn.from_callable(sum)),
                               phase='all',
                               input=(0, 0),
                               output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', 6), ('b', 10)], output_buffer)
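With phase='all' the combine runs once over each key's complete value list;
conceptually the step above computes:

  elements = [('a', [1, 2, 3]), ('b', [10])]
  assert [(k, sum(vs)) for k, vs in elements] == [('a', 6), ('b', 10)]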
Example no. 15
  def test_create_do_with_singleton_side_bigquery_write(self):
    elements = ['abc', 'def', 'ghi']
    side_elements = ['x', 'y', 'z']
    output_buffer = []
    patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
    with mock.patch(target=patch_target) as mock_class:
      # Set up the reader so it will yield the values in 'side_elements'.
      reader_mock = mock_class.return_value
      reader_mock.__enter__.return_value = reader_mock
      reader_mock.__iter__.return_value = (x for x in side_elements)

      pickled_elements = [pickler.dumps(e) for e in elements]
      executor.MapTaskExecutor().execute(make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(elements=pickled_elements,
                                      start_index=0,
                                      end_index=3),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, side)]),
                  tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                                (False, None))),
              output_tags=['out'], input=(0, 0),
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      bigquery.BigQuerySource(
                          project='project',
                          dataset='dataset',
                          table='table',
                          coder=get_bigquery_source_coder()),
                      tag='bigquery')],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(
              output_buffer=output_buffer,
              input=(1, 0),
              output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a singleton, so we should see
    # only the first element appended.
    self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
Example no. 16
  def test_in_memory_source_dynamic_split(self):
    source = inmemory.InMemorySource([10, 20, 30, 40, 50, 60],
                                     coder=FakeCoder())

    # Unstarted reader
    with source.reader() as reader:
      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=2))),
          None)

    # Proposed split position out of range
    with source.reader() as reader:
      reader_iter = iter(reader)
      next(reader_iter)
      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=-1))),
          None)
      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=10))),
          None)

    # Already read past proposed split position
    with source.reader() as reader:
      reader_iter = iter(reader)
      next(reader_iter)
      next(reader_iter)
      next(reader_iter)
      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=1))),
          None)

      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=2))),
          None)

    # Successful split
    with source.reader() as reader:
      reader_iter = iter(reader)
      next(reader_iter)
      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=4))),
          iobase.DynamicSplitResultWithPosition(
              stop_position=iobase.ReaderPosition(record_index=4)))

      self.try_splitting_reader_at(
          reader,
          iobase.DynamicSplitRequest(
              iobase.ReaderProgress(
                  position=iobase.ReaderPosition(record_index=2))),
          iobase.DynamicSplitResultWithPosition(
              stop_position=iobase.ReaderPosition(record_index=2)))
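The try_splitting_reader_at helper is not shown in this example; a plausible
shape for it (hypothetical, the real test harness may differ) is to issue the
request and compare the reader's response, where None means the reader rejected
the proposed split point:

  def try_splitting_reader_at(self, reader, split_request, expected_response):
    actual_response = reader.request_dynamic_split(split_request)
    self.assertEqual(expected_response, actual_response)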
Example no. 17
 def test_inmemory(self):
   source = inmemory.InMemorySource([1, 2, 3, 4, 5], FakeCoder(), 1, 3)
   with source.reader() as reader:
     self.assertItemsEqual([12, 13], [i for i in reader])
Example no. 18
 def test_in_memory_source_updates_progress_none(self):
   source = inmemory.InMemorySource([], coder=FakeCoder())
   with source.reader() as reader:
     self.assertEqual(None, reader.get_progress())
Example no. 19
 def test_norange(self):
   source = inmemory.InMemorySource([1, 2, 3, 4, 5], coder=FakeCoder())
   with source.reader() as reader:
     self.assertItemsEqual([11, 12, 13, 14, 15], [i for i in reader])