  def test_with_setup_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
  def test_get_unknown_args(self):

    # Used for testing newly added flags.
    class MockOptions(PipelineOptions):

      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_argument('--mock_flag',
                            action='store_true',
                            help='Enable work item profiling')

    test_cases = [
        {'flags': ['--num_workers', '5'],
         'expected': {'num_workers': 5, 'mock_flag': False}},
        {
            'flags': [
                '--profile', '--profile_location', 'gs://bucket/', 'ignored'],
            'expected': {
                'profile': True, 'profile_location': 'gs://bucket/',
                'mock_flag': False}
        },
        {'flags': ['--num_workers', '5', '--mock_flag'],
         'expected': {'num_workers': 5, 'mock_flag': True}},
    ]

    for case in test_cases:
      options = PipelineOptions(flags=case['flags'])
      self.assertDictContainsSubset(case['expected'], options.get_all_options())
      self.assertEqual(options.view_as(MockOptions).mock_flag,
                       case['expected']['mock_flag'])
Example #4
def model_pcollection(argv):
  """Creating a PCollection from data in local memory.

  URL: https://cloud.google.com/dataflow/model/pcollection
  """
  from google.cloud.dataflow.utils.options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = df.Pipeline(options=pipeline_options)

  (p
   | df.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | df.io.Write(df.io.TextFileSink(my_options.output)))

  p.run()
Example #5
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets.

  URL:
  https://cloud.google.com/dataflow/examples/wordcount-example#MinimalWordCount
  """
  import re

  import google.cloud.dataflow as df

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'BlockingDataflowPipelineRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = df.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | df.io.Read(df.io.TextFileSource(
          'gs://dataflow-samples/shakespeare/kinglear.txt'))
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | df.FlatMap('ExtractWords', lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | df.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | df.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | df.io.Write(df.io.TextFileSink('gs://my-bucket/counts.txt'))
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  p.run()
Example #9
 def test_option_with_spcae(self):
     options = PipelineOptions(
         flags=['--option with space= value with space'])
     self.assertEqual(
         getattr(options.view_as(PipelineOptionsTest.MockOptions),
                 'option with space'), ' value with space')
     options_from_dict = PipelineOptions.from_dictionary(
         options.get_all_options())
     self.assertEqual(
         getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
                 'option with space'), ' value with space')
 def test_no_temp_location(self):
   staging_dir = tempfile.mkdtemp()
   options = PipelineOptions()
   google_cloud_options = options.view_as(GoogleCloudOptions)
   google_cloud_options.staging_location = staging_dir
   self.update_options(options)
   google_cloud_options.temp_location = None
   with self.assertRaises(RuntimeError) as cm:
     dependency.stage_job_resources(options)
   self.assertEqual('The --temp_location option must be specified.',
                    cm.exception.message)
  def test_no_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Example #15
 def test_get_all_options(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         self.assertDictContainsSubset(case['expected'],
                                       options.get_all_options())
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
  def test_default_resources(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
 def test_requirements_file_not_present(self):
   staging_dir = tempfile.mkdtemp()
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).requirements_file = 'nosuchfile'
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The file %s cannot be found. It was specified in the '
       '--requirements_file command line option.' % 'nosuchfile')
Example #20
 def test_from_dictionary(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         all_options_dict = options.get_all_options()
         options_from_dict = PipelineOptions.from_dictionary(
             all_options_dict)
         self.assertEqual(
             options_from_dict.view_as(
                 PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
  def test_with_extra_packages_missing_files(self):
    staging_dir = tempfile.mkdtemp()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
  def test_sdk_location_local_not_present(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.message)
Example #25
def run(argv=None):
  known_args, pipeline_args = get_args(argv)
  options = PipelineOptions(pipeline_args)

  run_count1(known_args, options)
  run_count2(known_args, options)
  run_count3(known_args, options)
    def _run_write_test(self,
                        data,
                        return_init_result=True,
                        return_write_results=True):
        write_to_test_sink = WriteToTestSink(return_init_result,
                                             return_write_results)
        p = Pipeline(options=PipelineOptions([]))
        result = p | df.Create('start', data) | write_to_test_sink

        assert_that(result, is_empty())
        p.run()

        sink = write_to_test_sink.last_sink
        self.assertIsNotNone(sink)

        self.assertEqual(sink.state, _TestSink.STATE_FINALIZED)
        if data:
            self.assertIsNotNone(sink.last_writer)
            self.assertEqual(sink.last_writer.state, _TestWriter.STATE_CLOSED)
            self.assertEqual(sink.last_writer.write_output, data)
            if return_init_result:
                self.assertEqual(sink.last_writer.init_result,
                                 _TestSink.TEST_INIT_RESULT)
                self.assertEqual(sink.init_result_at_finalize,
                                 _TestSink.TEST_INIT_RESULT)
            self.assertIsNotNone(sink.last_writer.uid)
            if return_write_results:
                self.assertEqual(sink.write_results_at_finalize,
                                 [_TestWriter.TEST_WRITE_RESULT])
        else:
            self.assertIsNone(sink.last_writer)
 def test_with_extra_packages_invalid_file_name(self):
   staging_dir = tempfile.mkdtemp()
   source_dir = tempfile.mkdtemp()
   self.create_temp_file(
       os.path.join(source_dir, 'abc.tgz'), 'nothing')
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).extra_packages = [
         os.path.join(source_dir, 'abc.tgz')]
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The --extra_packages option expects a full path ending with '
       '\'.tar.gz\' instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #30
 def test_table_schema_without_project(self):
     # Writer should pick executing project by default.
     sink = df.io.BigQuerySink(table='mydataset.mytable')
     options = PipelineOptions(flags=['--project', 'myproject'])
     sink.pipeline_options = options
     writer = sink.writer()
     self.assertEquals('myproject', writer.project_id)
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket'
    expected_from_path = utils.path.join(
        sdk_location,
        'google-cloud-dataflow-python-sdk-%s.tgz' % __version__)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Example #35
 def test_table_schema_without_project(self):
     # Reader should pick executing project by default.
     source = df.io.BigQuerySource(table='mydataset.mytable')
     options = PipelineOptions(flags=['--project', 'myproject'])
     source.pipeline_options = options
     reader = source.reader()
     self.assertEquals('SELECT * FROM [myproject:mydataset.mytable];',
                       reader.query)
  def test_setup_file_not_named_setup_dot_py(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = (
        os.path.join(source_dir, 'xyz-setup.py'))

    self.create_temp_file(
        os.path.join(source_dir, 'xyz-setup.py'), 'notused')
    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    self.assertTrue(
        cm.exception.message.startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))
  def test_with_requirements_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        [dependency.REQUIREMENTS_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
  def test_sdk_location_default(self):
    staging_dir = tempfile.mkdtemp()
    expected_from_url = '%s/v%s.tar.gz' % (
        dependency.PACKAGES_URL_PREFIX, __version__)
    expected_from_path = self.override_file_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            file_copy=dependency._dependency_file_copy))
Example #41
    def test_override_options(self):
        base_flags = ['--num_workers', '5']
        options = PipelineOptions(base_flags)
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], False)

        options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], True)
    def test_missing_required_options(self):
        options = PipelineOptions([''])
        runner = MockRunners.DataflowPipelineRunner()
        validator = PipelineOptionsValidator(options, runner)
        errors = validator.validate()

        self.assertEqual(
            self.check_errors_for_arguments(
                errors,
                ['project', 'job_name', 'staging_location', 'temp_location']),
            [])
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []
    def file_copy(from_path, to_path):
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    dependency._dependency_file_copy = file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                       f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
Example #44
  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """

    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r', options)
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r', argv)
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')
    # List of PValue objects representing a DAG of transformations.
    self._nodes = []
    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()
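
The constructor above accepts either a configured PipelineOptions object or an argv-style list, and resolves the runner from StandardOptions when one is not passed explicitly. A minimal usage sketch, assuming the 'DirectPipelineRunner' registered name from this SDK and illustrative flag values:

from google.cloud.dataflow import Pipeline
from google.cloud.dataflow.utils.options import PipelineOptions

# Pass a pre-built PipelineOptions object...
p = Pipeline(runner='DirectPipelineRunner',
             options=PipelineOptions(['--num_workers', '2']))

# ...or let the constructor build one from an argv-style list. The runner can
# also be supplied through a --runner flag in the options instead of the
# keyword argument.
p = Pipeline(runner='DirectPipelineRunner', argv=['--num_workers', '2'])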
Example #45
    def run_pipeline(self, count_implementation, factor=1):
        input_path = self.create_temp_file('CAT\nDOG\nCAT\nCAT\nDOG\n')
        output_path = input_path + '.result'

        known_args, pipeline_args = custom_ptransform.get_args(
            ['--input=%s*' % input_path,
             '--output=%s' % output_path])

        count_implementation(known_args, PipelineOptions(pipeline_args))
        self.assertEqual([
            """(u'CAT', %d)""" % (3 * factor),
            """(u'DOG', %d)""" % (2 * factor)
        ], self.get_output(output_path))
        def get_validator(temp_location):
            options = [
                '--project=example:example', '--job_name=job',
                '--staging_location=gs://foo/bar'
            ]

            if temp_location is not None:
                options.append('--temp_location=' + temp_location)

            pipeline_options = PipelineOptions(options)
            runner = MockRunners.DataflowPipelineRunner()
            validator = PipelineOptionsValidator(pipeline_options, runner)
            return validator
  def test_sdk_location_local(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(
            sdk_location,
            names.DATAFLOW_SDK_TARBALL_FILE),
        'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'contents')
Example #48
def get_service_credentials():
    """Get credentials to access Google services."""
    user_agent = 'dataflow-python-sdk/1.0'
    if is_running_in_gce:
        # We are currently running as a GCE taskrunner worker.
        #
        # TODO(ccy): It's not entirely clear if these credentials are thread-safe.
        # If so, we can cache these credentials to save the overhead of creating
        # them again.
        return GCEMetadataCredentials(user_agent=user_agent)
    else:
        # We are currently being run from the command line.
        google_cloud_options = PipelineOptions(
            sys.argv).view_as(GoogleCloudOptions)
        if google_cloud_options.service_account_name:
            if not google_cloud_options.service_account_key_file:
                raise AuthenticationException(
                    'key file not provided for service account.')
            if not os.path.exists(
                    google_cloud_options.service_account_key_file):
                raise AuthenticationException(
                    'Specified service account key file does not exist.')
            client_scopes = [
                'https://www.googleapis.com/auth/bigquery',
                'https://www.googleapis.com/auth/cloud-platform',
                'https://www.googleapis.com/auth/devstorage.full_control',
                'https://www.googleapis.com/auth/userinfo.email',
                'https://www.googleapis.com/auth/datastore'
            ]

            # The following code uses oauth2client >=2.0.0 functionality and if this
            # is not available due to import errors will use 1.5.2 functionality.
            try:
                from oauth2client.service_account import ServiceAccountCredentials
                return ServiceAccountCredentials.from_p12_keyfile(
                    google_cloud_options.service_account_name,
                    google_cloud_options.service_account_key_file,
                    client_scopes,
                    user_agent=user_agent)
            except ImportError:
                with file(google_cloud_options.service_account_key_file) as f:
                    service_account_key = f.read()
                from oauth2client.client import SignedJwtAssertionCredentials
                return SignedJwtAssertionCredentials(
                    google_cloud_options.service_account_name,
                    service_account_key,
                    client_scopes,
                    user_agent=user_agent)

        else:
            return _GCloudWrapperCredentials(user_agent)
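
A hedged usage sketch for the function above: every credentials object it can return follows the oauth2client interface used elsewhere in this SDK, so a caller would typically use it to authorize an httplib2 client (the variable names are illustrative):

import httplib2

# Pick credentials for the current environment (GCE metadata server, a service
# account key file, or the local gcloud configuration).
credentials = get_service_credentials()
# oauth2client-style credentials wrap an httplib2.Http instance.
authorized_http = credentials.authorize(httplib2.Http())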
Example #49
def pipeline_options_local(argv):
  """"Creating a Pipeline using a PipelineOptions object for local execution.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
  """

  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the dataflow pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)

  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))
  p.run()
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz')]

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n'], f.readlines())
Example #52
def model_pipelines(argv):
  """A wordcount snippet as a simple pipeline example.

  URL: https://cloud.google.com/dataflow/model/pipelines
  """
  # [START model_pipelines]
  import re

  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          dest='input',
                          default='gs://dataflow-samples/shakespeare/kinglear'
                          '.txt',
                          help='Input file to process.')
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  p = df.Pipeline(options=pipeline_options)

  (p
   | df.io.Read(df.io.TextFileSource(my_options.input))
   | df.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
   | df.Map(lambda x: (x, 1)) | df.combiners.Count.PerKey()
   | df.io.Write(df.io.TextFileSink(my_options.output)))

  p.run()
Example #54
 def __ror__(self, left):
     """Used to apply this PTransform to non-PValues, e.g., a tuple."""
     pvalueish, pvalues = self._extract_input_pvalues(left)
     pipelines = [
         v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)
     ]
     if pvalues and not pipelines:
         deferred = False
         # pylint: disable=g-import-not-at-top
         from google.cloud.dataflow import pipeline
         from google.cloud.dataflow.utils.options import PipelineOptions
         # pylint: enable=g-import-not-at-top
         p = pipeline.Pipeline('DirectPipelineRunner',
                               PipelineOptions(sys.argv))
     else:
         if not pipelines:
             if self.pipeline is not None:
                 p = self.pipeline
             else:
                 raise ValueError(
                     '"%s" requires a pipeline to be specified '
                     'as there are no deferred inputs.' % self.label)
         else:
             p = self.pipeline or pipelines[0]
             for pp in pipelines:
                 if p != pp:
                     raise ValueError(
                         'Mixing value from different pipelines not allowed.'
                     )
         deferred = not getattr(p.runner, 'is_eager', False)
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.transforms.core import Create
     # pylint: enable=g-import-not-at-top
     replacements = {
         id(v): p | Create('CreatePInput%s' % ix, v)
         for ix, v in enumerate(pvalues)
         if not isinstance(v, pvalue.PValue) and v is not None
     }
     pvalueish = _SetInputPValues().visit(pvalueish, replacements)
     self.pipeline = p
     result = p.apply(self, pvalueish)
     if deferred:
         return result
     else:
         # Get a reference to the runners internal cache, otherwise runner may
         # clean it after run.
         cache = p.runner.cache
         p.run()
         return _MaterializePValues(cache).visit(result)
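
For reference, the eager branch above (a non-PValue input with no deferred inputs) behaves roughly like building the equivalent pipeline by hand. A sketch under that assumption, with illustrative label and values:

import sys

import google.cloud.dataflow as df
from google.cloud.dataflow.utils.options import PipelineOptions

# __ror__ wraps the raw value in a Create on an eager direct pipeline and then
# applies the transform; the real method additionally materializes the result
# from the runner's cache after running.
p = df.Pipeline('DirectPipelineRunner', PipelineOptions(sys.argv))
pcoll = p | df.Create('CreatePInput0', [1, 2, 3])
doubled = pcoll | df.Map('Double', lambda x: x * 2)
p.run()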
    def test_remote_runner_translation(self):
        remote_runner = DataflowPipelineRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions([
                         '--dataflow_endpoint=ignored', '--job_name=test-job',
                         '--project=test-project',
                         '--staging_location=ignored',
                         '--temp_location=/dev/null', '--no_auth=True'
                     ]))

        res = (p | ptransform.Create('create', [1, 2, 3])
               | ptransform.FlatMap('do', lambda x: [(x, x)])
               | ptransform.GroupByKey('gbk'))
        remote_runner.job = apiclient.Job(p.options)
        super(DataflowPipelineRunner, remote_runner).run(p)
  def test_with_requirements_file_and_cache(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    options.view_as(SetupOptions).requirements_cache = os.path.join(
        tempfile.gettempdir(), 'alternative-cache-dir')
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE,
                'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
Example #57
    def test_deferred_side_input_iterable(self):
        @typehints.with_input_types(str, typehints.Iterable[str])
        def concat(glue, items):
            return glue.join(sorted(items))

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', ['x', 'y', 'z'])
        result = main_input | df.Map(concat, pvalue.AsIter(side_input))
        assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
        p.run()

        bad_side_input = p | df.Create('bad_side', [1, 2, 3])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
Example #58
def pipeline_options_remote(argv):
  """"Creating a Pipeline using a PipelineOptions object for remote execution.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
  """

  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowPipelineRunner or
  # BlockingDataflowPipelineRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # Overriding the runner for tests.
  options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
  p = Pipeline(options=options)

  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))

  p.run()
Example #59
  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """

    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r', options)
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r', argv)
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    if self.options is not None:
      errors = PipelineOptionsValidator(self.options, runner).validate()
      if errors:
        raise ValueError(
            'Pipeline has validation errors: \n' + '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()
    # Store cache of views created from PCollections.  For reference, see
    # pvalue._cache_view().
    self._view_cache = {}
Example #60
def pipeline_monitoring(renames):
  """Using monitoring interface snippets.

  URL: https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf
  """

  import re
  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='output for the dataflow pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(df.DoFn):

    def process(self, context):
      words = re.findall(r'[A-Za-z\']+', context.element)
      for word in words:
        yield word

  class FormatCountsFn(df.DoFn):

    def process(self, context):
      word, count = context.element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(df.PTransform):

    def apply(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | df.ParDo('ExtractWords', ExtractWordsFn())
              # Count the number of times each word occurs.
              | df.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | df.ParDo('FormatCounts', FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = df.Pipeline(options=pipeline_options)

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | df.io.Read('ReadLines', df.io.TextFileSource(options.input))
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | df.io.Write('WriteCounts', df.io.TextFileSink(options.output)))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()