def test_with_setup_file(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = os.path.join(
      source_dir, 'setup.py')

  self.assertEqual(
      [dependency.WORKFLOW_TARBALL_FILE, names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(
          options,
          # We replace the build setup command because a realistic one would
          # require the setuptools package to be installed. Note that we can't
          # use "touch" here to create the expected output tarball file, since
          # touch is not available on Windows, so we invoke python to produce
          # equivalent behavior.
          build_setup_args=[
              'python', '-c', 'open(__import__("sys").argv[1], "a")',
              os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
          temp_dir=source_dir))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
def test_get_unknown_args(self):

  # Used for testing newly added flags.
  class MockOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--mock_flag',
                          action='store_true',
                          help='Enable work item profiling')

  test_cases = [
      {'flags': ['--num_workers', '5'],
       'expected': {'num_workers': 5, 'mock_flag': False}},
      {'flags': ['--profile', '--profile_location', 'gs://bucket/',
                 'ignored'],
       'expected': {'profile': True, 'profile_location': 'gs://bucket/',
                    'mock_flag': False}},
      {'flags': ['--num_workers', '5', '--mock_flag'],
       'expected': {'num_workers': 5, 'mock_flag': True}},
  ]

  for case in test_cases:
    options = PipelineOptions(flags=case['flags'])
    self.assertDictContainsSubset(case['expected'], options.get_all_options())
    self.assertEqual(options.view_as(MockOptions).mock_flag,
                     case['expected']['mock_flag'])
def model_pcollection(argv):
  """Creating a PCollection from data in local memory.

  URL: https://cloud.google.com/dataflow/model/pcollection
  """
  from google.cloud.dataflow.utils.options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = df.Pipeline(options=pipeline_options)

  (p
   | df.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | df.io.Write(df.io.TextFileSink(my_options.output)))

  p.run()
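For reference, a snippet function like the one above is driven entirely by its argv list; a minimal, assumed invocation (the output path is a placeholder, not taken from the surrounding code) might look like:

# Hypothetical driver call for the snippet above; the bucket path is a
# placeholder used only for illustration.
model_pcollection(['--output=gs://my-bucket/pcollection-output'])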
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets.

  URL:
  https://cloud.google.com/dataflow/examples/wordcount-example#MinimalWordCount
  """
  import re

  import google.cloud.dataflow as df

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'BlockingDataflowPipelineRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = df.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | df.io.Read(df.io.TextFileSource(
          'gs://dataflow-samples/shakespeare/kinglear.txt'))
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | df.FlatMap('ExtractWords', lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | df.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | df.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | df.io.Write(df.io.TextFileSink('gs://my-bucket/counts.txt'))
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  p.run()
def test_no_main_session(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = False
  self.update_options(options)

  self.assertEqual([], dependency.stage_job_resources(options))
def test_get_all_options(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    self.assertDictContainsSubset(case['expected'], options.get_all_options())
    self.assertEqual(
        options.view_as(PipelineOptionsTest.MockOptions).mock_flag,
        case['expected']['mock_flag'])
    self.assertEqual(
        options.view_as(PipelineOptionsTest.MockOptions).mock_option,
        case['expected']['mock_option'])
def test_option_with_space(self):
  options = PipelineOptions(flags=['--option with space= value with space'])
  self.assertEqual(
      getattr(options.view_as(PipelineOptionsTest.MockOptions),
              'option with space'),
      ' value with space')
  options_from_dict = PipelineOptions.from_dictionary(
      options.get_all_options())
  self.assertEqual(
      getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
              'option with space'),
      ' value with space')
def test_default_resources(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
def test_no_temp_location(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.staging_location = staging_dir
  self.update_options(options)
  google_cloud_options.temp_location = None
  with self.assertRaises(RuntimeError) as cm:
    dependency.stage_job_resources(options)
  self.assertEqual('The --temp_location option must be specified.',
                   cm.exception.message)
def test_from_dictionary(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    all_options_dict = options.get_all_options()
    options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
    self.assertEqual(
        options_from_dict.view_as(PipelineOptionsTest.MockOptions).mock_flag,
        case['expected']['mock_flag'])
    self.assertEqual(
        options.view_as(PipelineOptionsTest.MockOptions).mock_option,
        case['expected']['mock_option'])
def test_requirements_file_not_present(self):
  staging_dir = tempfile.mkdtemp()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--requirements_file command line option.' % 'nosuchfile')
def test_sdk_location_gcs(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
  self.override_file_copy(sdk_location, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
def test_sdk_location_local_not_present(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'nosuchdir'
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    dependency.stage_job_resources(options)
  self.assertEqual(
      'The file "%s" cannot be found. Its '
      'location was specified by the --sdk_location command-line option.' %
      sdk_location,
      cm.exception.message)
def test_with_extra_packages_missing_files(self):
  staging_dir = tempfile.mkdtemp()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def run(argv=None):
  known_args, pipeline_args = get_args(argv)
  options = PipelineOptions(pipeline_args)

  run_count1(known_args, options)
  run_count2(known_args, options)
  run_count3(known_args, options)
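A minimal entry-point sketch for a module built around a run() function like this one; the logging setup is a common convention and an assumption here, not taken from the surrounding code:

# Assumed executable entry point; logging configuration is conventional.
if __name__ == '__main__':
  import logging
  logging.getLogger().setLevel(logging.INFO)
  run()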
def test_with_extra_packages_invalid_file_name(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing')
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tgz')]

    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The --extra_packages option expects a full path ending with '
      '\'.tar.gz\' instead of %s' % os.path.join(source_dir, 'abc.tgz'))
def _run_write_test(self,
                    data,
                    return_init_result=True,
                    return_write_results=True):
  write_to_test_sink = WriteToTestSink(return_init_result,
                                       return_write_results)
  p = Pipeline(options=PipelineOptions([]))
  result = p | df.Create('start', data) | write_to_test_sink

  assert_that(result, is_empty())
  p.run()

  sink = write_to_test_sink.last_sink
  self.assertIsNotNone(sink)
  self.assertEqual(sink.state, _TestSink.STATE_FINALIZED)

  if data:
    self.assertIsNotNone(sink.last_writer)
    self.assertEqual(sink.last_writer.state, _TestWriter.STATE_CLOSED)
    self.assertEqual(sink.last_writer.write_output, data)

    if return_init_result:
      self.assertEqual(sink.last_writer.init_result,
                       _TestSink.TEST_INIT_RESULT)
      self.assertEqual(sink.init_result_at_finalize,
                       _TestSink.TEST_INIT_RESULT)

    self.assertIsNotNone(sink.last_writer.uid)

    if return_write_results:
      self.assertEqual(sink.write_results_at_finalize,
                       [_TestWriter.TEST_WRITE_RESULT])
  else:
    self.assertIsNone(sink.last_writer)
def test_table_schema_without_project(self):
  # Writer should pick executing project by default.
  sink = df.io.BigQuerySink(table='mydataset.mytable')
  options = PipelineOptions(flags=['--project', 'myproject'])
  sink.pipeline_options = options
  writer = sink.writer()
  self.assertEquals('myproject', writer.project_id)
def test_with_requirements_file(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).requirements_file = os.path.join(
      source_dir, dependency.REQUIREMENTS_FILE)
  self.create_temp_file(
      os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')

  self.assertEqual(
      [dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
def test_sdk_location_gcs(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'gs://my-gcs-bucket'
  expected_from_path = utils.path.join(
      sdk_location,
      'google-cloud-dataflow-python-sdk-%s.tgz' % __version__)
  self.override_file_copy(expected_from_path, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
def test_with_extra_packages(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
  self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).extra_packages = [
      os.path.join(source_dir, 'abc.tar.gz'),
      os.path.join(source_dir, 'xyz.tar.gz'),
      'gs://my-gcs-bucket/gcs.tar.gz']

  gcs_copied_files = []

  def file_copy(from_path, to_path):
    if from_path.startswith('gs://'):
      gcs_copied_files.append(from_path)
      _, from_name = os.path.split(from_path)
      self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
      logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
    elif to_path.startswith('gs://'):
      logging.info('Faking file_copy(%s, %s)', from_path, to_path)
    else:
      shutil.copyfile(from_path, to_path)

  dependency._dependency_file_copy = file_copy

  self.assertEqual(
      ['abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
       dependency.EXTRA_PACKAGES_FILE, names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
    self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                     f.readlines())
  self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
def test_sdk_location_default(self):
  staging_dir = tempfile.mkdtemp()
  expected_from_url = '%s/v%s.tar.gz' % (
      dependency.PACKAGES_URL_PREFIX, __version__)
  expected_from_path = self.override_file_download(
      expected_from_url, staging_dir)
  self.override_file_copy(expected_from_path, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = 'default'

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(
          options, file_copy=dependency._dependency_file_copy))
def test_table_schema_without_project(self):
  # Reader should pick executing project by default.
  source = df.io.BigQuerySource(table='mydataset.mytable')
  options = PipelineOptions(flags=['--project', 'myproject'])
  source.pipeline_options = options
  reader = source.reader()
  self.assertEquals('SELECT * FROM [myproject:mydataset.mytable];',
                    reader.query)
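By contrast, a sketch of pinning the project in the table spec itself, assuming these transforms accept the usual 'project:dataset.table' form; the expected query string is an inference from the test above, not a verified result:

# Assumed counterpart to the tests above: a fully qualified table spec
# should take precedence over the pipeline-level --project option.
source = df.io.BigQuerySource(table='otherproject:mydataset.mytable')
source.pipeline_options = PipelineOptions(flags=['--project', 'myproject'])
reader = source.reader()
# Expected under this assumption:
# 'SELECT * FROM [otherproject:mydataset.mytable];'
print(reader.query)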
def test_setup_file_not_named_setup_dot_py(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = (
      os.path.join(source_dir, 'xyz-setup.py'))

  self.create_temp_file(os.path.join(source_dir, 'xyz-setup.py'), 'notused')
  with self.assertRaises(RuntimeError) as cm:
    dependency.stage_job_resources(options)
  self.assertTrue(
      cm.exception.message.startswith(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of '))
def test_sdk_location_local(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE),
      'contents')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
  tarball_path = os.path.join(staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
  with open(tarball_path) as f:
    self.assertEqual(f.read(), 'contents')
def test_override_options(self):
  base_flags = ['--num_workers', '5']
  options = PipelineOptions(base_flags)
  self.assertEqual(options.get_all_options()['num_workers'], 5)
  self.assertEqual(options.get_all_options()['mock_flag'], False)

  options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
  self.assertEqual(options.get_all_options()['num_workers'], 5)
  self.assertEqual(options.get_all_options()['mock_flag'], True)
def test_missing_required_options(self):
  options = PipelineOptions([''])
  runner = MockRunners.DataflowPipelineRunner()
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()

  self.assertEqual(
      self.check_errors_for_arguments(
          errors,
          ['project', 'job_name', 'staging_location', 'temp_location']),
      [])
def __init__(self, runner=None, options=None, argv=None):
  """Initialize a pipeline object.

  Args:
    runner: An object of type 'PipelineRunner' that will be used to execute
      the pipeline. For registered runners, the runner name can be specified,
      otherwise a runner object must be supplied.
    options: A configured 'PipelineOptions' object containing arguments
      that should be used for running the Dataflow job.
    argv: a list of arguments (such as sys.argv) to be used for building a
      'PipelineOptions' object. This will only be used if argument 'options'
      is None.

  Raises:
    ValueError: if either the runner or options argument is not of the
    expected type.
  """
  if options is not None:
    if isinstance(options, PipelineOptions):
      self.options = options
    else:
      raise ValueError(
          'Parameter options, if specified, must be of type PipelineOptions. '
          'Received : %r', options)
  elif argv is not None:
    if isinstance(argv, list):
      self.options = PipelineOptions(argv)
    else:
      raise ValueError(
          'Parameter argv, if specified, must be a list. Received : %r', argv)
  else:
    self.options = None

  if runner is None and self.options is not None:
    runner = self.options.view_as(StandardOptions).runner

  if isinstance(runner, str):
    runner = create_runner(runner)
  elif not isinstance(runner, PipelineRunner):
    raise TypeError('Runner must be a PipelineRunner object or the '
                    'name of a registered runner.')

  # List of PValue objects representing a DAG of transformations.
  self._nodes = []
  # Default runner to be used.
  self.runner = runner
  # Stack of transforms generated by nested apply() calls. The stack will
  # contain a root node as an enclosing (parent) node for top transforms.
  self.transforms_stack = [AppliedPTransform(None, None, '', None)]
  # Set of transform labels (full labels) applied to the pipeline.
  # If a transform is applied and the full label is already in the set
  # then the transform will have to be cloned with a new label.
  self.applied_labels = set()
def run_pipeline(self, count_implementation, factor=1):
  input_path = self.create_temp_file('CAT\nDOG\nCAT\nCAT\nDOG\n')
  output_path = input_path + '.result'

  known_args, pipeline_args = custom_ptransform.get_args([
      '--input=%s*' % input_path, '--output=%s' % output_path])

  count_implementation(known_args, PipelineOptions(pipeline_args))
  self.assertEqual(["""(u'CAT', %d)""" % (3 * factor),
                    """(u'DOG', %d)""" % (2 * factor)],
                   self.get_output(output_path))
def get_validator(temp_location):
  options = ['--project=example:example', '--job_name=job',
             '--staging_location=gs://foo/bar']

  if temp_location is not None:
    options.append('--temp_location=' + temp_location)

  pipeline_options = PipelineOptions(options)
  runner = MockRunners.DataflowPipelineRunner()
  validator = PipelineOptionsValidator(pipeline_options, runner)
  return validator
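A sketch of how a helper like get_validator is typically exercised in the surrounding validator tests; the specific expectations are assumptions about valid versus missing temp_location values, not taken from the source:

# Assumed usage of the helper above: a well-formed gs:// path should
# validate cleanly, while omitting --temp_location should produce errors.
errors = get_validator('gs://foo/bar').validate()
assert errors == []
errors = get_validator(None).validate()
assert errors != []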
def get_service_credentials():
  """Get credentials to access Google services."""
  user_agent = 'dataflow-python-sdk/1.0'
  if is_running_in_gce:
    # We are currently running as a GCE taskrunner worker.
    #
    # TODO(ccy): It's not entirely clear if these credentials are thread-safe.
    # If so, we can cache these credentials to save the overhead of creating
    # them again.
    return GCEMetadataCredentials(user_agent=user_agent)
  else:
    # We are currently being run from the command line.
    google_cloud_options = PipelineOptions(
        sys.argv).view_as(GoogleCloudOptions)
    if google_cloud_options.service_account_name:
      if not google_cloud_options.service_account_key_file:
        raise AuthenticationException(
            'key file not provided for service account.')
      if not os.path.exists(google_cloud_options.service_account_key_file):
        raise AuthenticationException(
            'Specified service account key file does not exist.')
      client_scopes = [
          'https://www.googleapis.com/auth/bigquery',
          'https://www.googleapis.com/auth/cloud-platform',
          'https://www.googleapis.com/auth/devstorage.full_control',
          'https://www.googleapis.com/auth/userinfo.email',
          'https://www.googleapis.com/auth/datastore'
      ]

      # The following code uses oauth2client >=2.0.0 functionality and if this
      # is not available due to import errors will use 1.5.2 functionality.
      try:
        from oauth2client.service_account import ServiceAccountCredentials
        return ServiceAccountCredentials.from_p12_keyfile(
            google_cloud_options.service_account_name,
            google_cloud_options.service_account_key_file,
            client_scopes,
            user_agent=user_agent)
      except ImportError:
        with file(google_cloud_options.service_account_key_file) as f:
          service_account_key = f.read()
        from oauth2client.client import SignedJwtAssertionCredentials
        return SignedJwtAssertionCredentials(
            google_cloud_options.service_account_name,
            service_account_key,
            client_scopes,
            user_agent=user_agent)
    else:
      return _GCloudWrapperCredentials(user_agent)
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
  """
  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the dataflow pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)

  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))
  p.run()
def test_with_extra_packages(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
  self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).extra_packages = [
      os.path.join(source_dir, 'abc.tar.gz'),
      os.path.join(source_dir, 'xyz.tar.gz')]

  self.assertEqual(
      ['abc.tar.gz', 'xyz.tar.gz', dependency.EXTRA_PACKAGES_FILE,
       names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
    self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n'], f.readlines())
def model_pipelines(argv):
  """A wordcount snippet as a simple pipeline example.

  URL: https://cloud.google.com/dataflow/model/pipelines
  """
  # [START model_pipelines]
  import re

  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          dest='input',
                          default='gs://dataflow-samples/shakespeare/kinglear.txt',
                          help='Input file to process.')
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  p = df.Pipeline(options=pipeline_options)

  (p
   | df.io.Read(df.io.TextFileSource(my_options.input))
   | df.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
   | df.Map(lambda x: (x, 1))
   | df.combiners.Count.PerKey()
   | df.io.Write(df.io.TextFileSink(my_options.output)))

  p.run()
def __ror__(self, left):
  """Used to apply this PTransform to non-PValues, e.g., a tuple."""
  pvalueish, pvalues = self._extract_input_pvalues(left)
  pipelines = [v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)]
  if pvalues and not pipelines:
    deferred = False
    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow import pipeline
    from google.cloud.dataflow.utils.options import PipelineOptions
    # pylint: enable=g-import-not-at-top
    p = pipeline.Pipeline('DirectPipelineRunner', PipelineOptions(sys.argv))
  else:
    if not pipelines:
      if self.pipeline is not None:
        p = self.pipeline
      else:
        raise ValueError('"%s" requires a pipeline to be specified '
                         'as there are no deferred inputs.' % self.label)
    else:
      p = self.pipeline or pipelines[0]
      for pp in pipelines:
        if p != pp:
          raise ValueError(
              'Mixing value from different pipelines not allowed.')
    deferred = not getattr(p.runner, 'is_eager', False)
  # pylint: disable=g-import-not-at-top
  from google.cloud.dataflow.transforms.core import Create
  # pylint: enable=g-import-not-at-top
  replacements = {id(v): p | Create('CreatePInput%s' % ix, v)
                  for ix, v in enumerate(pvalues)
                  if not isinstance(v, pvalue.PValue) and v is not None}
  pvalueish = _SetInputPValues().visit(pvalueish, replacements)
  self.pipeline = p
  result = p.apply(self, pvalueish)
  if deferred:
    return result
  else:
    # Get a reference to the runners internal cache, otherwise runner may
    # clean it after run.
    cache = p.runner.cache
    p.run()
    return _MaterializePValues(cache).visit(result)
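This __ror__ hook is what allows a transform to be applied to plain Python values rather than PCollections; a minimal sketch of that usage, assuming the usual df alias for the SDK and the non-deferred path taken for plain inputs above:

# Sketch of the behavior __ror__ enables: piping a plain list into a
# transform builds a throwaway DirectPipelineRunner pipeline and, because
# that path is non-deferred, returns materialized results immediately.
import google.cloud.dataflow as df

squares = [1, 2, 3] | df.Map('Square', lambda x: x * x)
print(squares)  # Expected under these assumptions: [1, 4, 9]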
def test_remote_runner_translation(self):
  remote_runner = DataflowPipelineRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions([
                   '--dataflow_endpoint=ignored',
                   '--job_name=test-job',
                   '--project=test-project',
                   '--staging_location=ignored',
                   '--temp_location=/dev/null',
                   '--no_auth=True']))

  res = (p | ptransform.Create('create', [1, 2, 3])
         | ptransform.FlatMap('do', lambda x: [(x, x)])
         | ptransform.GroupByKey('gbk'))
  remote_runner.job = apiclient.Job(p.options)
  super(DataflowPipelineRunner, remote_runner).run(p)
def test_with_requirements_file_and_cache(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).requirements_file = os.path.join(
      source_dir, dependency.REQUIREMENTS_FILE)
  options.view_as(SetupOptions).requirements_cache = os.path.join(
      tempfile.gettempdir(), 'alternative-cache-dir')
  self.create_temp_file(
      os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')

  self.assertEqual(
      sorted([dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE,
              'abc.txt', 'def.txt']),
      sorted(dependency.stage_job_resources(
          options,
          populate_requirements_cache=self.populate_requirements_cache)))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
  self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
  self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
def test_deferred_side_input_iterable(self):
  @typehints.with_input_types(str, typehints.Iterable[str])
  def concat(glue, items):
    return glue.join(sorted(items))

  p = df.Pipeline(options=PipelineOptions([]))
  main_input = p | df.Create(['a', 'bb', 'c'])
  side_input = p | df.Create('side', ['x', 'y', 'z'])
  result = main_input | df.Map(concat, pvalue.AsIter(side_input))
  assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
  p.run()

  bad_side_input = p | df.Create('bad_side', [1, 2, 3])
  with self.assertRaises(typehints.TypeCheckError):
    main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
  """
  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowPipelineRunner or
  # BlockingDataflowPipelineRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # Overriding the runner for tests.
  options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
  p = Pipeline(options=options)

  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))
  p.run()
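For context, a small sketch of reading such custom options back out of a flags list, assuming a MyOptions subclass like the ones defined in the snippets above; the flag values are placeholders:

# Placeholder flag values; shows how a custom options subclass defined as
# in the snippets above is populated from command-line style arguments.
from google.cloud.dataflow.utils.options import PipelineOptions

options = PipelineOptions(['--input=gs://my-bucket/input',
                           '--output=gs://my-bucket/output'])
my_options = options.view_as(MyOptions)
print(my_options.input, my_options.output)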
def __init__(self, runner=None, options=None, argv=None):
  """Initialize a pipeline object.

  Args:
    runner: An object of type 'PipelineRunner' that will be used to execute
      the pipeline. For registered runners, the runner name can be specified,
      otherwise a runner object must be supplied.
    options: A configured 'PipelineOptions' object containing arguments
      that should be used for running the Dataflow job.
    argv: a list of arguments (such as sys.argv) to be used for building a
      'PipelineOptions' object. This will only be used if argument 'options'
      is None.

  Raises:
    ValueError: if either the runner or options argument is not of the
    expected type.
  """
  if options is not None:
    if isinstance(options, PipelineOptions):
      self.options = options
    else:
      raise ValueError(
          'Parameter options, if specified, must be of type PipelineOptions. '
          'Received : %r', options)
  elif argv is not None:
    if isinstance(argv, list):
      self.options = PipelineOptions(argv)
    else:
      raise ValueError(
          'Parameter argv, if specified, must be a list. Received : %r', argv)
  else:
    self.options = None

  if runner is None and self.options is not None:
    runner = self.options.view_as(StandardOptions).runner
    if runner is None:
      runner = StandardOptions.DEFAULT_RUNNER
      logging.info(('Missing pipeline option (runner). Executing pipeline '
                    'using the default runner: %s.'), runner)

  if isinstance(runner, str):
    runner = create_runner(runner)
  elif not isinstance(runner, PipelineRunner):
    raise TypeError('Runner must be a PipelineRunner object or the '
                    'name of a registered runner.')

  # Validate pipeline options
  if self.options is not None:
    errors = PipelineOptionsValidator(self.options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validations errors: \n' + '\n'.join(errors))

  # Default runner to be used.
  self.runner = runner
  # Stack of transforms generated by nested apply() calls. The stack will
  # contain a root node as an enclosing (parent) node for top transforms.
  self.transforms_stack = [AppliedPTransform(None, None, '', None)]
  # Set of transform labels (full labels) applied to the pipeline.
  # If a transform is applied and the full label is already in the set
  # then the transform will have to be cloned with a new label.
  self.applied_labels = set()
  # Store cache of views created from PCollections. For reference, see
  # pvalue._cache_view().
  self._view_cache = {}
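A short sketch of the three construction paths this initializer accepts, per its docstring; the runner name and flags are illustrative assumptions:

# Illustrative construction paths for the initializer above; the runner
# name and flags are examples, not taken from the surrounding code.
p1 = Pipeline('DirectPipelineRunner')                            # runner by name
p2 = Pipeline(options=PipelineOptions(['--runner=DirectPipelineRunner']))
p3 = Pipeline(argv=['--runner=DirectPipelineRunner'])            # options built from argv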
def pipeline_monitoring(renames):
  """Using monitoring interface snippets.

  URL: https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf
  """
  import re
  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='output for the dataflow pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(df.DoFn):

    def process(self, context):
      words = re.findall(r'[A-Za-z\']+', context.element)
      for word in words:
        yield word

  class FormatCountsFn(df.DoFn):

    def process(self, context):
      word, count = context.element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(df.PTransform):

    def apply(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | df.ParDo('ExtractWords', ExtractWordsFn())
              # Count the number of times each word occurs.
              | df.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | df.ParDo('FormatCounts', FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = df.Pipeline(options=pipeline_options)

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | df.io.Read('ReadLines', df.io.TextFileSource(options.input))
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | df.io.Write('WriteCounts', df.io.TextFileSink(options.output)))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()