def run(argv=None):
    """Parse the command line and execute the three count pipelines in order.

    Args:
      argv: Argument list forwarded to ``get_args``; defaults to None.
    """
    parsed = get_args(argv)
    known_args = parsed[0]
    pipeline_args = parsed[1]
    options = PipelineOptions(pipeline_args)

    # All three variants share the same parsed arguments and options object.
    run_count1(known_args, options)
    run_count2(known_args, options)
    run_count3(known_args, options)
def test_with_setup_file(self):
    """Staging with --setup_file yields the workflow tarball and the session."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    expected_resources = [
        dependency.WORKFLOW_TARBALL_FILE,
        names.PICKLED_MAIN_SESSION_FILE,
    ]
    # We replace the build setup command because a realistic one would
    # require the setuptools package to be installed. Note that we can't
    # use "touch" here to create the expected output tarball file, since
    # touch is not available on Windows, so we invoke python to produce
    # equivalent behavior.
    fake_build_command = [
        'python', '-c', 'open(__import__("sys").argv[1], "a")',
        os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE),
    ]
    staged_resources = dependency.stage_job_resources(
        options,
        build_setup_args=fake_build_command,
        temp_dir=source_dir)
    self.assertEqual(expected_resources, staged_resources)

    staged_tarball = os.path.join(
        staging_dir, dependency.WORKFLOW_TARBALL_FILE)
    self.assertTrue(os.path.isfile(staged_tarball))
def test_table_schema_without_project(self):
    """A sink without an explicit project uses the --project pipeline option."""
    # Writer should pick executing project by default.
    sink = df.io.BigQuerySink(table='mydataset.mytable')
    options = PipelineOptions(flags=['--project', 'myproject'])
    sink.pipeline_options = options
    writer = sink.writer()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual('myproject', writer.project_id)
def _run_write_test(self,
                    data,
                    return_init_result=True,
                    return_write_results=True):
    """Write ``data`` through a WriteToTestSink and check sink/writer state.

    Args:
      data: Elements fed to the pipeline via ``df.Create``.
      return_init_result: Forwarded to ``WriteToTestSink``; when true the
        init result seen by the writer and at finalize is verified.
      return_write_results: Forwarded to ``WriteToTestSink``; when true the
        write results seen at finalize are verified.
    """
    write_to_test_sink = WriteToTestSink(return_init_result,
                                         return_write_results)
    pipeline = Pipeline(options=PipelineOptions([]))
    output = pipeline | df.Create('start', data) | write_to_test_sink
    assert_that(output, is_empty())
    pipeline.run()

    sink = write_to_test_sink.last_sink
    self.assertIsNotNone(sink)
    self.assertEqual(sink.state, _TestSink.STATE_FINALIZED)

    if not data:
        # With no input the test expects that no writer was recorded.
        self.assertIsNone(sink.last_writer)
        return

    self.assertIsNotNone(sink.last_writer)
    self.assertEqual(sink.last_writer.state, _TestWriter.STATE_CLOSED)
    self.assertEqual(sink.last_writer.write_output, data)
    if return_init_result:
        self.assertEqual(sink.last_writer.init_result,
                         _TestSink.TEST_INIT_RESULT)
        self.assertEqual(sink.init_result_at_finalize,
                         _TestSink.TEST_INIT_RESULT)
    self.assertIsNotNone(sink.last_writer.uid)
    if return_write_results:
        self.assertEqual(sink.write_results_at_finalize,
                         [_TestWriter.TEST_WRITE_RESULT])
def test_table_schema_without_project(self):
    """A source without an explicit project uses the --project option."""
    # Reader should pick executing project by default.
    source = df.io.BigQuerySource(table='mydataset.mytable')
    options = PipelineOptions(flags=['--project', 'myproject'])
    source.pipeline_options = options
    reader = source.reader()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual('SELECT * FROM [myproject:mydataset.mytable];',
                     reader.query)
def test_override_options(self):
    """Setting a value through a view shows up in get_all_options()."""
    options = PipelineOptions(['--num_workers', '5'])

    def current(option_name):
        # Re-read the full option dictionary on every lookup.
        return options.get_all_options()[option_name]

    self.assertEqual(current('num_workers'), 5)
    self.assertEqual(current('mock_flag'), False)

    options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True

    self.assertEqual(current('num_workers'), 5)
    self.assertEqual(current('mock_flag'), True)
def test_no_main_session(self):
    """With save_main_session disabled, no resources are staged."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    staged_resources = dependency.stage_job_resources(options)
    self.assertEqual([], staged_resources)
def test_missing_required_options(self):
    """Validation of empty options reports each required argument."""
    required_arguments = [
        'project', 'job_name', 'staging_location', 'temp_location']

    options = PipelineOptions([''])
    runner = MockRunners.DataflowPipelineRunner()
    errors = PipelineOptionsValidator(options, runner).validate()

    # The helper's result is expected to be an empty list here.
    unmatched = self.check_errors_for_arguments(errors, required_arguments)
    self.assertEqual(unmatched, [])
def test_get_all_options(self):
    """Each TEST_CASES entry parses to its expected option values."""
    for case in PipelineOptionsTest.TEST_CASES:
        expected = case['expected']
        options = PipelineOptions(flags=case['flags'])
        self.assertDictContainsSubset(expected, options.get_all_options())

        # Check the mock options one by one through a fresh view each time.
        for option_name in ('mock_flag', 'mock_option'):
            mock_view = options.view_as(PipelineOptionsTest.MockOptions)
            self.assertEqual(getattr(mock_view, option_name),
                             expected[option_name])
def test_option_with_spcae(self):
    """An option whose name and value contain spaces survives a round trip."""
    option_name = 'option with space'
    expected_value = ' value with space'

    options = PipelineOptions(
        flags=['--option with space= value with space'])
    parsed_view = options.view_as(PipelineOptionsTest.MockOptions)
    self.assertEqual(getattr(parsed_view, option_name), expected_value)

    # Rebuild the options from their dictionary form and check again.
    options_from_dict = PipelineOptions.from_dictionary(
        options.get_all_options())
    rebuilt_view = options_from_dict.view_as(PipelineOptionsTest.MockOptions)
    self.assertEqual(getattr(rebuilt_view, option_name), expected_value)
def test_default_resources(self):
    """By default only the pickled main session file is staged."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    staged_resources = dependency.stage_job_resources(options)
    self.assertEqual([names.PICKLED_MAIN_SESSION_FILE], staged_resources)

    session_path = os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)
    self.assertTrue(os.path.isfile(session_path))
def test_no_temp_location(self):
    """Staging without a temp_location raises a descriptive RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.staging_location = staging_dir
    self.update_options(options)
    # Clear temp_location after update_options so it is unset when staging.
    google_cloud_options.temp_location = None

    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # str(exc) replaces the deprecated BaseException.message attribute
    # (PEP 352), which does not exist on Python 3.
    self.assertEqual('The --temp_location option must be specified.',
                     str(cm.exception))
def test_requirements_file_not_present(self):
    """A missing --requirements_file is reported with a RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'

    # Only the staging call sits inside assertRaises, so a RuntimeError
    # raised by the setup above cannot satisfy the expectation by accident.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # str(exc) replaces the deprecated BaseException.message attribute
    # (PEP 352), which does not exist on Python 3.
    self.assertEqual(
        str(cm.exception),
        'The file %s cannot be found. It was specified in the '
        '--requirements_file command line option.' % 'nosuchfile')
def test_sdk_location_gcs(self):
    """A gs:// sdk_location results in the SDK tarball being staged."""
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    staging_dir = tempfile.mkdtemp()
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    expected_resources = [
        names.PICKLED_MAIN_SESSION_FILE,
        names.DATAFLOW_SDK_TARBALL_FILE,
    ]
    self.assertEqual(expected_resources,
                     dependency.stage_job_resources(options))
def test_from_dictionary(self):
    """Options rebuilt with from_dictionary keep the expected mock_flag."""
    for case in PipelineOptionsTest.TEST_CASES:
        expected = case['expected']
        options = PipelineOptions(flags=case['flags'])
        options_from_dict = PipelineOptions.from_dictionary(
            options.get_all_options())

        rebuilt_view = options_from_dict.view_as(
            PipelineOptionsTest.MockOptions)
        self.assertEqual(rebuilt_view.mock_flag, expected['mock_flag'])

        # NOTE(review): this assertion inspects the originally parsed
        # options, not options_from_dict -- confirm that is intentional.
        parsed_view = options.view_as(PipelineOptionsTest.MockOptions)
        self.assertEqual(parsed_view.mock_option, expected['mock_option'])
def get_validator(temp_location):
    """Build a PipelineOptionsValidator for a fixed set of Dataflow flags.

    Args:
      temp_location: Value for ``--temp_location``; the flag is left out
        entirely when this is None.

    Returns:
      A PipelineOptionsValidator built from the options and a mock
      DataflowPipelineRunner.
    """
    flags = [
        '--project=example:example',
        '--job_name=job',
        '--staging_location=gs://foo/bar',
    ]
    if temp_location is not None:
        flags += ['--temp_location=' + temp_location]

    return PipelineOptionsValidator(
        PipelineOptions(flags), MockRunners.DataflowPipelineRunner())
def run_pipeline(self, count_implementation, factor=1):
    """Run a count implementation on a small input and verify its output.

    Args:
      count_implementation: Callable invoked with the parsed known args and
        a PipelineOptions built from the remaining pipeline args.
      factor: Multiplier applied to the expected per-word counts.
    """
    input_path = self.create_temp_file('CAT\nDOG\nCAT\nCAT\nDOG\n')
    output_path = input_path + '.result'

    argv = ['--input=%s*' % input_path, '--output=%s' % output_path]
    known_args, pipeline_args = custom_ptransform.get_args(argv)
    count_implementation(known_args, PipelineOptions(pipeline_args))

    # The input contains three CAT lines and two DOG lines.
    expected_lines = [
        """(u'CAT', %d)""" % (3 * factor),
        """(u'DOG', %d)""" % (2 * factor),
    ]
    self.assertEqual(expected_lines, self.get_output(output_path))
def get_service_credentials():
    """Get credentials to access Google services.

    Returns:
      GCEMetadataCredentials when ``is_running_in_gce`` is set; otherwise
      service-account credentials when ``--service_account_name`` is given,
      or _GCloudWrapperCredentials when it is not.

    Raises:
      AuthenticationException: a service account name was given but the key
        file option is missing or points to a path that does not exist.
    """
    user_agent = 'dataflow-python-sdk/1.0'
    if is_running_in_gce:
        # We are currently running as a GCE taskrunner worker.
        #
        # TODO(ccy): It's not entirely clear if these credentials are
        # thread-safe. If so, we can cache these credentials to save the
        # overhead of creating them again.
        return GCEMetadataCredentials(user_agent=user_agent)

    # We are currently being run from the command line.
    google_cloud_options = PipelineOptions(
        sys.argv).view_as(GoogleCloudOptions)
    if not google_cloud_options.service_account_name:
        return _GCloudWrapperCredentials(user_agent)

    if not google_cloud_options.service_account_key_file:
        raise AuthenticationException(
            'key file not provided for service account.')
    if not os.path.exists(google_cloud_options.service_account_key_file):
        raise AuthenticationException(
            'Specified service account key file does not exist.')

    client_scopes = [
        'https://www.googleapis.com/auth/bigquery',
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/devstorage.full_control',
        'https://www.googleapis.com/auth/userinfo.email',
        'https://www.googleapis.com/auth/datastore'
    ]

    # The following code uses oauth2client >=2.0.0 functionality and if this
    # is not available due to import errors will use 1.5.2 functionality.
    try:
        from oauth2client.service_account import ServiceAccountCredentials
        return ServiceAccountCredentials.from_p12_keyfile(
            google_cloud_options.service_account_name,
            google_cloud_options.service_account_key_file,
            client_scopes,
            user_agent=user_agent)
    except ImportError:
        # open() replaces the Python-2-only file() builtin. The same path is
        # handed to from_p12_keyfile above, so it is read as binary data to
        # avoid newline translation on platforms that distinguish text mode.
        with open(google_cloud_options.service_account_key_file, 'rb') as f:
            service_account_key = f.read()
        from oauth2client.client import SignedJwtAssertionCredentials
        return SignedJwtAssertionCredentials(
            google_cloud_options.service_account_name,
            service_account_key,
            client_scopes,
            user_agent=user_agent)
def test_sdk_location_local_not_present(self):
    """A nonexistent local sdk_location is reported with a RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    # Only the staging call sits inside assertRaises, so a RuntimeError
    # raised by the setup above cannot satisfy the expectation by accident.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # str(exc) replaces the deprecated BaseException.message attribute
    # (PEP 352), which does not exist on Python 3.
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        str(cm.exception))
def _run_write_test(self,
                    data,
                    return_init_result=True,
                    return_write_results=True):
    """Write ``data`` through a WriteToTestSink and check a sink was recorded.

    Args:
      data: Elements fed to the pipeline via ``df.Create``.
      return_init_result: Forwarded to ``WriteToTestSink``.
      return_write_results: Forwarded to ``WriteToTestSink``.
    """
    write_to_test_sink = WriteToTestSink(return_init_result,
                                         return_write_results)
    pipeline = Pipeline(options=PipelineOptions([]))
    output = pipeline | df.Create('start', data) | write_to_test_sink
    assert_that(output, is_empty())
    pipeline.run()

    recorded_sink = write_to_test_sink.last_sink
    self.assertIsNotNone(recorded_sink)
def __ror__(self, left):
    """Used to apply this PTransform to non-PValues, e.g., a tuple.

    Args:
      left: The left operand of ``|``; its input values are obtained via
        ``self._extract_input_pvalues``.

    Returns:
      The result of applying this transform when evaluation is deferred,
      otherwise the values materialized from the runner's cache after the
      pipeline has been run.

    Raises:
      ValueError: no pipeline can be determined, or the inputs belong to
        more than one pipeline.
    """
    pvalueish, pvalues = self._extract_input_pvalues(left)
    pipelines = []
    for value in pvalues:
        if isinstance(value, pvalue.PValue):
            pipelines.append(value.pipeline)

    if pvalues and not pipelines:
        # Inputs were given but none is a PValue: build a direct pipeline
        # and evaluate without deferring.
        deferred = False
        # pylint: disable=g-import-not-at-top
        from google.cloud.dataflow import pipeline
        from google.cloud.dataflow.utils.options import PipelineOptions
        # pylint: enable=g-import-not-at-top
        p = pipeline.Pipeline('DirectPipelineRunner',
                              PipelineOptions(sys.argv))
    elif not pipelines:
        if self.pipeline is None:
            raise ValueError('"%s" requires a pipeline to be specified '
                             'as there are no deferred inputs.' % self.label)
        p = self.pipeline
        deferred = not getattr(p.runner, 'is_eager', False)
    else:
        p = self.pipeline or pipelines[0]
        for other in pipelines:
            if p != other:
                raise ValueError(
                    'Mixing value from different pipelines not allowed.')
        deferred = not getattr(p.runner, 'is_eager', False)

    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow.transforms.core import Create
    # pylint: enable=g-import-not-at-top
    # Wrap every non-None input that is not a PValue in a Create transform,
    # keyed by the identity of the original object.
    replacements = {}
    for ix, value in enumerate(pvalues):
        if isinstance(value, pvalue.PValue) or value is None:
            continue
        replacements[id(value)] = p | Create('CreatePInput%s' % ix, value)

    pvalueish = _SetInputPValues().visit(pvalueish, replacements)
    self.pipeline = p
    result = p.apply(self, pvalueish)
    if deferred:
        return result

    # Get a reference to the runners internal cache, otherwise runner may
    # clean it after run.
    cache = p.runner.cache
    p.run()
    return _MaterializePValues(cache).visit(result)
def test_remote_runner_translation(self):
    """Run a small pipeline through the runner's base-class run()."""
    remote_runner = DataflowPipelineRunner()
    pipeline_flags = [
        '--dataflow_endpoint=ignored',
        '--job_name=test-job',
        '--project=test-project',
        '--staging_location=ignored',
        '--temp_location=/dev/null',
        '--no_auth=True',
    ]
    p = Pipeline(remote_runner, options=PipelineOptions(pipeline_flags))

    created = p | ptransform.Create('create', [1, 2, 3])
    paired = created | ptransform.FlatMap('do', lambda x: [(x, x)])
    res = paired | ptransform.GroupByKey('gbk')  # pylint: disable=unused-variable

    remote_runner.job = apiclient.Job(p.options)
    # Call run() as defined on the parent of DataflowPipelineRunner.
    super(DataflowPipelineRunner, remote_runner).run(p)
def test_deferred_side_input_iterable(self):
    """An AsIter side input is type-checked against Iterable[str]."""

    @typehints.with_input_types(str, typehints.Iterable[str])
    def concat(glue, items):
        return glue.join(sorted(items))

    p = df.Pipeline(options=PipelineOptions([]))
    main_input = p | df.Create(['a', 'bb', 'c'])
    side_input = p | df.Create('side', ['x', 'y', 'z'])

    joined = main_input | df.Map(concat, pvalue.AsIter(side_input))
    assert_that(joined, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
    p.run()

    # A side input of ints is expected to fail the type check.
    bad_side_input = p | df.Create('bad_side', [1, 2, 3])
    with self.assertRaises(typehints.TypeCheckError):
        # pylint: disable=expression-not-assigned
        main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
def test_with_extra_packages_invalid_file_name(self):
    """An extra package not ending in .tar.gz raises a RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    package_path = os.path.join(source_dir, 'abc.tgz')
    self.create_temp_file(package_path, 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [package_path]

    # Only the staging call sits inside assertRaises, so a RuntimeError
    # raised by the setup above cannot satisfy the expectation by accident.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # str(exc) replaces the deprecated BaseException.message attribute
    # (PEP 352), which does not exist on Python 3.
    self.assertEqual(
        str(cm.exception),
        'The --extra_packages option expects a full path ending with '
        '\'.tar.gz\' instead of %s' % package_path)
def test_deferred_side_inputs(self):
    """An AsSingleton side input is type-checked against the int hint."""

    @typehints.with_input_types(str, int)
    def repeat(s, times):
        return s * times

    p = df.Pipeline(options=PipelineOptions([]))
    main_input = p | df.Create(['a', 'bb', 'c'])
    side_input = p | df.Create('side', [3])

    repeated = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
    assert_that(repeated, equal_to(['aaa', 'bbbbbb', 'ccc']))
    p.run()

    # A string side input is expected to fail the type check.
    bad_side_input = p | df.Create('bad_side', ['z'])
    with self.assertRaises(typehints.TypeCheckError):
        # pylint: disable=expression-not-assigned
        main_input | df.Map('again', repeat,
                            pvalue.AsSingleton(bad_side_input))
def test_with_extra_packages_missing_files(self):
    """A nonexistent extra package is reported with a RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

    # Only the staging call sits inside assertRaises, so a RuntimeError
    # raised by the setup above cannot satisfy the expectation by accident.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # str(exc) replaces the deprecated BaseException.message attribute
    # (PEP 352), which does not exist on Python 3.
    self.assertEqual(
        str(cm.exception),
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def test_with_extra_packages(self):
    """Local and GCS extra packages are staged and listed in order."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        'gs://my-gcs-bucket/gcs.tar.gz'
    ]

    gcs_copied_files = []

    def file_copy(from_path, to_path):
        """Fake copy: record GCS sources, skip GCS targets, copy the rest."""
        if from_path.startswith('gs://'):
            gcs_copied_files.append(from_path)
            _, from_name = os.path.split(from_path)
            self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
            logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
        elif to_path.startswith('gs://'):
            logging.info('Faking file_copy(%s, %s)', from_path, to_path)
        else:
            shutil.copyfile(from_path, to_path)

    # Install the fake only for the duration of this test; without the
    # restore, the module-level replacement would stay in place afterwards.
    original_file_copy = dependency._dependency_file_copy
    dependency._dependency_file_copy = file_copy
    try:
        self.assertEqual([
            'abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
            dependency.EXTRA_PACKAGES_FILE, names.PICKLED_MAIN_SESSION_FILE
        ], dependency.stage_job_resources(options))
        with open(os.path.join(staging_dir,
                               dependency.EXTRA_PACKAGES_FILE)) as f:
            self.assertEqual(
                ['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                f.readlines())
        self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
    finally:
        dependency._dependency_file_copy = original_file_copy
def test_sdk_location_default(self):
    """sdk_location='default' stages the SDK tarball for __version__."""
    staging_dir = tempfile.mkdtemp()

    # URL the test expects the SDK tarball to be downloaded from.
    expected_from_url = '%s/v%s.tar.gz' % (
        dependency.PACKAGES_URL_PREFIX, __version__)
    expected_from_path = self.override_file_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    expected_resources = [
        names.PICKLED_MAIN_SESSION_FILE,
        names.DATAFLOW_SDK_TARBALL_FILE,
    ]
    staged_resources = dependency.stage_job_resources(
        options, file_copy=dependency._dependency_file_copy)
    self.assertEqual(expected_resources, staged_resources)
def test_with_requirements_file(self):
    """An existing --requirements_file is staged with the main session."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    # The file itself is created after the option is set, before staging.
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')

    expected_resources = [
        dependency.REQUIREMENTS_FILE,
        names.PICKLED_MAIN_SESSION_FILE,
    ]
    self.assertEqual(expected_resources,
                     dependency.stage_job_resources(options))

    staged_requirements = os.path.join(
        staging_dir, dependency.REQUIREMENTS_FILE)
    self.assertTrue(os.path.isfile(staged_requirements))
def test_setup_file_not_named_setup_dot_py(self):
    """A --setup_file not named setup.py is rejected with a RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = (
        os.path.join(source_dir, 'xyz-setup.py'))
    # The file exists, so only its name can be the reason for the failure.
    self.create_temp_file(
        os.path.join(source_dir, 'xyz-setup.py'), 'notused')

    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # str(exc) replaces the deprecated BaseException.message attribute
    # (PEP 352), which does not exist on Python 3.
    self.assertTrue(
        str(cm.exception).startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))