def test_unsupported_type_display_data(self):
  # create_from_options must raise ValueError when handed this plain
  # HasDisplayData subclass.
  class MyDisplayComponent(HasDisplayData):

    def display_data(self):
      return {'item_key': 'item_value'}

  def build_display_data():
    return DisplayData.create_from_options(MyDisplayComponent())

  self.assertRaises(ValueError, build_display_data)
def test_project_table_display_data(self):
  # A sink built from a project-qualified table spec should expose that spec
  # under 'table' and report 'validation' as False.
  table_spec = 'PROJECT:dataset.table'
  display_data = DisplayData.create_from(beam.io.BigQuerySink(table_spec))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('table', table_spec),
          DisplayDataItemMatcher('validation', False)))
def individual_test_per_key_dd(sampleFn, n):
  """Asserts that ``sampleFn(n)`` exposes exactly one display item, 'n'.

  The expected value is read back from the transform's ``_n`` attribute.
  """
  transform = sampleFn(n)
  display_data = DisplayData.create_from(transform)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('n', transform._n)))
def test_create_list_display_data(self):
  # Repeated --extra_package flags should surface as a single
  # 'extra_packages' item holding the stringified list.
  options = PipelineOptions(
      flags=['--extra_package', 'package1', '--extra_package', 'package2'])
  display_data = DisplayData.create_from_options(options)
  expected = DisplayDataItemMatcher(
      'extra_packages', str(['package1', 'package2']))
  hc.assert_that(display_data.items, hc.contains_inanyorder(expected))
def test_sink_display_data(self):
  # The avro sink should report its schema, sharded file pattern, codec and
  # compression.
  avro_sink = _create_avro_sink(
      'some_avro_sink',
      self.SCHEMA,
      'null',
      '.end',
      0,
      None,
      'application/x-avro',
      use_fastavro=self.use_fastavro)
  display_data = DisplayData.create_from(avro_sink)
  matchers = (
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
      DisplayDataItemMatcher('codec', 'null'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data(self):
  # A PubSubSink should expose only its topic as display data.
  display_data = DisplayData.create_from(PubSubSink('a_topic'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', 'a_topic')))
def test_display_data_no_subscription(self):
  # A PubSubSource created from a topic alone should expose only the topic.
  display_data = DisplayData.create_from(PubSubSource('a_topic'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', 'a_topic')))
def test_value_provider_display_data(self):
  # Only --int_flag is given on the command line; the other two value
  # provider flags are expected to show up via their RuntimeValueProvider
  # string form.
  class TestOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_value_provider_argument(
          '--int_flag', type=int, help='int_flag description')
      parser.add_value_provider_argument(
          '--str_flag',
          type=str,
          default='hello',
          help='str_flag description')
      parser.add_value_provider_argument(
          '--float_flag', type=float, help='float_flag description')

  display_data = DisplayData.create_from_options(
      TestOptions(['--int_flag', '1']))
  int_matcher = DisplayDataItemMatcher('int_flag', '1')
  str_matcher = DisplayDataItemMatcher(
      'str_flag',
      'RuntimeValueProvider(option: str_flag,'
      ' type: str, default_value: \'hello\')')
  float_matcher = DisplayDataItemMatcher(
      'float_flag',
      'RuntimeValueProvider(option: float_flag,'
      ' type: float, default_value: None)')
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(int_matcher, str_matcher, float_matcher))
def _add_step(self, step_kind, step_label, transform_node, side_tags=()):
  """Builds a new Step, registers it on the job and caches its outputs.

  Args:
    step_kind: kind passed to ``apiclient.Step``.
    step_label: value stored under ``PropertyNames.USER_NAME``.
    transform_node: node whose outputs are cached against the new step and
      whose ``transform`` supplies the display data.
    side_tags: output tags to cache in addition to the main (``None``)
      output.

  Returns:
    The newly created step.
  """
  # Imported lazily so that local running scenarios do not need this
  # dependency.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.runners.dataflow.internal import apiclient

  new_step = apiclient.Step(step_kind, self._get_unique_step_name())
  self.job.proto.steps.append(new_step.proto)
  new_step.add_property(PropertyNames.USER_NAME, step_label)

  # The main output is cached under the None tag. For a multi-output node
  # every side tag is cached as well, because cache lookups are always keyed
  # by (node, tag).
  for output_tag in (None,) + tuple(side_tags):
    self._cache.cache_output(transform_node, output_tag, new_step)

  # Attach the transform's display data; a transform without any display
  # data contributes an empty list.
  display_items = DisplayData.create_from(transform_node.transform).items
  new_step.add_property(
      PropertyNames.DISPLAY_DATA,
      [display_item.get_dict() for display_item in display_items])
  return new_step
def test_display_data(self):
  # The payload sink should expose only its fully qualified topic.
  topic = 'projects/fakeprj/topics/a_topic'
  display_data = DisplayData.create_from(_PubSubPayloadSink(topic))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', topic)))
def test_query_only_display_data(self):
  # A source created from a query alone should expose the query and
  # validation=False.
  query_source = beam.io.BigQuerySource(query='my_query')
  display_data = DisplayData.create_from(query_source)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('validation', False),
          DisplayDataItemMatcher('query', 'my_query')))
def test_date_partitioned_table_name(self):
  # The '$20030102' partition suffix should be kept in the 'table' item.
  table_spec = 'dataset.table$20030102'
  display_data = DisplayData.create_from(
      beam.io.BigQuerySource(table_spec, validate=True))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('validation', True),
          DisplayDataItemMatcher('table', table_spec)))
def individual_test_per_key_dd(combineFn):
  """Checks the display data of ``beam.CombinePerKey(combineFn)``.

  Expected items are the combine fn class plus the fn's ``_n`` value and the
  ``__name__`` of its ``_compare`` callable.
  """
  display_data = DisplayData.create_from(beam.CombinePerKey(combineFn))
  matchers = (
      DisplayDataItemMatcher('combine_fn', combineFn.__class__),
      DisplayDataItemMatcher('n', combineFn._n),
      DisplayDataItemMatcher('compare', combineFn._compare.__name__),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_combine_globally_display_data(self):
  # CombineGlobally(Smallest(5)) should expose the combine fn class, n=5 and
  # the 'gt' comparator name.
  smallest = combine.Smallest(5)
  display_data = DisplayData.create_from(beam.CombineGlobally(smallest))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('combine_fn', combine.Smallest),
          DisplayDataItemMatcher('n', 5),
          DisplayDataItemMatcher('compare', 'gt')))
def test_display_data_no_subscription(self):
  # With only a topic given, the source should expose the topic and
  # with_attributes=False.
  topic = 'projects/fakeprj/topics/a_topic'
  display_data = DisplayData.create_from(_PubSubSource(topic))
  matchers = (
      DisplayDataItemMatcher('topic', topic),
      DisplayDataItemMatcher('with_attributes', False),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_basic_combiners_display_data(self):
  # A TupleCombineFn should list the names of its component combiners.
  tuple_fn = combine.TupleCombineFn(max, combine.MeanCombineFn(), sum)
  display_data = DisplayData.create_from(beam.CombineGlobally(tuple_fn))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('combine_fn', combine.TupleCombineFn),
          DisplayDataItemMatcher(
              'combiners', "['max', 'MeanCombineFn', 'sum']")))
def test_single_file_display_data(self):
  # A LineSource over a freshly written file should report that file name as
  # its pattern, with 'auto' compression.
  written_name, _ = write_data(10)
  display_data = DisplayData.create_from(LineSource(written_name))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('file_pattern', written_name),
          DisplayDataItemMatcher('compression', 'auto')))
def test_source_creation_display_data(self):
  # Created with validate=False, the source should still report the pattern
  # and 'auto' compression.
  pattern = 'dummy_pattern'
  display_data = DisplayData.create_from(LineSource(pattern, validate=False))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('compression', 'auto'),
          DisplayDataItemMatcher('file_pattern', pattern)))
def test_display_data(self):
  # Topic, subscription and id label passed to the source should each appear
  # as a display item.
  payload_source = _PubSubPayloadSource(
      'a_topic', 'a_subscription', 'a_label')
  display_data = DisplayData.create_from(payload_source)
  matchers = (
      DisplayDataItemMatcher('topic', 'a_topic'),
      DisplayDataItemMatcher('subscription', 'a_subscription'),
      DisplayDataItemMatcher('id_label', 'a_label'),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data(self):
  # The payload source should expose its topic, subscription and id label.
  display_data = DisplayData.create_from(
      _PubSubPayloadSource('a_topic', 'a_subscription', 'a_label'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('topic', 'a_topic'),
          DisplayDataItemMatcher('subscription', 'a_subscription'),
          DisplayDataItemMatcher('id_label', 'a_label')))
def test_unicode_type_display_data(self):
  # Both a unicode() value and a u'' literal should be typed as STRING.
  class MyDoFn(beam.DoFn):

    def display_data(self):
      return {
          'unicode_string': unicode('my string'),
          'unicode_literal_string': u'my literal string',
      }

  display_data = DisplayData.create_from(MyDoFn())
  for display_item in display_data.items:
    self.assertEqual(display_item.type, 'STRING')
def test_basic_combiners_display_data(self):
  # The tuple combiner should report its class and the names of the wrapped
  # combiners.
  globally = beam.CombineGlobally(
      combine.TupleCombineFn(max, combine.MeanCombineFn(), sum))
  display_data = DisplayData.create_from(globally)
  matchers = (
      DisplayDataItemMatcher('combine_fn', combine.TupleCombineFn),
      DisplayDataItemMatcher('combiners', "['max', 'MeanCombineFn', 'sum']"),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data_item_on_validate_true(self):
  # validate=True on the native source should be reflected in the
  # 'validation' item.
  native_source = beam.io.BigQuerySource(
      'dataset.table', validate=True, use_dataflow_native_source=True)
  display_data = DisplayData.create_from(native_source)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('validation', True),
          DisplayDataItemMatcher('table', 'dataset.table')))
def test_read_display_data(self):
  # ReadFromAvro should expose only compression and file pattern; there are
  # no extra avro parameters for AvroSource.
  pattern = 'some_avro_source'
  display_data = DisplayData.create_from(
      avroio.ReadFromAvro(pattern, validate=False))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('compression', 'auto'),
          DisplayDataItemMatcher('file_pattern', pattern)))
def test_display_data_topic(self):
  # With a topic and id label but no subscription, only those two items are
  # expected.
  topic = 'projects/fakeprj/topics/a_topic'
  display_data = DisplayData.create_from(
      _PubSubSource(topic, None, 'a_label'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('topic', topic),
          DisplayDataItemMatcher('id_label', 'a_label')))
def test_read_display_data(self):
  # No extra avro parameters for AvroSource: only compression and file
  # pattern are expected.
  avro_read = avroio.ReadFromAvro('some_avro_source', validate=False)
  display_data = DisplayData.create_from(avro_read)
  matchers = (
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', 'some_avro_source'),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data_subscription(self):
  # With a subscription and no topic, the source should expose the
  # subscription, the id label and with_attributes=False.
  subscription = 'projects/fakeprj/subscriptions/a_subscription'
  display_data = DisplayData.create_from(
      _PubSubSource(None, subscription, 'a_label'))
  matchers = (
      DisplayDataItemMatcher('subscription', subscription),
      DisplayDataItemMatcher('id_label', 'a_label'),
      DisplayDataItemMatcher('with_attributes', False),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_unicode_type_display_data(self):
  # Every item produced from the unicode values below must have type STRING.
  class MyDoFn(beam.DoFn):

    def display_data(self):
      values = {}
      values['unicode_string'] = unicode('my string')
      values['unicode_literal_string'] = u'my literal string'
      return values

  do_fn = MyDoFn()
  for display_item in DisplayData.create_from(do_fn).items:
    self.assertEqual(display_item.type, 'STRING')
def test_display_data_topic(self):
  # A payload source with a topic and id label (no subscription) should
  # expose exactly those two items.
  topic = 'projects/fakeprj/topics/a_topic'
  payload_source = _PubSubPayloadSource(topic, None, 'a_label')
  display_data = DisplayData.create_from(payload_source)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('topic', topic),
          DisplayDataItemMatcher('id_label', 'a_label')))
def test_display_data_subscription(self):
  # With a subscription and no topic, the source should expose the
  # subscription and the id label.
  subscription = 'projects/fakeprj/subscriptions/a_subscription'
  display_data = DisplayData.create_from(
      _PubSubSource(None, subscription, 'a_label'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('subscription', subscription),
          DisplayDataItemMatcher('id_label', 'a_label')))
def test_read_display_data(self):
  # ReadFromParquet should expose 'auto' compression and the given pattern.
  pattern = 'some_parquet_source'
  parquet_read = ReadFromParquet(pattern, validate=False)
  display_data = DisplayData.create_from(parquet_read)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('compression', 'auto'),
          DisplayDataItemMatcher('file_pattern', pattern)))
def individual_test_per_key_dd(sampleFn, args, kwargs):
  """Checks the display data of the transform ``sampleFn(*args, **kwargs)``.

  The 'fn' item is always expected; 'args' and 'kwargs' items are expected
  only when the corresponding argument is non-empty.
  """
  transform = sampleFn(*args, **kwargs)
  display_data = DisplayData.create_from(transform)
  matchers = [DisplayDataItemMatcher('fn', transform._fn.__name__)]
  for item_key, item_value in (('args', args), ('kwargs', kwargs)):
    if item_value:
      matchers.append(DisplayDataItemMatcher(item_key, str(item_value)))
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data(self):
  # Every constructor argument of the sink should be mirrored as a display
  # item.
  topic = 'projects/fakeprj/topics/a_topic'
  pubsub_sink = _PubSubSink(
      topic,
      id_label='id',
      with_attributes=False,
      timestamp_attribute='time')
  display_data = DisplayData.create_from(pubsub_sink)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('topic', topic),
          DisplayDataItemMatcher('id_label', 'id'),
          DisplayDataItemMatcher('with_attributes', False),
          DisplayDataItemMatcher('timestamp_attribute', 'time')))
def test_write_display_data(self):
  # WriteToAvro with only a name and schema is expected to report the
  # 'deflate' codec and no file-level compression.
  avro_write = avroio.WriteToAvro('some_avro_sink', self.SCHEMA)
  display_data = DisplayData.create_from(avro_write)
  matchers = (
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d'),
      DisplayDataItemMatcher('codec', 'deflate'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data(self):
  # with_attributes is not passed here and is expected to be reported as
  # True.
  topic = 'projects/fakeprj/topics/a_topic'
  pubsub_write = WriteToPubSub(
      topic, id_label='id', timestamp_attribute='time')
  display_data = DisplayData.create_from(pubsub_write)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('topic', topic),
          DisplayDataItemMatcher('id_label', 'id'),
          DisplayDataItemMatcher('with_attributes', True),
          DisplayDataItemMatcher('timestamp_attribute', 'time')))
def test_drop_if_none(self):
  # Items whose value is None (drop_if_none) or equals the given default
  # (drop_if_default) must be omitted; the other two must survive.
  class MyDoFn(beam.DoFn):

    def display_data(self):
      return {
          'some_val': DisplayDataItem('something').drop_if_none(),
          'non_val': DisplayDataItem(None).drop_if_none(),
          'def_val': DisplayDataItem(True).drop_if_default(True),
          'nodef_val': DisplayDataItem(True).drop_if_default(False),
      }

  display_data = DisplayData.create_from(MyDoFn())
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('some_val', 'something'),
          DisplayDataItemMatcher('nodef_val', True)))
def test_sink_display_data(self):
  # An AvroSink with the 'null' codec and '.end' suffix should report schema,
  # sharded pattern, codec and compression.
  avro_sink = AvroSink(
      'some_avro_sink',
      self.SCHEMA,
      'null',
      '.end',
      0,
      None,
      'application/x-avro')
  display_data = DisplayData.create_from(avro_sink)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('schema', str(self.SCHEMA)),
          DisplayDataItemMatcher(
              'file_pattern',
              'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
          DisplayDataItemMatcher('codec', 'null'),
          DisplayDataItemMatcher('compression', 'uncompressed')))
def test_write_display_data(self):
  # WriteToParquet with only a name and schema is expected to report codec
  # 'none' and a 64 MiB row group buffer.
  parquet_write = WriteToParquet('some_parquet_sink', self.SCHEMA)
  display_data = DisplayData.create_from(parquet_write)
  matchers = (
      DisplayDataItemMatcher('codec', 'none'),
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher('row_group_buffer_size', str(64 * 1024 * 1024)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_file_sink_display_data(self):
  # The sink's file pattern should be the base path followed by the shard
  # template and the '.output' suffix.
  base_path = os.path.join(self._new_tempdir(), 'display')
  file_sink = MyFileBasedSink(
      base_path, file_name_suffix='.output', coder=coders.ToStringCoder())
  display_data = DisplayData.create_from(file_sink)
  expected_pattern = '{}{}'.format(
      base_path, '-%(shard_num)05d-of-%(num_shards)05d.output')
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('compression', 'auto'),
          DisplayDataItemMatcher('file_pattern', expected_pattern)))
def test_file_sink_display_data(self):
  # Expect 'auto' compression and a pattern made of the temp path, the shard
  # template and the '.output' suffix.
  sink_path = os.path.join(self._new_tempdir(), 'display')
  display_data = DisplayData.create_from(
      MyFileBasedSink(
          sink_path,
          file_name_suffix='.output',
          coder=coders.ToStringCoder()))
  matchers = (
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher(
          'file_pattern',
          '{}{}'.format(
              sink_path, '-%(shard_num)05d-of-%(num_shards)05d.output')),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_table_reference_display_data(self):
  # Each table spec form (bare, project-qualified, domain-scoped project)
  # should be echoed unchanged under 'table' with validation=False.
  table_specs = (
      'dataset.table',
      'project:dataset.table',
      'xyz.com:project:dataset.table',
  )
  for table_spec in table_specs:
    display_data = DisplayData.create_from(beam.io.BigQuerySource(table_spec))
    matchers = [
        DisplayDataItemMatcher('validation', False),
        DisplayDataItemMatcher('table', table_spec),
    ]
    hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_base_cases(self):
  """ Tests basic display data cases (key:value, key:dict)
  It does not test subcomponent inclusion
  """
  class MyDoFn(beam.DoFn):

    def __init__(self, my_display_data=None):
      self.my_display_data = my_display_data

    def process(self, context):
      yield context.element + 1

    def display_data(self):
      url_item = DisplayDataItem(
          'github.com', url='http://github.com', label='The URL')
      return {
          'static_integer': 120,
          'static_string': 'static me!',
          'complex_url': url_item,
          'python_class': HasDisplayData,
          'my_dd': self.my_display_data,
      }

  timestamp = datetime.now()
  do_fn = MyDoFn(my_display_data=timestamp)
  display_data = DisplayData.create_from(do_fn)

  # Every item is expected under the DoFn's "<module>.<class name>"
  # namespace.
  namespace = '{}.{}'.format(do_fn.__module__, do_fn.__class__.__name__)
  matchers = (
      DisplayDataItemMatcher(
          key='complex_url',
          value='github.com',
          namespace=namespace,
          label='The URL'),
      DisplayDataItemMatcher(
          key='my_dd', value=timestamp, namespace=namespace),
      DisplayDataItemMatcher(
          key='python_class',
          value=HasDisplayData,
          namespace=namespace,
          shortValue='HasDisplayData'),
      DisplayDataItemMatcher(
          key='static_integer', value=120, namespace=namespace),
      DisplayDataItemMatcher(
          key='static_string', value='static me!', namespace=namespace),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_subcomponent(self):
  # Display data of a ParDo should include both its own 'fn' item and the
  # wrapped DoFn's items, each under its own namespace.
  class SpecialDoFn(beam.DoFn):

    def display_data(self):
      return {'dofn_value': 42}

  def namespace_of(component):
    return '{}.{}'.format(
        component.__module__, component.__class__.__name__)

  special_fn = SpecialDoFn()
  par_do = beam.ParDo(special_fn)
  display_data = DisplayData.create_from(par_do)
  matchers = (
      DisplayDataItemMatcher('dofn_value', 42, namespace_of(special_fn)),
      DisplayDataItemMatcher('fn', SpecialDoFn, namespace_of(par_do)),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_file_sink_display_data(self):
  # Only the name of the temporary file is used, as the sink's base path.
  base_path = tempfile.NamedTemporaryFile().name
  file_sink = MyFileSink(
      base_path, file_name_suffix='.foo', coder=coders.ToStringCoder())
  display_data = DisplayData.create_from(file_sink)
  expected_pattern = '{}{}'.format(
      base_path, '-%(shard_num)05d-of-%(num_shards)05d.foo')
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('compression', 'auto'),
          DisplayDataItemMatcher('file_pattern', expected_pattern)))
def test_source_display_data(self):
  # No extra avro parameters for AvroSource: only compression and file
  # pattern are expected.
  pattern = 'some_avro_source'
  avro_source = _create_avro_source(
      pattern, validate=False, use_fastavro=self.use_fastavro)
  display_data = DisplayData.create_from(avro_source)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('compression', 'auto'),
          DisplayDataItemMatcher('file_pattern', pattern)))
def test_sink_display_data(self):
  # The parquet sink should report schema, sharded pattern, codec, row group
  # buffer size and compression.
  parquet_sink = _create_parquet_sink(
      'some_parquet_sink',
      self.SCHEMA,
      'none',
      1024 * 1024,
      1000,
      False,
      '.end',
      0,
      None,
      'application/x-parquet')
  display_data = DisplayData.create_from(parquet_sink)
  matchers = (
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
      DisplayDataItemMatcher('codec', 'none'),
      DisplayDataItemMatcher('row_group_buffer_size', str(1024 * 1024)),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  )
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def individual_test_per_key_dd(sampleFn, args, kwargs):
  """Checks display data for per-key and global combines built from sampleFn.

  ``sampleFn(*args, **kwargs)`` is wrapped in both ``beam.CombinePerKey`` and
  ``beam.CombineGlobally``. For each transform the 'fn' and 'combine_fn'
  items are always expected; 'args' and 'kwargs' items are expected only
  when the corresponding argument is non-empty.

  Args:
    sampleFn: callable producing the combine fn; its ``fn.__name__`` is the
      expected 'fn' value.
    args: positional arguments forwarded to ``sampleFn``.
    kwargs: keyword arguments forwarded to ``sampleFn``.
  """
  trs = [beam.CombinePerKey(sampleFn(*args, **kwargs)),
         beam.CombineGlobally(sampleFn(*args, **kwargs))]
  for transform in trs:
    dd = DisplayData.create_from(transform)
    expected_items = [
        DisplayDataItemMatcher('fn', sampleFn.fn.__name__),
        DisplayDataItemMatcher('combine_fn', transform.fn.__class__)]
    # Empty containers are falsy, so no explicit length check is needed.
    if args:
      expected_items.append(
          DisplayDataItemMatcher('args', str(args)))
    if kwargs:
      expected_items.append(
          DisplayDataItemMatcher('kwargs', str(kwargs)))
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
def __init__(self, packages, options, environment_version, pipeline_url):
  """Populates a ``dataflow.Environment`` proto from the pipeline options.

  Args:
    packages: names of staged packages; each becomes a ``dataflow.Package``
      located under the staging location.
    options: options object providing ``view_as`` and ``get_all_options``
      (presumably a PipelineOptions instance -- confirm against callers).
    environment_version: value recorded as the 'major' version property.
    pipeline_url: stored on the instance and added to the SDK pipeline
      options under the "pipelineUrl" key.
  """
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.debug_options = options.view_as(DebugOptions)
  self.pipeline_url = pipeline_url
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
  self.proto.dataset = '{}/cloud_dataflow'.format(
      GoogleCloudOptions.BIGQUERY_API_SERVICE)
  # The 'gs:/' prefix of the temp location is swapped for the storage API
  # service prefix.
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  # The environment counts as local when the endpoint mentions localhost.
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

  if self.google_cloud_options.service_account_email:
    self.proto.serviceAccountEmail = (
        self.google_cloud_options.service_account_email)

  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name', value=to_json_value(self._get_python_sdk_name())),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(beam_version.__version__))
  ])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  _verify_interpreter_version_is_supported(options)
  # Streaming jobs are always FNAPI; batch jobs are FNAPI only when
  # _use_fnapi(options) says so.
  if self.standard_options.streaming:
    job_type = 'FNAPI_STREAMING'
  else:
    if _use_fnapi(options):
      job_type = 'FNAPI_BATCH'
    else:
      job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))
  ])
  # TODO: Use enumerated type instead of strings for job types.
  if job_type.startswith('FNAPI_'):
    runner_harness_override = (get_runner_harness_container_image())
    # Make sure experiments is a list before appending to it below.
    self.debug_options.experiments = self.debug_options.experiments or []
    if runner_harness_override:
      self.debug_options.experiments.append(
          'runner_harness_container_image=' + runner_harness_override)
    # Add use_multiple_sdk_containers flag if its not already present. Do not
    # add the flag if 'no_use_multiple_sdk_containers' is present.
    # TODO: Cleanup use_multiple_sdk_containers once we deprecate Python SDK
    # till version 2.4.
    debug_options_experiments = self.debug_options.experiments
    if ('use_multiple_sdk_containers' not in debug_options_experiments and
        'no_use_multiple_sdk_containers' not in debug_options_experiments):
      self.debug_options.experiments.append(
          'use_multiple_sdk_containers')
  # FlexRS
  if self.google_cloud_options.flexrs_goal == 'COST_OPTIMIZED':
    self.proto.flexResourceSchedulingGoal = (
        dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
        FLEXRS_COST_OPTIMIZED)
  elif self.google_cloud_options.flexrs_goal == 'SPEED_OPTIMIZED':
    self.proto.flexResourceSchedulingGoal = (
        dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
        FLEXRS_SPEED_OPTIMIZED)
  # Experiments
  # Copied after the FNAPI block above so that experiments appended there
  # are included.
  if self.debug_options.experiments:
    for experiment in self.debug_options.experiments:
      self.proto.experiments.append(experiment)
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
              servicePath=self.google_cloud_options.dataflow_endpoint)))

  pool.autoscalingSettings = dataflow.AutoscalingSettings()
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.max_num_workers:
    pool.autoscalingSettings.maxNumWorkers = (
        self.worker_options.max_num_workers)
  if self.worker_options.autoscaling_algorithm:
    values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
    # An algorithm name not listed here maps to None via dict.get.
    pool.autoscalingSettings.algorithm = {
        'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
        'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
    }.get(self.worker_options.autoscaling_algorithm)
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.subnetwork:
    pool.subnetwork = self.worker_options.subnetwork
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  else:
    pool.workerHarnessContainerImage = (
        get_default_container_image_for_current_sdk(job_type))
  # use_public_ips is tri-state: None leaves ipConfiguration unset.
  if self.worker_options.use_public_ips is not None:
    if self.worker_options.use_public_ips:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.
          WORKER_IP_PUBLIC)
    else:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.
          WORKER_IP_PRIVATE)

  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())

    # Options whose value is None are left out of the serialized dict.
    options_dict = {
        k: v for k, v in sdk_pipeline_options.items() if v is not None
    }
    options_dict["pipelineUrl"] = pipeline_url
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.
        AdditionalProperty(key='options', value=to_json_value(options_dict)))

    # The options' display data is attached as a second property.
    dd = DisplayData.create_from_options(options)
    items = [item.get_dict() for item in dd.items]
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.
        AdditionalProperty(key='display_data', value=to_json_value(items)))
def __init__(self, packages, options, environment_version):
  """Populates a ``dataflow.Environment`` proto from the pipeline options.

  Args:
    packages: names of staged packages; each becomes a ``dataflow.Package``
      located under the staging location.
    options: options object providing ``view_as`` and ``get_all_options``
      (presumably a PipelineOptions instance -- confirm against callers).
    environment_version: value recorded as the 'major' version property.
  """
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.debug_options = options.view_as(DebugOptions)
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
  self.proto.dataset = '{}/cloud_dataflow'.format(
      GoogleCloudOptions.BIGQUERY_API_SERVICE)
  # The 'gs:/' prefix of the temp location is swapped for the storage API
  # service prefix.
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  # The environment counts as local when the endpoint mentions localhost.
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

  if self.google_cloud_options.service_account_email:
    self.proto.serviceAccountEmail = (
        self.google_cloud_options.service_account_email)

  sdk_name, version_string = get_sdk_name_and_version()

  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name', value=to_json_value(sdk_name)),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(version_string))
  ])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  if self.standard_options.streaming:
    job_type = 'PYTHON_STREAMING'
  else:
    job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))
  ])
  # Experiments
  if self.debug_options.experiments:
    for experiment in self.debug_options.experiments:
      self.proto.experiments.append(experiment)
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
              servicePath=self.google_cloud_options.dataflow_endpoint)))

  pool.autoscalingSettings = dataflow.AutoscalingSettings()
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.max_num_workers:
    pool.autoscalingSettings.maxNumWorkers = (
        self.worker_options.max_num_workers)
  if self.worker_options.autoscaling_algorithm:
    values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
    # An algorithm name not listed here maps to None via dict.get.
    pool.autoscalingSettings.algorithm = {
        'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
        'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
    }.get(self.worker_options.autoscaling_algorithm)
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  else:
    # Default to using the worker harness container image for the current SDK
    # version.
    pool.workerHarnessContainerImage = (
        'dataflow.gcr.io/v1beta3/python:%s' % get_required_container_version())
  # use_public_ips is tri-state: None leaves ipConfiguration unset.
  if self.worker_options.use_public_ips is not None:
    if self.worker_options.use_public_ips:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.
          WORKER_IP_PUBLIC)
    else:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.
          WORKER_IP_PRIVATE)

  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())

    # Options whose value is None are left out of the serialized dict.
    # items() is used rather than iteritems() because the latter exists only
    # on Python 2 dicts.
    options_dict = {
        k: v for k, v in sdk_pipeline_options.items() if v is not None
    }
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.
        AdditionalProperty(key='options', value=to_json_value(options_dict)))

    # The options' display data is attached as a second property.
    dd = DisplayData.create_from_options(options)
    items = [item.get_dict() for item in dd.items]
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.
        AdditionalProperty(key='display_data', value=to_json_value(items)))
def test_perkey_display_data(self):
  # The per-key quantiles transform should expose exactly the items that
  # _display_data_matcher builds for it.
  per_key = beam.ApproximateQuantiles.PerKey(3, key=len, reverse=True)
  display_data = DisplayData.create_from(per_key)
  matchers = self._display_data_matcher(per_key)
  hc.assert_that(display_data.items, hc.contains_inanyorder(*matchers))
def test_display_data(self):
  # Each shared test case supplies its flags and the display data matchers
  # expected for them.
  for test_case in PipelineOptionsTest.TEST_CASES:
    pipeline_options = PipelineOptions(flags=test_case['flags'])
    display_data = DisplayData.create_from(pipeline_options)
    expected = hc.contains_inanyorder(*test_case['display_data'])
    hc.assert_that(display_data.items, expected)
def test_display_data(self):
  # The sink's only display item should be its topic.
  pubsub_sink = PubSubSink('a_topic')
  topic_matcher = DisplayDataItemMatcher('topic', 'a_topic')
  hc.assert_that(
      DisplayData.create_from(pubsub_sink).items,
      hc.contains_inanyorder(topic_matcher))
def test_display_data_no_subscription(self):
  # Without a subscription, the source's only display item should be its
  # topic.
  pubsub_source = PubSubSource('a_topic')
  topic_matcher = DisplayDataItemMatcher('topic', 'a_topic')
  hc.assert_that(
      DisplayData.create_from(pubsub_source).items,
      hc.contains_inanyorder(topic_matcher))