def get_in_memory_source_to_text_sink_message():
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  # Note that the in-memory source spec requires a windowed coder.
  add_source_windowed_codec_spec(rsi)

  wi = dataflow.WriteInstruction()
  wi.input = dataflow.InstructionInput()
  wi.sink = dataflow.Sink()
  wi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in TEXT_SINK_SPEC.iteritems():
    wi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wi)

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=rsi))
  mt.instructions.append(dataflow.ParallelInstruction(write=wi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def get_in_memory_source_to_text_sink_message():
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  # Note that the in-memory source spec requires a windowed coder.
  add_source_windowed_codec_spec(rsi)

  wi = dataflow.WriteInstruction()
  wi.input = dataflow.InstructionInput()
  wi.sink = dataflow.Sink()
  wi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in TEXT_SINK_SPEC.iteritems():
    wi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wi)

  mt = dataflow.MapTask()
  mt.instructions.append(dataflow.ParallelInstruction(read=rsi))
  mt.instructions.append(dataflow.ParallelInstruction(write=wi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def get_shuffle_source_to_text_sink_message(shuffle_source_spec):
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in shuffle_source_spec.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(rsi)

  wi = dataflow.WriteInstruction()
  wi.input = dataflow.InstructionInput()
  wi.sink = dataflow.Sink()
  wi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in TEXT_SINK_SPEC.iteritems():
    wi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wi)

  mt = dataflow.MapTask()
  mt.instructions.append(dataflow.ParallelInstruction(read=rsi))
  mt.instructions.append(dataflow.ParallelInstruction(write=wi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def set_mean(accumulator, metric_update):
  if accumulator.count:
    metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
    metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # What it means is we have nothing to report yet, so don't.
    metric_update.kind = None
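# A minimal usage sketch for set_mean above, assuming a namedtuple stands in
# for the real accumulator type (only .sum and .count are read here).
import collections

MeanAccumulator = collections.namedtuple('MeanAccumulator', ['sum', 'count'])

update = dataflow.MetricUpdate()
update.kind = 'Mean'
set_mean(MeanAccumulator(sum=42, count=7), update)
# update.meanSum and update.meanCount now hold typed JSON values; with
# count == 0 the kind would instead be cleared so nothing is reported.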
def append_metric(status_object, metric_name, value1, value2=None, step=None,
                  output_user_name=None, tentative=False, worker_id=None,
                  cumulative=True):
  """Creates and adds a MetricUpdate field to the passed-in protobuf.

  Args:
    status_object: a work_item_status to which to add this metric
    metric_name: a string naming this metric
    value1: scalar for a Sum or mean_sum for a Mean
    value2: mean_count for a Mean aggregation (do not provide for a Sum).
    step: the name of the associated step
    output_user_name: the user-visible name to use
    tentative: whether this should be labeled as a tentative metric
    worker_id: the id of this worker. Specifying a worker_id also causes this
      to be encoded as a metric, not a counter.
    cumulative: Whether this metric is cumulative, default True.
      Set to False for a delta value.
  """
  # Does this look like a counter or like a metric?
  is_counter = not worker_id

  metric_update = dataflow.MetricUpdate()
  metric_update.name = dataflow.MetricStructuredName()
  metric_update.name.name = metric_name

  # Handle attributes stored in the name context.
  if step or output_user_name or tentative or worker_id:
    metric_update.name.context = dataflow.MetricStructuredName.ContextValue()

    def append_to_context(key, value):
      metric_update.name.context.additionalProperties.append(
          dataflow.MetricStructuredName.ContextValue.AdditionalProperty(
              key=key, value=value))

    if step:
      append_to_context('step', step)
    if output_user_name:
      append_to_context('output_user_name', output_user_name)
    if tentative:
      append_to_context('tentative', 'true')
    if worker_id:
      append_to_context('workerId', worker_id)

  if cumulative and is_counter:
    metric_update.cumulative = cumulative

  if value2 is None:
    if is_counter:
      # Counters are distinguished by having a kind; metrics do not.
      metric_update.kind = 'Sum'
    metric_update.scalar = to_json_value(value1, with_type=True)
  elif value2 > 0:
    metric_update.kind = 'Mean'
    metric_update.meanSum = to_json_value(value1, with_type=True)
    metric_update.meanCount = to_json_value(value2, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # What it means is we have nothing to report yet, so don't.
    pass

  logging.debug('Appending metric_update: %s', metric_update)
  status_object.metricUpdates.append(metric_update)
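# Hedged usage sketch for the Sum/Mean variant of append_metric above; the
# WorkItemStatus target and the literal values are illustrative assumptions,
# not taken from the surrounding code.
status = dataflow.WorkItemStatus()
append_metric(status, 'bytes_read', 1024, step='read')  # kind='Sum' counter
append_metric(status, 'read_latency', 350, 7, worker_id='w-1')  # Mean metric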
def get_test_rows(self):
  now = time.time()
  expected_rows = [
      {'i': 1, 's': 'abc', 'f': 2.3, 'b': True, 't': now},
      {'i': 10, 's': 'xyz', 'f': -3.14, 'b': False}]
  schema = bigquery.TableSchema(
      fields=[
          bigquery.TableFieldSchema(
              name='b', type='BOOLEAN', mode='REQUIRED'),
          bigquery.TableFieldSchema(
              name='f', type='FLOAT', mode='REQUIRED'),
          bigquery.TableFieldSchema(
              name='i', type='INTEGER', mode='REQUIRED'),
          bigquery.TableFieldSchema(
              name='s', type='STRING', mode='REQUIRED'),
          bigquery.TableFieldSchema(
              name='t', type='TIMESTAMP', mode='NULLABLE')])
  table_rows = [
      bigquery.TableRow(f=[
          bigquery.TableCell(v=to_json_value('true')),
          bigquery.TableCell(v=to_json_value(str(2.3))),
          bigquery.TableCell(v=to_json_value(str(1))),
          bigquery.TableCell(v=to_json_value('abc')),
          # For timestamps cannot use str() because it will truncate the
          # number representing the timestamp.
          bigquery.TableCell(v=to_json_value('%f' % now))]),
      bigquery.TableRow(f=[
          bigquery.TableCell(v=to_json_value('false')),
          bigquery.TableCell(v=to_json_value(str(-3.14))),
          bigquery.TableCell(v=to_json_value(str(10))),
          bigquery.TableCell(v=to_json_value('xyz')),
          bigquery.TableCell(v=None)])]
  return table_rows, schema, expected_rows
def get_in_memory_source_to_flatten_message():
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  # Note that the in-memory source spec requires a windowed coder.
  add_source_windowed_codec_spec(rsi)

  fi = dataflow.FlattenInstruction()
  fi.inputs = [dataflow.InstructionInput()]

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=rsi))
  mt.instructions.append(get_instruction_with_outputs(flatten=fi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def insert_rows(self, project_id, dataset_id, table_id, rows):
  """Inserts rows into the specified table.

  Args:
    project_id: The project id owning the table.
    dataset_id: The dataset id owning the table.
    table_id: The table id.
    rows: A list of plain Python dictionaries. Each dictionary is a row and
      each key in it is the name of a field.

  Returns:
    A tuple (bool, errors). If the first element is False then the second
    element will be a bigquery.InsertErrorsValueListEntry instance containing
    specific errors.
  """
  # Prepare rows for insertion. Of special note is the row ID that we add to
  # each row in order to help BigQuery avoid inserting a row multiple times:
  # when unique insert IDs are provided, BigQuery makes a best effort to
  # de-duplicate rows, which guards against duplicates introduced by retries
  # on failure.
  # TODO(silviuc): Must add support for writing TableRow's instead of dicts.
  final_rows = []
  for row in rows:
    json_object = bigquery.JsonObject()
    for k, v in row.iteritems():
      json_object.additionalProperties.append(
          bigquery.JsonObject.AdditionalProperty(
              key=k, value=to_json_value(v)))
    final_rows.append(
        bigquery.TableDataInsertAllRequest.RowsValueListEntry(
            insertId=str(self.unique_row_id),
            json=json_object))
  result, errors = self._insert_all_rows(
      project_id, dataset_id, table_id, final_rows)
  return result, errors
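# Illustrative call into insert_rows above; `wrapper` is assumed to be an
# instance of the BigQuery client-wrapper class that defines the method, and
# the project/dataset/table names are arbitrary examples.
ok, errors = wrapper.insert_rows(
    'my-project', 'my_dataset', 'my_table',
    [{'s': 'abc', 'f': 2.5}, {'s': 'xyz', 'f': -3.0}])
if not ok:
  logging.error('BigQuery insert failed: %s', errors)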
def test_rows_are_written(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.return_value = table
  write_disposition = df.io.BigQueryDisposition.WRITE_APPEND

  insert_response = mock.Mock()
  insert_response.insertErrors = []
  client.tabledata.InsertAll.return_value = insert_response

  with df.io.BigQuerySink(
      'project:dataset.table',
      write_disposition=write_disposition).writer(client) as writer:
    writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

  sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
  expected_rows = []
  json_object = bigquery.JsonObject()
  for k, v in sample_row.iteritems():
    json_object.additionalProperties.append(
        bigquery.JsonObject.AdditionalProperty(
            key=k, value=to_json_value(v)))
  expected_rows.append(
      bigquery.TableDataInsertAllRequest.RowsValueListEntry(
          insertId='_1',  # First row ID generated with prefix ''.
          json=json_object))
  client.tabledata.InsertAll.assert_called_with(
      bigquery.BigqueryTabledataInsertAllRequest(
          projectId='project', datasetId='dataset', tableId='table',
          tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
              rows=expected_rows)))
def get_text_source_to_shuffle_sink_message():
  ri = dataflow.ReadInstruction()
  ri.source = dataflow.Source()
  ri.source.spec = dataflow.Source.SpecValue()
  for k, v in TEXT_SOURCE_SPEC.iteritems():
    ri.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(ri)

  di = dataflow.ParDoInstruction()
  di.input = dataflow.InstructionInput()
  di.input.producerInstructionIndex = 1
  di.multiOutputInfos = [dataflow.MultiOutputInfo(tag='out')]
  di.userFn = dataflow.ParDoInstruction.UserFnValue()
  for k, v in PARDO_DOFN_SPEC.iteritems():
    di.userFn.additionalProperties.append(
        dataflow.ParDoInstruction.UserFnValue.AdditionalProperty(
            key=k, value=to_json_value(v)))

  wsi = dataflow.WriteInstruction()
  wsi.input = dataflow.InstructionInput()
  wsi.input.producerInstructionIndex = 1
  di.input.outputNum = 0
  wsi.sink = dataflow.Sink()
  wsi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in SHUFFLE_SINK_SPEC.iteritems():
    wsi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wsi)

  mt = dataflow.MapTask()
  mt.instructions.append(dataflow.ParallelInstruction(read=ri))
  mt.instructions.append(dataflow.ParallelInstruction(parDo=di))
  mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def get_text_source_to_shuffle_sink_message():
  ri = dataflow.ReadInstruction()
  ri.source = dataflow.Source()
  ri.source.spec = dataflow.Source.SpecValue()
  for k, v in TEXT_SOURCE_SPEC.iteritems():
    ri.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(ri)

  di = dataflow.ParDoInstruction()
  di.input = dataflow.InstructionInput()
  di.input.producerInstructionIndex = 1
  di.multiOutputInfos = [dataflow.MultiOutputInfo(tag='out')]
  di.userFn = dataflow.ParDoInstruction.UserFnValue()
  for k, v in PARDO_DOFN_SPEC.iteritems():
    di.userFn.additionalProperties.append(
        dataflow.ParDoInstruction.UserFnValue.AdditionalProperty(
            key=k, value=to_json_value(v)))

  wsi = dataflow.WriteInstruction()
  wsi.input = dataflow.InstructionInput()
  wsi.input.producerInstructionIndex = 1
  di.input.outputNum = 0
  wsi.sink = dataflow.Sink()
  wsi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in SHUFFLE_SINK_SPEC.iteritems():
    wsi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wsi)

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=ri))
  mt.instructions.append(get_instruction_with_outputs(parDo=di))
  mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def test_row_and_no_schema(self):
  coder = TableRowJsonCoder()
  test_row = bigquery.TableRow(
      f=[bigquery.TableCell(v=to_json_value(e))
         for e in ['abc', 123, 123.456, True]])
  with self.assertRaises(AttributeError) as ctx:
    coder.encode(test_row)
  self.assertTrue(
      ctx.exception.message.startswith('The TableRowJsonCoder requires'))
def get_instruction_with_outputs(num_outputs=1, **kwargs):
  pi = dataflow.ParallelInstruction(**kwargs)
  for _ in xrange(num_outputs):
    output = dataflow.InstructionOutput()
    output.codec = dataflow.InstructionOutput.CodecValue()
    for k, v in CODER_SPEC.iteritems():
      output.codec.additionalProperties.append(
          dataflow.InstructionOutput.CodecValue.AdditionalProperty(
              key=k, value=to_json_value(v)))
    pi.outputs.append(output)
  return pi
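# Sketch of how the helper above is used by the fixtures in this file: wrap
# an instruction proto and attach one CODER_SPEC-coded output per consumer.
ri = dataflow.ReadInstruction()
pi = get_instruction_with_outputs(num_outputs=2, read=ri)
assert len(pi.outputs) == 2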
def append_metric(status_object, metric_name, kind, value, setter=None,
                  step=None, output_user_name=None, tentative=False,
                  worker_id=None, cumulative=True):
  """Creates and adds a MetricUpdate field to the passed-in protobuf.

  Args:
    status_object: a work_item_status to which to add this metric
    metric_name: a string naming this metric
    kind: dataflow counter kind (e.g. 'Sum')
    value: accumulator value to encode
    setter: if not None, a lambda to use to update metric_update with value
    step: the name of the associated step
    output_user_name: the user-visible name to use
    tentative: whether this should be labeled as a tentative metric
    worker_id: the id of this worker. Specifying a worker_id also causes this
      to be encoded as a metric, not a counter.
    cumulative: Whether this metric is cumulative, default True.
      Set to False for a delta value.
  """
  # Does this look like a counter or like a metric?
  is_counter = not worker_id

  metric_update = dataflow.MetricUpdate()
  metric_update.name = dataflow.MetricStructuredName()
  metric_update.name.name = metric_name

  # Handle attributes stored in the name context.
  if step or output_user_name or tentative or worker_id:
    metric_update.name.context = dataflow.MetricStructuredName.ContextValue()

    def append_to_context(key, value):
      metric_update.name.context.additionalProperties.append(
          dataflow.MetricStructuredName.ContextValue.AdditionalProperty(
              key=key, value=value))

    if step:
      append_to_context('step', step)
    if output_user_name:
      append_to_context('output_user_name', output_user_name)
    if tentative:
      append_to_context('tentative', 'true')
    if worker_id:
      append_to_context('workerId', worker_id)

  if cumulative and is_counter:
    metric_update.cumulative = cumulative

  if is_counter:
    # Counters are distinguished by having a kind; metrics do not.
    metric_update.kind = kind

  if setter:
    setter(value, metric_update)
  else:
    metric_update.scalar = to_json_value(value, with_type=True)

  logging.debug('Appending metric_update: %s', metric_update)
  status_object.metricUpdates.append(metric_update)
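# Hedged sketch wiring the setter-based append_metric above to the
# set_scalar/set_mean helpers in this section; the Acc stand-in (and its
# values) is an assumption, not the SDK's real accumulator class.
class Acc(object):
  def __init__(self, value=0, sum=0, count=0):
    self.value, self.sum, self.count = value, sum, count

status = dataflow.WorkItemStatus()
append_metric(status, 'elements', 'Sum', Acc(value=10),
              setter=set_scalar, step='count')
append_metric(status, 'mean_size', 'Mean', Acc(sum=42, count=7),
              setter=set_mean, step='count')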
def splits_to_split_response(bundles):
  """Generates a response to a custom source split request.

  Args:
    bundles: a set of bundles generated by a BoundedSource.split() invocation.
  Returns:
    a SourceOperationResponse object.
  """
  derived_sources = []
  for bundle in bundles:
    derived_source = dataflow.DerivedSource()
    derived_source.derivationMode = (
        dataflow.DerivedSource.DerivationModeValueValuesEnum
        .SOURCE_DERIVATION_MODE_INDEPENDENT)
    derived_source.source = dataflow.Source()
    derived_source.source.doesNotNeedSplitting = True
    derived_source.source.spec = dataflow.Source.SpecValue()
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=names.SERIALIZED_SOURCE_KEY,
            value=to_json_value(
                pickler.dumps(
                    (bundle.source, bundle.start_position,
                     bundle.stop_position)),
                with_type=True)))
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key='@type', value=to_json_value(names.SOURCE_TYPE)))
    derived_sources.append(derived_source)

  split_response = dataflow.SourceSplitResponse()
  split_response.bundles = derived_sources
  split_response.outcome = (
      dataflow.SourceSplitResponse.OutcomeValueValuesEnum
      .SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)

  response = dataflow.SourceOperationResponse()
  response.split = split_response
  return response
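# Minimal sketch of the expected call path for splits_to_split_response;
# `my_source` is a hypothetical BoundedSource instance, and only the shape
# of the call matters here.
bundles = my_source.split(desired_bundle_size=64 << 20)
response = splits_to_split_response(bundles)
assert response.split.bundles  # one DerivedSource per bundle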
def build_split_proto(self, bounded_source, desired_bundle_size):
  split_proto = dataflow.SourceSplitRequest()
  split_proto.options = dataflow.SourceSplitOptions()
  split_proto.options.desiredBundleSizeBytes = desired_bundle_size

  source = dataflow.Source()
  spec = dataflow.Source.SpecValue()
  if bounded_source:
    spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=names.SERIALIZED_SOURCE_KEY,
            value=to_json_value({'value': pickler.dumps(bounded_source),
                                 '@type': 'http://schema.org/Text'})))
  spec.additionalProperties.append(
      dataflow.Source.SpecValue.AdditionalProperty(
          key='@type', value=to_json_value('CustomSourcesType')))
  source.spec = spec
  split_proto.source = source
  return split_proto
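# Hypothetical invocation of the test helper above from within the same test
# case; `my_source` again stands in for a picklable BoundedSource instance.
split_proto = self.build_split_proto(my_source, 16 << 20)
self.assertEqual(16 << 20, split_proto.options.desiredBundleSizeBytes)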
def test_row_as_table_row(self):
  schema_definition = [
      ('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'), ('b', 'BOOLEAN')]
  schema = bigquery.TableSchema(
      fields=[bigquery.TableFieldSchema(name=k, type=v)
              for k, v in schema_definition])
  coder = TableRowJsonCoder(table_schema=schema)
  test_row = bigquery.TableRow(
      f=[bigquery.TableCell(v=to_json_value(e))
         for e in ['abc', 123, 123.456, True]])

  self.assertEqual('{"s": "abc", "i": 123, "f": 123.456, "b": true}',
                   coder.encode(test_row))
  self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
  # A coder without schema can still decode.
  self.assertEqual(
      test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
def test_string_to(self):
  self.assertEqual(JsonValue(string_value='abc'), to_json_value('abc'))
def decode(self, encoded_table_row):
  od = json.loads(
      encoded_table_row, object_pairs_hook=collections.OrderedDict)
  return bigquery.TableRow(
      f=[bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()])
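# Round-trip sketch for the decoder above, assuming `coder` is a
# TableRowJsonCoder instance; object_pairs_hook keeps cells in field order.
row = coder.decode('{"s": "abc", "i": 123}')
assert [cell.v for cell in row.f] == [to_json_value('abc'), to_json_value(123)]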
def add_sink_codec_spec(target):
  target.sink.codec = dataflow.Sink.CodecValue()
  for k, v in CODER_SPEC.iteritems():
    target.sink.codec.additionalProperties.append(
        dataflow.Sink.CodecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
def __init__(self, packages, options, environment_version):
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = COMPUTE_API_SERVICE
  self.proto.dataset = '%s/cloud_dataflow' % BIGQUERY_API_SERVICE
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint
  version_string = version.__version__
  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name',
          value=to_json_value('Google Cloud Dataflow SDK for Python')),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(version_string))])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  if self.standard_options.is_streaming:
    job_type = 'PYTHON_STREAMING'
  else:
    job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))])
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl='https://dataflow.googleapis.com',
              servicePath=self.google_cloud_options.dataflow_endpoint)))
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.disk_source_image:
    pool.diskSourceImage = self.worker_options.disk_source_image
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.teardown_policy:
    if self.worker_options.teardown_policy == 'TEARDOWN_NEVER':
      pool.teardownPolicy = (
          dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_NEVER)
    elif self.worker_options.teardown_policy == 'TEARDOWN_ALWAYS':
      pool.teardownPolicy = (
          dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_ALWAYS)
    elif self.worker_options.teardown_policy == 'TEARDOWN_ON_SUCCESS':
      pool.teardownPolicy = (
          dataflow.WorkerPool
          .TeardownPolicyValueValuesEnum.TEARDOWN_ON_SUCCESS)
  if self.standard_options.is_streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)
def test_int_to(self):
  self.assertEqual(JsonValue(integer_value=14), to_json_value(14))
def test_string_from(self):
  self.assertEqual('WXYZ', from_json_value(to_json_value('WXYZ')))
def test_float_from(self):
  self.assertEqual(4.5, from_json_value(to_json_value(4.5)))
def __init__(self, packages, options, environment_version):
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = COMPUTE_API_SERVICE
  self.proto.dataset = '%s/cloud_dataflow' % BIGQUERY_API_SERVICE
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint
  version_string = version.__version__
  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name',
          value=to_json_value('Google Cloud Dataflow SDK for Python')),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(version_string))])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  if self.standard_options.streaming:
    job_type = 'PYTHON_STREAMING'
  else:
    job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))])
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl='https://dataflow.googleapis.com',
              servicePath=self.google_cloud_options.dataflow_endpoint)))
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.disk_source_image:
    pool.diskSourceImage = self.worker_options.disk_source_image
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  if self.worker_options.teardown_policy:
    if self.worker_options.teardown_policy == 'TEARDOWN_NEVER':
      pool.teardownPolicy = (
          dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_NEVER)
    elif self.worker_options.teardown_policy == 'TEARDOWN_ALWAYS':
      pool.teardownPolicy = (
          dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_ALWAYS)
    elif self.worker_options.teardown_policy == 'TEARDOWN_ON_SUCCESS':
      pool.teardownPolicy = (
          dataflow.WorkerPool
          .TeardownPolicyValueValuesEnum.TEARDOWN_ON_SUCCESS)
  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())
    for k, v in sdk_pipeline_options.iteritems():
      if v is not None:
        self.proto.sdkPipelineOptions.additionalProperties.append(
            dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
def add_property(self, name, value, with_type=False):
  self.proto.properties.additionalProperties.append(
      dataflow.Step.PropertiesValue.AdditionalProperty(
          key=name, value=to_json_value(value, with_type=with_type)))
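# Illustrative use of add_property above; `step` is assumed to be an instance
# of the class wrapping a dataflow.Step proto, and the property names and
# values are arbitrary examples.
step.add_property('user_name', 'MyTransform/Read')
step.add_property('serialized_fn', 'ref_123', with_type=True)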
def set_scalar(accumulator, metric_update):
  metric_update.scalar = to_json_value(accumulator.value, with_type=True)
def test_true_to(self):
  self.assertEqual(JsonValue(boolean_value=True), to_json_value(True))
def test_false_from(self):
  self.assertEqual(False, from_json_value(to_json_value(False)))
def test_false_to(self):
  self.assertEqual(JsonValue(boolean_value=False), to_json_value(False))
def __init__(self, packages, options, environment_version):
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = COMPUTE_API_SERVICE
  self.proto.dataset = '%s/cloud_dataflow' % BIGQUERY_API_SERVICE
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint
  version_string = version.__version__
  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name',
          value=to_json_value('Google Cloud Dataflow SDK for Python')),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(version_string))])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  if self.standard_options.streaming:
    job_type = 'PYTHON_STREAMING'
  else:
    job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))])
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl='https://dataflow.googleapis.com',
              servicePath=self.google_cloud_options.dataflow_endpoint)))
  pool.autoscalingSettings = dataflow.AutoscalingSettings()
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.max_num_workers:
    pool.autoscalingSettings.maxNumWorkers = (
        self.worker_options.max_num_workers)
  if self.worker_options.autoscaling_algorithm:
    values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
    pool.autoscalingSettings.algorithm = {
        'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
        'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
    }.get(self.worker_options.autoscaling_algorithm)
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.disk_source_image:
    pool.diskSourceImage = self.worker_options.disk_source_image
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  else:
    # Default to using the worker harness container image for the current
    # SDK version.
    pool.workerHarnessContainerImage = (
        'dataflow.gcr.io/v1beta3/python:%s' % version.__version__)
  if self.worker_options.teardown_policy:
    if self.worker_options.teardown_policy == 'TEARDOWN_NEVER':
      pool.teardownPolicy = (
          dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_NEVER)
    elif self.worker_options.teardown_policy == 'TEARDOWN_ALWAYS':
      pool.teardownPolicy = (
          dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_ALWAYS)
    elif self.worker_options.teardown_policy == 'TEARDOWN_ON_SUCCESS':
      pool.teardownPolicy = (
          dataflow.WorkerPool
          .TeardownPolicyValueValuesEnum.TEARDOWN_ON_SUCCESS)
  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())
    for k, v in sdk_pipeline_options.iteritems():
      if v is not None:
        self.proto.sdkPipelineOptions.additionalProperties.append(
            dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
def test_float_to(self):
  self.assertEqual(JsonValue(double_value=2.75), to_json_value(2.75))
def test_true_from(self):
  self.assertEqual(True, from_json_value(to_json_value(True)))
def add_source_windowed_codec_spec(target):
  target.source.codec = dataflow.Source.CodecValue()
  for k, v in WINDOWED_CODER_SPEC.iteritems():
    target.source.codec.additionalProperties.append(
        dataflow.Source.CodecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
def test_int_from(self):
  self.assertEqual(-27, from_json_value(to_json_value(-27)))
def test_with_type(self):
  rt = from_json_value(to_json_value('abcd', with_type=True))
  self.assertEqual('http://schema.org/Text', rt['@type'])
  self.assertEqual('abcd', rt['value'])
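# For reference, a hedged illustration of the typed encoding the test above
# exercises: with_type=True wraps the value with an '@type' discriminator.
typed = to_json_value('abcd', with_type=True)
assert from_json_value(typed) == {'@type': 'http://schema.org/Text',
                                  'value': 'abcd'}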