def get_in_memory_source_to_text_sink_message():
    rsi = dataflow.ReadInstruction()
    rsi.source = dataflow.Source()
    rsi.source.spec = dataflow.Source.SpecValue()
    for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
        rsi.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
    # Note that the in-memory source spec requires a windowed coder.
    add_source_windowed_codec_spec(rsi)

    wi = dataflow.WriteInstruction()
    wi.input = dataflow.InstructionInput()
    wi.sink = dataflow.Sink()
    wi.sink.spec = dataflow.Sink.SpecValue()
    for k, v in TEXT_SINK_SPEC.iteritems():
        wi.sink.spec.additionalProperties.append(
            dataflow.Sink.SpecValue.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
    add_sink_codec_spec(wi)

    mt = dataflow.MapTask()
    mt.instructions.append(get_instruction_with_outputs(read=rsi))
    mt.instructions.append(dataflow.ParallelInstruction(write=wi))

    wi = dataflow.WorkItem()
    wi.id = 1234
    wi.projectId = 'project'
    wi.jobId = 'job'
    wi.mapTask = mt

    m = dataflow.LeaseWorkItemResponse()
    m.workItems.append(wi)
    return m
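All of the work-item builders in these examples repeat the same spec-population loop: take a plain dict and wrap each entry in an AdditionalProperty whose value is JSON-encoded. A minimal sketch of that pattern, assuming dataflow is the generated Dataflow client messages module and to_json_value is the SDK helper used above (the make_source_spec name is ours):

def make_source_spec(spec_dict):
  # Build a dataflow.Source.SpecValue from a plain dict, JSON-encoding each
  # value the same way the builders in these examples do.
  spec = dataflow.Source.SpecValue()
  for k, v in spec_dict.iteritems():
    spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  return spec

# Usage sketch: rsi.source.spec = make_source_spec(IN_MEMORY_SOURCE_SPEC)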
def get_in_memory_source_to_text_sink_message():
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  # Note that the in-memory source spec requires a windowed coder.
  add_source_windowed_codec_spec(rsi)

  wi = dataflow.WriteInstruction()
  wi.input = dataflow.InstructionInput()
  wi.sink = dataflow.Sink()
  wi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in TEXT_SINK_SPEC.iteritems():
    wi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wi)

  mt = dataflow.MapTask()
  mt.instructions.append(dataflow.ParallelInstruction(read=rsi))
  mt.instructions.append(dataflow.ParallelInstruction(write=wi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def get_shuffle_source_to_text_sink_message(shuffle_source_spec):
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in shuffle_source_spec.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(rsi)

  wi = dataflow.WriteInstruction()
  wi.input = dataflow.InstructionInput()
  wi.sink = dataflow.Sink()
  wi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in TEXT_SINK_SPEC.iteritems():
    wi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wi)

  mt = dataflow.MapTask()
  mt.instructions.append(dataflow.ParallelInstruction(read=rsi))
  mt.instructions.append(dataflow.ParallelInstruction(write=wi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def set_mean(accumulator, metric_update):
  if accumulator.count:
    metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
    metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # It means we have nothing to report yet, so don't report anything.
    metric_update.kind = None
Example #5
def set_mean(accumulator, metric_update):
    if accumulator.count:
        metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
        metric_update.meanCount = to_json_value(accumulator.count,
                                                with_type=True)
    else:
        # A denominator of 0 will raise an error in the service.
        # It means we have nothing to report yet, so don't report anything.
        metric_update.kind = None
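A short usage sketch for set_mean, assuming dataflow.MetricUpdate is available as in the surrounding examples; the MeanAccumulator class is purely illustrative and only needs sum and count attributes:

class MeanAccumulator(object):
  # Illustrative stand-in for a mean aggregation accumulator.
  def __init__(self, sum_value=0, count=0):
    self.sum = sum_value
    self.count = count

update = dataflow.MetricUpdate()
update.kind = 'Mean'
set_mean(MeanAccumulator(sum_value=12, count=4), update)   # fills meanSum/meanCount
set_mean(MeanAccumulator(), dataflow.MetricUpdate())       # count == 0: kind is cleared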
def append_metric(status_object, metric_name, value1, value2=None,
                  step=None, output_user_name=None, tentative=False,
                  worker_id=None, cumulative=True):
  """Creates and adds a MetricUpdate field to the passed-in protobuf.

  Args:
    status_object: a work_item_status to which to add this metric
    metric_name: a string naming this metric
    value1: scalar for a Sum or mean_sum for a Mean
    value2: mean_count for a Mean aggregation (do not provide for a Sum).
    step: the name of the associated step
    output_user_name: the user-visible name to use
    tentative: whether this should be labeled as a tentative metric
    worker_id: the id of this worker.  Specifying a worker_id also
      causes this to be encoded as a metric, not a counter.
    cumulative: Whether this metric is cumulative, default True.
      Set to False for a delta value.
  """
  # Does this look like a counter or like a metric?
  is_counter = not worker_id

  metric_update = dataflow.MetricUpdate()
  metric_update.name = dataflow.MetricStructuredName()
  metric_update.name.name = metric_name
  # Handle attributes stored in the name context
  if step or output_user_name or tentative or worker_id:
    metric_update.name.context = dataflow.MetricStructuredName.ContextValue()

    def append_to_context(key, value):
      metric_update.name.context.additionalProperties.append(
          dataflow.MetricStructuredName.ContextValue.AdditionalProperty(
              key=key, value=value))
    if step:
      append_to_context('step', step)
    if output_user_name:
      append_to_context('output_user_name', output_user_name)
    if tentative:
      append_to_context('tentative', 'true')
    if worker_id:
      append_to_context('workerId', worker_id)
  if cumulative and is_counter:
    metric_update.cumulative = cumulative
  if value2 is None:
    if is_counter:
      # Counters are distinguished by having a kind; metrics do not.
      metric_update.kind = 'Sum'
    metric_update.scalar = to_json_value(value1, with_type=True)
  elif value2 > 0:
    metric_update.kind = 'Mean'
    metric_update.meanSum = to_json_value(value1, with_type=True)
    metric_update.meanCount = to_json_value(value2, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # It means we have nothing to report yet, so don't report anything.
    pass
  logging.debug('Appending metric_update: %s', metric_update)
  status_object.metricUpdates.append(metric_update)
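A usage sketch for this Sum/Mean variant, assuming dataflow.WorkItemStatus is the status message that carries the metricUpdates list; the metric, step and worker names are placeholders:

status = dataflow.WorkItemStatus()
# A cumulative Sum counter attached to a (hypothetical) step named 's1'.
append_metric(status, 'element_count', 42,
              step='s1', output_user_name='read_output')
# A Mean reported per worker: value1 is the mean sum, value2 the mean count.
append_metric(status, 'latency_ms', 340, value2=17, worker_id='worker-0')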
 def get_test_rows(self):
   now = time.time()
   expected_rows = [
       {'i': 1, 's': 'abc', 'f': 2.3, 'b': True, 't': now},
       {'i': 10, 's': 'xyz', 'f': -3.14, 'b': False}]
   schema = bigquery.TableSchema(
       fields=[
           bigquery.TableFieldSchema(
               name='b', type='BOOLEAN', mode='REQUIRED'),
           bigquery.TableFieldSchema(
               name='f', type='FLOAT', mode='REQUIRED'),
           bigquery.TableFieldSchema(
               name='i', type='INTEGER', mode='REQUIRED'),
           bigquery.TableFieldSchema(
               name='s', type='STRING', mode='REQUIRED'),
           bigquery.TableFieldSchema(
               name='t', type='TIMESTAMP', mode='NULLABLE')])
   table_rows = [
       bigquery.TableRow(f=[
           bigquery.TableCell(v=to_json_value('true')),
           bigquery.TableCell(v=to_json_value(str(2.3))),
           bigquery.TableCell(v=to_json_value(str(1))),
           bigquery.TableCell(v=to_json_value('abc')),
           # For timestamps we cannot use str() because it would truncate the
           # number representing the timestamp.
           bigquery.TableCell(v=to_json_value('%f' % now))]),
       bigquery.TableRow(f=[
           bigquery.TableCell(v=to_json_value('false')),
           bigquery.TableCell(v=to_json_value(str(-3.14))),
           bigquery.TableCell(v=to_json_value(str(10))),
           bigquery.TableCell(v=to_json_value('xyz')),
           bigquery.TableCell(v=None)])]
   return table_rows, schema, expected_rows
def get_in_memory_source_to_flatten_message():
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  add_source_codec_spec(rsi)
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  # Note that the in-memory source spec requires a windowed coder.
  add_source_windowed_codec_spec(rsi)

  fi = dataflow.FlattenInstruction()
  fi.inputs = [dataflow.InstructionInput()]

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=rsi))
  mt.instructions.append(get_instruction_with_outputs(flatten=fi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
  def insert_rows(self, project_id, dataset_id, table_id, rows):
    """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.

    Returns:
      A tuple (bool, errors). If the first element is False, the second element
      will be a bigquery.InsertErrorsValueListEntry instance containing the
      specific errors.
    """

    # Prepare rows for insertion. Of special note is the row ID that we add to
    # each row in order to help BigQuery avoid inserting the same row multiple
    # times, which can happen during retries on failures. BigQuery de-duplicates
    # rows on a best-effort basis when unique insert IDs are provided.
    # TODO(silviuc): Must add support to writing TableRow's instead of dicts.
    final_rows = []
    for row in rows:
      json_object = bigquery.JsonObject()
      for k, v in row.iteritems():
        json_object.additionalProperties.append(
            bigquery.JsonObject.AdditionalProperty(
                key=k, value=to_json_value(v)))
      final_rows.append(
          bigquery.TableDataInsertAllRequest.RowsValueListEntry(
              insertId=str(self.unique_row_id),
              json=json_object))
    result, errors = self._insert_all_rows(
        project_id, dataset_id, table_id, final_rows)
    return result, errors
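A usage sketch, assuming client_wrapper is an instance of the BigQuery client wrapper class this method belongs to and that the project, dataset and table IDs refer to an existing table (all names below are placeholders):

rows = [
    {'name': 'abc', 'score': 2.5},
    {'name': 'xyz', 'score': -1.0},
]
passed, errors = client_wrapper.insert_rows(
    'my-project', 'my_dataset', 'my_table', rows)
if not passed:
  logging.error('Some rows were not inserted: %s', errors)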
Example #10
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = df.io.BigQueryDisposition.WRITE_APPEND

        insert_response = mock.Mock()
        insert_response.insertErrors = []
        client.tabledata.InsertAll.return_value = insert_response

        with df.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        expected_rows = []
        json_object = bigquery.JsonObject()
        for k, v in sample_row.iteritems():
            json_object.additionalProperties.append(
                bigquery.JsonObject.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
        expected_rows.append(
            bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                insertId='_1',  # First row ID generated with prefix ''
                json=json_object))
        client.tabledata.InsertAll.assert_called_with(
            bigquery.BigqueryTabledataInsertAllRequest(
                projectId='project',
                datasetId='dataset',
                tableId='table',
                tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                    rows=expected_rows)))
  def test_rows_are_written(self):
    client = mock.Mock()
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
        schema=bigquery.TableSchema())
    client.tables.Get.return_value = table
    write_disposition = df.io.BigQueryDisposition.WRITE_APPEND

    insert_response = mock.Mock()
    insert_response.insertErrors = []
    client.tabledata.InsertAll.return_value = insert_response

    with df.io.BigQuerySink(
        'project:dataset.table',
        write_disposition=write_disposition).writer(client) as writer:
      writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

    sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
    expected_rows = []
    json_object = bigquery.JsonObject()
    for k, v in sample_row.iteritems():
      json_object.additionalProperties.append(
          bigquery.JsonObject.AdditionalProperty(
              key=k, value=to_json_value(v)))
    expected_rows.append(
        bigquery.TableDataInsertAllRequest.RowsValueListEntry(
            insertId='_1',  # First row ID generated with prefix ''
            json=json_object))
    client.tabledata.InsertAll.assert_called_with(
        bigquery.BigqueryTabledataInsertAllRequest(
            projectId='project', datasetId='dataset', tableId='table',
            tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                rows=expected_rows)))
def get_text_source_to_shuffle_sink_message():
  ri = dataflow.ReadInstruction()
  ri.source = dataflow.Source()
  ri.source.spec = dataflow.Source.SpecValue()
  for k, v in TEXT_SOURCE_SPEC.iteritems():
    ri.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(ri)

  di = dataflow.ParDoInstruction()
  di.input = dataflow.InstructionInput()
  di.input.producerInstructionIndex = 1
  di.multiOutputInfos = [dataflow.MultiOutputInfo(tag='out')]
  di.userFn = dataflow.ParDoInstruction.UserFnValue()
  for k, v in PARDO_DOFN_SPEC.iteritems():
    di.userFn.additionalProperties.append(
        dataflow.ParDoInstruction.UserFnValue.AdditionalProperty(
            key=k, value=to_json_value(v)))

  wsi = dataflow.WriteInstruction()
  wsi.input = dataflow.InstructionInput()
  wsi.input.producerInstructionIndex = 1
  di.input.outputNum = 0
  wsi.sink = dataflow.Sink()
  wsi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in SHUFFLE_SINK_SPEC.iteritems():
    wsi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wsi)

  mt = dataflow.MapTask()
  mt.instructions.append(dataflow.ParallelInstruction(read=ri))
  mt.instructions.append(dataflow.ParallelInstruction(parDo=di))
  mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
def get_text_source_to_shuffle_sink_message():
    ri = dataflow.ReadInstruction()
    ri.source = dataflow.Source()
    ri.source.spec = dataflow.Source.SpecValue()
    for k, v in TEXT_SOURCE_SPEC.iteritems():
        ri.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
    add_source_codec_spec(ri)

    di = dataflow.ParDoInstruction()
    di.input = dataflow.InstructionInput()
    di.input.producerInstructionIndex = 1
    di.multiOutputInfos = [dataflow.MultiOutputInfo(tag='out')]
    di.userFn = dataflow.ParDoInstruction.UserFnValue()
    for k, v in PARDO_DOFN_SPEC.iteritems():
        di.userFn.additionalProperties.append(
            dataflow.ParDoInstruction.UserFnValue.AdditionalProperty(
                key=k, value=to_json_value(v)))

    wsi = dataflow.WriteInstruction()
    wsi.input = dataflow.InstructionInput()
    wsi.input.producerInstructionIndex = 1
    di.input.outputNum = 0
    wsi.sink = dataflow.Sink()
    wsi.sink.spec = dataflow.Sink.SpecValue()
    for k, v in SHUFFLE_SINK_SPEC.iteritems():
        wsi.sink.spec.additionalProperties.append(
            dataflow.Sink.SpecValue.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
    add_sink_codec_spec(wsi)

    mt = dataflow.MapTask()
    mt.instructions.append(get_instruction_with_outputs(read=ri))
    mt.instructions.append(get_instruction_with_outputs(parDo=di))
    mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

    wi = dataflow.WorkItem()
    wi.id = 1234
    wi.projectId = 'project'
    wi.jobId = 'job'
    wi.mapTask = mt

    m = dataflow.LeaseWorkItemResponse()
    m.workItems.append(wi)
    return m
 def test_row_and_no_schema(self):
   coder = TableRowJsonCoder()
   test_row = bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e))
          for e in ['abc', 123, 123.456, True]])
   with self.assertRaises(AttributeError) as ctx:
     coder.encode(test_row)
   self.assertTrue(
       ctx.exception.message.startswith('The TableRowJsonCoder requires'))
Example #16
 def test_row_and_no_schema(self):
     coder = TableRowJsonCoder()
     test_row = bigquery.TableRow(f=[
         bigquery.TableCell(v=to_json_value(e))
         for e in ['abc', 123, 123.456, True]
     ])
     with self.assertRaises(AttributeError) as ctx:
         coder.encode(test_row)
     self.assertTrue(
         ctx.exception.message.startswith('The TableRowJsonCoder requires'))
def get_instruction_with_outputs(num_outputs=1, **kwargs):
    pi = dataflow.ParallelInstruction(**kwargs)
    for _ in xrange(num_outputs):
        output = dataflow.InstructionOutput()
        output.codec = dataflow.InstructionOutput.CodecValue()
        for k, v in CODER_SPEC.iteritems():
            output.codec.additionalProperties.append(
                dataflow.InstructionOutput.CodecValue.AdditionalProperty(
                    key=k, value=to_json_value(v)))
        pi.outputs.append(output)
    return pi
def get_instruction_with_outputs(num_outputs=1, **kwargs):
  pi = dataflow.ParallelInstruction(**kwargs)
  for _ in xrange(num_outputs):
    output = dataflow.InstructionOutput()
    output.codec = dataflow.InstructionOutput.CodecValue()
    for k, v in CODER_SPEC.iteritems():
      output.codec.additionalProperties.append(
          dataflow.InstructionOutput.CodecValue.AdditionalProperty(
              key=k, value=to_json_value(v)))
    pi.outputs.append(output)
  return pi
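A short sketch of how this helper is used when assembling a MapTask, mirroring the message builders in the other examples (CODER_SPEC is the module-level dict those examples assume):

ri = dataflow.ReadInstruction()
ri.source = dataflow.Source()
mt = dataflow.MapTask()
# Wrap the read in a ParallelInstruction that already carries one output
# whose codec is populated from CODER_SPEC.
mt.instructions.append(get_instruction_with_outputs(read=ri))
# An instruction with two outputs would be requested the same way, e.g.
# get_instruction_with_outputs(num_outputs=2, flatten=fi).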
def append_metric(status_object, metric_name, kind, value, setter=None,
                  step=None, output_user_name=None, tentative=False,
                  worker_id=None, cumulative=True):
  """Creates and adds a MetricUpdate field to the passed-in protobuf.

  Args:
    status_object: a work_item_status to which to add this metric
    metric_name: a string naming this metric
    kind: dataflow counter kind (e.g. 'sum')
    value: accumulator value to encode
    setter: if not None, a lambda to use to update metric_update with value
    step: the name of the associated step
    output_user_name: the user-visible name to use
    tentative: whether this should be labeled as a tentative metric
    worker_id: the id of this worker.  Specifying a worker_id also
      causes this to be encoded as a metric, not a counter.
    cumulative: Whether this metric is cumulative, default True.
      Set to False for a delta value.
  """
  # Does this look like a counter or like a metric?
  is_counter = not worker_id

  metric_update = dataflow.MetricUpdate()
  metric_update.name = dataflow.MetricStructuredName()
  metric_update.name.name = metric_name
  # Handle attributes stored in the name context
  if step or output_user_name or tentative or worker_id:
    metric_update.name.context = dataflow.MetricStructuredName.ContextValue()

    def append_to_context(key, value):
      metric_update.name.context.additionalProperties.append(
          dataflow.MetricStructuredName.ContextValue.AdditionalProperty(
              key=key, value=value))
    if step:
      append_to_context('step', step)
    if output_user_name:
      append_to_context('output_user_name', output_user_name)
    if tentative:
      append_to_context('tentative', 'true')
    if worker_id:
      append_to_context('workerId', worker_id)
  if cumulative and is_counter:
    metric_update.cumulative = cumulative
  if is_counter:
    # Counters are distinguished by having a kind; metrics do not.
    metric_update.kind = kind
  if setter:
    setter(value, metric_update)
  else:
    metric_update.scalar = to_json_value(value, with_type=True)
  logging.debug('Appending metric_update: %s', metric_update)
  status_object.metricUpdates.append(metric_update)
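A usage sketch for the kind/setter variant, pairing it with the set_scalar and set_mean helpers defined elsewhere in these examples and assuming dataflow.WorkItemStatus as before; the namedtuple accumulators are illustrative and only provide the attributes those setters read:

import collections

ScalarAcc = collections.namedtuple('ScalarAcc', ['value'])
MeanAcc = collections.namedtuple('MeanAcc', ['sum', 'count'])

status = dataflow.WorkItemStatus()
# Scalar counter: set_scalar copies accumulator.value into metric_update.scalar.
append_metric(status, 'bytes_written', 'Sum', ScalarAcc(value=1024),
              setter=set_scalar, step='s2')
# Mean counter: set_mean fills meanSum/meanCount from accumulator.sum/.count.
append_metric(status, 'batch_size', 'Mean', MeanAcc(sum=120, count=6),
              setter=set_mean, step='s2')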
def splits_to_split_response(bundles):
  """Generates a response to a custom source split request.

  Args:
    bundles: a set of bundles generated by a BoundedSource.split() invocation.
  Returns:
   a SourceOperationResponse object.
  """
  derived_sources = []
  for bundle in bundles:
    derived_source = dataflow.DerivedSource()
    derived_source.derivationMode = (
        dataflow.DerivedSource.DerivationModeValueValuesEnum
        .SOURCE_DERIVATION_MODE_INDEPENDENT)
    derived_source.source = dataflow.Source()
    derived_source.source.doesNotNeedSplitting = True

    derived_source.source.spec = dataflow.Source.SpecValue()
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=names.SERIALIZED_SOURCE_KEY,
            value=to_json_value(pickler.dumps(
                (bundle.source, bundle.start_position, bundle.stop_position)),
                                with_type=True)))
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(key='@type',
                                                     value=to_json_value(
                                                         names.SOURCE_TYPE)))
    derived_sources.append(derived_source)

  split_response = dataflow.SourceSplitResponse()
  split_response.bundles = derived_sources
  split_response.outcome = (
      dataflow.SourceSplitResponse.OutcomeValueValuesEnum
      .SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)

  response = dataflow.SourceOperationResponse()
  response.split = split_response
  return response
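A usage sketch, assuming the bundles behave like the SourceBundle objects returned by BoundedSource.split() — here stubbed with a namedtuple exposing source, start_position and stop_position, and a plain string standing in for a picklable source:

import collections

FakeBundle = collections.namedtuple(
    'FakeBundle', ['source', 'start_position', 'stop_position'])

my_source = 'serialized-source-placeholder'  # a real BoundedSource in practice
bundles = [
    FakeBundle(source=my_source, start_position=0, stop_position=100),
    FakeBundle(source=my_source, start_position=100, stop_position=None),
]
response = splits_to_split_response(bundles)
assert len(response.split.bundles) == 2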
Example #21
def splits_to_split_response(bundles):
    """Generates a response to a custom source split request.

    Args:
      bundles: a set of bundles generated by a BoundedSource.split() invocation.
    Returns:
      a SourceOperationResponse object.
    """
    derived_sources = []
    for bundle in bundles:
        derived_source = dataflow.DerivedSource()
        derived_source.derivationMode = (
            dataflow.DerivedSource.DerivationModeValueValuesEnum.
            SOURCE_DERIVATION_MODE_INDEPENDENT)
        derived_source.source = dataflow.Source()
        derived_source.source.doesNotNeedSplitting = True

        derived_source.source.spec = dataflow.Source.SpecValue()
        derived_source.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=names.SERIALIZED_SOURCE_KEY,
                value=to_json_value(pickler.dumps(
                    (bundle.source, bundle.start_position,
                     bundle.stop_position)),
                                    with_type=True)))
        derived_source.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key='@type', value=to_json_value(names.SOURCE_TYPE)))
        derived_sources.append(derived_source)

    split_response = dataflow.SourceSplitResponse()
    split_response.bundles = derived_sources
    split_response.outcome = (
        dataflow.SourceSplitResponse.OutcomeValueValuesEnum.
        SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)

    response = dataflow.SourceOperationResponse()
    response.split = split_response
    return response
Example #22
  def build_split_proto(self, bounded_source, desired_bundle_size):
    split_proto = dataflow.SourceSplitRequest()
    split_proto.options = dataflow.SourceSplitOptions()
    split_proto.options.desiredBundleSizeBytes = desired_bundle_size

    source = dataflow.Source()
    spec = dataflow.Source.SpecValue()

    if bounded_source:
      spec.additionalProperties.append(
          dataflow.Source.SpecValue.AdditionalProperty(
              key=names.SERIALIZED_SOURCE_KEY,
              value=to_json_value({'value': pickler.dumps(bounded_source),
                                   '@type': 'http://schema.org/Text'})))
    spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key='@type',
            value=to_json_value('CustomSourcesType')))
    source.spec = spec
    split_proto.source = source

    return split_proto
Example #23
 def get_test_rows(self):
     now = time.time()
     expected_rows = [{
         'i': 1,
         's': 'abc',
         'f': 2.3,
         'b': True,
         't': now
     }, {
         'i': 10,
         's': 'xyz',
         'f': -3.14,
         'b': False
     }]
     schema = bigquery.TableSchema(fields=[
         bigquery.TableFieldSchema(
             name='b', type='BOOLEAN', mode='REQUIRED'),
         bigquery.TableFieldSchema(name='f', type='FLOAT', mode='REQUIRED'),
         bigquery.TableFieldSchema(
             name='i', type='INTEGER', mode='REQUIRED'),
         bigquery.TableFieldSchema(name='s', type='STRING',
                                   mode='REQUIRED'),
         bigquery.TableFieldSchema(
             name='t', type='TIMESTAMP', mode='NULLABLE')
     ])
     table_rows = [
         bigquery.TableRow(f=[
             bigquery.TableCell(v=to_json_value('true')),
             bigquery.TableCell(v=to_json_value(str(2.3))),
             bigquery.TableCell(v=to_json_value(str(1))),
             bigquery.TableCell(v=to_json_value('abc')),
             # For timestamps we cannot use str() because it would truncate the
             # number representing the timestamp.
             bigquery.TableCell(v=to_json_value('%f' % now))
         ]),
         bigquery.TableRow(f=[
             bigquery.TableCell(v=to_json_value('false')),
             bigquery.TableCell(v=to_json_value(str(-3.14))),
             bigquery.TableCell(v=to_json_value(str(10))),
             bigquery.TableCell(v=to_json_value('xyz')),
             bigquery.TableCell(v=None)
         ])
     ]
     return table_rows, schema, expected_rows
  def test_row_as_table_row(self):
    schema_definition = [
        ('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'), ('b', 'BOOLEAN')]
    schema = bigquery.TableSchema(
        fields=[bigquery.TableFieldSchema(name=k, type=v)
                for k, v in schema_definition])
    coder = TableRowJsonCoder(table_schema=schema)
    test_row = bigquery.TableRow(
        f=[bigquery.TableCell(v=to_json_value(e))
           for e in ['abc', 123, 123.456, True]])

    self.assertEqual('{"s": "abc", "i": 123, "f": 123.456, "b": true}',
                     coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    self.assertEqual(
        test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
Example #25
    def test_row_as_table_row(self):
        schema_definition = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                             ('b', 'BOOLEAN')]
        schema = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(name=k, type=v)
            for k, v in schema_definition
        ])
        coder = TableRowJsonCoder(table_schema=schema)
        test_row = bigquery.TableRow(f=[
            bigquery.TableCell(v=to_json_value(e))
            for e in ['abc', 123, 123.456, True]
        ])

        self.assertEqual('{"s": "abc", "i": 123, "f": 123.456, "b": true}',
                         coder.encode(test_row))
        self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
        # A coder without schema can still decode.
        self.assertEqual(test_row,
                         TableRowJsonCoder().decode(coder.encode(test_row)))
 def test_string_to(self):
     self.assertEquals(JsonValue(string_value='abc'), to_json_value('abc'))
Example #27
 def decode(self, encoded_table_row):
   od = json.loads(
       encoded_table_row, object_pairs_hook=collections.OrderedDict)
   return bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()])
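A short sketch of what this decode produces, using the same JSON row string as the test_row_as_table_row examples; no schema is needed on the decoding side:

coder = TableRowJsonCoder()
row = coder.decode('{"s": "abc", "i": 123, "f": 123.456, "b": true}')
# row is a bigquery.TableRow whose cells hold JSON values in column order:
# row.f[0].v wraps 'abc', row.f[1].v wraps 123, and so on.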
def add_sink_codec_spec(target):
    target.sink.codec = dataflow.Sink.CodecValue()
    for k, v in CODER_SPEC.iteritems():
        target.sink.codec.additionalProperties.append(
            dataflow.Sink.CodecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
def add_sink_codec_spec(target):
  target.sink.codec = dataflow.Sink.CodecValue()
  for k, v in CODER_SPEC.iteritems():
    target.sink.codec.additionalProperties.append(
        dataflow.Sink.CodecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  def __init__(self, packages, options, environment_version):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = COMPUTE_API_SERVICE
    self.proto.dataset = '%s/cloud_dataflow' % BIGQUERY_API_SERVICE
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace('gs:/',
                                                        STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    version_string = version.__version__

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value('Google Cloud Dataflow SDK for Python')),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.is_streaming:
      job_type = 'PYTHON_STREAMING'
    else:
      job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl='https://dataflow.googleapis.com',
                servicePath=self.google_cloud_options.dataflow_endpoint)))
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.disk_source_image:
      pool.diskSourceImage = self.worker_options.disk_source_image
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.teardown_policy:
      if self.worker_options.teardown_policy == 'TEARDOWN_NEVER':
        pool.teardownPolicy = (
            dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_NEVER)
      elif self.worker_options.teardown_policy == 'TEARDOWN_ALWAYS':
        pool.teardownPolicy = (
            dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_ALWAYS)
      elif self.worker_options.teardown_policy == 'TEARDOWN_ON_SUCCESS':
        pool.teardownPolicy = (
            dataflow.WorkerPool
            .TeardownPolicyValueValuesEnum.TEARDOWN_ON_SUCCESS)

    if self.standard_options.is_streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)
 def test_int_to(self):
     self.assertEquals(JsonValue(integer_value=14), to_json_value(14))
 def test_string_from(self):
     self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
 def test_float_from(self):
     self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
Example #35
    def __init__(self, packages, options, environment_version):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = COMPUTE_API_SERVICE
        self.proto.dataset = '%s/cloud_dataflow' % BIGQUERY_API_SERVICE
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        version_string = version.__version__

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name',
                value=to_json_value('Google Cloud Dataflow SDK for Python')),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(version_string))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        if self.standard_options.streaming:
            job_type = 'PYTHON_STREAMING'
        else:
            job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/', STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl='https://dataflow.googleapis.com',
                    servicePath=self.google_cloud_options.dataflow_endpoint)))
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.disk_source_image:
            pool.diskSourceImage = self.worker_options.disk_source_image
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        if self.worker_options.teardown_policy:
            if self.worker_options.teardown_policy == 'TEARDOWN_NEVER':
                pool.teardownPolicy = (
                    dataflow.WorkerPool.TeardownPolicyValueValuesEnum.
                    TEARDOWN_NEVER)
            elif self.worker_options.teardown_policy == 'TEARDOWN_ALWAYS':
                pool.teardownPolicy = (
                    dataflow.WorkerPool.TeardownPolicyValueValuesEnum.
                    TEARDOWN_ALWAYS)
            elif self.worker_options.teardown_policy == 'TEARDOWN_ON_SUCCESS':
                pool.teardownPolicy = (
                    dataflow.WorkerPool.TeardownPolicyValueValuesEnum.
                    TEARDOWN_ON_SUCCESS)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            for k, v in sdk_pipeline_options.iteritems():
                if v is not None:
                    self.proto.sdkPipelineOptions.additionalProperties.append(
                        dataflow.Environment.SdkPipelineOptionsValue.
                        AdditionalProperty(key=k, value=to_json_value(v)))
Example #36
 def add_property(self, name, value, with_type=False):
     self.proto.properties.additionalProperties.append(
         dataflow.Step.PropertiesValue.AdditionalProperty(
             key=name, value=to_json_value(value, with_type=with_type)))
Example #37
def append_metric(status_object,
                  metric_name,
                  kind,
                  value,
                  setter=None,
                  step=None,
                  output_user_name=None,
                  tentative=False,
                  worker_id=None,
                  cumulative=True):
    """Creates and adds a MetricUpdate field to the passed-in protobuf.

    Args:
      status_object: a work_item_status to which to add this metric
      metric_name: a string naming this metric
      kind: dataflow counter kind (e.g. 'sum')
      value: accumulator value to encode
      setter: if not None, a lambda to use to update metric_update with value
      step: the name of the associated step
      output_user_name: the user-visible name to use
      tentative: whether this should be labeled as a tentative metric
      worker_id: the id of this worker.  Specifying a worker_id also
        causes this to be encoded as a metric, not a counter.
      cumulative: Whether this metric is cumulative, default True.
        Set to False for a delta value.
    """
    # Does this look like a counter or like a metric?
    is_counter = not worker_id

    metric_update = dataflow.MetricUpdate()
    metric_update.name = dataflow.MetricStructuredName()
    metric_update.name.name = metric_name
    # Handle attributes stored in the name context
    if step or output_user_name or tentative or worker_id:
        metric_update.name.context = dataflow.MetricStructuredName.ContextValue(
        )

        def append_to_context(key, value):
            metric_update.name.context.additionalProperties.append(
                dataflow.MetricStructuredName.ContextValue.AdditionalProperty(
                    key=key, value=value))

        if step:
            append_to_context('step', step)
        if output_user_name:
            append_to_context('output_user_name', output_user_name)
        if tentative:
            append_to_context('tentative', 'true')
        if worker_id:
            append_to_context('workerId', worker_id)
    if cumulative and is_counter:
        metric_update.cumulative = cumulative
    if is_counter:
        # Counters are distinguished by having a kind; metrics do not.
        metric_update.kind = kind
    if setter:
        setter(value, metric_update)
    else:
        metric_update.scalar = to_json_value(value, with_type=True)
    logging.debug('Appending metric_update: %s', metric_update)
    status_object.metricUpdates.append(metric_update)
Example #38
def set_scalar(accumulator, metric_update):
    metric_update.scalar = to_json_value(accumulator.value, with_type=True)
 def add_property(self, name, value, with_type=False):
   self.proto.properties.additionalProperties.append(
       dataflow.Step.PropertiesValue.AdditionalProperty(
           key=name, value=to_json_value(value, with_type=with_type)))
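A sketch of the same pattern applied directly to a step message, assuming dataflow.Step exposes kind, name and a PropertiesValue map as used by add_property above; the step kind, name and property are placeholders:

step = dataflow.Step(kind='ParallelRead', name='s1')
step.properties = dataflow.Step.PropertiesValue()
step.properties.additionalProperties.append(
    dataflow.Step.PropertiesValue.AdditionalProperty(
        key='format', value=to_json_value('text')))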
 def test_true_to(self):
     self.assertEquals(JsonValue(boolean_value=True), to_json_value(True))
 def test_false_from(self):
     self.assertEquals(False, from_json_value(to_json_value(False)))
 def test_false_to(self):
     self.assertEquals(JsonValue(boolean_value=False), to_json_value(False))
  def __init__(self, packages, options, environment_version):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = COMPUTE_API_SERVICE
    self.proto.dataset = '%s/cloud_dataflow' % BIGQUERY_API_SERVICE
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace('gs:/',
                                                        STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    version_string = version.__version__

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value('Google Cloud Dataflow SDK for Python')),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'PYTHON_STREAMING'
    else:
      job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl='https://dataflow.googleapis.com',
                servicePath=self.google_cloud_options.dataflow_endpoint)))
    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.disk_source_image:
      pool.diskSourceImage = self.worker_options.disk_source_image
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      # Default to using the worker harness container image for the current SDK
      # version.
      pool.workerHarnessContainerImage = (
          'dataflow.gcr.io/v1beta3/python:%s' % version.__version__)
    if self.worker_options.teardown_policy:
      if self.worker_options.teardown_policy == 'TEARDOWN_NEVER':
        pool.teardownPolicy = (
            dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_NEVER)
      elif self.worker_options.teardown_policy == 'TEARDOWN_ALWAYS':
        pool.teardownPolicy = (
            dataflow.WorkerPool.TeardownPolicyValueValuesEnum.TEARDOWN_ALWAYS)
      elif self.worker_options.teardown_policy == 'TEARDOWN_ON_SUCCESS':
        pool.teardownPolicy = (
            dataflow.WorkerPool
            .TeardownPolicyValueValuesEnum.TEARDOWN_ON_SUCCESS)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      for k, v in sdk_pipeline_options.iteritems():
        if v is not None:
          self.proto.sdkPipelineOptions.additionalProperties.append(
              dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
                  key=k, value=to_json_value(v)))
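A construction sketch for this environment wrapper, assuming the class is named Environment, that PipelineOptions accepts a flag list as in the SDK's options module, and that the bucket paths, package name and environment_version value are all placeholders:

options = PipelineOptions([
    '--project', 'my-project',
    '--temp_location', 'gs://my-bucket/tmp',
    '--staging_location', 'gs://my-bucket/staging',
])
env = Environment(packages=['workflow.tar.gz'],
                  options=options,
                  environment_version='1')
# env.proto is the dataflow.Environment message submitted with the job.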
 def test_float_to(self):
     self.assertEquals(JsonValue(double_value=2.75), to_json_value(2.75))
Example #45
def append_metric(status_object,
                  metric_name,
                  value1,
                  value2=None,
                  step=None,
                  output_user_name=None,
                  tentative=False,
                  worker_id=None,
                  cumulative=True):
    """Creates and adds a MetricUpdate field to the passed-in protobuf.

    Args:
      status_object: a work_item_status to which to add this metric
      metric_name: a string naming this metric
      value1: scalar for a Sum or mean_sum for a Mean
      value2: mean_count for a Mean aggregation (do not provide for a Sum).
      step: the name of the associated step
      output_user_name: the user-visible name to use
      tentative: whether this should be labeled as a tentative metric
      worker_id: the id of this worker.  Specifying a worker_id also
        causes this to be encoded as a metric, not a counter.
      cumulative: Whether this metric is cumulative, default True.
        Set to False for a delta value.
    """
    # Does this look like a counter or like a metric?
    is_counter = not worker_id

    metric_update = dataflow.MetricUpdate()
    metric_update.name = dataflow.MetricStructuredName()
    metric_update.name.name = metric_name
    # Handle attributes stored in the name context
    if step or output_user_name or tentative or worker_id:
        metric_update.name.context = dataflow.MetricStructuredName.ContextValue(
        )

        def append_to_context(key, value):
            metric_update.name.context.additionalProperties.append(
                dataflow.MetricStructuredName.ContextValue.AdditionalProperty(
                    key=key, value=value))

        if step:
            append_to_context('step', step)
        if output_user_name:
            append_to_context('output_user_name', output_user_name)
        if tentative:
            append_to_context('tentative', 'true')
        if worker_id:
            append_to_context('workerId', worker_id)
    if cumulative and is_counter:
        metric_update.cumulative = cumulative
    if value2 is None:
        if is_counter:
            # Counters are distinguished by having a kind; metrics do not.
            metric_update.kind = 'Sum'
        metric_update.scalar = to_json_value(value1, with_type=True)
    elif value2 > 0:
        metric_update.kind = 'Mean'
        metric_update.meanSum = to_json_value(value1, with_type=True)
        metric_update.meanCount = to_json_value(value2, with_type=True)
    else:
        # A denominator of 0 will raise an error in the service.
        # It means we have nothing to report yet, so don't report anything.
        pass
    logging.debug('Appending metric_update: %s', metric_update)
    status_object.metricUpdates.append(metric_update)
 def test_true_from(self):
     self.assertEquals(True, from_json_value(to_json_value(True)))
def add_source_windowed_codec_spec(target):
    target.source.codec = dataflow.Source.CodecValue()
    for k, v in WINDOWED_CODER_SPEC.iteritems():
        target.source.codec.additionalProperties.append(
            dataflow.Source.CodecValue.AdditionalProperty(
                key=k, value=to_json_value(v)))
 def test_int_from(self):
     self.assertEquals(-27, from_json_value(to_json_value(-27)))
def set_scalar(accumulator, metric_update):
  metric_update.scalar = to_json_value(accumulator.value, with_type=True)
 def test_with_type(self):
     rt = from_json_value(to_json_value('abcd', with_type=True))
     self.assertEquals('http://schema.org/Text', rt['@type'])
     self.assertEquals('abcd', rt['value'])
def add_source_windowed_codec_spec(target):
  target.source.codec = dataflow.Source.CodecValue()
  for k, v in WINDOWED_CODER_SPEC.iteritems():
    target.source.codec.additionalProperties.append(
        dataflow.Source.CodecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
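A closing sketch tying the codec helpers together: the windowed variant above populates a source codec from WINDOWED_CODER_SPEC, while add_sink_codec_spec (shown earlier) attaches the plain CODER_SPEC to a sink. Both assume the module-level spec dicts used throughout these examples.

ri = dataflow.ReadInstruction()
ri.source = dataflow.Source()
add_source_windowed_codec_spec(ri)   # codec built from WINDOWED_CODER_SPEC

wi = dataflow.WriteInstruction()
wi.sink = dataflow.Sink()
add_sink_codec_spec(wi)              # codec built from CODER_SPEC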