Example #1
def set_mean(accumulator, metric_update):
  if accumulator.count:
    metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
    metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # What it means is we have nothing to report yet, so don't.
    metric_update.kind = None
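
The comment above carries the only subtlety in this helper: with a count of zero there is no mean to report, so the update's kind is cleared instead of sending a 0/0 pair the service would reject. Below is a tiny standalone sketch of that guard, using illustrative stand-in objects rather than the SDK's accumulator and dataflow.MetricUpdate types.

class _FakeAccumulator(object):   # illustrative stand-in for the SDK accumulator
  sum = 0
  count = 0

class _FakeUpdate(object):        # illustrative stand-in for dataflow.MetricUpdate
  kind = 'mean'
  meanSum = None
  meanCount = None

acc = _FakeAccumulator()
update = _FakeUpdate()
if not acc.count:      # nothing aggregated yet
  update.kind = None   # suppress the update instead of reporting 0/0
assert update.kind is None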
Example #2
  def test_row_as_table_row(self):
    schema_definition = [
        ('s', 'STRING'),
        ('i', 'INTEGER'),
        ('f', 'FLOAT'),
        ('b', 'BOOLEAN'),
        ('r', 'RECORD')]
    data_definition = [
        'abc',
        123,
        123.456,
        True,
        {'a': 'b'}]
    str_def = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
    schema = bigquery.TableSchema(
        fields=[bigquery.TableFieldSchema(name=k, type=v)
                for k, v in schema_definition])
    coder = TableRowJsonCoder(table_schema=schema)
    test_row = bigquery.TableRow(
        f=[bigquery.TableCell(v=to_json_value(e)) for e in data_definition])

    self.assertEqual(str_def, coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    self.assertEqual(
        test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
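
A dependency-free sketch of the round trip this test exercises, using only the standard library: TableRowJsonCoder serializes a row to plain JSON keyed by the schema's field names, in schema order, and Example #17 below shows the matching decode path. This is an illustration, not the coder's implementation.

import collections
import json

encoded = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
# Decoding with an OrderedDict hook preserves the field order the schema
# defines, mirroring the decode() shown in Example #17.
decoded = json.loads(encoded, object_pairs_hook=collections.OrderedDict)
assert list(decoded.keys()) == ['s', 'i', 'f', 'b', 'r']
assert decoded['r'] == {'a': 'b'}
# Re-encoding the ordered mapping reproduces the original string.
assert json.dumps(decoded) == encoded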
Example #3
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

        insert_response = mock.Mock()
        insert_response.insertErrors = []
        client.tabledata.InsertAll.return_value = insert_response

        with beam.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        expected_rows = []
        json_object = bigquery.JsonObject()
        for k, v in sample_row.iteritems():
            json_object.additionalProperties.append(
                bigquery.JsonObject.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
        expected_rows.append(
            bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                insertId='_1',  # First row ID generated with prefix ''
                json=json_object))
        client.tabledata.InsertAll.assert_called_with(
            bigquery.BigqueryTabledataInsertAllRequest(
                projectId='project',
                datasetId='dataset',
                tableId='table',
                tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                    rows=expected_rows)))
Example #4
  def insert_rows(self, project_id, dataset_id, table_id, rows):
    """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.

    Returns:
      A tuple (bool, errors). If the first element is False then the second
      element will be a bigquery.InsertErrorsValueListEntry instance containing
      specific errors.
    """

    # Prepare rows for insertion. Of special note is the row ID that we add to
    # each row in order to help BigQuery avoid inserting a row multiple times.
    # BigQuery makes a best-effort attempt to de-duplicate rows that carry
    # unique IDs; duplicates can otherwise appear when failed inserts are
    # retried.
    # TODO(silviuc): Must add support to writing TableRow's instead of dicts.
    final_rows = []
    for row in rows:
      json_object = bigquery.JsonObject()
      for k, v in row.iteritems():
        json_object.additionalProperties.append(
            bigquery.JsonObject.AdditionalProperty(
                key=k, value=to_json_value(v)))
      final_rows.append(
          bigquery.TableDataInsertAllRequest.RowsValueListEntry(
              insertId=str(self.unique_row_id),
              json=json_object))
    result, errors = self._insert_all_rows(
        project_id, dataset_id, table_id, final_rows)
    return result, errors
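
The comment at the top of this method is the important design note: each row is given a unique insert ID so that BigQuery can de-duplicate rows when a failed InsertAll request is retried. Here is a minimal standalone sketch of an ID scheme with that shape (illustrative only, not the SDK's unique_row_id implementation), consistent with the insertId='_1' expected in Example #3.

import itertools

def row_ids(prefix=''):
  # Yield '<prefix>_1', '<prefix>_2', ... for successive rows.
  for n in itertools.count(1):
    yield '%s_%d' % (prefix, n)

ids = row_ids()
assert next(ids) == '_1'  # first row ID generated with the empty prefix
assert next(ids) == '_2'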
Example #5
 def test_row_and_no_schema(self):
   coder = TableRowJsonCoder()
   test_row = bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e))
          for e in ['abc', 123, 123.456, True]])
   with self.assertRaises(AttributeError) as ctx:
     coder.encode(test_row)
   self.assertTrue(
       ctx.exception.message.startswith('The TableRowJsonCoder requires'))
Example #6
 def json_compliance_exception(self, value):
   with self.assertRaises(ValueError) as exn:
     schema_definition = [('f', 'FLOAT')]
     schema = bigquery.TableSchema(
         fields=[bigquery.TableFieldSchema(name=k, type=v)
                 for k, v in schema_definition])
     coder = TableRowJsonCoder(table_schema=schema)
     test_row = bigquery.TableRow(
         f=[bigquery.TableCell(v=to_json_value(value))])
     coder.encode(test_row)
   self.assertTrue(bigquery.JSON_COMPLIANCE_ERROR in exn.exception.message)
Example #7
    def test_metric_update_basic(self):
        metric_update = dataflow.MetricUpdate()
        metric_update.name = dataflow.MetricStructuredName()
        metric_update.name.name = 'metric1'
        metric_update.name.origin = 'origin1'

        metric_update.cumulative = False
        metric_update.kind = 'sum'
        metric_update.scalar = to_json_value(1, with_type=True)

        name_matcher = message_matchers.MetricStructuredNameMatcher(
            name='metric1', origin='origin1')
        matcher = message_matchers.MetricUpdateMatcher(name=name_matcher,
                                                       kind='sum',
                                                       scalar=1)

        hc.assert_that(metric_update, hc.is_(matcher))

        with self.assertRaises(AssertionError):
            matcher.kind = 'suma'
            hc.assert_that(metric_update, hc.is_(matcher))
Example #8
 def test_large_integer(self):
   num = 1 << 35
   self.assertEquals(num, from_json_value(to_json_value(num)))
   self.assertEquals(long(num), from_json_value(to_json_value(long(num))))
Example #9
 def test_long_value(self):
   self.assertEquals(long(27), from_json_value(to_json_value(long(27))))
Example #10
 def test_with_type(self):
   rt = from_json_value(to_json_value('abcd', with_type=True))
   self.assertEquals('http://schema.org/Text', rt['@type'])
   self.assertEquals('abcd', rt['value'])
Example #11
 def test_none_from(self):
   self.assertIsNone(from_json_value(to_json_value(None)))
Example #12
 def test_true_to(self):
   self.assertEquals(JsonValue(boolean_value=True), to_json_value(True))
Example #13
 def test_float_from(self):
   self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
Example #14
def set_scalar(accumulator, metric_update):
  metric_update.scalar = to_json_value(accumulator.value, with_type=True)
Example #15
 def add_property(self, name, value, with_type=False):
   self._additional_properties.append((name, value, with_type))
   self.proto.properties.additionalProperties.append(
       dataflow.Step.PropertiesValue.AdditionalProperty(
           key=name, value=to_json_value(value, with_type=with_type)))
Example #16
 def test_none_to(self):
   self.assertEquals(JsonValue(is_null=True), to_json_value(None))
Example #17
 def decode(self, encoded_table_row):
   od = json.loads(
       encoded_table_row, object_pairs_hook=collections.OrderedDict)
   return bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()])
Example #18
 def test_float_to(self):
   self.assertEquals(JsonValue(double_value=2.75), to_json_value(2.75))
Example #19
 def test_int_to(self):
   self.assertEquals(JsonValue(integer_value=14), to_json_value(14))
Example #20
 def test_false_to(self):
   self.assertEquals(JsonValue(boolean_value=False), to_json_value(False))
Example #21
 def test_too_long_value(self):
   with self.assertRaises(TypeError):
     to_json_value(long(1 << 64))
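
Taken together with Example #8, which round-trips 1 << 35 successfully, the failure here suggests the integer payload of a JsonValue is confined to a signed 64-bit range. The exact bound enforced by to_json_value is an assumption in the quick standalone check below, not something this test states.

INT64_MAX = (1 << 63) - 1  # assumed upper bound for JsonValue integers
INT64_MIN = -(1 << 63)     # assumed lower bound

assert INT64_MIN <= (1 << 35) <= INT64_MAX  # accepted in Example #8
assert (1 << 64) > INT64_MAX                # rejected above with TypeError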
Example #22
 def test_false_from(self):
   self.assertEquals(False, from_json_value(to_json_value(False)))
Example #23
    def get_test_rows(self):
        now = time.time()
        dt = datetime.datetime.utcfromtimestamp(float(now))
        ts = dt.strftime('%Y-%m-%d %H:%M:%S.%f UTC')
        expected_rows = [{
            'i': 1,
            's': 'abc',
            'f': 2.3,
            'b': True,
            't': ts,
            'dt': '2016-10-31',
            'ts': '22:39:12.627498',
            'dt_ts': '2008-12-25T07:30:00',
            'r': {'s2': 'b'},
            'rpr': [{
                's3': 'c',
                'rpr2': [{'rs': ['d', 'e'], 's4': 'f'}],
            }],
        }, {
            'i': 10,
            's': 'xyz',
            'f': -3.14,
            'b': False,
            'rpr': [],
        }]

        nested_schema = [
            bigquery.TableFieldSchema(name='s2',
                                      type='STRING',
                                      mode='NULLABLE')
        ]
        nested_schema_2 = [
            bigquery.TableFieldSchema(name='s3',
                                      type='STRING',
                                      mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='rpr2',
                type='RECORD',
                mode='REPEATED',
                fields=[
                    bigquery.TableFieldSchema(name='rs',
                                              type='STRING',
                                              mode='REPEATED'),
                    bigquery.TableFieldSchema(name='s4',
                                              type='STRING',
                                              mode='NULLABLE')
                ])
        ]

        schema = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(
                name='b', type='BOOLEAN', mode='REQUIRED'),
            bigquery.TableFieldSchema(name='f', type='FLOAT', mode='REQUIRED'),
            bigquery.TableFieldSchema(
                name='i', type='INTEGER', mode='REQUIRED'),
            bigquery.TableFieldSchema(name='s', type='STRING',
                                      mode='REQUIRED'),
            bigquery.TableFieldSchema(
                name='t', type='TIMESTAMP', mode='NULLABLE'),
            bigquery.TableFieldSchema(name='dt', type='DATE', mode='NULLABLE'),
            bigquery.TableFieldSchema(name='ts', type='TIME', mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='dt_ts', type='DATETIME', mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='r', type='RECORD', mode='NULLABLE',
                fields=nested_schema),
            bigquery.TableFieldSchema(name='rpr',
                                      type='RECORD',
                                      mode='REPEATED',
                                      fields=nested_schema_2)
        ])

        table_rows = [
            bigquery.TableRow(f=[
                bigquery.TableCell(v=to_json_value('true')),
                bigquery.TableCell(v=to_json_value(str(2.3))),
                bigquery.TableCell(v=to_json_value(str(1))),
                bigquery.TableCell(v=to_json_value('abc')),
                # For timestamps we cannot use str() because it would truncate
                # the number representing the timestamp.
                bigquery.TableCell(v=to_json_value('%f' % now)),
                bigquery.TableCell(v=to_json_value('2016-10-31')),
                bigquery.TableCell(v=to_json_value('22:39:12.627498')),
                bigquery.TableCell(v=to_json_value('2008-12-25T07:30:00')),
                # For RECORD fields we cannot use a plain dict because it does
                # not create nested schemas correctly, so we have to use this
                # f/v based format.
                bigquery.TableCell(v=to_json_value({'f': [{'v': 'b'}]})),
                bigquery.TableCell(v=to_json_value([{
                    'v': {
                        'f': [{
                            'v': 'c'
                        }, {
                            'v': [{
                                'v': {
                                    'f': [{
                                        'v': [{
                                            'v': 'd'
                                        }, {
                                            'v': 'e'
                                        }]
                                    }, {
                                        'v': 'f'
                                    }]
                                }
                            }]
                        }]
                    }
                }]))
            ]),
            bigquery.TableRow(f=[
                bigquery.TableCell(v=to_json_value('false')),
                bigquery.TableCell(v=to_json_value(str(-3.14))),
                bigquery.TableCell(v=to_json_value(str(10))),
                bigquery.TableCell(v=to_json_value('xyz')),
                bigquery.TableCell(v=None),
                bigquery.TableCell(v=None),
                bigquery.TableCell(v=None),
                bigquery.TableCell(v=None),
                bigquery.TableCell(v=None),
                bigquery.TableCell(v=to_json_value([]))
            ])
        ]
        return table_rows, schema, expected_rows
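
The timestamp comment inside get_test_rows deserves a standalone illustration: the cell is formatted with '%f' % now rather than str(now) because Python 2's str() limited floats to roughly 12 significant digits, which can drop the microseconds of an epoch timestamp, while '%f' always keeps six decimal places. The literal below is illustrative, not taken from the test.

now = 1477953552.627498                     # illustrative epoch timestamp with microseconds
assert '%f' % now == '1477953552.627498'    # '%f' preserves the microseconds
# Under Python 2, str(now) printed '1477953552.63' (12 significant digits),
# which is exactly the truncation the comment in the test warns about;
# Python 3's str() keeps full precision, but the explicit format is still safer.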
Example #24
 def test_int_from(self):
   self.assertEquals(-27, from_json_value(to_json_value(-27)))
Example #25
 def test_string_from(self):
   self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
Example #26
 def test_string_to(self):
   self.assertEquals(JsonValue(string_value='abc'), to_json_value('abc'))
Example #27
 def test_true_from(self):
   self.assertEquals(True, from_json_value(to_json_value(True)))
Example #28
  def __init__(self, packages, options, environment_version):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    sdk_name, version_string = get_sdk_name_and_version()

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(sdk_name)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'PYTHON_STREAMING'
    else:
      job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))
    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      # Default to using the worker harness container image for the current SDK
      # version.
      pool.workerHarnessContainerImage = (
          'dataflow.gcr.io/v1beta3/python:%s' %
          get_required_container_version())
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.iteritems()
                      if v is not None}
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))