Example 1
def translate_mean(accumulator, metric_update):
  if accumulator.count:
    metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
    metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # What it means is we have nothing to report yet, so don't.
    metric_update.kind = None
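A hedged usage sketch for the function above, using Python 3's types.SimpleNamespace for brevity in place of the real accumulator and MetricUpdate protos, and assuming translate_mean and to_json_value are importable:

from types import SimpleNamespace

# Empty accumulator: nothing is reported, so the update's kind is cleared.
empty = SimpleNamespace(sum=0, count=0)
update = SimpleNamespace(kind='mean', meanSum=None, meanCount=None)
translate_mean(empty, update)
assert update.kind is None

# Non-empty accumulator: both mean fields are populated.
full = SimpleNamespace(sum=6, count=3)
update = SimpleNamespace(kind='mean', meanSum=None, meanCount=None)
translate_mean(full, update)
assert update.meanSum is not None and update.meanCount is not None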
Example 2
def translate_mean(accumulator, metric_update):
    if accumulator.count:
        metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
        metric_update.meanCount = to_json_value(accumulator.count,
                                                with_type=True)
    else:
        # A denominator of 0 will raise an error in the service.
        # What it means is we have nothing to report yet, so don't.
        metric_update.kind = None
Example 3
  def insert_rows(self, project_id, dataset_id, table_id, rows):
    """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.

    Returns:
      A tuple (bool, errors). If first element is False then the second element
      will be a bigquery.InsertErrorsValueListEntry instance containing
      specific errors.
    """

    # Prepare rows for insertion. Of special note is the row ID that we add to
    # each row in order to help BigQuery avoid inserting a row multiple times.
    # BigQuery makes a best-effort deduplication attempt when unique IDs are
    # provided; duplicate inserts can otherwise happen during retries on
    # failures.
    # TODO(silviuc): Must add support for writing TableRow objects instead of
    # dicts.
    final_rows = []
    for row in rows:
      json_object = bigquery.JsonObject()
      for k, v in row.iteritems():
        json_object.additionalProperties.append(
            bigquery.JsonObject.AdditionalProperty(
                key=k, value=to_json_value(v)))
      final_rows.append(
          bigquery.TableDataInsertAllRequest.RowsValueListEntry(
              insertId=str(self.unique_row_id),
              json=json_object))
    result, errors = self._insert_all_rows(
        project_id, dataset_id, table_id, final_rows)
    return result, errors
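A hedged usage sketch for insert_rows (the wrapper instance and identifiers below are made-up names, not from the source):

wrapper = BigQueryWrapper()  # hypothetical instance exposing insert_rows
rows = [{'i': 1, 's': 'abc'}, {'i': 2, 's': 'def'}]
result, errors = wrapper.insert_rows(
    'my-project', 'my_dataset', 'my_table', rows)
if not result:
  print('Insert failed with errors: %s' % errors)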
Example 4
  def test_rows_are_written(self):
    client = mock.Mock()
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
        schema=bigquery.TableSchema())
    client.tables.Get.return_value = table
    write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

    insert_response = mock.Mock()
    insert_response.insertErrors = []
    client.tabledata.InsertAll.return_value = insert_response

    sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
    with beam.io.BigQuerySink(
        'project:dataset.table',
        write_disposition=write_disposition).writer(client) as writer:
      writer.Write(sample_row)

    expected_rows = []
    json_object = bigquery.JsonObject()
    for k, v in sample_row.iteritems():
      json_object.additionalProperties.append(
          bigquery.JsonObject.AdditionalProperty(
              key=k, value=to_json_value(v)))
    expected_rows.append(
        bigquery.TableDataInsertAllRequest.RowsValueListEntry(
            insertId='_1',  # First row ID generated with prefix ''
            json=json_object))
    client.tabledata.InsertAll.assert_called_with(
        bigquery.BigqueryTabledataInsertAllRequest(
            projectId='project', datasetId='dataset', tableId='table',
            tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                rows=expected_rows)))
Example 5
  def test_row_as_table_row(self):
    schema_definition = [
        ('s', 'STRING'),
        ('i', 'INTEGER'),
        ('f', 'FLOAT'),
        ('b', 'BOOLEAN'),
        ('r', 'RECORD')]
    data_definition = [
        'abc',
        123,
        123.456,
        True,
        {'a': 'b'}]
    str_def = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
    schema = bigquery.TableSchema(
        fields=[bigquery.TableFieldSchema(name=k, type=v)
                for k, v in schema_definition])
    coder = TableRowJsonCoder(table_schema=schema)
    test_row = bigquery.TableRow(
        f=[bigquery.TableCell(v=to_json_value(e)) for e in data_definition])

    self.assertEqual(str_def, coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    self.assertEqual(
        test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
Example 6
    def insert_rows(self, project_id, dataset_id, table_id, rows):
        """Inserts rows into the specified table.

        Args:
          project_id: The project id owning the table.
          dataset_id: The dataset id owning the table.
          table_id: The table id.
          rows: A list of plain Python dictionaries. Each dictionary is a row and
            each key in it is the name of a field.

        Returns:
          A tuple (bool, errors). If first element is False then the second element
          will be a bigquery.InsertErrorsValueListEntry instance containing
          specific errors.
        """

        # Prepare rows for insertion. Of special note is the row ID that we add to
        # each row in order to help BigQuery avoid inserting a row multiple times.
        # BigQuery makes a best-effort deduplication attempt when unique IDs are
        # provided; duplicate inserts can otherwise happen during retries on
        # failures.
        # TODO(silviuc): Must add support for writing TableRow objects instead of
        # dicts.
        final_rows = []
        for row in rows:
            json_object = bigquery.JsonObject()
            for k, v in row.iteritems():
                json_object.additionalProperties.append(
                    bigquery.JsonObject.AdditionalProperty(
                        key=k, value=to_json_value(v)))
            final_rows.append(
                bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                    insertId=str(self.unique_row_id), json=json_object))
        result, errors = self._insert_all_rows(project_id, dataset_id,
                                               table_id, final_rows)
        return result, errors
Example 7
 def test_row_and_no_schema(self):
   coder = TableRowJsonCoder()
   test_row = bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e))
          for e in ['abc', 123, 123.456, True]])
   with self.assertRaises(AttributeError) as ctx:
     coder.encode(test_row)
   self.assertTrue(
       ctx.exception.message.startswith('The TableRowJsonCoder requires'))
Example 8
 def json_compliance_exception(self, value):
   with self.assertRaises(ValueError) as exn:
     schema_definition = [('f', 'FLOAT')]
     schema = bigquery.TableSchema(
         fields=[bigquery.TableFieldSchema(name=k, type=v)
                 for k, v in schema_definition])
     coder = TableRowJsonCoder(table_schema=schema)
     test_row = bigquery.TableRow(
         f=[bigquery.TableCell(v=to_json_value(value))])
      coder.encode(test_row)
    # The assertion must run after the with block; inside it, the raised
    # ValueError would skip the check (and exn.exception is not yet set).
    self.assertTrue(bigquery.JSON_COMPLIANCE_ERROR in exn.exception.message)
Example 9
    def test_metric_update_basic(self):
        metric_update = dataflow.MetricUpdate()
        metric_update.name = dataflow.MetricStructuredName()
        metric_update.name.name = 'metric1'
        metric_update.name.origin = 'origin1'

        metric_update.cumulative = False
        metric_update.kind = 'sum'
        metric_update.scalar = to_json_value(1, with_type=True)

        name_matcher = message_matchers.MetricStructuredNameMatcher(
            name='metric1', origin='origin1')
        matcher = message_matchers.MetricUpdateMatcher(name=name_matcher,
                                                       kind='sum',
                                                       scalar=1)

        hc.assert_that(metric_update, hc.is_(matcher))

        with self.assertRaises(AssertionError):
            matcher.kind = 'suma'
            hc.assert_that(metric_update, hc.is_(matcher))
Example 10
  def test_metric_update_basic(self):
    metric_update = dataflow.MetricUpdate()
    metric_update.name = dataflow.MetricStructuredName()
    metric_update.name.name = 'metric1'
    metric_update.name.origin = 'origin1'

    metric_update.cumulative = False
    metric_update.kind = 'sum'
    metric_update.scalar = to_json_value(1, with_type=True)

    name_matcher = message_matchers.MetricStructuredNameMatcher(
        name='metric1',
        origin='origin1')
    matcher = message_matchers.MetricUpdateMatcher(
        name=name_matcher,
        kind='sum',
        scalar=1)

    hc.assert_that(metric_update, hc.is_(matcher))

    with self.assertRaises(AssertionError):
      matcher.kind = 'suma'
      hc.assert_that(metric_update, hc.is_(matcher))
Example 11
 def test_false_from(self):
     self.assertEquals(False, from_json_value(to_json_value(False)))
Example 12
 def test_string_from(self):
     self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
Example 13
 def test_float_to(self):
     self.assertEquals(JsonValue(double_value=2.75), to_json_value(2.75))
Example 14
 def test_false_to(self):
     self.assertEquals(JsonValue(boolean_value=False), to_json_value(False))
Example 15
 def test_string_to(self):
     self.assertEquals(JsonValue(string_value='abc'), to_json_value('abc'))
Example 16
def translate_scalar(accumulator, metric_update):
    metric_update.scalar = to_json_value(accumulator.value, with_type=True)
Example 17
 def test_long_value(self):
     self.assertEquals(long(27), from_json_value(to_json_value(long(27))))
Example 18
 def test_none_to(self):
   self.assertEquals(JsonValue(is_null=True), to_json_value(None))
Example 19
 def test_float_to(self):
   self.assertEquals(JsonValue(double_value=2.75), to_json_value(2.75))
Example 20
 def test_int_to(self):
   self.assertEquals(JsonValue(integer_value=14), to_json_value(14))
Example 21
 def test_false_to(self):
   self.assertEquals(JsonValue(boolean_value=False), to_json_value(False))
Example 22
 def test_true_to(self):
   self.assertEquals(JsonValue(boolean_value=True), to_json_value(True))
Example 23
 def test_string_to(self):
   self.assertEquals(JsonValue(string_value='abc'), to_json_value('abc'))
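Taken together, the *_to tests pin down the scalar mapping. A minimal sketch of a to_json_value consistent with them, in the Python 2 idiom of the surrounding code (the real helper lives in the SDK's json_value module and also handles with_type, dicts, and lists; the import path is an assumption):

from apitools.base.py.extra_types import JsonValue

def sketch_to_json_value(obj):
  # bool must be checked before int, since bool is an int subclass.
  if obj is None:
    return JsonValue(is_null=True)
  elif isinstance(obj, bool):
    return JsonValue(boolean_value=obj)
  elif isinstance(obj, (int, long)):
    return JsonValue(integer_value=obj)
  elif isinstance(obj, float):
    return JsonValue(double_value=obj)
  elif isinstance(obj, basestring):
    return JsonValue(string_value=obj)
  raise TypeError('Cannot convert %r to a JsonValue.' % obj)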
Example 24
def translate_scalar(accumulator, metric_update):
  metric_update.scalar = to_json_value(accumulator.value, with_type=True)
Example 25
 def test_float_from(self):
     self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
Example 26
 def test_none_from(self):
     self.assertIsNone(from_json_value(to_json_value(None)))
Example 27
 def test_string_from(self):
   self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
Example 28
    def __init__(self, packages, options, environment_version):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        sdk_name, version_string = get_sdk_name_and_version()

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(sdk_name)),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(version_string))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        if self.standard_options.streaming:
            job_type = 'PYTHON_STREAMING'
        else:
            job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))
        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            # Default to using the worker harness container image for the current SDK
            # version.
            pool.workerHarnessContainerImage = (
                'dataflow.gcr.io/v1beta3/python:%s' %
                get_required_container_version())
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.iteritems() if v is not None
            }
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
Example 29
 def test_true_from(self):
   self.assertEquals(True, from_json_value(to_json_value(True)))
Example 30
 def decode(self, encoded_table_row):
   od = json.loads(
       encoded_table_row, object_pairs_hook=collections.OrderedDict)
   return bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()])
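The object_pairs_hook is what keeps this decoder correct: TableRow cells are positional, so the JSON key order must be preserved for cell i to line up with schema field i. A standalone illustration:

import collections
import json

od = json.loads('{"s": "abc", "i": 123}',
                object_pairs_hook=collections.OrderedDict)
print(list(od.values()))  # ['abc', 123] -- original column order preserved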
Example 31
 def test_false_from(self):
   self.assertEquals(False, from_json_value(to_json_value(False)))
Example 32
 def test_true_to(self):
     self.assertEquals(JsonValue(boolean_value=True), to_json_value(True))
Example 33
 def test_int_from(self):
   self.assertEquals(-27, from_json_value(to_json_value(-27)))
Example 34
 def test_int_to(self):
     self.assertEquals(JsonValue(integer_value=14), to_json_value(14))
Example 35
 def test_float_from(self):
   self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
Example 36
 def test_none_to(self):
     self.assertEquals(JsonValue(is_null=True), to_json_value(None))
Example 37
 def test_with_type(self):
   rt = from_json_value(to_json_value('abcd', with_type=True))
   self.assertEquals('http://schema.org/Text', rt['@type'])
   self.assertEquals('abcd', rt['value'])
Example 38
 def test_true_from(self):
     self.assertEquals(True, from_json_value(to_json_value(True)))
Example 39
 def test_none_from(self):
   self.assertIsNone(from_json_value(to_json_value(None)))
Example 40
 def test_int_from(self):
     self.assertEquals(-27, from_json_value(to_json_value(-27)))
Example 41
  def __init__(self, packages, options, environment_version):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    sdk_name, version_string = get_sdk_name_and_version()

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(sdk_name)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'PYTHON_STREAMING'
    else:
      job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))
    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      # Default to using the worker harness container image for the current SDK
      # version.
      pool.workerHarnessContainerImage = (
          'dataflow.gcr.io/v1beta3/python:%s' %
          get_required_container_version())
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.iteritems()
                      if v is not None}
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))
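The tempStoragePrefix and package locations above are built by plain string replacement of the 'gs:/' prefix with the storage API service constant. A sketch with an assumed constant value (the real one comes from GoogleCloudOptions.STORAGE_API_SERVICE):

STORAGE_API_SERVICE = 'storage.googleapis.com'  # assumed value, for illustration
temp_location = 'gs://my-bucket/tmp'
print(temp_location.replace('gs:/', STORAGE_API_SERVICE))
# -> storage.googleapis.com/my-bucket/tmp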
Example 42
 def test_with_type(self):
     rt = from_json_value(to_json_value('abcd', with_type=True))
     self.assertEquals('http://schema.org/Text', rt['@type'])
     self.assertEquals('abcd', rt['value'])
Example 43
 def test_long_value(self):
   self.assertEquals(long(27), from_json_value(to_json_value(long(27))))
Example 44
 def test_large_integer(self):
     num = 1 << 35
     self.assertEquals(num, from_json_value(to_json_value(num)))
     self.assertEquals(long(num), from_json_value(to_json_value(long(num))))
Example 45
 def test_too_long_value(self):
   with self.assertRaises(TypeError):
     to_json_value(long(1 << 64))
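These two tests bracket the integer range: anything that fits in a signed 64-bit integer round-trips, anything wider raises. A sketch of the bounds check they imply (the actual check lives inside to_json_value):

_INT64_MAX = (1 << 63) - 1
_INT64_MIN = -(1 << 63)

def check_int64(value):
  # 1 << 35 passes; 1 << 64 raises, matching the tests above.
  if not _INT64_MIN <= value <= _INT64_MAX:
    raise TypeError('Can not encode %d as a 64-bit integer.' % value)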
Example 46
 def test_too_long_value(self):
     with self.assertRaises(TypeError):
         to_json_value(long(1 << 64))
Example 47
 def add_property(self, name, value, with_type=False):
   self._additional_properties.append((name, value, with_type))
   self.proto.properties.additionalProperties.append(
       dataflow.Step.PropertiesValue.AdditionalProperty(
           key=name, value=to_json_value(value, with_type=with_type)))
Example 48
 def add_property(self, name, value, with_type=False):
     self._additional_properties.append((name, value, with_type))
     self.proto.properties.additionalProperties.append(
         dataflow.Step.PropertiesValue.AdditionalProperty(
             key=name, value=to_json_value(value, with_type=with_type)))
Example 49
 def decode(self, encoded_table_row):
     od = json.loads(encoded_table_row,
                     object_pairs_hook=collections.OrderedDict)
     return bigquery.TableRow(f=[
         bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()
     ])
Example 50
 def test_large_integer(self):
   num = 1 << 35
   self.assertEquals(num, from_json_value(to_json_value(num)))
   self.assertEquals(long(num), from_json_value(to_json_value(long(num))))
Example 51
  def get_test_rows(self):
    now = time.time()
    dt = datetime.datetime.utcfromtimestamp(float(now))
    ts = dt.strftime('%Y-%m-%d %H:%M:%S.%f UTC')
    expected_rows = [
        {
            'i': 1,
            's': 'abc',
            'f': 2.3,
            'b': True,
            't': ts,
            'dt': '2016-10-31',
            'ts': '22:39:12.627498',
            'dt_ts': '2008-12-25T07:30:00',
            'r': {'s2': 'b'},
            'rpr': [{'s3': 'c', 'rpr2': [{'rs': ['d', 'e'], 's4': None}]}]
        },
        {
            'i': 10,
            's': 'xyz',
            'f': -3.14,
            'b': False,
            'rpr': [],
            't': None,
            'dt': None,
            'ts': None,
            'dt_ts': None,
            'r': None,
        }]

    nested_schema = [
        bigquery.TableFieldSchema(
            name='s2', type='STRING', mode='NULLABLE')]
    nested_schema_2 = [
        bigquery.TableFieldSchema(
            name='s3', type='STRING', mode='NULLABLE'),
        bigquery.TableFieldSchema(
            name='rpr2', type='RECORD', mode='REPEATED', fields=[
                bigquery.TableFieldSchema(
                    name='rs', type='STRING', mode='REPEATED'),
                bigquery.TableFieldSchema(
                    name='s4', type='STRING', mode='NULLABLE')])]

    schema = bigquery.TableSchema(
        fields=[
            bigquery.TableFieldSchema(
                name='b', type='BOOLEAN', mode='REQUIRED'),
            bigquery.TableFieldSchema(
                name='f', type='FLOAT', mode='REQUIRED'),
            bigquery.TableFieldSchema(
                name='i', type='INTEGER', mode='REQUIRED'),
            bigquery.TableFieldSchema(
                name='s', type='STRING', mode='REQUIRED'),
            bigquery.TableFieldSchema(
                name='t', type='TIMESTAMP', mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='dt', type='DATE', mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='ts', type='TIME', mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='dt_ts', type='DATETIME', mode='NULLABLE'),
            bigquery.TableFieldSchema(
                name='r', type='RECORD', mode='NULLABLE',
                fields=nested_schema),
            bigquery.TableFieldSchema(
                name='rpr', type='RECORD', mode='REPEATED',
                fields=nested_schema_2)])

    table_rows = [
        bigquery.TableRow(f=[
            bigquery.TableCell(v=to_json_value('true')),
            bigquery.TableCell(v=to_json_value(str(2.3))),
            bigquery.TableCell(v=to_json_value(str(1))),
            bigquery.TableCell(v=to_json_value('abc')),
            # For timestamps we cannot use str() because it would truncate the
            # number representing the timestamp.
            bigquery.TableCell(v=to_json_value('%f' % now)),
            bigquery.TableCell(v=to_json_value('2016-10-31')),
            bigquery.TableCell(v=to_json_value('22:39:12.627498')),
            bigquery.TableCell(v=to_json_value('2008-12-25T07:30:00')),
            # For records we cannot use a dict because it doesn't create nested
            # schemas correctly, so we have to use this f/v-based format.
            bigquery.TableCell(v=to_json_value({'f': [{'v': 'b'}]})),
            bigquery.TableCell(v=to_json_value([{'v':{'f':[{'v': 'c'}, {'v':[
                {'v':{'f':[{'v':[{'v':'d'}, {'v':'e'}]}, {'v':None}]}}]}]}}]))
            ]),
        bigquery.TableRow(f=[
            bigquery.TableCell(v=to_json_value('false')),
            bigquery.TableCell(v=to_json_value(str(-3.14))),
            bigquery.TableCell(v=to_json_value(str(10))),
            bigquery.TableCell(v=to_json_value('xyz')),
            bigquery.TableCell(v=None),
            bigquery.TableCell(v=None),
            bigquery.TableCell(v=None),
            bigquery.TableCell(v=None),
            bigquery.TableCell(v=None),
            bigquery.TableCell(v=to_json_value([]))])]
    return table_rows, schema, expected_rows
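The f/v shape used for the record cells above mirrors BigQuery's REST row encoding: a record is an object with an 'f' list of fields, each field an object holding its value under 'v'. For instance, the nested dict {'s2': 'b'} from expected_rows is supplied as:

record_cell = {'f': [{'v': 'b'}]}             # record with one field, value 'b'
repeated_cell = [{'v': {'f': [{'v': 'c'}]}}]  # REPEATED record: list of {'v': record}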