Example #1
def translate_mean(accumulator, metric_update):
  if accumulator.count:
    metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
    metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service. It also means we
    # have nothing to report yet, so report nothing.
    metric_update.kind = None
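A hedged sketch of how the translate_mean above might be exercised, using throwaway stand-in classes for the accumulator and the metric update; it assumes translate_mean (and the to_json_value it calls) is already in scope from the snippet above, and only the sum/count representation follows from that code.

class FakeAccumulator(object):
  """Hypothetical accumulator carrying a running sum and count."""
  def __init__(self, total, count):
    self.sum = total
    self.count = count

class FakeMetricUpdate(object):
  """Hypothetical stand-in for a MetricUpdate with only the fields used here."""
  kind = 'mean'
  meanSum = None
  meanCount = None

update = FakeMetricUpdate()
translate_mean(FakeAccumulator(total=10, count=4), update)
# The mean is reported as a (sum, count) pair; the service divides them itself.
# With count == 0, kind would be cleared instead and nothing reported.
print(update.meanSum, update.meanCount)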
Example #2
  def insert_rows(self, project_id, dataset_id, table_id, rows):
    """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.

    Returns:
      A tuple (bool, errors). If the first element is False then the second
      element will be a bigquery.InsertErrorsValueListEntry instance containing
      the specific errors.
    """

    # Prepare rows for insertion. Of special note is the row ID that we add to
    # each row in order to help BigQuery avoid inserting a row multiple times,
    # which can happen during retries on failures. BigQuery de-duplicates rows
    # on a best-effort basis when unique insert IDs are provided.
    # TODO(silviuc): Add support for writing TableRow's instead of dicts.
    final_rows = []
    for row in rows:
      json_object = bigquery.JsonObject()
      for k, v in row.iteritems():
        json_object.additionalProperties.append(
            bigquery.JsonObject.AdditionalProperty(
                key=k, value=to_json_value(v)))
      final_rows.append(
          bigquery.TableDataInsertAllRequest.RowsValueListEntry(
              insertId=str(self.unique_row_id),
              json=json_object))
    result, errors = self._insert_all_rows(
        project_id, dataset_id, table_id, final_rows)
    return result, errors
  def test_row_as_table_row(self):
    schema_definition = [
        ('s', 'STRING'),
        ('i', 'INTEGER'),
        ('f', 'FLOAT'),
        ('b', 'BOOLEAN'),
        ('r', 'RECORD')]
    data_definition = [
        'abc',
        123,
        123.456,
        True,
        {'a': 'b'}]
    str_def = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
    schema = bigquery.TableSchema(
        fields=[bigquery.TableFieldSchema(name=k, type=v)
                for k, v in schema_definition])
    coder = TableRowJsonCoder(table_schema=schema)
    test_row = bigquery.TableRow(
        f=[bigquery.TableCell(v=to_json_value(e)) for e in data_definition])

    self.assertEqual(str_def, coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    self.assertEqual(
        test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
Example #4
  def test_rows_are_written(self):
    client = mock.Mock()
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
        schema=bigquery.TableSchema())
    client.tables.Get.return_value = table
    write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

    insert_response = mock.Mock()
    insert_response.insertErrors = []
    client.tabledata.InsertAll.return_value = insert_response

    with beam.io.BigQuerySink(
        'project:dataset.table',
        write_disposition=write_disposition).writer(client) as writer:
      writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

    sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
    expected_rows = []
    json_object = bigquery.JsonObject()
    for k, v in iteritems(sample_row):
      json_object.additionalProperties.append(
          bigquery.JsonObject.AdditionalProperty(
              key=k, value=to_json_value(v)))
    expected_rows.append(
        bigquery.TableDataInsertAllRequest.RowsValueListEntry(
            insertId='_1',  # First row ID generated with prefix ''
            json=json_object))
    client.tabledata.InsertAll.assert_called_with(
        bigquery.BigqueryTabledataInsertAllRequest(
            projectId='project', datasetId='dataset', tableId='table',
            tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                rows=expected_rows)))
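This test builds its expected rows through the same additionalProperties pattern used by insert_rows above: the generated client messages model a JSON object as a repeated list of (key, value) AdditionalProperty pairs rather than a Python dict. A minimal standalone sketch of that pattern, assuming the two import paths below match the Beam-bundled clients used in these examples:

from apache_beam.internal.gcp.json_value import to_json_value
from apache_beam.io.gcp.internal.clients import bigquery

row = {'name': 'alice', 'score': 3}
json_object = bigquery.JsonObject()
for key, value in row.items():
  # Each dict entry becomes one AdditionalProperty message appended to the list.
  json_object.additionalProperties.append(
      bigquery.JsonObject.AdditionalProperty(key=key, value=to_json_value(value)))

# Reading the map back means scanning the repeated field, not indexing a dict.
as_dict = {prop.key: prop.value for prop in json_object.additionalProperties}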
 def test_row_and_no_schema(self):
   coder = TableRowJsonCoder()
   test_row = bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e))
          for e in ['abc', 123, 123.456, True]])
   with self.assertRaisesRegexp(AttributeError,
                                r'^The TableRowJsonCoder requires'):
     coder.encode(test_row)
 def test_row_and_no_schema(self):
   coder = TableRowJsonCoder()
   test_row = bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e))
          for e in ['abc', 123, 123.456, True]])
   with self.assertRaises(AttributeError) as ctx:
     coder.encode(test_row)
   self.assertTrue(
       ctx.exception.message.startswith('The TableRowJsonCoder requires'))
Example #7
 def test_row_and_no_schema(self):
     coder = TableRowJsonCoder()
     test_row = bigquery.TableRow(f=[
         bigquery.TableCell(v=to_json_value(e))
         for e in ['abc', 123, 123.456, True]
     ])
     with self.assertRaisesRegex(AttributeError,
                                 r'^The TableRowJsonCoder requires'):
         coder.encode(test_row)
Example #9
 def json_compliance_exception(self, value):
   with self.assertRaisesRegexp(ValueError, re.escape(JSON_COMPLIANCE_ERROR)):
     schema_definition = [('f', 'FLOAT')]
     schema = bigquery.TableSchema(
         fields=[bigquery.TableFieldSchema(name=k, type=v)
                 for k, v in schema_definition])
     coder = TableRowJsonCoder(table_schema=schema)
     test_row = bigquery.TableRow(
         f=[bigquery.TableCell(v=to_json_value(value))])
     coder.encode(test_row)
Example #10
 def test_row_and_no_schema(self):
     coder = TableRowJsonCoder()
     test_row = bigquery.TableRow(f=[
         bigquery.TableCell(v=to_json_value(e))
         for e in ['abc', 123, 123.456, True]
     ])
     with self.assertRaises(AttributeError) as ctx:
         coder.encode(test_row)
     self.assertTrue(
         ctx.exception.message.startswith('The TableRowJsonCoder requires'))
Example #11
 def _convert_to_json_row(self, row):
   json_object = bigquery.JsonObject()
   for k, v in iteritems(row):
     if isinstance(v, decimal.Decimal):
       # Decimal values are converted to strings because JSON cannot represent
       # the precision that decimal supports. BigQuery can handle inserts into
       # NUMERIC columns when it receives the values as JSON string attributes.
       v = str(v)
     json_object.additionalProperties.append(
         bigquery.JsonObject.AdditionalProperty(
             key=k, value=to_json_value(v)))
   return json_object
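The decimal branch above exists because a plain JSON number cannot carry full NUMERIC precision; a small standard-library-only illustration of the difference:

import decimal
import json

value = decimal.Decimal('123456789.000000001')

# Going through float drops digits that a NUMERIC column can store.
print(json.dumps(float(value)))   # 123456789.0
# The string form keeps every digit; BigQuery parses it server-side for NUMERIC.
print(json.dumps(str(value)))     # "123456789.000000001"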
Example #12
    def insert_rows(self,
                    project_id,
                    dataset_id,
                    table_id,
                    rows,
                    skip_invalid_rows=False):
        """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.
      skip_invalid_rows: Whether rows with insertion errors should be skipped so
        that all other rows can still be inserted successfully.

    Returns:
      A tuple (bool, errors). If the first element is False then the second
      element will be a bigquery.InsertErrorsValueListEntry instance containing
      the specific errors.
    """

        # Prepare rows for insertion. Of special note is the row ID that we add to
        # each row in order to help BigQuery avoid inserting a row multiple times,
        # which can happen during retries on failures. BigQuery de-duplicates rows
        # on a best-effort basis when unique insert IDs are provided.
        # TODO(silviuc): Add support for writing TableRow's instead of dicts.
        final_rows = []
        for row in rows:
            json_object = bigquery.JsonObject()
            for k, v in iteritems(row):
                if isinstance(v, decimal.Decimal):
                    # Decimal values are converted to strings because JSON cannot
                    # represent the precision that decimal supports. BigQuery can
                    # handle inserts into NUMERIC columns when it receives the
                    # values as JSON string attributes.
                    v = str(v)
                json_object.additionalProperties.append(
                    bigquery.JsonObject.AdditionalProperty(
                        key=k, value=to_json_value(v)))
            final_rows.append(
                bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                    insertId=str(self.unique_row_id), json=json_object))
        result, errors = self._insert_all_rows(project_id, dataset_id,
                                               table_id, final_rows,
                                               skip_invalid_rows)
        return result, errors
Example #13
    def test_row_as_table_row(self):
        schema_definition = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                             ('b', 'BOOLEAN'), ('r', 'RECORD')]
        data_definition = ['abc', 123, 123.456, True, {'a': 'b'}]
        str_def = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
        schema = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(name=k, type=v)
            for k, v in schema_definition
        ])
        coder = TableRowJsonCoder(table_schema=schema)
        test_row = bigquery.TableRow(f=[
            bigquery.TableCell(v=to_json_value(e)) for e in data_definition
        ])

        self.assertEqual(str_def, coder.encode(test_row))
        self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
        # A coder without schema can still decode.
        self.assertEqual(test_row,
                         TableRowJsonCoder().decode(coder.encode(test_row)))
  def insert_rows(self, project_id, dataset_id, table_id, rows,
                  skip_invalid_rows=False):
    """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.
      skip_invalid_rows: Whether rows with insertion errors should be skipped so
        that all other rows can still be inserted successfully.

    Returns:
      A tuple (bool, errors). If the first element is False then the second
      element will be a bigquery.InsertErrorsValueListEntry instance containing
      the specific errors.
    """

    # Prepare rows for insertion. Of special note is the row ID that we add to
    # each row in order to help BigQuery avoid inserting a row multiple times,
    # which can happen during retries on failures. BigQuery de-duplicates rows
    # on a best-effort basis when unique insert IDs are provided.
    # TODO(silviuc): Add support for writing TableRow's instead of dicts.
    final_rows = []
    for row in rows:
      json_object = bigquery.JsonObject()
      for k, v in iteritems(row):
        if isinstance(v, decimal.Decimal):
          # Decimal values are converted to strings because JSON cannot
          # represent the precision that decimal supports. BigQuery can handle
          # inserts into NUMERIC columns when it receives the values as JSON
          # string attributes.
          v = str(v)
        json_object.additionalProperties.append(
            bigquery.JsonObject.AdditionalProperty(
                key=k, value=to_json_value(v)))
      final_rows.append(
          bigquery.TableDataInsertAllRequest.RowsValueListEntry(
              insertId=str(self.unique_row_id),
              json=json_object))
    result, errors = self._insert_all_rows(
        project_id, dataset_id, table_id, final_rows, skip_invalid_rows)
    return result, errors
Example #15
  def test_metric_update_basic(self):
    metric_update = dataflow.MetricUpdate()
    metric_update.name = dataflow.MetricStructuredName()
    metric_update.name.name = 'metric1'
    metric_update.name.origin = 'origin1'

    metric_update.cumulative = False
    metric_update.kind = 'sum'
    metric_update.scalar = to_json_value(1, with_type=True)

    name_matcher = message_matchers.MetricStructuredNameMatcher(
        name='metric1', origin='origin1')
    matcher = message_matchers.MetricUpdateMatcher(
        name=name_matcher, kind='sum', scalar=1)

    hc.assert_that(metric_update, hc.is_(matcher))

    with self.assertRaises(AssertionError):
      matcher.kind = 'suma'
      hc.assert_that(metric_update, hc.is_(matcher))
  def test_metric_update_basic(self):
    metric_update = dataflow.MetricUpdate()
    metric_update.name = dataflow.MetricStructuredName()
    metric_update.name.name = 'metric1'
    metric_update.name.origin = 'origin1'

    metric_update.cumulative = False
    metric_update.kind = 'sum'
    metric_update.scalar = to_json_value(1, with_type=True)

    name_matcher = message_matchers.MetricStructuredNameMatcher(
        name='metric1',
        origin='origin1')
    matcher = message_matchers.MetricUpdateMatcher(
        name=name_matcher,
        kind='sum',
        scalar=1)

    hc.assert_that(metric_update, hc.is_(matcher))

    with self.assertRaises(AssertionError):
      matcher.kind = 'suma'
      hc.assert_that(metric_update, hc.is_(matcher))
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

        insert_response = mock.Mock()
        insert_response.insertErrors = []
        client.tabledata.InsertAll.return_value = insert_response

        with beam.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        expected_rows = []
        json_object = bigquery.JsonObject()
        for k, v in iteritems(sample_row):
            json_object.additionalProperties.append(
                bigquery.JsonObject.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
        expected_rows.append(
            bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                insertId='_1',  # First row ID generated with prefix ''
                json=json_object))
        client.tabledata.InsertAll.assert_called_with(
            bigquery.BigqueryTabledataInsertAllRequest(
                projectId='project',
                datasetId='dataset',
                tableId='table',
                tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                    rows=expected_rows,
                    skipInvalidRows=False,
                )))
 def value_or_decimal_to_json(val):
   if isinstance(val, decimal.Decimal):
     return to_json_value(str(val))
   else:
     return to_json_value(val)
 def test_string_from(self):
   self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
 def test_true_from(self):
   self.assertEquals(True, from_json_value(to_json_value(True)))
 def test_long_value(self):
     num = (1 << 63) - 1
     self.assertEqual(num, from_json_value(to_json_value(num)))
 def test_none_from(self):
     self.assertIsNone(from_json_value(to_json_value(None)))
 def test_float_from(self):
     self.assertEqual(4.5, from_json_value(to_json_value(4.5)))
 def test_none_to(self):
   self.assertEquals(JsonValue(is_null=True), to_json_value(None))
 def test_too_long_value(self):
   with self.assertRaises(TypeError):
     to_json_value(1 << 64)
 def test_large_integer(self):
   num = 1 << 35
   self.assertEquals(num, from_json_value(to_json_value(num)))
 def test_long_value(self):
   num = (1 << 63) - 1
   self.assertEquals(num, from_json_value(to_json_value(num)))
 def test_none_from(self):
   self.assertIsNone(from_json_value(to_json_value(None)))
 def test_with_type(self):
   rt = from_json_value(to_json_value('abcd', with_type=True))
   self.assertEquals('http://schema.org/Text', rt['@type'])
   self.assertEquals('abcd', rt['value'])
 def test_float_from(self):
   self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
 def test_int_from(self):
   self.assertEquals(-27, from_json_value(to_json_value(-27)))
 def test_true_to(self):
     self.assertEqual(JsonValue(boolean_value=True), to_json_value(True))
Example #33
 def value_or_decimal_to_json(val):
     if isinstance(val, decimal.Decimal):
         return to_json_value(str(val))
     else:
         return to_json_value(val)
 def test_false_to(self):
     self.assertEqual(JsonValue(boolean_value=False), to_json_value(False))
 def test_int_from(self):
     self.assertEqual(-27, from_json_value(to_json_value(-27)))
 def test_int_to(self):
     self.assertEqual(JsonValue(integer_value=14), to_json_value(14))
 def test_with_type(self):
     rt = from_json_value(to_json_value('abcd', with_type=True))
     self.assertEqual('http://schema.org/Text', rt['@type'])
     self.assertEqual('abcd', rt['value'])
 def test_float_to(self):
     self.assertEqual(JsonValue(double_value=2.75), to_json_value(2.75))
 def test_large_integer(self):
     num = 1 << 35
     self.assertEqual(num, from_json_value(to_json_value(num)))
 def test_static_value_provider_to(self):
     svp = StaticValueProvider(str, 'abc')
     self.assertEqual(JsonValue(string_value=svp.value), to_json_value(svp))
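The small tests above all exercise the same round trip; a minimal sketch of that behaviour outside unittest, assuming to_json_value and from_json_value are importable from apache_beam.internal.gcp.json_value:

from apache_beam.internal.gcp.json_value import from_json_value, to_json_value

for value in ['WXYZ', True, -27, 4.5, None]:
  json_value = to_json_value(value)            # wraps the Python value in a JsonValue
  assert from_json_value(json_value) == value  # and unwraps it back unchanged

# with_type=True produces a typed representation instead of a bare value.
typed = from_json_value(to_json_value('abcd', with_type=True))
assert typed['@type'] == 'http://schema.org/Text'
assert typed['value'] == 'abcd'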
Example #41
 def decode(self, encoded_table_row):
   od = json.loads(
       encoded_table_row, object_pairs_hook=collections.OrderedDict)
   return bigquery.TableRow(
       f=[bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()])
Example #42
def translate_scalar(accumulator, metric_update):
    metric_update.scalar = to_json_value(accumulator.value, with_type=True)
 def test_static_value_provider_to(self):
   svp = StaticValueProvider(str, 'abc')
   self.assertEquals(JsonValue(string_value=svp.value), to_json_value(svp))
Example #44
    def __init__(self, packages, options, environment_version, pipeline_url):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.pipeline_url = pipeline_url
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(self._get_python_sdk_name())),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(beam_version.__version__))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        _verify_interpreter_version_is_supported(options)
        if self.standard_options.streaming:
            job_type = 'FNAPI_STREAMING'
        else:
            if _use_fnapi(options):
                job_type = 'FNAPI_BATCH'
            else:
                job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # TODO: Use enumerated type instead of strings for job types.
        if job_type.startswith('FNAPI_'):
            runner_harness_override = (get_runner_harness_container_image())
            self.debug_options.experiments = self.debug_options.experiments or []
            if runner_harness_override:
                self.debug_options.experiments.append(
                    'runner_harness_container_image=' +
                    runner_harness_override)
            # Add the use_multiple_sdk_containers flag if it's not already present. Do not
            # add the flag if 'no_use_multiple_sdk_containers' is present.
            # TODO: Cleanup use_multiple_sdk_containers once we deprecate Python SDK
            # till version 2.4.
            debug_options_experiments = self.debug_options.experiments
            if ('use_multiple_sdk_containers' not in debug_options_experiments
                    and 'no_use_multiple_sdk_containers'
                    not in debug_options_experiments):
                self.debug_options.experiments.append(
                    'use_multiple_sdk_containers')
        # FlexRS
        if self.google_cloud_options.flexrs_goal == 'COST_OPTIMIZED':
            self.proto.flexResourceSchedulingGoal = (
                dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
                FLEXRS_COST_OPTIMIZED)
        elif self.google_cloud_options.flexrs_goal == 'SPEED_OPTIMIZED':
            self.proto.flexResourceSchedulingGoal = (
                dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
                FLEXRS_SPEED_OPTIMIZED)
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))

        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.subnetwork:
            pool.subnetwork = self.worker_options.subnetwork
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            pool.workerHarnessContainerImage = (
                get_default_container_image_for_current_sdk(job_type))
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.items() if v is not None
            }
            options_dict["pipelineUrl"] = pipeline_url
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
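At the end of the constructor above, the whole pipeline-options dict and the display-data list are serialized with to_json_value; a minimal sketch of that step, with the import path assumed and the values invented purely for illustration:

from apache_beam.internal.gcp.json_value import to_json_value

options_dict = {'project': 'my-project', 'streaming': False}            # illustrative only
display_items = [{'key': 'temp_location', 'value': 'gs://bucket/tmp'}]  # illustrative only

# to_json_value accepts nested dicts and lists as well as scalars, so whole
# structures can be attached as a single AdditionalProperty value.
json_options = to_json_value(options_dict)
json_display = to_json_value(display_items)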
 def test_float_to(self):
   self.assertEquals(JsonValue(double_value=2.75), to_json_value(2.75))
Example #46
  def __init__(self, packages, options, environment_version, pipeline_url):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.pipeline_url = pipeline_url
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(names.BEAM_SDK_NAME)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(beam_version.__version__))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'FNAPI_STREAMING'
    else:
      if _use_fnapi(options):
        job_type = 'FNAPI_BATCH'
      else:
        job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # TODO: Use enumerated type instead of strings for job types.
    if job_type.startswith('FNAPI_'):
      runner_harness_override = (
          get_runner_harness_container_image())
      self.debug_options.experiments = self.debug_options.experiments or []
      if runner_harness_override:
        self.debug_options.experiments.append(
            'runner_harness_container_image=' + runner_harness_override)
      # Add the use_multiple_sdk_containers flag if it's not already present. Do not
      # add the flag if 'no_use_multiple_sdk_containers' is present.
      # TODO: Cleanup use_multiple_sdk_containers once we deprecate Python SDK
      # till version 2.4.
      if ('use_multiple_sdk_containers' not in self.proto.experiments and
          'no_use_multiple_sdk_containers' not in self.proto.experiments):
        self.debug_options.experiments.append('use_multiple_sdk_containers')
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))

    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.subnetwork:
      pool.subnetwork = self.worker_options.subnetwork
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      pool.workerHarnessContainerImage = (
          get_default_container_image_for_current_sdk(job_type))
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.items()
                      if v is not None}
      options_dict["pipelineUrl"] = pipeline_url
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))
Example #47
    def __init__(self, packages, options, environment_version):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        sdk_name, version_string = get_sdk_name_and_version()

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(sdk_name)),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(version_string))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        if self.standard_options.streaming:
            job_type = 'PYTHON_STREAMING'
        else:
            job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))
        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            # Default to using the worker harness container image for the current SDK
            # version.
            pool.workerHarnessContainerImage = (
                'dataflow.gcr.io/v1beta3/python:%s' %
                get_required_container_version())
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.iteritems() if v is not None
            }
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
 def test_false_from(self):
   self.assertEquals(False, from_json_value(to_json_value(False)))
 def test_int_to(self):
   self.assertEquals(JsonValue(integer_value=14), to_json_value(14))
Example #50
  def __init__(self, packages, options, environment_version):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    sdk_name, version_string = get_sdk_name_and_version()

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(sdk_name)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'PYTHON_STREAMING'
    else:
      job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))
    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      # Default to using the worker harness container image for the current SDK
      # version.
      pool.workerHarnessContainerImage = (
          'dataflow.gcr.io/v1beta3/python:%s' %
          get_required_container_version())
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.iteritems()
                      if v is not None}
      options_dict['_options_id'] = options._options_id
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))
Example #51
 def add_property(self, name, value, with_type=False):
     self._additional_properties.append((name, value, with_type))
     self.proto.properties.additionalProperties.append(
         dataflow.Step.PropertiesValue.AdditionalProperty(
             key=name, value=to_json_value(value, with_type=with_type)))
Example #52
def translate_scalar(accumulator, metric_update):
  metric_update.scalar = to_json_value(accumulator.value, with_type=True)
Example #53
 def decode(self, encoded_table_row):
     od = json.loads(encoded_table_row,
                     object_pairs_hook=collections.OrderedDict)
     return bigquery.TableRow(
         f=[bigquery.TableCell(v=to_json_value(e)) for e in itervalues(od)])
 def test_false_from(self):
     self.assertEqual(False, from_json_value(to_json_value(False)))
 def test_none_to(self):
     self.assertEqual(JsonValue(is_null=True), to_json_value(None))
 def test_runtime_value_provider_to(self):
   RuntimeValueProvider.runtime_options = None
   rvp = RuntimeValueProvider('arg', 123, int)
   self.assertEquals(JsonValue(is_null=True), to_json_value(rvp))
Example #57
 def add_property(self, name, value, with_type=False):
   self._additional_properties.append((name, value, with_type))
   self.proto.properties.additionalProperties.append(
       dataflow.Step.PropertiesValue.AdditionalProperty(
           key=name, value=to_json_value(value, with_type=with_type)))
 def test_string_from(self):
     self.assertEqual('WXYZ', from_json_value(to_json_value('WXYZ')))
 def test_runtime_value_provider_to(self):
     RuntimeValueProvider.set_runtime_options(None)
     rvp = RuntimeValueProvider('arg', 123, int)
     self.assertEqual(JsonValue(is_null=True), to_json_value(rvp))
     # Reset runtime options to avoid side-effects in other tests.
     RuntimeValueProvider.set_runtime_options(None)
 def test_true_from(self):
     self.assertEqual(True, from_json_value(to_json_value(True)))