Example #1
  def process(self, element, job_name_prefix=None):
    destination = element[0]
    job_reference = element[1]

    copy_to_reference = bigquery_tools.parse_table_reference(destination)
    if copy_to_reference.projectId is None:
      copy_to_reference.projectId = vp.RuntimeValueProvider.get_value('project',
                                                                      str, '')

    copy_from_reference = bigquery_tools.parse_table_reference(destination)
    copy_from_reference.tableId = job_reference.jobId
    if copy_from_reference.projectId is None:
      copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    copy_job_name = '%s_copy_%s_to_%s' % (
        job_name_prefix,
        _bq_uuid('%s:%s.%s' % (copy_from_reference.projectId,
                               copy_from_reference.datasetId,
                               copy_from_reference.tableId)),
        _bq_uuid('%s:%s.%s' % (copy_to_reference.projectId,
                               copy_to_reference.datasetId,
                               copy_to_reference.tableId)))

    _LOGGER.info("Triggering copy job from %s to %s",
                 copy_from_reference, copy_to_reference)
    job_reference = self.bq_wrapper._insert_copy_job(
        copy_to_reference.projectId,
        copy_job_name,
        copy_from_reference,
        copy_to_reference,
        create_disposition=self.create_disposition,
        write_disposition=self.write_disposition)

    yield (destination, job_reference)
Example #2
  def test_value_provider_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    schema = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    additional_bq_parameters = {
        'timePartitioning': {'type': 'DAY'},
        'clustering': {'fields': ['language']}}

    table_ref = bigquery_tools.parse_table_reference(output_table_1)
    table_ref2 = bigquery_tools.parse_table_reference(output_table_2)

    pipeline_verifiers = [
        BigQueryTableMatcher(
            project=self.project,
            dataset=table_ref.datasetId,
            table=table_ref.tableId,
            expected_properties=additional_bq_parameters),
        BigQueryTableMatcher(
            project=self.project,
            dataset=table_ref2.datasetId,
            table=table_ref2.tableId,
            expected_properties=additional_bq_parameters),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_2,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

      _ = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_1)),
               schema=value_provider.StaticValueProvider(dict, schema),
               additional_bq_parameters=additional_bq_parameters,
               method='STREAMING_INSERTS'))
      _ = (input
           | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_2)),
               schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
               additional_bq_parameters=lambda _: additional_bq_parameters,
               method='FILE_LOADS'))
Example #3
    def process(self,
                element,
                job_name_prefix=None,
                unused_schema_mod_jobs=None):
        destination = element[0]
        job_reference = element[1]

        copy_to_reference = bigquery_tools.parse_table_reference(destination)
        if copy_to_reference.projectId is None:
            copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        copy_from_reference = bigquery_tools.parse_table_reference(destination)
        copy_from_reference.tableId = job_reference.jobId
        if copy_from_reference.projectId is None:
            copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        copy_job_name = '%s_%s' % (job_name_prefix,
                                   _bq_uuid('%s:%s.%s' %
                                            (copy_from_reference.projectId,
                                             copy_from_reference.datasetId,
                                             copy_from_reference.tableId)))

        _LOGGER.info("Triggering copy job from %s to %s", copy_from_reference,
                     copy_to_reference)
        if copy_to_reference.tableId not in self._observed_tables:
            # When the write_disposition for a job is WRITE_TRUNCATE,
            # multiple copy jobs to the same destination can stomp on
            # each other, truncate data, and write to the BQ table over and
            # over.
            # Thus, the first copy job runs with the user's write_disposition,
            # but afterwards, all jobs must always WRITE_APPEND to the table.
            # If they do not, subsequent copy jobs will clear out data appended
            # by previous jobs.
            write_disposition = self.write_disposition
            wait_for_job = True
            self._observed_tables.add(copy_to_reference.tableId)
        else:
            wait_for_job = False
            write_disposition = 'WRITE_APPEND'

        if not self.bq_io_metadata:
            self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
        job_reference = self.bq_wrapper._insert_copy_job(
            copy_to_reference.projectId,
            copy_job_name,
            copy_from_reference,
            copy_to_reference,
            create_disposition=self.create_disposition,
            write_disposition=write_disposition,
            job_labels=self.bq_io_metadata.add_additional_bq_job_labels())

        if wait_for_job:
            self.bq_wrapper.wait_for_bq_job(job_reference,
                                            sleep_duration_sec=10)

        yield (destination, job_reference)
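The comment block in this example carries the key reasoning: only the first copy job into a given table may use the user's write_disposition; every later job must append, or it would wipe out what the earlier jobs wrote. A minimal stand-alone sketch of that decision, with a hypothetical helper name that is not part of the snippet above:

def choose_write_disposition(observed_tables, table_id, user_disposition):
    # First copy job into this table: honor the user's disposition and wait
    # for the job, so a WRITE_TRUNCATE cannot race with later appends.
    if table_id not in observed_tables:
        observed_tables.add(table_id)
        return user_disposition, True   # (write_disposition, wait_for_job)
    # Every subsequent copy job must append, otherwise it would clear out
    # data written by the previous jobs.
    return 'WRITE_APPEND', False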
Example #4
 def process(self, table_reference):
   _LOGGER.info("Deleting table %s", table_reference)
   table_reference = bigquery_tools.parse_table_reference(table_reference)
   self.bq_wrapper._delete_table(
       table_reference.projectId,
       table_reference.datasetId,
       table_reference.tableId)
Example #5
    def process(self,
                element: 'ReadFromBigQueryRequest') -> Iterable[BoundedSource]:
        bq = bigquery_tools.BigQueryWrapper(
            temp_dataset_id=self._get_temp_dataset().datasetId)
        # TODO(BEAM-11359): Clean up temp dataset at pipeline completion.

        if element.query is not None:
            self._setup_temporary_dataset(bq, element)
            table_reference = self._execute_query(bq, element)
        else:
            assert element.table
            table_reference = bigquery_tools.parse_table_reference(
                element.table, project=self._get_project())

        if not table_reference.projectId:
            table_reference.projectId = self._get_project()

        schema, metadata_list = self._export_files(bq, element,
                                                   table_reference)

        for metadata in metadata_list:
            yield self._create_source(metadata.path, schema)

        if element.query is not None:
            bq._delete_table(table_reference.projectId,
                             table_reference.datasetId,
                             table_reference.tableId)
Example #6
 def process(self, table_reference):
   logging.info("Deleting table %s", table_reference)
   table_reference = bigquery_tools.parse_table_reference(table_reference)
   self.bq_wrapper._delete_table(
       table_reference.projectId,
       table_reference.datasetId,
       table_reference.tableId)
Example #7
    def _write_files_with_auto_sharding(self, destination_data_kv_pc,
                                        file_prefix_pcv):
        clock = self.test_client.test_clock if self.test_client else time.time

        # Auto-sharding is achieved via the GroupIntoBatches.WithShardedKey
        # transform, which shards, groups, and batches the table rows to be
        # inserted into BigQuery.

        # First, the keys of tagged_data (table references) are converted to a
        # hashable format. This is needed to work with the keyed states used by
        # GroupIntoBatches. After grouping and batching is done, table
        # references are restored.
        destination_files_kv_pc = (
            destination_data_kv_pc
            | 'ToHashableTableRef' >> beam.Map(
                bigquery_tools.to_hashable_table_ref)
            | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey(
                batch_size=_FILE_TRIGGERING_RECORD_COUNT,
                max_buffering_duration_secs=
                _FILE_TRIGGERING_BATCHING_DURATION_SECS,
                clock=clock)
            | 'FromHashableTableRefAndDropShard' >> beam.Map(lambda kvs: (
                bigquery_tools.parse_table_reference(kvs[0].key), kvs[1]))
            | beam.ParDo(
                WriteGroupedRecordsToFile(schema=self.schema,
                                          file_format=self._temp_file_format),
                file_prefix_pcv, *self.schema_side_inputs))

        return self._maybe_apply_user_trigger(destination_files_kv_pc)
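As the comments above note, table-reference keys are turned into plain strings before GroupIntoBatches.WithShardedKey and parsed back afterwards. A small round-trip sketch of that idea, assuming only parse_table_reference and standard string formatting (the table name is made up):

from apache_beam.io.gcp import bigquery_tools

ref = bigquery_tools.parse_table_reference('my-project:my_dataset.my_table')
# The canonical string form is hashable and deterministically encodable, so it
# can serve as the key for the keyed state used by GroupIntoBatches.
hashable_key = '%s:%s.%s' % (ref.projectId, ref.datasetId, ref.tableId)
# After grouping and batching, the TableReference is restored from the string.
restored = bigquery_tools.parse_table_reference(hashable_key)
assert (restored.projectId, restored.datasetId, restored.tableId) == (
    'my-project', 'my_dataset', 'my_table')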
Example #8
  def __init__(self, table, dataset=None, project=None, schema=None,
               create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=BigQueryDisposition.WRITE_APPEND,
               batch_size=None, test_client=None):
    """Initialize a WriteToBigQuery transform.

    Args:
      table (str): The ID of the table. The ID must contain only letters
        ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If dataset
        argument is :data:`None` then the table argument must contain the
        entire table reference specified as: ``'DATASET.TABLE'`` or
        ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      schema (str): The schema to be used if the BigQuery table to write has to
        be created. This can be either specified as a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema`
        object or a single string  of the form
        ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
        separated list of fields. Here ``'type'`` should specify the BigQuery
        type of the field. Single string based schemas do not support nested
        fields, repeated fields, or specifying a BigQuery mode for fields
        (mode will always be set to ``'NULLABLE'``).
      create_disposition (BigQueryDisposition): A string describing what
        happens if the table does not exist. Possible values are:

        * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if does not
          exist.
        * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if does not
          exist.

      write_disposition (BigQueryDisposition): A string describing what happens
        if the table has already some data. Possible values are:

        * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
        * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
        * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if table not
          empty.

        For streaming pipelines, WRITE_TRUNCATE cannot be used.

      batch_size (int): Number of rows to be written to BQ per streaming API
        insert.
      test_client: Override the default bigquery client used for testing.
    """
    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    self.create_disposition = BigQueryDisposition.validate_create(
        create_disposition)
    self.write_disposition = BigQueryDisposition.validate_write(
        write_disposition)
    self.schema = schema
    self.batch_size = batch_size
    self.test_client = test_client
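For reference, a hedged sketch of how the table argument forms described in the docstring map onto a TableReference; the project, dataset, and table names are placeholders:

from apache_beam.io.gcp import bigquery_tools

# 'PROJECT:DATASET.TABLE' in a single string.
ref = bigquery_tools.parse_table_reference('my-project:logs.events')
assert (ref.projectId, ref.datasetId, ref.tableId) == (
    'my-project', 'logs', 'events')

# 'DATASET.TABLE' only; projectId may be left unset and filled in later from
# pipeline options, as several of the snippets in this listing do.
ref = bigquery_tools.parse_table_reference('logs.events')
assert (ref.datasetId, ref.tableId) == ('logs', 'events')

# Table id plus explicit dataset and project arguments.
ref = bigquery_tools.parse_table_reference(
    'events', dataset='logs', project='my-project')
assert (ref.projectId, ref.datasetId, ref.tableId) == (
    'my-project', 'logs', 'events')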
Example #9
 def test_calling_with_partially_qualified_table_ref(self):
   datasetId = 'test_dataset'
   tableId = 'test_table'
   partially_qualified_table = '{}.{}'.format(datasetId, tableId)
   parsed_ref = parse_table_reference(partially_qualified_table)
   self.assertIsInstance(parsed_ref, bigquery.TableReference)
   self.assertEqual(parsed_ref.datasetId, datasetId)
   self.assertEqual(parsed_ref.tableId, tableId)
Example #10
 def test_calling_with_table_reference(self):
   table_ref = bigquery.TableReference()
   table_ref.projectId = 'test_project'
   table_ref.datasetId = 'test_dataset'
   table_ref.tableId = 'test_table'
   parsed_ref = parse_table_reference(table_ref)
   self.assertEqual(table_ref, parsed_ref)
   self.assertIsNot(table_ref, parsed_ref)
Example #11
    def process(self, element, load_job_name_prefix, *schema_side_inputs):
        # Each load job is assumed to have files respecting these constraints:
        # 1. Total size of all files < 15 TB (Max size for load jobs)
        # 2. Total no. of files in a single load job < 10,000
        # This assumption means that there will always be a single load job
        # triggered for each partition of files.
        destination = element[0]
        files = element[1]

        if callable(self.schema):
            schema = self.schema(destination, *schema_side_inputs)
        elif isinstance(self.schema, vp.ValueProvider):
            schema = self.schema.get()
        else:
            schema = self.schema

        if callable(self.additional_bq_parameters):
            additional_parameters = self.additional_bq_parameters(destination)
        elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
            additional_parameters = self.additional_bq_parameters.get()
        else:
            additional_parameters = self.additional_bq_parameters

        table_reference = bigquery_tools.parse_table_reference(destination)
        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')
        # Load jobs for a single destination are always triggered from the same
        # worker. This means that we can generate a deterministic numbered job
        # id and need not worry about name collisions across workers.
        destination_hash = _bq_uuid(
            '%s:%s.%s' % (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId))
        uid = _bq_uuid()
        job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
        logging.debug('Load job has %s files. Job name is %s.', len(files),
                      job_name)

        if self.temporary_tables:
            # For temporary tables, we create a new table whose name includes the job id.
            table_reference.tableId = job_name
            yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES,
                                      table_reference)

        logging.info(
            'Triggering job %s to load data to BigQuery table %s. '
            'Schema: %s. Additional parameters: %s', job_name, table_reference,
            schema, additional_parameters)
        job_reference = self.bq_wrapper.perform_load_job(
            table_reference,
            files,
            job_name,
            schema=schema,
            write_disposition=self.write_disposition,
            create_disposition=self.create_disposition,
            additional_load_parameters=additional_parameters)
        yield (destination, job_reference)
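Note how schema and additional_bq_parameters are each resolved from a callable, a ValueProvider, or a plain value. A hedged example of a destination-dependent callable (the table-name convention here is invented for illustration):

def params_for_destination(destination):
    # Partition only the hypothetical "events" tables by day; other
    # destinations get no extra load-job parameters.
    if 'events' in str(destination):
        return {'timePartitioning': {'type': 'DAY'}}
    return {}

It would be passed as additional_bq_parameters=params_for_destination when constructing the write transform, much like the lambda in Example #2.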
Example #12
  def process(self, element, load_job_name_prefix, *schema_side_inputs):
    destination = element[0]
    files = iter(element[1])

    if callable(self.schema):
      schema = self.schema(destination, *schema_side_inputs)
    elif isinstance(self.schema, vp.ValueProvider):
      schema = self.schema.get()
    else:
      schema = self.schema

    if callable(self.additional_bq_parameters):
      additional_parameters = self.additional_bq_parameters(destination)
    elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
      additional_parameters = self.additional_bq_parameters.get()
    else:
      additional_parameters = self.additional_bq_parameters

    batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
    while batch_of_files:

      table_reference = bigquery_tools.parse_table_reference(destination)
      if table_reference.projectId is None:
        table_reference.projectId = vp.RuntimeValueProvider.get_value(
            'project', str, '')
      # Load jobs for a single destination are always triggered from the same
      # worker. This means that we can generate a deterministic numbered job
      # id and need not worry about name collisions across workers.
      destination_hash = _bq_uuid('%s:%s.%s' % (table_reference.projectId,
                                                table_reference.datasetId,
                                                table_reference.tableId))
      timestamp = int(time.time())
      job_name = '%s_%s_%s' % (
          load_job_name_prefix, destination_hash, timestamp)
      logging.debug('Batch of files has %s files. Job name is %s.',
                    len(batch_of_files), job_name)

      if self.temporary_tables:
        # For temporary tables, we create a new table whose name includes the job id.
        table_reference.tableId = job_name
        yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

      logging.info('Triggering job %s to load data to BigQuery table %s. '
                   'Schema: %s. Additional parameters: %s',
                   job_name, table_reference,
                   schema, additional_parameters)
      job_reference = self.bq_wrapper.perform_load_job(
          table_reference, batch_of_files, job_name,
          schema=schema,
          write_disposition=self.write_disposition,
          create_disposition=self.create_disposition,
          additional_load_parameters=additional_parameters)
      yield (destination, job_reference)

      # Prepare to trigger the next job
      batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
Example #13
 def test_calling_with_all_arguments(self):
   projectId = 'test_project'
   datasetId = 'test_dataset'
   tableId = 'test_table'
   parsed_ref = parse_table_reference(
       tableId, dataset=datasetId, project=projectId)
   self.assertIsInstance(parsed_ref, bigquery.TableReference)
   self.assertEqual(parsed_ref.projectId, projectId)
   self.assertEqual(parsed_ref.datasetId, datasetId)
   self.assertEqual(parsed_ref.tableId, tableId)
Example #14
 def test_calling_with_hyphened_table_ref(self):
   projectId = 'test_project'
   datasetId = 'test_dataset'
   tableId = 'test-table'
   fully_qualified_table = '{}:{}.{}'.format(projectId, datasetId, tableId)
   parsed_ref = parse_table_reference(fully_qualified_table)
   self.assertIsInstance(parsed_ref, bigquery.TableReference)
   self.assertEqual(parsed_ref.projectId, projectId)
   self.assertEqual(parsed_ref.datasetId, datasetId)
   self.assertEqual(parsed_ref.tableId, tableId)
Example #15
    def process(self, element, job_name_prefix=None):
        destination = element[0]
        job_reference = element[1]

        if not self.temporary_tables:
            # If we did not use temporary tables, then we do not need to trigger any
            # copy jobs.
            return

        copy_to_reference = bigquery_tools.parse_table_reference(destination)
        if copy_to_reference.projectId is None:
            copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        copy_from_reference = bigquery_tools.parse_table_reference(destination)
        copy_from_reference.tableId = job_reference.jobId
        if copy_from_reference.projectId is None:
            copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        copy_job_name = '%s_copy_%s_to_%s' % (
            job_name_prefix,
            _bq_uuid(
                '%s:%s.%s' %
                (copy_from_reference.projectId, copy_from_reference.datasetId,
                 copy_from_reference.tableId)),
            _bq_uuid('%s:%s.%s' %
                     (copy_to_reference.projectId, copy_to_reference.datasetId,
                      copy_to_reference.tableId)))

        logging.info("Triggering copy job from %s to %s", copy_from_reference,
                     copy_to_reference)
        job_reference = self.bq_wrapper._insert_copy_job(
            copy_to_reference.projectId,
            copy_job_name,
            copy_from_reference,
            copy_to_reference,
            create_disposition=self.create_disposition,
            write_disposition=self.write_disposition)

        yield (destination, job_reference)
Example #16
  def test_perform_load_job_with_source_stream(self):
    client = mock.Mock()
    wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

    wrapper.perform_load_job(
        destination=parse_table_reference('project:dataset.table'),
        job_id='job_id',
        source_stream=io.BytesIO(b'some,data'))

    client.jobs.Insert.assert_called_once()
    upload = client.jobs.Insert.call_args[1]["upload"]
    self.assertEqual(b'some,data', upload.stream.read())
Example #17
  def test_records_traverse_transform_with_mocks(self):
    destination = 'project1:dataset1.table1'

    job_reference = bigquery_api.JobReference()
    job_reference.projectId = 'project1'
    job_reference.jobId = 'job_name1'
    result_job = bigquery_api.Job()
    result_job.jobReference = job_reference

    mock_job = mock.Mock()
    mock_job.status.state = 'DONE'
    mock_job.status.errorResult = None
    mock_job.jobReference = job_reference

    bq_client = mock.Mock()
    bq_client.jobs.Get.return_value = mock_job

    bq_client.jobs.Insert.return_value = result_job

    transform = bigquery.WriteToBigQuery(
        destination,
        gs_location=self._new_tempdir(),
        test_client=bq_client)

    # Need to test this with the DirectRunner to avoid serializing mocks
    with TestPipeline('DirectRunner') as p:
      outputs = p | beam.Create(_ELEMENTS) | transform

      dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
      dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

      jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

      files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
      destinations = (dest_files
                      | "GetUniques" >> beam.combiners.Count.PerKey()
                      | "GetDests" >> beam.Map(lambda x: x[0]))

      # All files exist
      _ = (files | beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

      # One file per destination
      assert_that(files | beam.combiners.Count.Globally(),
                  equal_to([1]),
                  label='CountFiles')

      assert_that(destinations,
                  equal_to([bigquery_tools.parse_table_reference(destination)]),
                  label='CheckDestinations')

      assert_that(jobs,
                  equal_to([job_reference]), label='CheckJobs')
Example #19
  def process(self, element, job_name_prefix=None):
    destination = element[0]
    job_reference = element[1]

    if not self.temporary_tables:
      # If we did not use temporary tables, then we do not need to trigger any
      # copy jobs.
      return

    copy_to_reference = bigquery_tools.parse_table_reference(destination)
    if copy_to_reference.projectId is None:
      copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    copy_from_reference = bigquery_tools.parse_table_reference(destination)
    copy_from_reference.tableId = job_reference.jobId
    if copy_from_reference.projectId is None:
      copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    copy_job_name = '%s_copy_%s_to_%s' % (
        job_name_prefix,
        _bq_uuid('%s:%s.%s' % (copy_from_reference.projectId,
                               copy_from_reference.datasetId,
                               copy_from_reference.tableId)),
        _bq_uuid('%s:%s.%s' % (copy_to_reference.projectId,
                               copy_to_reference.datasetId,
                               copy_to_reference.tableId)))

    logging.info("Triggering copy job from %s to %s",
                 copy_from_reference, copy_to_reference)
    job_reference = self.bq_wrapper._insert_copy_job(
        copy_to_reference.projectId,
        copy_job_name,
        copy_from_reference,
        copy_to_reference,
        create_disposition=self.create_disposition,
        write_disposition=self.write_disposition)

    yield (destination, job_reference)
Example #20
  def _export_files(
      self,
      bq: bigquery_tools.BigQueryWrapper,
      element: 'ReadFromBigQueryRequest',
      table_reference: TableReference):
    """Runs a BigQuery export job.

    Returns:
      bigquery.TableSchema instance, a list of FileMetadata instances
    """
    job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
        self.bigquery_job_labels)
    export_job_name = bigquery_tools.generate_bq_job_name(
        self._job_name,
        self._source_uuid,
        bigquery_tools.BigQueryJobTypes.EXPORT,
        element.obj_id)
    temp_location = self.options.view_as(GoogleCloudOptions).temp_location
    gcs_location = bigquery_export_destination_uri(
        self.gcs_location,
        temp_location,
        '%s%s' % (self._source_uuid, element.obj_id))
    if self.use_json_exports:
      job_ref = bq.perform_extract_job([gcs_location],
                                       export_job_name,
                                       table_reference,
                                       bigquery_tools.FileFormat.JSON,
                                       project=self._get_project(),
                                       job_labels=job_labels,
                                       include_header=False)
    else:
      job_ref = bq.perform_extract_job([gcs_location],
                                       export_job_name,
                                       table_reference,
                                       bigquery_tools.FileFormat.AVRO,
                                       project=self._get_project(),
                                       include_header=False,
                                       job_labels=job_labels,
                                       use_avro_logical_types=True)
    bq.wait_for_bq_job(job_ref)
    metadata_list = FileSystems.match([gcs_location])[0].metadata_list

    if isinstance(table_reference, ValueProvider):
      table_ref = bigquery_tools.parse_table_reference(
          element.table, project=self._get_project())
    else:
      table_ref = table_reference
    table = bq.get_table(
        table_ref.projectId, table_ref.datasetId, table_ref.tableId)

    return table.schema, metadata_list
Example #21
  def process(self, element, load_job_name_prefix):
    destination = element[0]
    files = iter(element[1])

    if callable(self.schema):
      schema = self.schema(destination)
    elif isinstance(self.schema, vp.ValueProvider):
      schema = self.schema.get()
    else:
      schema = self.schema

    job_count = 0
    batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
    while batch_of_files:

      table_reference = bigquery_tools.parse_table_reference(destination)
      if table_reference.projectId is None:
        table_reference.projectId = vp.RuntimeValueProvider.get_value(
            'project', str, '')

      # Load jobs for a single destination are always triggered from the same
      # worker. This means that we can generate a deterministic numbered job
      # id and need not worry about name collisions across workers.
      job_name = '%s_%s_%s' % (
          load_job_name_prefix,
          _bq_uuid('%s:%s.%s' % (table_reference.projectId,
                                 table_reference.datasetId,
                                 table_reference.tableId)),
          job_count)
      logging.debug("Batch of files has %s files. Job name is %s",
                    len(batch_of_files), job_name)

      if self.temporary_tables:
        # For temporary tables, we create a new table whose name includes the job id.
        table_reference.tableId = job_name
        yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

      logging.info("Triggering job %s to load data to BigQuery table %s.",
                   job_name, table_reference)
      job_reference = self.bq_wrapper.perform_load_job(
          table_reference, batch_of_files, job_name,
          schema=schema,
          write_disposition=self.write_disposition,
          create_disposition=self.create_disposition)
      yield (destination, job_reference)

      # Prepare to trigger the next job
      job_count += 1
      batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
Example #22
  def test_perform_load_job_source_mutual_exclusivity(self):
    client = mock.Mock()
    wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

    # Both source_uri and source_stream specified.
    with self.assertRaises(ValueError):
      wrapper.perform_load_job(
          destination=parse_table_reference('project:dataset.table'),
          job_id='job_id',
          source_uris=['gs://example.com/*'],
          source_stream=io.BytesIO())

    # Neither source_uri nor source_stream specified.
    with self.assertRaises(ValueError):
      wrapper.perform_load_job(destination='P:D.T', job_id='J')
Example #23
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%s' % (
        self.BIG_QUERY_DATASET_ID,
        str(int(time.time())),
        random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = '%s.output_table' % (self.dataset_id)
    self.table_ref = bigquery_tools.parse_table_reference(self.output_table)
    _LOGGER.info(
        'Created dataset %s in project %s', self.dataset_id, self.project)
Example #24
    def process(self, element, unused_create_fn_output=None):
        destination = element[0]
        if isinstance(destination, tuple):
            schema = destination[1]
            destination = destination[0]
            self._create_table_if_needed(
                schema, bigquery_tools.parse_table_reference(destination))

        row = element[1]
        self._rows_buffer[destination].append(row)
        self._total_buffered_rows += 1
        if len(self._rows_buffer[destination]) >= self._max_batch_size:
            return self._flush_batch(destination)
        elif self._total_buffered_rows >= self._max_buffered_rows:
            return self._flush_all_batches()
Example #25
    def _flush_batch(self, destination):

        # Flush the current batch of rows to BigQuery.
        rows = self._rows_buffer[destination]
        table_reference = bigquery_tools.parse_table_reference(destination)

        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        logging.debug('Flushing data to %s. Total %s rows.', destination,
                      len(rows))

        while True:
            # TODO: Figure out an insertId to make calls idempotent.
            passed, errors = self.bigquery_wrapper.insert_rows(
                project_id=table_reference.projectId,
                dataset_id=table_reference.datasetId,
                table_id=table_reference.tableId,
                rows=rows,
                skip_invalid_rows=True)

            logging.debug("Passed: %s. Errors are %s", passed, errors)
            failed_rows = [rows[entry.index] for entry in errors]
            should_retry = any(
                bigquery_tools.RetryStrategy.should_retry(
                    self._retry_strategy, entry.errors[0].reason)
                for entry in errors)
            rows = failed_rows

            if not should_retry:
                break
            else:
                retry_backoff = next(self._backoff_calculator)
                logging.info('Sleeping %s seconds before retrying insertion.',
                             retry_backoff)
                time.sleep(retry_backoff)

        self._total_buffered_rows -= len(self._rows_buffer[destination])
        del self._rows_buffer[destination]

        return [
            pvalue.TaggedOutput(
                BigQueryWriteFn.FAILED_ROWS,
                GlobalWindows.windowed_value((destination, row)))
            for row in failed_rows
        ]
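The retry loop above sleeps for next(self._backoff_calculator) seconds between attempts, but the listing never shows how that generator is built. A minimal stand-in with the usual shape (exponential growth with jitter, roughly what Beam's FuzzedExponentialIntervals provides) could look like this:

import random

def exponential_backoff(initial_delay_secs=1.0, factor=2.0, fuzz=0.5,
                        max_delay_secs=60.0):
    """Yield ever-longer sleep intervals, randomly fuzzed, capped at a maximum."""
    delay = initial_delay_secs
    while True:
        # Subtract up to `fuzz` of the delay so concurrent workers desynchronize.
        yield delay * (1 - fuzz * random.random())
        delay = min(delay * factor, max_delay_secs)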
Example #26
    def process(self, element, load_job_name_prefix):
        destination = element[0]
        files = iter(element[1])

        job_count = 0
        batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
        while batch_of_files:

            table_reference = bigquery_tools.parse_table_reference(destination)
            if table_reference.projectId is None:
                table_reference.projectId = vp.RuntimeValueProvider.get_value(
                    'project', str, '')

            # Load jobs for a single destination are always triggered from the
            # same worker. This means that we can generate a deterministic
            # numbered job id and need not worry about name collisions across
            # workers.
            job_name = '%s_%s_%s' % (
                load_job_name_prefix,
                _bq_uuid('%s:%s.%s' %
                         (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId)), job_count)
            logging.debug("Batch of files has %s files. Job name is %s",
                          len(batch_of_files), job_name)

            if self.temporary_tables:
                # For temporary tables, we create a new table whose name includes the job id.
                table_reference.tableId = job_name
                yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES,
                                          table_reference)

            logging.info(
                "Triggering job %s to load data to BigQuery table %s.",
                job_name, table_reference)
            job_reference = self.bq_wrapper.perform_load_job(
                table_reference,
                batch_of_files,
                job_name,
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition)
            yield (destination, job_reference)

            # Prepare to trigger the next job
            job_count += 1
            batch_of_files = list(itertools.islice(files,
                                                   _MAXIMUM_SOURCE_URIS))
Example #27
  def _flush_batch(self, destination):

    # Flush the current batch of rows to BigQuery.
    rows = self._rows_buffer[destination]
    table_reference = bigquery_tools.parse_table_reference(destination)

    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    logging.debug('Flushing data to %s. Total %s rows.',
                  destination, len(rows))

    while True:
      # TODO: Figure out an insertId to make calls idempotent.
      passed, errors = self.bigquery_wrapper.insert_rows(
          project_id=table_reference.projectId,
          dataset_id=table_reference.datasetId,
          table_id=table_reference.tableId,
          rows=rows,
          skip_invalid_rows=True)

      logging.debug("Passed: %s. Errors are %s", passed, errors)
      failed_rows = [rows[entry.index] for entry in errors]
      should_retry = any(
          bigquery_tools.RetryStrategy.should_retry(
              self._retry_strategy, entry.errors[0].reason)
          for entry in errors)
      rows = failed_rows

      if not should_retry:
        break
      else:
        retry_backoff = next(self._backoff_calculator)
        logging.info('Sleeping %s seconds before retrying insertion.',
                     retry_backoff)
        time.sleep(retry_backoff)

    self._total_buffered_rows -= len(self._rows_buffer[destination])
    del self._rows_buffer[destination]

    return [pvalue.TaggedOutput(BigQueryWriteFn.FAILED_ROWS,
                                GlobalWindows.windowed_value(
                                    (destination, row))) for row in failed_rows]
Example #28
    def __init__(
            self,
            # gcs_location=None,
            get_destination_uri=None,
            table=None,
            dataset=None,
            project=None,
            query=None,
            validate=False,
            coder=None,
            use_standard_sql=False,
            flatten_results=True,
            kms_key=None):
        if table is not None and query is not None:
            raise ValueError(
                'Both a BigQuery table and a query were specified.'
                ' Please specify only one of these.')
        elif table is None and query is None:
            raise ValueError('A BigQuery table or a query must be specified')
        elif table is not None:
            self.table_reference = bigquery_tools.parse_table_reference(
                table, dataset, project)
            self.query = None
            self.use_legacy_sql = True
        else:
            if isinstance(query, (str, unicode)):
                query = StaticValueProvider(str, query)
            self.query = query
            # TODO(BEAM-1082): Change the internal flag to be standard_sql
            self.use_legacy_sql = not use_standard_sql
            self.table_reference = None

        self.get_destination_uri = get_destination_uri
        # self.gcs_location = gcs_location
        if isinstance(project, (str, unicode)):
            project = StaticValueProvider(str, project)
        self.project = project
        self.validate = validate
        self.flatten_results = flatten_results
        self.coder = coder or _JsonToDictCoder
        self.kms_key = kms_key
        self.split_result = None
Example #29
  def process(self, element, unused_create_fn_output=None):
    destination = element[0]

    if callable(self.schema):
      schema = self.schema(destination)
    elif isinstance(self.schema, vp.ValueProvider):
      schema = self.schema.get()
    else:
      schema = self.schema

    self._create_table_if_needed(
        bigquery_tools.parse_table_reference(destination),
        schema)

    row = element[1]
    self._rows_buffer[destination].append(row)
    self._total_buffered_rows += 1
    if len(self._rows_buffer[destination]) >= self._max_batch_size:
      return self._flush_batch(destination)
    elif self._total_buffered_rows >= self._max_buffered_rows:
      return self._flush_all_batches()
Example #30
  def process(self, element, unused_create_fn_output=None):
    destination = element[0]

    if callable(self.schema):
      schema = self.schema(destination)
    elif isinstance(self.schema, vp.ValueProvider):
      schema = self.schema.get()
    else:
      schema = self.schema

    self._create_table_if_needed(
        bigquery_tools.parse_table_reference(destination),
        schema)

    destination = bigquery_tools.get_hashable_destination(destination)

    row = element[1]
    self._rows_buffer[destination].append(row)
    self._total_buffered_rows += 1
    if len(self._rows_buffer[destination]) >= self._max_batch_size:
      return self._flush_batch(destination)
    elif self._total_buffered_rows >= self._max_buffered_rows:
      return self._flush_all_batches()
Example #31
    def process(self, element, schema_mod_job_name_prefix):
        destination = element[0]
        temp_table_load_job_reference = element[1]

        if callable(self._additional_bq_parameters):
            additional_parameters = self._additional_bq_parameters(destination)
        elif isinstance(self._additional_bq_parameters, vp.ValueProvider):
            additional_parameters = self._additional_bq_parameters.get()
        else:
            additional_parameters = self._additional_bq_parameters

        # When writing to normal tables WRITE_TRUNCATE will overwrite the schema but
        # when writing to a partition, care needs to be taken to update the schema
        # even on WRITE_TRUNCATE.
        if (self._write_disposition not in ('WRITE_TRUNCATE', 'WRITE_APPEND')
                or not additional_parameters
                or not additional_parameters.get("schemaUpdateOptions")):
            # No need to modify schema of destination table
            return

        table_reference = bigquery_tools.parse_table_reference(destination)
        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        try:
            # Check if destination table exists
            destination_table = self._bq_wrapper.get_table(
                project_id=table_reference.projectId,
                dataset_id=table_reference.datasetId,
                table_id=table_reference.tableId)
        except HttpError as exn:
            if exn.status_code == 404:
                # Destination table does not exist, so no need to modify its schema
                # ahead of the copy jobs.
                return
            else:
                raise

        temp_table_load_job = self._bq_wrapper.get_job(
            project=temp_table_load_job_reference.projectId,
            job_id=temp_table_load_job_reference.jobId,
            location=temp_table_load_job_reference.location)
        temp_table_schema = temp_table_load_job.configuration.load.schema

        if bigquery_tools.check_schema_equal(temp_table_schema,
                                             destination_table.schema,
                                             ignore_descriptions=True,
                                             ignore_field_order=True):
            # Destination table schema is already the same as the temp table schema,
            # so no need to run a job to update the destination table schema.
            return

        destination_hash = _bq_uuid(
            '%s:%s.%s' % (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId))
        uid = _bq_uuid()
        job_name = '%s_%s_%s' % (schema_mod_job_name_prefix, destination_hash,
                                 uid)

        _LOGGER.debug('Triggering schema modification job %s on %s', job_name,
                      table_reference)
        # Trigger potential schema modification by loading zero rows into the
        # destination table with the temporary table schema.
        schema_update_job_reference = self._bq_wrapper.perform_load_job(
            destination=table_reference,
            source_stream=io.BytesIO(),  # file with zero rows
            job_id=job_name,
            schema=temp_table_schema,
            write_disposition='WRITE_APPEND',
            create_disposition='CREATE_NEVER',
            additional_load_parameters=additional_parameters,
            job_labels=self._bq_io_metadata.add_additional_bq_job_labels())
        yield (destination, schema_update_job_reference)
Example #32
    def __init__(self,
                 table,
                 dataset=None,
                 project=None,
                 schema=None,
                 create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=BigQueryDisposition.WRITE_APPEND,
                 kms_key=None,
                 batch_size=None,
                 max_file_size=None,
                 max_files_per_bundle=None,
                 test_client=None,
                 gs_location=None,
                 method=None,
                 insert_retry_strategy=None):
        """Initialize a WriteToBigQuery transform.

    Args:
      table (str, callable): The ID of the table, or a callable
         that returns it. The ID must contain only letters ``a-z``, ``A-Z``,
         numbers ``0-9``, or underscores ``_``. If dataset argument is
         :data:`None` then the table argument must contain the entire table
         reference specified as: ``'DATASET.TABLE'``
         or ``'PROJECT:DATASET.TABLE'``. If it's a callable, it must receive one
         argument representing an element to be written to BigQuery, and return
         a TableReference, or a string table name as specified above.
         Multiple destinations are only supported on Batch pipelines at the
         moment.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      schema (str): The schema to be used if the BigQuery table to write has to
        be created. This can be either specified as a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema`
        object or a single string  of the form
        ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
        separated list of fields. Here ``'type'`` should specify the BigQuery
        type of the field. Single string based schemas do not support nested
        fields, repeated fields, or specifying a BigQuery mode for fields
        (mode will always be set to ``'NULLABLE'``).
      create_disposition (BigQueryDisposition): A string describing what
        happens if the table does not exist. Possible values are:

        * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if does not
          exist.
        * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if does not
          exist.

      write_disposition (BigQueryDisposition): A string describing what happens
        if the table has already some data. Possible values are:

        * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
        * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
        * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if table not
          empty.

        For streaming pipelines, WRITE_TRUNCATE cannot be used.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.
      batch_size (int): Number of rows to be written to BQ per streaming API
        insert. The default is 500.
      test_client: Override the default bigquery client used for testing.
      max_file_size (int): The maximum size for a file to be written and then
        loaded into BigQuery. The default value is 4TB, which is 80% of the
        limit of 5TB for BigQuery to load any file.
      max_files_per_bundle(int): The maximum number of files to be concurrently
        written by a worker. The default here is 20. Larger values will allow
        writing to multiple destinations without having to reshard - but they
        increase the memory burden on the workers.
      gs_location (str): A GCS location to store files to be used for file
        loads into BigQuery. By default, this will use the pipeline's
        temp_location, but for pipelines whose temp_location is not appropriate
        for BQ File Loads, users should pass a specific one.
      method: The method to use to write to BigQuery. It may be
        STREAMING_INSERTS, FILE_LOADS, or DEFAULT. An introduction on loading
        data to BigQuery: https://cloud.google.com/bigquery/docs/loading-data.
        DEFAULT will use STREAMING_INSERTS on Streaming pipelines and
        FILE_LOADS on Batch pipelines.
      insert_retry_strategy: The strategy to use when retrying streaming inserts
        into BigQuery. Options are shown in bigquery_tools.RetryStrategy attrs.
    """
        self.table_reference = bigquery_tools.parse_table_reference(
            table, dataset, project)
        self.create_disposition = BigQueryDisposition.validate_create(
            create_disposition)
        self.write_disposition = BigQueryDisposition.validate_write(
            write_disposition)
        self.schema = WriteToBigQuery.get_dict_table_schema(schema)
        self.batch_size = batch_size
        self.kms_key = kms_key
        self.test_client = test_client
        self.gs_location = gs_location
        self.max_file_size = max_file_size
        self.max_files_per_bundle = max_files_per_bundle
        self.method = method or WriteToBigQuery.Method.DEFAULT
        self.insert_retry_strategy = insert_retry_strategy
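A hedged usage sketch for the constructor documented above; the bucket and table names are placeholders, and some of these parameter names differ in other Beam releases:

from apache_beam.io.gcp import bigquery

write = bigquery.WriteToBigQuery(
    table='my-project:my_dataset.my_table',
    schema='name:STRING,language:STRING',
    create_disposition=bigquery.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=bigquery.BigQueryDisposition.WRITE_APPEND,
    method='FILE_LOADS',
    # Staging area for the files that the load jobs read from.
    gs_location='gs://my-bucket/bq_load_temp')

For method='STREAMING_INSERTS', batch_size and insert_retry_strategy (see bigquery_tools.RetryStrategy) are the relevant knobs instead.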
Example #33
    def __init__(self,
                 table,
                 dataset=None,
                 project=None,
                 schema=None,
                 create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=BigQueryDisposition.WRITE_EMPTY,
                 validate=False,
                 coder=None,
                 kms_key=None):
        """Initialize a BigQuerySink.

    Args:
      table (str): The ID of the table. The ID must contain only letters
        ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
        **dataset** argument is :data:`None` then the table argument must
        contain the entire table reference specified as: ``'DATASET.TABLE'`` or
        ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      schema (str): The schema to be used if the BigQuery table to write has
        to be created. This can be either specified as a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or a single string  of the form
        ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
        separated list of fields. Here ``'type'`` should specify the BigQuery
        type of the field. Single string based schemas do not support nested
        fields, repeated fields, or specifying a BigQuery mode for fields (mode
        will always be set to ``'NULLABLE'``).
      create_disposition (BigQueryDisposition): A string describing what
        happens if the table does not exist. Possible values are:

          * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if does not
            exist.
          * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if does not
            exist.

      write_disposition (BigQueryDisposition): A string describing what
        happens if the table has already some data. Possible values are:

          * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
          * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
          * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if table not
            empty.

      validate (bool): If :data:`True`, various checks will be done when sink
        gets initialized (e.g., is table present given the disposition
        arguments?). This should be :data:`True` for most scenarios in order to
        catch errors as early as possible (pipeline construction instead of
        pipeline execution). It should be :data:`False` if the table is created
        during pipeline execution by a previous step.
      coder (~apache_beam.coders.coders.Coder): The coder for the
        table rows if serialized to disk. If :data:`None`, then the default
        coder is :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`,
        which will interpret every element written to the sink as a dictionary
        that will be JSON serialized as a line in a file. This argument needs a
        value only in special cases when writing table rows as dictionaries is
        not desirable.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.

    Raises:
      ~exceptions.TypeError: if the schema argument is not a :class:`str` or a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object.
      ~exceptions.ValueError: if the table reference as a string does not
        match the expected format.
    """
        # Import here to avoid adding the dependency for local running scenarios.
        try:
            # pylint: disable=wrong-import-order, wrong-import-position
            from apitools.base import py  # pylint: disable=unused-variable
        except ImportError:
            raise ImportError('Google Cloud IO not available, '
                              'please install apache_beam[gcp]')

        self.table_reference = bigquery_tools.parse_table_reference(
            table, dataset, project)
        # Transform the table schema into a bigquery.TableSchema instance.
        if isinstance(schema, (str, unicode)):
            # TODO(silviuc): Should add a regex-based validation of the format.
            table_schema = bigquery.TableSchema()
            schema_list = [s.strip(' ') for s in schema.split(',')]
            for field_and_type in schema_list:
                field_name, field_type = field_and_type.split(':')
                field_schema = bigquery.TableFieldSchema()
                field_schema.name = field_name
                field_schema.type = field_type
                field_schema.mode = 'NULLABLE'
                table_schema.fields.append(field_schema)
            self.table_schema = table_schema
        elif schema is None:
            # TODO(silviuc): Should check that table exists if no schema specified.
            self.table_schema = schema
        elif isinstance(schema, bigquery.TableSchema):
            self.table_schema = schema
        else:
            raise TypeError('Unexpected schema argument: %s.' % schema)

        self.create_disposition = BigQueryDisposition.validate_create(
            create_disposition)
        self.write_disposition = BigQueryDisposition.validate_write(
            write_disposition)
        self.validate = validate
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
        self.kms_key = kms_key
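
The single-string schema branch above is just a comma split followed by a colon split per field, with every field forced to 'NULLABLE' mode. Below is a minimal standalone sketch of the same idea, assuming apache_beam[gcp] is installed; schema_from_string is a hypothetical helper name, not part of the Beam API.

# Sketch mirroring the parsing loop above, not the Beam implementation itself.
from apache_beam.io.gcp.internal.clients import bigquery


def schema_from_string(schema_str):
  """Builds a TableSchema from a string like 'field1:type1,field2:type2'."""
  table_schema = bigquery.TableSchema()
  for field_and_type in (s.strip() for s in schema_str.split(',')):
    name, field_type = field_and_type.split(':')
    field = bigquery.TableFieldSchema()
    field.name = name
    field.type = field_type
    field.mode = 'NULLABLE'  # single-string schemas always use NULLABLE
    table_schema.fields.append(field)
  return table_schema

# Example: schema_from_string('name:STRING,language:STRING')
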
Ejemplo n.º 34
0
def _parse_table_reference(table, dataset=None, project=None):
  return bigquery_tools.parse_table_reference(table, dataset, project)
Ejemplo n.º 35
0
def _parse_table_reference(table, dataset=None, project=None):
    return bigquery_tools.parse_table_reference(table, dataset, project)
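
Both wrappers above simply delegate to bigquery_tools.parse_table_reference, which accepts the table-string formats documented in the examples below ('DATASET.TABLE' or 'PROJECT:DATASET.TABLE'). A small hedged sketch of a call with a fully qualified string; the attribute names on the returned reference are stated here as an assumption, not taken from this listing.

# Hedged sketch with placeholder project/dataset/table names.
from apache_beam.io.gcp import bigquery_tools

ref = bigquery_tools.parse_table_reference('my_project:my_dataset.my_table')
# The parsed TableReference is assumed to expose projectId, datasetId and
# tableId fields; adjust if your Beam version differs.
print(ref.projectId, ref.datasetId, ref.tableId)
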
Ejemplo n.º 36
0
  def test_calling_with_callable(self):
    callable_ref = lambda: 'foo'
    parsed_ref = parse_table_reference(callable_ref)
    self.assertIs(callable_ref, parsed_ref)
Ejemplo n.º 37
0
  def __init__(self,
               table,
               dataset=None,
               project=None,
               schema=None,
               create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=BigQueryDisposition.WRITE_APPEND,
               kms_key=None,
               batch_size=None,
               max_file_size=None,
               max_files_per_bundle=None,
               test_client=None,
               gs_location=None,
               method=None):
    """Initialize a WriteToBigQuery transform.

    Args:
      table (str, callable): The ID of the table, or a callable
         that returns it. The ID must contain only letters ``a-z``, ``A-Z``,
         numbers ``0-9``, or underscores ``_``. If dataset argument is
         :data:`None` then the table argument must contain the entire table
         reference specified as: ``'DATASET.TABLE'``
         or ``'PROJECT:DATASET.TABLE'``. If it's a callable, it must receive one
         argument representing an element to be written to BigQuery, and return
         a TableReference, or a string table name as specified above.
         Multiple destinations are only supported on Batch pipelines at the
         moment.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      schema (str): The schema to be used if the BigQuery table to write has to
        be created. This can be either specified as a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema`
        object or a single string of the form
        ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
        separated list of fields. Here ``'type'`` should specify the BigQuery
        type of the field. Single string based schemas do not support nested
        fields, repeated fields, or specifying a BigQuery mode for fields
        (mode will always be set to ``'NULLABLE'``).
      create_disposition (BigQueryDisposition): A string describing what
        happens if the table does not exist. Possible values are:

        * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if it does not
          exist.
        * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if it does
          not exist.

      write_disposition (BigQueryDisposition): A string describing what happens
        if the table already has some data. Possible values are:

        * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
        * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
        * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the table
          is not empty.

        For streaming pipelines, WRITE_TRUNCATE cannot be used.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.
      batch_size (int): Number of rows to be written to BQ per streaming API
        insert. The default is 500.
      test_client: Override the default bigquery client used for testing.
      max_file_size (int): The maximum size for a file to be written and then
        loaded into BigQuery. The default value is 4TB, which is 80% of the
        limit of 5TB for BigQuery to load any file.
      max_files_per_bundle (int): The maximum number of files to be written
        concurrently by a worker. The default here is 20. Larger values allow
        writing to multiple destinations without having to reshard, but they
        increase the memory burden on the workers.
      gs_location (str): A GCS location to store files to be used for file
        loads into BigQuery. By default, this will use the pipeline's
        temp_location, but for pipelines whose temp_location is not appropriate
        for BQ File Loads, users should pass a specific one.
      method: The method to use to write to BigQuery. It may be
        STREAMING_INSERTS, FILE_LOADS, or DEFAULT. An introduction on loading
        data to BigQuery: https://cloud.google.com/bigquery/docs/loading-data.
        DEFAULT will use STREAMING_INSERTS on Streaming pipelines and
        FILE_LOADS on Batch pipelines.
    """
    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    self.create_disposition = BigQueryDisposition.validate_create(
        create_disposition)
    self.write_disposition = BigQueryDisposition.validate_write(
        write_disposition)
    self.schema = schema
    self.batch_size = batch_size
    self.kms_key = kms_key
    self.test_client = test_client
    self.gs_location = gs_location
    self.max_file_size = max_file_size
    self.max_files_per_bundle = max_files_per_bundle
    self.method = method or WriteToBigQuery.Method.DEFAULT
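
For context, here is a minimal usage sketch that is not part of the original listing: it combines a callable destination with the documented dispositions and the FILE_LOADS method. The project, dataset, table and bucket names and the 'language' field are placeholders.

# Hedged sketch; assumes apache_beam[gcp] is installed, all names are placeholders.
import apache_beam as beam
from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery


def route_row(row):
  # A callable destination receives one element and returns a table string.
  return 'my_project:my_dataset.events_%s' % row['language']


with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([{'name': 'beam', 'language': 'py'}])
      | WriteToBigQuery(
          table=route_row,
          schema='name:STRING,language:STRING',
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_APPEND,
          method=WriteToBigQuery.Method.FILE_LOADS,
          gs_location='gs://my-bucket/temp'))  # placeholder GCS location
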
Ejemplo n.º 38
0
  def __init__(self, table, dataset=None, project=None, schema=None,
               create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=BigQueryDisposition.WRITE_EMPTY,
               validate=False, coder=None, kms_key=None):
    """Initialize a BigQuerySink.

    Args:
      table (str): The ID of the table. The ID must contain only letters
        ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
        **dataset** argument is :data:`None` then the table argument must
        contain the entire table reference specified as: ``'DATASET.TABLE'`` or
        ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      schema (str): The schema to be used if the BigQuery table to write has
        to be created. This can be either specified as a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or a single string of the form
        ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
        separated list of fields. Here ``'type'`` should specify the BigQuery
        type of the field. Single string based schemas do not support nested
        fields, repeated fields, or specifying a BigQuery mode for fields (mode
        will always be set to ``'NULLABLE'``).
      create_disposition (BigQueryDisposition): A string describing what
        happens if the table does not exist. Possible values are:

          * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if it does
            not exist.
          * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if it
            does not exist.

      write_disposition (BigQueryDisposition): A string describing what
        happens if the table already has some data. Possible values are:

          * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
          * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
          * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the
            table is not empty.

      validate (bool): If :data:`True`, various checks will be done when sink
        gets initialized (e.g., is table present given the disposition
        arguments?). This should be :data:`True` for most scenarios in order to
        catch errors as early as possible (pipeline construction instead of
        pipeline execution). It should be :data:`False` if the table is created
        during pipeline execution by a previous step.
      coder (~apache_beam.coders.coders.Coder): The coder for the
        table rows if serialized to disk. If :data:`None`, then the default
        coder is :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`,
        which will interpret every element written to the sink as a dictionary
        that will be JSON serialized as a line in a file. This argument needs a
        value only in special cases when writing table rows as dictionaries is
        not desirable.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.

    Raises:
      ~exceptions.TypeError: if the schema argument is not a :class:`str` or a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object.
      ~exceptions.ValueError: if the table reference as a string does not
        match the expected format.
    """
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apitools.base import py  # pylint: disable=unused-variable
    except ImportError:
      raise ImportError(
          'Google Cloud IO not available, '
          'please install apache_beam[gcp]')

    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    # Transform the table schema into a bigquery.TableSchema instance.
    if isinstance(schema, (str, unicode)):
      # TODO(silviuc): Should add a regex-based validation of the format.
      table_schema = bigquery.TableSchema()
      schema_list = [s.strip(' ') for s in schema.split(',')]
      for field_and_type in schema_list:
        field_name, field_type = field_and_type.split(':')
        field_schema = bigquery.TableFieldSchema()
        field_schema.name = field_name
        field_schema.type = field_type
        field_schema.mode = 'NULLABLE'
        table_schema.fields.append(field_schema)
      self.table_schema = table_schema
    elif schema is None:
      # TODO(silviuc): Should check that table exists if no schema specified.
      self.table_schema = schema
    elif isinstance(schema, bigquery.TableSchema):
      self.table_schema = schema
    else:
      raise TypeError('Unexpected schema argument: %s.' % schema)

    self.create_disposition = BigQueryDisposition.validate_create(
        create_disposition)
    self.write_disposition = BigQueryDisposition.validate_write(
        write_disposition)
    self.validate = validate
    self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
    self.kms_key = kms_key
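
As a construction-only sketch (placeholder names, not from the original source), the sink above is typically wrapped in a beam.io.Write; note the WRITE_EMPTY default in this signature.

# Hedged sketch; assumes apache_beam[gcp] is installed.
import apache_beam as beam

sink = beam.io.BigQuerySink(
    'my_project:my_dataset.my_table',
    schema='name:STRING,language:STRING',
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
    validate=False)
# Later in a pipeline: ... | beam.io.Write(sink)
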
Ejemplo n.º 39
0
  def __init__(self, table=None, dataset=None, project=None, query=None,
               validate=False, coder=None, use_standard_sql=False,
               flatten_results=True, kms_key=None):
    """Initialize a :class:`BigQuerySource`.

    Args:
      table (str): The ID of a BigQuery table. If specified all data of the
        table will be used as input of the current source. The ID must contain
        only letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores
        ``_``. If dataset and query arguments are :data:`None` then the table
        argument must contain the entire table reference specified as:
        ``'DATASET.TABLE'`` or ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument or a query is specified.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument or a query is specified.
      query (str): A query to be used instead of arguments table, dataset, and
        project.
      validate (bool): If :data:`True`, various checks will be done when source
        gets initialized (e.g., is table present?). This should be
        :data:`True` for most scenarios in order to catch errors as early as
        possible (pipeline construction instead of pipeline execution). It
        should be :data:`False` if the table is created during pipeline
        execution by a previous step.
      coder (~apache_beam.coders.coders.Coder): The coder for the table
        rows if serialized to disk. If :data:`None`, then the default coder is
        :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`,
        which will interpret every line in a file as a JSON serialized
        dictionary. This argument needs a value only in special cases when
        returning table rows as dictionaries is not desirable.
      use_standard_sql (bool): Specifies whether to use BigQuery's standard SQL
        dialect for this query. The default value is :data:`False`.
        If set to :data:`True`, the query will use BigQuery's updated SQL
        dialect with improved standards compliance.
        This parameter is ignored for table inputs.
      flatten_results (bool): Flattens all nested and repeated fields in the
        query results. The default value is :data:`True`.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.

    Raises:
      ~exceptions.ValueError: if any of the following is true:

        1) the table reference as a string does not match the expected format
        2) neither a table nor a query is specified
        3) both a table and a query are specified.
    """

    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apitools.base import py  # pylint: disable=unused-variable
    except ImportError:
      raise ImportError(
          'Google Cloud IO not available, '
          'please install apache_beam[gcp]')

    if table is not None and query is not None:
      raise ValueError('Both a BigQuery table and a query were specified.'
                       ' Please specify only one of these.')
    elif table is None and query is None:
      raise ValueError('A BigQuery table or a query must be specified')
    elif table is not None:
      self.table_reference = bigquery_tools.parse_table_reference(
          table, dataset, project)
      self.query = None
      self.use_legacy_sql = True
    else:
      self.query = query
      # TODO(BEAM-1082): Change the internal flag to be standard_sql
      self.use_legacy_sql = not use_standard_sql
      self.table_reference = None

    self.validate = validate
    self.flatten_results = flatten_results
    self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
    self.kms_key = kms_key
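
And a matching read-side sketch (placeholder query, not from the original listing): with use_standard_sql=True the query runs under standard SQL, and each result row is delivered as a dictionary by the default RowAsDictJsonCoder.

# Hedged sketch; assumes apache_beam[gcp] is installed and a reachable project.
import apache_beam as beam

with beam.Pipeline() as p:
  rows = (
      p
      | beam.io.Read(
          beam.io.BigQuerySource(
              query='SELECT name, language FROM `my_project.my_dataset.tbl`',
              use_standard_sql=True,
              flatten_results=True)))
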
Ejemplo n.º 40
0
  def test_calling_with_value_provider(self):
    value_provider_ref = StaticValueProvider(str, 'test_dataset.test_table')
    parsed_ref = parse_table_reference(value_provider_ref)
    self.assertIs(value_provider_ref, parsed_ref)
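
The pass-through shown above means a ValueProvider can stand in for the table name and only be read out when the write actually runs. A tiny hedged sketch of the same behaviour; import paths assume a Beam version contemporary with this code.

# Hedged sketch of the ValueProvider pass-through.
from apache_beam.io.gcp.bigquery_tools import parse_table_reference
from apache_beam.options.value_provider import StaticValueProvider

table_vp = StaticValueProvider(str, 'test_dataset.test_table')
assert parse_table_reference(table_vp) is table_vp  # returned unchanged
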
Ejemplo n.º 41
0
    def __init__(self,
                 table=None,
                 dataset=None,
                 project=None,
                 query=None,
                 validate=False,
                 coder=None,
                 use_standard_sql=False,
                 flatten_results=True,
                 kms_key=None):
        """Initialize a :class:`BigQuerySource`.

    Args:
      table (str): The ID of a BigQuery table. If specified all data of the
        table will be used as input of the current source. The ID must contain
        only letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores
        ``_``. If dataset and query arguments are :data:`None` then the table
        argument must contain the entire table reference specified as:
        ``'DATASET.TABLE'`` or ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument or a query is specified.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument or a query is specified.
      query (str): A query to be used instead of arguments table, dataset, and
        project.
      validate (bool): If :data:`True`, various checks will be done when source
        gets initialized (e.g., is table present?). This should be
        :data:`True` for most scenarios in order to catch errors as early as
        possible (pipeline construction instead of pipeline execution). It
        should be :data:`False` if the table is created during pipeline
        execution by a previous step.
      coder (~apache_beam.coders.coders.Coder): The coder for the table
        rows if serialized to disk. If :data:`None`, then the default coder is
        :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`,
        which will interpret every line in a file as a JSON serialized
        dictionary. This argument needs a value only in special cases when
        returning table rows as dictionaries is not desirable.
      use_standard_sql (bool): Specifies whether to use BigQuery's standard SQL
        dialect for this query. The default value is :data:`False`.
        If set to :data:`True`, the query will use BigQuery's updated SQL
        dialect with improved standards compliance.
        This parameter is ignored for table inputs.
      flatten_results (bool): Flattens all nested and repeated fields in the
        query results. The default value is :data:`True`.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.

    Raises:
      ~exceptions.ValueError: if any of the following is true:

        1) the table reference as a string does not match the expected format
        2) neither a table nor a query is specified
        3) both a table and a query are specified.
    """

        # Import here to avoid adding the dependency for local running scenarios.
        try:
            # pylint: disable=wrong-import-order, wrong-import-position
            from apitools.base import py  # pylint: disable=unused-variable
        except ImportError:
            raise ImportError('Google Cloud IO not available, '
                              'please install apache_beam[gcp]')

        if table is not None and query is not None:
            raise ValueError(
                'Both a BigQuery table and a query were specified.'
                ' Please specify only one of these.')
        elif table is None and query is None:
            raise ValueError('A BigQuery table or a query must be specified')
        elif table is not None:
            self.table_reference = bigquery_tools.parse_table_reference(
                table, dataset, project)
            self.query = None
            self.use_legacy_sql = True
        else:
            self.query = query
            # TODO(BEAM-1082): Change the internal flag to be standard_sql
            self.use_legacy_sql = not use_standard_sql
            self.table_reference = None

        self.validate = validate
        self.flatten_results = flatten_results
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
        self.kms_key = kms_key
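
Finally, a small hedged sketch (placeholder names) of the argument validation implemented above: supplying both a table and a query is rejected when the source is constructed.

# Expected to raise ValueError per the checks in __init__ above;
# assumes apache_beam[gcp] is installed.
import apache_beam as beam

try:
  beam.io.BigQuerySource(table='my_dataset.my_table', query='SELECT 1')
except ValueError as exc:
  print('rejected as expected: %s' % exc)
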