Example #1
    def __init__(self,
                 destination,
                 schema=None,
                 gs_location=None,
                 create_disposition=None,
                 write_disposition=None,
                 coder=None,
                 max_file_size=None,
                 max_files_per_bundle=None,
                 test_client=None):
        self.destination = destination
        self.create_disposition = create_disposition
        self.write_disposition = write_disposition
        self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
        self.max_files_per_bundle = (max_files_per_bundle
                                     or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
        self._input_gs_location = gs_location
        self.test_client = test_client
        self.schema = schema
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

        # If we have multiple destinations, then we will have multiple load jobs,
        # thus we will need temporary tables for atomicity.
        # If the destination is a single table, we assume that there will be only
        # one load job to run, and thus we avoid using temporary tables.
        self.temp_tables = True if callable(destination) else False
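
The `temp_tables` flag above hinges solely on whether `destination` is callable. A minimal sketch of that check, using a hypothetical per-row routing function that is not part of the original code:

def route_row(row):
    # Hypothetical dynamic destination: pick a table per element.
    return 'my_dataset.events_%s' % row['shard']

print(callable(route_row))            # True  -> multiple load jobs, temp tables used
print(callable('my_dataset.events'))  # False -> single load job, no temp tables
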
Example #2
    def __init__(self,
                 destination,
                 schema=None,
                 custom_gcs_temp_location=None,
                 create_disposition=None,
                 write_disposition=None,
                 triggering_frequency=None,
                 coder=None,
                 max_file_size=None,
                 max_files_per_bundle=None,
                 max_partition_size=None,
                 max_files_per_partition=None,
                 additional_bq_parameters=None,
                 table_side_inputs=None,
                 schema_side_inputs=None,
                 test_client=None,
                 validate=True,
                 is_streaming_pipeline=False):
        self.destination = destination
        self.create_disposition = create_disposition
        self.write_disposition = write_disposition
        self.triggering_frequency = triggering_frequency
        self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
        self.max_files_per_bundle = (max_files_per_bundle
                                     or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
        self.max_partition_size = max_partition_size or _MAXIMUM_LOAD_SIZE
        self.max_files_per_partition = (max_files_per_partition
                                        or _MAXIMUM_SOURCE_URIS)
        if (isinstance(custom_gcs_temp_location, str)
                or custom_gcs_temp_location is None):
            self._custom_gcs_temp_location = vp.StaticValueProvider(
                str, custom_gcs_temp_location or '')
        elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
            self._custom_gcs_temp_location = custom_gcs_temp_location
        else:
            raise ValueError(
                'custom_gcs_temp_location must be str or ValueProvider')

        self.test_client = test_client
        self.schema = schema
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

        # If we have multiple destinations, then we will have multiple load jobs,
        # thus we will need temporary tables for atomicity.
        self.dynamic_destinations = True if callable(destination) else False

        self.additional_bq_parameters = additional_bq_parameters or {}
        self.table_side_inputs = table_side_inputs or ()
        self.schema_side_inputs = schema_side_inputs or ()

        self.is_streaming_pipeline = is_streaming_pipeline
        self._validate = validate
        if self._validate:
            self.verify()
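
The `custom_gcs_temp_location` branch above normalizes a plain string (or `None`) into a `StaticValueProvider`, so later stages can call `.get()` uniformly. A minimal sketch, assuming `vp` is `apache_beam.options.value_provider` and using a made-up bucket path:

from apache_beam.options import value_provider as vp

loc = vp.StaticValueProvider(str, 'gs://example-bucket/bq-temp')  # hypothetical path
assert isinstance(loc, vp.ValueProvider)
print(loc.get())  # gs://example-bucket/bq-temp
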
Example #3
    def __init__(self,
                 destination,
                 schema=None,
                 custom_gcs_temp_location=None,
                 create_disposition=None,
                 write_disposition=None,
                 coder=None,
                 max_file_size=None,
                 max_files_per_bundle=None,
                 additional_bq_parameters=None,
                 table_side_inputs=None,
                 schema_side_inputs=None,
                 test_client=None,
                 validate=True):
        self.destination = destination
        self.create_disposition = create_disposition
        self.write_disposition = write_disposition
        self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
        self.max_files_per_bundle = (max_files_per_bundle
                                     or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
        if (isinstance(custom_gcs_temp_location, str)
                or custom_gcs_temp_location is None):
            self._custom_gcs_temp_location = vp.StaticValueProvider(
                str, custom_gcs_temp_location or '')
        elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
            self._custom_gcs_temp_location = custom_gcs_temp_location
        else:
            raise ValueError(
                'custom_gcs_temp_location must be str or ValueProvider')

        self.test_client = test_client
        self.schema = schema
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

        # If we have multiple destinations, then we will have multiple load jobs,
        # thus we will need temporary tables for atomicity.
        # If the destination is a single table, we assume that there will be only
        # one load job to run, and thus we avoid using temporary tables.
        self.temp_tables = True if callable(destination) else False

        self.additional_bq_parameters = additional_bq_parameters or {}
        self.table_side_inputs = table_side_inputs or ()
        self.schema_side_inputs = schema_side_inputs or ()

        self._validate = validate
        if self._validate:
            self.verify()
Example #4
    def __init__(self,
                 max_files_per_bundle=_DEFAULT_MAX_WRITERS_PER_BUNDLE,
                 max_file_size=_DEFAULT_MAX_FILE_SIZE,
                 coder=None):
        """Initialize a :class:`WriteRecordsToFile`.

    Args:
      max_files_per_bundle (int): The maximum number of files that can be kept
        open during execution of this step in a worker. This is to avoid
        overwhelming the worker memory.
      max_file_size (int): The maximum size in bytes for a file to be used in
        an export job.

    """
        self.max_files_per_bundle = max_files_per_bundle
        self.max_file_size = max_file_size
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
Example #5
    def __init__(self, max_file_size=_DEFAULT_MAX_FILE_SIZE, coder=None):
        self.max_file_size = max_file_size
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
Example #6
    def __init__(self):
        self.coder = bigquery_tools.RowAsDictJsonCoder()
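
Every constructor on this page falls back to `bigquery_tools.RowAsDictJsonCoder` when no coder is given; it serializes a row dict to JSON bytes and back. A minimal round-trip sketch, assuming `apache_beam[gcp]` is installed:

from apache_beam.io.gcp import bigquery_tools

coder = bigquery_tools.RowAsDictJsonCoder()
encoded = coder.encode({'name': 'beam', 'count': 3})  # JSON-encoded bytes for one row
print(coder.decode(encoded))                          # back to the original dict
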
Example #7
    def __init__(self,
                 table,
                 dataset=None,
                 project=None,
                 schema=None,
                 create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=BigQueryDisposition.WRITE_EMPTY,
                 validate=False,
                 coder=None,
                 kms_key=None):
        """Initialize a BigQuerySink.

    Args:
      table (str): The ID of the table. The ID must contain only letters
        ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
        **dataset** argument is :data:`None` then the table argument must
        contain the entire table reference specified as: ``'DATASET.TABLE'`` or
        ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument.
      schema (str): The schema to be used if the BigQuery table to write has
        to be created. This can be either specified as a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or a single string of the form
        ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
        separated list of fields. Here ``'type'`` should specify the BigQuery
        type of the field. Single string based schemas do not support nested
        fields, repeated fields, or specifying a BigQuery mode for fields (mode
        will always be set to ``'NULLABLE'``).
      create_disposition (BigQueryDisposition): A string describing what
        happens if the table does not exist. Possible values are:

          * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create the table if it
            does not exist.
          * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if the table
            does not exist.

      write_disposition (BigQueryDisposition): A string describing what
        happens if the table already contains data. Possible values are:

          * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
          * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
          * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the table
            is not empty.

      validate (bool): If :data:`True`, various checks will be done when sink
        gets initialized (e.g., is table present given the disposition
        arguments?). This should be :data:`True` for most scenarios in order to
        catch errors as early as possible (pipeline construction instead of
        pipeline execution). It should be :data:`False` if the table is created
        during pipeline execution by a previous step.
      coder (~apache_beam.coders.coders.Coder): The coder for the
        table rows if serialized to disk. If :data:`None`, then the default
        coder is :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`,
        which will interpret every element written to the sink as a dictionary
        that will be JSON serialized as a line in a file. This argument needs a
        value only in special cases when writing table rows as dictionaries is
        not desirable.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.

    Raises:
      ~exceptions.TypeError: if the schema argument is not a :class:`str` or a
        :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object.
      ~exceptions.ValueError: if the table reference as a string does not
        match the expected format.
    """
        # Import here to avoid adding the dependency for local running scenarios.
        try:
            # pylint: disable=wrong-import-order, wrong-import-position
            from apitools.base import py  # pylint: disable=unused-variable
        except ImportError:
            raise ImportError('Google Cloud IO not available, '
                              'please install apache_beam[gcp]')

        self.table_reference = bigquery_tools.parse_table_reference(
            table, dataset, project)
        # Transform the table schema into a bigquery.TableSchema instance.
        if isinstance(schema, (str, unicode)):
            # TODO(silviuc): Should add a regex-based validation of the format.
            table_schema = bigquery.TableSchema()
            schema_list = [s.strip(' ') for s in schema.split(',')]
            for field_and_type in schema_list:
                field_name, field_type = field_and_type.split(':')
                field_schema = bigquery.TableFieldSchema()
                field_schema.name = field_name
                field_schema.type = field_type
                field_schema.mode = 'NULLABLE'
                table_schema.fields.append(field_schema)
            self.table_schema = table_schema
        elif schema is None:
            # TODO(silviuc): Should check that table exists if no schema specified.
            self.table_schema = schema
        elif isinstance(schema, bigquery.TableSchema):
            self.table_schema = schema
        else:
            raise TypeError('Unexpected schema argument: %s.' % schema)

        self.create_disposition = BigQueryDisposition.validate_create(
            create_disposition)
        self.write_disposition = BigQueryDisposition.validate_write(
            write_disposition)
        self.validate = validate
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
        self.kms_key = kms_key
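
For context, a hedged construction of this sink using the comma-separated schema form parsed above; the project, dataset, table, and field names are placeholders, and the import path assumes the module these examples come from:

from apache_beam.io.gcp.bigquery import BigQuerySink, BigQueryDisposition

sink = BigQuerySink(
    'my-project:my_dataset.my_table',      # placeholder table reference
    schema='name:STRING,score:INTEGER',    # parsed into NULLABLE TableFieldSchemas
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_APPEND)
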
Example #8
    def __init__(self,
                 table=None,
                 dataset=None,
                 project=None,
                 query=None,
                 validate=False,
                 coder=None,
                 use_standard_sql=False,
                 flatten_results=True,
                 kms_key=None):
        """Initialize a :class:`BigQuerySource`.

    Args:
      table (str): The ID of a BigQuery table. If specified, all data in the
        table will be used as input for the current source. The ID must contain
        only letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores
        ``_``. If dataset and query arguments are :data:`None` then the table
        argument must contain the entire table reference specified as:
        ``'DATASET.TABLE'`` or ``'PROJECT:DATASET.TABLE'``.
      dataset (str): The ID of the dataset containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument or a query is specified.
      project (str): The ID of the project containing this table or
        :data:`None` if the table reference is specified entirely by the table
        argument or a query is specified.
      query (str): A query to be used instead of arguments table, dataset, and
        project.
      validate (bool): If :data:`True`, various checks will be done when source
        gets initialized (e.g., is table present?). This should be
        :data:`True` for most scenarios in order to catch errors as early as
        possible (pipeline construction instead of pipeline execution). It
        should be :data:`False` if the table is created during pipeline
        execution by a previous step.
      coder (~apache_beam.coders.coders.Coder): The coder for the table
        rows if serialized to disk. If :data:`None`, then the default coder is
        :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`,
        which will interpret every line in a file as a JSON serialized
        dictionary. This argument needs a value only in special cases when
        returning table rows as dictionaries is not desirable.
      use_standard_sql (bool): Specifies whether to use BigQuery's standard SQL
        dialect for this query. The default value is :data:`False`.
        If set to :data:`True`, the query will use BigQuery's updated SQL
        dialect with improved standards compliance.
        This parameter is ignored for table inputs.
      flatten_results (bool): Flattens all nested and repeated fields in the
        query results. The default value is :data:`True`.
      kms_key (str): Experimental. Optional Cloud KMS key name for use when
        creating new tables.

    Raises:
      ~exceptions.ValueError: if any of the following is true:

        1) the table reference as a string does not match the expected format
        2) neither a table nor a query is specified
        3) both a table and a query are specified.
    """

        # Import here to avoid adding the dependency for local running scenarios.
        try:
            # pylint: disable=wrong-import-order, wrong-import-position
            from apitools.base import py  # pylint: disable=unused-variable
        except ImportError:
            raise ImportError('Google Cloud IO not available, '
                              'please install apache_beam[gcp]')

        if table is not None and query is not None:
            raise ValueError(
                'Both a BigQuery table and a query were specified.'
                ' Please specify only one of these.')
        elif table is None and query is None:
            raise ValueError('A BigQuery table or a query must be specified')
        elif table is not None:
            self.table_reference = bigquery_tools.parse_table_reference(
                table, dataset, project)
            self.query = None
            self.use_legacy_sql = True
        else:
            self.query = query
            # TODO(BEAM-1082): Change the internal flag to be standard_sql
            self.use_legacy_sql = not use_standard_sql
            self.table_reference = None

        self.validate = validate
        self.flatten_results = flatten_results
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
        self.kms_key = kms_key
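
A hedged usage sketch consistent with the constructor above: either a table or a query may be given, never both; the query text and table names are placeholders:

from apache_beam.io.gcp.bigquery import BigQuerySource

source = BigQuerySource(
    query='SELECT name, score FROM `my_dataset.my_table`',  # placeholder query
    use_standard_sql=True)  # stored internally as use_legacy_sql=False
# Passing both table= and query= would raise the ValueError shown above.
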
Example #9
def RowAsDictJsonCoder(*args, **kwargs):
    return bigquery_tools.RowAsDictJsonCoder(*args, **kwargs)
Example #10
File: bigquery.py Project: wscheep/beam
def RowAsDictJsonCoder(*args, **kwargs):
    import warnings
    warnings.warn("This class is deprecated and will be permanently moved "
                  "to the bigquery_tools module in a future version of beam")
    return bigquery_tools.RowAsDictJsonCoder(*args, **kwargs)
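
Calling the shim above still returns the real coder from `bigquery_tools`, and `warnings.warn` fires with its default `UserWarning` category. A small sketch of observing both, assuming the shim is in scope as `RowAsDictJsonCoder`:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    coder = RowAsDictJsonCoder()         # the deprecated wrapper defined above
print([str(w.message) for w in caught])  # the deprecation message
print(type(coder).__name__)              # RowAsDictJsonCoder (bigquery_tools class)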