def __init__(self,
             destination,
             schema=None,
             gs_location=None,
             create_disposition=None,
             write_disposition=None,
             coder=None,
             max_file_size=None,
             max_files_per_bundle=None,
             test_client=None):
  """Initialize state for writing to BigQuery via temporary files and load jobs.

  Args:
    destination: A table reference/string, or a callable that maps each
      element to its destination table (dynamic destinations).
    schema: Schema of the destination table(s), if known; stored as-is.
    gs_location: GCS location for temporary files (stored on
      ``_input_gs_location``).
    create_disposition: BigQuery create disposition passed to load jobs.
    write_disposition: BigQuery write disposition passed to load jobs.
    coder: Coder used to serialize rows to files. Defaults to
      ``bigquery_tools.RowAsDictJsonCoder``.
    max_file_size: Maximum size in bytes for each temporary file; falls back
      to ``_DEFAULT_MAX_FILE_SIZE``.
    max_files_per_bundle: Maximum number of files kept open per bundle; falls
      back to ``_DEFAULT_MAX_WRITERS_PER_BUNDLE``.
    test_client: BigQuery client override, used in tests.
  """
  self.destination = destination
  self.create_disposition = create_disposition
  self.write_disposition = write_disposition
  self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
  self.max_files_per_bundle = (
      max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
  self._input_gs_location = gs_location
  self.test_client = test_client
  self.schema = schema
  self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

  # If we have multiple destinations, then we will have multiple load jobs,
  # thus we will need temporary tables for atomicity.
  # If the destination is a single one, we assume that we will have only one
  # job to run - and thus we avoid using temporary tables.
  # callable() already yields a bool, so no conditional expression is needed.
  self.temp_tables = callable(destination)
def __init__(self,
             destination,
             schema=None,
             custom_gcs_temp_location=None,
             create_disposition=None,
             write_disposition=None,
             triggering_frequency=None,
             coder=None,
             max_file_size=None,
             max_files_per_bundle=None,
             max_partition_size=None,
             max_files_per_partition=None,
             additional_bq_parameters=None,
             table_side_inputs=None,
             schema_side_inputs=None,
             test_client=None,
             validate=True,
             is_streaming_pipeline=False):
  """Initialize state for a (possibly multi-destination) BigQuery file load.

  Args:
    destination: A table reference/string, or a callable mapping each element
      to its destination table (dynamic destinations).
    schema: Schema of the destination table(s), if known; stored as-is.
    custom_gcs_temp_location: GCS path for temporary files; a plain string
      (or None) is wrapped in a ``StaticValueProvider`` so downstream code
      can treat it uniformly as a ValueProvider.
    create_disposition: BigQuery create disposition passed to load jobs.
    write_disposition: BigQuery write disposition passed to load jobs.
    triggering_frequency: Stored for use when loads are triggered
      periodically (streaming); semantics defined by the caller.
    coder: Coder used to serialize rows to files. Defaults to
      ``bigquery_tools.RowAsDictJsonCoder``.
    max_file_size: Maximum bytes per temporary file; falls back to
      ``_DEFAULT_MAX_FILE_SIZE``.
    max_files_per_bundle: Maximum files open per bundle; falls back to
      ``_DEFAULT_MAX_WRITERS_PER_BUNDLE``.
    max_partition_size: Maximum total bytes per load-job partition; falls
      back to ``_MAXIMUM_LOAD_SIZE``.
    max_files_per_partition: Maximum source URIs per load job; falls back to
      ``_MAXIMUM_SOURCE_URIS``.
    additional_bq_parameters: Extra load-job parameters (dict or callable);
      defaults to an empty dict.
    table_side_inputs: Side inputs forwarded to the destination callable.
    schema_side_inputs: Side inputs forwarded to the schema callable.
    test_client: BigQuery client override, used in tests.
    validate: If True, run ``self.verify()`` at construction time.
    is_streaming_pipeline: Whether this transform runs in a streaming
      pipeline; stored for later branching.

  Raises:
    ValueError: If ``custom_gcs_temp_location`` is neither a str, None, nor
      a ``ValueProvider``.
  """
  self.destination = destination
  self.create_disposition = create_disposition
  self.write_disposition = write_disposition
  self.triggering_frequency = triggering_frequency
  self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
  self.max_files_per_bundle = (
      max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
  self.max_partition_size = max_partition_size or _MAXIMUM_LOAD_SIZE
  self.max_files_per_partition = (
      max_files_per_partition or _MAXIMUM_SOURCE_URIS)
  if (isinstance(custom_gcs_temp_location, str)
      or custom_gcs_temp_location is None):
    # Normalize plain strings (and the absence of a location) to a
    # ValueProvider so later code has a single access path.
    self._custom_gcs_temp_location = vp.StaticValueProvider(
        str, custom_gcs_temp_location or '')
  elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
    self._custom_gcs_temp_location = custom_gcs_temp_location
  else:
    raise ValueError(
        'custom_gcs_temp_location must be str or ValueProvider')

  self.test_client = test_client
  self.schema = schema
  self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

  # If we have multiple destinations, then we will have multiple load jobs,
  # thus we will need temporary tables for atomicity.
  # callable() already yields a bool, so no conditional expression is needed.
  self.dynamic_destinations = callable(destination)

  self.additional_bq_parameters = additional_bq_parameters or {}
  self.table_side_inputs = table_side_inputs or ()
  self.schema_side_inputs = schema_side_inputs or ()
  self.is_streaming_pipeline = is_streaming_pipeline
  self._validate = validate
  if self._validate:
    self.verify()
def __init__(self,
             destination,
             schema=None,
             custom_gcs_temp_location=None,
             create_disposition=None,
             write_disposition=None,
             coder=None,
             max_file_size=None,
             max_files_per_bundle=None,
             additional_bq_parameters=None,
             table_side_inputs=None,
             schema_side_inputs=None,
             test_client=None,
             validate=True):
  """Initialize state for writing to BigQuery with temporary files and loads.

  Args:
    destination: A table reference/string, or a callable mapping each element
      to its destination table (dynamic destinations).
    schema: Schema of the destination table(s), if known; stored as-is.
    custom_gcs_temp_location: GCS path for temporary files; a plain string
      (or None) is wrapped in a ``StaticValueProvider``.
    create_disposition: BigQuery create disposition passed to load jobs.
    write_disposition: BigQuery write disposition passed to load jobs.
    coder: Coder used to serialize rows to files. Defaults to
      ``bigquery_tools.RowAsDictJsonCoder``.
    max_file_size: Maximum bytes per temporary file; falls back to
      ``_DEFAULT_MAX_FILE_SIZE``.
    max_files_per_bundle: Maximum files open per bundle; falls back to
      ``_DEFAULT_MAX_WRITERS_PER_BUNDLE``.
    additional_bq_parameters: Extra load-job parameters; defaults to {}.
    table_side_inputs: Side inputs forwarded to the destination callable.
    schema_side_inputs: Side inputs forwarded to the schema callable.
    test_client: BigQuery client override, used in tests.
    validate: If True, run ``self.verify()`` at construction time.

  Raises:
    ValueError: If ``custom_gcs_temp_location`` is neither a str, None, nor
      a ``ValueProvider``.
  """
  self.destination = destination
  self.create_disposition = create_disposition
  self.write_disposition = write_disposition
  self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
  self.max_files_per_bundle = (
      max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
  if (isinstance(custom_gcs_temp_location, str)
      or custom_gcs_temp_location is None):
    # Normalize plain strings (and the absence of a location) to a
    # ValueProvider so later code has a single access path.
    self._custom_gcs_temp_location = vp.StaticValueProvider(
        str, custom_gcs_temp_location or '')
  elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
    self._custom_gcs_temp_location = custom_gcs_temp_location
  else:
    raise ValueError(
        'custom_gcs_temp_location must be str or ValueProvider')

  self.test_client = test_client
  self.schema = schema
  self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

  # If we have multiple destinations, then we will have multiple load jobs,
  # thus we will need temporary tables for atomicity.
  # If the destination is a single one, we assume that we will have only one
  # job to run - and thus we avoid using temporary tables.
  # callable() already yields a bool, so no conditional expression is needed.
  self.temp_tables = callable(destination)

  self.additional_bq_parameters = additional_bq_parameters or {}
  self.table_side_inputs = table_side_inputs or ()
  self.schema_side_inputs = schema_side_inputs or ()
  self._validate = validate
  if self._validate:
    self.verify()
def __init__(self,
             max_files_per_bundle=_DEFAULT_MAX_WRITERS_PER_BUNDLE,
             max_file_size=_DEFAULT_MAX_FILE_SIZE,
             coder=None):
  """Initialize a :class:`WriteRecordsToFile`.

  Args:
    max_files_per_bundle (int): The maximum number of files that can be kept
      open during execution of this step in a worker. This is to avoid over-
      whelming the worker memory.
    max_file_size (int): The maximum size in bytes for a file to be used in
      an export job.
    coder (~apache_beam.coders.coders.Coder): The coder used to serialize
      each row before it is written to a file. If :data:`None`, defaults to
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`.
  """
  self.max_files_per_bundle = max_files_per_bundle
  self.max_file_size = max_file_size
  self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
def __init__(self, max_file_size=_DEFAULT_MAX_FILE_SIZE, coder=None):
  """Initialize the step's file-writing configuration.

  Args:
    max_file_size: Maximum size, in bytes, of each file produced.
    coder: Coder used to serialize rows to file; any falsy value is replaced
      by :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`.
  """
  self.max_file_size = max_file_size
  self.coder = coder if coder else bigquery_tools.RowAsDictJsonCoder()
def __init__(self):
  # Rows handled by this step are serialized as JSON dictionaries, one per
  # line, using the default coder.
  self.coder = bigquery_tools.RowAsDictJsonCoder()
def __init__(self, table, dataset=None, project=None, schema=None,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY,
             validate=False, coder=None, kms_key=None):
  """Initialize a BigQuerySink.

  Args:
    table (str): The ID of the table. The ID must contain only letters
      ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
      **dataset** argument is :data:`None` then the table argument must
      contain the entire table reference specified as: ``'DATASET.TABLE'``
      or ``'PROJECT:DATASET.TABLE'``.
    dataset (str): The ID of the dataset containing this table or
      :data:`None` if the table reference is specified entirely by the table
      argument.
    project (str): The ID of the project containing this table or
      :data:`None` if the table reference is specified entirely by the table
      argument.
    schema (str): The schema to be used if the BigQuery table to write has
      to be created. This can be either specified as a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or a single string  of the form
      ``'field1:type1,field2:type2,field3:type3'`` that defines a comma
      separated list of fields. Here ``'type'`` should specify the BigQuery
      type of the field. Single string based schemas do not support nested
      fields, repeated fields, or specifying a BigQuery mode for fields
      (mode will always be set to ``'NULLABLE'``).
    create_disposition (BigQueryDisposition): A string describing what
      happens if the table does not exist. Possible values are:

      * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if does not
        exist.
      * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if does not
        exist.

    write_disposition (BigQueryDisposition): A string describing what
      happens if the table has already some data. Possible values are:

      * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
      * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
      * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if table not
        empty.

    validate (bool): If :data:`True`, various checks will be done when sink
      gets initialized (e.g., is table present given the disposition
      arguments?). This should be :data:`True` for most scenarios in order
      to catch errors as early as possible (pipeline construction instead of
      pipeline execution). It should be :data:`False` if the table is
      created during pipeline execution by a previous step.
    coder (~apache_beam.coders.coders.Coder): The coder for the table rows
      if serialized to disk. If :data:`None`, then the default coder is
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`, which
      will interpret every element written to the sink as a dictionary that
      will be JSON serialized as a line in a file. This argument needs a
      value only in special cases when writing table rows as dictionaries is
      not desirable.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.

  Raises:
    ~exceptions.TypeError: if the schema argument is not a :class:`str` or a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object.
    ~exceptions.ValueError: if the table reference as a string does not
      match the expected format.
  """
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apitools.base import py  # pylint: disable=unused-variable
  except ImportError:
    raise ImportError('Google Cloud IO not available, '
                      'please install apache_beam[gcp]')

  self.table_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)
  # Transform the table schema into a bigquery.TableSchema instance.
  # NOTE(review): 'unicode' is a Python 2 name; presumably aliased for
  # Python 3 elsewhere in this module — confirm.
  if isinstance(schema, (str, unicode)):
    # TODO(silviuc): Should add a regex-based validation of the format.
    table_schema = bigquery.TableSchema()
    schema_list = [s.strip(' ') for s in schema.split(',')]
    for field_and_type in schema_list:
      field_name, field_type = field_and_type.split(':')
      field_schema = bigquery.TableFieldSchema()
      field_schema.name = field_name
      field_schema.type = field_type
      # Single-string schemas cannot express a mode, so every field is
      # NULLABLE (see the docstring above).
      field_schema.mode = 'NULLABLE'
      table_schema.fields.append(field_schema)
    self.table_schema = table_schema
  elif schema is None:
    # TODO(silviuc): Should check that table exists if no schema specified.
    self.table_schema = schema
  elif isinstance(schema, bigquery.TableSchema):
    self.table_schema = schema
  else:
    raise TypeError('Unexpected schema argument: %s.' % schema)

  # Dispositions are validated eagerly so bad values fail at pipeline
  # construction rather than at execution.
  self.create_disposition = BigQueryDisposition.validate_create(
      create_disposition)
  self.write_disposition = BigQueryDisposition.validate_write(
      write_disposition)
  self.validate = validate
  self.coder = coder or bigquery_tools.RowAsDictJsonCoder()
  self.kms_key = kms_key
def __init__(self, table=None, dataset=None, project=None, query=None,
             validate=False, coder=None, use_standard_sql=False,
             flatten_results=True, kms_key=None):
  """Initialize a :class:`BigQuerySource`.

  Exactly one of ``table`` or ``query`` must be supplied.

  Args:
    table (str): The ID of a BigQuery table to read in full. May include the
      dataset/project as ``'DATASET.TABLE'`` or ``'PROJECT:DATASET.TABLE'``,
      in which case ``dataset``/``project`` should be :data:`None`.
    dataset (str): The ID of the dataset containing the table, or
      :data:`None` when it is part of ``table`` or when a query is given.
    project (str): The ID of the project containing the table, or
      :data:`None` when it is part of ``table`` or when a query is given.
    query (str): A query to run instead of reading a table directly.
    validate (bool): If :data:`True`, checks (e.g. table existence) are run
      at pipeline-construction time; use :data:`False` when the table is
      created by an earlier pipeline step.
    coder (~apache_beam.coders.coders.Coder): Coder for the table rows when
      serialized to disk; defaults to
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`, which
      treats each file line as a JSON-serialized dictionary.
    use_standard_sql (bool): Use BigQuery's standard SQL dialect for the
      query (ignored for table inputs). Defaults to :data:`False`.
    flatten_results (bool): Flatten nested and repeated fields in query
      results. Defaults to :data:`True`.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.

  Raises:
    ~exceptions.ValueError: if the table reference string is malformed, if
      neither a table nor a query is given, or if both are given.
  """
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apitools.base import py  # pylint: disable=unused-variable
  except ImportError:
    raise ImportError('Google Cloud IO not available, '
                      'please install apache_beam[gcp]')

  # Guard clauses: table and query are mutually exclusive, and one of the
  # two is required.
  if table is not None and query is not None:
    raise ValueError(
        'Both a BigQuery table and a query were specified.'
        ' Please specify only one of these.')
  if table is None and query is None:
    raise ValueError('A BigQuery table or a query must be specified')

  if query is None:
    # Table input: always read with legacy SQL semantics.
    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    self.query = None
    self.use_legacy_sql = True
  else:
    self.query = query
    # TODO(BEAM-1082): Change the internal flag to be standard_sql
    self.use_legacy_sql = not use_standard_sql
    self.table_reference = None

  self.validate = validate
  self.flatten_results = flatten_results
  self.coder = coder if coder else bigquery_tools.RowAsDictJsonCoder()
  self.kms_key = kms_key
def RowAsDictJsonCoder(*args, **kwargs):
  """Backwards-compatible alias that delegates to
  :class:`bigquery_tools.RowAsDictJsonCoder`."""
  coder_cls = bigquery_tools.RowAsDictJsonCoder
  return coder_cls(*args, **kwargs)
def RowAsDictJsonCoder(*args, **kwargs):
  """Deprecated alias for :class:`bigquery_tools.RowAsDictJsonCoder`.

  Emits a deprecation warning and delegates construction to the
  implementation in ``bigquery_tools``.
  """
  import warnings
  # stacklevel=2 attributes the warning to the caller's line — the place
  # that actually needs to migrate — instead of to this shim.
  warnings.warn("This class is deprecated and will be permanently moved "
                "to the bigquery_tools module in a future version of beam",
                stacklevel=2)
  return bigquery_tools.RowAsDictJsonCoder(*args, **kwargs)