Example #1
def _make_load_job_config(source_format,  # type: str
                          write_disposition,  # type: str
                          schema=None,  # type: Optional[List[SchemaField]]
                          skip_leading_row=False,  # type: bool
                          ):
    """
    Makes and returns a LoadJobConfig according to the passed-in parameters.

    Args:
        source_format: Should be a recognized BigQuery source format. See
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.sourceFormat
        write_disposition: Should be a recognized BigQuery write disposition. See
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.writeDisposition
        schema: A list of SchemaFields. If unset, BigQuery will try to infer a schema.
        skip_leading_row: If True, the first row of the loaded file will be skipped.
    """
    job_config = LoadJobConfig()
    job_config.source_format = source_format
    job_config.write_disposition = write_disposition
    if schema:
        job_config.schema = schema
    else:
        job_config.autodetect = True
    if skip_leading_row:
        job_config.skip_leading_rows = 1
    return job_config
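As a hedged illustration only (the client, dataset, table, and file names below are placeholders, not part of the example), the helper above might be wired into a load call like this:

from google.cloud import bigquery

client = bigquery.Client()
table_ref = client.dataset("my_dataset").table("my_table")  # placeholder names
job_config = _make_load_job_config("CSV", "WRITE_TRUNCATE", skip_leading_row=True)
with open("data.csv", "rb") as source_fp:  # placeholder path
    client.load_table_from_file(source_fp, table_ref, job_config=job_config).result()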
Example #2
def update_or_create_table_from_csv(
    client: bigquery.Client,
    table_name: str,
    source_file: str,
    dataset: str,
    source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name,
                 [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(source_fp,
                                               destination=table_ref,
                                               job_config=job_config)

    # wait for job to complete
    load_job.result()

    LOGGER.info("updated config table: %s", table_ref.table_id)
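The example above relies on a get_table_schema helper that is not shown. A minimal sketch of such a helper, assuming the schema file is a JSON array in BigQuery's standard schema format (objects with "name", "type", and optionally "mode"), could be:

import json

from google.cloud.bigquery import SchemaField

def get_table_schema(schema_file):
    # Sketch only; the original helper's implementation may differ.
    # Parse each JSON field definition into a SchemaField.
    with open(schema_file) as fp:
        return [SchemaField.from_api_repr(field) for field in json.load(fp)]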
Example #3
    def import_csv(self,
                   bucket_name: str,
                   bucket_path: str,
                   dataset: str,
                   table: str,
                   sep: str = "\t") -> bool:
        logging.info(
            f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} ...")
        client = self._get_client()

        config = LoadJobConfig()
        config.autodetect = True
        config.field_delimiter = sep

        bucket_url = f"gs://{self.config.lake_path}/{bucket_path}"

        load_job = client.load_table_from_uri(bucket_url,
                                              f"{dataset}.{table}",
                                              job_config=config)
        result = load_job.result()

        logging.info(
            f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} Complete!"
        )

        return True
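Note that the config above relies on CSV being the default source format for a load job and does not skip a header row. A hedged variant (not from the original source) that makes those settings explicit:

from google.cloud.bigquery import LoadJobConfig, SourceFormat

config = LoadJobConfig()
config.source_format = SourceFormat.CSV  # explicit rather than relying on the server default
config.autodetect = True
config.field_delimiter = "\t"
config.skip_leading_rows = 1  # assumption: the exported files carry a header row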
Example #4
    def test_begin_w_autodetect(self):
        from google.cloud.bigquery.job import LoadJobConfig

        path = "/projects/{}/jobs".format(self.PROJECT)
        resource = self._make_resource()
        resource["configuration"]["load"]["autodetect"] = True
        # Ensure None for missing server-set props
        del resource["statistics"]["creationTime"]
        del resource["etag"]
        del resource["selfLink"]
        del resource["user_email"]
        conn = _make_connection(resource)
        client = _make_client(project=self.PROJECT, connection=conn)
        config = LoadJobConfig()
        config.autodetect = True
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF,
                             client, config)
        with mock.patch(
                "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:
            job._begin()

        final_attributes.assert_called_with({"path": path}, client, job)

        sent = {
            "jobReference": {
                "projectId": self.PROJECT,
                "jobId": self.JOB_ID
            },
            "configuration": {
                "load": {
                    "sourceUris": [self.SOURCE1],
                    "destinationTable": {
                        "projectId": self.PROJECT,
                        "datasetId": self.DS_ID,
                        "tableId": self.TABLE_ID,
                    },
                    "autodetect": True,
                }
            },
        }
        conn.api_request.assert_called_once_with(method="POST",
                                                 path=path,
                                                 data=sent,
                                                 timeout=None)
        self._verifyResourceProperties(job, resource)
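The test above checks that autodetect=True ends up under configuration.load in the POST body. The same mapping can be inspected locally through the config's to_api_repr() method; a quick illustrative check:

from google.cloud.bigquery.job import LoadJobConfig

config = LoadJobConfig()
config.autodetect = True
# Expected to show the property nested under the "load" key,
# along the lines of {'load': {'autodetect': True}}.
print(config.to_api_repr())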
Example #5
    def _execute(self):
        client = self._get_client()
        source_uris = self._get_source_uris()

        job_config = LoadJobConfig()
        if self._params['import_json']:
            job_config.source_format = 'NEWLINE_DELIMITED_JSON'
        else:
            try:
                job_config.skip_leading_rows = self._params['rows_to_skip']
            except KeyError:
                job_config.skip_leading_rows = 0
        job_config.autodetect = self._params['autodetect']
        if not job_config.autodetect:
            job_config.allow_jagged_rows = True
            job_config.allow_quoted_newlines = True
            job_config.ignore_unknown_values = True
            if self._params['schema']:
                job_config.schema = self._parse_bq_json_schema(
                    self._params['schema'])
        if self._params['csv_null_marker']:
            job_config.null_marker = self._params['csv_null_marker']
        try:
            job_config.max_bad_records = self._params['errors_to_allow']
        except KeyError:
            job_config.max_bad_records = 0
        if self._params['overwrite']:
            job_config.write_disposition = 'WRITE_TRUNCATE'
        else:
            job_config.write_disposition = 'WRITE_APPEND'
        if self._params['dont_create']:
            job_config.create_disposition = 'CREATE_NEVER'
        else:
            job_config.create_disposition = 'CREATE_IF_NEEDED'

        job = client.load_table_from_uri(source_uris,
                                         self._get_full_table_name(),
                                         job_id_prefix=self._get_prefix(),
                                         job_config=job_config)
        self._wait(job)
        self.log_info('Finished successfully')
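For reference, _execute reads the following keys from self._params; the dict below is illustrative only, with placeholder values not taken from the original source:

params = {
    "import_json": False,    # True selects NEWLINE_DELIMITED_JSON as the source format
    "rows_to_skip": 1,       # optional key; treated as 0 when missing
    "autodetect": True,      # when False, the explicit CSV settings and schema apply
    "schema": None,          # JSON schema passed to _parse_bq_json_schema when set
    "csv_null_marker": "",   # falsy value leaves null_marker unset
    "errors_to_allow": 0,    # optional key; treated as 0 when missing
    "overwrite": True,       # True -> WRITE_TRUNCATE, False -> WRITE_APPEND
    "dont_create": False,    # True -> CREATE_NEVER, False -> CREATE_IF_NEEDED
}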
Example #6
    def update_or_create_table_from_csv(self, table_name, table_file, schema_file):
        dataset_ref = self.client.dataset(self.dataset_id)
        table_ref = dataset_ref.table(table_name)

        job_config = LoadJobConfig()
        job_config.source_format = "CSV"
        job_config.skip_leading_rows = 1
        if Path(schema_file).exists():
            job_config.schema = self.get_table_schema(schema_file)
        else:
            job_config.autodetect = True
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

        with open(table_file, "rb") as source_fp:
            load_job = self.client.load_table_from_file(
                source_fp,
                destination=table_ref,
                job_config=job_config
            )
            
        load_job.result()
        logging.info("tabela [%s] criada com sucesso", table_ref.table_id)