Example #1
    def initialize(self) -> bigquery.table.RowIterator:
        """Creates, if not found, a table.

        Returns:
            google.cloud.bigquery.table.RowIterator -- Result of the query.
            Since this is a DDL query, this will always be empty if
            it succeeded.

        Raises:
            google.cloud.exceptions.GoogleCloudError -- If the job failed.
            concurrent.futures.TimeoutError -- If the job did not complete
            in the default BigQuery job timeout.
        """
        if not self.schema:
            raise ValueError(
                "No schema provided for table {}; writing is not "
                "supported.".format(self.short_name))

        bq_client = get_bq_client()

        LOG.info("Creating table %s if not found.",
                 self.get_fully_qualified_name())

        querytext = """
            CREATE TABLE IF NOT EXISTS `{}` (
            {}
            )""".format(self.get_fully_qualified_name(), self.schema)

        LOG.debug("Running query: \n%s", querytext)

        query_job = bq_client.query(querytext)
        return query_job.result()
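A minimal usage sketch; the enclosing Table class, its constructor arguments, and the schema string format are assumptions for illustration, not shown in the source:

# Hypothetical usage; Table and its schema format are assumptions.
table = Table(short_name="events",
              schema="id STRING, created TIMESTAMP")
rows = table.initialize()  # empty RowIterator if the DDL succeeded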
Example #2
    def flush(self) -> None:
        """
        Flush all enqueued rows to BigQuery.

        Raises:
            google.api_core.exceptions.BadRequest -- Raised by
            Client.insert_rows_json when the insert fails for any reason
            other than an empty request. Per-row insert errors are logged,
            not raised.

        Returns:
            None
        """
        if self.rows:
            if LOG.isEnabledFor(logging.DEBUG):
                sending_bytes = sum(len(json.dumps(x)) for x in self.rows)
                LOG.debug("Flushing %s rows to %s, %s bytes.", len(self.rows),
                          self.tablename, sending_bytes)
            client = get_bq_client()
            try:
                insert_errors = client.insert_rows_json(
                    self.tablename, self.rows)
                if insert_errors:
                    LOG.error("Insert errors! %s",
                              list(flatten(insert_errors)))
            except BadRequest as error:
                if not error.message.endswith(
                        "No rows present in the request."):
                    LOG.error("Insert error! %s", error.message)
                    raise error
            finally:
                self.insert_count += len(self.rows)
                self.rows = list()
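The buffer is typically drained on a size threshold as well as at shutdown. A sketch of a driving loop follows; the BigQueryOutput constructor and the put method are assumptions, loosely based on Example #5:

# Sketch: enqueue rows and flush in batches; names are assumptions.
output = BigQueryOutput("project.dataset.events")
for record in records:
    output.put(record)           # assumed to append to output.rows
    if len(output.rows) >= 500:  # arbitrary batch size
        output.flush()
output.flush()                   # drain the remainder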
Example #3
def run_query_job(
    querytext: str,
    temp_table: Optional[str] = None,
    query_job_config: Optional[QueryJobConfig] = None
) -> QueryJob:
    """
    Set up and run a query job.

    Arguments:
        querytext {str} -- The querytext for the job.

    Keyword Arguments:
        temp_table {str} -- A temporary table in which to materialize results.
        The results will be streamed from this table when done. This is
        required for large result sets and strongly recommended otherwise.
        (default: {None})

        query_job_config {QueryJobConfig} -- A QueryJobConfig to start from.
        A fresh QueryJobConfig is created when omitted. (default: {None})

    Returns:
        QueryJob -- The resulting job.
    """
    LOG.debug("Running query: %s", querytext)
    client = get_bq_client()
    if temp_table:
        query_job_config.destination = temp_table
        query_job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    return client.query(query=querytext, job_config=query_job_config)
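A brief usage sketch, assuming the destination accepts a fully qualified table string (recent google-cloud-bigquery versions convert it to a TableReference):

# Sketch: materialize a large result into a temp table, then stream it.
job = run_query_job(
    "SELECT * FROM `project.dataset.big_table`",
    temp_table="project.dataset.temp_results")
for row in job.result():  # rows are streamed from the temp table
    print(row)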
Example #4
    def drop(self) -> bigquery.table.RowIterator:
        """DROPs (deletes) the table. This cannot be undone.

        Returns:
            google.cloud.bigquery.table.RowIterator -- Result of the query.
            Since this is a DDL query, this will always be empty if
            it succeeded.

        Raises:
            google.cloud.exceptions.GoogleCloudError -- If the job failed.
            concurrent.futures.TimeoutError -- If the job did not complete
            in the default BigQuery job timeout.
        """
        bq_client = get_bq_client()

        LOG.info("Deleting table %s", self.get_fully_qualified_name())

        querytext = "DROP TABLE `{}`".format(self.get_fully_qualified_name())

        LOG.debug("Running query: \n%s", querytext)

        query_job = bq_client.query(querytext)
        return query_job.result()
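For comparison, the client API can delete a table directly, without a DDL round trip. This is the standard google-cloud-bigquery call, not part of the example above:

# Equivalent deletion through the client API.
from google.cloud import bigquery

client = bigquery.Client()
client.delete_table("project.dataset.table", not_found_ok=True)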
Example #5
def unpack_and_insert(output: BigQueryOutput, message: Message) -> None:
    """Unpack a PubSub message regarding a GCS object change, and insert it into
    a BigQueryOutput.

    Args:
        output (BigQueryOutput): The output to use. In most cases, you will want to use a single
        output object per program.
        message (Message): The PubSub message.
    """
    bq_client = get_bq_client()
    config = get_config()
    table = get_table(TableDefinitions.INVENTORY,
                      config.get("BIGQUERY", "INVENTORY_TABLE"))
    table_name = table.get_fully_qualified_name()

    try:
        LOG.debug("Message data: \n---DATA---\n{}\n---DATA---".format(
            message.data))

        # Decode and deserialize
        message_string = bytes.decode(message.data, "UTF-8")
        object_info = json.loads(message_string)

        LOG.debug(message)
        LOG.debug(object_info)

        # Get important attributes
        event_type = message.attributes['eventType']
        publish_time = message.publish_time.isoformat()
        LOG.info("Got a message: {} {} {}".format(
            publish_time, event_type,
            object_info['bucket'] + "/" + object_info['name']))

        # For deletes, use the publish time to approximate deleted time
        if event_type == "OBJECT_DELETE":
            object_info["timeDeleted"] = publish_time
            if object_info.get("metadata"):
                object_info["metadata"] = [{
                    "key": k,
                    "value": v
                } for k, v in object_info["metadata"].items()]

        if event_type == "OBJECT_METADATA_UPDATE":

            def generate_structs(arr):
                # Build a BigQuery ARRAY<STRUCT<key, value>> literal from a
                # list of {"key": ..., "value": ...} dicts. join() also
                # handles the empty-list case cleanly.
                return "[" + ",".join(
                    'STRUCT("{key}" as key, "{value}" as value)'.format(
                        key=s["key"], value=s["value"]) for s in arr) + "]"

            querytext = "UPDATE `{table_name}`\
                SET metadata = {new_metadata}\
                WHERE id = '{id}'".format(
                table_name=table_name,
                new_metadata=generate_structs([{
                    "key": k,
                    "value": v
                } for k, v in object_info["metadata"].items()]),
                id=object_info["id"])
            LOG.info("Running query: \n%s", querytext)
            query_job = bq_client.query(querytext)
            LOG.info(query_job.result())
        else:
            # Enqueue for writing
            output.put(object_info)

        message.ack()

    except Exception:
        LOG.exception(
            "Error processing message! ---DATA---\n{}\n---DATA---".format(
                message.data))
        # TODO: A retry / DLQ policy would be useful, if not already present by default.
        message.nack()
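A sketch of wiring this callback to a Pub/Sub subscription using the standard google-cloud-pubsub streaming pull API; binding output via functools.partial is an assumption about how the original program composes these pieces:

# Sketch: subscribe and dispatch each message to unpack_and_insert.
from functools import partial

from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
subscription = subscriber.subscription_path("my-project", "gcs-changes")
future = subscriber.subscribe(
    subscription, callback=partial(unpack_and_insert, output))
future.result()  # blocks; call future.cancel() to stop pulling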