Code Example #1
def listen_command() -> None:
    """
    Implementation of the listen command.
    """
    config = get_config()
    # Call this once to initialize the table.
    _ = BigQueryOutput(
        get_table(TableDefinitions.INVENTORY,
                  config.get("BIGQUERY", "INVENTORY_TABLE")))

    subscriber = pubsub.SubscriberClient()
    topic_name = 'projects/{}/topics/{}'.format(
        config.get("GCP", "PROJECT"), config.get("PUBSUB", "TOPIC_SHORT_NAME"))
    subscription_name = 'projects/{}/subscriptions/{}'.format(
        config.get("GCP", "PROJECT"),
        config.get("PUBSUB", "SUBSCRIPTION_SHORT_NAME"))
    LOG.info("Creating or adopting subscription {}.".format(subscription_name))
    try:
        subscriber.create_subscription(name=subscription_name,
                                       topic=topic_name,
                                       ack_deadline_seconds=60)
    except AlreadyExists:
        pass

    output = BigQueryOutput(
        get_table(TableDefinitions.INVENTORY,
                  config.get("BIGQUERY", "INVENTORY_TABLE")), False)

    def handle(message):
        """Callback for handling new PubSub messages. Effectively, this just "partially applies" 
        the output stream above to unpack_and_insert.
        """
        unpack_and_insert(output, message)

    def shutdown(sub_future: StreamingPullFuture) -> None:
        """Close subscriptions and flush rows to BQ.
        """
        LOG.info("Cancelling subscription pull.")
        sub_future.cancel()
        LOG.info("Flushing rows to BigQuery.")
        output.flush()

    LOG.info("Subscribing...")
    subscription_future = subscriber.subscribe(subscription_name, handle)

    atexit.register(shutdown, subscription_future)

    timeout = config.getint("PUBSUB", "TIMEOUT", fallback=10)

    with subscriber:
        while True:
            try:
                subscription_future.result(timeout=timeout)
            except TimeoutError:
                LOG.debug("No messages in {} seconds, flushing rows (if any).".
                          format(timeout))
                output.flush()
            except:  # Deliberately broad: Ctrl+C / cancellation should end the loop.
                LOG.info("Quitting...")
                break
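
The listener above pulls every setting from a shared configparser configuration. The following is a minimal sketch of the sections and keys it reads; the section and key names come from the calls in the snippet, while every value is a hypothetical placeholder for illustration.

import configparser

config = configparser.ConfigParser()
config.read_string("""
[GCP]
PROJECT = my-project

[PUBSUB]
TOPIC_SHORT_NAME = gcs-changes
SUBSCRIPTION_SHORT_NAME = gcs-changes-listener
TIMEOUT = 10

[BIGQUERY]
INVENTORY_TABLE = inventory
""")

# The same name construction listen_command performs:
topic_name = 'projects/{}/topics/{}'.format(
    config.get("GCP", "PROJECT"), config.get("PUBSUB", "TOPIC_SHORT_NAME"))
print(topic_name)  # projects/my-project/topics/gcs-changes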
Code Example #2
def _compose_catch_up_union() -> str:
    """
    Compose a UNION ALL statement and secondary query to extend the
    access log with items that predate access logging, if the
    configuration contains BIGQUERY.CATCHUP_TABLE. Otherwise, return an
    empty string, having no effect on any composed queries.

    The REGEXP_REPLACE function serves to format the object URL the same
    way the audit log resourceName is formatted.

    Returns:
        str -- The UNION ALL statement, or empty string.
    """
    config = get_config()
    catchup_table_name = config.get("BIGQUERY", "CATCHUP_TABLE", fallback=None)
    if catchup_table_name:
        catchup_table = Table(catchup_table_name)
        return """
            UNION ALL
            SELECT
                REGEXP_REPLACE(url,"gs://(.*)/(.*)","projects/_/buckets/{0}1/objects/{0}2") AS resourceName,
                created AS timestamp
            FROM `{1}`
        """.format("\\\\", catchup_table.get_fully_qualified_name())
    return ""
Code Example #3
def _get_cold_threshold_days() -> int:
    """Retrieve the warm threshold days from the configuration.

    Returns:
        int -- Warm threshold days.
    """
    config = get_config()
    return config.getint('RULES', 'COLD_THRESHOLD_DAYS')
Code Example #4
def _get_warm_threshold_accesses() -> int:
    """Retrieve the warm threshold accesses from the configuration.

    Returns:
        int -- Warm threshold accesses.
    """
    config = get_config()
    return config.getint('RULES', 'WARM_THRESHOLD_ACCESSES')
Code Example #5
    def __init__(self, table: Table, create_table: bool = True):
        self.config = get_config()
        self.lock = Lock()
        self.rows = list()
        self.tablename = table.get_fully_qualified_name()
        self.batch_size = int(
            self.config.get('BIGQUERY', 'BATCH_WRITE_SIZE', fallback=100))
        self.insert_count = 0
        self.insert_bytes = 0
        if create_table:
            table.initialize()
Code Example #6
def _calculate_day_partitions() -> int:
    """Calculate the daily partitions to query. This is the sum of how far
    you need to look back (COLD_THRESHOLD_DAYS) and how often you look
    (DAYS_BETWEEN_RUNS).

    Returns:
        int -- The sum of cold threshold days and days between runs.
    """
    config = get_config()
    return config.getint('RULES', 'COLD_THRESHOLD_DAYS') + \
           config.getint('RULES', 'DAYS_BETWEEN_RUNS')
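
As a worked example with purely illustrative values: COLD_THRESHOLD_DAYS = 60 and DAYS_BETWEEN_RUNS = 1 would give 61 daily partitions to query, enough to cover everything that could have crossed the cold threshold since the previous run.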
Code Example #7
    def get_client(self) -> bigquery.Client:
        """Get a client.

        Returns:
            bigquery.Client -- A configured BQ client.
        """
        if not self.client:
            LOG.debug("Making new BQ client.")
            config = get_config()
            self.client = bigquery.Client(
                project=config.get('BIGQUERY',
                                   'JOB_PROJECT',
                                   fallback=config.get('GCP', 'PROJECT')))
        return self.client
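
A design note on the fallback: supplying a separate BIGQUERY.JOB_PROJECT lets query jobs be issued under a different project than GCP.PROJECT (for example, to keep query costs on another billing project), while omitting it simply reuses the main project.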
Code Example #8
    def get_fully_qualified_name(self) -> str:
        """Return a table name with project and dataset names prefixed.

        Arguments:
            name {str} -- Short name of the table.

        Returns:
            str -- Fully qualified name of the table.
        """
        config = get_config()
        return "{}.{}.{}".format(
            config.get("BIGQUERY",
                       "JOB_PROJECT",
                       fallback=config.get("GCP", "PROJECT")),
            config.get("BIGQUERY", "DATASET_NAME"), self.short_name)
Code Example #9
def load_command(buckets: [str] = None, prefix: str = None) -> None:
    """Implementation of the load command.

    This function dispatches each bucket listed into an executor thread for
    parallel processing of the bucket list.

    Keyword Arguments:
        buckets {[str]} -- A list of buckets to use instead of the
        project-wide bucket listing. (default: {None})
        prefix {str} -- A prefix to use when listing. (default: {None})
    """
    config = get_config()
    gcs = get_gcs_client()
    # Call this once to initialize.
    _ = BigQueryOutput(
        get_table(TableDefinitions.INVENTORY,
                  config.get("BIGQUERY", "INVENTORY_TABLE")))

    # if buckets is given, get each bucket object; otherwise, list all bucket
    # objects
    if buckets:
        buckets = [gcs.get_bucket(x) for x in buckets]
    else:
        buckets = [x for x in gcs.list_buckets()]

    total_buckets = len(buckets)
    buckets_listed = 0
    bucket_blob_counts = dict()

    # Use at most 2 workers for this part, as it won't be many
    workers = min(config.getint('RUNTIME', 'WORKERS'), 2)
    size = int(config.getint('RUNTIME', 'WORK_QUEUE_SIZE') * .25)
    with BoundedThreadPoolExecutor(max_workers=workers,
                                   queue_size=size) as executor:
        for bucket in buckets:
            buckets_listed += 1
            executor.submit(bucket_lister, config, gcs, bucket, prefix,
                            buckets_listed, total_buckets, bucket_blob_counts)

    LOG.info("Stats: \n\t%s", bucket_blob_counts)
    LOG.info("Total rows: \n\t%s",
             sum([v for _, v in bucket_blob_counts.items()]))
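
A brief usage sketch (the bucket name and prefix are hypothetical, and the calls assume the surrounding package is importable): the command can be pointed at an explicit list of buckets, or left to enumerate every bucket in the configured project.

# Inventory a single, explicitly named bucket, restricted to one prefix.
load_command(buckets=["my-bucket"], prefix="logs/2020/")

# Or inventory every bucket the configured project can list.
load_command()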
Code Example #10
    def get_client(self) -> storage.Client:
        """Get a client from the pool. Automatically makes new ones until
        the pool is full. Threadsafe.

        Returns:
            storage.Client -- A configured GCS client.
        """
        config = get_config()
        with self.lock:
            if len(self.clients) < self.pool_size:
                LOG.debug("Making new GCS client.")
                self.clients.append(
                    storage.Client(
                        config.get('GCP',
                                   'GCS_PROJECT',
                                   fallback=config.get('GCP', 'PROJECT'))))
            client = self.clients[self.next_up]
            # Advance the round-robin cursor, wrapping after the last slot so
            # that every client in the pool is eventually handed out.
            self.next_up += 1
            if self.next_up >= self.pool_size:
                self.next_up = 0
        return client
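
A note on the pool design: clients are created lazily until the pool is full and then handed out round-robin under the lock, so concurrent callers share a bounded set of GCS clients instead of each constructing their own; the cursor update could equivalently be written as self.next_up = (self.next_up + 1) % self.pool_size.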
Code Example #11
def unpack_and_insert(output: BigQueryOutput, message: Message) -> None:
    """Unpack a PubSub message regarding a GCS object change, and insert it into
    a BigQueryOutput.

    Args:
        output (BigQueryOutput): The output to use. In most cases, you will want to use a single
        output object per program.
        message (Message): The PubSub message.
    """
    bq_client = get_bq_client()
    config = get_config()
    table = get_table(TableDefinitions.INVENTORY,
                      config.get("BIGQUERY", "INVENTORY_TABLE"))
    table_name = table.get_fully_qualified_name()

    try:
        LOG.debug("Message data: \n---DATA---\n{}\n---DATA---".format(
            message.data))

        # Decode and deserialize
        message_string = bytes.decode(message.data, "UTF-8")
        object_info = json.loads(message_string)

        LOG.debug(message)
        LOG.debug(object_info)

        # Get important attributes
        event_type = message.attributes['eventType']
        publish_time = message.publish_time.isoformat()
        LOG.info("Got a message: {} {} {}".format(
            publish_time, event_type,
            object_info['bucket'] + "/" + object_info['name']))

        # For deletes, use the publish time to approximate deleted time
        if event_type == "OBJECT_DELETE":
            object_info["timeDeleted"] = publish_time
            if object_info.get("metadata"):
                object_info["metadata"] = [{
                    "key": k,
                    "value": v
                } for k, v in object_info["metadata"].items()]

        if event_type == "OBJECT_METADATA_UPDATE":

            def generate_structs(arr):
                # Render a list of {key, value} dicts as a BigQuery array-of-STRUCT
                # literal. Keys and values are interpolated verbatim, so quotes in
                # metadata values would break the generated SQL.
                res = '['
                for s in arr:
                    res += "STRUCT(\"{key}\" as key, \"{value}\" as value),".format(
                        key=s['key'], value=s['value'])
                res = res[:-1]  # Drop the trailing comma.
                res += ']'
                return res

            querytext = "UPDATE `{table_name}`\
                SET metadata = {new_metadata}\
                WHERE id = '{id}'".format(
                table_name=table_name,
                new_metadata=generate_structs([{
                    "key": k,
                    "value": v
                } for k, v in object_info["metadata"].items()]),
                id=object_info["id"])
            LOG.info("Running query: \n%s", querytext)
            query_job = bq_client.query(querytext)
            LOG.info(query_job.result())
        else:
            # Enqueue for writing
            output.put(object_info)

        message.ack()

    except Exception:
        LOG.exception(
            "Error processing message! ---DATA---\n{}\n---DATA---".format(
                message.data))
        # TODO: A retry / DLQ policy would be useful, if not already present by default.
        message.nack()
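
To make the OBJECT_METADATA_UPDATE branch concrete, here is a standalone sketch of what generate_structs produces; the metadata keys and values are hypothetical placeholders.

def generate_structs(arr):
    # Same logic as the nested helper above, repeated so this sketch runs on its own.
    res = '['
    for s in arr:
        res += "STRUCT(\"{key}\" as key, \"{value}\" as value),".format(
            key=s['key'], value=s['value'])
    res = res[:-1]
    res += ']'
    return res

metadata = {"owner": "alice", "retention": "30d"}  # Hypothetical object metadata.
print(generate_structs([{"key": k, "value": v} for k, v in metadata.items()]))
# [STRUCT("owner" as key, "alice" as value),STRUCT("retention" as key, "30d" as value)]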