Example #1
def _run_bigquery(config, start_time_t, time_interval_seconds,
        time_table_expiration):
    """config is as described in logs_bridge.config.json."""
    # First, create a temporary table that's just the rows from
    # start_time_t to start_time_t + time_interval_seconds.
    # We'll give it a random name so we can run multiple copies of
    # this script at the same time.
    temp_table_name = (
        'khan-academy:logs_streaming_tmp_analysis.logs_bridge_%d_%04d'
        % (start_time_t, random.randint(0, 9999)))
    # We assume that this script will not run for longer than
    # time_interval_seconds; if it did, it would continually be
    # falling behind!
    temp_table_query = _query_for_rows_in_time_range(config, start_time_t,
                                                     time_interval_seconds)
    # Apparently the commandline doesn't like newlines in the script.
    # Reformat for the commandline.
    temp_table_query = temp_table_query.replace('\n', ' ')

    logging.debug("Creating the temporary table for querying over by running "
                  + temp_table_query)
    bq_util.call_bq(['mk', '--expiration', str(time_table_expiration),
                     temp_table_name],
                    project='khan-academy',
                    return_output=False,
                    stdout=open(os.devnull, 'w'))
    bq_util.call_bq(['query', '--destination_table', temp_table_name,
                     '--allow_large_results', temp_table_query],
                    project='khanacademy.org:deductive-jet-827',
                    return_output=False)

    logging.info("Step 1: Created temp table with all loglines. "
            "View it on bigquery:")
    logging.info("  https://bigquery.cloud.google.com/table/%s?tab=preview"
            % (temp_table_name))

    subqueries = [_create_subquery(entry, start_time_t, time_interval_seconds,
                                   temp_table_name)
                  for entry in config]

    # num_requests is the total number of requests in the specified
    # time period (either `now` or some other time). In order to get
    # this value, we sum the total number of requests for each field
    # (e.g. we sum the total number of requests for each browser)
    # partitioned by the time, `when`.
    query = ('SELECT *, SUM(num_requests_by_field) OVER(PARTITION BY when)'
             ' as num_requests FROM %s' % ',\n'.join(subqueries))
    logging.debug('BIGQUERY QUERY: %s' % query)

    job_name = 'logs_bridge_query_%s' % random.randint(0, sys.maxint)
    r = bq_util.query_bigquery(query, job_name=job_name)
    logging.info('Step 2: Counting # of logs that match metric:')
    logging.info('  https://bigquery.cloud.google.com/results/khanacademy.org:deductive-jet-827:%s' % job_name)
    logging.debug('BIGQUERY RESULTS: %s' % r)

    return r
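A minimal driver sketch for `_run_bigquery`, assuming `config` is the parsed contents of the `logs_bridge.config.json` file mentioned in the docstring; the path, the 300-second window, and the one-hour expiration below are illustrative assumptions, not values from the original module.

import json
import time

# Hypothetical driver: query the most recent complete 5-minute window.
with open('logs_bridge.config.json') as f:       # assumed config location
    config = json.load(f)

time_interval_seconds = 300                      # assumed window size
start_time_t = int(time.time()) - time_interval_seconds
results = _run_bigquery(config, start_time_t, time_interval_seconds,
                        time_table_expiration=3600)   # keep temp table 1 hour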
Example #2
def dos_detect(start, end):
    query = QUERY_TEMPLATE.format(fastly_log_tables=_fastly_log_tables(
        start, end),
                                  start_timestamp=start.strftime(TS_FORMAT),
                                  end_timestamp=end.strftime(TS_FORMAT),
                                  max_count=(MAX_REQS_SEC * PERIOD))
    results = bq_util.call_bq(['query', query], project=BQ_PROJECT)

    for row in results:
        msg = ALERT_TEMPLATE.format(**row)
        alertlib.Alert(msg).send_to_slack(ALERT_CHANNEL)
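A sketch of invoking dos_detect over the most recent window, assuming PERIOD is the detection window in seconds (consistent with max_count=MAX_REQS_SEC * PERIOD above) and that UTC datetimes are what _fastly_log_tables expects.

import datetime

# Hypothetical one-off check over the last PERIOD seconds (assumed UTC).
end = datetime.datetime.utcnow()
start = end - datetime.timedelta(seconds=PERIOD)
dos_detect(start, end)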
Example #3
def scratchpad_detect(start, end):
    scratchpad_query = SCRATCHPAD_QUERY_TEMPLATE.format(
        fastly_log_tables=_fastly_log_tables(start, end),
        start_timestamp=start.strftime(TS_FORMAT),
        end_timestamp=end.strftime(TS_FORMAT),
        max_count=MAX_SCRATCHPADS)

    scratchpad_results = bq_util.call_bq(['query', scratchpad_query],
                                         project=BQ_PROJECT)

    if len(scratchpad_results) != 0:
        msg = SCRATCHPAD_ALERT_INTRO_TEMPLATE.format(max_count=MAX_SCRATCHPADS)
        msg += '\n'.join(
            SCRATCHPAD_ALERT_ENTRY_TEMPLATE.format(**row)
            for row in scratchpad_results)
        alertlib.Alert(msg).send_to_slack(ALERT_CHANNEL)
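Since both detectors take the same (start, end) window, a caller could sweep them together on a schedule; this loop is a hypothetical sketch, with the PERIOD-second cadence assumed rather than taken from the original module.

import datetime
import time

# Hypothetical periodic sweep: run both detectors over each window.
while True:
    end = datetime.datetime.utcnow()
    start = end - datetime.timedelta(seconds=PERIOD)
    dos_detect(start, end)
    scratchpad_detect(start, end)
    time.sleep(PERIOD)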
Example #4
def _send_campaign_report(status, start_date, end_date, temp_file, verbose,
                          dry_run, keep_temp):
    """Export data about all campaigns in a date range to Bigquery.
    This selects campaigns that started between start_date and end_date
    inclusive.

    Arguments:
      status: Export only the details of campaigns with this status.
              Options are 'sent', 'sending', 'scheduled' and 'draft'.
      start_date: Start date of blasts (format example: 'January 1 2017')
      end_date: End date of blasts (format example: 'January 1 2017')
      temp_file: A file to store the data, to be used by 'bq load'.
      verbose: True if you want to show debug messages, else False.
      dry_run: True if we should skip writing to bq.
      keep_temp: True if we should keep the temp_file that we write.

    Returns:
      A set of the blast IDs for blasts that started within the seven
      days up to and including end_date. The returned set is unrelated
      to start_date.
    """
    recent_blast_ids = set()
    response = _get('blast',
                    status=status,
                    start_date=start_date,
                    end_date=end_date,
                    limit=0)

    blasts_info_json = response.get_body().get('blasts')
    all_blasts_length = len(blasts_info_json)

    try:
        with open(temp_file, "wb") as json_file:
            for i in range(all_blasts_length):
                # Get the date a blast was started.
                date = datetime.datetime.strptime(
                    blasts_info_json[i]['start_time'],
                    '%a, %d %b %Y %H:%M:%S -%f')
                # Record the IDs of blasts that started within the
                # 7 days up to and including end_date.
                if date >= datetime.datetime.strptime(
                        end_date, '%B %d %Y') - datetime.timedelta(days=7):
                    recent_blast_ids.add(blasts_info_json[i]['blast_id'])
                json.dump(blasts_info_json[i], json_file)

                if i != len(blasts_info_json) - 1:
                    json_file.write("\n")

        table_name = "sailthru_blasts.campaigns"

        if dry_run:
            print(
                "DRY RUN: if this was for real, we would write data at path"
                " '%s' to bq table '%s'" % (temp_file, table_name))
        else:
            if verbose:
                print("Writing json file with %s lines to bigquery table %s" %
                      (all_blasts_length, table_name))
            bq_util.call_bq([
                'load', '--source_format=NEWLINE_DELIMITED_JSON', '--replace',
                table_name, temp_file,
                os.path.join(os.path.dirname(__file__),
                             'sailthru_campaign_export_schema.json')
            ],
                            project='khanacademy.org:deductive-jet-827',
                            return_output=False)
    finally:
        if not keep_temp:
            os.unlink(temp_file)

    return recent_blast_ids
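A sketch of calling the campaign export for one month of sent campaigns; the date strings follow the 'January 1 2017' format from the docstring, and the temp-file path is an arbitrary assumption.

# Hypothetical invocation: dry-run export of January's sent campaigns.
recent_blast_ids = _send_campaign_report(
    status='sent',
    start_date='January 1 2017',
    end_date='January 31 2017',
    temp_file='/tmp/sailthru_campaigns.json',    # assumed scratch path
    verbose=True,
    dry_run=True,                                # skip the actual bq load
    keep_temp=False)
print("Blasts started in the last 7 days of the range: %s"
      % sorted(recent_blast_ids))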
Example #5
def _send_blast_details_to_bq(blast_id, temp_file, verbose, dry_run,
                              keep_temp):
    """Export blast data to BigQuery.

    Arguments:
      blast_id: ID of the blast to fetch data for.
      temp_file: A file to store the data, to be used by 'bq load'.
      verbose: True if you want to show debug messages, else False.
      dry_run: True if we should skip writing to bq, and instead log what
               would have happened. For normal behavior, set False.
      keep_temp: True if we should keep the temp_file that we write.
    """

    # Map associating blast_query response column names with separators.
    #
    # The blast_query response contains columns that contain multiple items.
    # These columns have different separators.
    #
    # Columns not listed here contain only a single item per cell.
    blast_report_list_column_seperators = {
        "first_ten_clicks": " ",
        "first_ten_clicks_time": "|"
    }

    # Map associating Sailthru blast report header names with bq column names.
    blast_report_header_corrections = {
        "email hash": "email_hash",  # Spaces are annoying in SQL!
        "extid": "kaid",  # Might as well be precise.
    }

    # Fields for which we should append timezone information.
    blast_report_timestamp_columns = None
    with open(
            os.path.join(os.path.dirname(__file__),
                         "sailthru_blast_export_schema.json")) as f:
        blast_report_timestamp_columns = {
            column.get("name"): True
            for column in json.load(f) if column.get("type") == "TIMESTAMP"
        }

    tz_utc_offset = _get_sailthru_timezone_utc_offset()

    response_1 = _post('job',
                       job="blast_query",
                       blast_id=blast_id,
                       verbose=verbose)

    job_id = response_1.get_body().get('job_id')

    if job_id is None:
        print(
            "WARNING: For the blast_query job with blast_id = %s, "
            "the job_id returned from Sailthru's job=blast_query is "
            "None" % blast_id)
        return

    if verbose:
        print(
            "For the blast_query job with blast_id = %s, calling "
            "sailthru's job status API for job_id = %s" % (blast_id, job_id))
    response_2 = _get('job', job_id=job_id)

    while response_2.get_body().get('status') != "completed":
        if verbose:
            print(
                "For the blast_query job with blast_id = %s, polled "
                "sailthru's job status API for job_id = %s " %
                (blast_id, job_id))
            print "Will poll again in 5 seconds."
        time.sleep(5)
        response_2 = _get('job', job_id=job_id)
        if response_2.get_body().get('status') == "expired":
            raise SailthruAPIException(response_2)

    filename_url = response_2.get_body().get('export_url')

    if verbose:
        print(
            "For the blast_query job with blast_id = %s, creating a jsonl "
            "file from the sailthru data" % blast_id)

    with _CPU_LOCK:
        try:
            with open(temp_file, "wb") as f:
                with contextlib.closing(
                        urllib.urlopen(filename_url)) as csvdata:
                    # Take the csv data from the Sailthru API and
                    # convert it to JSON. bq can read columns in
                    # REPEATED mode from JSON files, but not from
                    # CSVs, and we have cells that contain multiple
                    # items.
                    reader = csv.reader(csvdata, delimiter=',', quotechar='"')

                    headers = reader.next()

                    # Correct confusing header names.
                    headers = [
                        blast_report_header_corrections[hdr]
                        if hdr in blast_report_header_corrections else hdr
                        for hdr in headers
                    ]

                    for row_csv in reader:
                        row_object = {}
                        for idx, column_name in enumerate(headers):
                            cell_content = row_csv[idx].strip()
                            if cell_content == "":
                                row_object[column_name] = None
                            elif (column_name
                                  in blast_report_list_column_seperators):
                                sep = blast_report_list_column_seperators[
                                    column_name]
                                row_object[column_name] = cell_content.split(
                                    sep)
                            else:
                                row_object[column_name] = cell_content

                            #  Append timezone information to TIMESTAMP cells.
                            if column_name in blast_report_timestamp_columns:
                                if isinstance(row_object[column_name], str):
                                    row_object[column_name] += " %s" % (
                                        tz_utc_offset)
                                elif isinstance(row_object[column_name], list):
                                    row_object[column_name] = [
                                        "%s %s" % (date, tz_utc_offset)
                                        for date in row_object[column_name]
                                    ]
                                else:
                                    assert (row_object[column_name] is None)

                        # Append the blast ID to each row.  This way
                        # we can join/union this blast table with
                        # other tables while preserving
                        # blast_ids. Otherwise, the blast_id would
                        # only be accessible from the table name.
                        row_object["blast_id"] = str(blast_id)

                        # Write each row.
                        # In JSON mode, bq expects a JSON object on each line.
                        f.write("%s\n" % (json.dumps(row_object)))

            table_name = "sailthru_blasts.blast_%s" % str(blast_id)

            # (TODO: Update schema to port dates in TIMESTAMP format in bq)

            if dry_run:
                print(
                    "DRY RUN: if this was for real, for the blast_query "
                    "job with blast_id = %s, we would write data at path "
                    "'%s' to bq table '%s'" %
                    (blast_id, temp_file, table_name))
            else:
                if verbose:
                    print(
                        "For the blast_query job with blast_id = %s, "
                        "writing jsonl file to bigquery" % blast_id)
                bq_util.call_bq([
                    'load', '--source_format=NEWLINE_DELIMITED_JSON',
                    '--replace', table_name, temp_file,
                    os.path.join(os.path.dirname(__file__),
                                 'sailthru_blast_export_schema.json')
                ],
                                project='khanacademy.org:deductive-jet-827',
                                return_output=False)
        finally:
            if not keep_temp:
                os.unlink(temp_file)
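This pairs with example #4: assuming recent_blast_ids is the set returned by _send_campaign_report, a hypothetical caller could export a per-blast detail table for each ID. The temp-file naming here is an assumption.

# Hypothetical follow-up to example #4: one detail table per recent blast.
for blast_id in sorted(recent_blast_ids):
    _send_blast_details_to_bq(blast_id,
                              temp_file='/tmp/blast_%s.jsonl' % blast_id,
                              verbose=False,
                              dry_run=False,
                              keep_temp=False)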
Example #6
def build_page_load_temp_table(start_date=None, end_date=None):
    """Builds the temporary table with all relevant page load performance data.

    Returns the name of the table so it can be used in other functions which
    perform the queries to get the data we want for this report.

    The temp table is built from all request logs in the time from start_date
    to end_date. start_date and end_date are expected to be datetime objects
    if given. They default to eight days ago and yesterday, respectively.

    If the table already exists, this function exits successfully and returns
    the name of the table without rebuilding it.
    """

    start_date = (start_date or _DEFAULT_START_TIME).strftime("%Y%m%d")
    end_date = (end_date or _DEFAULT_END_TIME).strftime("%Y%m%d")

    temp_table_name = (
        'khan-academy:logs_streaming_tmp_analysis.email_reliability_%s_%s' %
        (start_date, end_date))

    # If the temp table already exists, don't redo the expensive query.
    try:
        bq_util.call_bq(['show', temp_table_name], project='khan-academy')
        return temp_table_name
    except subprocess.CalledProcessError:  # means 'table does not exist'
        pass

    bq_util.call_bq(['mk', temp_table_name],
                    project='khan-academy',
                    return_output=False)

    query = """\
SELECT * FROM (
  SELECT
    FLOAT(REGEXP_EXTRACT(app_logs.message,
      r'stats.time.client.sufficiently_usable_ms\.\w+\.\w+:(\d+);'
    ))/1000 AS page_load_time,
    REGEXP_EXTRACT(app_logs.message,
      r'stats.time.client.sufficiently_usable_ms\.\w+\.(\w+):\d+;'
    ) AS page_load_page,
    REGEXP_EXTRACT(app_logs.message,
      r'stats.time.client.sufficiently_usable_ms\.(server|client).+;'
    ) AS page_load_nav_type,
    elog_country
  FROM TABLE_DATE_RANGE([khanacademy.org:deductive-jet-827:logs.requestlogs_],
                      TIMESTAMP('%(start_date)s'),
                      TIMESTAMP('%(end_date)s'))
  WHERE
    elog_device_type IS NOT NULL
    AND elog_device_type != 'bot/dev'
)
WHERE
  page_load_time IS NOT NULL
  AND (
    (page_load_nav_type = 'server' AND page_load_time < 30)
    OR (page_load_nav_type = 'client' AND page_load_time < 20)
  )
""" % {
        "start_date": start_date,
        "end_date": end_date,
    }

    # Modeled after logs_bridge._run_bigquery.
    # Apparently the commandline doesn't like newlines in the script.
    # Reformat for the commandline.
    query = query.replace('\n', ' ')

    bq_util.call_bq([
        'query', '--destination_table', temp_table_name,
        '--allow_large_results', query
    ],
                    project='khanacademy.org:deductive-jet-827',
                    return_output=False)

    return temp_table_name
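A sketch of using the builder above with explicit dates; the docstring says the defaults cover eight days ago through yesterday, so the one-week window below is just an illustrative assumption.

import datetime

# Hypothetical call: build (or reuse) the temp table for the past week.
end = datetime.datetime.now() - datetime.timedelta(days=1)
start = end - datetime.timedelta(days=7)
table = build_page_load_temp_table(start_date=start, end_date=end)
print("Page-load data staged in %s" % table)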