Example #1
def test_last_week_range():
    def test_week(week, startday_num, endday_num):
        # Did we get the expected weekday at the beginning of the week...
        assert week[0].weekday() == startday_num
        # ... and the expected weekday at the end of the week?
        assert week[1].weekday() == endday_num

        # Is this a full week spanning exactly 7 days?
        delta = week[1] - week[0]
        assert delta.days == 6

        # Check that this is the closest full week. We monkey patched the 3rd July
        # 2015 as "today". The previous full week, starting on Sunday, begins on
        # the 21st June 2015; if the first weekday is Monday, it begins on the
        # 22nd June 2015.
        expected_dates = (
            (date(2015, 6, 21), date(2015, 6, 27)) if startday_num == 6
            else (date(2015, 6, 22), date(2015, 6, 28))
        )

        assert week[0] == expected_dates[0]
        assert week[1] == expected_dates[1]

    with patch('moztelemetry.standards.date') as mock_date:
        # Mock date.today() to return a specific day, so we can properly test.
        mock_date.today.return_value = date(2015, 7, 3)
        mock_date.side_effect = lambda *args, **kw: date(*args, **kw)

        # Get the start and end date for the previous full week, as a tuple, and make sure
        # it's valid.
        prev_week = moz_utils.get_last_week_range("Sunday")
        test_week(prev_week, 6, 5)

        # As before, but with a week starting on Monday.
        prev_week = moz_utils.get_last_week_range("Monday")
        test_week(prev_week, 0, 6)
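For reference, here is a minimal sketch of the kind of helper this test exercises, inferred only from the asserts above; it is not moztelemetry's actual implementation of get_last_week_range, and it follows datetime.date.weekday() numbering (Monday is 0, Sunday is 6).

# Hypothetical sketch of get_last_week_range, reconstructed from the test's
# expectations; the real moztelemetry.standards implementation may differ.
from datetime import date, timedelta


def get_last_week_range(weekday_start="Sunday"):
    # date.weekday() numbering: Monday is 0 ... Sunday is 6.
    start_num = 6 if weekday_start == "Sunday" else 0
    today = date.today()
    # Walk back to the start of the current (possibly incomplete) week...
    this_week_start = today - timedelta(days=(today.weekday() - start_num) % 7)
    # ...then back one more full week to reach the previous, complete one.
    start = this_week_start - timedelta(days=7)
    return (start, start + timedelta(days=6))

With today pinned to 2015-07-03, this returns (2015-06-21, 2015-06-27) for a Sunday start and (2015-06-22, 2015-06-28) for a Monday start, matching the expected_dates checked above.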
Example #2
def test_last_week_range():
    def test_week(week, startday_num, endday_num):
        # Did we get the expected weekday at the beginning of the week...
        assert week[0].weekday() == startday_num
        # ... and the expected weekday at the end of the week?
        assert week[1].weekday() == endday_num

        # Is this a full week spanning exactly 7 days?
        delta = week[1] - week[0]
        assert delta.days == 6

        # Check that this is the closest full week. We monkey patched the 3rd July
        # 2015 as "today". The previous full week, starting on Sunday, begins on
        # the 21st June 2015; if the first weekday is Monday, it begins on the
        # 22nd June 2015.
        expected_dates = (
            (date(2015, 6, 21), date(2015, 6, 27)) if startday_num == 6
            else (date(2015, 6, 22), date(2015, 6, 28))
        )

        assert week[0] == expected_dates[0]
        assert week[1] == expected_dates[1]

    with patch('moztelemetry.standards.date') as mock_date:
        # Mock date.today() to return a specific day, so we can properly test.
        mock_date.today.return_value = date(2015, 7, 3)
        mock_date.side_effect = lambda *args, **kw: date(*args, **kw)

        # Get the start and end date for the previous full week, as a tuple, and make sure
        # it's valid.
        prev_week = moz_utils.get_last_week_range("Sunday")
        test_week(prev_week, 6, 5)

        # As before, but with a week starting on Monday.
        prev_week = moz_utils.get_last_week_range("Monday")
        test_week(prev_week, 0, 6)
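Both tests rely on the same mocking trick: patch the date name in the module under test, pin today(), and route the constructor back to the real class through side_effect. Below is a self-contained sketch of that pattern; the '__main__.date' target and the days_until_weekend helper are purely illustrative, not part of the moztelemetry tests.

from datetime import date
from unittest.mock import patch

real_date = date  # keep a handle on the real class before patching


def days_until_weekend():
    # Example consumer that calls date.today(), like the code under test.
    return 5 - date.today().weekday()


with patch('__main__.date') as mock_date:
    # Pin "today" to Friday the 3rd of July 2015...
    mock_date.today.return_value = real_date(2015, 7, 3)
    # ...while plain date(...) calls still build real date objects.
    mock_date.side_effect = lambda *args, **kw: real_date(*args, **kw)
    assert days_until_weekend() == 1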
Example #3
def generate_report(start_date, end_date, spark, spark_provider="emr"):
    """Generate the hardware survey dataset for the reference timeframe.

    If the timeframe is longer than a week, split it into weekly chunks
    and process each chunk individually (this eases backfilling).

    The report for each week is saved in a local JSON file.

    Args:
        start_date: The date from which we start generating the report. If None,
           the report starts from the beginning of the past week (Sunday).
        end_date: The date that marks the end of the reporting period. This only
           makes sense if a |start_date| was provided. If None, this defaults
           to the end of the past week (Saturday).
        spark: SparkSession.
        spark_provider: Environment the application is running in. For `emr`,
           the Longitudinal dataset will be used; on `dataproc`, data will be
           loaded from `telemetry.main`.
    """
    # If no start_date was provided, generate a report for the past complete
    # week.

    last_week = moz_std.get_last_week_range()
    date_range = (
        moz_std.snap_to_beginning_of_week(start_date, "Sunday")
        if start_date is not None else last_week[0],
        end_date if
        (end_date is not None and start_date is not None) else last_week[1],
    )

    # Split the submission period into weekly chunks so we don't run out of
    # resources while aggregating when backfilling.
    chunk_start = date_range[0]
    chunk_end = None
    # Stores the hardware report for each weekly chunk, keyed by output file name.
    date_to_json = {}

    while chunk_start < date_range[1]:
        chunk_end = chunk_start + dt.timedelta(days=6)

        if spark_provider == "emr":
            filtered_data, broken_ratio, inactive_ratio = get_data_longitudinal(
                spark, chunk_start, chunk_end)
        else:
            filtered_data, broken_ratio, inactive_ratio = get_data_bigquery(
                spark, chunk_start, chunk_end)

        # Process the data, transforming it into the form we desire.
        device_map = build_device_map()
        processed_data = filtered_data.map(
            lambda d: prepare_data(d, device_map))

        logger.info("Aggregating entries...")
        aggregated_pings = aggregate_data(processed_data)
        # Get the sample count; we need it to compute percentages instead of raw
        # numbers. Since we only keep the newest ping for each client, we can simply
        # count the number of pings. THIS MAY NOT BE CONSTANT ACROSS WEEKS!
        valid_records_count = filtered_data.count()

        # Collapse together groups that count less than 1% of our samples.
        threshold_to_collapse = int(valid_records_count * 0.01)

        logger.info(
            "Collapsing smaller groups into the other bucket (threshold {th})".
            format(th=threshold_to_collapse))
        collapsed_aggregates = collapse_buckets(aggregated_pings,
                                                threshold_to_collapse)

        logger.info("Post-processing raw values...")

        processed_aggregates = finalize_data(
            collapsed_aggregates,
            valid_records_count,
            broken_ratio,
            inactive_ratio,
            chunk_start,
        )

        if not validate_finalized_data(processed_aggregates):
            raise Exception("The aggregates failed to validate.")

        # Write the week start/end in the filename.
        suffix = ("-" + chunk_start.strftime("%Y%d%m") + "-" +
                  chunk_end.strftime("%Y%d%m"))
        file_name = get_file_name(suffix)

        date_to_json[file_name] = processed_aggregates

        # Move on to the next chunk: it starts one day after the end of the
        # last chunk.
        chunk_start = chunk_end + dt.timedelta(days=1)

    return date_to_json
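The backfill behaviour hinges on the weekly-chunking loop above. Here is a standalone sketch of just that pattern; the weekly_chunks helper name is illustrative and not part of the report code.

import datetime as dt


def weekly_chunks(range_start, range_end):
    # Yield (chunk_start, chunk_end) pairs covering the range in 7-day steps,
    # mirroring the loop in generate_report.
    chunk_start = range_start
    while chunk_start < range_end:
        chunk_end = chunk_start + dt.timedelta(days=6)
        yield chunk_start, chunk_end
        # The next chunk starts the day after the previous one ended.
        chunk_start = chunk_end + dt.timedelta(days=1)


# A three-week backfill starting on Sunday 2015-06-07 yields
# (06-07, 06-13), (06-14, 06-20) and (06-21, 06-27):
for start, end in weekly_chunks(dt.date(2015, 6, 7), dt.date(2015, 6, 27)):
    print(start, end)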
Example #4
def generate_report(start_date, end_date, spark):
    """Generate the hardware survey dataset for the reference timeframe.

    If the timeframe is longer than a week, split it into weekly chunks
    and process each chunk individually (this eases backfilling).

    The report for each week is saved in a local JSON file.

    Args:
        start_date: The date from which we start generating the report. If None,
           the report starts from the beginning of the past week (Sunday).
        end_date: The date that marks the end of the reporting period. This only
           makes sense if a |start_date| was provided. If None, this defaults
           to the end of the past week (Saturday).
    """
    # If no start_date was provided, generate a report for the past complete
    # week.

    last_week = moz_std.get_last_week_range()
    date_range = (
        moz_std.snap_to_beginning_of_week(start_date, "Sunday")
        if start_date is not None else last_week[0],
        end_date if
        (end_date is not None and start_date is not None) else last_week[1],
    )

    # Split the submission period into weekly chunks so we don't run out of
    # resources while aggregating when backfilling.
    chunk_start = date_range[0]
    chunk_end = None
    # Stores the hardware report for each weekly chunk, keyed by output file name.
    date_to_json = {}

    while chunk_start < date_range[1]:
        chunk_end = chunk_start + dt.timedelta(days=6)
        longitudinal_version = get_longitudinal_version(chunk_end)

        sql_query = """
                    SELECT
                       build,
                       client_id,
                       active_plugins,
                       system_os,
                       submission_date,
                       system,
                       system_gfx,
                       system_cpu,
                       normalized_channel
                    FROM
                       {}
                    WHERE
                       normalized_channel = 'release'
                    AND
                       build IS NOT NULL AND build[0].application_name = 'Firefox'
                    """.format(longitudinal_version)

        frame = spark.sql(sql_query)

        # The number of all the fetched records (including inactive and broken).
        records_count = frame.count()
        logger.info("Total record count for {}: {}".format(
            chunk_start.strftime("%Y%m%d"), records_count))

        # Fetch the data we need.
        data = frame.rdd.map(
            lambda r: get_latest_valid_per_client(r, chunk_start, chunk_end))

        # Filter out broken data.
        filtered_data = data.filter(
            lambda r: r not in [REASON_BROKEN_DATA, REASON_INACTIVE])

        # Count the broken records and inactive records.
        discarded = data.filter(
            lambda r: r in [REASON_BROKEN_DATA, REASON_INACTIVE]).countByValue(
            )

        # countByValue omits reasons that never occurred, so default to 0.
        broken_count = discarded.get(REASON_BROKEN_DATA, 0)
        inactive_count = discarded.get(REASON_INACTIVE, 0)
        broken_ratio = broken_count / float(records_count)
        inactive_ratio = inactive_count / float(records_count)
        logger.info(
            "Broken pings ratio: {}; Inactive clients ratio: {}".format(
                broken_ratio, inactive_ratio))

        # If we're not seeing sane values for the broken or inactive ratios,
        # bail out early. There's no point in aggregating.
        if broken_ratio >= 0.9 or inactive_ratio >= 0.9:
            raise Exception(
                "Unexpected ratio of broken pings or inactive clients. "
                "Broken ratio: {0}, inactive ratio: {1}".format(
                    broken_ratio, inactive_ratio))

        # Process the data, transforming it into the form we desire.
        device_map = build_device_map()
        processed_data = filtered_data.map(
            lambda d: prepare_data(d, device_map))

        logger.info("Aggregating entries...")
        aggregated_pings = aggregate_data(processed_data)
        # Get the sample count; we need it to compute percentages instead of raw
        # numbers. Since we only keep the newest ping for each client, we can simply
        # count the number of pings. THIS MAY NOT BE CONSTANT ACROSS WEEKS!
        valid_records_count = filtered_data.count()

        # Collapse together groups that count less than 1% of our samples.
        threshold_to_collapse = int(valid_records_count * 0.01)

        logger.info(
            "Collapsing smaller groups into the other bucket (threshold {th})".
            format(th=threshold_to_collapse))
        collapsed_aggregates = collapse_buckets(aggregated_pings,
                                                threshold_to_collapse)

        logger.info("Post-processing raw values...")

        processed_aggregates = finalize_data(
            collapsed_aggregates,
            valid_records_count,
            broken_ratio,
            inactive_ratio,
            chunk_start,
        )

        if not validate_finalized_data(processed_aggregates):
            raise Exception("The aggregates failed to validate.")

        # Write the week start/end in the filename.
        suffix = ("-" + chunk_start.strftime("%Y%d%m") + "-" +
                  chunk_end.strftime("%Y%d%m"))
        file_name = get_file_name(suffix)

        date_to_json[file_name] = processed_aggregates

        # Move on to the next chunk: it starts one day after the end of the
        # last chunk.
        chunk_start = chunk_end + dt.timedelta(days=1)

    return date_to_json
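The 1% collapse step is worth illustrating in isolation: buckets whose count falls below the threshold are folded into a single catch-all bucket. Below is a toy sketch of that idea; collapse_buckets in the report code works on richer, nested aggregates, and collapse_small_buckets here is a hypothetical helper that only shows the thresholding.

def collapse_small_buckets(counts, threshold):
    # Fold any bucket counted fewer than `threshold` times into "Other".
    collapsed = {}
    for key, count in counts.items():
        if count < threshold:
            collapsed["Other"] = collapsed.get("Other", 0) + count
        else:
            collapsed[key] = count
    return collapsed


# With 1000 valid records the threshold is int(1000 * 0.01) == 10, so a GPU
# model seen 7 times lands in "Other" while one seen 50 times keeps its key:
gpus = {"GPU-A": 700, "GPU-B": 243, "GPU-C": 50, "GPU-D": 7}
print(collapse_small_buckets(gpus, 10))
# {'GPU-A': 700, 'GPU-B': 243, 'GPU-C': 50, 'Other': 7}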