Example 1
def test_snapping():
    # Use Friday, July 3rd, 2015 as the reference date.
    ref_date = date(2015, 7, 3)

    # With no weekday_start argument provided, snap ref_date to the closest previous Sunday.
    expected_date = date(2015, 6, 28)  # Sunday, 28th June, 2015
    snapped_date = moz_utils.snap_to_beginning_of_week(ref_date)
    assert expected_date == snapped_date

    # Does this still work correctly when snapping to the closest Monday instead?
    expected_date = date(2015, 6, 29)  # Monday, 29th June, 2015
    snapped_date = moz_utils.snap_to_beginning_of_week(ref_date, "Monday")
    assert expected_date == snapped_date

    # Check that the correct date is returned when the reference date is Sunday.
    ref_date = expected_date = date(2015, 6, 28)
    snapped_date = moz_utils.snap_to_beginning_of_week(ref_date)
    assert expected_date == snapped_date

    # Can we correctly snap to the beginning of the month?
    ref_date = date(2015, 7, 3)
    expected_date = date(2015, 7, 1)
    snapped_date = moz_utils.snap_to_beginning_of_month(ref_date)
    assert expected_date == snapped_date

    # What if we're already at the beginning of the month?
    ref_date = expected_date = date(2015, 7, 1)
    snapped_date = moz_utils.snap_to_beginning_of_month(ref_date)
    assert expected_date == snapped_date
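
The tests above pin down the expected snapping behavior. For reference, here is a minimal sketch of what snap_to_beginning_of_week and snap_to_beginning_of_month could look like; this is an illustrative assumption, not the actual moz_utils implementation.

from datetime import date, timedelta

def snap_to_beginning_of_week(day, weekday_start="Sunday"):
    # Hypothetical sketch handling only "Sunday" and "Monday" starts.
    # date.weekday() is 0 for Monday and 6 for Sunday, so shift the index
    # by one when the week starts on Sunday.
    days_back = (day.weekday() + 1) % 7 if weekday_start == "Sunday" else day.weekday()
    return day - timedelta(days=days_back)

def snap_to_beginning_of_month(day):
    # Hypothetical sketch: the first of the month keeps year and month intact.
    return day.replace(day=1)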
Example 2
def main(start_date, debug):

    spark = (
        SparkSession
        .builder
        .appName("churn")
        .getOrCreate()
    )

    config = {
        "source": "s3://telemetry-parquet/churn/v2",
        "uploads": [
            {
                "name": "Pipeline-Analysis",
                "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
                "prefix": "mreid/churn"
            },
            {
                "name": "Dashboard",
                "bucket": "net-mozaws-prod-metrics-data",
                "prefix": "telemetry-churn"
            }
        ],
        "search_cohort": {
            "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
            "prefix": "amiyaguchi/churn_csv"
        }
    }
    assert_valid_config(config)

    if debug:
        config["uploads"] = [
            {
                "name": "Testing",
                "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
                "prefix": "amiyaguchi/churn_csv_testing"
            }
        ]
        config["search_cohort"] = {
            "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
            "prefix": "amiyaguchi/churn_csv_testing"
        }
        assert_valid_config(config)

    # Churn waits 10 days for pings to be sent from the client
    week_start_date = snap_to_beginning_of_week(
        datetime.strptime(start_date, "%Y%m%d") - timedelta(10),
        "Sunday")
    week_start = fmt(week_start_date)

    convert_week(spark, config, week_start)
Example 3
def test_snap_to_beginning_of_week():
    # Test default weekday_start
    tests = {
        "20170101": "20170101",
        "20170102": "20170101",
        "20170103": "20170101",
        "20170104": "20170101",
        "20170105": "20170101",
        "20170106": "20170101",
        "20170107": "20170101",
        "20170108": "20170108",
        "20170109": "20170108",
        "20170110": "20170108",
    }
    for arg, expected in iteritems(tests):
        d = parse(arg)
        e = parse(expected)
        assert e == std.snap_to_beginning_of_week(d)

    # Test a different weekday_start
    tests = {
        "20170101": "20161226",
        "20170102": "20170102",
        "20170103": "20170102",
        "20170104": "20170102",
        "20170105": "20170102",
        "20170106": "20170102",
        "20170107": "20170102",
        "20170108": "20170102",
        "20170109": "20170109",
        "20170110": "20170109",
    }
    for arg, expected in iteritems(tests):
        d = parse(arg)
        e = parse(expected)
        assert e == std.snap_to_beginning_of_week(d, weekday_start="Monday")
Example 4
def adjust_start_date(start_date, use_lag):
    """ Adjust reporting start date to the nearest sunday, optionally
    taking into account telemetry client latency.

    This lag period accounts for telemetry pings that need to be sent
    relative to the reporting period. For example, a client could have
    a backlog of stored telemetry pings from being disconnected to the
    internet. The 10 day period accounts for a majority of pings while
    being concious about reporting latency.

    :start_date datestring: reporting start date
    :use_lag bool:          adjust for client latency
    :return datestring:     closest sunday that accounts for latency
    """
    if use_lag:
        lag_time = timedelta(10)
    else:
        lag_time = timedelta(0)

    offset_start = datetime.strptime(start_date, "%Y%m%d") - lag_time
    week_start_date = snap_to_beginning_of_week(offset_start, "Sunday")
    return fmt(week_start_date)
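
A usage sketch, assuming fmt serializes dates as "%Y%m%d" (matching the strptime format above) and reusing the July 3rd, 2015 reference date from the tests:

adjust_start_date("20150703", use_lag=True)   # "20150621": minus 10 days is Tuesday, June 23rd, which snaps to Sunday, June 21st
adjust_start_date("20150703", use_lag=False)  # "20150628": the closest previous Sunday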
Example 5
def convert(d2v, week_start, datum):
    out = {"good": False}

    pcd = daynum_to_date(datum.profile_creation_date)
    if not sane_date(pcd):
        return out

    pcd_formatted = datetime.strftime(pcd, "%Y-%m-%d")

    out["client_id"] = datum.client_id
    channel = datum.normalized_channel
    out["is_funnelcake"] = is_funnelcake(datum.distribution_id)
    if out["is_funnelcake"]:
        channel = "{}-cck-{}".format(datum.normalized_channel,
                                     datum.distribution_id)
    out["channel"] = channel
    out["geo"] = top_country(datum.country)
    out["acquisition_period"] = snap_to_beginning_of_week(pcd, "Sunday")
    out["start_version"] = get_effective_version(d2v, channel, pcd_formatted)

    # bug 1337037 - stub attribution
    attribution_fields = ["source", "medium", "campaign", "content"]
    if datum.attribution:
        for field in attribution_fields:
            value = datum.attribution[field]
            if value:
                out[field] = value

    # bug 1323598
    if datum.distribution_id:
        out["distribution_id"] = datum.distribution_id
    if datum.default_search_engine:
        out["default_search_engine"] = datum.default_search_engine
    if datum.locale:
        out["locale"] = datum.locale

    device_count = 0
    if datum.sync_count_desktop is not None:
        device_count += datum.sync_count_desktop
    if datum.sync_count_mobile is not None:
        device_count += datum.sync_count_mobile

    if device_count > 1:
        out["sync_usage"] = "multiple"
    elif device_count == 1:
        out["sync_usage"] = "single"
    elif datum.sync_configured is not None:
        if datum.sync_configured:
            out["sync_usage"] = "single"
        else:
            out["sync_usage"] = "no"
    # Else we don't set sync_usage at all, and use a default value later.

    out["current_version"] = datum.version

    # The usage time is in seconds, but we really need hours.  Because
    # we filter out broken subsession_lengths, we could end up with
    # clients with no usage hours.
    out["usage_hours"] = ((datum.usage_seconds / SECONDS_IN_HOUR)
                          if datum.usage_seconds is not None
                          else 0.0)
    out["squared_usage_hours"] = out["usage_hours"] ** 2

    out["total_uri_count"] = datum.total_uri_count_per_client
    out["unique_domains_count"] = datum.average_unique_domains_count_per_client

    # Incoming subsession_start_date looks like "2016-02-22T00:00:00.0-04:00"
    client_date = None
    if datum.subsession_start_date is not None:
        try:
            client_date = datetime.strptime(
                datum.subsession_start_date[0:10], "%Y-%m-%d")
        except ValueError:
            # Bogus format
            pass
        except TypeError:
            # String contains null bytes or other weirdness. Example:
            # TypeError: must be string without null bytes, not unicode
            pass
    if client_date is None:
        # Fall back to submission date
        client_date = datetime.strptime(datum.submission_date_s3, "%Y%m%d")
    out["current_week"] = get_week_num(pcd, client_date)
    out["is_active"] = "yes"
    if client_date is not None:
        try:
            if datetime.strftime(client_date, "%Y%m%d") < week_start:
                out["is_active"] = "no"
        except ValueError:
            pass
    out["good"] = True
    return out
Example 6
def generate_report(start_date, end_date, spark, spark_provider="emr"):
    """Generate the hardware survey dataset for the reference timeframe.

    If the timeframe is longer than a week, split it into weekly chunks
    and process each chunk individually (this eases backfilling).

    The report for each week is saved in a local JSON file.

    Args:
        start_date: The date from which we start generating the report. If None,
           the report starts from the beginning of the past week (Sunday).
        end_date: The date that marks the end of the reporting period. This only
           makes sense if a |start_date| was provided. If None, this defaults
           to the end of the past week (Saturday).
        spark: SparkSession.
        spark_provider: Environment the application is running in. On `emr`,
           the Longitudinal dataset is used; on `dataproc`, data is loaded
           from `telemetry.main`.
    """
    # If no start_date was provided, generate a report for the past complete
    # week.

    last_week = moz_std.get_last_week_range()
    date_range = (
        moz_std.snap_to_beginning_of_week(start_date, "Sunday")
        if start_date is not None else last_week[0],
        end_date if
        (end_date is not None and start_date is not None) else last_week[1],
    )

    # Split the submission period into weekly chunks so we don't run out of resources
    # while aggregating if we want to backfill.
    chunk_start = date_range[0]
    chunk_end = None
    # Stores all hardware reports in json by date
    date_to_json = {}

    while chunk_start < date_range[1]:
        chunk_end = chunk_start + dt.timedelta(days=6)

        if spark_provider == "emr":
            filtered_data, broken_ratio, inactive_ratio = get_data_longitudinal(
                spark, chunk_start, chunk_end)
        else:
            filtered_data, broken_ratio, inactive_ratio = get_data_bigquery(
                spark, chunk_start, chunk_end)

        # Process the data, transforming it in the form we desire.
        device_map = build_device_map()
        processed_data = filtered_data.map(
            lambda d: prepare_data(d, device_map))

        logger.info("Aggregating entries...")
        aggregated_pings = aggregate_data(processed_data)
        # Get the sample count; we need it to compute percentages instead of raw numbers.
        # Since we're getting only the newest ping for each client, we can simply count the
        # number of pings. THIS MAY NOT BE CONSTANT ACROSS WEEKS!
        valid_records_count = filtered_data.count()

        # Collapse groups that account for less than 1% of our samples.
        threshold_to_collapse = int(valid_records_count * 0.01)

        logger.info(
            "Collapsing smaller groups into the other bucket (threshold {th})".
            format(th=threshold_to_collapse))
        collapsed_aggregates = collapse_buckets(aggregated_pings,
                                                threshold_to_collapse)

        logger.info("Post-processing raw values...")

        processed_aggregates = finalize_data(
            collapsed_aggregates,
            valid_records_count,
            broken_ratio,
            inactive_ratio,
            chunk_start,
        )

        if not validate_finalized_data(processed_aggregates):
            raise Exception("The aggregates failed to validate.")

        # Write the week start/end in the filename.
        suffix = ("-" + chunk_start.strftime("%Y%d%m") + "-" +
                  chunk_end.strftime("%Y%d%m"))
        file_name = get_file_name(suffix)

        date_to_json[file_name] = processed_aggregates

        # Move on to the next chunk by adding one day to the end of the last
        # chunk.
        chunk_start = chunk_end + dt.timedelta(days=1)

    return date_to_json
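
As an illustration of the weekly chunking above, the following loop walks a made-up three-week range (the dates are assumptions for the example) the same way generate_report does:

import datetime as dt

chunk_start, range_end = dt.date(2017, 1, 1), dt.date(2017, 1, 21)
while chunk_start < range_end:
    chunk_end = chunk_start + dt.timedelta(days=6)
    print(chunk_start, chunk_end)  # 01-01..01-07, then 01-08..01-14, then 01-15..01-21
    chunk_start = chunk_end + dt.timedelta(days=1)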
Example 7
def get_longitudinal_version(date):
    start_of_week = moz_std.snap_to_beginning_of_week(date, "Sunday")
    next_week = start_of_week + dt.timedelta(days=6)
    return "longitudinal_v" + next_week.strftime("%Y%m%d")
Example 8
def generate_report(start_date, end_date, spark):
    """Generate the hardware survey dataset for the reference timeframe.

    If the timeframe is longer than a week, split it into weekly chunks
    and process each chunk individually (this eases backfilling).

    The report for each week is saved in a local JSON file.

    Args:
        start_date: The date from which we start generating the report. If None,
           the report starts from the beginning of the past week (Sunday).
        end_date: The date that marks the end of the reporting period. This only
           makes sense if a |start_date| was provided. If None, this defaults
           to the end of the past week (Saturday).
    """
    # If no start_date was provided, generate a report for the past complete
    # week.

    last_week = moz_std.get_last_week_range()
    date_range = (
        moz_std.snap_to_beginning_of_week(start_date, "Sunday")
        if start_date is not None else last_week[0],
        end_date if
        (end_date is not None and start_date is not None) else last_week[1],
    )

    # Split the submission period into weekly chunks so we don't run out of resources
    # while aggregating if we want to backfill.
    chunk_start = date_range[0]
    chunk_end = None
    # Stores all hardware reports in json by date
    date_to_json = {}

    while chunk_start < date_range[1]:
        chunk_end = chunk_start + dt.timedelta(days=6)
        longitudinal_version = get_longitudinal_version(chunk_end)

        sqlQuery = """
                   SELECT
                      build,
                      client_id,
                      active_plugins,
                      system_os,
                      submission_date,
                      system,
                      system_gfx,
                      system_cpu,
                      normalized_channel
                   FROM
                      {}
                   WHERE
                      normalized_channel = 'release'
                   AND
                      build is not null and build[0].application_name = 'Firefox'
                   """.format(longitudinal_version)

        frame = spark.sql(sql_query)

        # The number of all the fetched records (including inactive and broken).
        records_count = frame.count()
        logger.info("Total record count for {}: {}".format(
            chunk_start.strftime("%Y%m%d"), records_count))

        # Fetch the data we need.
        data = frame.rdd.map(
            lambda r: get_latest_valid_per_client(r, chunk_start, chunk_end))

        # Filter out broken data.
        filtered_data = data.filter(
            lambda r: r not in [REASON_BROKEN_DATA, REASON_INACTIVE])

        # Count the broken records and inactive records.
        discarded = data.filter(
            lambda r: r in [REASON_BROKEN_DATA, REASON_INACTIVE]).countByValue(
            )

        broken_count = discarded.get(REASON_BROKEN_DATA, 0)
        inactive_count = discarded.get(REASON_INACTIVE, 0)
        broken_ratio = broken_count / float(records_count)
        inactive_ratio = inactive_count / float(records_count)
        logger.info(
            "Broken pings ratio: {}; Inactive clients ratio: {}".format(
                broken_ratio, inactive_ratio))

        # If we're not seeing sane values for the broken or inactive ratios,
        # bail out early. There's no point in aggregating.
        if broken_ratio >= 0.9 or inactive_ratio >= 0.9:
            raise Exception(
                "Unexpected ratio of broken pings or inactive clients. "
                "Broken ratio: {0}, inactive ratio: {1}".format(
                    broken_ratio, inactive_ratio))

        # Process the data, transforming it in the form we desire.
        device_map = build_device_map()
        processed_data = filtered_data.map(
            lambda d: prepare_data(d, device_map))

        logger.info("Aggregating entries...")
        aggregated_pings = aggregate_data(processed_data)
        # Get the sample count; we need it to compute percentages instead of raw numbers.
        # Since we're getting only the newest ping for each client, we can simply count the
        # number of pings. THIS MAY NOT BE CONSTANT ACROSS WEEKS!
        valid_records_count = filtered_data.count()

        # Collapse groups that account for less than 1% of our samples.
        threshold_to_collapse = int(valid_records_count * 0.01)

        logger.info(
            "Collapsing smaller groups into the other bucket (threshold {th})".
            format(th=threshold_to_collapse))
        collapsed_aggregates = collapse_buckets(aggregated_pings,
                                                threshold_to_collapse)

        logger.info("Post-processing raw values...")

        processed_aggregates = finalize_data(
            collapsed_aggregates,
            valid_records_count,
            broken_ratio,
            inactive_ratio,
            chunk_start,
        )

        if not validate_finalized_data(processed_aggregates):
            raise Exception("The aggregates failed to validate.")

        # Write the week start/end in the filename.
        suffix = ("-" + chunk_start.strftime("%Y%d%m") + "-" +
                  chunk_end.strftime("%Y%d%m"))
        file_name = get_file_name(suffix)

        date_to_json[file_name] = processed_aggregates

        # Move on to the next chunk by adding one day to the end of the last
        # chunk.
        chunk_start = chunk_end + dt.timedelta(days=1)

    return date_to_json