Example #1
import datetime as DT

from mozetl import utils as U

# count_active_users and MONTH are defined elsewhere in this module;
# the U alias for mozetl.utils is inferred from usage.


def generate_counts(frame, since, until=None):
    """
    A thin wrapper around moztelemetry counting functions.

    :frame main_summary or an isomorphic subset
    :since DT.date()
    :until DT.date() (unit test convenience)
    """
    cols = [
        frame.client_id.alias("clientId"),
        "subsession_start_date",
        "submission_date_s3",
    ]
    narrow = frame.select(cols)
    updates = []
    today = DT.date.today()
    generated = U.format_as_submission_date(today)
    start = since
    until = until or today
    while start < until:
        # A 0-day window gives DAU; a MONTH-day window gives MAU.
        dau = count_active_users(narrow, start, 0)
        mau = count_active_users(narrow, start, MONTH)
        day = U.format_as_submission_date(start)
        d = {"day": day, "dau": dau, "mau": mau, "generated_on": generated}
        updates.append(d)
        start += DT.timedelta(1)
    return updates
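A hedged usage sketch (the parquet path is an assumption, matching the CLI defaults in Example #4; spark is an active SparkSession):

import datetime as DT

frame = spark.read.parquet("s3://telemetry-parquet/main_summary/v4")
rows = generate_counts(frame, since=DT.date.today() - DT.timedelta(7))
# rows -> [{'day': '20170501', 'dau': ..., 'mau': ...,
#           'generated_on': '20170508'}, ...]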
Example #2
import csv
import datetime as DT

from mozetl import utils as U

# MAUDAU_SNAPSHOT_TEMPLATE is a module-level filename template; its exact
# value is not shown in this excerpt.


def write_locally(results):
    """
    :results [{'day': '%Y%m%d', 'dau': <int>,
              'mau': <int>, 'generated_on': '%Y%m%d'}, ...]
    """
    publication_date = U.format_as_submission_date(DT.date.today())
    basename = MAUDAU_SNAPSHOT_TEMPLATE.format(publication_date)
    cols = ["day", "dau", "mau", "generated_on"]
    with open(basename, "w") as f:
        writer = csv.DictWriter(f, cols)
        writer.writeheader()
        writer.writerows(results)
    return basename
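A sketch of the expected call, with illustrative values in the row shape from the docstring (the returned basename depends on MAUDAU_SNAPSHOT_TEMPLATE):

rows = [
    {"day": "20170501", "dau": 100, "mau": 1500, "generated_on": "20170508"},
    {"day": "20170502", "dau": 110, "mau": 1510, "generated_on": "20170508"},
]
basename = write_locally(rows)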
Example #3
import datetime as DT
import os
import tempfile

import pytest
from pyspark.sql.types import StructField, StructType, StringType

from mozetl.maudau import maudau as M
from mozetl.utils import format_as_submission_date

NARROW_SCHEMA = StructType([
    StructField("client_id", StringType(), True),
    StructField("submission_date_s3", StringType(), False),
    StructField("subsession_start_date", StringType(), True)
])

generated = format_as_submission_date(DT.date.today())


@pytest.fixture
def make_frame(spark):
    # Build a small frame in the narrow shape generate_counts consumes.
    cols = ['client_id', 'subsession_start_date', 'submission_date_s3']
    values = [
        ('a', '2017-05-01', '20170508'),
        ('b', '2017-05-01', '20170501'),
        ('c', '2017-05-01', '20170510'),
        ('a', '2017-05-02', '20170503'),
        ('b', '2017-05-03', '20170503'),
        ('b', '2017-05-04', '20170511'),
        ('a', '2017-05-05', '20170507'),
    ]
    return spark.createDataFrame(
        [dict(zip(cols, tup)) for tup in values], schema=NARROW_SCHEMA)
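A sketch of a test built on this fixture; the expected row count follows from generate_counts stepping one day at a time over [since, until):

def test_generate_counts_shape(make_frame):
    since = DT.date(2017, 5, 1)
    until = DT.date(2017, 5, 3)
    rows = M.generate_counts(make_frame, since, until)
    assert len(rows) == 2  # 2017-05-01 and 2017-05-02
    for row in rows:
        assert set(row) == {"day", "dau", "mau", "generated_on"}
        assert row["generated_on"] == generated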
Example #4
import logging
import sys
from datetime import datetime, timedelta

import click
from pyspark.sql import SparkSession

from mozetl.utils import (
    format_as_submission_date,
    format_spark_path,
    stop_session_safely,
)

logging.basicConfig(level=logging.DEBUG)


@click.command()
@click.option("--local/--no-local", default=False)
@click.option(
    "--submission-date-s3",
    type=str,
    # Two days back, presumably to allow for ingestion latency.
    default=format_as_submission_date(datetime.now() - timedelta(2)),
)
@click.option("--input-bucket", type=str, default="telemetry-parquet")
@click.option("--input-prefix", type=str, default="main_summary/v4")
@click.option("--output-bucket", type=str, default="telemetry-test-bucket")
@click.option("--output-prefix", type=str, default="mozetl_system_check")
def main(local, submission_date_s3, input_bucket, input_prefix, output_bucket,
         output_prefix):
    # print argument information
    for k, v in locals().items():
        print("{}: {}".format(k, v))

    print("Python version: {}".format(sys.version_info))
    spark = SparkSession.builder.getOrCreate()
    print("Spark version: {}".format(spark.version))