import csv
import datetime as DT

from mozetl import utils as U

# count_active_users, MONTH, and MAUDAU_SNAPSHOT_TEMPLATE are
# module-level names defined elsewhere in this module.


def generate_counts(frame, since, until=None):
    """
    A thin wrapper around moztelemetry counting functions.

    :frame main_summary or an isomorphic subset
    :since DT.date()
    :until DT.date() (unit test convenience)
    """
    # Narrow the frame to the three columns the counts need, exposing
    # client_id under the camelCase alias the counting functions expect.
    cols = [
        frame.client_id.alias("clientId"),
        "subsession_start_date",
        "submission_date_s3",
    ]
    narrow = frame.select(cols)
    updates = []
    today = DT.date.today()
    generated = U.format_as_submission_date(today)
    start = since
    until = until or today
    # Emit one {day, dau, mau} record per day in [since, until).
    while start < until:
        dau = count_active_users(narrow, start, 0)
        mau = count_active_users(narrow, start, MONTH)
        day = U.format_as_submission_date(start)
        d = {"day": day, "dau": dau, "mau": mau, "generated_on": generated}
        updates.append(d)
        start += DT.timedelta(1)
    return updates
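# Illustrative only (not part of the original module): generate_counts
# is typically fed main_summary read straight from parquet. The s3 path
# below mirrors the system check defaults further down and is an
# assumption here, as is the SparkSession named `spark`.
#
#   frame = spark.read.parquet("s3://telemetry-parquet/main_summary/v4")
#   updates = generate_counts(frame, since=DT.date(2017, 5, 1))
#   # -> [{'day': '20170501', 'dau': ..., 'mau': ...,
#   #      'generated_on': ...}, ...]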
def write_locally(results):
    """
    :results [{'day': '%Y%m%d', 'dau': <int>, 'mau': <int>,
               'generated_on': '%Y%m%d'}, ...]
    """
    publication_date = U.format_as_submission_date(DT.date.today())
    basename = MAUDAU_SNAPSHOT_TEMPLATE.format(publication_date)
    cols = ["day", "dau", "mau", "generated_on"]
    with open(basename, "w") as f:
        writer = csv.DictWriter(f, cols)
        writer.writeheader()
        writer.writerows(results)
    return basename
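# A minimal sketch of write_locally in use; the counts below are
# hypothetical placeholders, not real metrics. The resulting CSV holds
# a header line plus one row per day.
#
#   results = [
#       {"day": "20170501", "dau": 3, "mau": 40, "generated_on": "20170512"},
#       {"day": "20170502", "dau": 2, "mau": 41, "generated_on": "20170512"},
#   ]
#   path = write_locally(results)  # the dated snapshot filename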
import datetime as DT
import os
import tempfile

import pytest
from pyspark.sql.types import StructField, StructType, StringType

from mozetl.maudau import maudau as M
from mozetl.utils import format_as_submission_date

NARROW_SCHEMA = StructType([
    StructField("client_id", StringType(), True),
    StructField("submission_date_s3", StringType(), False),
    StructField("subsession_start_date", StringType(), True),
])

generated = format_as_submission_date(DT.date.today())


@pytest.fixture
def make_frame(spark):
    cols = ["client_id", "subsession_start_date", "submission_date_s3"]
    values = [
        ("a", "2017-05-01", "20170508"),
        ("b", "2017-05-01", "20170501"),
        ("c", "2017-05-01", "20170510"),
        ("a", "2017-05-02", "20170503"),
        ("b", "2017-05-03", "20170503"),
        ("b", "2017-05-04", "20170511"),
        ("a", "2017-05-05", "20170507"),
    ]
    return spark.createDataFrame(
        [dict(zip(cols, tup)) for tup in values],
        schema=NARROW_SCHEMA,
    )
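# A sketch of how the fixture might be exercised; the original suite's
# assertions are not shown here, so the checks below are structural
# illustrations only (they avoid pinning exact dau/mau values, which
# depend on count_active_users' windowing).
def test_generate_counts_shape(make_frame):
    updates = M.generate_counts(
        make_frame, DT.date(2017, 5, 1), DT.date(2017, 5, 3))
    assert [u["day"] for u in updates] == ["20170501", "20170502"]
    assert all(u["generated_on"] == generated for u in updates)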
import logging
import sys
from datetime import datetime, timedelta

import click
from pyspark.sql import SparkSession

from mozetl.utils import (
    format_as_submission_date,
    format_spark_path,
    stop_session_safely,
)

logging.basicConfig(level=logging.DEBUG)


@click.command()
@click.option("--local/--no-local", default=False)
@click.option(
    "--submission-date-s3",
    type=str,
    default=format_as_submission_date(datetime.now() - timedelta(2)),
)
@click.option("--input-bucket", type=str, default="telemetry-parquet")
@click.option("--input-prefix", type=str, default="main_summary/v4")
@click.option("--output-bucket", type=str, default="telemetry-test-bucket")
@click.option("--output-prefix", type=str, default="mozetl_system_check")
def main(local, submission_date_s3, input_bucket, input_prefix,
         output_bucket, output_prefix):
    # print argument information
    for k, v in locals().items():
        print("{}: {}".format(k, v))
    print("Python version: {}".format(sys.version_info))

    spark = SparkSession.builder.getOrCreate()
    print("Spark version: {}".format(spark.version))
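# Example invocation (illustrative; the script filename is an
# assumption). With no flags, the check targets main_summary data from
# two days ago, the default --submission-date-s3:
#
#   $ python system_check.py --local --submission-date-s3 20170510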