@classmethod
def setUpClass(cls):
    """Set up the database once for the test run."""
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings),
                                    num_reducers=10)
    submit_aggregates(aggregates)
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings))
    submit_aggregates(aggregates)
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
    @classmethod
    def setUpClass(cls):
        """Set up the database once for the test run."""
        clear_db()

        cls.sc = pyspark.SparkContext(master="local[*]")
        raw_pings = list(generate_pings())
        aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
        submit_aggregates(aggregates)
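The class-based setups above create a SparkContext but the snippets end before any cleanup. A minimal sketch of a matching teardown, assuming the same cls.sc attribute (the method below is illustrative, not taken from the original test suite):

    @classmethod
    def tearDownClass(cls):
        """Release the local SparkContext created in setUpClass."""
        cls.sc.stop()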
def test_submit():
    # Multiple submissions should not alter the aggregates in the db
    submit_aggregates(aggregates)
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    n_submission_dates = len(ping_dimensions["submission_date"])
    n_channels = len(ping_dimensions["channel"])
    n_versions = len(ping_dimensions["version"])
    n_build_ids = len(ping_dimensions["build_id"])
    assert(build_id_count == n_submission_dates*n_channels*n_versions*n_build_ids)
    assert(submission_date_count == n_submission_dates*n_channels*n_versions)
Example #6
def test_submit():
    # Multiple submissions should not alter the aggregates in the db
    submit_aggregates(aggregates)
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    n_submission_dates = len(ping_dimensions["submission_date"])
    n_channels = len(ping_dimensions["channel"])
    n_versions = len(ping_dimensions["version"])
    n_build_ids = len(ping_dimensions["build_id"])
    assert (build_id_count == n_submission_dates * n_channels * n_versions *
            n_build_ids)
    assert (submission_date_count == n_submission_dates * n_channels *
            n_versions)
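Both test_submit variants derive their expected counts from ping_dimensions. A hypothetical sketch of the fixture shape that the arithmetic assumes, with made-up values purely for illustration:

ping_dimensions = {
    "submission_date": ["20161111", "20161112"],
    "channel": ["nightly", "beta"],
    "version": ["52"],
    "build_id": ["20161110", "20161111"],
}

# One build-id aggregate per (submission_date, channel, version, build_id)
# combination and one submission-date aggregate per (submission_date, channel,
# version) combination.
expected_build_id_count = 2 * 2 * 1 * 2        # 8
expected_submission_date_count = 2 * 2 * 1     # 4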
Example #7
def test_null_label_character_submit():
    metric_info = ("SIMPLE_MEASURES_NULL_METRIC_LABEL", u"\u0001\u0000\u0000\u0000\u7000\ub82c", False)
    payload = {"sum": 4, "count": 2, "histogram": {2: 2}}
    key = ('20161111', 'nightly', '52', '20161111', 'Firefox', 'arch', 'linux', '42', 'false')
    aggregate = (key, {metric_info: payload})

    aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])]
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    assert build_id_count == 0, "Build id count should be 0, was {}".format(build_id_count)
    assert submission_date_count == 0, "submission date count should be 0, was {}".format(submission_date_count)
def test_null_arch_character_submit():
    metric_info = ("SIMPLE_MEASURES_NULL_ARCHITECTURE", "", False)
    payload = {"sum": 4, "count": 2, "histogram": {2: 2}}
    key = ('20161111', 'nightly', '52', '20161111', '', "Firefox", u"\x00", 'Windows', '2.4.21')
    aggregate = (key, {metric_info: payload})

    aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])]
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    assert build_id_count == 0, "Build id count should be 0, was {}".format(build_id_count)
    assert submission_date_count == 0, "submission date count should be 0, was {}".format(submission_date_count)
def test_null_arch_character_submit():
    metric_info = ("SIMPLE_MEASURES_NULL_ARCHITECTURE", "", False)
    payload = {"sum": 4, "count": 2, "histogram": {2: 2}}
    key = ('20161111', 'nightly', '52', '20161111', '', "Firefox", u"\x00",
           'Windows', '2.4.21')
    aggregate = (key, {metric_info: payload})

    aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])]
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    assert build_id_count == 0, "Build id count should be 0, was {}".format(
        build_id_count)
    assert submission_date_count == 0, "submission date count should be 0, was {}".format(
        submission_date_count)
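All three null-character tests expect counts of zero: an aggregate whose key or metric label contains a NUL byte is rejected rather than written to the database. A minimal illustration of the invariant being exercised (the helper below is hypothetical, not the library's actual validation):

def contains_nul(values):
    """Return True if any string in values contains a NUL character."""
    return any(u"\x00" in value for value in values)

key = ('20161111', 'nightly', '52', '20161111', '', "Firefox", u"\x00",
       'Windows', '2.4.21')
print(contains_nul(key))  # True, so submit_aggregates stores nothing for this key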
#!/mnt/anaconda2/bin/ipython

import logging
from os import environ
from pyspark import SparkContext, SparkConf
from mozaggregator.aggregator import aggregate_metrics
from mozaggregator.db import submit_aggregates

conf = SparkConf().setAppName('telemetry-aggregates')
sc = SparkContext(conf=conf)
date = environ['date']
print "Running job for {}".format(date)
aggregates = aggregate_metrics(sc, ("nightly", "beta", "release"), date)
print "Number of build-id aggregates: {}".format(aggregates[0].count())
print "Number of submission date aggregates: {}".format(aggregates[1].count())
submit_aggregates(aggregates)
sc.stop()
Example #11
def aggregates(sc):
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
    return aggregates
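The aggregates(sc) helper reads like a pytest fixture that depends on a shared SparkContext. A sketch of how that sc dependency might be provided, assuming pytest is used (the scope and naming here are assumptions, not taken from the original suite):

import pytest
import pyspark

@pytest.fixture(scope="session")
def sc():
    spark_context = pyspark.SparkContext(master="local[*]")
    yield spark_context
    spark_context.stop()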
# Read the database credentials object from S3.
creds = json.loads(
    boto.connect_s3()
    .get_bucket("telemetry-spark-emr-2")
    .get_key("aggregator_database_envvars.json")
    .get_contents_as_string()
)

for k, v in creds.items():
    environ[k] = v

# Attempt a database connection now so we can fail fast if credentials are broken.
_preparedb()

# Send jobs to the spark workers.
package_file = sys.argv[1]
print "Adding dependency " + package_file
conf = SparkConf().setAppName('telemetry-aggregates')
sc = SparkContext(conf=conf)
sc.addPyFile(package_file)

date = environ['date']
channels = [c.strip() for c in environ['channels'].split(',')]

print "Running job for {}".format(date)
aggregates = aggregate_metrics(sc, channels, date)
print "Number of build-id aggregates: {}".format(aggregates[0].count())
print "Number of submission date aggregates: {}".format(aggregates[1].count())

# Store the results in Postgres.
submit_aggregates(aggregates)

sc.stop()
Example #13
def run_aggregator(
    date,
    channels,
    credentials_protocol,
    credentials_bucket,
    credentials_prefix,
    postgres_db,
    postgres_user,
    postgres_pass,
    postgres_host,
    postgres_ro_host,
    num_partitions,
    source,
    project_id,
    dataset_id,
    avro_prefix,
):
    spark = SparkSession.builder.getOrCreate()

    # Mozaggregator expects a series of POSTGRES_* variables in order to connect
    # to a db instance; we may pull them into the environment now by reading an
    # object from a file system.
    def create_path(protocol, bucket, prefix):
        mapping = {"file": "file", "s3": "s3a", "gcs": "gs"}
        return f"{mapping[protocol]}://{bucket}/{prefix}"

    # priority of reading credentials is options > credentials file > environment
    option_credentials = {
        "POSTGRES_DB": postgres_db,
        "POSTGRES_USER": postgres_user,
        "POSTGRES_PASS": postgres_pass,
        "POSTGRES_HOST": postgres_host,
        "POSTGRES_RO_HOST": postgres_ro_host,
    }
    if all(option_credentials.values()):
        print("reading credentials from options")
        environ.update(option_credentials)
    elif credentials_bucket and credentials_prefix:
        path = create_path(credentials_protocol, credentials_bucket,
                           credentials_prefix)
        print(f"reading credentials from {path}")
        creds = spark.read.json(path, multiLine=True).first().asDict()
        environ.update(creds)
    else:
        print(f"assuming credentials from the environment")

    # Attempt a database connection now so we can fail fast if credentials are broken.
    db._preparedb()

    channels = [channel.strip() for channel in channels.split(",")]
    print(f"Running job for {date}")
    aggregates = aggregator.aggregate_metrics(
        spark.sparkContext,
        channels,
        date,
        num_reducers=num_partitions,
        source=source,
        project_id=project_id,
        dataset_id=dataset_id,
        avro_prefix=avro_prefix,
    )
    aggregates[0].cache()
    aggregates[1].cache()
    print(f"Number of build-id aggregates: {aggregates[0].count()}")
    print(f"Number of submission date aggregates: {aggregates[1].count()}")

    # Store the results in Postgres.
    db.submit_aggregates(aggregates)
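A hypothetical invocation of run_aggregator with placeholder values; in practice the arguments would come from a CLI or scheduler, and none of the values below are real credentials or project names:

run_aggregator(
    date="20161111",
    channels="nightly,beta",
    credentials_protocol="file",
    credentials_bucket=None,
    credentials_prefix=None,
    postgres_db="telemetry",
    postgres_user="aggregator",
    postgres_pass="example-password",
    postgres_host="localhost",
    postgres_ro_host="localhost",
    num_partitions=10,
    source=None,
    project_id=None,
    dataset_id=None,
    avro_prefix=None,
)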