def test_where_exact_match():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    new_dataset = dataset.where(dim1='my-value')
    assert new_dataset is not dataset
    assert list(new_dataset.clauses.keys()) == ['dim1']
    condition = new_dataset.clauses['dim1']
    assert condition('my-value')
def test_scan_multiple_params():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    new_dataset = dataset.where(dim1='myvalue')
    assert new_dataset is not dataset
    assert list(new_dataset.clauses.keys()) == ['dim1']
    condition = new_dataset.clauses['dim1']
    assert condition('myvalue')
def test_where():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    clause = lambda x: True
    new_dataset = dataset.where(dim1=clause)

    assert new_dataset is not dataset
    assert new_dataset.clauses == {'dim1': clause}
def aggregate_metrics(sc, channels, submission_date, main_ping_fraction=1, fennec_ping_fraction=1, num_reducers=10000):
    """ Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param channel: Either the name of a channel or a list/tuple of names
    :param submission-date: The submission date for which the data will be aggregated
    :param fraction: An approximative fraction of submissions to consider for aggregation
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    channels = set(channels)
    pings = Dataset.from_source('telemetry') \
                   .where(appUpdateChannel=lambda x: x in channels,
                          submissionDate=submission_date,
                          docType='main',
                          sourceVersion='4',
                          appName=lambda x: x != 'Fennec') \
                   .records(sc, sample=main_ping_fraction)

    fennec_pings = Dataset.from_source('telemetry') \
                          .where(appUpdateChannel=lambda x: x in channels,
                                 submissionDate=submission_date,
                                 docType='saved_session',
                                 sourceVersion='4',
                                 appName='Fennec') \
                          .records(sc, sample=fennec_ping_fraction)

    all_pings = pings.union(fennec_pings)
    return _aggregate_metrics(all_pings)
def aggregate_metrics(sc,
                      channels,
                      submission_date,
                      main_ping_fraction=1,
                      fennec_ping_fraction=1,
                      num_reducers=10000):
    """ Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param channel: Either the name of a channel or a list/tuple of names
    :param submission-date: The submission date for which the data will be aggregated
    :param fraction: An approximative fraction of submissions to consider for aggregation
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    channels = set(channels)
    pings = Dataset.from_source('telemetry') \
                   .where(appUpdateChannel=lambda x: x in channels,
                          submissionDate=submission_date,
                          docType='main',
                          sourceVersion='4',
                          appName=lambda x: x != 'Fennec') \
                   .records(sc, sample=main_ping_fraction)

    fennec_pings = Dataset.from_source('telemetry') \
                          .where(appUpdateChannel=lambda x: x in channels,
                                 submissionDate=submission_date,
                                 docType='saved_session',
                                 sourceVersion='4',
                                 appName='Fennec') \
                          .records(sc, sample=fennec_ping_fraction)

    all_pings = pings.union(fennec_pings)
    return _aggregate_metrics(all_pings, num_reducers)
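# A minimal driver sketch for aggregate_metrics above, assuming a configured
# Spark installation and that _aggregate_metrics returns a Spark RDD; the
# entry-point name, channels, and sampling fraction are illustrative.
from datetime import datetime, timedelta

from pyspark.sql import SparkSession


def run_daily_aggregation():
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    # Aggregate yesterday's pings for two channels, sampling 10% of main pings.
    yesterday = (datetime.utcnow() - timedelta(days=1)).strftime("%Y%m%d")
    aggregates = aggregate_metrics(sc, ["nightly", "beta"], yesterday,
                                   main_ping_fraction=0.1)
    print("Aggregated {} records".format(aggregates.count()))
    sc.stop()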
def test_select():
    dataset1 = Dataset('test-bucket', ['dim1', 'dim2']).select('field1', 'field2')
    dataset2 = Dataset('test-bucket', ['dim1', 'dim2']).select('field1', field2='field2')
    dataset3 = Dataset('test-bucket', ['dim1', 'dim2']).select(field1='field1', field2='field2')

    assert dataset1.selection == {
        'field1': 'field1',
        'field2': 'field2',
    }

    assert dataset1.selection == dataset2.selection == dataset3.selection

    dataset4 = Dataset('test-bucket', ['dim1', 'dim2']).select('field1', field2='f2', field3='f3')

    assert dataset4.selection == {
        'field1': 'field1',
        'field2': 'f2',
        'field3': 'f3',
    }

    dataset5 = dataset4.select('field4', field5='f5')

    assert dataset5.selection == {
        'field1': 'field1',
        'field2': 'f2',
        'field3': 'f3',
        'field4': 'field4',
        'field5': 'f5'
    }
def test_where_wrong_dimension():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    clause = lambda x: True

    with pytest.raises(Exception) as exc_info:
        new_dataset = dataset.where(dim3=clause)

    assert str(exc_info.value) == 'The dimension dim3 doesn\'t exist'
def test_where_dupe_dimension():
    clause = lambda x: True
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/',
                      clauses={'dim1': clause})

    with pytest.raises(Exception) as exc_info:
        new_dataset = dataset.where(dim1=clause)

    assert str(exc_info.value) == 'There should be only one clause for dim1'
def test_scan_multiple_where_params(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir1/another-dir/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1='dir1', dim2='subdir1')
    summaries = dataset.summaries(spark_context)
    expected_key = 'dir1/subdir1/key1'
    assert summaries == [{'key': expected_key, 'size': len(store.store[expected_key])}]
def test_records_limit_and_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5, sample=0.9)
    assert records.count() == 5
def test_scan_no_dimensions():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan([], [
            'prefix/',
        ], {}, executor)

    assert folders == [
        'prefix/',
    ]
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = sorted(records.collect())

    assert records == [b'value1', b'value2']
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == [b'value1', b'value2']
def test_records_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5)
    assert records.count() == 5
def test_scan_with_prefix():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['prefix1/dir1/subdir1/key1'] = 'value1'
    store.store['prefix2/dir2/another-dir/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2',], ['prefix1/',], dataset.clauses, executor)
    assert list(folders) == ['prefix1/dir1/']
def aggregate_metrics(
    sc,
    channels,
    submission_date,
    main_ping_fraction=1,
    fennec_ping_fraction=1,
    num_reducers=10000,
    source="moztelemetry",
    project_id=None,
    dataset_id=None,
    avro_prefix=None,
):
    """ Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param channel: Either the name of a channel or a list/tuple of names
    :param submission-date: The submission date for which the data will be aggregated
    :param fraction: An approximative fraction of submissions to consider for aggregation
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    if source == "bigquery" and project_id and dataset_id:
        dataset = BigQueryDataset()
        pings = dataset.load(project_id, dataset_id, "main", submission_date,
                             channels, "normalized_app_name <> 'Fennec'")
        fennec_pings = dataset.load(project_id, dataset_id, "saved_session",
                                    submission_date, channels,
                                    "normalized_app_name = 'Fennec'")
    elif source == "avro" and avro_prefix:
        dataset = BigQueryDataset()
        pings = dataset.load_avro(avro_prefix, "main", submission_date,
                                  channels, "normalized_app_name <> 'Fennec'")
        fennec_pings = dataset.load_avro(avro_prefix, "saved_session",
                                         submission_date, channels,
                                         "normalized_app_name = 'Fennec'")
    else:
        pings = Dataset.from_source('telemetry') \
                    .where(appUpdateChannel=lambda x: x in channels,
                            submissionDate=submission_date,
                            docType='main',
                            sourceVersion='4',
                            appName=lambda x: x != 'Fennec') \
                    .records(sc, sample=main_ping_fraction)

        fennec_pings = Dataset.from_source('telemetry') \
                            .where(appUpdateChannel=lambda x: x in channels,
                                    submissionDate=submission_date,
                                    docType='saved_session',
                                    sourceVersion='4',
                                    appName='Fennec') \
                            .records(sc, sample=fennec_ping_fraction)

    all_pings = pings.union(fennec_pings)
    return _aggregate_metrics(all_pings, num_reducers)
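# A hedged sketch of calling the variant above with the BigQuery source; the
# project and dataset identifiers are placeholders, and BigQueryDataset must be
# importable from the surrounding module for this branch to run.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
aggregates = aggregate_metrics(
    spark.sparkContext,
    channels=["nightly"],
    submission_date="20190901",
    source="bigquery",
    project_id="my-gcp-project",          # placeholder project
    dataset_id="telemetry_ping_dataset",  # placeholder dataset
)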
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == [b'value1']
def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1

    assert summaries[0]['key'] in store.store
def test_scan_no_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/dir2/key1'
    value = 'value1'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'subdir'], ['prefix'], {}, executor)
    assert list(folders) == ['prefix']
def test_scan_with_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], [''], dataset.clauses, executor)
    assert list(folders) == ['dir1/']
def test_select_keep_state():
    """Test that calling select only mutates the selection of a dataset"""
    dataset_before = Dataset('test-bucket', ['dim1', 'dim2']).where(dim1=True)
    dataset_after = dataset_before.select('field1', 'field2')

    assert dataset_before.selection != dataset_after.selection
    assert dataset_before.bucket == dataset_after.bucket
    assert dataset_before.schema == dataset_after.schema
    assert dataset_before.store == dataset_after.store
    assert dataset_before.prefix == dataset_after.prefix
    assert dataset_before.clauses == dataset_after.clauses
def test_summaries_with_limit():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = list(dataset._summaries(1))

    assert len(summaries) == 1

    assert summaries[0]['key'] in store.store
def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)
    out, err = capsys.readouterr()
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == ['value1']
def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1

    assert summaries[0]['key'] in store.store
def test_select_dupe_properties():
    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select('field1')

    with pytest.raises(Exception) as exc_info:
        dataset.select('field1')

    assert str(exc_info.value) == 'The property field1 has already been selected'

    with pytest.raises(Exception) as exc_info:
        dataset.select(field1='keyword_field')

    assert str(exc_info.value) == 'The property field1 has already been selected'
def test_sanitized_dimensions(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir_1/subdir1/key1'] = 'value1'
    store.store['dir_1/subdir2/key2'] = 'value2'
    store.store['dir_2/subdir3/key3'] = 'value3'
    store.store['dir_3/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1="dir-1")

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)
    # create one group per item
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size', lambda x: [[y] for y in x])
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == ['value{}'.format(i) for i in range(1, spark_context.defaultParallelism + 2)]
def test_records_object(spark_context):
    expect = {"uid": 1}

    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['key'] = json.dumps(expect)

    ds = Dataset(bucket_name, None, store=store, max_concurrency=1)
    row = ds.records(spark_context, decode=decode).first()

    assert isinstance(row, dict)
    assert row == expect
def run_spinner_etl(sc):
    nightly_build_channels = ["nightly", "aurora"]
    sample_size = 1.0

    probe_available = datetime(2016, 9, 8)
    look_back_date = datetime.today() - timedelta(days=180)
    start_date = max(probe_available, look_back_date).strftime("%Y%m%d")
    end_date = datetime.today().strftime("%Y%m%d")

    def appBuildId_filter(b):
        return (
            (b.startswith(start_date) or b > start_date) and
            (b.startswith(end_date) or b < end_date)
        )

    print "Start Date: {}, End Date: {}".format(start_date, end_date)

    build_results = {}

    for build_type in nightly_build_channels:
        # Bug 1341340 - if we're looking for pings from before 20161012, we need to query
        # old infra.
        old_infra_pings = Dataset.from_source("telemetry-oldinfra") \
            .where(docType='main') \
            .where(submissionDate=lambda b: b < "20161201") \
            .where(appBuildId=appBuildId_filter) \
            .where(appUpdateChannel=build_type) \
            .records(sc, sample=sample_size)

        new_infra_pings = Dataset.from_source("telemetry") \
            .where(docType='main') \
            .where(submissionDate=lambda b: (b.startswith("20161201") or b > "20161201")) \
            .where(appBuildId=appBuildId_filter) \
            .where(appUpdateChannel=build_type) \
            .records(sc, sample=sample_size)

        pings = old_infra_pings.union(new_infra_pings)
        build_results[build_type] = get_short_and_long_spinners(pings)

    s3_client = boto3.client('s3')
    for result_key, results in build_results.items():
        filename = "severities_by_build_id_%s.json" % result_key
        results_json = json.dumps(results, ensure_ascii=False)

        with open(filename, 'w') as f:
            f.write(results_json)

        s3_client.upload_file(
            filename,
            'telemetry-public-analysis-2',
            'spinner-severity-generator/data/{}'.format(filename)
        )
def test_records_selection(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/subdir1/key1'
    value = '{"a": {"b": { "c": "value"}}}'
    store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).select(field='a.b.c')
    records = dataset.records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value'}]

    # Check that concatenating `select`s works as expected
    records = dataset.select(field2='a.b').records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value', 'field2': {'c': 'value'}}]
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)
    # create one group per item
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size_greedy',
                        lambda x, _: [[y] for y in x])
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == ['value{}'.format(i) for i in range(1, spark_context.defaultParallelism + 2)]
def test_summaries():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    summaries = dataset._summaries()
    assert len(list(summaries)) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
def run_spinner_etl(sc):
    sample_size = 1.0

    start_date = (datetime.today() - timedelta(days=180)).strftime("%Y%m%d")
    end_date = datetime.today().strftime("%Y%m%d")

    def appBuildId_filter(b):
        return b >= start_date and (b.startswith(end_date) or b < end_date)

    print("Start Date: {}, End Date: {}".format(start_date, end_date))

    results = {}

    pings = (Dataset.from_source("telemetry").where(docType="main").where(
        submissionDate=lambda b: b >= start_date).where(
            appBuildId=appBuildId_filter).where(
                appUpdateChannel="nightly").records(sc, sample=sample_size))

    results = get_short_and_long_spinners(pings)

    s3_client = boto3.client("s3")
    filename = "severities_by_build_id_nightly.json"
    with open(filename, "w") as f:
        f.write(json.dumps(results, ensure_ascii=False, cls=CustomEncoder))

    s3_client.upload_file(
        filename,
        "telemetry-public-analysis-2",
        "spinner-severity-generator/data/{}".format(filename),
    )
def dataset():
    bucket_name = 'test_bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = json.dumps({'foo': 1})
    store.store['dir2/subdir2/key2'] = json.dumps({'foo': 2})
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    return dataset
def extract(sc, submission_date, sample=0.01):
    landfill = (
        Dataset.from_source("landfill")
        .where(submissionDate=submission_date)
        .records(sc, sample=sample)
    )
    return landfill
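# A small usage sketch for extract above, assuming an active SparkContext sc;
# the date and sampling rate are illustrative.
raw_pings = extract(sc, "20190901", sample=0.01)
print("landfill records: {}".format(raw_pings.count()))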
def get_data(sc, config, date):
    date_str = date.strftime("%Y%m%d")

    pings = (Dataset.from_source("telemetry").where(docType='crash').where(
        submissionDate=lambda b: b.startswith(date_str)).where(
            appUpdateChannel=config['channel']).records(
                sc, sample=config['sample_size']))

    properties = [
        "environment/system/os/name",
        "environment/system/os/version",
        "application/architecture",
        "application/buildId",
        "payload/processType",
        "payload/stackTraces/crash_info/crashing_thread",
        "payload/stackTraces/threads",
        "payload/stackTraces/modules",
    ]

    try:
        pings_props = get_pings_properties(pings,
                                           properties,
                                           with_processes=True)
    except ValueError:
        return None

    return pings_props.map(map_to_hang_format)
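# The config dict consumed by get_data only needs the two keys read in its
# body; a hedged call sketch with illustrative values, assuming sc is already
# available in the session.
from datetime import datetime, timedelta

config = {"channel": "nightly", "sample_size": 0.05}
hang_pings = get_data(sc, config, datetime.today() - timedelta(days=1))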
def test_apply_selection():
    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select('field1.field2')
    json_obj = {'field1': {'field2': 'value'}}

    assert dataset._apply_selection(json_obj) == {'field1.field2': 'value'}

    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select(field='field1.field2')

    assert dataset._apply_selection(json_obj) == {'field': 'value'}

    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select(field='foo.bar')

    assert dataset._apply_selection(json_obj) == {'field': None}
def get_data(sc):
    pings = Dataset.from_source("telemetry").where(
        docType='main',
        submissionDate=(date.today() - timedelta(1)).strftime("%Y%m%d"),
        appUpdateChannel="nightly").records(sc, sample=0.1)

    return get_pings_properties(pings,
                                ["clientId", "environment/system/os/name"])
def submit(docType, channel, timestamp, group_by):
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    rdd = Dataset.from_source("telemetry").where(
        docType=docType, appUpdateChannel=channel,
        submissionDate=timestamp).records(sc, group_by=group_by)
    print("Number of rows of RDD: {}".format(rdd.count()))
    print("Number of partitions used: {}".format(rdd.getNumPartitions()))
    sc.stop()
def test_prefix_slash(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['a/b/dir1/subdir1/key1'] = 'value1'
    store.store['a/b/dir2/subdir2/key2'] = 'value2'
    store.store['x/b/dir3/subdir3/key3'] = 'value3'
    store.store['a/c/dir4/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, prefix='a/b', max_concurrency=1)

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

    # be sure "where" still works
    summaries_filtered = dataset.where(dim1='dir1').summaries(spark_context)
    assert len(summaries_filtered) == 1
    assert summaries_filtered[0]['key'] == 'a/b/dir1/subdir1/key1'
def test_records_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    assert len(records_1) == 10

    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()

    # The sampling seed is different, so we have two different samples.
    assert sorted(records_1) != sorted(records_2)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()

    # Same seed, same sample.
    assert sorted(records_1) == sorted(records_2)
def test_dataset_from_source(my_mock_s3, monkeypatch):
    meta_bucket_name = 'net-mozaws-prod-us-west-2-pipeline-metadata'

    bucket = boto3.resource('s3').Bucket(meta_bucket_name)
    bucket.create()

    store = S3Store(meta_bucket_name)
    data_dir = os.path.join(os.path.dirname(__file__), 'data')

    with open(os.path.join(data_dir, 'sources.json'), 'rb') as f:
        store.upload_file(f, '', 'sources.json')
    with open(os.path.join(data_dir, 'schema.json'), 'rb') as f:
        store.upload_file(f, 'telemetry-2/', 'schema.json')
        f.seek(0)
        expected_dimensions = json.loads(f.read().decode('utf-8'))['dimensions']

    dimensions = [dim['field_name'] for dim in expected_dimensions]

    assert Dataset.from_source('telemetry').schema == dimensions
def aggregate_metrics(sc, begin, end=None, num_partitions=10000):
    """
    Returns the build-id and submission date aggregates for a given submission date range.

    :param sc: A SparkContext instance
    :param begin: A string for the beginning date, in form "YYYYMMDD"
    :param end: An optional string for the end date, in form "YYYYMMDD". If
        not provided, metrics will only be aggregated for the date provided
        with `begin`.
    :param num_partitions: An optional value to be passed to `aggregateByKey`.

    """
    if end is None:
        end = begin

    pings = (Dataset.from_source('telemetry')
                    .where(docType='mobile_metrics',
                           submissionDate=lambda x: begin <= x <= end)
                    .records(sc))

    return _aggregate_metrics(pings, num_partitions)
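# A short usage sketch for the date-range variant above, assuming the same
# Spark setup as the other aggregators; the dates are examples only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
weekly_aggregates = aggregate_metrics(spark.sparkContext,
                                      begin="20190901", end="20190907")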
# COMMAND ----------

from moztelemetry.dataset import Dataset
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, DoubleType, LongType, MapType
from statsmodels.stats.weightstats import DescrStatsW

EXPERIMENT_ID = "prefflip-webrender-v1-2-1492568"
EXPERIMENT_ID_2 = "prefflip-webrender-v1-3-1492568"
PARTITIONS = [s.replace("-", "_") for s in (EXPERIMENT_ID, EXPERIMENT_ID_2)]

# COMMAND ----------

  Dataset.from_source("telemetry-cohorts")

# COMMAND ----------

to_summarize = {
  "composite_time": "payload.processes.gpu.histograms.COMPOSITE_TIME",
  "content_frame_time": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME",
  "content_frame_time_svg": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME_WITH_SVG",
  "content_frame_time_reason": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME_REASON",
  "content_frame_time_without_upload": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME_WITHOUT_UPLOAD",
  "content_paint_time": "payload.processes.content.histograms.CONTENT_PAINT_TIME",
  "tab_switch_composite": "payload.histograms.FX_TAB_SWITCH_COMPOSITE_E10S_MS",
  "content_full_paint_time": "payload.processes.gpu.histograms.CONTENT_FULL_PAINT_TIME",
  "page_load_ms": "payload.histograms.FX_PAGE_LOAD_MS_2"
}
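
# COMMAND ----------

# A minimal sketch of pulling the histograms listed in to_summarize from the
# cohort pings with the select API shown in the tests above. It assumes the
# telemetry-cohorts source exposes experimentId and docType dimensions and that
# sc is the notebook's SparkContext; the sampling rate is illustrative.
ping_fields = (
    Dataset.from_source("telemetry-cohorts")
           .where(experimentId=EXPERIMENT_ID, docType="main")
           .select(**to_summarize)
           .records(sc, sample=0.01)
)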