def test_get_pings_none_filter(test_store, mock_message_parser, spark_context):
    upload_ping(test_store, 'value1', app='Firefox')
    upload_ping(test_store, 'value2', app='Thunderbird')

    # A None filter matches everything, as does the '*' wildcard.
    pings = get_pings(spark_context, app=None)
    assert sorted(pings.collect()) == ['value1', 'value2']

    pings = get_pings(spark_context, app='*')
    assert sorted(pings.collect()) == ['value1', 'value2']
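# The tests in this section assume fixtures and helpers defined elsewhere in
# the suite (`test_store`, `mock_message_parser`, `spark_context`,
# `dummy_pool_executor`, `upload_ping`). A minimal sketch of what
# `upload_ping` is assumed to do -- the names, default dimensions, and key
# layout here are illustrative, not the real conftest:
def upload_ping_sketch(store, value, **kwargs):
    """Store `value` under a key built from the dimension kwargs so that
    get_pings can later retrieve it by the same filters."""
    dimensions = {
        'submission_date': '20160101',
        'app': 'Firefox',
        'channel': 'nightly',
        'version': '42.0',
        'build_id': '20160101000000',
    }
    dimensions.update(kwargs)
    key = '/'.join(str(dimensions[d]) for d in sorted(dimensions))
    store.store[key] = value  # assumes a dict-backed in-memory store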
def test_get_pings_multiple_by_range(test_store, mock_message_parser,
                                     spark_context):
    upload_ping(test_store, 'value1',
                **{f[0]: f[1] for f in test_data_for_range_match})
    upload_ping(test_store, 'value2',
                **{f[0]: f[2] for f in test_data_for_range_match})

    pings = get_pings(spark_context,
                      **{f[0]: f[1] for f in test_data_for_range_match})
    assert pings.collect() == ['value1']

    pings = get_pings(spark_context,
                      **{f[0]: (f[3], f[4]) for f in test_data_for_range_match})
    assert pings.collect() == ['value1']
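# `test_data_for_range_match`, used by the range tests in this section, is a
# module-level table of (filter_name, exact_value, non_matching_value,
# range_start, range_end) tuples. Its real contents live in the test module;
# the rows below are only an illustrative sketch:
test_data_for_range_match = [
    ('submission_date', '20160101', '20150101', '20160101', '20160102'),
    ('build_id', '20160101000000', '20150101000000',
     '20160101000000', '20160102000000'),
]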
def test_get_pings_fraction(test_store, mock_message_parser, spark_context):
    for i in range(1, 10 + 1):
        upload_ping(test_store, 'value', build_id=str(i))

    pings = get_pings(spark_context)
    assert pings.count() == 10

    pings = get_pings(spark_context, fraction=0.1)
    assert pings.count() == 1
# Presumably parametrized over the range table, e.g.:
# @pytest.mark.parametrize('filter_name,exact,wrong,start,end',
#                          test_data_for_range_match)
def test_get_pings_by_range(test_store, mock_message_parser, spark_context,
                            filter_name, exact, wrong, start, end):
    upload_ping(test_store, 'value1', **{filter_name: exact})
    upload_ping(test_store, 'value2', **{filter_name: wrong})

    pings = get_pings(spark_context, **{filter_name: exact})
    assert pings.collect() == ['value1']

    pings = get_pings(spark_context, **{filter_name: (start, end)})
    assert pings.collect() == ['value1']
def retrieve_crash_data(sc, submission_date_range, comparable_dimensions,
                        fraction):
    # Get the raw data: both main pings and crash pings for the date range.
    normal_pings = get_pings(sc,
                             doc_type="main",
                             submission_date=submission_date_range,
                             fraction=fraction)
    crash_pings = get_pings(sc,
                            doc_type="crash",
                            submission_date=submission_date_range,
                            fraction=fraction)
    return normal_pings.union(crash_pings)
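# Illustrative driver for retrieve_crash_data -- the date range, dimension
# list, and fraction are example values; note that `comparable_dimensions`
# is accepted but unused in the body above:
def example_retrieve_crash_data(sc):
    return retrieve_crash_data(
        sc,
        submission_date_range=('20160101', '20160107'),
        comparable_dimensions=['channel', 'os_name', 'os_version'],
        fraction=0.1)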
def test_get_pings_multiple_filters(test_store, mock_message_parser,
                                    spark_context):
    filters = dict(submission_date='20160101', channel='beta')
    upload_ping(test_store, 'value1', **filters)

    filters['app'] = 'Thunderbird'
    upload_ping(test_store, 'value2', **filters)

    # Every filter must match; only the second ping carries app='Thunderbird'.
    pings = get_pings(spark_context, **filters)
    assert pings.collect() == ['value2']
# Presumably parametrized over (filter_name, exact, wrong) triples.
def test_get_pings_by_exact_match(test_store, dummy_pool_executor,
                                  mock_message_parser, spark_context,
                                  filter_name, exact, wrong):
    upload_ping(test_store, 'value1', **{filter_name: exact})
    upload_ping(test_store, 'value2', **{filter_name: wrong})

    pings = get_pings(spark_context, **{filter_name: exact})
    assert pings.collect() == ['value1']
def test_get_pings_multiple_by_range(test_store, dummy_pool_executor,
                                     mock_message_parser, spark_context):
    upload_ping(test_store, 'value1',
                **{f[0]: f[1] for f in test_data_for_range_match})
    upload_ping(test_store, 'value2',
                **{f[0]: f[2] for f in test_data_for_range_match})

    pings = get_pings(spark_context,
                      **{f[0]: f[1] for f in test_data_for_range_match})
    assert pings.collect() == ['value1']

    pings = get_pings(
        spark_context,
        **{f[0]: (f[3], f[4]) for f in test_data_for_range_match})
    assert pings.collect() == ['value1']
def aggregate_metrics(sc, channels, submission_date, fraction=1):
    """
    Return the build-id and submission-date aggregates for a given
    submission date.

    :param sc: A SparkContext instance
    :param channels: Either the name of a channel or a list/tuple of names
    :param submission_date: The submission date for which the data will be
                            aggregated
    :param fraction: An approximate fraction of submissions to consider for
                     aggregation
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    channels = set(channels)
    rdds = [get_pings(sc,
                      channel=ch,
                      submission_date=submission_date,
                      doc_type="saved_session",
                      schema="v4",
                      fraction=fraction)
            for ch in channels]
    pings = reduce(lambda x, y: x.union(y), rdds)
    return _aggregate_metrics(pings)
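# Illustrative call -- channels, date, and fraction are example values. On
# Python 3, `reduce` must be imported (from functools import reduce), and the
# result is whatever _aggregate_metrics returns; per the docstring it is
# assumed to carry both the build-id and submission-date aggregates.
def example_aggregate_metrics(sc):
    return aggregate_metrics(sc, ['nightly', 'beta'], '20160101',
                             fraction=0.01)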
def test_get_pings_properties_keyedHistogram_with_processes(
        test_store, dummy_pool_executor, mock_message_parser, spark_context):
    measures = {
        "payload": {
            "processes": {
                "content": {
                    "keyedHistograms": {
                        "TEST": {
                            "key1": {"values": {"0": 2}},
                        }
                    }
                }
            },
            "keyedHistograms": {
                "TEST": {
                    "key1": {"values": {"0": 1}},
                },
            }
        }
    }

    field = 'payload/keyedHistograms/TEST'
    upload_ping(test_store, json.dumps(measures))

    pings = get_pings(spark_context)
    # `additional_histograms` is assumed to be defined at module level
    # (extra histogram definitions for the TEST histogram).
    filtered_pings = get_pings_properties(
        pings, [field],
        with_processes=True,
        additional_histograms=additional_histograms)

    res = (filtered_pings.map(lambda d: d.get(field))
           .filter(lambda p: p is not None and len(p.keys()) > 0))
    assert res.count() == 1

    # with_processes=True splits each key by process (suffixed _parent /
    # _children) while keeping the combined total under the bare key.
    hist = res.first()
    assert hist['key1_parent'] == 1
    assert hist['key1_children'] == 2
    assert hist['key1'] == 3
def test_get_pings_wrong_schema(test_store, dummy_pool_executor,
                                mock_message_parser, spark_context):
    with pytest.raises(ValueError):
        get_pings(spark_context, schema=1)
def test_get_pings_properties_keyedHistogram_exists_with_process(
        test_store, dummy_pool_executor, mock_message_parser, spark_context):
    # Before Firefox 51, histograms could be found in the child payloads.
    # This should handle obtaining histograms to keep behavior consistent.
    child_measures = {
        "payload": {
            "childPayloads": [
                {"keyedHistograms": {}},  # empty keyedHistograms
                {},                       # missing keyedHistograms
            ],
            "keyedHistograms": {
                "TEST": {
                    "key1": {"values": {"0": 1}},
                },
            }
        }
    }

    # The histograms for all child processes are aggregated in the content
    # process. Here, keyedHistograms do not exist in the content process.
    content_measures = {
        "payload": {
            "processes": {
                "content": {
                    "keyedHistograms": {}
                }
            },
            "keyedHistograms": {
                "TEST": {
                    "key1": {"values": {"0": 1}},
                },
            }
        }
    }

    field = 'payload/keyedHistograms/TEST'
    upload_ping(test_store, json.dumps(child_measures))
    upload_ping(test_store, json.dumps(content_measures))

    pings = get_pings(spark_context)
    filtered_pings = get_pings_properties(
        pings, [field], additional_histograms=additional_histograms)

    res = (filtered_pings.map(lambda d: d.get(field))
           .filter(lambda p: p is not None and len(p.keys()) > 0))

    # Assert existence: both ping shapes yield a non-empty TEST histogram.
    assert res.count() == 2
def test_get_pings_wrong_schema(test_store, mock_message_parser,
                                spark_context):
    with pytest.raises(ValueError):
        get_pings(spark_context, schema=1)