def test_basic_response(self):
    """Run NamespaceEdits end-to-end and check IND, AVG, and STD outputs."""
    edits_metric = metric_classes['NamespaceEdits'](
        name='NamespaceEdits',
        namespaces=[0, 1, 2],
        start_date='2013-01-01 00:00:00',
        end_date='2013-01-02 00:00:00',
    )
    report = AggregateReport(
        edits_metric,
        self.cohort,
        {
            'individualResults': True,
            'aggregateResults': True,
            'aggregateSum': True,
            'aggregateAverage': True,
            'aggregateStandardDeviation': True,
        },
        user_id=self.owner_user_id,
    )
    result = report.task.delay(report).get()

    # First editor's individual count, then the cohort-wide statistics.
    assert_equals(result[Aggregation.IND][self.editor(0)]['edits'], 2)
    assert_equals(result[Aggregation.AVG]['edits'], r(1.0))
    assert_equals(result[Aggregation.STD]['edits'], r(1.0))
def test_aggregate_empty_results(self):
    '''
    Tests what happens when no users are returned for the initial metric
    run so there are no users to aggregate.
    '''
    self.create_wiki_cohort()
    # Date range predates any fixture edits, so no per-user results exist.
    metric = metric_classes['NamespaceEdits'](
        name='NamespaceEdits',
        namespaces=[0, 1, 2],
        start_date='2010-01-01 00:00:00',
        end_date='2010-01-02 00:00:00',
    )
    options = {
        'individualResults': True,
        'aggregateResults': True,
        'aggregateSum': True,
        'aggregateAverage': True,
        'aggregateStandardDeviation': True,
    }
    ar = AggregateReport(
        metric,
        self.basic_wiki_cohort,
        options,
        user_id=self.basic_wiki_cohort_owner,
    )
    result = ar.task.delay(ar).get()

    # list(...) keeps this assertion valid on both Python 2 and 3:
    # dict.keys() returns a view (never equal to a list) on Python 3.
    assert_equals(list(result[Aggregation.IND].keys()), [])
    # With no users every aggregate degrades gracefully to zero.
    assert_equals(result[Aggregation.SUM]['edits'], r(0))
    assert_equals(result[Aggregation.AVG]['edits'], r(0))
    assert_equals(result[Aggregation.STD]['edits'], r(0))
def test_finish(self):
    """finish() aggregates per-user result dicts; None-keyed rows count."""
    report = AggregateReport(
        metric_classes['NamespaceEdits'](
            name='NamespaceEdits',
            namespaces=[0, 1, 2],
            start_date='2013-05-01 00:00:00',
            end_date='2013-09-01 00:00:00',
        ),
        self.cohort,
        {
            'individualResults': True,
            'aggregateResults': True,
            'aggregateSum': True,
            'aggregateAverage': True,
            'aggregateStandardDeviation': True,
        },
        user_id=self.owner_user_id,
    )

    # Integer sub-metric: 2 + 3 + 0 + 0 = 5 over four users.
    edits_by_user = {
        1: {'edits': 2},
        2: {'edits': 3},
        3: {'edits': 0},
        None: {'edits': 0},
    }
    finished = report.finish([edits_by_user])
    assert_equals(finished[Aggregation.SUM]['edits'], 5)
    assert_equals(finished[Aggregation.AVG]['edits'], r(1.25))

    # Decimal sub-metric exercises SUM, AVG and STD rounding together.
    other_by_user = {
        1: {'other_sub_metric': r(2.3)},
        2: {'other_sub_metric': r(3.4)},
        3: {'other_sub_metric': r(0.0)},
        None: {'other_sub_metric': 0},
    }
    finished = report.finish([other_by_user])
    assert_equals(finished[Aggregation.SUM]['other_sub_metric'], r(5.7))
    assert_equals(finished[Aggregation.AVG]['other_sub_metric'], r(1.425))
    assert_equals(finished[Aggregation.STD]['other_sub_metric'], r(1.4771))
def test_finish(self):
    """finish() merges per-metric child results under ar.result_key."""
    namespace_metric = metric_classes['NamespaceEdits'](
        name='NamespaceEdits',
        namespaces=[0, 1, 2],
        start_date='2013-05-01 00:00:00',
        end_date='2013-09-01 00:00:00',
    )
    ar = AggregateReport(
        self.cohort,
        namespace_metric,
        individual=True,
        aggregate=True,
        aggregate_sum=True,
        aggregate_average=True,
        aggregate_std_deviation=True,
        user_id=self.owner_user_id,
    )
    finished = ar.finish([
        {
            'namespace edits - fake cohort': {
                1: {'edits': 2},
                2: {'edits': 3},
                3: {'edits': 0},
                None: {'edits': 0},
            },
        },
        {
            'some other metric - fake cohort': {
                1: {'other_sub_metric': r(2.3)},
                2: {'other_sub_metric': r(3.4)},
                3: {'other_sub_metric': r(0.0)},
                None: {'other_sub_metric': 0},
            },
        },
    ])
    aggregates = finished[ar.result_key]

    # Sums: 2+3+0+0 = 5 and 2.3+3.4+0.0+0 = 5.7.
    assert_equals(aggregates[Aggregation.SUM]['edits'], 5)
    assert_equals(aggregates[Aggregation.SUM]['other_sub_metric'], r(5.7))
    # Averages over four users (the None row counts as a user).
    assert_equals(aggregates[Aggregation.AVG]['edits'], r(1.25))
    assert_equals(aggregates[Aggregation.AVG]['other_sub_metric'], r(1.425))
    assert_equals(aggregates[Aggregation.STD]['other_sub_metric'], r(1.4771))
def test_basic_response(self):
    """End-to-end run; results are nested under the persisted result_key."""
    ar = AggregateReport(
        self.cohort,
        metric_classes['NamespaceEdits'](
            name='NamespaceEdits',
            namespaces=[0, 1, 2],
            start_date='2013-01-01 00:00:00',
            end_date='2013-01-02 00:00:00',
        ),
        individual=True,
        aggregate=True,
        aggregate_sum=True,
        aggregate_average=True,
        aggregate_std_deviation=True,
        user_id=self.owner_user_id,
    )
    result = ar.task.delay(ar).get()
    self.session.commit()

    # Fetch the key the persisted report stored its results under.
    report_row = self.session.query(PersistentReport).filter(
        PersistentReport.id == ar.persistent_id
    ).one()
    aggregate_key = report_row.result_key

    individual = result[aggregate_key][Aggregation.IND][0]
    assert_equals(individual[self.editors[0].user_id]['edits'], 2)
    assert_equals(result[aggregate_key][Aggregation.AVG]['edits'], r(1.0))
    assert_equals(result[aggregate_key][Aggregation.STD]['edits'], r(1.0))
def test_timeseries_day(self):
    """Daily timeseries: every aggregate becomes a date -> value mapping."""
    daily_metric = NamespaceEdits(
        namespaces=[0],
        start_date='2012-12-31 00:00:00',
        end_date='2013-01-03 00:00:00',
        timeseries=TimeseriesChoices.DAY,
    )
    report = AggregateReport(
        daily_metric,
        self.cohort,
        {
            'individualResults': True,
            'aggregateResults': True,
            'aggregateSum': True,
            'aggregateAverage': True,
            'aggregateStandardDeviation': True,
        },
        user_id=self.owner_user_id,
    )
    results = report.task.delay(report).get()
    self.session.commit()

    # First editor's per-day counts.
    assert_equals(
        results[Aggregation.IND][self.editor(0)]['edits'],
        {
            '2012-12-31 00:00:00': 1,
            '2013-01-01 00:00:00': 2,
            '2013-01-02 00:00:00': 0,
        },
    )
    # Cohort-wide per-day sums.
    assert_equals(
        results[Aggregation.SUM]['edits'],
        {
            '2012-12-31 00:00:00': 1,
            '2013-01-01 00:00:00': 5,
            '2013-01-02 00:00:00': 2,
        },
    )
    # Per-day averages over the cohort.
    assert_equals(
        results[Aggregation.AVG]['edits'],
        {
            '2012-12-31 00:00:00': r(0.25),
            '2013-01-01 00:00:00': r(1.25),
            '2013-01-02 00:00:00': r(0.5),
        },
    )
    # Per-day standard deviations.
    assert_equals(
        results[Aggregation.STD]['edits'],
        {
            '2012-12-31 00:00:00': r(0.4330),
            '2013-01-01 00:00:00': r(0.4330),
            '2013-01-02 00:00:00': r(0.8660),
        },
    )
def test_timeseries_day(self):
    """Daily timeseries through the result_key-keyed response shape."""
    ar = AggregateReport(
        self.cohort,
        NamespaceEdits(
            namespaces=[0],
            start_date='2012-12-31 00:00:00',
            end_date='2013-01-03 00:00:00',
            timeseries=TimeseriesChoices.DAY,
        ),
        individual=True,
        aggregate=True,
        aggregate_sum=True,
        aggregate_average=True,
        aggregate_std_deviation=True,
        user_id=self.owner_user_id,
    )
    results = ar.task.delay(ar).get()
    self.session.commit()

    # Resolve the key the persisted report stored its results under.
    aggregate_key = self.session.query(PersistentReport).filter(
        PersistentReport.id == ar.persistent_id
    ).one().result_key
    aggregates = results[aggregate_key]

    individual = aggregates[Aggregation.IND][0][self.editors[0].user_id]
    assert_equals(
        individual['edits'],
        {
            '2012-12-31 00:00:00': 1,
            '2013-01-01 00:00:00': 2,
            '2013-01-02 00:00:00': 0,
        },
    )
    assert_equals(
        aggregates[Aggregation.SUM]['edits'],
        {
            '2012-12-31 00:00:00': 1,
            '2013-01-01 00:00:00': 5,
            '2013-01-02 00:00:00': 2,
        },
    )
    assert_equals(
        aggregates[Aggregation.AVG]['edits'],
        {
            '2012-12-31 00:00:00': r(0.25),
            '2013-01-01 00:00:00': r(1.25),
            '2013-01-02 00:00:00': r(0.5),
        },
    )
    assert_equals(
        aggregates[Aggregation.STD]['edits'],
        {
            '2012-12-31 00:00:00': r(0.4330),
            '2013-01-01 00:00:00': r(0.4330),
            '2013-01-02 00:00:00': r(0.8660),
        },
    )
def test_basic_response(self):
    """Smoke-test the aggregate pipeline on a tiny one-day window."""
    metric = metric_classes['NamespaceEdits'](
        name='NamespaceEdits',
        namespaces=[0, 1, 2],
        start_date='2013-01-01 00:00:00',
        end_date='2013-01-02 00:00:00',
    )
    # Request every aggregate the report supports.
    options = dict(
        individualResults=True,
        aggregateResults=True,
        aggregateSum=True,
        aggregateAverage=True,
        aggregateStandardDeviation=True,
    )
    ar = AggregateReport(metric, self.cohort, options, user_id=self.owner_user_id)
    result = ar.task.delay(ar).get()

    assert_equals(result[Aggregation.IND][self.editor(0)]['edits'], 2)
    assert_equals(result[Aggregation.AVG]['edits'], r(1.0))
    assert_equals(result[Aggregation.STD]['edits'], r(1.0))
def test_timeseries_day(self):
    """DAY-granularity timeseries: aggregates keyed by timestamp string."""
    metric = NamespaceEdits(
        namespaces=[0],
        start_date='2012-12-31 00:00:00',
        end_date='2013-01-03 00:00:00',
        timeseries=TimeseriesChoices.DAY,
    )
    options = dict(
        individualResults=True,
        aggregateResults=True,
        aggregateSum=True,
        aggregateAverage=True,
        aggregateStandardDeviation=True,
    )
    ar = AggregateReport(metric, self.cohort, options, user_id=self.owner_user_id)
    results = ar.task.delay(ar).get()
    self.session.commit()

    expected_individual = {
        '2012-12-31 00:00:00': 1,
        '2013-01-01 00:00:00': 2,
        '2013-01-02 00:00:00': 0,
    }
    expected_sum = {
        '2012-12-31 00:00:00': 1,
        '2013-01-01 00:00:00': 5,
        '2013-01-02 00:00:00': 2,
    }
    expected_avg = {
        '2012-12-31 00:00:00': r(0.25),
        '2013-01-01 00:00:00': r(1.25),
        '2013-01-02 00:00:00': r(0.5),
    }
    expected_std = {
        '2012-12-31 00:00:00': r(0.4330),
        '2013-01-01 00:00:00': r(0.4330),
        '2013-01-02 00:00:00': r(0.8660),
    }
    assert_equals(results[Aggregation.IND][self.editor(0)]['edits'], expected_individual)
    assert_equals(results[Aggregation.SUM]['edits'], expected_sum)
    assert_equals(results[Aggregation.AVG]['edits'], expected_avg)
    assert_equals(results[Aggregation.STD]['edits'], expected_std)
def test_finish_timeseries(self):
    """finish() aggregates timeseries sub-metrics date by date."""
    metric = NamespaceEdits(
        namespaces=[0],
        start_date='2012-12-31 00:00:00',
        end_date='2013-01-03 00:00:00',
        timeseries=TimeseriesChoices.DAY,
    )
    options = dict(
        individualResults=True,
        aggregateResults=True,
        aggregateSum=True,
        aggregateAverage=True,
        aggregateStandardDeviation=True,
    )
    ar = AggregateReport(metric, self.cohort, options, user_id=self.owner_user_id)

    # Integer timeseries; the None user contributes only None values,
    # which must be ignored by every aggregate.
    finished = ar.finish([
        {
            1: {'edits': {'date1': 1, 'date2': 2}},
            2: {'edits': {'date1': 0, 'date2': 1}},
            3: {'edits': {'date1': 0, 'date2': 0}},
            None: {'edits': {'date1': None, 'date2': None}},
        },
    ])
    assert_equals(finished[Aggregation.SUM]['edits'], {'date1': 1, 'date2': 3})
    assert_equals(finished[Aggregation.AVG]['edits'], {'date1': r(0.3333), 'date2': r(1.0)})
    assert_equals(finished[Aggregation.STD]['edits'], {'date1': r(0.4714), 'date2': r(0.8165)})

    # Decimal timeseries with a different set of dates.
    finished = ar.finish([
        {
            1: {'other_sub_metric': {'date3': r(2.3), 'date4': 0}},
            2: {'other_sub_metric': {'date3': 0, 'date4': r(3.4)}},
            3: {'other_sub_metric': {'date3': None, 'date4': None}},
            None: {'other_sub_metric': {'date3': None, 'date4': None}},
        },
    ])
    assert_equals(finished[Aggregation.SUM]['other_sub_metric'], {'date3': r(2.3), 'date4': r(3.4)})
    assert_equals(finished[Aggregation.AVG]['other_sub_metric'], {'date3': r(1.15), 'date4': r(1.7)})
    assert_equals(finished[Aggregation.STD]['other_sub_metric'], {'date3': r(1.15), 'date4': r(1.7)})
def test_finish_timeseries(self):
    """Check SUM/AVG/STD per date when sub-metric values are timeseries."""
    report = AggregateReport(
        NamespaceEdits(
            namespaces=[0],
            start_date='2012-12-31 00:00:00',
            end_date='2013-01-03 00:00:00',
            timeseries=TimeseriesChoices.DAY,
        ),
        self.cohort,
        {
            'individualResults': True,
            'aggregateResults': True,
            'aggregateSum': True,
            'aggregateAverage': True,
            'aggregateStandardDeviation': True,
        },
        user_id=self.owner_user_id,
    )

    integer_series = {
        1: {'edits': {'date1': 1, 'date2': 2}},
        2: {'edits': {'date1': 0, 'date2': 1}},
        3: {'edits': {'date1': 0, 'date2': 0}},
        None: {'edits': {'date1': None, 'date2': None}},
    }
    finished = report.finish([integer_series])
    # None values are excluded, so counts differ per date.
    assert_equals(
        finished[Aggregation.SUM]['edits'],
        {'date1': 1, 'date2': 3},
    )
    assert_equals(
        finished[Aggregation.AVG]['edits'],
        {'date1': r(0.3333), 'date2': r(1.0)},
    )
    assert_equals(
        finished[Aggregation.STD]['edits'],
        {'date1': r(0.4714), 'date2': r(0.8165)},
    )

    decimal_series = {
        1: {'other_sub_metric': {'date3': r(2.3), 'date4': 0}},
        2: {'other_sub_metric': {'date3': 0, 'date4': r(3.4)}},
        3: {'other_sub_metric': {'date3': None, 'date4': None}},
        None: {'other_sub_metric': {'date3': None, 'date4': None}},
    }
    finished = report.finish([decimal_series])
    assert_equals(
        finished[Aggregation.SUM]['other_sub_metric'],
        {'date3': r(2.3), 'date4': r(3.4)},
    )
    assert_equals(
        finished[Aggregation.AVG]['other_sub_metric'],
        {'date3': r(1.15), 'date4': r(1.7)},
    )
    assert_equals(
        finished[Aggregation.STD]['other_sub_metric'],
        {'date3': r(1.15), 'date4': r(1.7)},
    )
def calculate(self, results_by_user, type_of_aggregate, average=None):
    # TODO: terrible redo this
    """
    Calculates one type of aggregate by just iterating over the individual
    results.
    Takes into account that results and aggregates may be split up by
    timeseries.
    Also makes sure to ignore censored records when appropriate.

    Parameters
        results_by_user     : dict of individual results, keyed by user id;
                              each value maps sub-metric name -> number, or
                              sub-metric name -> {timeseries key -> number}
        type_of_aggregate   : can be SUM, AVG, STD
        average             : None by default but required when computing STD
                              (the AVG result from a previous pass, used for
                              the squared differences)

    Returns
        The aggregate specified, computed at the timeseries level if
        applicable
    """
    # aggregation holds the final (rounded) values; helper accumulates the
    # running sum / squared-diffs / count needed to produce them.
    aggregation = dict()
    helper = dict()
    for user_id in results_by_user.keys():
        for key in results_by_user[user_id]:
            # the CENSORED key indicates that this user has censored
            # results for this metric. It is not aggregate-able
            if key == CENSORED:
                continue
            value = results_by_user[user_id][key]
            # A user counts only if no CENSORED flag is present, or the
            # flag is present but not set to 1.
            value_is_not_censored = CENSORED not in results_by_user[user_id]\
                or results_by_user[user_id][CENSORED] != 1

            # handle timeseries aggregation
            if isinstance(value, dict):
                if key not in aggregation:
                    # OrderedDict preserves the timeseries date ordering.
                    aggregation[key] = OrderedDict()
                    helper[key] = dict()
                for subkey in value:
                    if subkey not in aggregation[key]:
                        # Initialize accumulators for this date bucket.
                        aggregation[key][subkey] = 0
                        helper[key][subkey] = dict()
                        helper[key][subkey]['sum'] = Decimal(0.0)
                        helper[key][subkey]['square_diffs'] = Decimal(0.0)
                        helper[key][subkey]['count'] = 0
                    # None marks "no data" for this user/date; skip it so it
                    # does not drag down the average.
                    if value_is_not_censored and not value[subkey] is None:
                        helper[key][subkey]['sum'] += Decimal(value[subkey])
                        helper[key][subkey]['count'] += 1
                        if type_of_aggregate == Aggregation.STD:
                            # Accumulate squared deviation from the
                            # previously computed average for this bucket.
                            diff = Decimal(value[subkey]) - average[key][subkey]
                            helper[key][subkey]['square_diffs'] += Decimal(
                                pow(diff, 2)
                            )
                    # Re-derive the output after every contribution so the
                    # final iteration leaves the correct value in place.
                    if type_of_aggregate == Aggregation.SUM:
                        aggregation[key][subkey] = r(helper[key][subkey]['sum'])
                    elif type_of_aggregate == Aggregation.AVG:
                        aggregation[key][subkey] = r(safe_average(
                            helper[key][subkey]['sum'],
                            helper[key][subkey]['count']
                        ))
                    elif type_of_aggregate == Aggregation.STD:
                        # Population standard deviation: sqrt of the mean
                        # squared deviation.
                        aggregation[key][subkey] = r(sqrt(safe_average(
                            helper[key][subkey]['square_diffs'],
                            helper[key][subkey]['count']
                        )))
            # handle normal aggregation
            else:
                if key not in aggregation:
                    aggregation[key] = 0
                    helper[key] = dict()
                    helper[key]['sum'] = Decimal(0.0)
                    helper[key]['square_diffs'] = Decimal(0.0)
                    helper[key]['count'] = 0
                if value_is_not_censored and value is not None:
                    helper[key]['sum'] += Decimal(value)
                    helper[key]['count'] += 1
                    if type_of_aggregate == Aggregation.STD:
                        diff = Decimal(value) - average[key]
                        helper[key]['square_diffs'] += Decimal(pow(diff, 2))
                if type_of_aggregate == Aggregation.SUM:
                    aggregation[key] = r(helper[key]['sum'])
                elif type_of_aggregate == Aggregation.AVG:
                    aggregation[key] = r(safe_average(
                        helper[key]['sum'],
                        helper[key]['count']
                    ))
                elif type_of_aggregate == Aggregation.STD:
                    aggregation[key] = r(sqrt(safe_average(
                        helper[key]['square_diffs'],
                        helper[key]['count']
                    )))
    return aggregation
def safe_average(cummulative_sum, count):
    """Return the rounded mean of a running sum, or 0 for an empty count."""
    if count == 0:
        return 0
    return r(cummulative_sum / count)
def test_finish_timeseries(self):
    """Timeseries finish() with per-metric children under result_key."""
    daily_metric = NamespaceEdits(
        namespaces=[0],
        start_date='2012-12-31 00:00:00',
        end_date='2013-01-03 00:00:00',
        timeseries=TimeseriesChoices.DAY,
    )
    ar = AggregateReport(
        self.cohort,
        daily_metric,
        individual=True,
        aggregate=True,
        aggregate_sum=True,
        aggregate_average=True,
        aggregate_std_deviation=True,
        user_id=self.owner_user_id,
    )
    finished = ar.finish([
        {
            'namespace edits - fake cohort': {
                1: {'edits': {'date1': 1, 'date2': 2}},
                2: {'edits': {'date1': 0, 'date2': 1}},
                3: {'edits': {'date1': 0, 'date2': 0}},
                None: {'edits': {'date1': None, 'date2': None}},
            },
        },
        {
            'some other metric - fake cohort': {
                1: {'other_sub_metric': {'date3': r(2.3), 'date4': 0}},
                2: {'other_sub_metric': {'date3': 0, 'date4': r(3.4)}},
                3: {'other_sub_metric': {'date3': None, 'date4': None}},
                None: {'other_sub_metric': {'date3': None, 'date4': None}},
            },
        },
    ])
    aggregates = finished[ar.result_key]

    # Sums, averages, and standard deviations are computed per date.
    assert_equals(aggregates[Aggregation.SUM]['edits'], {'date1': 1, 'date2': 3})
    assert_equals(aggregates[Aggregation.SUM]['other_sub_metric'], {'date3': r(2.3), 'date4': r(3.4)})
    assert_equals(aggregates[Aggregation.AVG]['edits'], {'date1': r(0.3333), 'date2': r(1.0)})
    assert_equals(aggregates[Aggregation.AVG]['other_sub_metric'], {'date3': r(1.15), 'date4': r(1.7)})
    assert_equals(aggregates[Aggregation.STD]['edits'], {'date1': r(0.4714), 'date2': r(0.8165)})
    assert_equals(aggregates[Aggregation.STD]['other_sub_metric'], {'date3': r(1.15), 'date4': r(1.7)})
def test_finish(self):
    """Aggregate plain (non-timeseries) sub-metrics with finish()."""
    metric = metric_classes['NamespaceEdits'](
        name='NamespaceEdits',
        namespaces=[0, 1, 2],
        start_date='2013-05-01 00:00:00',
        end_date='2013-09-01 00:00:00',
    )
    options = dict(
        individualResults=True,
        aggregateResults=True,
        aggregateSum=True,
        aggregateAverage=True,
        aggregateStandardDeviation=True,
    )
    ar = AggregateReport(metric, self.cohort, options, user_id=self.owner_user_id)

    # Four users including the None placeholder; sum 5, mean 1.25.
    finished = ar.finish([
        {
            1: {'edits': 2},
            2: {'edits': 3},
            3: {'edits': 0},
            None: {'edits': 0},
        },
    ])
    assert_equals(finished[Aggregation.SUM]['edits'], 5)
    assert_equals(finished[Aggregation.AVG]['edits'], r(1.25))

    # Decimal-valued sub-metric; also checks the STD output.
    finished = ar.finish([
        {
            1: {'other_sub_metric': r(2.3)},
            2: {'other_sub_metric': r(3.4)},
            3: {'other_sub_metric': r(0.0)},
            None: {'other_sub_metric': 0},
        },
    ])
    assert_equals(finished[Aggregation.SUM]['other_sub_metric'], r(5.7))
    assert_equals(finished[Aggregation.AVG]['other_sub_metric'], r(1.425))
    assert_equals(finished[Aggregation.STD]['other_sub_metric'], r(1.4771))
def safe_average(cummulative_sum, count):
    """Divide cummulative_sum by count and round via r(); 0 if count is 0."""
    return r(cummulative_sum / count) if count != 0 else 0
def calculate(self, results_by_user, type_of_aggregate, average=None):
    # TODO: terrible redo this
    """
    Calculates one type of aggregate by just iterating over the individual
    results.
    Takes into account that results and aggregates may be split up by
    timeseries.
    Also makes sure to ignore censored records when appropriate.

    Parameters
        results_by_user     : dict of individual results, keyed by user id;
                              each value maps sub-metric name -> number, or
                              sub-metric name -> {timeseries key -> number}
        type_of_aggregate   : can be SUM, AVG, STD
        average             : None by default but required when computing STD
                              (the AVG result from a previous pass, used for
                              the squared differences)

    Returns
        The aggregate specified, computed at the timeseries level if
        applicable
    """
    # aggregation holds the final (rounded) values; helper accumulates the
    # running sum / squared-diffs / count needed to produce them.
    aggregation = dict()
    helper = dict()
    for user_id in results_by_user.keys():
        for key in results_by_user[user_id]:
            # the CENSORED key indicates that this user has censored
            # results for this metric. It is not aggregate-able
            if key == CENSORED:
                continue
            value = results_by_user[user_id][key]
            # A user counts only if no CENSORED flag is present, or the
            # flag is present but not set to 1.
            value_is_not_censored = CENSORED not in results_by_user[user_id]\
                or results_by_user[user_id][CENSORED] != 1

            # handle timeseries aggregation
            if isinstance(value, dict):
                if key not in aggregation:
                    # OrderedDict preserves the timeseries date ordering.
                    aggregation[key] = OrderedDict()
                    helper[key] = dict()
                for subkey in value:
                    if subkey not in aggregation[key]:
                        # Initialize accumulators for this date bucket.
                        aggregation[key][subkey] = 0
                        helper[key][subkey] = dict()
                        helper[key][subkey]['sum'] = Decimal(0.0)
                        helper[key][subkey]['square_diffs'] = Decimal(0.0)
                        helper[key][subkey]['count'] = 0
                    # None marks "no data" for this user/date; skip it so it
                    # does not drag down the average.
                    if value_is_not_censored and not value[subkey] is None:
                        helper[key][subkey]['sum'] += Decimal(value[subkey])
                        helper[key][subkey]['count'] += 1
                        if type_of_aggregate == Aggregation.STD:
                            # Accumulate squared deviation from the
                            # previously computed average for this bucket.
                            diff = Decimal(value[subkey]) - average[key][subkey]
                            helper[key][subkey]['square_diffs'] += Decimal(
                                pow(diff, 2)
                            )
                    # Re-derive the output after every contribution so the
                    # final iteration leaves the correct value in place.
                    if type_of_aggregate == Aggregation.SUM:
                        aggregation[key][subkey] = r(helper[key][subkey]['sum'])
                    elif type_of_aggregate == Aggregation.AVG:
                        aggregation[key][subkey] = r(safe_average(
                            helper[key][subkey]['sum'],
                            helper[key][subkey]['count']
                        ))
                    elif type_of_aggregate == Aggregation.STD:
                        # Population standard deviation: sqrt of the mean
                        # squared deviation.
                        aggregation[key][subkey] = r(sqrt(safe_average(
                            helper[key][subkey]['square_diffs'],
                            helper[key][subkey]['count']
                        )))
            # handle normal aggregation
            else:
                if key not in aggregation:
                    aggregation[key] = 0
                    helper[key] = dict()
                    helper[key]['sum'] = Decimal(0.0)
                    helper[key]['square_diffs'] = Decimal(0.0)
                    helper[key]['count'] = 0
                if value_is_not_censored and value is not None:
                    helper[key]['sum'] += Decimal(value)
                    helper[key]['count'] += 1
                    if type_of_aggregate == Aggregation.STD:
                        diff = Decimal(value) - average[key]
                        helper[key]['square_diffs'] += Decimal(pow(diff, 2))
                if type_of_aggregate == Aggregation.SUM:
                    aggregation[key] = r(helper[key]['sum'])
                elif type_of_aggregate == Aggregation.AVG:
                    aggregation[key] = r(safe_average(
                        helper[key]['sum'],
                        helper[key]['count']
                    ))
                elif type_of_aggregate == Aggregation.STD:
                    aggregation[key] = r(sqrt(safe_average(
                        helper[key]['square_diffs'],
                        helper[key]['count']
                    )))
    return aggregation