def setUp(self):
    """Create the registry fixture used by the tests.

    Registers two CSV-backed providers, read from the ``data`` directory
    that sits next to this file, and then calls ``show()`` on the registry:

    * ``people`` -- primary identifier ``person``, foreign identifier
      ``geography``, dimension ``name`` and measure ``age``.
    * ``transactions`` -- primary identifier ``transaction``, foreign
      identifiers ``person:buyer`` and ``person:seller``, and measure
      ``value``.
    """
    self.registry = MeasureRegistry()
    fixture_dir = os.path.join(os.path.dirname(__file__), 'data')

    # Provider describing people.
    people_csv = os.path.join(fixture_dir, 'people.csv')
    people_provider = PandasMeasureProvider(name='people', data=people_csv)
    people_provider = people_provider.provides_identifier(
        'person', expr='id', role='primary')
    people_provider = people_provider.provides_identifier(
        'geography', expr='id_country', role='foreign')
    people_provider = people_provider.provides_dimension('name')
    people_provider = people_provider.provides_measure('age')
    self.registry.register(people_provider)

    # Provider describing transactions between a buyer and a seller.
    transactions_csv = os.path.join(fixture_dir, 'transactions.csv')
    transactions_provider = PandasMeasureProvider(
        name='transactions', data=transactions_csv)
    transactions_provider = transactions_provider.provides_identifier(
        'transaction', expr='id', role='primary')
    transactions_provider = transactions_provider.provides_identifier(
        'person:buyer', expr='id_buyer', role='foreign')
    transactions_provider = transactions_provider.provides_identifier(
        'person:seller', expr='id_seller', role='foreign')
    transactions_provider = transactions_provider.provides_measure('value')
    self.registry.register(transactions_provider)

    self.registry.show()
def setUp(self):
    """Create the ``MetaMeasureProvider`` fixture used by the tests.

    Four CSV-backed providers are registered, in this order, all reading
    from the ``data`` directory that sits next to this file:

    * ``people`` -- primary identifier ``person``, dimension ``name``,
      measure ``age``, partition ``ds``.
    * ``people2`` -- built from the same ``people.csv`` file; unique
      identifier ``person``, foreign identifier ``geography``,
      partition ``ds``.
    * ``geographies`` -- primary identifier ``geography``, dimension
      ``name``, measure ``population``, partition ``ds``.
    * ``transactions`` -- primary identifier ``transaction``, foreign
      identifiers ``person:buyer`` and ``person:seller``, measure
      ``value``, and partition ``ds`` declared with
      ``requires_constraint=True``.
    """
    self.registry = MetaMeasureProvider()
    fixture_dir = os.path.join(os.path.dirname(__file__), 'data')

    # people
    people_provider = PandasMeasureProvider(
        name='people', data=os.path.join(fixture_dir, 'people.csv'))
    people_provider = people_provider.add_identifier(
        'person', expr='id', role='primary')
    people_provider = people_provider.add_dimension('name')
    people_provider = people_provider.add_measure('age')
    people_provider = people_provider.add_partition('ds')
    self.registry.register(people_provider)

    # people2 (second provider over the same people.csv file)
    people2_provider = PandasMeasureProvider(
        name='people2', data=os.path.join(fixture_dir, 'people.csv'))
    people2_provider = people2_provider.add_identifier(
        'person', expr='id', role='unique')
    people2_provider = people2_provider.add_identifier(
        'geography', expr='id_geography', role='foreign')
    people2_provider = people2_provider.add_partition('ds')
    self.registry.register(people2_provider)

    # geographies
    geographies_provider = PandasMeasureProvider(
        name='geographies',
        data=os.path.join(fixture_dir, 'geographies.csv'))
    geographies_provider = geographies_provider.add_identifier(
        'geography', expr='id_geography', role='primary')
    geographies_provider = geographies_provider.add_dimension('name')
    geographies_provider = geographies_provider.add_measure('population')
    geographies_provider = geographies_provider.add_partition('ds')
    self.registry.register(geographies_provider)

    # transactions
    transactions_provider = PandasMeasureProvider(
        name='transactions',
        data=os.path.join(fixture_dir, 'transactions.csv'))
    transactions_provider = transactions_provider.add_identifier(
        'transaction', expr='id', role='primary')
    transactions_provider = transactions_provider.add_identifier(
        'person:buyer', expr='id_buyer', role='foreign')
    transactions_provider = transactions_provider.add_identifier(
        'person:seller', expr='id_seller', role='foreign')
    transactions_provider = transactions_provider.add_measure('value')
    transactions_provider = transactions_provider.add_partition(
        'ds', requires_constraint=True)
    self.registry.register(transactions_provider)
def evaluate(self, unit_type, measures=None, segment_by=None, where=None,
             joins=None, stats=True, covariates=False, context=None,
             stats_registry=None, **opts):
    """
    This method evaluates the requested `measures` in this MeasureProvider
    segmented by the dimensions in `segment_by` after joining in the joins
    in `joins` and subject to the constraints in `where`; treating
    `unit_type` objects as indivisible.

    Args:
        unit_type (str, _StatisticalUnitIdentifier): The unit to treat as
            indivisible in this analysis.
        measures (list<str, _Measure>): The measures to be calculated.
            NOTE(review): when incompatible joins are present this
            collection is assigned into by key (a private 'count' measure
            is added), so it must support item assignment and cannot be
            None in that case -- confirm the expected type with callers.
        segment_by (list<str, _Feature>): The dimensions by which to
            segment the measure computations.
        where (dict, list, tuple, BaseConstraint): The constraints within
            which measures should be computed.
        joins (list): The joins to apply. Joins flagged as `compatible`
            are handed to `._evaluate`; all others are merged into the
            result of `._evaluate` afterwards. `None` (the default) is
            treated as "no joins".
        stats (bool): Whether to keep track of the distribution of the
            measures, rather than just their sum.
        covariates (bool, list<tuple>): Whether to compute all covariates
            (if bool) or else a list of tuples of measures within which all
            pairs of covariates should be computed.
        context (dict): The context in which to perform the evaluation.
        stats_registry: Passed through unchanged to `._evaluate`, to the
            post-join processing helpers and to the returned
            `EvaluatedMeasures`. NOTE(review): presumably the registry of
            statistics to compute -- confirm.
        opts (dict): Additional arguments to be passed onto `._evaluate`
            implementations.

    Returns:
        EvaluatedMeasures: A wrapper around the dataframe of the results of
            the computation.

    Raises:
        RuntimeError: If, after merging in incompatible joins, the
            resulting dataframe lacks any of the expected columns.
    """
    from mensor.backends.pandas import PandasMeasureProvider  # We need this for some pandas transformations

    # `joins` defaults to None; treat that as "no joins" so that the
    # declared default does not fail when iterated below.
    if joins is None:
        joins = []

    # Split joins into compatible and incompatible joins; 'joins_pre' and
    # 'joins_post' (so-called because compatible joins occur before any
    # computation in this method).
    joins_pre = [j for j in joins if j.compatible]
    joins_post = [j for j in joins if not j.compatible]

    # If there are post-joins, we will need to add the 'count' measure
    # (assuming it has not already been requested), so that we can weight
    # post-joins appropriately.
    if joins_post and 'count' not in measures:
        count_measure = self.measures['count'].as_private
        measures[count_measure] = count_measure

    # If there are post-joins, we need to ensure that the pre- operations
    # that happen within the `._evaluate` method do not suppress prematurely
    # private fields that are necessary to later join in the post-joins.
    # We therefore modify the privacy of fields for the `._evaluate` stage
    # depending on whether they are needed later. We also suppress any
    # external fields not provided by pre-joins, so that `._evaluate`
    # instances need not concern themselves with them.
    # Moreover, if there are post-joins and where constraints, some of the
    # constraints may need to be applied after post-joins. As such, we split
    # the where constraints into where_pre and where_post.
    (measures_pre, segment_by_pre, where_pre,
     measures_post, segment_by_post, where_post) = self._compat_fields_split(
        measures, segment_by, where, joins_post=joins_post)

    # Allow MeasureProvider instance to evaluate all pre- computations.
    # Statistics are deferred (stats=False) whenever post-joins are pending,
    # and computed during finalisation below instead.
    result = self._evaluate(
        unit_type, measures_pre,
        segment_by=segment_by_pre,
        where=where_pre,
        joins=joins_pre,
        stats_registry=stats_registry,
        stats=stats and not joins_post,
        covariates=covariates,
        context=context,
        **opts
    )

    if joins_post:
        # Join in precomputed incompatible joins
        # TODO: Clean-up how joined measures are detected (remembering measure fields have suffixes)
        joined_measure_fields = set()
        for join in joins_post:
            joined_measure_fields.update(join.object.measure_fields)
            result = result.merge(
                join.object.raw,
                left_on=join.left_on,
                right_on=join.right_on,
                how=join.how
            )

        # Check columns in resulting dataframe
        expected_columns = _Measure.get_all_fields(
            measures_post, unit_type=unit_type, rebase_agg=True,
            stats_registry=stats_registry, stats=False
        ) + [f.via_name for f in segment_by_post]
        excess_columns = set(result.columns).difference(expected_columns)
        missing_columns = set(expected_columns).difference(result.columns)

        if excess_columns:
            # remove any unnecessary columns (such as now used join keys)
            result = result.drop(excess_columns, axis=1)
        if missing_columns:
            raise RuntimeError(
                'Data is missing columns: {}.'.format(missing_columns))

        # All new joined in measures need to be multiplied by the count
        # series of this dataframe, so that they are properly weighted.
        if joined_measure_fields:
            result = result.apply(
                lambda col: result['count|raw'] * col
                if col.name in joined_measure_fields else col,
                axis=0
            )

        result = PandasMeasureProvider._finalise_dataframe(
            df=result, unit_type=unit_type, measures=measures_post,
            segment_by=segment_by_post, where=where_post, stats=stats,
            stats_registry=stats_registry, covariates=covariates,
            rebase_agg=False, reagg=False
        )

    return EvaluatedMeasures.for_measures(result, stats_registry=stats_registry)