def testDefaultsExist(self):
    """Verify that the default semantics objects expose their attributes.

    Builds `DataFrameNameMapping`, `GroupSemantics` and `PeriodSemantics`
    without arguments and asserts that every default column name is a `str`
    and every default group / period value is an `int`.
    """
    name_mapping = semantics.DataFrameNameMapping()
    group_defaults = semantics.GroupSemantics()
    period_defaults = semantics.PeriodSemantics()

    # Default column / index names must all be strings.
    for attr in ('geo', 'group', 'period', 'response', 'cost',
                 'incr_response', 'incr_cost'):
        self.assertIsInstance(getattr(name_mapping, attr), str)

    # Default group and period identifiers must all be integers.
    for attr in ('control', 'treatment', 'unassigned'):
        self.assertIsInstance(getattr(group_defaults, attr), int)
    for attr in ('pre', 'test', 'cooldown', 'unassigned'):
        self.assertIsInstance(getattr(period_defaults, attr), int)
def fit(self, data_frame, target, **kwargs):
    """Fit the TBR model for one target column of the supplied data frame.

    The optional kwargs describe how the data frame should be interpreted.

    Args:
      data_frame: a pandas.DataFrame. Should contain the columns and indices
        corresponding to the **kwargs information below. Only one of response
        or cost need be present, corresponding to the supplied `target`. Must
        be indexed by date.
      target: `str`. The name of the column to be analysed.
      **kwargs: optional column/index names for the data and related
        semantics:
        key_geo='geo' - geo data frame index name.
        key_period='period' - experimental period column name.
        key_group='group' - group assignment column name.
        key_cost='cost' - cost column name.
        key_response='response' - response column name.
        key_date='date' - date index name.
        key_incr_cost='_incr_cost' - incremental cost column name.
        key_incr_response='_incr_response' - incremental response column
          name.
        group_control=1 - value representing the control group in the data.
        group_treatment=2 - value representing the treatment group in the
          data.
        period_pre=0 - value representing the pre-test period in the data.
        period_test=1 - value representing the test period in the data.
        period_cool=2 - value representing the cooldown period in the data.
          NOTE(review): the PeriodSemantics attribute is named `cooldown`;
          confirm whether this kwarg should be spelled `period_cooldown`.
    """
    # Remember which column is being analysed.
    self.target = target

    # Column / index names supplied by the user ('key_' kwargs).
    self.df_names = semantics.DataFrameNameMapping(
        **utils.kwarg_subdict('key_', **kwargs))

    # Control / treatment semantics supplied by the user ('group_' kwargs).
    self.groups = semantics.GroupSemantics(
        **utils.kwarg_subdict('group_', **kwargs))

    # Experimental period semantics supplied by the user ('period_' kwargs).
    self.periods = semantics.PeriodSemantics(
        **utils.kwarg_subdict('period_', **kwargs))

    # Build the analysis data first, then fit the pre-period model.
    self._construct_analysis_data(data_frame)
    self._fit_pre_period_model()
def setUp(self):
    """Prepare the fixtures shared by every test method in the class.

    Loads the salesandcost example data and records the column names and
    default group / period semantics that go with it.
    """
    super(TBRiROASTest, self).setUp()

    # Load the salesandcost dataset from its csv directory.
    self.data = salesandcost.example_data_formatted(
        os.path.join("", 'matched_markets/csv/'))

    # Index names used by the salesandcost example.
    self.key_geo = 'geo'
    self.key_date = 'date'

    # Column names used by the salesandcost example.
    self.key_group = 'geo.group'
    self.key_period = 'period'
    self.key_response = 'sales'
    self.key_cost = 'cost'

    # Default semantics for groups and periods.
    self.groups = semantics.GroupSemantics()
    self.periods = semantics.PeriodSemantics()
def fit(self, data_frame, **kwargs):
    """Fit the TBRiROAS model to the supplied data frame.

    The optional kwargs describe how the data frame should be interpreted.
    The same data frame and kwargs are passed to two underlying fits: one for
    the response column and one for the cost column.

    Args:
      data_frame: a pandas.DataFrame. Should contain the columns and indices
        corresponding to the **kwargs information below. Must be indexed by
        date.
      **kwargs: optional column/index names for the data and related
        semantics:
        key_geo='geo' - geo data frame index name.
        key_period='period' - experimental period column name.
        key_group='group' - group assignment column name.
        key_cost='cost' - cost column name.
        key_response='response' - response column name.
        key_date='date' - date index name.
        key_incr_cost='_incr_cost' - incremental cost column name.
        key_incr_response='_incr_response' - incremental response column
          name.
        group_control=1 - value representing the control group in the data.
        group_treat=2 - value representing the treatment group in the data.
        period_pre=0 - value representing the pre-test period in the data.
        period_test=1 - value representing the test period in the data.
        period_cool=2 - value representing the cooldown period in the data.
        NOTE(review): the semantics attributes are named `treatment` and
        `cooldown`; confirm whether `group_treat` / `period_cool` should be
        spelled `group_treatment` / `period_cooldown`.
    """
    # Column / index names supplied by the user ('key_' kwargs).
    self.df_names = semantics.DataFrameNameMapping(
        **utils.kwarg_subdict('key_', **kwargs))

    # Control / treatment semantics supplied by the user ('group_' kwargs).
    self.groups = semantics.GroupSemantics(
        **utils.kwarg_subdict('group_', **kwargs))

    # Experimental period semantics supplied by the user ('period_' kwargs).
    self.periods = semantics.PeriodSemantics(
        **utils.kwarg_subdict('period_', **kwargs))

    # Fit separate TBR models for response and cost, forwarding all kwargs.
    response_column = self.df_names.response
    self.tbr_response.fit(data_frame, response_column, **kwargs)
    cost_column = self.df_names.cost
    self.tbr_cost.fit(data_frame, cost_column, **kwargs)
def __init__(
        self,
        n_control,
        n_treat,
        time_pre,
        time_test,  # no cooldown as yet
        hetresp,
        hetcost,
        beta,
        hetsked,
        sig_resp,
        sig_cost,
        noise_treat_only=False,
        seed=None,
        **kwargs):
    """Creates a data simulator.

    Args:
      n_control: int. The number of control geos.
      n_treat: int. The number of treatment geos.
      time_pre: int. The number of pre-test period ticks.
      time_test: int. The number of test period ticks.
      hetresp: float. The degree of mean response variable heterogeneity.
      hetcost: float. The degree of mean cost variable heterogeneity.
      beta: float. The iROAS coefficient to be used.
      hetsked: float. The degree of heteroskedasticity in cost and response.
      sig_resp: float. The noise level in the response variable.
      sig_cost: float. The noise level in the cost variable.
      noise_treat_only: bool. Whether to add noise only in the treatment
        period.
      seed: int. Sets the seed of the random number generator. If None, a
        seed is drawn from NumPy's global random state.
      **kwargs: optional semantics for the produced data frame ('key_',
        'group_' and 'period_' prefixed entries are used).
    """
    # Constants.
    self.n_control = n_control
    self.n_treat = n_treat
    self.time_pre = time_pre
    self.time_test = time_test
    self.time_total = time_pre + time_test

    # Model parameters.
    self.hetresp = hetresp
    self.hetcost = hetcost
    self.beta = beta
    self.hetsked = hetsked
    self.sig_resp = sig_resp
    self.sig_cost = sig_cost

    # Derived facts: total geo count and rows per column (geos x ticks).
    self.n_total = self.n_treat + self.n_control
    self.col_len = self.n_total * self.time_total

    # Extract any column / index name information supplied by the user.
    user_df_names = utils.kwarg_subdict('key_', **kwargs)
    self._df_names = semantics.DataFrameNameMapping(**user_df_names)

    # Options.
    self.noise_treat_only = noise_treat_only

    # Extract any semantics for control / treatment supplied by user.
    user_group_semantics = utils.kwarg_subdict('group_', **kwargs)
    self._groups = semantics.GroupSemantics(**user_group_semantics)

    # Extract any semantics for experimental period supplied by user.
    user_period_semantics = utils.kwarg_subdict('period_', **kwargs)
    self._periods = semantics.PeriodSemantics(**user_period_semantics)

    if seed is None:
        # Draw the seed from NumPy's global generator. The legacy randint
        # defaults to the C long dtype, which is 32 bits wide on some
        # platforms (e.g. Windows) and then rejects high=2**32. Requesting
        # int64 explicitly keeps the bound valid everywhere and draws the
        # same values where the default is already 64-bit.
        seed = int(np.random.randint(0, 2**32, dtype=np.int64))
    self._rng = np.random.RandomState(seed=seed)
def fit(self, data_frame, target=None, **kwargs):
    """Runs the TBR diagnostics suite.

    This method executes the following diagnostics: (1) detect and remove the
    disrupted geos; (2) detect and remove the outlier time points; (3)
    correlation test; and (4) the structural stability (A/A) test removing
    part of the pre-test period.

    The results of these diagnostics are stored in the `_diagnostics`
    attribute (the statements below write the keys 'noisy_geos',
    'outlier_dates' and 'corr_test'). The resulting modified data frame is
    stored in the `_data` attribute and accessible via the get_data() method.

    Note. This method makes a copy of the original data_frame, and it doesn't
    modify the original.

    See optional kwargs for interpretation of the data frame.

    Args:
      data_frame: (pandas.DataFrame) Should contain the columns and indices
        corresponding to the **kwargs information below. Only the response
        column corresponding to the supplied `target` needs to be present.
        Must be indexed by date.
      target: (str) name of the target metric (data frame column). If not
        specified, the column specified as key_response will be assumed.
      **kwargs: optional column/index names for the data and related
        semantics:
        key_geo (string) column name for geo (default: 'geo').
        key_period (string) column name for period (default: 'period').
        key_group (string) column name for group (default: 'group').
        key_response (string) response column name (default: 'response').
        key_date (string) date index name (default: 'date').
        group_control (int) control group id (default: 1).
        group_treat (int) treatment group id (default: 2).
        period_pre (int) pre-test period id (default: 0).
        period_test (int) test period id (default: 1).
        period_cool (int) cooldown period id (default: 2).
        NOTE(review): the semantics attributes are named `treatment` and
        `cooldown`; confirm whether `group_treat` / `period_cool` should be
        spelled `group_treatment` / `period_cooldown`.
    """
    # Work on a copy so the caller's data frame is left untouched.
    self._data = data_frame.copy()

    # Column / index names supplied by the user ('key_' kwargs).
    user_df_names = utils.kwarg_subdict('key_', **kwargs)
    self._df_names = semantics.DataFrameNameMapping(**user_df_names)

    # Control / treatment semantics supplied by the user ('group_' kwargs).
    user_group_semantics = utils.kwarg_subdict('group_', **kwargs)
    self._groups = semantics.GroupSemantics(**user_group_semantics)

    # Experimental period semantics supplied by the user ('period_' kwargs).
    user_period_semantics = utils.kwarg_subdict('period_', **kwargs)
    self._periods = semantics.PeriodSemantics(**user_period_semantics)

    # Fall back to the configured response column when no target is given.
    if target is None:
        target = self._df_names.response
    self._target = target

    # (1) Record the noisy geos and drop their rows from the working copy.
    remove_geos = self._detect_noisy_geos(iqr_coef=1.5, max_threshold=0.5)
    self._diagnostics['noisy_geos'] = remove_geos
    if remove_geos:
        exclude = self._data[self._df_names.geo].isin(remove_geos)
        self._data = self._data[~exclude]
    # NOTE(review): the flattened source does not show whether this call is
    # nested under the `if` above; confirm against the original layout.
    self._create_analysis_data()

    # (2) Record the outlier dates and drop their rows from the working copy.
    remove_dates = self._detect_outliers(max_prob=0.1)
    self._diagnostics['outlier_dates'] = remove_dates
    if remove_dates:
        exclude_dates = self._data[self._df_names.date].isin(remove_dates)
        self._data = self._data[~exclude_dates]
        # NOTE(review): nesting of this call is likewise unconfirmed.
        self._create_analysis_data()

    # (3) Correlation test; its result is stored under 'corr_test'.
    self._diagnostics['corr_test'] = self._correlation_test(
        min_cor=0.5, prefer_cor=0.8, credible_level=0.95)