def testColumnNames(self):
    """Exercises the column-name validation of GeoEligibility.

    Four scenarios are covered: a required column spelled with the wrong
    case, a required column that is absent, an additional column (accepted),
    and a duplicated column name (rejected).
    """
    # Required names are matched case-sensitively: 'Control' != 'control'.
    wrong_case = self.df.copy()
    wrong_case.columns = ['Control'] + list(wrong_case.columns)[1:]
    with self.assertRaisesRegex(ValueError, r'Missing column\(s\): control'):
        geoeligibility.GeoEligibility(wrong_case)

    # Dropping a required column must be reported.
    without_exclude = self.df.drop(columns=['exclude'])
    with self.assertRaisesRegex(ValueError, r'Missing column\(s\): exclude'):
        geoeligibility.GeoEligibility(without_exclude)

    # An extra, unrelated column is tolerated (no exception expected).
    geoeligibility.GeoEligibility(self.df.assign(newcolumn=1))

    # Two columns sharing one name are rejected.
    duplicated = self.df.assign(newcolumn=1)
    duplicated.columns = ['control', 'treatment', 'exclude', 'control']
    with self.assertRaisesRegex(ValueError, r'Duplicate column\(s\): control'):
        geoeligibility.GeoEligibility(duplicated)
def testBadValues(self):
    """Exercises the validation of entries in the value columns."""
    frame = self.df.copy()

    # Entries other than 0 and 1 (here: -1) are not allowed.
    frame.loc['G1'] = [1, 0, -1]
    bad_value_pattern = ('GeoEligibility objects must have only values '
                         '0, 1 in columns control, treatment, exclude')
    with self.assertRaisesRegex(ValueError, bad_value_pattern):
        geoeligibility.GeoEligibility(frame)

    # A row consisting of three zeros is an illegal value as well.
    frame.loc['G1'] = [0, 0, 0]
    with self.assertRaisesRegex(ValueError,
                                r'Three zeros found for geo\(s\) G1'):
        geoeligibility.GeoEligibility(frame)
def setUp(self):
    """Creates a valid geo eligibility frame (self.df) and object (self.obj)."""
    super(GeoEligibilityTest, self).setUp()

    geo_ids = ['G%d' % i for i in range(7)]
    frame = pd.DataFrame(
        {'geo': geo_ids, 'control': 0, 'treatment': 0, 'exclude': 0},
        columns=['geo', 'control', 'treatment', 'exclude']).set_index('geo')

    # The 7 valid (control, treatment, exclude) rows; (0, 0, 0) is invalid.
    valid_rows = {
        'G0': [1, 0, 0],  # Control only.
        'G1': [0, 1, 0],  # Treatment only.
        'G2': [0, 0, 1],  # Excluded only.
        'G3': [1, 1, 0],  # Control or Treatment.
        'G4': [0, 1, 1],  # Treatment or Excluded.
        'G5': [1, 0, 1],  # Control or Excluded.
        'G6': [1, 1, 1],  # Control, Treatment, or Excluded.
    }
    for geo, row in valid_rows.items():
        frame.loc[geo] = row

    self.df = frame
    # Constructing the object doubles as a check that the frame is valid.
    self.obj = geoeligibility.GeoEligibility(frame)
def testExhaustiveSearchGeoEligibility(self):
    """Exhaustive search respects the geo eligibility constraints."""
    # Without constraints, the optimal design would be treatment = {'1'} and
    # control = {'2'}, see testExhaustiveSearchFindsOptimalDesign.
    eligibility = self.default_geo_eligibility_data
    eligibility.loc['1'] = [1, 1, 0]  # Cannot exclude geo 1.
    eligibility.loc['2'] = [1, 0, 0]  # Cannot exclude geo 2.
    eligibility.loc['3'] = [0, 1, 0]  # Cannot exclude geo 3.

    # Geos 1 and 2 are correlated and geo 3 is approx. independent of both,
    # so the optimal design has treatment = {'1', '3'} and control = {'2'}:
    # geos '1' and '2' should be in different groups to achieve high
    # correlation, and geo '2' is fixed to control.
    data = TBRMMData(self.df, 'response',
                     geoeligibility.GeoEligibility(eligibility))
    found = TBRMatchedMarkets(data, self.par).exhaustive_search()

    # Reference diagnostics built from the expected groups.
    diag = TBRMMDiagnostics(data.aggregate_time_series({0, 2}), self.par)
    diag.x = data.aggregate_time_series({1})
    expected_corr = diag.corr
    expected_impact = diag.required_impact

    best = found[0]
    self.assertSetEqual(best.treatment_geos, {'1', '3'})
    self.assertSetEqual(best.control_geos, {'2'})
    self.assertTupleEqual(
        best.score.score,
        (1, 1, 1, 1, round(expected_corr, 2), 1/expected_impact))
def testGeoColumn(self):
    """A 'geo' index or column (case sensitive) must be present."""
    missing_geo = r'There is no column or index \'geo\''

    # Wrong case as the index name.
    frame = self.df.copy()
    frame.index.name = 'Geo'
    with self.assertRaisesRegex(ValueError, missing_geo):
        geoeligibility.GeoEligibility(frame)

    # Wrong case as a regular column.
    frame = frame.reset_index()
    with self.assertRaisesRegex(ValueError, missing_geo):
        geoeligibility.GeoEligibility(frame)

    # 'geo' supplied as a column instead of the index: no error expected.
    geoeligibility.GeoEligibility(self.df.reset_index())
def testOneGeoExcluded_SizeUnbounded(self):
    """One geo excluded (x_fixed), no group size restrictions except >= 1."""
    eligibility = self.df_geo_elig
    eligibility.loc['0'] = [0, 0, 1]
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    matcher = TBRMatchedMarkets(data, self.par)

    # With n freely assignable geos (those in 'ctx') the count is
    # 3^n - 2^(n + 1) + 1, i.e. 3^4 - 2^5 + 1 == 50.
    self.assertEqual(matcher.count_max_designs(), 50)
def setUp(self):
    """Creates self.mm (default data) and self.mmfix (geo '3' in Treatment)."""
    super().setUp()
    self.mm = TBRMatchedMarkets(self.data, self.par)

    # Work on a copy so the eligibility data behind self.data stays intact.
    eligibility = self.data.geo_eligibility.data.copy()
    # Assign geo '3' into Treatment group. In the order of size (in terms of
    # required budget), geo '3' will be index 1.
    eligibility.loc['3'] = [0, 1, 0]
    fixed_data = TBRMMData(self.df, self.response,
                           geoeligibility.GeoEligibility(eligibility))
    self.mmfix = TBRMatchedMarkets(fixed_data, self.par)
def testOneGeoTX_SizeUnbounded(self):
    """One geo in group 'tx', no group size restrictions except >= 1."""
    eligibility = self.df_geo_elig
    eligibility.loc['0'] = [0, 1, 1]
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    matcher = TBRMatchedMarkets(data, self.par)

    # n freely assignable geos and one geo in group 'tx':
    # 2 * (3^n - 2^(n+1) + 1) + 2^n - 1 == 2 * 50 + 15 == 115 for n == 4.
    self.assertEqual(matcher.count_max_designs(), 115)
def testOneTreatmentFixedDefault_SizeUnbounded(self):
    """One treatment geo fixed and no group size restrictions except >= 1."""
    df_geo_elig = self.df_geo_elig
    # Row order is (control, treatment, exclude): geo '0' is eligible for
    # Treatment only, i.e., it is fixed to the Treatment group.
    df_geo_elig.loc['0'] = [0, 1, 0]
    geo_elig = geoeligibility.GeoEligibility(df_geo_elig)
    data = TBRMMData(self.df, self.response, geo_elig)
    mm = TBRMatchedMarkets(data, self.par)

    # n = number of freely assignable geos (geos in 'ctx'). One fixed.
    # Every assignment of the n free geos is valid except those with no
    # control geo: 3^n - 2^n == 3^4 - 2^4 == 65.
    self.assertEqual(mm.count_max_designs(), 65)
def testDuplicateGeos(self):
    """Duplicate entries in the geo index must be reported."""
    frame = self.df.copy()
    # Overwrite the second geo id so that 'G0' appears twice.
    geo_ids = ['G0' if position == 1 else geo
               for position, geo in enumerate(frame.index.tolist())]
    frame.index = geo_ids
    frame.index.name = 'geo'
    with self.assertRaisesRegex(ValueError,
                                r'\'geo\' has duplicate values: G0'):
        geoeligibility.GeoEligibility(frame)
def testOneGeoCT_SizeUnbounded(self):
    """One geo in group 'ct', no group size restrictions except >= 1."""
    eligibility = self.df_geo_elig
    eligibility.loc['0'] = [1, 1, 0]
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    matcher = TBRMatchedMarkets(data, self.par)

    # n = number of freely assignable geos (geos in 'ctx'). One in 'ct'.
    # 2 * (3^n - 2^(n+1) + 2^n) == 2 * (3^4 - 2^5 + 2^4) = 2 * 65 = 130.
    self.assertEqual(matcher.count_max_designs(), 130)
def testGeosMustInclude(self):
    """Geos that cannot be excluded show up in geos_must_include."""
    eligibility = self.default_geo_eligibility_data
    # None of these rows allows exclusion (last entry is 0).
    non_excludable = (
        ('1', [1, 1, 0]),  # Cannot exclude geo 1.
        ('2', [1, 0, 0]),  # Cannot exclude geo 2.
        ('3', [0, 1, 0]),  # Cannot exclude geo 3.
    )
    for geo, row in non_excludable:
        eligibility.loc[geo] = row

    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    design_par = TBRMMDesignParameters(n_test=14, iroas=3.0)
    matcher = TBRMatchedMarkets(data, design_par)
    self.assertCountEqual(matcher.geos_must_include, {'1', '2', '3'})
def testOneControlGeoFixed_SizeUnbounded(self):
    """One control geo fixed and no group size restrictions except >= 1."""
    # Default eligibility except for one fixed control geo (c_fixed).
    eligibility = self.df_geo_elig
    eligibility.loc['0'] = [1, 0, 0]
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    matcher = TBRMatchedMarkets(data, self.par)

    # n = number of freely assignable geos (geos in 'ctx'). One fixed.
    # 3^n - 2^n == 3^4 - 2^4 == 65.
    self.assertEqual(matcher.count_max_designs(), 65)
def setUp(self):
    """Creates the fixtures data4/mm4 and datafix/mmfix.

    Both are derived from independent copies of the eligibility data behind
    self.data, so the two fixtures do not influence each other.
    """
    super().setUp()

    # Object mm4: 4 geos.
    # Geo '1' is excluded from the set, leaving 4 geos for testing.
    # Note: the remaining 4 geos will be reindexed as 0, 1, 2, 3.
    elig_four = self.data.geo_eligibility.data.copy()
    elig_four.loc['1'] = [0, 0, 1]
    self.data4 = TBRMMData(self.df, self.response,
                           geoeligibility.GeoEligibility(elig_four))
    self.mm4 = TBRMatchedMarkets(self.data4, self.par)

    # Object mmfix: 3 geos + 1 fixed to control + 1 in group 'ct'.
    elig_fixed = self.data.geo_eligibility.data.copy()
    elig_fixed.loc['1'] = [1, 0, 0]  # Geo index 3, assigned to control.
    elig_fixed.loc['2'] = [1, 1, 0]  # Geo index 2, Control or Treatment only.
    self.datafix = TBRMMData(self.df, self.response,
                             geoeligibility.GeoEligibility(elig_fixed))
    self.mmfix = TBRMatchedMarkets(self.datafix, self.par)
def testExcludedGeo(self):
    """Completely excluded geos (x_fixed) are absent from geo_assignments.

    When a geo is excluded, the geo indices are renumbered.
    """
    eligibility = self.default_geo_eligibility_data
    eligibility.loc['2'] = [0, 0, 1]  # Group 'x_fixed'.
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    assignments = TBRMatchedMarkets(data, self.par).geo_assignments

    self.assertFalse(assignments.x_fixed)
    self.assertCountEqual(assignments.all, {0, 1, 2, 3})
def testNonEligibilityOverridesGeoOverBudget(self):
    """An over-budget geo is kept if it is not eligible for exclusion."""
    eligibility = self.default_geo_eligibility_data
    eligibility.loc['4'] = [0, 1, 0]  # Cannot exclude geo 4.
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))

    iroas = 2.5
    # This budget cap would exclude '3' and '4'.
    max_budget = self.impact['2'] / iroas
    design_par = TBRMMDesignParameters(
        n_test=14, iroas=iroas, budget_range=(0.1, max_budget))
    matcher = TBRMatchedMarkets(data, design_par)

    self.assertCountEqual(matcher.geos_within_constraints,
                          {'0', '1', '2', '4'})
def testNonEligibilityOverridesTooLargeGeo(self):
    """A geo that is too large is kept if it is not eligible for exclusion."""
    # The upper share bound is that of geo '3': geo '4' is 'too large'.
    upper_share = self.data.geo_share['3']
    treatment_shares = (upper_share / 2.0, upper_share)

    eligibility = self.default_geo_eligibility_data
    eligibility.loc['4'] = [0, 1, 0]  # Cannot exclude geo 4.
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    design_par = TBRMMDesignParameters(
        n_test=14, iroas=3.0, treatment_share_range=treatment_shares)
    matcher = TBRMatchedMarkets(data, design_par)

    self.assertCountEqual(matcher.geos_within_constraints, self.geos)
def testRangeIsSpecifiedButLowerThanLowerBound(self):
    """The user-specified lower bound of the range is adjusted if too low."""
    par = TBRMMDesignParameters(n_test=14, iroas=2.0,
                                treatment_geos_range=(1, 4))
    # Modify a copy so that the eligibility data held by self.data is not
    # altered in place by the row assignments below.
    df_geo_elig = self.data.geo_eligibility.data.copy()
    df_geo_elig.loc['1'] = [0, 1, 0]  # Geo '1' is always in Treatment.
    df_geo_elig.loc['2'] = [0, 1, 0]  # Geo '2' is always in Treatment.
    geo_elig = geoeligibility.GeoEligibility(df_geo_elig)
    data = TBRMMData(self.df, self.response, geo_elig)
    mm = TBRMatchedMarkets(data, par)

    # There are 5 geos, 2 always assigned to Treatment, hence the lower bound
    # must be 2. The upper bound is unchanged (4) as it is the maximum
    # possible.
    self.assertEqual(mm.treatment_group_size_range(), range(2, 5))
def testThereAreFixedTreatmentGeos(self):
    """The minimum number of geos must be at least len(t_fixed).

    obj.geo_assignments.t_fixed is the set of treatment geos that are always
    included in Treatment group. Hence the minimum must be adjusted
    accordingly.
    """
    par = TBRMMDesignParameters(n_test=14, iroas=2.0)
    # Modify a copy so that the eligibility data held by self.data is not
    # altered in place by the row assignments below.
    df_geo_elig = self.data.geo_eligibility.data.copy()
    df_geo_elig.loc['1'] = [0, 1, 0]  # Geo '1' is always in Treatment.
    df_geo_elig.loc['2'] = [0, 1, 0]  # Geo '2' is always in Treatment.
    geo_elig = geoeligibility.GeoEligibility(df_geo_elig)
    data = TBRMMData(self.df, self.response, geo_elig)
    mm = TBRMatchedMarkets(data, par)

    # There are 5 geos, 2 fixed to Treatment, but none fixed to control, hence
    # the range must be equal to [2, 3, 4].
    self.assertEqual(mm.treatment_group_size_range(), range(2, 5))
def testSomeGeosAreNeverInTreatment(self):
    """The max # of geos == len(t) if some geos are never in Treatment group.

    If there are geos that are never assigned to treatment, the maximum
    treatment group size does not have to be restricted.
    """
    par = TBRMMDesignParameters(n_test=14, iroas=2.0)
    # Modify a copy so that the eligibility data held by self.data is not
    # altered in place by the row assignments below.
    df_geo_elig = self.data.geo_eligibility.data.copy()
    df_geo_elig.loc['1'] = [1, 0, 0]  # Geo '1' is never in Treatment.
    df_geo_elig.loc['2'] = [1, 0, 1]  # Geo '2' is never in Treatment.
    geo_elig = geoeligibility.GeoEligibility(df_geo_elig)
    data = TBRMMData(self.df, self.response, geo_elig)
    mm = TBRMatchedMarkets(data, par)

    # There are 5 geos, 2 never assigned to Treatment, hence up to 3 geos can
    # be assigned to treatment so the range must be equal to [1, 2, 3].
    self.assertEqual(mm.treatment_group_size_range(), range(1, 4))
def testGeoIndexOrder(self):
    """Geos are indexed from the largest budget (index 0) to smallest (4)."""
    eligibility = self.default_geo_eligibility_data
    group_rows = (
        ('4', [1, 0, 0]),  # Group 'c_fixed'. Largest geo -> index 0.
        ('3', [0, 1, 0]),  # Group 't_fixed'.
        ('1', [1, 1, 0]),  # Group 'ct'.
        ('0', [1, 0, 1]),  # Group 'cx'. Smallest geo -> index 4.
    )
    for geo, row in group_rows:
        eligibility.loc[geo] = row

    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    matcher = TBRMatchedMarkets(data, self.par)

    self.assertCountEqual(matcher.geo_assignments.c_fixed, {0})
    self.assertCountEqual(matcher.geo_assignments.t_fixed, {1})
    self.assertCountEqual(matcher.geo_assignments.ct, {3})
    self.assertCountEqual(matcher.geo_assignments.cx, {4})
def testAllGeoDifferentGroup_SizeUnbounded(self):
    """Each geo in a different group, no group size restrictions except >= 1."""
    eligibility = self.df_geo_elig
    # Trailing comments give '<geo index> - <group>' for each geo.
    group_rows = (
        ('4', [1, 0, 0]),  # 0 - 'c_fixed'.
        ('3', [1, 0, 1]),  # 1 - 'cx'.
        ('2', [0, 1, 1]),  # 2 - 'tx'.
        ('1', [1, 1, 0]),  # 3 - 'ct'.
        ('0', [1, 1, 1]),  # 4 - 'ctx'.
    )
    for geo, row in group_rows:
        eligibility.loc[geo] = row

    # 20 eligible designs.
    # 0 1 2 3 4 | 0 1 2 3 4 | 0 1 2 3 4 | 0 1 2 3 4 |
    # c c t c c | c c x c t | c x t c c | c x x c t |
    # . . . c t | . . . t c | . . . c t | . . . t c |
    # . . . c x | . . . t t | . . . c x | . . . t t |
    # . . . t c | . . . t x | . . . t c | . . . t x |
    # . . . t t |           | . . . t t |           |
    # . . . t x |           | . . . t x |           |
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    matcher = TBRMatchedMarkets(data, self.par)
    self.assertEqual(matcher.count_max_designs(), 20)
def testAllGeoDifferentGroup_SizeBounded(self):
    """Each geo in a different group, both group sizes limited to 2 or 3."""
    eligibility = self.df_geo_elig
    # Trailing comments give '<geo index> - <group>' for each geo.
    group_rows = (
        ('4', [1, 0, 0]),  # 0 - 'c_fixed'.
        ('3', [1, 0, 1]),  # 1 - 'cx'.
        ('2', [0, 1, 1]),  # 2 - 'tx'.
        ('1', [1, 1, 0]),  # 3 - 'ct'.
        ('0', [1, 1, 1]),  # 4 - 'ctx'.
    )
    for geo, row in group_rows:
        eligibility.loc[geo] = row

    # Group sizes can vary from 2 or 3, only 7 eligible designs.
    # 0 1 2 3 4 | 0 1 2 3 4 |
    # c c t c t | c c x t t |
    # c c t t c | c x t c t |
    # c c t t t | c x t t c |
    # c c t t x |           |
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    bounded_par = TBRMMDesignParameters(
        n_test=14,
        iroas=3.0,
        control_geos_range=(2, 3),
        treatment_geos_range=(2, 3))
    matcher = TBRMatchedMarkets(data, bounded_par)
    self.assertEqual(matcher.count_max_designs(), 7)
def testAllGeoDifferentGroup_MaxRatioFixed(self):
    """Each geo in a different group, with geo_ratio_tolerance set to 1.0."""
    eligibility = self.df_geo_elig
    # Trailing comments give '<geo index> - <group>' for each geo.
    group_rows = (
        ('4', [1, 0, 0]),  # 0 - 'c_fixed'.
        ('3', [1, 0, 1]),  # 1 - 'cx'.
        ('2', [0, 1, 1]),  # 2 - 'tx'.
        ('1', [1, 1, 0]),  # 3 - 'ct'.
        ('0', [1, 1, 1]),  # 4 - 'ctx'.
    )
    for geo, row in group_rows:
        eligibility.loc[geo] = row

    # Only control/treatment geo ratios 1/1, 2/3, 2/1, 1/2 allowed, 14 eligible
    # designs.
    # 0 1 2 3 4 | 0 1 2 3 4 | 0 1 2 3 4 | 0 1 2 3 4 |
    # c c t c t | c c x t t | c x t c t | c x x c t
    # . . . t c | . . . t x | . . . c x | . . . t c
    # . . . t t |           | . . . t c | . . . t t
    # . . . t x |           | . . . t x | . . . t x
    data = TBRMMData(self.df, self.response,
                     geoeligibility.GeoEligibility(eligibility))
    ratio_par = TBRMMDesignParameters(
        n_test=14, iroas=3.0, geo_ratio_tolerance=1.0)
    matcher = TBRMatchedMarkets(data, ratio_par)
    self.assertEqual(matcher.count_max_designs(), 14)
def testGreedySearchFixedTreatmentGeoFail(self):
    """Greedy search with a fixed treatment geo: no design passes dwtest.

    A single design is still returned; the fourth entry of its score tuple
    is expected to be 0.
    """
    eligibility = self.default_geo_eligibility_data
    eligibility.loc['2'] = [0, 1, 0]  # geo 2 (index 1) is fixed to Treatment.
    data = TBRMMData(self.dataframe, 'response',
                     geoeligibility.GeoEligibility(eligibility))
    # Restrict the treatment group to exactly one geo.
    self.par.treatment_geos_range = (1, 1)
    found = TBRMatchedMarkets(data, self.par).greedy_search()

    # Reference diagnostics built from the expected groups.
    diag = TBRMMDiagnostics(data.aggregate_time_series({1}), self.par)
    diag.x = data.aggregate_time_series({0})
    expected_corr = diag.corr
    expected_impact = diag.estimate_required_impact(expected_corr)

    self.assertTrue(len(found) == 1)  # pylint: disable=g-generic-assert
    best = found[0]
    self.assertSetEqual(best.treatment_geos, {'2'})
    self.assertSetEqual(best.control_geos, {'1'})
    self.assertTupleEqual(
        best.score.score,
        (1, 1, 1, 0, round(expected_corr, 2), 1 / expected_impact))
    self.assertEqual(best.score.diag.corr, best.diag.corr)
    self.assertEqual(best.score.score.corr, round(best.diag.corr, 2))