def test_cife(self):
    # Smoke test for the 'cife' criterion on generated data: fit, score and
    # plot_scores are all exercised, then the selection order is checked to
    # be a list with one entry per cost whose summed costs match the input.

    # Given
    classifier = LogisticRegression()

    # When
    generator = MatrixGenerator()
    X, y, costs = generator.generate(
        n_rows=1000,
        n_basic_cols=5,
        noise_sigmas=[0.1, 0.5],
        seed=2,
    )
    selector = FractionVariableSelector()
    selector.fit(
        data=X,
        target_variable=y,
        costs=costs,
        r=1,
        j_criterion_func='cife',
        beta=0.5,
    )
    selector.score(model=classifier, scoring_function=roc_auc_score)
    selector.plot_scores(
        compare_no_cost_method=True,
        model=classifier,
        annotate=True,
    )

    # Then
    self.assertIsInstance(selector.variables_selected_order, list)
    self.assertEqual(
        len(selector.variables_selected_order),
        len(costs),
    )
    self.assertAlmostEqual(
        sum(costs),
        sum(selector.cost_variables_selected_order),
    )
def test_theoretical_output(self):
    # Hand-built 5x3 integer matrix with unit costs: with the 'mim'
    # criterion the first variable selected is expected to be column 2.
    features = np.array([
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 2],
        [0, 1, 3],
        [1, 1, 5],
    ])
    target = np.array([0, 0, 0, 0, 1])
    unit_costs = [1, 1, 1]

    selector = FractionVariableSelector()
    selector.fit(
        data=features,
        target_variable=target,
        costs=unit_costs,
        r=1,
        j_criterion_func='mim',
    )

    first_selected = selector.variables_selected_order[0]
    self.assertEqual(first_selected, 2)
def test_pandas_input(self):
    # fit() must accept a pandas DataFrame together with a dict of costs
    # keyed by column name, and produce a list ordering with one entry
    # per cost.
    #
    # A locally seeded generator is used instead of the global np.random
    # state so that every run sees the same data and a failure can be
    # reproduced.
    rng = np.random.RandomState(0)
    integer_matrix = pd.DataFrame(
        rng.randint(0, 10, (100, 3)),
        columns=['AA', 'BB', 'CC'],
    )
    diverse_target = rng.randint(0, 2, 100)
    costs = {'AA': 10, 'BB': 1, 'CC': 1.5}

    fvs = FractionVariableSelector()
    fvs.fit(
        data=integer_matrix,
        target_variable=diverse_target,
        costs=costs,
        r=1,
        j_criterion_func='mim',
    )

    self.assertIsInstance(fvs.variables_selected_order, list)
    self.assertEqual(len(fvs.variables_selected_order), len(costs))
def test_numpy_input(self):
    # fit() must accept a plain numpy matrix together with a list of
    # costs, and produce a list ordering with one entry per cost.
    #
    # A locally seeded generator is used instead of the global np.random
    # state so that every run sees the same data and a failure can be
    # reproduced.
    rng = np.random.RandomState(0)
    integer_matrix = rng.randint(0, 10, (100, 10))
    diverse_target = rng.randint(0, 10, 100)
    costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]

    fvs = FractionVariableSelector()
    fvs.fit(
        data=integer_matrix,
        target_variable=diverse_target,
        costs=costs,
        r=1,
        j_criterion_func='mim',
    )

    self.assertIsInstance(fvs.variables_selected_order, list)
    self.assertEqual(len(fvs.variables_selected_order), len(costs))
def test_stop_budget(self):
    # With stop_budget=True and a budget of 2, the summed cost of the
    # selected variables must be below 2 and at most two variables may
    # have been selected.
    budget = 2
    features = pd.DataFrame(
        np.random.randint(0, 10, (100, 3)),
        columns=['AA', 'BB', 'CC'],
    )
    target = pd.Series(np.random.randint(0, 2, (100)))
    column_costs = {'AA': 2, 'BB': 1.1, 'CC': 1.5}

    selector = FractionVariableSelector()
    selector.fit(
        data=features,
        target_variable=target,
        costs=column_costs,
        r=1,
        j_criterion_func='mim',
        budget=budget,
        stop_budget=True,
    )

    spent = sum(selector.cost_variables_selected_order)
    self.assertGreater(budget, spent)
    n_selected = len(selector.variables_selected_order)
    self.assertGreaterEqual(2, n_selected)
def test_score(self):
    # After fit() and score(), total_scores must hold one entry per cost.
    #
    # A locally seeded generator is used instead of the global np.random
    # state so that every run sees the same data and a failure can be
    # reproduced.
    rng = np.random.RandomState(0)
    integer_matrix = rng.randint(0, 10, (100, 10))
    diverse_target = rng.randint(0, 2, 100)
    costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]

    fvs = FractionVariableSelector()
    fvs.fit(
        data=integer_matrix,
        target_variable=diverse_target,
        costs=costs,
        r=1,
        j_criterion_func='mim',
    )
    model = LogisticRegression()
    fvs.score(model, scoring_function=roc_auc_score)

    self.assertEqual(len(fvs.total_scores), len(costs))
def test_regard_to_cost_is_better_cife(self):
    # Compares the cost-aware 'cife' selection against the no-cost
    # baseline: for each baseline step, the cost-aware score at the
    # nearest total cost is looked up, and the cost-aware score must be
    # strictly higher in at least half of the comparisons.

    # Given
    model = LogisticRegression()

    # When
    mg = MatrixGenerator()
    X, y, costs = mg.generate(
        n_rows=1000,
        n_basic_cols=3,
        basic_cost=1,
        noise_sigmas=[1, 10, 100],
        seed=42,
    )
    fvs = FractionVariableSelector()
    fvs.fit(
        data=X,
        target_variable=y,
        costs=costs,
        r=0.8,
        j_criterion_func='cife',
        beta=0.05,
    )
    fvs.score(model=model, scoring_function=roc_auc_score)
    # NOTE(review): presumably this call is what populates the
    # no_cost_total_* attributes read below -- confirm before removing it.
    fvs.plot_scores(compare_no_cost_method=True, model=model)

    def find_nearest_idx(values, target):
        # Index of the element of `values` closest to `target`.
        return np.abs(np.asarray(values) - target).argmin()

    when_better = [
        bool(
            fvs.total_scores[
                find_nearest_idx(fvs.total_costs, fvs.no_cost_total_costs[i])
            ] > fvs.no_cost_total_scores[i]
        )
        for i in range(fvs.data.shape[1])
    ]

    # Then
    # assertGreaterEqual reports the actual ratio when the check fails.
    self.assertGreaterEqual(sum(when_better) / len(when_better), 0.5)
def test_run_score_before_fit(self):
    # Calling score() on a selector that has not been fitted is expected
    # to raise AssertionError.
    unfitted_selector = FractionVariableSelector()
    classifier = LogisticRegression()
    self.assertRaises(
        AssertionError,
        unfitted_selector.score,
        classifier,
        scoring_function=roc_auc_score,
    )