def test_different_beta_parameter_cife(self):
    integer_matrix = np.random.randint(0, 10, (10, 10))
    diverse_target = np.random.randint(0, 10, (10))
    prev_variables_index = [3, 4, 5]
    candidates_index = [0, 1, 2, 6, 7, 8, 9]
    costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
    lamb = 1
    beta_1 = 1
    beta_2 = 10000

    _, criterion_value_1, _ = difference_find_best_feature(
        j_criterion_func=cife,
        lamb=lamb,
        data=integer_matrix,
        target_variable=diverse_target,
        possible_variables_index=candidates_index,
        costs=costs,
        prev_variables_index=prev_variables_index,
        beta=beta_1)
    _, criterion_value_2, _ = difference_find_best_feature(
        j_criterion_func=cife,
        lamb=lamb,
        data=integer_matrix,
        target_variable=diverse_target,
        possible_variables_index=candidates_index,
        costs=costs,
        prev_variables_index=prev_variables_index,
        beta=beta_2)

    self.assertNotEqual(criterion_value_1, criterion_value_2)
def test_different_beta_parameter_mifs(self):
    integer_matrix = np.random.randint(0, 10, (10, 10))
    diverse_target = np.random.randint(0, 10, (10))
    prev_variables_index = [3, 4, 5]
    candidates_index = [0, 1, 2, 6, 7, 8, 9]
    costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
    normalized_costs = list((np.array(costs) - min(costs) + 0.0001)
                            / (max(costs) - min(costs) + 0.0001))
    lamb = 1
    beta_1 = 1
    beta_2 = 10000

    _, filter_value_1, criterion_value_1, _ = difference_find_best_feature(
        j_criterion_func=mifs,
        lamb=lamb,
        data=integer_matrix,
        target_variable=diverse_target,
        possible_variables_index=candidates_index,
        costs=costs,
        normalized_costs=normalized_costs,
        prev_variables_index=prev_variables_index,
        beta=beta_1)
    _, filter_value_2, criterion_value_2, _ = difference_find_best_feature(
        j_criterion_func=mifs,
        lamb=lamb,
        data=integer_matrix,
        target_variable=diverse_target,
        possible_variables_index=candidates_index,
        costs=costs,
        normalized_costs=normalized_costs,
        prev_variables_index=prev_variables_index,
        beta=beta_2)

    self.assertNotEqual(filter_value_1, filter_value_2)
    self.assertNotEqual(criterion_value_1, criterion_value_2)
def fit(self, data, target_variable, costs, lamb, j_criterion_func='cife', **kwargs):
    # lamb
    assert isinstance(lamb, int) or isinstance(lamb, float), \
        "Argument `lamb` must be integer or float"
    self.lamb = lamb

    super().fit(data, target_variable, costs, j_criterion_func, **kwargs)

    # S holds the indices of already selected features, U the remaining candidates.
    S = set()
    U = set([i for i in range(self.data.shape[1])])

    self.variables_selected_order = []
    self.cost_variables_selected_order = []

    # Greedy forward selection: at each step pick the candidate with the best
    # cost-scaled difference criterion until every feature has been ranked.
    while len(U) > 0:
        k, _, cost = difference_find_best_feature(
            j_criterion_func=self.j_criterion_func,
            data=self.data,
            target_variable=self.target_variable,
            prev_variables_index=list(S),
            possible_variables_index=list(U),
            costs=self.costs,
            lamb=self.lamb,
            **kwargs)
        S.add(k)
        self.variables_selected_order.append(k)
        self.cost_variables_selected_order.append(cost)
        U = U.difference(set([k]))
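# A minimal usage sketch for the ranking `fit` above, assuming it belongs to the
# DiffVariableSelector class from bcselector.variable_selection (as referenced in the
# docstring of the extended variant below); the data shapes and costs are illustrative only.
# >>> import numpy as np
# >>> from bcselector.variable_selection import DiffVariableSelector
# >>> X = np.random.randint(0, 10, (100, 10))
# >>> y = np.random.randint(0, 2, 100)
# >>> costs = list(np.random.rand(10))
# >>> dvs = DiffVariableSelector()
# >>> dvs.fit(X, y, costs, lamb=1, j_criterion_func='cife')
# >>> dvs.variables_selected_order       # all column indices, ranked greedily
# >>> dvs.cost_variables_selected_order  # the corresponding feature costs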
def test_simple_input_mim(self):
    integer_matrix = np.random.randint(0, 10, (100, 10))
    diverse_target = np.random.randint(0, 10, (100))
    # prev_variable_index = [3, 4, 5]
    candidates_index = [0, 1, 2, 6, 7, 8, 9]
    costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
    lamb = 1

    selected_feature, criterion_value, cost = difference_find_best_feature(
        j_criterion_func=mim,
        lamb=lamb,
        data=integer_matrix,
        target_variable=diverse_target,
        possible_variables_index=candidates_index,
        costs=costs)

    self.assertIsInstance(selected_feature, int)
    self.assertIsInstance(criterion_value, float)
    self.assertIsInstance(cost, float)
def test_simple_input_jmi(self):
    integer_matrix = np.random.randint(0, 10, (100, 10))
    diverse_target = np.random.randint(0, 10, (100))
    prev_variable_index = [3, 4, 5]
    candidates_index = [0, 1, 2, 6, 7, 8, 9]
    costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
    normalized_costs = list((np.array(costs) - min(costs) + 0.0001)
                            / (max(costs) - min(costs) + 0.0001))
    lamb = 1

    selected_feature, filter_value, criterion_value, cost = difference_find_best_feature(
        j_criterion_func=jmi,
        lamb=lamb,
        data=integer_matrix,
        target_variable=diverse_target,
        possible_variables_index=candidates_index,
        costs=costs,
        normalized_costs=normalized_costs,
        prev_variables_index=prev_variable_index)

    self.assertIsInstance(selected_feature, int)
    self.assertIsInstance(filter_value, float)
    self.assertIsInstance(criterion_value, float)
    self.assertIsInstance(cost, float)
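# The tests above build `normalized_costs` by min-max scaling the raw costs into (0, 1],
# adding a small epsilon so the division never hits zero when all costs are equal.
# A standalone sketch of that same transformation (the helper name
# `_min_max_normalize_costs` is ours, not part of the library):
def _min_max_normalize_costs(costs, eps=0.0001):
    """Rescale `costs` to (0, 1] with min-max normalization and an epsilon offset."""
    costs = np.array(costs, dtype=float)
    return list((costs - costs.min() + eps) / (costs.max() - costs.min() + eps))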
def fit(self, data, target_variable, costs, lamb, j_criterion_func='cife',
        number_of_features=None, budget=None, stop_budget=False, **kwargs):
    """Ranks all features in the dataset with the difference cost filter method.

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        Matrix or data frame whose features we want to rank.
    target_variable : np.ndarray or pd.core.series.Series
        Vector or series of the target variable. The number of rows in `data`
        must equal the length of `target_variable`.
    costs : list or dict
        Costs of features. Must have one entry per column of `data`.
        When `data` is an np.ndarray, provide `costs` as a list of floats or integers.
        When `data` is a pd.DataFrame, provide `costs` as a list of floats or integers,
        or as a dict {'col_1': cost_1, ...}.
    lamb : int or float
        Cost scaling parameter. The higher `lamb` is, the greater the impact of
        cost on selection.
    j_criterion_func : str
        Method of approximation of the conditional mutual information.
        Must be one of ['mim', 'mifs', 'mrmr', 'jmi', 'cife'].
        All methods can be listed by running:
        >>> from bcselector.information_theory import j_criterion_approximations
        >>> j_criterion_approximations.__all__
    number_of_features : int
        Optional argument, constraint on the number of selected features.
    budget : int or float
        Optional argument, constraint on the total cost of selected features.
    stop_budget : bool
        Optional argument, TODO - must delete this argument.
    **kwargs
        Arguments passed to the `difference_find_best_feature()` function and
        then to `j_criterion_func`.

    Attributes
    ----------

    Examples
    --------
    >>> from bcselector.variable_selection import DiffVariableSelector
    >>> dvs = DiffVariableSelector()
    >>> dvs.fit(X, y, costs, lamb=1, j_criterion_func='mim')
    """
    # lamb
    assert isinstance(lamb, int) or isinstance(lamb, float), \
        "Argument `lamb` must be integer or float"
    self.lamb = lamb
    self.stop_budget = stop_budget

    super().fit(data=data,
                target_variable=target_variable,
                costs=costs,
                j_criterion_func=j_criterion_func,
                budget=budget,
                **kwargs)

    if number_of_features is None:
        self.number_of_features = self.data.shape[1]
    else:
        self.number_of_features = number_of_features

    if self.budget is None and stop_budget:
        warnings.warn(
            "Unused argument `stop_budget`. Works only with `budget` argument.")

    S = set()
    U = set([i for i in range(self.data.shape[1])])

    self.variables_selected_order = []
    self.cost_variables_selected_order = []

    for _ in range(self.number_of_features):
        # Greedily pick the candidate feature with the best cost-scaled criterion.
        k, filter_value, criterion_value, cost = difference_find_best_feature(
            j_criterion_func=self.j_criterion_func,
            data=self.X_train,
            target_variable=self.y_train,
            prev_variables_index=list(S),
            possible_variables_index=list(U),
            costs=self.costs,
            normalized_costs=self.normalized_costs,
            lamb=self.lamb,
            **kwargs)
        S.add(k)
        if stop_budget is True and (sum(self.cost_variables_selected_order) + cost) >= (self.budget or np.inf):
            break
        self.variables_selected_order.append(k)
        self.cost_variables_selected_order.append(cost)
        self.criterion_values.append(criterion_value)
        self.filter_values.append(filter_value)
        U = U.difference(set([k]))
        if len(S) == self.number_of_features:
            break
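# A usage sketch for the budget-aware `fit` above, under the assumption that it is the
# DiffVariableSelector.fit documented in its own docstring; the data, costs and budget
# values below are illustrative only.
# >>> import numpy as np
# >>> from bcselector.variable_selection import DiffVariableSelector
# >>> X = np.random.randint(0, 10, (100, 10))
# >>> y = np.random.randint(0, 2, 100)
# >>> costs = list(np.random.rand(10))
# >>> dvs = DiffVariableSelector()
# >>> dvs.fit(X, y, costs, lamb=1, j_criterion_func='cife',
# ...         number_of_features=5, budget=2.0, stop_budget=True)
# >>> dvs.variables_selected_order        # indices of selected features, in order
# >>> dvs.cost_variables_selected_order   # their costs; selection stops before exceeding `budget`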