def test_different_beta_parameter_cife(self):
    """CIFE criterion value should differ when beta changes by orders of magnitude."""
    data_matrix = np.random.randint(0, 10, (10, 10))
    target = np.random.randint(0, 10, (10))
    selected_so_far = [3, 4, 5]
    candidates = [0, 1, 2, 6, 7, 8, 9]
    feature_costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
    # Run the same selection twice, varying only beta.
    criterion_values = []
    for beta in (1, 10000):
        _, value, _ = difference_find_best_feature(
            j_criterion_func=cife,
            lamb=1,
            data=data_matrix,
            target_variable=target,
            possible_variables_index=candidates,
            costs=feature_costs,
            prev_variables_index=selected_so_far,
            beta=beta)
        criterion_values.append(value)
    self.assertNotEqual(criterion_values[0], criterion_values[1])
# --- Example 2 (scraped snippet separator; vote count removed) ---
 def test_different_beta_parameter_mifs(self):
     """MIFS filter and criterion values should differ when beta changes."""
     data_matrix = np.random.randint(0, 10, (10, 10))
     target = np.random.randint(0, 10, (10))
     selected_so_far = [3, 4, 5]
     candidates = [0, 1, 2, 6, 7, 8, 9]
     feature_costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
     # Min-max scale the costs; the epsilon guards against a zero denominator
     # when all costs are equal and keeps the scaled values strictly positive.
     scaled_costs = list((np.array(feature_costs) - min(feature_costs) + 0.0001) /
                         (max(feature_costs) - min(feature_costs) + 0.0001))
     filter_values = []
     criterion_values = []
     for beta in (1, 10000):
         _, filt, crit, _ = difference_find_best_feature(
             j_criterion_func=mifs,
             lamb=1,
             data=data_matrix,
             target_variable=target,
             possible_variables_index=candidates,
             costs=feature_costs,
             normalized_costs=scaled_costs,
             prev_variables_index=selected_so_far,
             beta=beta)
         filter_values.append(filt)
         criterion_values.append(crit)
     self.assertNotEqual(filter_values[0], filter_values[1])
     self.assertNotEqual(criterion_values[0], criterion_values[1])
# --- Example 3 (scraped snippet separator; vote count removed) ---
    def fit(self,
            data,
            target_variable,
            costs,
            lamb,
            j_criterion_func='cife',
            **kwargs):
        """Rank every feature with the difference cost filter method.

        Greedily picks the best remaining feature (as scored by
        ``difference_find_best_feature``) until no candidates are left,
        recording the selection order and the cost of each chosen feature.
        """
        # Cost-scaling parameter must be numeric.
        assert isinstance(lamb, (int, float)), "Argument `lamb` must be integer or float"
        self.lamb = lamb
        super().fit(data, target_variable, costs, j_criterion_func, **kwargs)

        chosen = set()
        remaining = set(range(self.data.shape[1]))

        self.variables_selected_order = []
        self.cost_variables_selected_order = []

        while remaining:
            best, _, best_cost = difference_find_best_feature(
                j_criterion_func=self.j_criterion_func,
                data=self.data,
                target_variable=self.target_variable,
                prev_variables_index=list(chosen),
                possible_variables_index=list(remaining),
                costs=self.costs,
                lamb=self.lamb,
                **kwargs)
            chosen.add(best)
            self.variables_selected_order.append(best)
            self.cost_variables_selected_order.append(best_cost)
            remaining.discard(best)
 def test_simple_input_mim(self):
     """MIM selection on random integer data yields (int, float, float)."""
     data_matrix = np.random.randint(0, 10, (100, 10))
     target = np.random.randint(0, 10, (100))
     candidates = [0, 1, 2, 6, 7, 8, 9]
     feature_costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
     chosen, criterion_value, chosen_cost = difference_find_best_feature(
         j_criterion_func=mim,
         lamb=1,
         data=data_matrix,
         target_variable=target,
         possible_variables_index=candidates,
         costs=feature_costs)
     for value, expected_type in ((chosen, int),
                                  (criterion_value, float),
                                  (chosen_cost, float)):
         self.assertIsInstance(value, expected_type)
# --- Example 5 (scraped snippet separator; vote count removed) ---
 def test_simple_input_jmi(self):
     """JMI selection on random integer data yields (int, float, float, float)."""
     data_matrix = np.random.randint(0, 10, (100, 10))
     target = np.random.randint(0, 10, (100))
     selected_so_far = [3, 4, 5]
     candidates = [0, 1, 2, 6, 7, 8, 9]
     feature_costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
     # Min-max scale the costs; the epsilon guards against a zero denominator
     # when all costs are equal and keeps the scaled values strictly positive.
     scaled_costs = list((np.array(feature_costs) - min(feature_costs) + 0.0001) /
                         (max(feature_costs) - min(feature_costs) + 0.0001))
     chosen, filter_value, criterion_value, chosen_cost = difference_find_best_feature(
         j_criterion_func=jmi,
         lamb=1,
         data=data_matrix,
         target_variable=target,
         possible_variables_index=candidates,
         costs=feature_costs,
         normalized_costs=scaled_costs,
         prev_variables_index=selected_so_far)
     for value, expected_type in ((chosen, int),
                                  (filter_value, float),
                                  (criterion_value, float),
                                  (chosen_cost, float)):
         self.assertIsInstance(value, expected_type)
# --- Example 6 (scraped snippet separator; vote count removed) ---
    def fit(self,
            data,
            target_variable,
            costs,
            lamb,
            j_criterion_func='cife',
            number_of_features=None,
            budget=None,
            stop_budget=False,
            **kwargs):
        """Rank features with the difference cost filter method.

        Parameters
        ----------
        data: np.ndarray or pd.DataFrame
            Matrix or data frame whose features are to be ranked.
        target_variable: np.ndarray or pd.core.series.Series
            Target vector; its length must equal the number of rows in `data`.
        costs: list or dict
            Per-feature costs, one per column of `data`.
            With an np.ndarray `data`, pass a list of floats/ints.
            With a pd.DataFrame `data`, pass a list or a dict {'col_1': cost_1, ...}.
        lamb: int or float
            Cost scaling parameter. The higher lambda is, the stronger the
            cost influences selection.
        j_criterion_func: str
            Conditional mutual information approximation; one of
            ['mim', 'mifs', 'mrmr', 'jmi', 'cife']. The available methods are
            listed in bcselector.information_theory.j_criterion_approximations.__all__.
        number_of_features: int
            Optional cap on how many features to select.
        budget: int or float
            Optional cap on the total cost of selected features.
        stop_budget: bool
            Optional argument, TODO - must delete this argument.
        **kwargs
            Forwarded to `difference_find_best_feature()` and on to
            `j_criterion_func`.

        Attributes
        ----------

        Examples
        --------
        >>> from bcselector.variable_selection import DiffVariableSelector
        >>> dvs = DiffVariableSelector()
        >>> dvs.fit(X, y, costs, lamb=1, j_criterion_func='mim')
        """
        # Cost-scaling parameter must be numeric.
        assert isinstance(lamb, (int, float)), "Argument `lamb` must be integer or float"
        self.lamb = lamb
        self.stop_budget = stop_budget
        super().fit(data=data,
                    target_variable=target_variable,
                    costs=costs,
                    j_criterion_func=j_criterion_func,
                    budget=budget,
                    **kwargs)

        # Default to ranking every column when no cap was requested.
        self.number_of_features = (self.data.shape[1]
                                   if number_of_features is None
                                   else number_of_features)
        if stop_budget and self.budget is None:
            warnings.warn(
                "Unused argument `stop_budget`. Works only with `budget` argument."
            )
        chosen = set()
        remaining = set(range(self.data.shape[1]))

        self.variables_selected_order = []
        self.cost_variables_selected_order = []

        for _ in range(self.number_of_features):
            best, filter_value, criterion_value, best_cost = difference_find_best_feature(
                j_criterion_func=self.j_criterion_func,
                data=self.X_train,
                target_variable=self.y_train,
                prev_variables_index=list(chosen),
                possible_variables_index=list(remaining),
                costs=self.costs,
                normalized_costs=self.normalized_costs,
                lamb=self.lamb,
                **kwargs)
            chosen.add(best)

            # NOTE(review): the budget check runs after `chosen.add(best)` but
            # before the feature is recorded, so the feature that would break
            # the budget ends up in `chosen` yet not in the reported order —
            # preserved as-is; confirm this is intended.
            if stop_budget is True and (sum(self.cost_variables_selected_order)
                                        + best_cost) >= (self.budget or np.inf):
                break

            self.variables_selected_order.append(best)
            self.cost_variables_selected_order.append(best_cost)
            self.criterion_values.append(criterion_value)
            self.filter_values.append(filter_value)
            remaining = remaining.difference({best})

            if len(chosen) == self.number_of_features:
                break