예제 #1
0
 def test_filter_criterion_values(self):
     """Filter value obtained with ``r=10`` must exceed the one for ``r=0``.

     Both selections run on the same seeded random integer data with the
     JMI criterion; only the cost exponent ``r`` differs between the calls.
     """
     # Seed first so that the two draws below are reproducible; the matrix
     # must be drawn before the target to keep the same random stream.
     np.random.seed(seed=42)
     features = np.random.randint(0, 10, (10, 10))
     labels = np.random.randint(0, 10, (10))

     raw_costs = [0.1, 0.19, 0.36, 0.96, 0.41, 0.17, 0.36, 0.75, 0.79, 0.99]
     # Min-max scaling with a small offset added to numerator and denominator.
     scaled_costs = list((np.array(raw_costs) - min(raw_costs) + 0.0001) /
                         (max(raw_costs) - min(raw_costs) + 0.0001))

     # Everything except `r` is shared by the two calls.
     shared_args = dict(
         j_criterion_func=jmi,
         data=features,
         target_variable=labels,
         possible_variables_index=[0, 1, 2, 6, 7, 8, 9],
         costs=raw_costs,
         normalized_costs=scaled_costs,
         prev_variables_index=[3, 4, 5])

     _, filter_value_1, _, _ = fraction_find_best_feature(r=0, **shared_args)
     _, filter_value_2, _, _ = fraction_find_best_feature(r=10, **shared_args)

     assert filter_value_2 > filter_value_1
 def test_different_beta_parameter_cife(self):
     """CIFE criterion value must change when ``beta`` goes from 1 to 10000.

     The data is random and unseeded; all arguments other than ``beta`` are
     identical in both calls.
     """
     features = np.random.randint(0, 10, (10, 10))
     labels = np.random.randint(0, 10, (10))

     # Arguments common to both selections; only `beta` varies.
     common = dict(
         j_criterion_func=cife,
         r=1,
         data=features,
         target_variable=labels,
         possible_variables_index=[0, 1, 2, 6, 7, 8, 9],
         costs=[1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38],
         prev_variables_index=[3, 4, 5])

     # The three-element unpacking also pins the expected return shape.
     _, low_beta_criterion, _ = fraction_find_best_feature(beta=1, **common)
     _, high_beta_criterion, _ = fraction_find_best_feature(beta=10000,
                                                            **common)

     self.assertNotEqual(low_beta_criterion, high_beta_criterion)
예제 #3
0
 def test_different_beta_parameter_mifs(self):
     """MIFS filter and criterion values must change with ``beta``.

     Runs the selection twice on the same unseeded random data, once with
     ``beta=1`` and once with ``beta=10000``, and expects both returned
     scores to differ between the runs.
     """
     features = np.random.randint(0, 10, (10, 10))
     labels = np.random.randint(0, 10, (10))

     raw_costs = [1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38]
     # Min-max scaling with a small offset added to numerator and denominator.
     scaled_costs = list((np.array(raw_costs) - min(raw_costs) + 0.0001) /
                         (max(raw_costs) - min(raw_costs) + 0.0001))

     # Arguments common to both selections; only `beta` varies.
     common = dict(
         j_criterion_func=mifs,
         r=1,
         data=features,
         target_variable=labels,
         possible_variables_index=[0, 1, 2, 6, 7, 8, 9],
         costs=raw_costs,
         normalized_costs=scaled_costs,
         prev_variables_index=[3, 4, 5])

     _, low_filter, low_criterion, _ = fraction_find_best_feature(
         beta=1, **common)
     _, high_filter, high_criterion, _ = fraction_find_best_feature(
         beta=10000, **common)

     self.assertNotEqual(low_filter, high_filter)
     self.assertNotEqual(low_criterion, high_criterion)
예제 #4
0
    def fit(self,
            data,
            target_variable,
            costs,
            r,
            j_criterion_func='cife',
            **kwargs):
        """Order every column of the data by repeated best-feature selection.

        After validating ``r`` and delegating to the parent ``fit``, the
        method repeatedly asks ``fraction_find_best_feature`` for the best
        remaining column until none are left. Results are stored in
        ``self.variables_selected_order`` (column indices, in selection
        order) and ``self.cost_variables_selected_order`` (the cost returned
        for each selected column).

        Parameters
        ----------
        data, target_variable, costs, j_criterion_func
            Forwarded positionally to the parent ``fit``.
            NOTE(review): the parent presumably sets ``self.data``,
            ``self.target_variable``, ``self.costs`` and
            ``self.j_criterion_func``, which are read below - confirm.
        r: int or float
            Stored as ``self.r`` and passed to ``fraction_find_best_feature``.
        **kwargs
            Passed both to the parent ``fit`` and to every
            ``fraction_find_best_feature`` call.

        Raises
        ------
        AssertionError
            If ``r`` is neither an int nor a float.
        """
        assert isinstance(
            r, (int, float)), "Argument `r` must be integer or float"
        self.r = r

        super().fit(data, target_variable, costs, j_criterion_func, **kwargs)

        chosen = set()
        remaining = set(range(self.data.shape[1]))

        self.variables_selected_order = []
        self.cost_variables_selected_order = []

        # Each pass moves exactly one column index from `remaining` to `chosen`.
        while remaining:
            best, _, best_cost = fraction_find_best_feature(
                j_criterion_func=self.j_criterion_func,
                data=self.data,
                target_variable=self.target_variable,
                prev_variables_index=list(chosen),
                possible_variables_index=list(remaining),
                costs=self.costs,
                r=self.r,
                **kwargs)
            chosen.add(best)
            self.variables_selected_order.append(best)
            self.cost_variables_selected_order.append(best_cost)
            remaining = remaining.difference({best})
 def test_simple_input_mim(self):
     """Return types of a MIM selection on random data (three-value result).

     Expects the selected feature to be an ``int`` and both the criterion
     value and the cost to be ``float``.
     """
     features = np.random.randint(0, 10, (100, 10))
     labels = np.random.randint(0, 10, (100))

     result = fraction_find_best_feature(
         j_criterion_func=mim,
         r=1,
         data=features,
         target_variable=labels,
         possible_variables_index=[0, 1, 2, 6, 7, 8, 9],
         costs=[1.76, 0.19, -0.36, 0.96, 0.41, 0.17, -0.36, 0.75, 0.79, -1.38])
     # Unpacking into exactly three names also pins the result length.
     selected_feature, criterion_value, cost = result

     self.assertIsInstance(selected_feature, int)
     for score in (criterion_value, cost):
         self.assertIsInstance(score, float)
예제 #6
0
 def test_simple_input_mim(self):
     """Return types of a MIM selection on random data (four-value result).

     Expects the selected feature to be an ``int`` and the filter value,
     criterion value and cost to be ``float``.
     """
     features = np.random.randint(0, 10, (100, 10))
     labels = np.random.randint(0, 10, (100))

     raw_costs = [1.76, 0.19, 0.36, 0.96, 0.41, 0.17, 0.36, 0.75, 0.79, 1.38]
     # Min-max scaling with a small offset added to numerator and denominator.
     scaled_costs = list((np.array(raw_costs) - min(raw_costs) + 0.0001) /
                         (max(raw_costs) - min(raw_costs) + 0.0001))

     result = fraction_find_best_feature(
         j_criterion_func=mim,
         r=1,
         data=features,
         target_variable=labels,
         possible_variables_index=[0, 1, 2, 6, 7, 8, 9],
         costs=raw_costs,
         normalized_costs=scaled_costs)
     # Unpacking into exactly four names also pins the result length.
     selected_feature, filter_value, criterion_value, cost = result

     self.assertIsInstance(selected_feature, int)
     for score in (filter_value, criterion_value, cost):
         self.assertIsInstance(score, float)
예제 #7
0
    def test_criterion_filter_values(self):
        """Compare returned criterion/filter values with hand-computed ones.

        On a small binary dataset the values returned by
        ``fraction_find_best_feature`` are checked against expressions built
        directly from ``mutual_information``:

        * MIM: criterion equals ``I(y; X_k)``; for ``r = 1.2`` the filter
          value equals that criterion divided by ``normalized_cost ** r``.
        * MIFS with columns 0 and 3 already selected: criterion equals
          ``I(y; X_k) - I(X_k; X_0) - I(X_k; X_3)``; for ``r = 1`` the filter
          value equals the criterion shifted by the absolute value of the
          smallest candidate score, divided by ``normalized_cost ** r``.
        """
        y = np.array([1, 0, 1, 1, 0, 1, 0, 1]).transpose()
        # One row per feature here; transposed so features become columns.
        X = np.array([
            [1, 0, 0, 1, 0, 1, 0, 1],
            [1, 0, 0, 1, 1, 1, 0, 1],
            [1, 1, 1, 1, 1, 1, 1, 0],
            [1, 0, 0, 1, 1, 0, 0, 1],
        ]).transpose()
        costs = [1, 0.5, 0.25, 0.1]
        normalized_costs = list((np.array(costs) - min(costs) + 0.0001) /
                                (max(costs) - min(costs) + 0.0001))

        def select(criterion, r, candidates, **extra):
            # Run the selection on the fixture above; only the criterion,
            # exponent, candidate list and optional extras change.
            return fraction_find_best_feature(
                j_criterion_func=criterion,
                r=r,
                data=X,
                target_variable=y,
                possible_variables_index=candidates,
                costs=costs,
                normalized_costs=normalized_costs,
                **extra)

        def relevance(col):
            # Expected MIM score of column `col`.
            return mutual_information(y, X[:, col])

        def mifs_score(col):
            # Expected MIFS score of column `col` given selected columns 0, 3.
            return (mutual_information(y, X[:, col]) -
                    mutual_information(X[:, col], X[:, 0]) -
                    mutual_information(X[:, col], X[:, 3]))

        # MIM
        r = 0
        feature_index, _, criterion_value, _ = select(mim, r, [0, 1, 2, 3])
        self.assertAlmostEqual(relevance(feature_index), criterion_value)

        r = 1.2
        feature_index, filter_value, criterion_value, _ = select(
            mim, r, [0, 1, 2, 3])
        self.assertAlmostEqual(relevance(feature_index), criterion_value)
        self.assertAlmostEqual(
            relevance(feature_index) / normalized_costs[feature_index]**r,
            filter_value)

        # MIFS
        r = 0
        feature_index, _, criterion_value, _ = select(
            mifs, r, [1, 2], prev_variables_index=[0, 3])
        self.assertAlmostEqual(mifs_score(feature_index), criterion_value)

        r = 1
        feature_index, filter_value, criterion_value, _ = select(
            mifs, r, [1, 2], prev_variables_index=[0, 3])
        expected_criterion = mifs_score(feature_index)
        # Shift applied in the expected filter value: magnitude of the
        # smallest score among the two candidates.
        shift = abs(min([mifs_score(1), mifs_score(2)]))
        self.assertAlmostEqual(expected_criterion, criterion_value)
        self.assertAlmostEqual(
            (expected_criterion + shift) / normalized_costs[feature_index]**r,
            filter_value)
예제 #8
0
    def fit(self,
            data,
            target_variable,
            costs,
            r,
            j_criterion_func='cife',
            number_of_features=None,
            budget=None,
            stop_budget=False,
            **kwargs):
        """Ranks features in dataset with fraction cost filter method.

        Features are picked one at a time with `fraction_find_best_feature()`.
        The selection order is stored in `self.variables_selected_order`, the
        matching costs in `self.cost_variables_selected_order`, and the
        per-step scores are appended to `self.criterion_values` and
        `self.filter_values`.

        Parameters
        ----------
        data: np.ndarray or pd.DataFrame
            Matrix or data frame of data that we want to rank features.
        target_variable: np.ndarray or pd.core.series.Series
            Vector or series of target variable. Number of rows in `data` must equal target_variable length
        costs: list or dict
            Costs of features. Must be the same size as columns in `data`.
            When using `data` as np.array, provide `costs` as list of floats or integers.
            When using `data` as pd.DataFrame, provide `costs` as list of floats or integers or dict {'col_1':cost_1,...}.
        r: int or float
            Cost scaling parameter. Higher `r` is, higher is the impact of the cost on selection.
        j_criterion_func: str
            Method of approximation of the conditional mutual information
            Must be one of ['mim','mifs','mrmr','jmi','cife'].
            All methods can be seen by running:
            >>> from bcselector.information_theory import j_criterion_approximations
            >>> j_criterion_approximations.__all__
        number_of_features: int
            Optional argument, constraint to selected number of features.
            Defaults to the number of columns in the data.
        budget: int or float
            Optional argument, constraint to selected total cost of features.
        stop_budget: bool
            Optional argument. When exactly `True` and a budget is set,
            selection stops before adding the feature whose cost would make
            the running total reach or exceed the budget. A warning is issued
            if it is set without `budget`.
            TODO - flagged for removal in the original documentation.
        **kwargs
            Arguments passed to the parent `fit()` and to every
            `fraction_find_best_feature()` call (presumably forwarded from
            there to `j_criterion_func` - confirm).

        Raises
        ------
        AssertionError
            If `r` is neither an int nor a float.

        Examples
        --------
        >>> from bcselector.variable_selection import FractionVariableSelector
        >>> fvs = FractionVariableSelector()
        >>> fvs.fit(X, y, costs, r=1, j_criterion_func='mim')
        """
        # r
        assert isinstance(
            r, (int, float)), "Argument `r` must be integer or float"
        self.r = r
        self.stop_budget = stop_budget

        # NOTE(review): the parent fit presumably sets self.data,
        # self.target_variable, self.costs, self.normalized_costs,
        # self.j_criterion_func and self.budget, all read below - confirm.
        super().fit(data=data,
                    target_variable=target_variable,
                    costs=costs,
                    j_criterion_func=j_criterion_func,
                    budget=budget,
                    **kwargs)
        if number_of_features is None:
            self.number_of_features = self.data.shape[1]
        else:
            self.number_of_features = number_of_features
        if self.budget is None and stop_budget:
            warnings.warn(
                "Unused argument `stop_budget`. Works only with `budget` argument."
            )

        S = set()  # indices already selected
        U = set(range(self.data.shape[1]))  # indices still available

        self.variables_selected_order = []
        self.cost_variables_selected_order = []
        # NOTE(review): self.criterion_values / self.filter_values are only
        # appended to here, never reset - presumably initialised elsewhere;
        # confirm that repeated fits do not accumulate stale values.

        for _ in range(self.number_of_features):
            k, filter_value, criterion_value, cost = fraction_find_best_feature(
                j_criterion_func=self.j_criterion_func,
                data=self.data,
                target_variable=self.target_variable,
                prev_variables_index=list(S),
                possible_variables_index=list(U),
                costs=self.costs,
                normalized_costs=self.normalized_costs,
                r=self.r,
                **kwargs)

            # A falsy budget (None or 0) is treated as unlimited via np.inf.
            if stop_budget is True and (sum(self.cost_variables_selected_order)
                                        + cost) >= (self.budget or np.inf):
                break

            S.add(k)
            self.variables_selected_order.append(k)
            self.cost_variables_selected_order.append(cost)
            self.criterion_values.append(criterion_value)
            self.filter_values.append(filter_value)
            U = U.difference({k})