Beispiel #1
0
    def _get_local_region(self, X_test_norm):
        """Find the training samples forming each test sample's local region.

        Nearest neighbours of every test sample are looked up in
        ``local_region_iterations`` randomly drawn feature subspaces. A
        training index belongs to the local region when it occurs more than
        ``local_region_threshold`` times among the collected neighbours.

        Note that ``self.local_max_features`` is clamped to 1.0 in place
        (with a warning) when it exceeds 1.0.

        Parameters
        ----------
        X_test_norm : numpy array, shape (n_samples, n_features)
            Normalized test data

        Returns
        -------
        final_local_region_list : List of lists, shape [n_samples, [local_region]]
            Indices of training samples in the local region of each test sample
        """
        n_test = X_test_norm.shape[0]

        # a feature fraction above 1.0 is meaningless; clamp it
        if self.local_max_features > 1.0:
            warnings.warn(
                "Local max features greater than 1.0, reducing to 1.0")
            self.local_max_features = 1.0

        # one tally of neighbour occurrences per test sample; a Counter
        # remembers first-seen order, so the result order is stable
        neighbor_counts = [collections.Counter() for _ in range(n_test)]

        for _ in range(self.local_region_iterations):
            train = self.X_train_norm_
            n_features = train.shape[1]

            # draw a random feature subspace for this round
            subspace = generate_bagging_indices(
                self.random_state,
                bootstrap_features=False,
                n_features=n_features,
                min_features=int(n_features * self.local_min_features),
                max_features=int(n_features * self.local_max_features))

            # index the training data restricted to that subspace
            tree = KDTree(train[:, subspace])

            # nearest training samples of every test sample in the subspace
            _, neighbors = tree.query(X_test_norm[:, subspace],
                                      k=self.local_region_size)

            # record this round's neighbours for each test sample
            for tally, row in zip(neighbor_counts, neighbors):
                tally.update(row.tolist())

        # keep the indices seen more than local_region_threshold times
        return [
            [idx for idx, hits in tally.items()
             if hits > self.local_region_threshold]
            for tally in neighbor_counts
        ]
Beispiel #2
0
    def _get_local_region(self, X_test_norm):
        """Compute the local region of every test instance.

        For ``local_region_iterations`` rounds a random feature subspace is
        drawn, a KDTree is built on the training data in that subspace and
        the ``local_region_size`` nearest training samples of each test
        sample are pooled. Training indices that were pooled more than
        ``local_region_threshold`` times make up the local region.

        Side effect: ``self.local_max_features`` is reset to 1.0 (with a
        warning) if it is larger than 1.0.

        Parameters
        ----------
        X_test_norm : numpy array, shape (n_samples, n_features)
            Normalized test data

        Returns
        -------
        final_local_region_list : List of lists, shape [n_samples, [local_region]]
            Indices of training samples in the local region of each test sample
        """
        n_samples = X_test_norm.shape[0]

        # independent bucket of pooled neighbour indices per test sample
        pooled = [[] for _ in range(n_samples)]

        if self.local_max_features > 1.0:
            warnings.warn(
                "Local max features greater than 1.0, reducing to 1.0")
            self.local_max_features = 1.0

        for _ in range(self.local_region_iterations):
            dim = self.X_train_norm_.shape[1]
            lower = int(dim * self.local_min_features)
            upper = int(dim * self.local_max_features)

            # random feature subspace for this round
            features = generate_bagging_indices(self.random_state,
                                                bootstrap_features=False,
                                                n_features=dim,
                                                min_features=lower,
                                                max_features=upper)

            # neighbour search restricted to the chosen features
            train_view = self.X_train_norm_[:, features]
            test_view = X_test_norm[:, features]
            _, ind_arr = KDTree(train_view).query(
                test_view, k=self.local_region_size)

            # append this round's neighbours to each sample's bucket
            for bucket, row in zip(pooled, ind_arr):
                bucket.extend(row.tolist())

        # retain indices pooled more than local_region_threshold times
        final_local_region_list = []
        for bucket in pooled:
            frequency = collections.Counter(bucket)
            region = []
            for index, seen in frequency.items():
                if seen > self.local_region_threshold:
                    region.append(index)
            final_local_region_list.append(region)

        return final_local_region_list
Beispiel #3
0
    def _get_local_region(self, X_test_norm):
        """ Get local region for each test instance

        Nearest neighbours of every test sample are collected over
        ``local_region_iterations`` rounds, each in a random feature
        subspace (or in the full feature space when the min and max feature
        fractions coincide). A training index is kept when it occurs more
        than ``local_region_threshold`` times; if that leaves fewer than two
        indices for a test sample, the threshold is lowered one step at a
        time (down to "more than once") until at least two remain.

        Side effects: ``self.local_max_features`` is clamped to 1.0 and
        ``self.local_min_features`` is raised to 1.0 (each with a warning)
        when they fall outside the accepted range.

        Parameters
        ----------
        X_test_norm : numpy array, shape (n_samples, n_features)
            Normalized test data

        Returns
        -------
        final_local_region_list : List of lists, shape of [n_samples, [local_region]]
            Indices of training samples in the local region of each test sample

        Raises
        ------
        AssertionError
            If, for some test sample, fewer than two training indices occur
            more than once among the collected neighbours, so no admissible
            threshold yields a local region of at least two samples.
        """
        n_test = X_test_norm.shape[0]
        n_features = self.X_train_norm_.shape[1]

        if self.local_max_features > 1.0:
            warnings.warn(
                "Local max features greater than 1.0, reducing to 1.0")
            self.local_max_features = 1.0

        # NOTE(review): this raises the *fraction* to 1.0 (i.e. all
        # features), not the feature count to 1; with local_max_features
        # below 1.0 the min bound then exceeds the max bound - confirm intent.
        if n_features * self.local_min_features < 1:
            warnings.warn(
                "Local min features smaller than 1, increasing to 1.0")
            self.local_min_features = 1.0

        # The min/max comparison does not change between iterations, so
        # decide once (and warn once) instead of on every round.
        use_all_features = (
            self.local_max_features == self.local_min_features)
        if use_all_features:
            # if min and max are the same, then use all features
            all_features = range(0, n_features)
            warnings.warn("Local min features equals local max features; "
                          "use all features instead.")

        # One occurrence tally per test sample. Counter preserves first-seen
        # order, so the returned index order matches neighbour discovery
        # order, and counting incrementally avoids re-copying growing lists.
        neighbor_counts = [collections.Counter() for _ in range(n_test)]

        # perform multiple iterations
        for _ in range(self.local_region_iterations):

            if use_all_features:
                features = all_features
            else:
                # randomly generate feature subspaces
                features = generate_bagging_indices(
                    self.random_state,
                    bootstrap_features=False,
                    n_features=n_features,
                    min_features=int(n_features * self.local_min_features),
                    max_features=int(n_features * self.local_max_features))

            # build KDTree out of training subspace
            tree = KDTree(self.X_train_norm_[:, features])

            # Find neighbors of each test instance
            _, ind_arr = tree.query(X_test_norm[:, features],
                                    k=self.local_region_size)

            # add neighbors to the per-sample tallies
            for tally, row in zip(neighbor_counts, ind_arr):
                tally.update(row.tolist())

        # keep nearby points which occur more than local_region_threshold
        # times, relaxing the threshold when fewer than two points qualify
        final_local_region_list = []
        for tally in neighbor_counts:
            cutoff = self.local_region_threshold
            region = [idx for idx, hits in tally.items() if hits > cutoff]

            while len(region) < 2:
                cutoff -= 1
                # Raised explicitly (not via ``assert``) so the guard also
                # holds under ``python -O``; otherwise this loop could run
                # forever when fewer than two distinct neighbours exist.
                if cutoff <= 0:
                    raise AssertionError(
                        "Could not find a local region of at least two "
                        "training samples, even after relaxing "
                        "local_region_threshold.")
                region = [idx for idx, hits in tally.items()
                          if hits > cutoff]

            final_local_region_list.append(region)

        return final_local_region_list
        y_train_scores = clf.decision_scores_
        toeplitz_time.append(time.time() - start)
        toeplitz_roc.append(roc_auc_score(y, y_train_scores))
        toeplitz_prn.append(precision_n_scores(y, y_train_scores))

        X_transformer = PCA_sklearn(n_components=dim_new).fit_transform(X)
        start = time.time()
        clf.fit(X_transformer)
        y_train_scores = clf.decision_scores_
        pca_time.append(time.time() - start)
        pca_roc.append(roc_auc_score(y, y_train_scores))
        pca_prn.append(precision_n_scores(y, y_train_scores))

        selected_features = generate_bagging_indices(random_state=j,
                                                     bootstrap_features=False,
                                                     n_features=int(
                                                         X.shape[1]),
                                                     min_features=dim_new,
                                                     max_features=dim_new + 1)
        assert (dim_new == len(selected_features))
        X_transformer = X[:, selected_features]
        start = time.time()
        clf.fit(X_transformer)
        y_train_scores = clf.decision_scores_
        rp_time.append(time.time() - start)
        rp_roc.append(roc_auc_score(y, y_train_scores))
        rp_prn.append(precision_n_scores(y, y_train_scores))

    print()
    print(mat_file_name)
    print('original', np.round(np.average(original_time), decimals=4),
          np.round(np.average(original_roc), decimals=4),