def preprocess_neighbors(self, rebuild=False, save=True):
        """Load or compute nearest-neighbor tables for the train and test sets.

        Cached arrays under ``self.selected_dir`` are reused when ``rebuild``
        is false and every file needed by the cached path is present.
        Otherwise a ``NearestNeighbors`` model is fitted on ``self.train_X``,
        the neighbor graphs of the training and test data are converted with
        ``csr_to_impact_matrix`` and, if ``save`` is true, written to disk.

        Args:
            rebuild: When true, ignore any cached files and recompute.
            save: When true, persist the fitted model and computed arrays.

        Returns:
            A tuple ``(neighbors, neighbors_weight, test_neighbors)``.
            ``self.test_neighbors`` is also set to ``test_neighbors``.
        """
        base_dir = self.selected_dir
        neighbors_model_path = os.path.join(base_dir, "neighbors_model" + ".pkl")
        neighbors_path = os.path.join(base_dir, "neighbors" + ".npy")
        neighbors_weight_path = os.path.join(base_dir,
                                             "neighbors_weight" + ".npy")
        test_neighbors_path = os.path.join(base_dir, "test_neighbors" + ".npy")
        test_neighbors_weight_path = os.path.join(
            base_dir, "test_neighbors_weight" + ".npy")

        # Every file that the cached branch reads must exist; the weight file
        # is included so that a partially written cache triggers a rebuild
        # instead of failing inside np.load.
        cache_files = (neighbors_model_path, neighbors_path,
                       neighbors_weight_path, test_neighbors_path)
        if not rebuild and all(os.path.exists(p) for p in cache_files):
            print("neighbors and neighbor_weight exist!!!")
            neighbors = np.load(neighbors_path)
            neighbors_weight = np.load(neighbors_weight_path)
            test_neighbors = np.load(test_neighbors_path)
            self.test_neighbors = test_neighbors
            return neighbors, neighbors_weight, test_neighbors

        print("neighbors and neighbor_weight  do not exist, preprocessing!")
        train_num = self.train_X.shape[0]
        train_y = np.array(self.train_y)
        test_num = self.test_X.shape[0]
        # Never ask for more neighbors than there are training samples.
        max_neighbors = min(len(train_y), 200)
        print("data shape: {}, labeled_num: {}".format(str(self.train_X.shape),
                                                       sum(train_y != -1)))
        # n_neighbors is passed by keyword: recent scikit-learn releases make
        # the constructor arguments keyword-only.
        nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(self.train_X)
        print("nn construction finished!")
        # Query with the fitted training data itself, so each sample's own
        # entry is part of its neighbor list.
        neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                                  max_neighbors,
                                                  mode="distance")
        test_neighbors_result = nn_fit.kneighbors_graph(self.test_X,
                                                        max_neighbors,
                                                        mode="distance")
        print("neighbor_result got!")
        neighbors, neighbors_weight = csr_to_impact_matrix(
            neighbor_result, train_num, max_neighbors)
        test_neighbors, test_neighbors_weight = csr_to_impact_matrix(
            test_neighbors_result, test_num, max_neighbors)
        self.test_neighbors = test_neighbors

        print("preprocessed neighbors got!")

        # save neighbors information
        if save:
            pickle_save_data(neighbors_model_path, nn_fit)
            np.save(neighbors_path, neighbors)
            np.save(neighbors_weight_path, neighbors_weight)
            np.save(test_neighbors_path, test_neighbors)
            np.save(test_neighbors_weight_path, test_neighbors_weight)
        return neighbors, neighbors_weight, test_neighbors
Exemple #2
0
    def add_data(self, added_idxs, train_pred, cls):
        """Append new samples to the training set and extend the affinity graph.

        The new samples are appended to ``self.train_idx`` and their positions
        are appended to ``self.rest_idxs``. Neighbor tables are loaded from
        pickles in ``self.selected_dir`` when both exist; otherwise they are
        recomputed via ``self._preprocess_neighbors`` and pickled. Each new
        node is then linked symmetrically to those of its neighbors (columns
        1..5 of the ``k_neighbors=10`` table) that are either outside the range
        of ``train_pred`` or predicted as ``cls``.

        Args:
            added_idxs: Indices of the samples to add; flattened to 1-D.
            train_pred: Predicted labels, indexed by position.
            cls: Class label a predicted neighbor must match to be linked.
        """
        self.actions.append("add-unlabeled")
        added_idxs = np.array(added_idxs).reshape(-1)

        # Positions the new samples will occupy once appended to train_idx.
        first_new_pos = len(self.train_idx)
        new_positions = list(
            range(first_new_pos, first_new_pos + len(added_idxs)))

        merged_rest = self.rest_idxs.tolist() + new_positions
        self.train_idx = np.hstack((self.train_idx, added_idxs))
        self.rest_idxs = np.array(merged_rest)
        # The returned map is not used here; the call is kept in case it has
        # side effects -- TODO confirm against get_new_id_map.
        self.get_new_id_map()

        pre_num = self.affinity_matrix.shape[0]
        add_num = len(added_idxs)
        total_num = pre_num + add_num

        train_cache = os.path.join(self.selected_dir, "add_data_neighbors.pkl")
        test_cache = os.path.join(self.selected_dir,
                                  "add_data_test_neighbors.pkl")
        have_cache = os.path.exists(train_cache) and os.path.exists(test_cache)
        if not have_cache:
            neighbors, test_neighbors = self._preprocess_neighbors(
                rebuild=True, save=False)
            pickle_save_data(train_cache, neighbors)
            pickle_save_data(test_cache, test_neighbors)
        else:
            neighbors = pickle_load_data(train_cache)
            test_neighbors = pickle_load_data(test_cache)

        self.neighbors = neighbors
        neighbors = self.get_neighbors(k_neighbors=10)
        self.test_neighbors = test_neighbors

        # Dense scratch matrix: old graph in the top-left corner, new rows and
        # columns start out empty.
        grown = np.zeros((total_num, total_num))
        grown[:pre_num, :pre_num] = self.affinity_matrix.toarray()

        pred_len = len(train_pred)
        for new_node in range(pre_num, total_num):
            # Column 0 is skipped; the next five columns are the candidates.
            for nbr in neighbors[new_node, 1:6]:
                if nbr >= pred_len or train_pred[nbr] == cls:
                    grown[new_node, nbr] = 1
                    grown[nbr, new_node] = 1

        self.affinity_matrix = self.correct_unconnected_nodes(
            sparse.csr_matrix(grown))
Exemple #3
0
    def add_data_oct(self, added_idxs, train_pred, cls):
        """Append new samples and extend the affinity graph (OCT variant).

        ``self.train_idx`` grows by ``added_idxs`` and ``self.rest_idxs`` is
        reset to every position of the enlarged training set. Neighbor tables
        are loaded from pickles in ``self.selected_dir`` when both exist,
        otherwise recomputed through ``self._preprocess_neighbors`` and
        pickled. Each new node scans columns 1..99 of its neighbor row and is
        linked symmetrically to neighbors that are either outside the range of
        ``train_pred`` or predicted as ``cls``; scanning stops once four links
        have been made.

        Args:
            added_idxs: Indices of the samples to add; flattened to 1-D.
            train_pred: Predicted labels, indexed by position.
            cls: Class label a predicted neighbor must match to be linked.
        """
        added_idxs = np.array(added_idxs).reshape(-1)
        self.train_idx = np.hstack((self.train_idx, added_idxs))
        self.rest_idxs = np.array(range(len(self.train_idx)))

        pre_num = self.affinity_matrix.shape[0]
        add_num = len(added_idxs)
        total_num = pre_num + add_num

        train_cache = os.path.join(self.selected_dir, "add_data_neighbors.pkl")
        test_cache = os.path.join(self.selected_dir,
                                  "add_data_test_neighbors.pkl")
        have_cache = os.path.exists(train_cache) and os.path.exists(test_cache)
        if not have_cache:
            neighbors, test_neighbors = self._preprocess_neighbors(
                rebuild=True, save=False)
            pickle_save_data(train_cache, neighbors)
            pickle_save_data(test_cache, test_neighbors)
        else:
            neighbors = pickle_load_data(train_cache)
            # NOTE(review): the loaded test neighbors are not stored on self
            # here, unlike in add_data -- confirm this is intended.
            test_neighbors = pickle_load_data(test_cache)
        self.neighbors = neighbors

        # Dense scratch matrix: old graph in the top-left corner, new rows and
        # columns start out empty.
        grown = np.zeros((total_num, total_num))
        grown[:pre_num, :pre_num] = self.affinity_matrix.toarray()

        pred_len = len(train_pred)
        for new_node in range(pre_num, total_num):
            linked = 0
            for nbr in neighbors[new_node, 1:100]:
                if nbr >= pred_len or train_pred[nbr] == cls:
                    grown[new_node, nbr] = 1
                    grown[nbr, new_node] = 1
                    linked += 1
                    # Stop after the fourth accepted neighbor.
                    if linked > 3:
                        break

        self.affinity_matrix = self.correct_unconnected_nodes(
            sparse.csr_matrix(grown))
Exemple #4
0
    def _load_data(self):
        """Load the processed dataset and choose the training subset.

        Reads the pickled dataset under ``self.data_root`` and populates
        ``X``, ``y``, the train/valid/test and labeled/unlabeled index sets,
        ``class_names``, ``add_info`` and an empty ``actions`` list. It then
        resolves the selection directory
        ``labeled-<n>.total-<m>.seed-<s>`` and either reloads a previously
        saved selection from ``idx_info.pkl`` or draws a new one
        (class-balanced labeled samples plus random unlabeled samples) and
        saves it.

        Both paths finish in the same state: the STL relabel adjustment is
        applied after the selection is known, and ``self.rest_idxs`` is set to
        every position of ``self.train_idx``. The saved ``idx_info.pkl``
        always holds the selection *before* the STL adjustment.

        Raises:
            AssertionError: If the selection sizes are still unset after
                falling back to the defaults in ``add_info``.
        """
        processed_data_filename = os.path.join(self.data_root,
                                               config.processed_dataname)
        processed_data = pickle_load_data(processed_data_filename)
        self.processed_data = processed_data
        self.X = processed_data[config.X_name]
        self.y = np.array(processed_data[config.y_name]).astype(int)
        if self.dataname.lower() == "oct":
            # Known wrong label in the OCT data.
            self.y[564] = 3

        self.train_idx = processed_data[config.train_idx_name]
        self.valid_idx = processed_data[config.valid_idx_name]
        self.test_idx = processed_data[config.test_idx_name]
        self.labeled_idx = processed_data[config.labeled_idx_name]
        self.unlabeled_idx = processed_data[config.unlabeled_idx_name]
        self.class_names = processed_data[config.class_name]
        self.add_info = processed_data[config.add_info_name]
        self.actions = []

        # NOTE(review): self.seed is only assigned here when both selection
        # sizes are unset; otherwise it must already exist on the instance.
        if self.selected_labeled_num is None and self.selected_total_num is None:
            self.selected_labeled_num = self.add_info.get(
                "default_selected_labeled_num", None)
            self.selected_total_num = self.add_info.get(
                "default_selected_total_num", None)
            self.seed = self.add_info.get("default_seed", 123)

        assert (self.selected_labeled_num is not None
                and self.selected_total_num is not None)
        dir_name = "labeled-" + str(self.selected_labeled_num) + \
                   ".total-" + str(self.selected_total_num) + ".seed-" + str(self.seed)
        logger.info(dir_name)
        dir_path = os.path.join(self.data_root, dir_name)
        check_dir(dir_path)
        self.selected_dir = dir_path
        idx_info_path = os.path.join(dir_path, "idx_info.pkl")

        if os.path.exists(idx_info_path):
            logger.info("idx info exists in: {}".format(idx_info_path))
            idx_info = pickle_load_data(idx_info_path)
            self.train_idx = idx_info["train_idx"]
            self.selected_labeled_idx = idx_info["selected_labeled_idx"]
        else:
            # NOTE(review): sampling below uses the global NumPy RNG and is
            # not seeded in this method -- confirm seeding happens elsewhere.
            # Coerce once so boolean-mask indexing works for list inputs too.
            labeled_idx = np.array(self.labeled_idx)
            if len(labeled_idx) == self.selected_labeled_num:
                selected_labeled_idx = labeled_idx
                selected_labeled_idx.sort()
            else:
                # Class-balanced selection: an equal share per class, with
                # the remainder spread over randomly chosen classes.
                class_num = len(self.class_names)
                num_per_class = self.selected_labeled_num // class_num
                per_class_quota = (np.ones(class_num) *
                                   num_per_class).astype(int)
                rest_num = self.selected_labeled_num - num_per_class * class_num
                if rest_num > 0:
                    lucky = np.random.choice(class_num, rest_num,
                                             replace=False)
                    per_class_quota[lucky] += 1
                chosen = []
                labeled_y = self.y[labeled_idx]
                for i in range(class_num):
                    idx_in_class = labeled_idx[labeled_y == i]
                    picked = np.random.choice(idx_in_class,
                                              per_class_quota[i],
                                              replace=False)
                    chosen.extend(picked.tolist())
                selected_labeled_idx = np.array(chosen)
                selected_labeled_idx.sort()

            # Fill the rest of the training set with unlabeled samples.
            unlabeled_num = self.selected_total_num - self.selected_labeled_num
            selected_unlabeled_idx = np.random.choice(self.unlabeled_idx,
                                                      unlabeled_num,
                                                      replace=False)
            train_idx = np.hstack(
                (selected_labeled_idx, selected_unlabeled_idx))
            train_idx.sort()
            self.train_idx = train_idx
            self.selected_labeled_idx = selected_labeled_idx
            idx_info = {
                "selected_labeled_idx": selected_labeled_idx,
                "train_idx": train_idx
            }
            pickle_save_data(idx_info_path, idx_info)

        if self.dataname.lower() == "stl":
            # Relabel: swap two labeled samples for two others, addressed by
            # their position in train_idx. Applied on both paths so the first
            # run matches every later run that reloads idx_info.pkl.
            removed_idx = [self.train_idx[39], self.train_idx[33]]
            added_idx = [self.train_idx[9081], self.train_idx[7427]]
            kept_idx = [
                old_idx for old_idx in self.selected_labeled_idx
                if old_idx not in removed_idx
            ]
            self.selected_labeled_idx = np.array(added_idx + kept_idx)

        # Previously only set when idx_info.pkl already existed, leaving the
        # attribute undefined after a first run.
        self.rest_idxs = np.array(range(len(self.train_idx)))
Exemple #5
0
    def _preprocess_neighbors(self, rebuild=False, save=True):
        """Load or compute the neighbor tables for the current model step.

        File names under ``self.selected_dir`` carry the suffix
        ``-step<self.model.step>``. Cached arrays are reused when ``rebuild``
        and the module-level ``DEBUG`` flag are both false and every file the
        cached path reads is present. Otherwise a ``NearestNeighbors`` model
        is fitted on the full training data, the neighbor graphs of the
        training and test data are converted with
        ``self.csr_to_impact_matrix`` and, if ``save`` is true, written out.

        Side effects: sets ``self.neighbors``, ``self.neighbors_weight`` and
        ``self.test_neighbors``; on a rebuild ``self.max_neighbors`` is also
        capped at the number of training samples.

        Args:
            rebuild: When true, ignore any cached files and recompute.
            save: When true, persist the fitted model and computed arrays.

        Returns:
            A tuple ``(self.neighbors, self.test_neighbors)`` on both the
            cached and the rebuild path.
        """
        step_tag = "-step" + str(self.model.step)
        base_dir = self.selected_dir
        neighbors_model_path = os.path.join(
            base_dir, "neighbors_model" + step_tag + ".pkl")
        neighbors_path = os.path.join(base_dir, "neighbors" + step_tag + ".npy")
        neighbors_weight_path = os.path.join(
            base_dir, "neighbors_weight" + step_tag + ".npy")
        test_neighbors_path = os.path.join(
            base_dir, "test_neighbors" + step_tag + ".npy")
        test_neighbors_weight_path = os.path.join(
            base_dir, "test_neighbors_weight" + step_tag + ".npy")

        # Every file the cached branch reads must exist; the weight file is
        # included so a partial cache triggers a rebuild instead of failing
        # inside np.load.
        cache_files = (neighbors_model_path, neighbors_path,
                       neighbors_weight_path, test_neighbors_path)
        if not rebuild and all(os.path.exists(p)
                               for p in cache_files) and not DEBUG:
            logger.info("neighbors and neighbor_weight exist!!!")
            self.neighbors = np.load(neighbors_path)
            self.neighbors_weight = np.load(neighbors_weight_path)
            self.test_neighbors = np.load(test_neighbors_path)
            # Return the same tuple as the rebuild path so callers can
            # unpack the result regardless of which path ran.
            return self.neighbors, self.test_neighbors

        logger.info("neighbors and neighbor_weight "
                    "do not exist, preprocessing!")
        train_X = self.get_full_train_X()
        train_num = train_X.shape[0]
        train_y = np.array(self.get_full_train_label())
        test_X = self.get_test_X()
        test_num = test_X.shape[0]
        # Never ask for more neighbors than there are training samples.
        self.max_neighbors = min(len(train_y), self.max_neighbors)
        logger.info("data shape: {}, labeled_num: {}".format(
            str(train_X.shape), sum(train_y != -1)))
        # n_neighbors is passed by keyword: recent scikit-learn releases make
        # the constructor arguments keyword-only.
        nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(train_X)
        logger.info("nn construction finished!")
        # Query with the fitted training data itself, so each sample's own
        # entry is part of its neighbor list.
        neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                                  self.max_neighbors,
                                                  mode="distance")
        test_neighbors_result = nn_fit.kneighbors_graph(test_X,
                                                        self.max_neighbors,
                                                        mode="distance")
        logger.info("neighbor_result got!")
        self.neighbors, self.neighbors_weight = self.csr_to_impact_matrix(
            neighbor_result, train_num, self.max_neighbors)
        self.test_neighbors, test_neighbors_weight = self.csr_to_impact_matrix(
            test_neighbors_result, test_num, self.max_neighbors)

        logger.info("preprocessed neighbors got!")

        # save neighbors information
        if save:
            pickle_save_data(neighbors_model_path, nn_fit)
            np.save(neighbors_path, self.neighbors)
            np.save(neighbors_weight_path, self.neighbors_weight)
            np.save(test_neighbors_path, self.test_neighbors)
            np.save(test_neighbors_weight_path, test_neighbors_weight)
        return self.neighbors, self.test_neighbors