Esempio n. 1
0
    def _construct_graph(self, n_neighbor=None, weight=False):
        """Build the symmetric, binary kNN affinity matrix and cache it.

        The neighbor tables are refreshed through ``_preprocess_neighbors``;
        the first ``n_neighbor`` columns of ``self.neighbors`` are then turned
        into a CSR matrix, symmetrized with its transpose, and every stored
        entry is reset to 1.

        Args:
            n_neighbor: number of neighbor columns to use per instance.
                ``None`` (or a value larger than the number of available
                columns) means "use every available column".
            weight: when true, the intermediate matrix is filled from
                ``self.neighbors_weight`` instead of ones. The final
                binarization overwrites all stored values with 1, so the
                returned matrix carries no weights either way.

        Returns:
            The ``scipy.sparse.csr_matrix`` of shape
            ``(instance_num, instance_num)``; also stored on
            ``self.affinity_matrix``. ``self.train_y`` is updated as a side
            effect.
        """
        # Make sure self.neighbors / self.neighbors_weight are available.
        self._preprocess_neighbors()

        neighbors = self.neighbors
        neighbors_weight = self.neighbors_weight
        instance_num, available = neighbors.shape[0], neighbors.shape[1]
        # ``None`` used to crash in ``i * n_neighbor``; too-large values made
        # indptr disagree with the number of sliced indices.
        if n_neighbor is None or n_neighbor > available:
            n_neighbor = available

        train_y = np.array(self.get_train_label())
        self.train_y = train_y
        print("train_y", train_y.shape)

        # kNN graph in CSR form: row i owns the slice
        # [i * n_neighbor, (i + 1) * n_neighbor) of ``indices``.
        indptr = np.arange(instance_num + 1) * n_neighbor
        logger.info("get indptr")
        indices = neighbors[:, :n_neighbor].reshape(-1)
        logger.info("get indices")
        if not weight:
            logger.info("get data")
            data = np.ones(indices.shape[0])
        else:
            data = neighbors_weight[:, :n_neighbor].reshape(-1)
        logger.info("get data in connectivity")
        affinity_matrix = sparse.csr_matrix((data, indices, indptr),
                                            shape=(instance_num, instance_num))

        # Symmetrize, then binarize: keep the sparsity pattern of A + A.T and
        # set every stored value to 1.
        affinity_matrix = affinity_matrix + affinity_matrix.T
        affinity_matrix = sparse.csr_matrix(
            (np.ones(len(affinity_matrix.data)),
             affinity_matrix.indices, affinity_matrix.indptr),
            shape=(instance_num, instance_num))

        # NOTE: post-processing via modify_graph / correct_unconnected_nodes
        # is intentionally not applied here.
        logger.info("affinity_matrix construction finished!!")

        self.affinity_matrix = affinity_matrix

        return affinity_matrix
Esempio n. 2
0
 def get_graph(self, n_neighbor=None, rebuild=False):
     """Return a copy of the affinity matrix, building it first if needed.

     Also logs the number of connected components and how many instances
     sit in components that contain no labeled instance (label > -1).

     Args:
         n_neighbor: forwarded to ``_construct_graph`` when the graph has
             to be (re)built.
         rebuild: pass ``True`` to force reconstruction even if a matrix
             is already cached.

     Returns:
         A copy of ``self.affinity_matrix``.
     """
     if self.affinity_matrix is None or rebuild is True:
         self._construct_graph(n_neighbor)
     n_components, labels = sparse.csgraph.connected_components(
         csgraph=self.affinity_matrix, return_labels=True)
     logger.info("n_components: {}".format(n_components))

     train_y = np.asarray(self.get_train_label())
     # Mark every component that owns at least one labeled instance, then
     # select the instances whose component is unmarked. One vectorised
     # pass replaces the former per-component Python loop.
     component_has_label = np.zeros(n_components, dtype=bool)
     component_has_label[labels[train_y > -1]] = True
     orphan_mask = ~component_has_label[labels]
     unp = np.asarray(self.get_rest_idxs())[orphan_mask]
     logger.info(
         "connected components without labeled data - instance num: {}".
         format(len(unp)))
     return self.affinity_matrix.copy()
Esempio n. 3
0
 def correct_unconnected_nodes(self, affinity_matrix):
     """Add edges until ``_find_unconnected_nodes`` reports no node.

     Works on a copy of ``affinity_matrix``. In each round one unconnected
     node is drawn at random and linked (value 1) to the first of its
     100 nearest neighbors that is not itself unconnected; the set of
     unconnected nodes is then recomputed.

     The global NumPy RNG is re-seeded with 123 so the result is
     reproducible; note that this affects any later ``np.random`` use.

     Args:
         affinity_matrix: sparse matrix to correct (not modified).

     Returns:
         The corrected copy. If no remaining unconnected node has a usable
         neighbor, the partially corrected copy is returned instead of
         looping forever.
     """
     logger.info("begin correct unconnected nodes...")
     np.random.seed(123)
     affinity_matrix = affinity_matrix.copy()
     labeled_ids = np.where(self.get_train_label() > -1)[0]
     iter_cnt = 0
     neighbors = self.get_neighbors(k_neighbors=100)
     while True:
         unconnected_ids = self._find_unconnected_nodes(
             affinity_matrix, labeled_ids)
         if unconnected_ids.shape[0] == 0:
             logger.info(
                 "No correcnted nodes after {} iteration. Correction finished."
                 .format(iter_cnt))
             return affinity_matrix

         # Set membership is O(1); testing against the ndarray was O(n)
         # for every candidate neighbor.
         unconnected_set = set(unconnected_ids.tolist())
         tried = set()
         linked = False
         while not linked:
             corrected_id = np.random.choice(unconnected_ids)
             tried.add(int(corrected_id))
             for neighbor_id in neighbors[corrected_id]:
                 if neighbor_id not in unconnected_set:
                     iter_cnt += 1
                     # NOTE: item assignment on a CSR matrix is slow but
                     # keeps the matrix type expected by the caller.
                     affinity_matrix[corrected_id, neighbor_id] = 1
                     linked = True
                     break
             if not linked and len(tried) == len(unconnected_set):
                 # Every unconnected node was tried and none has a
                 # connected neighbor: further draws cannot succeed.
                 logger.warning(
                     "{} unconnected nodes cannot be corrected; stopping "
                     "after {} iteration.".format(
                         len(unconnected_set), iter_cnt))
                 return affinity_matrix
Esempio n. 4
0
    def update_graph(self, deleted_idxs):
        """Shrink the affinity matrix to the remaining instances.

        Each remaining index is mapped to its rank inside the sorted union
        of the remaining and the just-deleted indices; those ranks select
        the rows and columns of ``self.affinity_matrix`` to keep. The
        neighbor tables are refreshed afterwards.

        Args:
            deleted_idxs: indices removed since the matrix was last
                updated. Any iterable of ints (list, tuple, ndarray) is
                accepted; duplicates are ignored. Must be disjoint from
                ``self.get_rest_idxs()``.

        Raises:
            AssertionError: if ``deleted_idxs`` overlaps the remaining
                indices.
        """
        logger.info("begin update graph according to editing info")
        rest_idxs = np.asarray(self.get_rest_idxs())
        remove_idxs = self.get_removed_idxs()
        # Explicit dtype keeps an empty ``deleted_idxs`` from promoting the
        # union to float.
        deleted = np.asarray(list(deleted_idxs), dtype=rest_idxs.dtype)
        assert np.intersect1d(rest_idxs, deleted).size == 0, (
            "deleted_idxs must not overlap the remaining indices")

        # Sorted, de-duplicated index set the matrix is assumed to be laid
        # out on before this update.
        previous_idxs = np.union1d(rest_idxs, deleted)
        positions = np.searchsorted(previous_idxs, rest_idxs)

        logger.info("total len: {}".format(len(rest_idxs) + len(remove_idxs)))
        self.affinity_matrix = self.affinity_matrix[positions, :]
        self.affinity_matrix = self.affinity_matrix[:, positions]
        # update neighbors info
        self._preprocess_neighbors()

        logger.info("affinity_matrix shape after updating: {}".format(
            str(self.affinity_matrix.shape)))
Esempio n. 5
0
    def _load_data(self):
        """Load the processed dataset and fix the training subset.

        Reads the pickled dataset from ``self.data_root``, copies its
        fields onto ``self`` and then determines ``self.train_idx`` and
        ``self.selected_labeled_idx``:

        * if ``idx_info.pkl`` already exists in the selection directory
          (``labeled-<n>.total-<m>.seed-<s>``), the stored indices are
          reused (with a hard-coded relabeling for the "stl" dataset);
        * otherwise a class-balanced labeled subset plus a random
          unlabeled subset is drawn with ``np.random`` and saved there.

        ``self.rest_idxs`` is initialised to ``0..len(train_idx)-1`` on
        both paths.

        NOTE(review): ``np.random`` is not seeded here although the
        directory name encodes ``self.seed``; presumably the caller seeds
        it - confirm.

        Raises:
            AssertionError: if the labeled/total selection sizes are still
                unset after consulting the dataset defaults.
        """
        processed_data_filename = os.path.join(self.data_root,
                                               config.processed_dataname)
        processed_data = pickle_load_data(processed_data_filename)
        self.processed_data = processed_data
        self.X = processed_data[config.X_name]
        self.y = np.array(processed_data[config.y_name]).astype(int)
        if self.dataname.lower() == "oct":
            # Manual fix of a known wrong label.
            self.y[564] = 3

        self.train_idx = processed_data[config.train_idx_name]
        self.valid_idx = processed_data[config.valid_idx_name]
        self.test_idx = processed_data[config.test_idx_name]
        self.labeled_idx = processed_data[config.labeled_idx_name]
        self.unlabeled_idx = processed_data[config.unlabeled_idx_name]
        self.class_names = processed_data[config.class_name]
        self.add_info = processed_data[config.add_info_name]
        self.actions = []

        # Fall back to the dataset's own defaults only when the caller
        # specified neither size.
        if self.selected_labeled_num is None and self.selected_total_num is None:
            self.selected_labeled_num = self.add_info.get(
                "default_selected_labeled_num", None)
            self.selected_total_num = self.add_info.get(
                "default_selected_total_num", None)
            self.seed = self.add_info.get("default_seed", 123)

        assert (self.selected_labeled_num is not None
                and self.selected_total_num is not None), (
            "selected_labeled_num and selected_total_num must be set")
        dir_name = "labeled-" + str(self.selected_labeled_num) + \
                   ".total-" + str(self.selected_total_num) + ".seed-" + str(self.seed)
        logger.info(dir_name)
        dir_path = os.path.join(self.data_root, dir_name)
        check_dir(dir_path)
        self.selected_dir = dir_path
        idx_info_path = os.path.join(dir_path, "idx_info.pkl")

        if os.path.exists(idx_info_path):
            logger.info("idx info exists in: {}".format(idx_info_path))
            idx_info = pickle_load_data(idx_info_path)
            self.train_idx = idx_info["train_idx"]
            self.selected_labeled_idx = idx_info["selected_labeled_idx"]
            if self.dataname.lower() == "stl":
                # Relabel: swap two labeled instances for two others.
                # The positions are hard-coded for this dataset.
                removed_idx = [self.train_idx[39], self.train_idx[33]]
                added_idx = [self.train_idx[9081], self.train_idx[7427]]
                kept_idx = [
                    old_idx for old_idx in self.selected_labeled_idx
                    if old_idx not in removed_idx
                ]
                self.selected_labeled_idx = np.array(added_idx + kept_idx)

            self.rest_idxs = np.arange(len(self.train_idx))
            return

        # Boolean-mask indexing below needs an ndarray.
        labeled_idx = np.asarray(self.labeled_idx)
        if len(labeled_idx) == self.selected_labeled_num:
            # Every labeled instance is used.
            selected_labeled_idx = np.sort(labeled_idx)
        else:
            # Class-balanced selection: an equal share per class, with the
            # remainder handed to randomly chosen classes.
            class_num = len(self.class_names)
            num_per_class = self.selected_labeled_num // class_num
            selected_labeled_num_in_each_class = np.full(
                class_num, num_per_class, dtype=int)
            rest_num = self.selected_labeled_num - num_per_class * class_num
            if rest_num > 0:
                idx = np.random.choice(class_num, rest_num, replace=False)
                selected_labeled_num_in_each_class[idx] += 1
            labeled_y = self.y[labeled_idx]
            picked = []
            for i in range(class_num):
                labeled_idx_in_this_class = labeled_idx[labeled_y == i]
                picked.extend(
                    np.random.choice(labeled_idx_in_this_class,
                                     selected_labeled_num_in_each_class[i],
                                     replace=False).tolist())
            selected_labeled_idx = np.array(picked)
            selected_labeled_idx.sort()

        # Fill the rest of the training set with random unlabeled samples.
        rest_selected_labeled_num = self.selected_total_num - self.selected_labeled_num
        rest_selected_labeled_idx = np.random.choice(self.unlabeled_idx,
                                                     rest_selected_labeled_num,
                                                     replace=False)
        train_idx = np.hstack(
            (selected_labeled_idx, rest_selected_labeled_idx))
        train_idx.sort()
        self.train_idx = train_idx
        self.selected_labeled_idx = selected_labeled_idx
        # Previously only the cached path set rest_idxs, so a first run left
        # the object in a different state than later runs.
        self.rest_idxs = np.arange(len(self.train_idx))
        idx_info = {
            "selected_labeled_idx": selected_labeled_idx,
            "train_idx": train_idx
        }
        pickle_save_data(idx_info_path, idx_info)
Esempio n. 6
0
    def _preprocess_neighbors(self, rebuild=False, save=True):
        """Compute (or load from cache) the nearest-neighbor tables.

        Cache files live in ``self.selected_dir`` and are keyed by
        ``self.model.step``. When the model, train-neighbor, train-weight
        and test-neighbor files all exist and neither ``rebuild`` nor the
        module-level ``DEBUG`` flag is set, the tables are loaded from
        disk. Otherwise a ``NearestNeighbors`` index is fitted on the full
        training features and queried for both train and test instances.

        Side effects: sets ``self.neighbors``, ``self.neighbors_weight``
        and ``self.test_neighbors``; on recomputation also caps
        ``self.max_neighbors`` at the number of training instances.

        Args:
            rebuild: force recomputation even if cache files exist.
            save: write the freshly computed results to the cache files.

        Returns:
            Tuple ``(self.neighbors, self.test_neighbors)`` on both the
            cached and the recomputed path.
        """
        step_suffix = "-step" + str(self.model.step)

        def _cache_path(stem, ext):
            # e.g. <selected_dir>/neighbors-step<k>.npy
            return os.path.join(self.selected_dir, stem + step_suffix + ext)

        neighbors_model_path = _cache_path("neighbors_model", ".pkl")
        neighbors_path = _cache_path("neighbors", ".npy")
        neighbors_weight_path = _cache_path("neighbors_weight", ".npy")
        test_neighbors_path = _cache_path("test_neighbors", ".npy")
        test_neighbors_weight_path = _cache_path("test_neighbors_weight",
                                                 ".npy")

        # The weight file is loaded below, so it must be part of the check;
        # otherwise a partially written cache raises on np.load.
        required_files = (neighbors_model_path, neighbors_path,
                          neighbors_weight_path, test_neighbors_path)
        if not rebuild and not DEBUG and \
                all(os.path.exists(path) for path in required_files):
            logger.info("neighbors and neighbor_weight exist!!!")
            self.neighbors = np.load(neighbors_path)
            self.neighbors_weight = np.load(neighbors_weight_path)
            self.test_neighbors = np.load(test_neighbors_path)
            return self.neighbors, self.test_neighbors

        logger.info("neighbors and neighbor_weight "
                    "do not exist, preprocessing!")
        train_X = self.get_full_train_X()
        train_num = train_X.shape[0]
        train_y = np.array(self.get_full_train_label())
        test_X = self.get_test_X()
        test_num = test_X.shape[0]
        # Cannot ask for more neighbors than there are training instances.
        self.max_neighbors = min(len(train_y), self.max_neighbors)
        logger.info("data shape: {}, labeled_num: {}".format(
            str(train_X.shape), int(np.sum(train_y != -1))))
        # n_neighbors must be passed by keyword: recent scikit-learn
        # releases reject it as a positional argument.
        nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(train_X)
        logger.info("nn construction finished!")
        # Querying with the fitted data itself (rather than X=None) keeps
        # each instance in its own neighbor list.
        neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                                  self.max_neighbors,
                                                  mode="distance")
        test_neighbors_result = nn_fit.kneighbors_graph(test_X,
                                                        self.max_neighbors,
                                                        mode="distance")
        logger.info("neighbor_result got!")
        # NOTE(review): csr_to_impact_matrix presumably turns each sparse
        # distance graph into dense (index, weight) tables - confirm.
        self.neighbors, self.neighbors_weight = self.csr_to_impact_matrix(
            neighbor_result, train_num, self.max_neighbors)
        self.test_neighbors, test_neighbors_weight = self.csr_to_impact_matrix(
            test_neighbors_result, test_num, self.max_neighbors)

        logger.info("preprocessed neighbors got!")

        # save neighbors information
        if save:
            pickle_save_data(neighbors_model_path, nn_fit)
            np.save(neighbors_path, self.neighbors)
            np.save(neighbors_weight_path, self.neighbors_weight)
            np.save(test_neighbors_path, self.test_neighbors)
            np.save(test_neighbors_weight_path, test_neighbors_weight)
        return self.neighbors, self.test_neighbors
Esempio n. 7
0
 def remove_instance(self, idxs):
     """Drop the given indices from ``self.rest_idxs``.

     Records a "remove-node" action when ``idxs`` is non-empty, rebuilds
     ``self.rest_idxs`` without the removed entries and appends ``idxs``
     to ``self.removed_idxs``.

     Args:
         idxs: sized collection of indices to remove.
     """
     if len(idxs) > 0:
         self.actions.append("remove-node")
     # Build the lookup once: O(1) membership instead of scanning ``idxs``
     # for every remaining index.
     dropped = set(idxs)
     self.rest_idxs = np.array(
         [i for i in self.rest_idxs if i not in dropped])
     self.removed_idxs += idxs
     logger.info("rest data: {}".format(len(self.rest_idxs)))