def preprocess_neighbors(self, rebuild=False, save=True):
    """Load cached nearest-neighbor arrays, or compute (and optionally save) them.

    Cache files live in ``self.selected_dir``. The cached arrays are used only
    when ``rebuild`` is falsy and every file that will be read back (plus the
    pickled model) is present; otherwise the neighbors are recomputed from
    ``self.train_X`` / ``self.test_X``.

    Args:
        rebuild: When truthy, ignore any cached files and recompute.
        save: When truthy, write the fitted model and the computed arrays
            back to ``self.selected_dir``.

    Returns:
        Tuple ``(neighbors, neighbors_weight, test_neighbors)``.

    Side effects:
        Sets ``self.test_neighbors``.
    """
    neighbors_model_path = os.path.join(self.selected_dir,
                                        "neighbors_model" + ".pkl")
    neighbors_path = os.path.join(self.selected_dir, "neighbors" + ".npy")
    neighbors_weight_path = os.path.join(self.selected_dir,
                                         "neighbors_weight" + ".npy")
    test_neighbors_path = os.path.join(self.selected_dir,
                                       "test_neighbors" + ".npy")
    test_neighbors_weight_path = os.path.join(
        self.selected_dir, "test_neighbors_weight" + ".npy")

    # Every file loaded below must exist. The weight file used to be loaded
    # without being checked, which crashed on a partially written cache.
    required_paths = (neighbors_model_path, neighbors_path,
                      neighbors_weight_path, test_neighbors_path)
    if not rebuild and all(os.path.exists(p) for p in required_paths):
        print("neighbors and neighbor_weight exist!!!")
        neighbors = np.load(neighbors_path)
        neighbors_weight = np.load(neighbors_weight_path)
        test_neighbors = np.load(test_neighbors_path)
        self.test_neighbors = test_neighbors
        return neighbors, neighbors_weight, test_neighbors

    print("neighbors and neighbor_weight do not exist, preprocessing!")
    train_num = self.train_X.shape[0]
    train_y = np.array(self.train_y)
    test_num = self.test_X.shape[0]
    # Never request more neighbors than there are training samples.
    max_neighbors = min(len(train_y), 200)
    print("data shape: {}, labeled_num: {}".format(str(self.train_X.shape),
                                                   sum(train_y != -1)))

    # n_neighbors is passed by keyword: recent scikit-learn releases reject
    # positional constructor arguments. The per-query neighbor count is
    # overridden in the kneighbors_graph calls below anyway.
    nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(self.train_X)
    print("nn construction finished!")

    # Querying with the fitted data itself (rather than X=None) keeps each
    # sample in its own neighbor list.
    neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                              max_neighbors,
                                              mode="distance")
    test_neighbors_result = nn_fit.kneighbors_graph(self.test_X,
                                                    max_neighbors,
                                                    mode="distance")
    print("neighbor_result got!")

    # csr_to_impact_matrix is defined elsewhere; presumably it converts the
    # sparse distance graph into dense (index, weight) arrays -- TODO confirm.
    neighbors, neighbors_weight = csr_to_impact_matrix(
        neighbor_result, train_num, max_neighbors)
    test_neighbors, test_neighbors_weight = csr_to_impact_matrix(
        test_neighbors_result, test_num, max_neighbors)
    self.test_neighbors = test_neighbors
    print("preprocessed neighbors got!")

    # save neighbors information
    if save:
        pickle_save_data(neighbors_model_path, nn_fit)
        np.save(neighbors_path, neighbors)
        np.save(neighbors_weight_path, neighbors_weight)
        np.save(test_neighbors_path, test_neighbors)
        np.save(test_neighbors_weight_path, test_neighbors_weight)
    return neighbors, neighbors_weight, test_neighbors
def add_data(self, added_idxs, train_pred, cls):
    """Append samples to the training set and wire them into the affinity graph.

    Records an ``"add-unlabeled"`` action, extends ``self.train_idx`` and
    ``self.rest_idxs``, obtains neighbor arrays (from the pickle cache in
    ``self.selected_dir`` when both files exist, otherwise by rebuilding),
    and grows ``self.affinity_matrix`` by one row/column per added sample.

    Args:
        added_idxs: Indices of the samples to add; flattened to 1-D.
        train_pred: Predicted labels, indexed by training position.
        cls: Label value a neighbor must have in ``train_pred`` for a new
            sample to be linked to it.
    """
    self.actions.append("add-unlabeled")
    added_idxs = np.array(added_idxs).reshape(-1)

    # Positions the new samples occupy once appended to train_idx.
    first_new = len(self.train_idx)
    new_positions = list(range(first_new, first_new + len(added_idxs)))
    self.rest_idxs = np.array(self.rest_idxs.tolist() + new_positions)
    self.train_idx = np.hstack((self.train_idx, added_idxs))
    # Return value is not needed here; the call is kept in case it has side
    # effects on the instance.
    self.get_new_id_map()

    pre_num = self.affinity_matrix.shape[0]
    add_num = len(added_idxs)
    total_num = pre_num + add_num

    train_cache = os.path.join(self.selected_dir, "add_data_neighbors.pkl")
    test_cache = os.path.join(self.selected_dir,
                              "add_data_test_neighbors.pkl")
    if not (os.path.exists(train_cache) and os.path.exists(test_cache)):
        neighbors, test_neighbors = self._preprocess_neighbors(rebuild=True,
                                                               save=False)
        pickle_save_data(train_cache, neighbors)
        pickle_save_data(test_cache, test_neighbors)
    else:
        neighbors = pickle_load_data(train_cache)
        test_neighbors = pickle_load_data(test_cache)
    self.neighbors = neighbors
    neighbors = self.get_neighbors(k_neighbors=10)
    self.test_neighbors = test_neighbors

    # Dense copy of the old graph in the top-left corner of the grown one.
    grown = np.zeros((total_num, total_num))
    grown[:pre_num, :pre_num] = self.affinity_matrix.toarray()

    pred_count = len(train_pred)
    for row in range(pre_num, total_num):
        # Column 0 is skipped (presumably the sample itself -- TODO confirm);
        # the next five neighbors are link candidates.
        for col in neighbors[row, 1:6]:
            # Link to other newly added samples (no prediction available)
            # or to samples predicted as `cls`.
            if col >= pred_count or train_pred[col] == cls:
                grown[row, col] = 1
                grown[col, row] = 1

    self.affinity_matrix = self.correct_unconnected_nodes(
        sparse.csr_matrix(grown))
def add_data_oct(self, added_idxs, train_pred, cls):
    """Append samples to the training set and link them into the affinity graph.

    Variant of ``add_data`` that resets ``self.rest_idxs`` to every training
    position, scans up to 99 neighbors per new sample, and stops after four
    links have been created for that sample.

    Args:
        added_idxs: Indices of the samples to add; flattened to 1-D.
        train_pred: Predicted labels, indexed by training position.
        cls: Label value a neighbor must have in ``train_pred`` for a new
            sample to be linked to it.
    """
    added_idxs = np.array(added_idxs).reshape(-1)
    self.train_idx = np.hstack((self.train_idx, added_idxs))
    self.rest_idxs = np.array(range(len(self.train_idx)))

    pre_num = self.affinity_matrix.shape[0]
    add_num = len(added_idxs)
    total_num = pre_num + add_num

    add_data_neighbors_path = os.path.join(self.selected_dir,
                                           "add_data_neighbors.pkl")
    add_data_test_neighbors_path = os.path.join(
        self.selected_dir, "add_data_test_neighbors.pkl")
    if os.path.exists(add_data_neighbors_path) and os.path.exists(
            add_data_test_neighbors_path):
        neighbors = pickle_load_data(add_data_neighbors_path)
        test_neighbors = pickle_load_data(add_data_test_neighbors_path)
    else:
        neighbors, test_neighbors = self._preprocess_neighbors(
            rebuild=True, save=False)
        pickle_save_data(add_data_neighbors_path, neighbors)
        pickle_save_data(add_data_test_neighbors_path, test_neighbors)
    self.neighbors = neighbors
    # Keep the instance state identical on both branches: the rebuild branch
    # already sets self.test_neighbors, the cached branch previously left it
    # stale.
    self.test_neighbors = test_neighbors

    new_affinity_matrix = np.zeros((total_num, total_num))
    new_affinity_matrix[:pre_num, :pre_num] = self.affinity_matrix.toarray()

    pred_count = len(train_pred)
    for i in range(pre_num, total_num):
        # Column 0 is skipped (presumably the sample itself -- TODO confirm).
        count = 0
        for idx in neighbors[i, 1:100]:
            # Link to other newly added samples (no prediction available)
            # or to samples predicted as `cls`.
            if idx >= pred_count or train_pred[idx] == cls:
                new_affinity_matrix[i, idx] = 1
                new_affinity_matrix[idx, i] = 1
                count += 1
                # Four links per new sample are enough.
                if count > 3:
                    break

    new_affinity_matrix = sparse.csr_matrix(new_affinity_matrix)
    self.affinity_matrix = self.correct_unconnected_nodes(
        new_affinity_matrix)
def _load_data(self):
    """Load the processed dataset and choose the training subset.

    Reads the pickled dataset from ``self.data_root``, copies its fields onto
    the instance, then either restores a previously saved index selection
    from ``idx_info.pkl`` or draws a new one (labeled samples plus randomly
    chosen unlabeled samples) and saves it.

    Side effects:
        Sets ``processed_data``, ``X``, ``y``, ``train_idx``, ``valid_idx``,
        ``test_idx``, ``labeled_idx``, ``unlabeled_idx``, ``class_names``,
        ``add_info``, ``actions``, ``selected_dir``, ``selected_labeled_idx``
        and ``rest_idxs``; may set ``selected_labeled_num``,
        ``selected_total_num`` and ``seed`` from dataset defaults; creates
        the selection directory via ``check_dir`` and may write
        ``idx_info.pkl``.

    Raises:
        AssertionError: If the labeled/total selection sizes are still
            unknown after applying the dataset defaults.
    """
    processed_data_filename = os.path.join(self.data_root,
                                           config.processed_dataname)
    processed_data = pickle_load_data(processed_data_filename)
    self.processed_data = processed_data
    self.X = processed_data[config.X_name]
    self.y = np.array(processed_data[config.y_name]).astype(int)
    if self.dataname.lower() == "oct":
        # wrong label
        self.y[564] = 3
    self.train_idx = processed_data[config.train_idx_name]
    self.valid_idx = processed_data[config.valid_idx_name]
    self.test_idx = processed_data[config.test_idx_name]
    self.labeled_idx = processed_data[config.labeled_idx_name]
    self.unlabeled_idx = processed_data[config.unlabeled_idx_name]
    self.class_names = processed_data[config.class_name]
    self.add_info = processed_data[config.add_info_name]
    self.actions = []

    # Fall back to the dataset's defaults only when neither size was given.
    if self.selected_labeled_num is None and self.selected_total_num is None:
        self.selected_labeled_num = self.add_info.get(
            "default_selected_labeled_num", None)
        self.selected_total_num = self.add_info.get(
            "default_selected_total_num", None)
        self.seed = self.add_info.get("default_seed", 123)

    # produce unlabeled data
    assert (self.selected_labeled_num is not None
            and self.selected_total_num is not None)
    dir_name = "labeled-" + str(self.selected_labeled_num) + \
        ".total-" + str(self.selected_total_num) + ".seed-" + str(self.seed)
    logger.info(dir_name)
    dir_path = os.path.join(self.data_root, dir_name)
    check_dir(dir_path)
    self.selected_dir = dir_path

    idx_info_path = os.path.join(dir_path, "idx_info.pkl")
    if os.path.exists(idx_info_path):
        logger.info("idx info exists in: {}".format(idx_info_path))
        idx_info = pickle_load_data(idx_info_path)
        self.train_idx = idx_info["train_idx"]
        self.selected_labeled_idx = idx_info["selected_labeled_idx"]
        if self.dataname.lower() == "stl":
            # relabel: swap two labeled samples for two others, new ones
            # first, then the surviving original labeled samples in order.
            removed_idx = [self.train_idx[39], self.train_idx[33]]
            added_idx = [self.train_idx[9081], self.train_idx[7427]]
            kept_idx = [
                old_idx for old_idx in self.selected_labeled_idx
                if old_idx not in removed_idx
            ]
            self.selected_labeled_idx = np.array(added_idx + kept_idx)
        self.rest_idxs = np.array(range(len(self.train_idx)))
        return

    # NOTE(review): this guard is reconstructed from source whose layout was
    # flattened; confirm that the "use every labeled index" shortcut is
    # meant to be live rather than commented out.
    if len(self.labeled_idx) == self.selected_labeled_num:
        selected_labeled_idx = np.array(self.labeled_idx)
        selected_labeled_idx.sort()
    else:
        # class balance selection: an equal share per class, with the
        # remainder handed to randomly chosen classes (one extra each).
        class_num = len(self.class_names)
        num_per_class = self.selected_labeled_num // class_num
        selected_labeled_num_in_each_class = np.full(class_num,
                                                     num_per_class,
                                                     dtype=int)
        rest_num = self.selected_labeled_num - num_per_class * class_num
        if rest_num > 0:
            idx = np.random.choice(class_num, rest_num, replace=False)
            selected_labeled_num_in_each_class[idx] += 1

        selected_labeled_idx = []
        labeled_y = self.y[self.labeled_idx]
        for i in range(class_num):
            labeled_idx_in_this_class = self.labeled_idx[labeled_y == i]
            selected_labeled_idx_in_this_class = np.random.choice(
                labeled_idx_in_this_class,
                selected_labeled_num_in_each_class[i],
                replace=False)
            selected_labeled_idx = (
                selected_labeled_idx
                + selected_labeled_idx_in_this_class.tolist())
        selected_labeled_idx = np.array(selected_labeled_idx)
        selected_labeled_idx.sort()

    # get unlabeled idx
    rest_selected_labeled_num = \
        self.selected_total_num - self.selected_labeled_num
    rest_selected_labeled_idx = np.random.choice(self.unlabeled_idx,
                                                 rest_selected_labeled_num,
                                                 replace=False)
    train_idx = np.hstack((selected_labeled_idx, rest_selected_labeled_idx))
    train_idx.sort()
    self.train_idx = train_idx
    self.selected_labeled_idx = selected_labeled_idx
    idx_info = {
        "selected_labeled_idx": selected_labeled_idx,
        "train_idx": train_idx
    }
    pickle_save_data(idx_info_path, idx_info)
    # Mirror the cached path so a first run and a reload leave the same
    # attributes on the instance.
    self.rest_idxs = np.array(range(len(self.train_idx)))
def _preprocess_neighbors(self, rebuild=False, save=True):
    """Load or compute the nearest-neighbor arrays for the current model step.

    Cache files live in ``self.selected_dir`` and carry the suffix
    ``-step<self.model.step>``. The cache is used only when ``rebuild`` and
    the module-level ``DEBUG`` flag are both falsy and every file that will
    be read back (plus the pickled model) is present.

    Args:
        rebuild: When truthy, ignore cached files and recompute.
        save: When truthy, write the fitted model and computed arrays back
            to ``self.selected_dir``.

    Returns:
        Tuple ``(self.neighbors, self.test_neighbors)``, on both the cached
        and the recomputed path.

    Side effects:
        Sets ``self.neighbors``, ``self.neighbors_weight`` and
        ``self.test_neighbors``; on the recompute path also clamps
        ``self.max_neighbors`` to the number of training samples.
    """
    step_tag = "-step" + str(self.model.step)

    def cache_path(stem, ext):
        # <selected_dir>/<stem>-step<N><ext>
        return os.path.join(self.selected_dir, stem + step_tag + ext)

    neighbors_model_path = cache_path("neighbors_model", ".pkl")
    neighbors_path = cache_path("neighbors", ".npy")
    neighbors_weight_path = cache_path("neighbors_weight", ".npy")
    test_neighbors_path = cache_path("test_neighbors", ".npy")
    test_neighbors_weight_path = cache_path("test_neighbors_weight", ".npy")

    # Every file loaded below must exist. The weight file used to be loaded
    # without being checked, which crashed on a partially written cache.
    required_paths = (neighbors_model_path, neighbors_path,
                      neighbors_weight_path, test_neighbors_path)
    if not rebuild and not DEBUG and all(
            os.path.exists(p) for p in required_paths):
        logger.info("neighbors and neighbor_weight exist!!!")
        self.neighbors = np.load(neighbors_path)
        self.neighbors_weight = np.load(neighbors_weight_path)
        self.test_neighbors = np.load(test_neighbors_path)
        # Same return shape as the recompute path, so callers that unpack
        # the result work regardless of which path ran.
        return self.neighbors, self.test_neighbors

    logger.info("neighbors and neighbor_weight "
                "do not exist, preprocessing!")
    train_X = self.get_full_train_X()
    train_num = train_X.shape[0]
    train_y = np.array(self.get_full_train_label())
    test_X = self.get_test_X()
    test_num = test_X.shape[0]
    # Never request more neighbors than there are training samples.
    self.max_neighbors = min(len(train_y), self.max_neighbors)
    logger.info("data shape: {}, labeled_num: {}".format(
        str(train_X.shape), sum(train_y != -1)))

    # n_neighbors is passed by keyword: recent scikit-learn releases reject
    # positional constructor arguments. The per-query neighbor count is
    # overridden in the kneighbors_graph calls below anyway.
    nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(train_X)
    logger.info("nn construction finished!")

    # Querying with the fitted data itself (rather than X=None) keeps each
    # sample in its own neighbor list.
    neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                              self.max_neighbors,
                                              mode="distance")
    test_neighbors_result = nn_fit.kneighbors_graph(test_X,
                                                    self.max_neighbors,
                                                    mode="distance")
    logger.info("neighbor_result got!")

    self.neighbors, self.neighbors_weight = self.csr_to_impact_matrix(
        neighbor_result, train_num, self.max_neighbors)
    self.test_neighbors, test_neighbors_weight = self.csr_to_impact_matrix(
        test_neighbors_result, test_num, self.max_neighbors)
    logger.info("preprocessed neighbors got!")

    # save neighbors information
    if save:
        pickle_save_data(neighbors_model_path, nn_fit)
        np.save(neighbors_path, self.neighbors)
        np.save(neighbors_weight_path, self.neighbors_weight)
        np.save(test_neighbors_path, self.test_neighbors)
        np.save(test_neighbors_weight_path, test_neighbors_weight)
    return self.neighbors, self.test_neighbors