def data_partition_random(dataset_dir, dataset_name, label_n_per_class):
    # Random data partition: sample `label_n_per_class` labelled nodes per class
    # for training, then draw fixed-size test and validation sets from the rest.
    test_set_n = 1000
    val_set_n = 500
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels = load_data(
        dataset_name, dataset_dir)
    n = len(y_train)
    k = len(y_train[0])
    labels = one_hot_labels.argmax(axis=1)

    train_index_new = np.zeros(k * label_n_per_class).astype(int)
    train_mask_new = np.zeros(n).astype(bool)
    val_mask_new = np.zeros(n).astype(bool)
    test_mask_new = np.zeros(n).astype(bool)
    y_train_new = np.zeros((n, k))
    y_val_new = np.zeros((n, k))
    y_test_new = np.zeros((n, k))

    # Collect the node indices of every class, then sample the training nodes.
    class_index_dict = {}
    for i in range(k):
        class_index_dict[i] = np.where(labels == i)[0]
    for i in range(k):
        class_index = class_index_dict[i]
        train_index_one_class = np.random.choice(class_index, label_n_per_class, replace=False)
        train_index_new[i * label_n_per_class:(i + 1) * label_n_per_class] = train_index_one_class

    # Test and validation nodes are drawn (without replacement) from the nodes
    # that were not selected for training.
    train_index_new = list(train_index_new)
    test_val_potential_index = list(set(range(n)) - set(train_index_new))
    test_index_new = np.random.choice(test_val_potential_index, test_set_n, replace=False)
    potential_val_index = list(set(test_val_potential_index) - set(test_index_new))
    val_index_new = np.random.choice(potential_val_index, val_set_n, replace=False)

    train_mask_new[train_index_new] = True
    val_mask_new[val_index_new] = True
    test_mask_new[test_index_new] = True

    # Rebuild the one-hot label matrices restricted to each split.
    for i in train_index_new:
        y_train_new[i][labels[i]] = 1
    for i in val_index_new:
        y_val_new[i][labels[i]] = 1
    for i in test_index_new:
        y_test_new[i][labels[i]] = 1

    return adj, features, y_train_new, y_val_new, y_test_new, train_mask_new, val_mask_new, test_mask_new, one_hot_labels
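# Illustrative usage sketch (not part of the original code): the dataset
# directory and name below are placeholder assumptions for a Planetoid-style
# dataset readable by load_data.
#
#     adj, features, y_train, y_val, y_test, \
#         train_mask, val_mask, test_mask, one_hot_labels = data_partition_random(
#             dataset_dir="data", dataset_name="cora", label_n_per_class=20)
#     # train_mask then selects 20 random nodes per class; 1000 test and 500
#     # validation nodes are drawn from the remaining nodes.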
def data_partition_fix(dataset_dir, dataset_name, label_n_per_class):
    # Data partition using the official split from Kipf's original GCN.
    if dataset_name == 'movie':
        adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels = load_movie(
            dataset_name, dataset_dir)
    else:
        adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels = load_data(
            dataset_name, dataset_dir)
    k = len(y_train[0])
    train_set_index = np.where(train_mask == True)[0]
    # print("train_set_index", train_set_index)
    train_set_labels = labels[train_set_index]
    # print(labels)

    # Positions of each class within the training subset. This assumes the
    # official split places the training nodes at the first indices of the
    # graph (as in Kipf's Planetoid splits), so positions within the training
    # subset coincide with global node indices.
    train_node_index = {}
    for i in range(k):
        train_node_index[i] = np.where(train_set_labels[:, i] == 1)[0]
    for i in range(k):
        # Keep only the first `label_n_per_class` training nodes of class i
        # and hide the labels of the rest.
        hide_index = train_node_index[i][label_n_per_class:]
        print("The training set index for class {} is {}".format(
            i, train_node_index[i][0:label_n_per_class]))
        train_mask[hide_index] = False
        y_train[hide_index] = 0
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels
def data_partition_fix(dataset_dir, dataset_name, label_n_per_class):
    # Data partition using the official split from Kipf's original GCN.
    # NOTE: this definition shadows the `data_partition_fix` variant above.
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels = load_data(
        dataset_name, dataset_dir)
    k = len(y_train[0])
    train_set_index = np.where(train_mask == True)[0]
    labels = one_hot_labels.argmax(axis=1)
    train_set_labels = labels[train_set_index]
    train_node_index = {}
    for i in range(k):
        train_node_index[i] = np.where(train_set_labels == i)[0]
    for i in range(k):
        # Keep only the first `label_n_per_class` training nodes of class i
        # and hide the labels of the rest.
        hide_index = train_node_index[i][label_n_per_class:]
        print("The training set index for class {} is {}".format(
            i, train_node_index[i][0:label_n_per_class]))
        train_mask[hide_index] = False
        y_train[hide_index] = 0
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels
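# Illustrative usage sketch (dataset name and label budget are assumptions, not
# settings from the original experiments): reduce the official split to 5
# labelled nodes per class.
#
#     (adj, features, y_train, y_val, y_test,
#      train_mask, val_mask, test_mask, one_hot_labels) = data_partition_fix(
#         dataset_dir="data", dataset_name="citeseer", label_n_per_class=5)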
def get_data(dataset_name, random_split, split_sizes, random_split_seed,
             add_val=True, add_val_seed=1, p_val=0.5,
             adjacency_filename=None,
             use_knn_graph=False, knn_metric=None, knn_k=None,
             balanced_split=False, samples_per_class=20):
    # Load features X, labels y and adjacency A, then build train/val/test masks.
    if dataset_name == 'polblogs':
        X, y, _ = load_polblogs()
        A = load_adjacency_from_file("datasets/polblogs/polblogs_graph.gpickle")
        X = sp.sparse.lil_matrix(X)
    else:
        A, X, y_train, y_val, y_test, mask_train, mask_val, mask_test, y = load_data(dataset_name, "datasets")

    if use_knn_graph:
        tf.logging.info("Using KNN graph")
        A = kneighbors_graph(X, knn_k, metric=knn_metric)
        # consistent with our implementation of only considering the lower triangular
        A = sp.sparse.tril(A, k=-1)
        A = A + np.transpose(A)

    if adjacency_filename:
        A = load_adjacency_from_file(adjacency_filename)
        # consistent with our implementation of only considering the lower triangular
        A = sp.sparse.tril(A, k=-1)
        A = A + np.transpose(A)

    n, d = X.shape
    _, k = y.shape

    tf.logging.info("Dataset has {} samples, dimensionality {}".format(n, d))
    tf.logging.info("Targets belong to {} classes".format(k))

    if random_split:
        print("Using a random split")
        random_state = np.random.RandomState(random_split_seed)
        split = recursive_stratified_shuffle_split(
            sizes=split_sizes, random_state=random_state
        )
        indices = list(split(X, y))
    elif balanced_split:
        indices = balanced_data_split(X, y, samples_per_class, random_state=random_split_seed)
    else:  # fixed split
        indices = load_split(dataset_name)

    tf.logging.info(
        "Split resulted in "
        "{} training, "
        "{} validation, "
        "{} test samples.".format(*map(len, indices))
    )

    [mask_train, mask_val, mask_test] = masks = list(
        map(partial(indices_to_mask, size=n), indices)
    )

    y_train, y_val, y_test = map(partial(mask_values, y), masks)

    # A = A.toarray()

    if add_val:
        # Optionally move part of the validation nodes into the training set.
        mask_train, mask_val = add_val_to_train(mask_train, mask_val, add_val_seed, p_val)
        masks = [mask_train, mask_val, mask_test]
        y_train, y_val, y_test = map(partial(mask_values, y), masks)

    print("**********************************************************************************************")
    print("train size: {} val size: {} test size: {}".format(np.sum(mask_train), np.sum(mask_val), np.sum(mask_test)))
    print("**********************************************************************************************")

    return X, y, A, mask_train, mask_val, mask_test, y_train, y_val, y_test


# def get_data_incompatible(dataset_name, random_split, split_sizes, random_split_seed,
#                           add_val=True, add_val_seed=1, p_val=0.5,
#                           adjacency_filename=None,
#                           balanced_split=False, samples_per_class=20):
#
#     tf.logging.info("Loading '{}' dataset...".format(dataset_name))
#     loader = DATASET_LOADERS[dataset_name]
#     X, y, A = loader()
#
#     if adjacency_filename:
#         A = load_adjacency_from_file(adjacency_filename)
#
#     X = normalize(X, norm="l1", axis=1)
#
#     n, d = X.shape
#     _, k = y.shape
#
#     tf.logging.info("Dataset has {} samples, dimensionality {}".format(n, d))
#     tf.logging.info("Targets belong to {} classes".format(k))
#
#     if random_split:
#         random_state = np.random.RandomState(random_split_seed)
#         split = recursive_stratified_shuffle_split(
#             sizes=split_sizes, random_state=random_state
#         )
#         indices = list(split(X, y))
#     elif balanced_split:
#         indices = balanced_data_split(X, y, samples_per_class, random_state=random_split_seed)
#     else:  # fixed split
#         indices = load_split(dataset_name)
#
#     tf.logging.info(
#         "Split resulted in "
#         "{} training, "
#         "{} validation, "
#         "{} test samples.".format(*map(len, indices))
#     )
#
#     [mask_train, mask_val, mask_test] = masks = list(
#         map(partial(indices_to_mask, size=n), indices)
#     )
#
#     y_train, y_val, y_test = map(partial(mask_values, y), masks)
#
#     # A = A.toarray()
#
#     if add_val:
#         mask_train, mask_val = add_val_to_train(mask_train, mask_val, add_val_seed, p_val)
#
#     print("**********************************************************************************************")
#     print("train size: {} val size: {} test size: {}".format(np.sum(mask_train), np.sum(mask_val), np.sum(mask_test)))
#     print("**********************************************************************************************")
#
#     return X, y, A, mask_train, mask_val, mask_test, y_train, y_val, y_test
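# Illustrative usage sketch (argument values, in particular the split_sizes
# format, are assumptions rather than settings from the original experiments):
# load a dataset with a random stratified split and keep validation separate.
#
#     X, y, A, mask_train, mask_val, mask_test, y_train, y_val, y_test = get_data(
#         dataset_name="cora",
#         random_split=True,
#         split_sizes=[140, 500],   # assumed format expected by recursive_stratified_shuffle_split
#         random_split_seed=0,
#         add_val=False,
#     )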