def load_data(data_prefix, dataset_str, precalc):
    """Return the required data formats for GCN models."""
    (num_data, train_adj, full_adj, feats, train_feats, test_feats, labels,
     train_data, val_data, test_data) = utils.load_graphsage_data(
         data_prefix, dataset_str)
    visible_data = train_data

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_data, :] = labels[train_data, :]
    y_val[val_data, :] = labels[val_data, :]
    y_test[test_data, :] = labels[test_data, :]

    train_mask = utils.sample_mask(train_data, labels.shape[0])
    val_mask = utils.sample_mask(val_data, labels.shape[0])
    test_mask = utils.sample_mask(test_data, labels.shape[0])

    if precalc:
        train_feats = train_adj.dot(feats)
        train_feats = np.hstack((train_feats, feats))
        test_feats = full_adj.dot(feats)
        test_feats = np.hstack((test_feats, feats))

    return (train_adj, full_adj, train_feats, test_feats, y_train, y_val,
            y_test, train_mask, val_mask, test_mask, train_data, val_data,
            test_data, num_data, visible_data)
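# `utils.sample_mask` is used above (and in the snippets below) to turn index
# lists into boolean node masks. A minimal sketch consistent with that usage;
# the real helper in `utils` may differ in details, so this is illustrative only.
def sample_mask_sketch(idx, length):
    """Return a boolean mask of `length` entries that is True at positions `idx`."""
    mask = np.zeros(length)
    mask[idx] = 1
    return np.array(mask, dtype=bool)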
def full_load_citation(dataset_str="cora"):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("dataset/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            objects.append(pkl.load(f, encoding='latin1'))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("dataset/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph).
        # Find isolated nodes and add them as zero-vecs into the right position.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, labels, train_mask, val_mask, test_mask
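# `parse_index_file` is assumed to read ind.<dataset>.test.index, which holds one
# integer node id per line. A minimal sketch consistent with that usage (the
# actual helper may differ):
def parse_index_file_sketch(filename):
    """Parse a file containing one integer index per line."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index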
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

with open("./aifb.pickle", 'rb') as f:
    data = pkl.load(f, encoding='latin1')

A = data['A']
y = data['y']
train_idx = data['train_idx']
test_idx = data['test_idx']

# Get dataset splits
y_train, y_val, y_test, idx_train, idx_val, idx_test = utils.get_splits(
    y, train_idx, test_idx, False)
print(y_train.shape)

train_mask = utils.sample_mask(idx_train, y.shape[0])
print(train_mask.shape)
val_mask = utils.sample_mask(idx_val, y.shape[0])
test_mask = utils.sample_mask(idx_test, y.shape[0])

# print(train_mask)
# print(val_mask)
print(idx_train)
print(idx_val)
print(idx_test)
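# The boolean masks built above are typically consumed by masked metrics during
# training and evaluation. A minimal NumPy sketch of a masked accuracy, purely
# for illustration (assumes `preds` and `labels` are dense one-hot style arrays):
def masked_accuracy_sketch(preds, labels, mask):
    """Accuracy computed only over the nodes where `mask` is True."""
    correct = np.argmax(preds, axis=1) == np.argmax(labels, axis=1)
    return float(np.mean(correct[mask]))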
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 0.0, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_string('save_name', './mymodel.ckpt', 'Path for saving the model.')
# flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')

# Load data
training_inputs, training_data_values, test_inputs, test_data_values = data.load_big_data()
print(training_inputs.shape)
print(training_data_values.shape)

train_mask = utils.sample_mask(np.arange(training_inputs.shape[0]), training_inputs.shape[0])
test_mask = utils.sample_mask(np.arange(test_inputs.shape[0]), test_inputs.shape[0])
# val_mask = utils.sample_mask(val_indexes, y.shape[0])

y_train = np.zeros(training_data_values.shape)
# y_val = np.zeros(y.shape)
y_test = np.zeros(test_data_values.shape)
y_train[train_mask, :] = training_data_values[train_mask, :]
# y_val[val_mask, :] = y[val_mask, :]
y_test[test_mask, :] = test_data_values[test_mask, :]

# Some preprocessing
# features = utils.preprocess_features(training_inputs)
# features_test = utils.preprocess_features(test_inputs)
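# The commented-out `utils.preprocess_features` calls above usually denote
# row-normalization of the feature matrix. A minimal sketch under that assumption
# (expects a scipy.sparse matrix via the `sp` alias; the real utility may also
# convert the result to a tuple representation):
def preprocess_features_sketch(features):
    """Row-normalize a sparse feature matrix."""
    rowsum = np.array(features.sum(1), dtype=np.float64)
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.  # rows that sum to zero stay all-zero
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(features)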
# Time-sensitive parameters: gamma weights the centrality score, while alpha and
# beta weight the entropy and representativeness scores below.
gamma = np.random.beta(1, 1.005 - basef ** epoch)
alpha = beta = (1 - gamma) / 2

# Construct feed dictionary
feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
feed_dict.update({placeholders['dropout']: FLAGS.dropout})

# Training step
outs = sess.run([model.opt_op, model.loss, model.accuracy, model.predict()],
                feed_dict=feed_dict)

# Choose the next instance to label based on entropy, representativeness and centrality
if len(idx_train) < NL:
    entropy = sc.stats.entropy(outs[3].T)
    train_mask = sample_mask(idx_train, labels.shape[0])
    # entropy[train_mask + val_mask + test_mask] = -100
    entrperc = np.asarray([perc(entropy, i) for i in range(len(entropy))])
    kmeans = KMeans(n_clusters=NCL, random_state=0).fit(outs[3])
    ed = euclidean_distances(outs[3], kmeans.cluster_centers_)
    ed_score = np.min(ed, axis=1)
    # The larger ed_score is, the farther the node is from the cluster centers
    # and the less representative it is.
    edprec = np.asarray([percd(ed_score, i) for i in range(len(ed_score))])
    finalweight = alpha * entrperc + beta * edprec + gamma * cenperc
    # Exclude nodes that are already labeled or reserved for validation/test.
    finalweight[train_mask + val_mask + test_mask] = -100
    select = np.argmax(finalweight)
    idx_train.append(select)
    train_mask = sample_mask(idx_train, labels.shape[0])
    y_train = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
else:
    print('finish select!')
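# `perc` and `percd` are assumed to be percentile-rank helpers: `perc` rewards
# large values (e.g. high entropy), `percd` rewards small values (e.g. a small
# distance to the nearest cluster centre). A plausible sketch, not the original
# implementation:
def perc_sketch(values, i):
    """Fraction of entries strictly smaller than values[i]."""
    return np.sum(values < values[i]) / float(len(values))

def percd_sketch(values, i):
    """Fraction of entries strictly larger than values[i]."""
    return np.sum(values > values[i]) / float(len(values))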