def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  print('Bröther may i have some self-lööps')
  n_nodes = FLAGS.n_nodes
  n_clusters = FLAGS.n_clusters
  train_size = FLAGS.train_size
  data_clean, data_dirty, labels = overlapping_gaussians(n_nodes, n_clusters)
  graph_clean = construct_knn_graph(data_clean).todense().A1.reshape(
      n_nodes, n_nodes)

  train_mask = np.zeros(n_nodes, dtype=bool)
  train_mask[np.random.choice(
      np.arange(n_nodes), int(n_nodes * train_size), replace=False)] = True
  test_mask = ~train_mask

  print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
  print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

  input_features = tf.keras.layers.Input(shape=(2,))
  input_features_corrupted = tf.keras.layers.Input(shape=(2,))
  input_graph = tf.keras.layers.Input((n_nodes,))

  encoder = [GCN(64), GCN(32)]
  model = deep_graph_infomax(
      [input_features, input_features_corrupted, input_graph], encoder)

  def loss(model, x, y, training):
    _, y_ = model(x, training=training)
    return loss_object(y_true=y, y_pred=y_)

  def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
      loss_value = loss(model, inputs, targets, training=True)
      for loss_internal in model.losses:
        loss_value += loss_internal
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

  labels_dgi = tf.concat([tf.zeros([n_nodes, 1]), tf.ones([n_nodes, 1])], 0)
  loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

  for epoch in range(FLAGS.n_epochs):
    # Corrupt the features by shuffling a decreasing fraction of the rows.
    data_corrupted = data_dirty.copy()
    perc_shuffle = np.linspace(1, 0.05, FLAGS.n_epochs)[epoch]
    # perc_shuffle = 1
    rows_shuffle = np.random.choice(
        np.arange(n_nodes), int(n_nodes * perc_shuffle))
    data_corrupted_tmp = data_corrupted[rows_shuffle]
    np.random.shuffle(data_corrupted_tmp)
    data_corrupted[rows_shuffle] = data_corrupted_tmp

    loss_value, grads = grad(model, [data_dirty, data_corrupted, graph_clean],
                             labels_dgi)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print('epoch %d, loss: %0.4f, shuffle %0.2f%%' %
          (epoch, loss_value.numpy(), 100 * perc_shuffle))

  representations, _ = model([data_dirty, data_corrupted, graph_clean],
                             training=False)
  representations = representations.numpy()

  # Evaluate the representations with a logistic-regression probe on held-out nodes.
  clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
  clf.fit(representations[train_mask], labels[train_mask])
  clusters = clf.predict(representations[test_mask])
  print(
      'NMI:',
      normalized_mutual_info_score(
          labels[test_mask], clusters, average_method='arithmetic'))
  print('Accuracy:', 100 * accuracy_score(labels[test_mask], clusters))
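
# The main() above reads its hyperparameters from absl FLAGS. A minimal sketch of
# the flag setup and entry point such a script would need is shown below; the flag
# names are taken from the FLAGS references above, but the default values and help
# strings are illustrative assumptions, not the settings used in the experiments.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer('n_nodes', 1000, 'Number of synthetic nodes to generate.')
flags.DEFINE_integer('n_clusters', 2, 'Number of ground-truth clusters.')
flags.DEFINE_float('train_size', 0.2, 'Fraction of nodes used to fit the probe.')
flags.DEFINE_float('learning_rate', 0.01, 'Adam learning rate.')
flags.DEFINE_integer('n_epochs', 1000, 'Number of training epochs.')

if __name__ == '__main__':
  app.run(main)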
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  print('Starting', format_filename())

  if FLAGS.load_strategy == 'schur':
    adjacency, features, labels, label_mask = load_npz_to_sparse_graph(
        FLAGS.graph_path)
  elif FLAGS.load_strategy == 'kipf':
    adjacency, features, labels, label_mask = load_kipf_data(
        *os.path.split(FLAGS.graph_path))
  else:
    raise Exception('Unknown loading strategy!')

  n_nodes = adjacency.shape[0]
  feature_size = features.shape[1]
  architecture = [int(x) for x in FLAGS.architecture.strip('[]').split('_')]

  graph_clean_normalized = scipy_to_tf(
      normalize_graph(adjacency.copy(), normalized=True))

  input_features = tf.keras.layers.Input(shape=(feature_size,))
  input_features_corrupted = tf.keras.layers.Input(shape=(feature_size,))
  input_graph = tf.keras.layers.Input((n_nodes,), sparse=True)

  # One 512-unit GCN layer per entry of the parsed architecture string.
  encoder = [GCN(512) for size in architecture]
  model = deep_graph_infomax(
      [input_features, input_features_corrupted, input_graph], encoder)

  def loss(model, x, y, training):
    _, y_ = model(x, training=training)
    return loss_object(y_true=y, y_pred=y_)

  def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
      loss_value = loss(model, inputs, targets, training=True)
      for loss_internal in model.losses:
        loss_value += loss_internal
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

  loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

  # Early stopping on the training loss.
  patience = 20
  best_loss = 999
  patience_counter = 0

  pseudolabels = tf.concat([tf.zeros([n_nodes, 1]), tf.ones([n_nodes, 1])], 0)

  for epoch in range(FLAGS.n_epochs):
    # Corrupt the features by shuffling the rows.
    features_corr = features.copy()
    np.random.shuffle(features_corr)

    loss_value, grads = grad(
        model, [features, features_corr, graph_clean_normalized], pseudolabels)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    loss_value = loss_value.numpy()
    print(epoch, loss_value)
    if loss_value > best_loss:
      patience_counter += 1
      if patience_counter == patience:
        break
    else:
      best_loss = loss_value
      patience_counter = 0

  representations = model([features, features, graph_clean_normalized],
                          training=False)[0].numpy()

  clf = KMeans(n_clusters=FLAGS.n_clusters)
  clf.fit(representations)
  clusters = clf.labels_

  print('Conductance:', conductance(adjacency, clusters))
  print('Modularity:', modularity(adjacency, clusters))
  print(
      'NMI:',
      normalized_mutual_info_score(
          labels, clusters[label_mask], average_method='arithmetic'))
  print('Precision:', precision(labels, clusters[label_mask]))
  print('Recall:', recall(labels, clusters[label_mask]))

  with open(format_filename(), 'w') as out_file:
    print('Conductance:', conductance(adjacency, clusters), file=out_file)
    print('Modularity:', modularity(adjacency, clusters), file=out_file)
    print(
        'NMI:',
        normalized_mutual_info_score(
            labels, clusters[label_mask], average_method='arithmetic'),
        file=out_file)
    print('Precision:', precision(labels, clusters[label_mask]), file=out_file)
    print('Recall:', recall(labels, clusters[label_mask]), file=out_file)
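
# scipy_to_tf above is assumed to turn the normalized scipy sparse adjacency into
# a tf.sparse.SparseTensor that can be fed to the sparse Keras Input. A minimal
# sketch of such a helper is below; the name scipy_to_tf_sparse and the exact
# implementation are assumptions, the repo's version may differ.
import numpy as np
import scipy.sparse
import tensorflow as tf

def scipy_to_tf_sparse(matrix):
  """Converts a scipy sparse matrix to a canonically ordered tf SparseTensor."""
  coo = matrix.tocoo()
  indices = np.stack([coo.row, coo.col], axis=1)
  sparse = tf.sparse.SparseTensor(
      indices=indices,
      values=coo.data.astype(np.float32),
      dense_shape=coo.shape)
  # TensorFlow expects sparse indices in row-major order.
  return tf.sparse.reorder(sparse)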
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  print('Bröther may i have some self-lööps')
  n_nodes = FLAGS.n_nodes
  n_clusters = FLAGS.n_clusters
  train_size = FLAGS.train_size
  batch_size = FLAGS.batch_size
  data_clean, data_dirty, labels = line_gaussians(n_nodes, n_clusters)
  graph_clean = construct_knn_graph(data_clean)
  n_neighbors = [15, 10]  # TODO(tsitsulin): move to FLAGS.
  total_matrix_size = 1 + np.cumprod(n_neighbors).sum()

  train_mask = np.zeros(n_nodes, dtype=bool)
  train_mask[np.random.choice(
      np.arange(n_nodes), int(n_nodes * train_size), replace=False)] = True
  test_mask = ~train_mask

  print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
  print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

  input_features = tf.keras.layers.Input(shape=(total_matrix_size, 2))
  input_features_corrupted = tf.keras.layers.Input(shape=(total_matrix_size, 2))
  input_graph = tf.keras.layers.Input((total_matrix_size, total_matrix_size))

  # The final Lambda layer keeps only the embedding of each subgraph's seed node.
  encoder = [
      GCN(64),
      GCN(32),
      tf.keras.layers.Lambda(lambda x: x[0][:, 0, :])
  ]
  model = deep_graph_infomax(
      [input_features, input_features_corrupted, input_graph], encoder)

  def loss(model, x, y, training):
    _, y_ = model(x, training=training)
    return loss_object(y_true=y, y_pred=y_)

  def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
      loss_value = loss(model, inputs, targets, training=True)
      for loss_internal in model.losses:
        loss_value += loss_internal
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

  labels_dgi = tf.concat(
      [tf.zeros([batch_size, 1]), tf.ones([batch_size, 1])], 0)
  loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

  for epoch in range(FLAGS.n_epochs):
    # Sample a batch of rooted subgraphs and corrupt the features within the batch.
    subgraph_mat, features_mat, _, nonzero_indices = random_batch(
        graph_clean, data_dirty, batch_size, n_neighbors)
    perc_shuffle = 1  # np.linspace(1, 0.25, max_epoch)[epoch]
    features_corrupted = shuffle_inbatch(features_mat, nonzero_indices,
                                         perc_shuffle)
    loss_value, grads = grad(
        model, [features_mat, features_corrupted, subgraph_mat], labels_dgi)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print(f'epoch {epoch}, loss: {loss_value.numpy():.4f}, '
          f'shuffle %: {100 * perc_shuffle:.2f}')

  subgraph_mat, features_mat, _ = make_batch(graph_clean, data_dirty,
                                             np.arange(n_nodes), n_neighbors)
  representations, _ = model([features_mat, features_mat, subgraph_mat],
                             training=False)
  representations = representations.numpy()

  # Evaluate the representations with a logistic-regression probe on held-out nodes.
  clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
  clf.fit(representations[train_mask], labels[train_mask])
  clusters = clf.predict(representations[test_mask])
  print(
      'NMI:',
      normalized_mutual_info_score(
          labels[test_mask], clusters, average_method='arithmetic'))
  print('Accuracy:', 100 * accuracy_score(labels[test_mask], clusters))
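
# All three scripts rely on deep_graph_infomax() to pass the clean and corrupted
# features through a shared encoder and score each node embedding against a
# pooled graph summary, as in Deep Graph Infomax (Velickovic et al., 2019). The
# self-contained sketch below shows that wiring with a dense stand-in encoder;
# it drops the graph input for brevity, orders the logits so that they match the
# [zeros; ones] pseudo-labels used above, and is an illustration of the idea,
# not the repo's actual builder.
import tensorflow as tf

class DGISketch(tf.keras.Model):
  """Toy DGI head: shared encoder plus a bilinear discriminator."""

  def __init__(self, hidden_size=32):
    super().__init__()
    self.encoder = tf.keras.layers.Dense(hidden_size, activation='selu')
    self.bilinear = tf.keras.layers.Dense(hidden_size, use_bias=False)

  def call(self, inputs, training=False):
    features, features_corrupted = inputs
    h_clean = self.encoder(features)
    h_corrupted = self.encoder(features_corrupted)
    # Readout: the mean of the clean embeddings is the graph summary vector.
    summary = tf.math.sigmoid(tf.reduce_mean(h_clean, axis=0, keepdims=True))
    # Bilinear scores against the summary; corrupted scores come first so the
    # concatenated logits line up with labels tf.concat([zeros, ones], 0).
    scores_corrupted = tf.matmul(h_corrupted, self.bilinear(summary),
                                 transpose_b=True)
    scores_clean = tf.matmul(h_clean, self.bilinear(summary), transpose_b=True)
    return h_clean, tf.concat([scores_corrupted, scores_clean], axis=0)

# Example usage: representations, logits = DGISketch()([data_dirty, data_corrupted])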