def RawDocumentContentAnanlyse(author_name='Hongbin Li'):
    """Run the PCA analysis on one author's normalized document features.

    Loads the dataset named ``<author_name>_document`` via
    ``load_local_data``, normalizes its feature matrix with
    ``normalize_vectors``, prints the normalized vectors and passes them
    to ``PCAAnanlyse``.

    :param author_name: author whose ``_document`` dataset is analysed.
        Defaults to ``'Hongbin Li'``, the value that was previously
        hard-coded, so existing zero-argument calls behave as before.
    """
    dataset_name = author_name + '_document'
    # Only the feature matrix is used here; the adjacency matrix, labels
    # and author ids returned alongside it are discarded.
    _, features, _, _ = load_local_data(name=dataset_name)

    emb_norm = normalize_vectors(features)

    print(emb_norm)
    PCAAnanlyse(emb_norm)
Example #2
0
def AuthorFeatureClusterTest(name, n_clusters=62):
    """Cluster the raw features of ``name`` and print pairwise P/R/F1.

    The features returned by ``load_local_data`` are normalized with
    ``normalize_vectors``, clustered with ``clustering`` and scored
    against ``labels`` with ``pairwise_precision_recall_f1``. Everything
    is reported on stdout; nothing is returned.

    :param name: dataset name passed to ``load_local_data``
    :param n_clusters: number of clusters requested from ``clustering``.
        Defaults to 62, the value that was previously hard-coded.
    """
    # The adjacency matrix and author ids are not needed for this test.
    _, features, labels, _ = load_local_data(name=name)

    print(features)
    print(features.shape)

    emb_norm = normalize_vectors(features)
    clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)

    print('n_clusters:', n_clusters)
    print(clusters_pred)
    print(list(labels))

    print(features)

    print('pairwise precision', '{:.5f}'.format(prec), 'recall',
          '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))
Example #3
0
def gae_for_na(name, isend=False):
    """
    Train a graph auto-encoder for one name and return embeddings or scores.

    The model type is selected by the module-level ``model_str``
    (``'gcn_ae'`` or ``'gcn_vae'``); the number of epochs, the dropout rate
    and the feature handling are read from ``FLAGS``.

    :param name: author name, passed to ``load_local_data``
    :param isend: when True, cluster the learned embeddings and evaluate
        them against the loaded labels; when False, return the embeddings
    :return: ``([prec, rec, f1], num_nodes, n_clusters)`` when ``isend`` is
        True, otherwise ``(emb, AuthorIds)``
    :raises ValueError: if ``model_str`` is not a supported model type
    """
    # Fail fast: with an unsupported model type neither the model nor the
    # optimizer would be created, and the failure would only surface later.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "unsupported model_str {!r}; expected 'gcn_ae' or 'gcn_vae'"
            .format(model_str))

    adj, features, labels, AuthorIds = load_local_data(name=name)

    # From here on the model only sees the training edges.
    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        'features': tf.placeholder(tf.float32,
                                   shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'labels': tf.placeholder(tf.int32, shape=(None), name='labels')
    }

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    else:  # 'gcn_vae'
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)

    total_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(total_entries -
                       adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    norm = total_entries / float((total_entries - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        # Flattened dense reconstruction target, shared by both optimizers.
        flat_labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=flat_labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:  # 'gcn_vae'
            print('check Label: ', flat_labels)
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=flat_labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Reconstruction target: training adjacency plus self-loops.
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Initialize session; it is closed in ``finally`` so repeated calls do
    # not leak session resources.
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(FLAGS.epochs):

            t = time.time()
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                            placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)

            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]

            print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(avg_cost), "train_acc=",
                  "{:.5f}".format(avg_accuracy), "time=",
                  "{:.5f}".format(time.time() - t))

        # Read the embeddings with dropout disabled. A dedicated feed dict
        # is built so this also works when FLAGS.epochs is 0 and the
        # training loop never ran.
        eval_feed = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        eval_feed.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=eval_feed)  # z_mean is better
    finally:
        sess.close()

    if not isend:
        return emb, AuthorIds

    # Evaluation: one cluster per distinct label value.
    n_clusters = len(set(labels))
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)

    print('n_clusters:', n_clusters)
    print(clusters_pred)
    print(labels)

    print(emb)

    print('pairwise precision', '{:.5f}'.format(prec), 'recall',
          '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))
    return [prec, rec, f1], num_nodes, n_clusters