Example #1
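This snippet trains a HAN-style multi-graph attention model for author-name disambiguation: it feeds per-metapath features and adjacency biases, optimizes an OSM-CAA metric loss with early stopping on validation loss/accuracy, then restores the best checkpoint, clusters the learned embeddings, and reports pairwise precision/recall/F1 (with optional t-SNE plots).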
    def train(self, adj_list, fea_list, y_train, y_val, y_test, train_mask, val_mask, test_mask, y_all, all_mask, rawlabels, needtSNE=False, rawFeature=[]):

        prec, rec, f1 = 0.0, 0.0, 0.0
        nb_nodes = fea_list[0].shape[0]
        ft_size = fea_list[0].shape[1]
        nb_classes = y_train.shape[1]
        # nb_classes = len(set(rawlabels))

        # adj = adj.todense()

        # features = features[np.newaxis]  # [1, nb_node, ft_size]
        fea_list = [fea[np.newaxis] for fea in fea_list]
        adj_list = [adj[np.newaxis] for adj in adj_list]
        y_train = y_train[np.newaxis]
        y_val = y_val[np.newaxis]
        y_test = y_test[np.newaxis]
        y_all = y_all[np.newaxis]

        train_mask = train_mask[np.newaxis]
        val_mask = val_mask[np.newaxis]
        test_mask = test_mask[np.newaxis]
        all_mask = all_mask[np.newaxis]

        biases_list = [process.adj_to_bias(adj, [nb_nodes], nhood=1) for adj in adj_list]
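        # `process.adj_to_bias` is not shown in this snippet. In GAT-style code it
        # typically converts each adjacency matrix into an additive attention bias:
        # node pairs reachable within `nhood` hops (self-loops included) get 0.0 and
        # all other pairs a large negative value, so softmax attention ignores
        # non-neighbors. A sketch, assuming the public GAT reference implementation:
        #
        #   def adj_to_bias(adj, sizes, nhood=1):   # adj: (graphs, n, n)
        #       mt = np.empty(adj.shape)
        #       for g in range(adj.shape[0]):
        #           mt[g] = np.eye(adj.shape[1])
        #           for _ in range(nhood):
        #               mt[g] = np.matmul(mt[g], adj[g] + np.eye(adj.shape[1]))
        #           mt[g] = (mt[g] > 0.0).astype(mt.dtype)
        #       return -1e9 * (1.0 - mt)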

        print('build graph...')
        with tf.Graph().as_default():
            with tf.name_scope('input'):
                metric_ftr_in = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='metric_ftr_in')
                ftr_in_list = [tf.placeholder(dtype=tf.float32,
                                              shape=(batch_size, nb_nodes, ft_size),
                                              name='ftr_in_{}'.format(i))
                               for i in range(len(fea_list))]
                bias_in_list = [tf.placeholder(dtype=tf.float32,
                                               shape=(batch_size, nb_nodes, nb_nodes),
                                               name='bias_in_{}'.format(i))
                                for i in range(len(biases_list))]
                lbl_in = tf.placeholder(dtype=tf.int32, shape=(
                    batch_size, nb_nodes, nb_classes), name='lbl_in')
                msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes),
                                        name='msk_in')
                attn_drop = tf.placeholder(dtype=tf.float32, shape=(), name='attn_drop')
                ffd_drop = tf.placeholder(dtype=tf.float32, shape=(), name='ffd_drop')
                is_train = tf.placeholder(dtype=tf.bool, shape=(), name='is_train')

            # forward
            logits, final_embedding, att_val, centers_embed, test_final_embeed = model.inference(ftr_in_list, nb_classes, nb_nodes, is_train,
                                                               attn_drop, ffd_drop,
                                                               bias_mat_list=bias_in_list,
                                                               hid_units=hid_units, n_heads=n_heads, features=fea_list, labels=rawlabels,
                                                               residual=residual, activation=nonlinearity, feature_size=ft_size)


            # final_embedding: checkout Tensor("Sum:0", shape=(286, 64), dtype=float32)

            # logits: checkout Tensor("ExpandDims_3:0", shape=(1, 286, 30), dtype=float32)

            # cal masked_loss
            # lab_list = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ), name='lab_list')
            # ftr_resh = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='ftr_resh')
            log_resh = tf.reshape(logits, [-1, nb_classes])
            lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
            msk_resh = tf.reshape(msk_in, [-1])


            print ("final_embedding: checkout", final_embedding)
            print ("logits: checkout", logits)
            print ("log_resh: checkout", log_resh)
            # print ("ftr_resh: ", ftr_resh)
            print ("lab_resh: ", lab_resh)
            print ("fea_list: ", fea_list)
            print ("centers_embed: ", centers_embed)
            print ("batch_size, nb_nodes, nb_classes, ft_size", batch_size, nb_nodes, nb_classes, ft_size)

            osm_caa_loss = OSM_CAA_Loss(batch_size=nb_nodes)
            osm_loss = osm_caa_loss.forward

            # final_embedding: checkout Tensor("Sum:0", shape=(286, 64), dtype=float32)
            # logits: checkout Tensor("ExpandDims_3:0", shape=(1, 286, 30), dtype=float32)
            # log_resh: checkout Tensor("Reshape:0", shape=(286, 30), dtype=float32)
            # ftr_resh:  Tensor("ftr_resh:0", shape=(286, 100), dtype=float32)
            # lab_resh:  Tensor("Reshape_1:0", shape=(286, 30), dtype=int32)

            osmLoss, checkvalue = osm_loss(final_embedding, rawlabels, centers_embed)
            # osmLoss, checkvalue = osm_loss(metric_ftr_in, rawlabels, centers_embed)
            SoftMaxloss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
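            # `model.masked_softmax_cross_entropy` and `model.masked_accuracy` are
            # assumed to be the standard GCN/GAT masked helpers: compute a per-node
            # cross-entropy (or 0/1 correctness), zero out nodes where mask == 0,
            # and renormalize by the mask mean. Sketch of the usual loss:
            #
            #   loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
            #   mask = tf.cast(mask, tf.float32)
            #   mask /= tf.reduce_mean(mask)
            #   return tf.reduce_mean(loss * mask)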
            loss = osmLoss
            # why does the loss stay constant? (debugging note)
            # loss = SoftMaxloss

            accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)
            # optimize
            train_op = model.training(loss, lr, l2_coef)

            Path = 'pre_trained/{}/{}/{}'.format(dataset, dataset, self.name)
            self.mkdir(Path)
            checkpt_file = '{}/allMP_multi_{}_.ckpt'.format(Path, featype)
            print('model: {}'.format(checkpt_file))
            saver = tf.train.Saver()

            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())

            vlss_mn = np.inf
            vacc_mx = 0.0
            curr_step = 0
            # initialize so the early-stop report below is defined even if no
            # checkpoint improvement was ever recorded
            vacc_early_model = 0.0
            vlss_early_model = np.inf

            with tf.Session(config=config) as sess:
                sess.run(init_op)

                train_loss_avg = 0
                train_acc_avg = 0
                val_loss_avg = 0
                val_acc_avg = 0

                for epoch in range(nb_epochs):
                    tr_step = 0

                    tr_size = fea_list[0].shape[0]
                    # ================   training    ============
                    while tr_step * batch_size < tr_size:
                        fd1 = {i: d[tr_step * batch_size:(tr_step + 1) * batch_size]
                               for i, d in zip(ftr_in_list, fea_list)}
                        fd2 = {i: d[tr_step * batch_size:(tr_step + 1) * batch_size]
                               for i, d in zip(bias_in_list, biases_list)}
                        fd3 = {lbl_in: y_train[tr_step * batch_size:(tr_step + 1) * batch_size],
                               msk_in: train_mask[tr_step * batch_size:(tr_step + 1) * batch_size],
                               metric_ftr_in: rawFeature,
                               is_train: True,
                               attn_drop: 0.6,
                               ffd_drop: 0.6}
                        fd = fd1
                        fd.update(fd2)
                        fd.update(fd3)
                        _, loss_value_tr, acc_tr, att_val_train = sess.run([train_op, loss, accuracy, att_val],
                                                                           feed_dict=fd)
                        test_check_value = sess.run(checkvalue, feed_dict=fd)
                        print ("test_check_value: ", test_check_value)

                        train_loss_avg += loss_value_tr
                        train_acc_avg += acc_tr
                        tr_step += 1


                    vl_step = 0
                    vl_size = fea_list[0].shape[0]
                    # =============   val       =================
                    while vl_step * batch_size < vl_size:
                        # fd1 = {ftr_in: features[vl_step * batch_size:(vl_step + 1) * batch_size]}
                        fd1 = {i: d[vl_step * batch_size:(vl_step + 1) * batch_size]
                               for i, d in zip(ftr_in_list, fea_list)}
                        fd2 = {i: d[vl_step * batch_size:(vl_step + 1) * batch_size]
                               for i, d in zip(bias_in_list, biases_list)}
                        fd3 = {lbl_in: y_val[vl_step * batch_size:(vl_step + 1) * batch_size],
                               msk_in: val_mask[vl_step * batch_size:(vl_step + 1) * batch_size],
                               metric_ftr_in: rawFeature,
                               is_train: False,
                               attn_drop: 0.0,
                               ffd_drop: 0.0}

                        fd = fd1
                        fd.update(fd2)
                        fd.update(fd3)
                        loss_value_vl, acc_vl = sess.run([loss, accuracy],
                                                         feed_dict=fd)
                        val_loss_avg += loss_value_vl
                        val_acc_avg += acc_vl
                        vl_step += 1
                    # import pdb; pdb.set_trace()
                    print('Epoch: {}, att_val: {}'.format(epoch, np.mean(att_val_train, axis=0)))
                    print('Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f | vl_step: %d, tr_step: %d' %
                          (train_loss_avg / tr_step, train_acc_avg / tr_step,
                           val_loss_avg / vl_step, val_acc_avg / vl_step, vl_step, tr_step))

                    if val_acc_avg / vl_step >= vacc_mx or val_loss_avg / vl_step <= vlss_mn:
                        if val_acc_avg / vl_step >= vacc_mx and val_loss_avg / vl_step <= vlss_mn:
                            vacc_early_model = val_acc_avg / vl_step
                            vlss_early_model = val_loss_avg / vl_step
                            saver.save(sess, checkpt_file)
                        vacc_mx = np.max((val_acc_avg / vl_step, vacc_mx))
                        vlss_mn = np.min((val_loss_avg / vl_step, vlss_mn))
                        curr_step = 0
                    else:
                        curr_step += 1
                        if curr_step == patience:
                            print('Early stop! Min loss: ', vlss_mn,
                                  ', Max accuracy: ', vacc_mx)
                            print('Early stop model validation loss: ',
                                  vlss_early_model, ', accuracy: ', vacc_early_model)
                            break

                    train_loss_avg = 0
                    train_acc_avg = 0
                    val_loss_avg = 0
                    val_acc_avg = 0
                # NOTE: this final save overwrites the best early-stopping
                # checkpoint before it is restored just below
                saver.save(sess, checkpt_file)

                saver.restore(sess, checkpt_file)
                print('load model from : {}'.format(checkpt_file))
                ts_size = fea_list[0].shape[0]
                ts_step = 0
                ts_loss = 0.0
                ts_acc = 0.0

                while ts_step * batch_size < ts_size:
                    fd1 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                           for i, d in zip(ftr_in_list, fea_list)}
                    fd2 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                           for i, d in zip(bias_in_list, biases_list)}
                    fd3 = {lbl_in: y_all[ts_step * batch_size:(ts_step + 1) * batch_size],
                           msk_in: all_mask[ts_step * batch_size:(ts_step + 1) * batch_size],
                           metric_ftr_in: rawFeature,
                           is_train: False,
                           attn_drop: 0.0,
                           ffd_drop: 0.0}

                    fd = fd1
                    fd.update(fd2)
                    fd.update(fd3)
                    loss_value_ts, acc_ts, jhy_final_embedding, test_final_embeed_check = sess.run([loss, accuracy, final_embedding, test_final_embeed],
                                                                          feed_dict=fd)
                    ts_loss += loss_value_ts
                    ts_acc += acc_ts
                    ts_step += 1

                xx = np.expand_dims(jhy_final_embedding, axis=0)[all_mask]
                xx2 = np.expand_dims(test_final_embeed_check, axis=0)[all_mask]
                yy = y_all[all_mask]


                print ("check fd")
                print('xx: {}, yy: {}, ts_size: {}, ts_step: {}, batch_size: {}'.format(xx.shape, yy.shape, ts_size, ts_step,batch_size))

                labels, numberofLabels = self.getLabel(yy)

                from utils import clustering, pairwise_precision_recall_f1

                clusters_pred = clustering(xx2, num_clusters=numberofLabels)
                prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
                print('prec: ', prec, ', rec: ', rec, ', f1: ', f1, ', originNumberOfClusterlabels: ', numberofLabels)
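                # `pairwise_precision_recall_f1` is assumed to score item pairs: a
                # pair counts as a true positive when it shares both a predicted
                # cluster and a ground-truth label, so that
                #   precision = tp / (pairs predicted together)
                #   recall    = tp / (pairs labeled together)
                #   f1        = 2 * precision * recall / (precision + recall)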

                if needtSNE:
                    tSNEAnanlyse(xx, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_final.png" % (self.name)))
                    tSNEAnanlyse(rawFeature, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_features.png" % (self.name)))
                    tSNEAnanlyse(xx2, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_xx2.png" % (self.name)))
                    tSNEAnanlyse(xx, clusters_pred, join(settings.PIC_DIR, "HAN", "rawReature_%s_result_label.png" % (self.name)))


                sess.close()

        return prec, rec, f1, xx2
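Example #2
This snippet trains a graph auto-encoder (gcn_ae) or its variational variant (gcn_vae) on one author's local graph, clusters the learned z_mean embeddings, and compares the result against clustering the raw features, with t-SNE plots of both.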
def gae_for_na(name, rawfeature):
    """
    train and evaluate disambiguation results for a specific name
    :param name:  author name
    :return: evaluation results
    """
    adj, features, labels = load_local_data(name=name, rawfeature=rawfeature)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj_train = gen_train_edges(adj)

    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)
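    # With N nodes and E existing edges: pos_weight = (N*N - E) / E up-weights the
    # sparse positive entries in the reconstruction loss, and
    # norm = N*N / (2 * (N*N - E)) rescales the weighted mean so the loss stays
    # comparable across graph densities.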

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    def get_embs():
        # reuses the feed_dict left over from the last training step
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better
        return emb

    # Train model
    for epoch in range(FLAGS.epochs):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy),
              "time=", "{:.5f}".format(time.time() - t))

    emb = get_embs()
    n_clusters = len(set(labels))
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    print('pairwise precision', '{:.5f}'.format(prec),
          'recall', '{:.5f}'.format(rec),
          'f1', '{:.5f}'.format(f1))

    clusters_pred2 = clustering(features, num_clusters=n_clusters)
    prec2, rec2, f12 = pairwise_precision_recall_f1(clusters_pred2, labels)
    print('pairwise precision', '{:.5f}'.format(prec2),
          'recall', '{:.5f}'.format(rec2),
          'f1', '{:.5f}'.format(f12))

    from sklearn.manifold import TSNE
    features_new = TSNE(learning_rate=100).fit_transform(features)
    emb_new = TSNE(learning_rate=100).fit_transform(emb_norm)

    labels = np.array(labels) + 2
    clusters_pred = np.array(clusters_pred) + 2
    clusters_pred2 = np.array(clusters_pred2) + 2

    if rawfeature == RAW_INTER_NAME:
        tSNEAnanlyse(emb_norm, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_raw.png" % (name)))
        tSNEAnanlyse(features, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_raw.png" % (name)))
    elif rawfeature == ATTENTIONFEATURE:
        tSNEAnanlyse(emb_new, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final.png" % (name)))
        tSNEAnanlyse(features_new, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features.png" % (name)))
        tSNEAnanlyse(emb_new, clusters_pred, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_clusterresult.png" % (name)))
        tSNEAnanlyse(features_new, clusters_pred2, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_clusterresult.png" % (name)))
    else:
        tSNEAnanlyse(emb_norm, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_triplet.png" % (name)))
        tSNEAnanlyse(features, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_triplet.png" % (name)))

    return [prec, rec, f1], num_nodes, n_clusters
Example #3
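A debugging variant of Example #1: it rebuilds the same attention graph, restores the trained checkpoint, and runs only the test-time embedding extraction, clustering, and evaluation.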
    def MetricDebug(self, adj_list, fea_list, y_train, y_val, y_test, train_mask, val_mask, test_mask, y_all, all_mask, rawlabels, needtSNE=False, rawFeature=[]):
        prec, rec, f1 = 0.0, 0.0, 0.0
        nb_nodes = fea_list[0].shape[0]
        ft_size = fea_list[0].shape[1]
        nb_classes = y_train.shape[1]
        # nb_classes = len(set(rawlabels))

        # adj = adj.todense()

        # features = features[np.newaxis]  # [1, nb_node, ft_size]
        fea_list = [fea[np.newaxis] for fea in fea_list]
        adj_list = [adj[np.newaxis] for adj in adj_list]
        y_train = y_train[np.newaxis]
        y_val = y_val[np.newaxis]
        y_test = y_test[np.newaxis]
        y_all = y_all[np.newaxis]

        train_mask = train_mask[np.newaxis]
        val_mask = val_mask[np.newaxis]
        test_mask = test_mask[np.newaxis]
        all_mask = all_mask[np.newaxis]

        biases_list = [process.adj_to_bias(adj, [nb_nodes], nhood=1) for adj in adj_list]

        print('build graph...')
        with tf.Graph().as_default():
            with tf.name_scope('input'):
                metric_ftr_in = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='metric_ftr_in')
                ftr_in_list = [tf.placeholder(dtype=tf.float32,
                                              shape=(batch_size, nb_nodes, ft_size),
                                              name='ftr_in_{}'.format(i))
                               for i in range(len(fea_list))]
                bias_in_list = [tf.placeholder(dtype=tf.float32,
                                               shape=(batch_size, nb_nodes, nb_nodes),
                                               name='bias_in_{}'.format(i))
                                for i in range(len(biases_list))]
                lbl_in = tf.placeholder(dtype=tf.int32, shape=(
                    batch_size, nb_nodes, nb_classes), name='lbl_in')
                msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes),
                                        name='msk_in')
                attn_drop = tf.placeholder(dtype=tf.float32, shape=(), name='attn_drop')
                ffd_drop = tf.placeholder(dtype=tf.float32, shape=(), name='ffd_drop')
                is_train = tf.placeholder(dtype=tf.bool, shape=(), name='is_train')

            # forward
            logits, final_embedding, att_val, centers_embed, test_final_embeed = model.inference(ftr_in_list, nb_classes, nb_nodes, is_train,
                                                               attn_drop, ffd_drop,
                                                               bias_mat_list=bias_in_list,
                                                               hid_units=hid_units, n_heads=n_heads, features=fea_list, labels=rawlabels,
                                                               residual=residual, activation=nonlinearity, feature_size=ft_size)

            log_resh = tf.reshape(logits, [-1, nb_classes])
            lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
            msk_resh = tf.reshape(msk_in, [-1])

            osm_caa_loss = OSM_CAA_Loss(batch_size=nb_nodes)
            osm_loss = osm_caa_loss.forward

            osmLoss, checkvalue = osm_loss(final_embedding, rawlabels, centers_embed)
            SoftMaxloss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
            loss = osmLoss

            accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)
            # optimize
            train_op = model.training(loss, lr, l2_coef)

            Path = 'pre_trained/{}/{}/{}'.format(dataset, dataset, self.name)
            self.mkdir(Path)
            checkpt_file = '{}/allMP_multi_{}_.ckpt'.format(Path, featype)
            saver = tf.train.Saver()

            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())

            ts_size = fea_list[0].shape[0]
            ts_step = 0
            ts_loss = 0.0
            ts_acc = 0.0

            with tf.Session(config=config) as sess:
                sess.run(init_op)
                saver.restore(sess, checkpt_file)

                while ts_step * batch_size < ts_size:
                    fd1 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                           for i, d in zip(ftr_in_list, fea_list)}
                    fd2 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                           for i, d in zip(bias_in_list, biases_list)}
                    fd3 = {lbl_in: y_all[ts_step * batch_size:(ts_step + 1) * batch_size],
                           msk_in: all_mask[ts_step * batch_size:(ts_step + 1) * batch_size],
                           metric_ftr_in: rawFeature,
                           is_train: False,
                           attn_drop: 0.0,
                           ffd_drop: 0.0}

                    fd = fd1
                    fd.update(fd2)
                    fd.update(fd3)
                    # run the bare tensor (not a one-element list) so the result
                    # is an ndarray, as np.expand_dims below expects
                    test_final_embeed_check = sess.run(test_final_embeed, feed_dict=fd)
                    ts_step += 1

                xx2 = np.expand_dims(test_final_embeed_check, axis=0)[all_mask]
                yy = y_all[all_mask]

                labels, numberofLabels = self.getLabel(yy)

                from utils import clustering, pairwise_precision_recall_f1

                clusters_pred = clustering(xx2, num_clusters=numberofLabels)
                prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
                print('prec: ', prec, ', rec: ', rec, ', f1: ', f1, ', originNumberOfClusterlabels: ', numberofLabels)

                if needtSNE:
                    tSNEAnanlyse(rawFeature, labels, join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_features.png" % (self.name)))
                    tSNEAnanlyse(xx2, labels, join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_xx2.png" % (self.name)))
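Example #4
This snippet alternates dual-GCN auto-encoder training with re-clustering: after each cluster epoch the t-SNE-projected embeddings are re-clustered at the silhouette-optimal cluster count, which is only allowed to shrink from one cluster epoch to the next.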
def train(name, needtSNE=False, savefile=True):
    adj, adj2, features, labels, Clusterlabels, Ids = load_local_data(
        name=name)

    initClusterlabel = Clusterlabels
    oneHotClusterLabels = toOneHot(Clusterlabels)
    num_logits = len(oneHotClusterLabels[0])
    print('debugging ', oneHotClusterLabels.shape)

    originClusterlabels = Clusterlabels
    n_clusters = len(set(labels))
    OldClusterlabels = Clusterlabels
    originNumberOfClusterlabels = len(set(Clusterlabels))

    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    adj_norm, adj_label = NormalizedAdj(adj)
    adj_norm2, adj_label2 = NormalizedAdj(adj2)

    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32,
                                   shape=(None, input_feature_dim)),
        'labels': tf.placeholder(tf.int64, shape=(None,), name='labels'),
        'graph1': tf.sparse_placeholder(tf.float32),
        'graph2': tf.sparse_placeholder(tf.float32),
        'graph1_orig': tf.sparse_placeholder(tf.float32),
        'graph2_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'epoch': tf.placeholder_with_default(0., shape=()),
        'clusterEpoch': tf.placeholder_with_default(0., shape=())
    }

    # pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges/pos edges
    # norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    def get_embs():
        # reuses the feed_dict left over from the last training step
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean_1, feed_dict=feed_dict)  # z_mean is better
        return emb

    def getGraphDetail(adj):
        pos_weight = float(adj.shape[0] * adj.shape[0] -
                           adj.sum()) / adj.sum()  # negative edges/pos edges
        norm = adj.shape[0] * adj.shape[0] / float(
            (adj.shape[0] * adj.shape[0] - adj.nnz) * 2)
        return {'norm': norm, 'pos_weight': pos_weight}

        # return pos_weight, norm

    # loss1s = []
    # loss2s = []
    # loss3s = []

    n_clusters = len(set(labels))
    graph1 = getGraphDetail(adj)
    graph2 = getGraphDetail(adj2)

    # construct adj_orig
    graph1['labels'] = tf.reshape(
        tf.sparse_tensor_to_dense(placeholders['graph1_orig'],
                                  validate_indices=False), [-1])
    graph2['labels'] = tf.reshape(
        tf.sparse_tensor_to_dense(placeholders['graph2_orig'],
                                  validate_indices=False), [-1])

    # Train model
    for clusterepoch in range(FLAGS.clusterEpochs):
        print('cluster epoch: ', clusterepoch)
        # tf.reset_default_graph()

        # num_logits
        model = BuildModel(placeholders,
                           input_feature_dim,
                           num_nodes,
                           name='model%d' % (clusterepoch),
                           num_logits=num_logits)

        # Session

        # tf.reset_default_graph()
        # sess = tf.InteractiveSession()

        opt = OptimizerDualGCNAutoEncoder(model=model,
                                          num_nodes=num_nodes,
                                          z_label=Clusterlabels,
                                          name='model%d' % (clusterepoch),
                                          graph1=graph1,
                                          graph2=graph2)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Centers
        # centers = opt.centers

        for epoch in range(FLAGS.epochs):
            # print ('epoch: ', epoch)

            # opt.epoch = epoch
            model.epoch = epoch

            # Construct feed dictionary
            # number of logits and predictions

            feed_dict = construct_feed_dict(adj_norm, adj_label, adj_norm2,
                                            adj_label2, features, placeholders,
                                            Clusterlabels, epoch,
                                            clusterepoch + 1)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            # outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            outs = sess.run([opt.opt_op, opt.cost], feed_dict=feed_dict)
            # [Loss, softmax_loss, loss3, centerloss, reconstructloss] = sess.run([opt.cost, opt.softmax_loss, opt.loss3, opt.centerloss, opt.reconstructloss], feed_dict=feed_dict)
            # [Loss, loss3, centerloss, reconstructloss, L2loss] = sess.run([opt.cost, opt.loss3, opt.centerloss, opt.reconstructloss, opt.L2loss], feed_dict=feed_dict)
            [Loss, reconstructloss] = sess.run([opt.cost, opt.reconstructloss],
                                               feed_dict=feed_dict)

            # print ('loss: ', Loss, ', loss1: ', loss1, ', loss2: ', loss2 ,', centerloss: ', centerloss, ', acc: ', outs[2])
            print('epoch: ', epoch, ', loss: ', Loss, ', reconstructloss : ',
                  reconstructloss)

        # if clusterepoch != FLAGS.clusterEpochs -1 :
        emb = get_embs()
        X_new = TSNE(learning_rate=100).fit_transform(emb)

        tClusterLabels = []
        Maxscore = -10000
        NumberOfCluster = 0
        for nc in range(2, originNumberOfClusterlabels + 1, 1):
            TempLabels = clustering(X_new, nc)
            score = silhouette_score(X_new, TempLabels)
            print('nc: ', nc, ', score: ', score)
            if score > Maxscore:
                Maxscore = score
                tClusterLabels = TempLabels
                NumberOfCluster = nc

        print('NumberOfCluster: ', NumberOfCluster,
              ', originNumberOfClusterlabels : ', originNumberOfClusterlabels,
              ', Maxscore: ', Maxscore)
        if NumberOfCluster < 0 or NumberOfCluster > originNumberOfClusterlabels:
            continue

        # the cluster count must keep shrinking across cluster epochs,
        # so adopt the new labels and the smaller count
        Clusterlabels = tClusterLabels
        originNumberOfClusterlabels = NumberOfCluster

        prec, rec, f1 = pairwise_precision_recall_f1(Clusterlabels, labels)
        print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
              ', originNumberOfClusterlabels: ', originNumberOfClusterlabels)
        Cc = Counter(Clusterlabels)
        print(Cc)
        if needtSNE:
            sNEComparingAnanlyse(emb,
                                 OldClusterlabels,
                                 labels,
                                 Clusterlabels,
                                 savepath=join(
                                     settings.PIC_DIR,
                                     "%s_%s.png" % (name, clusterepoch)))
            # tSNEAnanlyse(emb, labels, join(settings.PIC_DIR, "%s.png"%(clusterepoch)) )
            # tf.reset_default_graph()

    emb = get_embs()
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm,
                               num_clusters=originNumberOfClusterlabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
          ', originNumberOfClusterlabels: ', originNumberOfClusterlabels)
    # lossPrint(range(FLAGS.epochs), loss1s, loss2s, loss3s)
    if needtSNE:
        tSNEAnanlyse(emb, labels,
                     join(settings.PIC_DIR, "%s_final.png" % (name)))
    tf.reset_default_graph()
    return [prec, rec, f1], num_nodes, n_clusters
Example #5
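The tail of a metric-loss training script: it runs the remaining epochs, extracts the final embedding, clusters it, evaluates pairwise precision/recall/F1, and plots t-SNE projections of the embedding and the raw features.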
    while epoch < epochs:
        _, losscheck, value2 = sess.run([train_op, loss, checkvalue], feed_dict=fd)
        print ("epoch: {} loss: {}, checkvalue: {}".format(epoch, losscheck, value2))
        epoch += 1

    print ("final_embed: ", final_embed)
    embedding = sess.run([final_embed], feed_dict=fd)
    embedding = embedding[0]
    print ("embedding: ", embedding)

    from utils import clustering, pairwise_precision_recall_f1

    clusters_pred = clustering(embedding, num_clusters=nb_class)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, rawlabels)
    print('prec: ', prec, ', rec: ', rec, ', f1: ', f1, ', originNumberOfClusterlabels: ', nb_class)

    tSNEAnanlyse(embedding, rawlabels, join(settings.PIC_DIR, "PureMetricLoss", "%s_final.png" % (name)))
    tSNEAnanlyse(features, rawlabels, join(settings.PIC_DIR, "PureMetricLoss", "%s_features.png" % (name)))

    # my_KNN(xx, yy)
    # my_Kmeans(xx, yy)

    sess.close()







Example #6
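This snippet gathers raw, triplet, and attention (lc_emb) embeddings per paper for one author name, then compares their clustering quality under the same pairwise metric.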
        # NOTE: in this fragment `rf`, `tf`, and `attentionf` are plain Python
        # lists of per-paper feature vectors (`tf` is not the tensorflow module)
        labels.append(aid)
        rf.append(rawFeature.get(pid))
        tf.append(tripletFeature.get(pid))
        attentionf.append(lc_emb.get(pid))

labels = encode_labels(labels)
numberofLabels = len(set(labels))


def clusterTest(embedding, numberofLabels):
    clusters_pred = clustering(embedding, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    return [prec, rec, f1]


tSNEAnanlyse(rf, labels, join(settings.PIC_DIR, "FINALResult", "%s_rawFeature.png" % (name)))
tSNEAnanlyse(tf, labels, join(settings.PIC_DIR, "FINALResult", "%s_tripletFeature.png" % (name)))
tSNEAnanlyse(attentionf, labels, join(settings.PIC_DIR, "FINALResult", "%s_lcmbFeature.png" % (name)))

Res = {}
Res['rawfeature'] = clusterTest(rf, numberofLabels=numberofLabels)
Res['tripletfeature'] = clusterTest(tf, numberofLabels=numberofLabels)
Res['lcmbfeature'] = clusterTest(attentionf, numberofLabels=numberofLabels)

print ("Res: ", Res)





        EndIndex = -2
    featurePath = getPATH(name, idf_threshold, 'feature_and_label', ispretrain)
    # idx_features_labels = np.genfromtxt(join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold)), dtype=np.dtype(str))
    idx_features_labels = np.genfromtxt(featurePath, dtype=np.dtype(str))
    features = np.array(idx_features_labels[:, 1:EndIndex],
                        dtype=np.float32)  # sparse?
    rawlabels = encode_labels(idx_features_labels[:, EndIndex])
    pids = idx_features_labels[:, 0]
    return features, pids, rawlabels


def load_test_names():
    return data_utils.load_json(settings.DATA_DIR, 'test_name_list2.json')


Res = {}

names = load_test_names()
for name in names:
    features, pids, rawlabels = loadFeature(name, ispretrain=False)
    tSNEAnanlyse(
        features, rawlabels,
        join(settings.PIC_DIR, "MetricLearning",
             "rawReature_%s_train.png" % (name)))
    numberofLabels = len(set(rawlabels))
    clusters_pred = clustering(features, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, rawlabels)
    Res[name] = {"prec": prec, "rec": rec, "f1": f1}

print(Res)
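Example #8
A fragment that gathers paper embeddings per author (skipping authors with fewer than five papers), extracts hidden-layer embeddings from a trained global model, and plots t-SNE of both the input and hidden-layer embeddings.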
    embs_input = []
    labels = []
    pids = []
    for i, aid in enumerate(name_data.keys()):
        if len(name_data[aid]) < 5:  # n_pubs of current author is too small
            continue
        for pid in name_data[aid]:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
            labels.append(aid)

    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    labels = encode_labels(labels)

    for i, pid_ in enumerate(pids):
        res_embs.append(inter_embs[i])

    # Clustering and save the result
    tSNEAnanlyse(
        res_embs, labels,
        join(settings.PIC_DIR, "OnlyTriplete",
             "rawReature_%s_triplet.png" % (name)))
    tSNEAnanlyse(
        embs_input, labels,
        join(settings.PIC_DIR, "OnlyTriplete",
             "rawReature_%s_features.png" % (name)))