Example #1
0
def main():
    """Train the embedding classifier on the Yahoo data and report accuracy.

    Reads ``./data/yahoo.p`` (a pickled sequence whose items 0-2 are the
    train/val/test sentences, 3-5 their labels and 7 the index-to-word map),
    optionally sub-samples the training set when ``opt.part_data`` is set,
    builds the graph with ``emb_classifier`` and trains for
    ``opt.max_epochs`` epochs.

    Every ``opt.valid_freq`` updates, accuracy is measured on a training
    subset and a validation subset; whenever validation accuracy improves the
    whole test set is scored too and its accuracy is remembered.  After each
    epoch the value fetched from ``W_emb_`` is pickled to
    ``yahoo_emb_max_300.p``.
    """
    # Width of the label placeholder and of the reshaped label batches.
    num_class = 10
    # Upper bounds on how many examples the periodic evaluations look at.
    train_eval_size = 500
    val_eval_size = 20000

    # Prepare training and testing data
    loadpath = "./data/yahoo.p"
    # NOTE(review): unpickling can execute arbitrary code -- only load data
    # files that come from a trusted source.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    ixtoword = x[7]
    del x

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.n_words = len(ixtoword)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    if opt.part_data:
        # Fixed seed so the same training subset is drawn on every run.
        np.random.seed(123)
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Reuse a saved embedding only when its shape matches the current
    # vocabulary/embedding size; otherwise let the embedding be trained.
    try:
        params = np.load('./param_g.npz')
        if params['Wemb'].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = params['Wemb']
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params['Wemb'].shape) + ' opt: ' +
                  str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen])
        keep_prob = tf.placeholder(tf.float32)
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, num_class])
        accuracy_, loss_, train_op, W_emb_ = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt)

    def evaluate(sess, sents_all, labels_all, n_examples):
        """Score ``n_examples`` items and return ``(accuracy, last_x_batch)``.

        Batch indices come from ``get_minibatches_idx(n_examples, ...)``.
        The accuracy is weighted by batch length and divided by the number of
        examples actually scored, so it stays correct even if the batching
        helper does not return every index.  ``last_x_batch`` is the last
        input batch that was fed, or ``None`` when there was no batch.
        """
        correct = 0.0
        seen = 0
        last_x = None
        batches = get_minibatches_idx(n_examples,
                                      opt.batch_size,
                                      shuffle=True)
        for _, index in batches:
            batch_sents = [sents_all[t] for t in index]
            batch_labels = np.array([labels_all[t] for t in index])
            batch_labels = batch_labels.reshape(
                (len(batch_labels), num_class))
            x_eval, x_eval_mask = prepare_data_for_emb(batch_sents, opt)
            batch_accuracy = sess.run(accuracy_,
                                      feed_dict={
                                          x_: x_eval,
                                          x_mask_: x_eval_mask,
                                          y_: batch_labels,
                                          keep_prob: 1.0
                                      })
            correct += batch_accuracy * len(index)
            seen += len(index)
            last_x = x_eval
        accuracy = correct / seen if seen else 0.0
        return accuracy, last_x

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                # Variable names carry a ":0" suffix that checkpoint keys
                # lack, hence the suffix juggling below.
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report the variables that were really restored, i.e. after
                # the shape filter.
                print("Loaded variables:" + str(ss_right_shape))

            except Exception as err:
                # Best effort: fall back to fresh weights, but do not swallow
                # KeyboardInterrupt/SystemExit and do not hide the reason.
                print("No saving session, using random initialization")
                print("Restore failed: " + str(err))
                sess.run(tf.global_variables_initializer())

        # Most recent training / test input batches; one of them is fed when
        # W_emb_ is fetched at the end of an epoch.
        x_batch = None
        x_test_batch = None

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train),
                                         opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape((len(x_labels), num_class))

                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)

                    _, loss = sess.run(
                        [train_op, loss_],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.drop_rate
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    # Clamp the subset sizes so small data sets do not index
                    # past the end.
                    train_accuracy, _ = evaluate(
                        sess, train, train_lab,
                        min(len(train), train_eval_size))

                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    val_accuracy, _ = evaluate(
                        sess, val, val_lab, min(len(val), val_eval_size))
                    print("Validation accuracy %f " % val_accuracy)

                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy

                        test_accuracy, last_test_x = evaluate(
                            sess, test, test_lab, len(test))
                        if last_test_x is not None:
                            x_test_batch = last_test_x

                        print("Test accuracy %f " % test_accuracy)

                        # Test accuracy at the best validation accuracy.
                        max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))

                # The original fed the last test batch here, which is
                # undefined when no test pass has run yet; fall back to the
                # last training batch in that case.
                feed_batch = x_test_batch
                if feed_batch is None:
                    feed_batch = x_batch
                if feed_batch is None:
                    print("No batch processed; embedding not saved.")
                else:
                    emb = sess.run(W_emb_, feed_dict={x_: feed_batch})
                    with open("yahoo_emb_max_300.p", "wb") as emb_file:
                        cPickle.dump([emb], emb_file)

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #2
0
def main():
    """Train the multi-label embedding classifier on MIMIC-3 and report AUC.

    Reads ``./data/mimic3.p`` (a pickled sequence: items 0/2 are the training
    sentences/labels, 3/5 validation, 6/8 test, 9 the index-to-word map and
    10 the word-to-index map) and the embedding matrix ``mimic3_emb.p``,
    builds the graph with ``emb_classifier`` and trains for
    ``opt.max_epochs`` epochs.

    Every ``opt.valid_freq`` updates, the model is scored on a training
    subset (accuracy), on the validation set (micro/macro AUC) and on the
    test set (micro/macro AUC, F1, P@5).  The test per-class mean AUC seen at
    the best validation per-class mean AUC is reported as "Max Test auc".
    A checkpoint is written to ``opt.save_path`` after every epoch.
    """
    # Upper bound on how many training examples the periodic check scores.
    train_eval_size = 500

    # Prepare training and testing data
    opt = Options()
    # load data
    loadpath = "./data/mimic3.p"
    embpath = "mimic3_emb.p"
    opt.num_class = 50

    # NOTE(review): unpickling can execute arbitrary code -- only load data
    # files that come from a trusted source.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)
    train, train_lab = x[0], x[2]
    val, val_lab = x[3], x[5]
    test, test_lab = x[6], x[8]
    wordtoix, ixtoword = x[10], x[9]
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)
    if opt.part_data:
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as emb_file:
            opt.W_emb = np.array(cPickle.load(emb_file), dtype='float32')
        opt.W_class_emb = load_class_embedding(wordtoix, opt)
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32,
                            shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        accuracy_, loss_, train_op, W_norm_, global_step, logits_, prob_ = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt, class_penalty_)

    def evaluate(sess, sents_all, labels_all, n_examples):
        """Score ``n_examples`` items without dropout or class penalty.

        Returns ``(accuracy, logits, probs, labels)`` where the last three
        are arrays stacked over all scored batches in the same row order.
        Accuracy is weighted by batch length and divided by the number of
        examples actually scored.
        """
        correct = 0.0
        seen = 0
        logits_rows = []
        prob_rows = []
        true_rows = []
        batches = get_minibatches_idx(n_examples,
                                      opt.batch_size,
                                      shuffle=True)
        for _, index in batches:
            batch_sents = [sents_all[t] for t in index]
            batch_labels = np.array([labels_all[t] for t in index])
            batch_labels = batch_labels.reshape(
                (len(batch_labels), opt.num_class))
            x_eval, x_eval_mask = prepare_data_for_emb(batch_sents, opt)
            batch_accuracy, batch_logits, batch_probs = sess.run(
                [accuracy_, logits_, prob_],
                feed_dict={
                    x_: x_eval,
                    x_mask_: x_eval_mask,
                    y_: batch_labels,
                    keep_prob: 1.0,
                    class_penalty_: 0.0
                })
            # Count every batch exactly once.
            correct += batch_accuracy * len(index)
            seen += len(index)
            logits_rows += batch_logits.tolist()
            prob_rows += batch_probs.tolist()
            true_rows += batch_labels.tolist()
        accuracy = correct / seen if seen else 0.0
        return (accuracy, np.asarray(logits_rows), np.asarray(prob_rows),
                np.asarray(true_rows))

    def mean_class_auc(true_array, score_array):
        """Average the per-class AUC over classes with a positive label.

        Returns NaN when no class has a positive example.
        """
        aucs = [
            roc_auc_score(y_true=true_array[:, i],
                          y_score=score_array[:, i])
            for i in range(opt.num_class)
            if np.any(true_array[:, i] > 0)
        ]
        return float(np.mean(aucs)) if aucs else float('nan')

    uidx = 0
    max_test_accuracy = 0.
    max_val_auc_mean = 0.
    max_test_auc_mean = 0.

    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
    )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                # Variable names carry a ":0" suffix that checkpoint keys
                # lack, hence the suffix juggling below.
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report the variables that were really restored, i.e. after
                # the shape filter.
                print("Loaded variables:" + str(ss_right_shape))

            except Exception as err:
                # Best effort: fall back to fresh weights, but do not swallow
                # KeyboardInterrupt/SystemExit and do not hide the reason.
                print("No saving session, using random initialization")
                print("Restore failed: " + str(err))
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train),
                                         opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape((len(x_labels), opt.num_class))

                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)
                    _, loss, _ = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    # Accuracy on a small training subset, clamped so a tiny
                    # training set does not index past its end.
                    train_accuracy, _, _, _ = evaluate(
                        sess, train, train_lab,
                        min(len(train), train_eval_size))

                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    _, val_logits_array, _, val_true_array = evaluate(
                        sess, val, val_lab, len(val))
                    val_auc_micro = roc_auc_score(y_true=val_true_array,
                                                  y_score=val_logits_array,
                                                  average='micro')
                    val_auc_macro = roc_auc_score(y_true=val_true_array,
                                                  y_score=val_logits_array,
                                                  average='macro')
                    val_auc_mean = mean_class_auc(val_true_array,
                                                  val_logits_array)

                    print("val auc macro %f micro %f " %
                          (val_auc_macro, val_auc_micro))

                    # Test metrics are computed and printed at every
                    # validation step, not only when validation improves.
                    (_, test_logits_array, test_prob_array,
                     test_true_array) = evaluate(sess, test, test_lab,
                                                 len(test))
                    test_auc_micro = roc_auc_score(
                        y_true=test_true_array,
                        y_score=test_logits_array,
                        average='micro')
                    test_auc_macro = roc_auc_score(
                        y_true=test_true_array,
                        y_score=test_logits_array,
                        average='macro')

                    test_f1_micro = micro_f1(
                        test_prob_array.ravel() > 0.5,
                        test_true_array.ravel(),
                    )
                    test_f1_macro = macro_f1(
                        test_prob_array > 0.5,
                        test_true_array,
                    )
                    test_p5 = precision_at_k(test_logits_array,
                                             test_true_array, 5)
                    test_auc_mean = mean_class_auc(test_true_array,
                                                   test_logits_array)

                    print("Test auc macro %f micro %f " %
                          (test_auc_macro, test_auc_micro))
                    print("Test f1 macro %f micro %f " %
                          (test_f1_macro, test_f1_micro))
                    print("P5 %f" % test_p5)

                    # Model selection on validation: remember the test AUC
                    # that goes with the best validation AUC so far.
                    if val_auc_mean > max_val_auc_mean:
                        max_val_auc_mean = val_auc_mean
                        max_test_auc_mean = test_auc_mean

                print("Epoch %d: Max Test auc %f" % (epoch, max_test_auc_mean))
                saver.save(sess, opt.save_path, global_step=epoch)
            # NOTE(review): max_test_accuracy is never updated in this
            # function, so this line always prints its initial value.
            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #3
0
def main(opt):
    """Train, validate and optionally run the sentence-pair discriminator.

    Loads the pickled data set ``opt.data_dir/opt.data_name`` (items 0 and 2
    are used as training and evaluation data, item 4 as the index-to-word
    map), builds the graph with ``cons_disc`` and trains for
    ``opt.max_epochs`` epochs (use ``max_epochs=0`` to skip training),
    saving a checkpoint to ``opt.save_path`` after each epoch.  When
    ``opt.test`` is set, the features ``H_1``/``H_2`` of fixed sentence pairs
    are appended to ``opt.log_path + '.feature.txt'``.

    Args:
        opt: options object.  ``opt.n_words`` is set here and ``opt.l_temp``
            is annealed in place at the start of every epoch.

    Raises:
        SystemExit: with status 1 when ``opt.restore`` is set and the
            checkpoint cannot be restored.
    """
    # Number of batches averaged for each periodic validation report.
    n_valid_batches = 100

    # Prepare training and testing data
    data_path = opt.data_dir + "/" + opt.data_name
    print('loading '+data_path)
    # NOTE(review): unpickling can execute arbitrary code -- only load data
    # files that come from a trusted source.
    with open(data_path, "rb") as data_file:
        x = cPickle.load(data_file)
    train, test = x[0], x[2]
    ixtoword = x[4]

    opt.n_words = len(ixtoword)
    print(datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    with tf.device('/gpu:1'):
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size,])
        l_temp_ = tf.placeholder(tf.float32, shape=[])
        res_, loss_, train_op = cons_disc(x_1_, x_2_, y_, opt, l_temp_)
        merged = tf.summary.merge_all()

    def build_pairs(sents, indice):
        """Build one batch of sentence pairs and their labels.

        ``indice`` holds one ``(first, second)`` position pair per row.
        For task 'L' both sentences come from the same row and the label
        says whether the two positions differ by an even amount.  For the
        other tasks the second sentence is taken from a permuted row in the
        first half of the batch and from the same row in the second half;
        the label says whether the row was left in place.
        """
        x_1 = [sents[row][pair[0]] for row, pair in enumerate(indice)]
        if opt.task == 'L':
            x_2 = [sents[row][pair[1]] for row, pair in enumerate(indice)]
            y_batch = [(i1 - i2) % 2 == 0 for i1, i2 in indice]
            return x_1, x_2, y_batch

        # Floor division keeps the half size an int on Python 3 as well.
        half = opt.batch_size // 2
        batch_indice = np.concatenate(
            [np.random.permutation(half), np.arange(half, opt.batch_size)])
        y_batch = (np.arange(opt.batch_size) == batch_indice)
        if opt.task == 'C':
            # The result was never used, but the draw is kept so the global
            # NumPy random stream is the same as before.
            np.random.choice(7, size=opt.batch_size)
        x_2 = [sents[batch_indice[row]][pair[1]]
               for row, pair in enumerate(indice)]
        return x_1, x_2, y_batch

    def make_feed(x_1, x_2, y_batch):
        """Pad both sentence lists and assemble the feed dictionary."""
        x_1_batch = prepare_data_for_cnn(x_1, opt)  # Batch L
        x_2_batch = prepare_data_for_cnn(x_2, opt)  # Batch L
        feed = {x_1_: x_1_batch, x_2_: x_2_batch,
                y_: np.float32(y_batch), l_temp_: opt.l_temp}
        return feed, x_1_batch, x_2_batch

    uidx = 0

    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.95

    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())

        if opt.restore:
            print('-'*20)
            print("Loading variables from '%s'." % opt.load_path)
            try:
                t_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                save_keys = tensors_key_in_file(opt.load_path)
                # Restore only variables that exist in the checkpoint (names
                # compared without the ":0" suffix) with a matching shape.
                ss = [var for var in t_vars if var.name[:-2] in save_keys.keys()]
                ss = [var.name for var in ss if var.get_shape() == save_keys[var.name[:-2]]]
                loader = tf.train.Saver(var_list=[var for var in t_vars if var.name in ss])
                loader.restore(sess, opt.load_path)
                print("Loaded variables:"+str(ss))
                print('-'*20)

            except Exception as e:
                # A failed restore is fatal; leave with a non-zero status so
                # callers and shell scripts can see the failure.
                print('Error: ' + str(e))
                sys.exit(1)

        # train
        # if don't want to train, set max_epochs=0

        for epoch in range(opt.max_epochs):
            print("Starting epoch %d" % epoch)
            opt.l_temp = min(opt.l_temp * opt.l_temp_factor, opt.l_temp_max)
            print("Annealing temperature " + str(opt.l_temp))
            kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
            for _, train_index in kf:
                uidx += 1
                sents = [train[t] for t in train_index]
                indice = [rand_pair(opt.task, opt.data_name) for _ in range(opt.batch_size)]
                x_1, x_2, y_batch = build_pairs(sents, indice)
                feed, _, _ = make_feed(x_1, x_2, y_batch)
                _, loss = sess.run([train_op, loss_], feed_dict=feed)

                if uidx % opt.print_freq == 1:
                    print("Iteration %d: loss %f " % (uidx, loss))
                    res = sess.run(res_, feed_dict=feed)
                    if opt.verbose:
                        print("logits:" + str(res['logits']))
                        print("H1:" + str(res['H_1'][0]))
                        print("H2:" + str(res['H_2'][0]))
                    # np.float was removed from NumPy; the builtin float is
                    # what that alias stood for.
                    acc = sum(np.equal(res['y_pred'], y_batch))/float(opt.batch_size)
                    print("Accuracy: %f" % (acc))
                    print("y_mean: %f" % (np.mean(y_batch)))
                    print("corr:" + str(res['corr']))

                    sys.stdout.flush()
                    summary = sess.run(merged, feed_dict=feed)
                    train_writer.add_summary(summary, uidx)

                if uidx % opt.valid_freq == 1:
                    acc, loss_val, y_mean, corr = 0, 0, 0, 0
                    # NOTE(review): the position pairs are drawn once and
                    # shared by all validation batches below, and the
                    # batches are sampled from `test` -- confirm both are
                    # intended.
                    indice = [rand_pair(opt.task, opt.data_name) for _ in range(opt.batch_size)]
                    for _ in range(n_valid_batches):
                        valid_index = np.random.choice(len(test), opt.batch_size)
                        sents = [test[t] for t in valid_index]
                        x_1, x_2, y_batch = build_pairs(sents, indice)
                        feed, _, _ = make_feed(x_1, x_2, y_batch)
                        loss_val += sess.run(loss_, feed_dict=feed)
                        res = sess.run(res_, feed_dict=feed)
                        acc += sum(np.equal(res['y_pred'], y_batch))/float(opt.batch_size)
                        y_mean += np.mean(y_batch)
                        corr += res['corr']

                    loss_val = loss_val / float(n_valid_batches)
                    acc = acc / float(n_valid_batches)
                    y_mean = y_mean / float(n_valid_batches)
                    corr = corr / float(n_valid_batches)
                    print("Validation loss %.4f " % (loss_val))
                    print("Validation accuracy: %.4f" % (acc))
                    print("Validation y_mean: %.4f" % (y_mean))
                    print("Validation corr: %.4f" % (corr))
                    print("")
                    sys.stdout.flush()

                    # The summary is computed on the last validation batch.
                    summary = sess.run(merged, feed_dict=feed)
                    test_writer.add_summary(summary, uidx)

            saver.save(sess, opt.save_path, global_step=epoch)

        # test

        if opt.test:
            print('Testing....')
            # One extra batch so the tail of the data is covered; indices
            # past the end wrap around via the modulo below.
            iter_num = len(test) // opt.batch_size + 1
            pair_positions = [(0, 1), (2, 3), (4, 5), (6, 7)]
            feature_path = opt.log_path + '.feature.txt'
            for it in range(iter_num):
                if it % 100 == 0:
                    print('Iter %i/%i' % (it, iter_num))
                test_index = range(it*opt.batch_size, (it+1)*opt.batch_size)
                test_sents = [test[t % len(test)] for t in test_index]
                for idx in pair_positions:
                    x_1 = [test_sents[row][idx[0]] for row in range(opt.batch_size)]
                    x_2 = [test_sents[row][idx[1]] for row in range(opt.batch_size)]
                    y_batch = [True for _ in range(opt.batch_size)]
                    feed, x_1_batch, x_2_batch = make_feed(x_1, x_2, y_batch)
                    res = sess.run(res_, feed_dict=feed)
                    # Open the output once per batch instead of once per row.
                    with open(feature_path, "a") as myfile:
                        for d in range(opt.batch_size):
                            # Index 0 is skipped when rebuilding the text
                            # (presumably padding -- TODO confirm).
                            myfile.write(str(test_index[d]) + "\t" + str(idx[0]) + "\t" + " ".join([ixtoword[tok] for tok in x_1_batch[d] if tok != 0]) + "\t" + " ".join(map(str, res['H_1'][d])) + "\n")
                            myfile.write(str(test_index[d]) + "\t" + str(idx[1]) + "\t" + " ".join([ixtoword[tok] for tok in x_2_batch[d] if tok != 0]) + "\t" + " ".join(map(str, res['H_2'][d])) + "\n")
Example #4
0
def main():
    """Train and evaluate the embedding classifier on ``opt.dataset``.

    Loads the pickled dataset and embedding matrix selected by
    ``opt.dataset``, builds the graph with ``emb_classifier``, optionally
    restores shape-compatible variables from ``opt.save_path`` and trains
    for ``opt.max_epochs`` epochs.  Every ``opt.valid_freq`` updates it
    reports accuracy on a 50-example training sample and on the validation
    set, dumps the current weights to ``weights.pkl`` and, when validation
    accuracy improves, evaluates the test set.  A checkpoint is written
    after every epoch.

    Raises:
        ValueError: if ``opt.dataset`` is not one of the supported names.
    """
    opt = Options()

    # Pick the data/embedding files and class metadata for the dataset.
    if opt.dataset == 'yahoo':
        loadpath = "./data/yahoo.p"
        embpath = "./data/yahoo_glove.p"
        opt.num_class = 10
        opt.class_name = [
            'Society Culture', 'Science Mathematics', 'Health',
            'Education Reference', 'Computers Internet', 'Sports',
            'Business Finance', 'Entertainment Music', 'Family Relationships',
            'Politics Government'
        ]
    elif opt.dataset == 'agnews':
        loadpath = "./data/ag_news.p"
        embpath = "./data/ag_news_glove.p"
        opt.num_class = 4
        opt.class_name = ['World', 'Sports', 'Business', 'Science']
    elif opt.dataset == 'dbpedia':
        loadpath = "./data/dbpedia.p"
        embpath = "./data/dbpedia_glove.p"
        opt.num_class = 14
        opt.class_name = [
            'Company',
            'Educational Institution',
            'Artist',
            'Athlete',
            'Office Holder',
            'Mean Of Transportation',
            'Building',
            'Natural Place',
            'Village',
            'Animal',
            'Plant',
            'Album',
            'Film',
            'Written Work',
        ]
    elif opt.dataset == 'yelp_full':
        loadpath = "./data/yelp_full.p"
        embpath = "./data/yelp_full_glove.p"
        opt.num_class = 5
        opt.class_name = ['worst', 'bad', 'middle', 'good', 'best']
    elif opt.dataset == 'Positive':
        loadpath = "./data/Positive.p"
        embpath = "./data/glove.p"
        opt.num_class = 2
        opt.class_name = ['Control', 'Other']
    elif opt.dataset == 'Negative':
        loadpath = "./data/Negative.p"
        embpath = "./data/glove.p"
        opt.num_class = 2
        opt.class_name = ['Control', 'Other']
    else:
        # An unknown name used to fall through and fail later on an unbound
        # ``loadpath``; fail here with an explicit message instead.
        raise ValueError('Unsupported dataset: %r' % (opt.dataset,))

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    ixtoword = x[7]
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)
    if opt.part_data:
        # Train on a random subset of size ``opt.portion`` * len(train).
        # The seed is intentionally not fixed, so the subset differs per run.
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as emb_file:
            opt.W_emb = np.array(cPickle.load(emb_file), dtype='float32')
    except IOError:
        # Without a pre-trained matrix the embedding cannot stay fixed.
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32,
                            shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        accuracy_, loss_, train_op, W_norm_, W_class, global_step = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt, class_penalty_)
    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    # Number of training examples used for the periodic train-accuracy probe.
    train_eval_size = 50

    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
    )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:

        def correct_count(data, labels, num_examples):
            """Return the sum of (batch accuracy * batch size) over minibatches.

            Minibatch indices come from ``get_minibatches_idx(num_examples,
            opt.batch_size, shuffle=True)``; evaluation runs with dropout
            disabled (``keep_prob`` 1.0) and a zero class penalty.
            """
            correct = 0.0
            batches = get_minibatches_idx(num_examples,
                                          opt.batch_size,
                                          shuffle=True)
            for _, index in batches:
                batch_sents = [data[t] for t in index]
                batch_labels = np.array([labels[t] for t in index])
                batch_labels = batch_labels.reshape(
                    (len(batch_labels), opt.num_class))
                x_eval, x_eval_mask = prepare_data_for_emb(batch_sents, opt)
                accuracy = sess.run(accuracy_,
                                    feed_dict={
                                        x_: x_eval,
                                        x_mask_: x_eval_mask,
                                        y_: batch_labels,
                                        keep_prob: 1.0,
                                        class_penalty_: 0.0
                                    })
                correct += accuracy * len(index)
            return correct

        # The writers record the graph on creation; this function adds no
        # further summaries to them.
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report what was actually restored (shape-matched names).
                print("Loaded variables:" + str(ss_right_shape))

            except Exception as e:
                # Best effort: fall back to fresh weights, but say why.
                print('Error: ' + str(e))
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)

                kf = get_minibatches_idx(len(train),
                                         opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape((len(x_labels), opt.num_class))

                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)
                    _, loss, _ = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    # Estimate training accuracy on a small sample.
                    train_accuracy = correct_count(
                        train, train_lab, train_eval_size) / train_eval_size

                    with open("weights.pkl", "wb") as handle:
                        pickle.dump(sess.run(W_norm_), handle)
                        pickle.dump(sess.run(W_class), handle)

                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    # The +0.1 keeps the division defined for an empty split
                    # (presumably the original intent); it slightly lowers
                    # the reported value.
                    val_accuracy = correct_count(
                        val, val_lab, len(val)) / (len(val) + 0.1)
                    print("Validation accuracy %f " % val_accuracy)

                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy

                        # Only score the test set at a new validation best.
                        test_accuracy = correct_count(
                            test, test_lab, len(test)) / (len(test) + 0.1)
                        print("Test accuracy %f " % test_accuracy)
                        max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))
                saver.save(sess, opt.save_path, global_step=epoch)

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #5
0
def main():
    """Train the embedding classifier, then score and export test predictions.

    Loads the pickled dataset and embedding matrix selected by
    ``opt.dataset``, builds the graph with ``emb_classifier``, optionally
    restores shape-compatible variables from ``opt.save_path`` and trains
    for ``opt.max_epochs`` epochs, reporting accuracy on a 500-example
    training sample and on the validation set every ``opt.valid_freq``
    updates.  After training it evaluates the test set in order, keeps the
    ``opt.topnlabel`` largest class probabilities of each document and
    writes them to the file ``test`` (first line: number of test documents,
    then one space-separated row per document).

    Raises:
        ValueError: if ``opt.dataset`` is not one of the supported names.
    """
    opt = Options()

    # Pick the data/embedding files and class metadata for the dataset.
    if opt.dataset == 'Tweet':
        loadpath = "./data/langdetect_tweet0.7.p"
        embpath = "./data/langdetect_tweet_emb.p"
        opt.num_class = 4
        opt.class_name = ['apple', 'google', 'microsoft', 'twitter']
    elif opt.dataset == 'N20short':
        loadpath = "./data/N20short.p"
        embpath = "./data/N20short_emb.p"
        opt.class_name = [
            'rec.autos', 'talk.politics.misc', 'sci.electronics',
            'comp.sys.ibm.pc.hardware', 'talk.politics.guns', 'sci.med',
            'rec.motorcycles', 'soc.religion.christian',
            'comp.sys.mac.hardware', 'comp.graphics', 'sci.space',
            'alt.atheism', 'rec.sport.baseball', 'comp.windows.x',
            'talk.religion.misc', 'comp.os.ms-windows.misc', 'misc.forsale',
            'talk.politics.mideast', 'sci.crypt', 'rec.sport.hockey'
        ]
        opt.num_class = len(opt.class_name)
    elif opt.dataset == 'agnews':
        loadpath = "./data/ag_news.p"
        embpath = "./data/ag_news_glove.p"
        opt.num_class = 4
        opt.class_name = ['World', 'Sports', 'Business', 'Science']
    elif opt.dataset == 'dbpedia':
        loadpath = "./data/dbpedia.p"
        embpath = "./data/dbpedia_glove.p"
        opt.num_class = 14
        opt.class_name = [
            'Company',
            'Educational Institution',
            'Artist',
            'Athlete',
            'Office Holder',
            'Mean Of Transportation',
            'Building',
            'Natural Place',
            'Village',
            'Animal',
            'Plant',
            'Album',
            'Film',
            'Written Work',
        ]
    elif opt.dataset == 'yelp_full':
        loadpath = "./data/yelp_full.p"
        embpath = "./data/yelp_full_glove.p"
        opt.num_class = 5
        opt.class_name = ['worst', 'bad', 'middle', 'good', 'best']
    else:
        # An unknown name used to fall through and fail later on an unbound
        # ``loadpath``; fail here with an explicit message instead.
        raise ValueError('Unsupported dataset: %r' % (opt.dataset,))

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file, encoding='iso-8859-1')
    train, val, test = x[0], x[1], x[2]
    print(len(val))
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]
    del x
    print("len of train,val,test:", len(train), len(val), len(test))
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)
    if opt.part_data:
        # Train on a random subset of size ``opt.portion`` * len(train).
        # The seed is intentionally not fixed, so the subset differs per run.
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as emb_file:
            opt.W_emb = np.array(cPickle.load(emb_file,
                                              encoding='iso-8859-1'),
                                 dtype='float32')
        opt.W_class_emb = load_class_embedding(wordtoix, opt)
    except IOError:
        # Without a pre-trained matrix the embedding cannot stay fixed.
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/cpu:0'):
        x_ = tf.placeholder(tf.int32,
                            shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        accuracy_, loss_, train_op, W_norm_, global_step, prob_ = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt, class_penalty_)
    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    # Number of training examples used for the periodic train-accuracy probe.
    train_eval_size = 500

    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
    )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:

        def correct_count(data, labels, num_examples):
            """Return the sum of (batch accuracy * batch size) over minibatches.

            Minibatch indices come from ``get_minibatches_idx(num_examples,
            opt.batch_size, shuffle=True)``; evaluation runs with dropout
            disabled (``keep_prob`` 1.0) and a zero class penalty.
            """
            correct = 0.0
            batches = get_minibatches_idx(num_examples,
                                          opt.batch_size,
                                          shuffle=True)
            for _, index in batches:
                batch_sents = [data[t] for t in index]
                batch_labels = np.array([labels[t] for t in index])
                batch_labels = batch_labels.reshape(
                    (len(batch_labels), opt.num_class))
                x_eval, x_eval_mask = prepare_data_for_emb(batch_sents, opt)
                accuracy = sess.run(accuracy_,
                                    feed_dict={
                                        x_: x_eval,
                                        x_mask_: x_eval_mask,
                                        y_: batch_labels,
                                        keep_prob: 1.0,
                                        class_penalty_: 0.0
                                    })
                correct += accuracy * len(index)
            return correct

        # The writers record the graph on creation; this function adds no
        # further summaries to them.
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report what was actually restored (shape-matched names).
                print("Loaded variables:" + str(ss_right_shape))

            except Exception as e:
                # Best effort: fall back to fresh weights, but say why.
                print('Error: ' + str(e))
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train),
                                         opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape((len(x_labels), opt.num_class))
                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)
                    _, loss, _ = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    # Estimate training accuracy on a small sample.
                    train_accuracy = correct_count(
                        train, train_lab, train_eval_size) / train_eval_size

                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    val_accuracy = correct_count(val, val_lab,
                                                 len(val)) / len(val)
                    print("Validation accuracy %f " % val_accuracy)

                    # The test set is scored once, after training finishes.
                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy

                saver.save(sess, opt.save_path, global_step=epoch)
                saver.save(sess, "save_model/model.ckpt")

            test_correct = 0.0

            # shuffle=False keeps the exported rows in test-set order.
            kf_test = get_minibatches_idx(len(test),
                                          opt.batch_size,
                                          shuffle=False)
            for _, test_index in kf_test:
                test_sents = [test[t] for t in test_index]
                test_labels = np.array([test_lab[t] for t in test_index])
                test_labels = test_labels.reshape(
                    (len(test_labels), opt.num_class))
                x_test_batch, x_test_batch_mask = prepare_data_for_emb(
                    test_sents, opt)

                test_accuracy, predict_prob = sess.run(
                    [accuracy_, prob_],
                    feed_dict={
                        x_: x_test_batch,
                        x_mask_: x_test_batch_mask,
                        y_: test_labels,
                        keep_prob: 1.0,
                        class_penalty_: 0.0
                    })

                for prob in predict_prob:
                    # Keep the opt.topnlabel largest probabilities at their
                    # class positions and leave every other class at 0.
                    topnlabel_onedoc = [0] * opt.num_class
                    for _ in range(opt.topnlabel):
                        index_label = np.argwhere(prob == max(prob))
                        topnlabel_onedoc[index_label[0]
                                         [0]] = prob[index_label][0][0]
                        # Mask the picked entries so the next pass finds the
                        # next-largest probability.
                        prob[index_label] = -1
                    # NOTE(review): topnlabel_docwithoutlabel is not defined
                    # in this function; presumably a module-level list.
                    # Confirm it exists and is empty before main() runs.
                    topnlabel_docwithoutlabel.append(topnlabel_onedoc)
                test_correct += test_accuracy * len(test_index)
            print(topnlabel_docwithoutlabel)
            test_accuracy = test_correct / len(test)
            print("Predict accuracy %f " % test_accuracy)

            max_test_accuracy = test_accuracy

            filename = 'test'
            # ``with`` guarantees the export is flushed and closed.
            with open(filename, 'w') as out_file:
                out_file.write(str(len(test)))
                out_file.write('\n')
                for topic_prob in topnlabel_docwithoutlabel:
                    print(topic_prob)
                    out_file.write("".join(
                        str(prob_each_label) + " "
                        for prob_each_label in topic_prob))
                    out_file.write('\n')

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #6
0
def main():
    # Prepare training and testing data

    loadpath = "./data/"

    src_file = loadpath + "Pairs2M.src.num"
    tgt_file = loadpath + "Pairs2M.tgt.num"
    dic_file = loadpath + "Pairs2M.reddit.dic"

    opt = Options()
    opt_t = Options()

    train, val, test, wordtoix, ixtoword = read_pair_data_full(
        src_file,
        tgt_file,
        dic_file,
        max_num=opt.data_size,
        p_f=loadpath + 'demo.p')
    train = [
        x for x in train
        if 2 < len(x[1]) < opt.maxlen - 4 and 2 < len(x[0]) < opt_t.maxlen - 4
    ]
    val = [
        x for x in val
        if 2 < len(x[1]) < opt.maxlen - 4 and 2 < len(x[0]) < opt_t.maxlen - 4
    ]

    if TEST_FLAG:
        test = [test]
        opt.test_freq = 1

    opt.n_words = len(ixtoword)
    opt_t.n_words = len(ixtoword)
    print dict(opt)
    if opt.model == 'cnn_rnn':
        opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1
        opt_t.update_params()
        print dict(opt_t)

    print('Total words: %d' % opt.n_words)

    # load w2v
    if os.path.exists(opt.embedding_path_lime):
        with open(opt.embedding_path_lime, 'rb') as pfile:
            embedding = cPickle.load(pfile)
    else:
        w2v = gensim.models.KeyedVectors.load_word2vec_format(
            opt.embedding_path, binary=True)
        embedding = {
            i: copy.deepcopy(w2v[ixtoword[i]])
            for i in range(opt.n_words) if ixtoword[i] in w2v
        }
        with open(opt.embedding_path_lime, 'wb') as pfile:
            cPickle.dump(embedding, pfile, protocol=cPickle.HIGHEST_PROTOCOL)

    for d in ['/gpu:0']:
        with tf.device(d):
            src_ = tf.placeholder(tf.int32,
                                  shape=[opt.batch_size, opt.sent_len])
            tgt_ = tf.placeholder(tf.int32,
                                  shape=[opt_t.batch_size, opt_t.sent_len])
            res_, gan_cost_d_, train_op_d, gan_cost_g_, train_op_g = dialog_gan(
                src_, tgt_, opt, opt_t)
            merged = tf.summary.merge_all()

    uidx = 0
    graph_options = tf.GraphOptions(build_cost_model=1)
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True,
                            graph_options=graph_options)
    config.gpu_options.per_process_gpu_memory_fraction = 0.95

    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    run_metadata = tf.RunMetadata()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:

                t_vars = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES)  #tf.trainable_variables()

                if opt.load_from_ae:
                    save_keys = tensors_key_in_file(
                        opt.load_path)  #t_var g_W:0    key: W
                    ss = [
                        var for var in t_vars
                        if var.name[2:][:-2] in save_keys.keys()
                    ]
                    ss = [
                        var.name[2:] for var in ss
                        if var.get_shape() == save_keys[var.name[2:][:-2]]
                    ]
                    cc = {
                        var.name[2:][:-2]: var
                        for var in t_vars if var.name[2:] in ss
                    }

                    loader = tf.train.Saver(var_list=cc)
                    loader.restore(sess, opt.load_path)

                    print("Loading variables from '%s'." % opt.load_path)
                    print(
                        "Loaded variables:" + " ".join(
                            [var.name
                             for var in t_vars if var.name[2:] in ss]))
                else:
                    save_keys = tensors_key_in_file(opt.load_path)
                    ss = [
                        var for var in t_vars
                        if var.name[:-2] in save_keys.keys()
                    ]
                    ss = [
                        var.name for var in ss
                        if var.get_shape() == save_keys[var.name[:-2]]
                    ]
                    loader = tf.train.Saver(
                        var_list=[var for var in t_vars if var.name in ss])
                    loader.restore(sess, opt.load_path)
                    print("Loading variables from '%s'." % opt.load_path)
                    print("Loaded variables:" + str(ss))
                    # load reverse model
                    try:
                        save_keys = tensors_key_in_file('./save/rev_model')
                        ss = [
                            var for var in t_vars
                            if var.name[:-2] in save_keys.keys()
                            and 'g_rev_' in var.name
                        ]
                        ss = [
                            var.name for var in ss
                            if var.get_shape() == save_keys[var.name[:-2]]
                        ]
                        loader = tf.train.Saver(
                            var_list=[var for var in t_vars if var.name in ss])
                        loader.restore(sess, './save/rev_model')
                        print(
                            "Loading reverse variables from ./save/rev_model")
                        print("Loaded variables:" + str(ss))
                    except Exception as e:
                        print("No reverse model loaded")

            except Exception as e:
                print 'Error: ' + str(e)
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())
        loss_d, loss_g = 0, 0
        for epoch in range(opt.max_epochs):
            print("Starting epoch %d" % epoch)
            kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
            for _, train_index in kf:

                uidx += 1

                if uidx % opt.test_freq == 1:
                    iter_num = np.int(np.floor(len(test) / opt.batch_size)) + 1
                    res_all, test_tgt_all = [], []

                    for i in range(iter_num):
                        test_index = range(i * opt.batch_size,
                                           (i + 1) * opt.batch_size)
                        test_tgt, test_src = zip(
                            *[test[t % len(test)] for t in test_index])
                        test_tgt_all.extend(test_tgt)
                        x_batch = prepare_data_for_cnn(test_src, opt)
                        y_batch = prepare_data_for_rnn(
                            test_tgt, opt_t, is_add_GO=False
                        ) if opt.model == 'cnn_rnn' else prepare_data_for_cnn(
                            test_tgt, opt_t)
                        feed = {src_: x_batch, tgt_: y_batch}
                        res = sess.run(res_, feed_dict=feed)
                        res_all.extend(res['syn_sent'])

                    test_set = [prepare_for_bleu(s) for s in test_tgt_all]
                    gen = [prepare_for_bleu(s) for s in res_all]
                    [bleu1s, bleu2s, bleu3s,
                     bleu4s] = cal_BLEU_4(gen, {0: test_set},
                                          is_corpus=opt.is_corpus)
                    [rouge1, rouge2, rouge3, rouge4, rougeL,
                     rouges] = cal_ROUGE(gen, {0: test_set},
                                         is_corpus=opt.is_corpus)
                    etp_score, dist_score = cal_entropy(gen)
                    bleu_nltk = cal_BLEU_4_nltk(gen,
                                                test_set,
                                                is_corpus=opt.is_corpus)
                    rel_score = cal_relevance(gen, test_set, embedding)

                    print 'Test BLEU: ' + ' '.join([
                        str(round(it, 3))
                        for it in (bleu_nltk, bleu1s, bleu2s, bleu3s, bleu4s)
                    ])
                    print 'Test Rouge: ' + ' '.join([
                        str(round(it, 3))
                        for it in (rouge1, rouge2, rouge3, rouge4)
                    ])
                    print 'Test Entropy: ' + ' '.join([
                        str(round(it, 3))
                        for it in (etp_score[0], etp_score[1], etp_score[2],
                                   etp_score[3])
                    ])
                    print 'Test Diversity: ' + ' '.join([
                        str(round(it, 3))
                        for it in (dist_score[0], dist_score[1], dist_score[2],
                                   dist_score[3])
                    ])
                    print 'Test Relevance(G,A,E): ' + ' '.join([
                        str(round(it, 3))
                        for it in (rel_score[0], rel_score[1], rel_score[2])
                    ])
                    print 'Test Avg. length: ' + str(
                        round(
                            np.mean([
                                len([y for y in x if y != 0]) for x in res_all
                            ]), 3))
                    print ''

                    if TEST_FLAG:
                        exit()

                tgt, src = zip(*[train[t] for t in train_index])
                x_batch = prepare_data_for_cnn(src, opt)  # Batch L

                y_batch = prepare_data_for_rnn(
                    tgt, opt_t, is_add_GO=False
                ) if opt.model == 'cnn_rnn' else prepare_data_for_cnn(
                    tgt, opt_t)

                feed = {src_: x_batch, tgt_: y_batch}

                if uidx % opt.d_freq == 1:
                    if profile:
                        _, loss_d = sess.run(
                            [train_op_d, gan_cost_d_],
                            feed_dict=feed,
                            options=tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE),
                            run_metadata=run_metadata)
                    else:
                        _, loss_d = sess.run([train_op_d, gan_cost_d_],
                                             feed_dict=feed)

                if uidx % opt.g_freq == 1:
                    if profile:
                        _, loss_g = sess.run(
                            [train_op_g, gan_cost_g_],
                            feed_dict=feed,
                            options=tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE),
                            run_metadata=run_metadata)
                    else:
                        _, loss_g = sess.run([train_op_g, gan_cost_g_],
                                             feed_dict=feed)

                if profile:
                    tf.contrib.tfprof.model_analyzer.print_model_analysis(
                        tf.get_default_graph(),
                        run_meta=run_metadata,
                        tfprof_options=tf.contrib.tfprof.model_analyzer.
                        PRINT_ALL_TIMING_MEMORY)
                    exit(0)

                if uidx % opt.valid_freq == 1:
                    VALID_SIZE = 1024
                    valid_multiplier = np.int(
                        np.floor(VALID_SIZE / opt.batch_size))
                    res_all, val_tgt_all, loss_val_d_all, loss_val_g_all = [], [], [], []
                    for val_step in range(valid_multiplier):
                        valid_index = np.random.choice(len(val),
                                                       opt.batch_size)
                        val_tgt, val_src = zip(*[val[t] for t in valid_index])
                        val_tgt_all.extend(val_tgt)
                        x_val_batch = prepare_data_for_cnn(val_src,
                                                           opt)  # Batch L

                        y_val_batch = prepare_data_for_rnn(
                            val_tgt, opt_t, is_add_GO=False
                        ) if opt.model == 'cnn_rnn' else prepare_data_for_cnn(
                            val_tgt, opt_t)

                        feed_val = {src_: x_val_batch, tgt_: y_val_batch}
                        loss_val_d, loss_val_g = sess.run(
                            [gan_cost_d_, gan_cost_g_], feed_dict=feed_val)
                        loss_val_d_all.append(loss_val_d)
                        loss_val_g_all.append(loss_val_g)
                        res = sess.run(res_, feed_dict=feed_val)
                        res_all.extend(res['syn_sent'])

                    print("Validation: loss D %f loss G %f " %
                          (np.mean(loss_val_d_all), np.mean(loss_val_g_all)))
                    #print "Val Perm :" + " ".join([ixtoword[x] for x in val_src_permutated[0] if x != 0])
                    print "Val Source:" + u' '.join([
                        ixtoword[x] for x in val_src[0] if x != 0
                    ]).encode('utf-8').strip()
                    print "Val Target :" + u' '.join([
                        ixtoword[x] for x in val_tgt[0] if x != 0
                    ]).encode('utf-8').strip()
                    print "Val Generated:" + u' '.join([
                        ixtoword[x] for x in res['syn_sent'][0] if x != 0
                    ]).encode('utf-8').strip()
                    print ""

                    val_set = [prepare_for_bleu(s) for s in val_tgt_all]
                    gen = [prepare_for_bleu(s) for s in res_all]

                    [bleu1s, bleu2s, bleu3s,
                     bleu4s] = cal_BLEU_4(gen, {0: val_set},
                                          is_corpus=opt.is_corpus)
                    [rouge1, rouge2, rouge3, rouge4, rougeL,
                     rouges] = cal_ROUGE(gen, {0: val_set},
                                         is_corpus=opt.is_corpus)
                    etp_score, dist_score = cal_entropy(gen)
                    bleu_nltk = cal_BLEU_4_nltk(gen,
                                                val_set,
                                                is_corpus=opt.is_corpus)
                    rel_score = cal_relevance(gen, val_set, embedding)

                    print 'Val BLEU: ' + ' '.join([
                        str(round(it, 3))
                        for it in (bleu_nltk, bleu1s, bleu2s, bleu3s, bleu4s)
                    ])
                    print 'Val Rouge: ' + ' '.join([
                        str(round(it, 3))
                        for it in (rouge1, rouge2, rouge3, rouge4)
                    ])
                    print 'Val Entropy: ' + ' '.join([
                        str(round(it, 3))
                        for it in (etp_score[0], etp_score[1], etp_score[2],
                                   etp_score[3])
                    ])
                    print 'Val Diversity: ' + ' '.join([
                        str(round(it, 3))
                        for it in (dist_score[0], dist_score[1], dist_score[2],
                                   dist_score[3])
                    ])
                    print 'Val Relevance(G,A,E): ' + ' '.join([
                        str(round(it, 3))
                        for it in (rel_score[0], rel_score[1], rel_score[2])
                    ])
                    print 'Val Avg. length: ' + str(
                        round(
                            np.mean([
                                len([y for y in x if y != 0]) for x in res_all
                            ]), 3))
                    print ""
                    summary = sess.run(merged, feed_dict=feed_val)
                    summary2 = tf.Summary(value=[
                        tf.Summary.Value(tag="bleu-2", simple_value=bleu2s),
                        tf.Summary.Value(tag="rouge-2", simple_value=rouge2),
                        tf.Summary.Value(tag="etp-4",
                                         simple_value=etp_score[3])
                    ])

                    test_writer.add_summary(summary, uidx)
                    test_writer.add_summary(summary2, uidx)

                if uidx % opt.print_freq == 1:
                    print("Iteration %d: loss D %f loss G %f" %
                          (uidx, loss_d, loss_g))

                    res = sess.run(res_, feed_dict=feed)

                    if opt.grad_penalty:
                        print "grad_penalty: " + str(res['gp'])
                    print "Source:" + u' '.join([
                        ixtoword[x] for x in x_batch[0] if x != 0
                    ]).encode('utf-8').strip()
                    print "Target:" + u' '.join([
                        ixtoword[x] for x in y_batch[0] if x != 0
                    ]).encode('utf-8').strip()
                    print "Generated:" + u' '.join([
                        ixtoword[x] for x in res['syn_sent'][0] if x != 0
                    ]).encode('utf-8').strip()
                    print ""

                    sys.stdout.flush()
                    summary = sess.run(merged, feed_dict=feed)
                    train_writer.add_summary(summary, uidx)

                if uidx % opt.save_freq == 1:
                    saver.save(sess, opt.save_path)
Example #7
0
def main():
    """Train and evaluate the sentence-pair classifier.

    Steps, in order:

    1. Build the vocabulary from ``data/atec_nlp_sim_train2.csv`` and load the
       cached train/valid/test split from ``./cache_SWEM_1``.
    2. Optionally subsample the training set and load a saved embedding
       matrix.
    3. Build the graph through ``auto_encoder`` and, when ``opt.restore`` is
       set, restore every saved variable whose name and shape match.
    4. Train for ``opt.max_epochs`` epochs. After each epoch ``do_eval`` is
       run, the per-label weights are refreshed from its result, and a
       checkpoint is written when both accuracy and F1 improve.
    5. Evaluate once on the test split and print the metrics.

    A ``KeyboardInterrupt`` during training is caught so the run ends cleanly.
    """
    opt = Options()
    (vocabulary_word2index, vocabulary_index2word, vocabulary_label2index,
     vocabulary_index2label) = create_vocabulary(
         "data/atec_nlp_sim_train2.csv",
         opt.vocab_size,
         name_scope=opt.name_scope,
         tokenize_style=opt.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    # Pickle streams are binary data: the file must be opened in "rb" mode,
    # otherwise pickle.load fails on Python 3 (and on Windows under Python 2).
    with open("./cache_SWEM_1/train_valid_test.pik", "rb") as f:
        train, valid, test, true_label_percent = pickle.load(f)
    train_q, train_a, _, train_lab = train
    print("train_nums:", len(train_q))
    # NOTE(review): the validation split is unpacked but never used below;
    # the per-epoch evaluation currently runs on the training split.
    val_q, val_a, _, val_lab = valid
    test_q, test_a, _, test_lab = test
    ixtoword = vocabulary_index2word

    opt.n_words = len(ixtoword)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # When opt.part_data is set, keep only a random subset of the training
    # examples whose size is opt.portion of the original. The seed is fixed
    # so the same subset is drawn on every run.
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Use the saved embedding matrix only if its shape agrees with the
    # vocabulary size and embedding width; otherwise fall back to a trainable
    # embedding (fix_emb = False).
    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params[0].shape) + ' opt: ' +
                  str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:0'):
        # Every training example is a pair of sentences, so the token and
        # mask placeholders are declared in pairs.
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        # auto_encoder builds the model and hands back the tensors/ops that
        # are later passed to sess.run.
        accuracy_, loss_, train_op_, W_emb, logits_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()

    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 for a zero denominator."""
        denominator = float(denominator)
        return float(numerator) / denominator if denominator else 0.0

    def do_eval(sess, train_q, train_a, train_lab):
        """Score the model on one data split without updating it.

        Args:
            sess: session holding the current variable values.
            train_q: first sentences of the pairs to evaluate.
            train_a: second sentences of the pairs to evaluate.
            train_lab: integer class labels, parallel to the sentences.

        Returns:
            Tuple ``(mean_loss, mean_accuracy, f1, precision, recall,
            weights_label)``. Loss and accuracy are averaged over batches.
            Precision, recall and F1 are 0.0 when their denominator is zero
            (for example when nothing was predicted positive).
        """
        eval_loss, eval_accc, eval_counter = 0.0, 0.0, 0
        eval_true_positive, eval_false_positive = 0, 0
        eval_true_negative, eval_false_negative = 0, 0
        weights_label = {}  # weight_label[label_index]=(number,correct)
        # Evaluation feeds a weight of one for every example in the batch.
        weights = np.ones((opt.batch_size))
        kf_train = get_minibatches_idx(len(train_q),
                                       opt.batch_size,
                                       shuffle=True)
        for _, train_index in kf_train:
            train_sents_1 = [train_q[t] for t in train_index]
            train_sents_2 = [train_a[t] for t in train_index]
            train_labels_array = np.array(
                [train_lab[t] for t in train_index])
            # One-hot encode by indexing rows of an identity matrix.
            train_labels = np.eye(opt.category)[train_labels_array]
            x_train_batch_1, x_train_mask_1 = prepare_data_for_emb(
                train_sents_1, opt)
            x_train_batch_2, x_train_mask_2 = prepare_data_for_emb(
                train_sents_2, opt)

            curr_eval_loss, curr_accc, logits = sess.run(
                [loss_, accuracy_, logits_],
                feed_dict={
                    x_1_: x_train_batch_1,
                    x_2_: x_train_batch_2,
                    x_mask_1_: x_train_mask_1,
                    x_mask_2_: x_train_mask_2,
                    y_: train_labels,
                    opt.weights_label: weights,
                    keep_prob: 1.0  # no dropout while evaluating
                })
            (true_positive, false_positive, true_negative,
             false_negative) = compute_confuse_matrix(logits, train_labels)
            # Accumulate per-batch values; they are normalised by the batch
            # count once the loop is done.
            eval_loss += curr_eval_loss
            eval_accc += curr_accc
            eval_counter += 1
            weights_label = compute_labels_weights(
                weights_label, logits, train_labels_array)
            eval_true_positive += true_positive
            eval_false_positive += false_positive
            eval_true_negative += true_negative
            eval_false_negative += false_negative
        print("true_positive:", eval_true_positive, ";false_positive:",
              eval_false_positive, ";true_negative:", eval_true_negative,
              ";false_negative:", eval_false_negative)
        # Guarded divisions: an early model may predict no positives at all,
        # which previously raised ZeroDivisionError and aborted training.
        p = _ratio(eval_true_positive,
                   eval_true_positive + eval_false_positive)
        r = _ratio(eval_true_positive,
                   eval_true_positive + eval_false_negative)
        f1_score = _ratio(2 * p * r, p + r)
        print("eval_counter:", eval_counter, ";eval_acc:", eval_accc)
        return (_ratio(eval_loss, eval_counter),
                _ratio(eval_accc, eval_counter), f1_score, p, r,
                weights_label)

    # NOTE(review): never updated below, so the interrupt handler always
    # reports 0 -- confirm whether a running test accuracy should be tracked.
    max_test_accuracy = 0.
    weights_dict = init_weights_dict(
        vocabulary_label2index)  # init weights dict.
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:  # start from previously saved parameters
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)

                graph_vars = {var.name: var for var in t_vars}
                # Variables present both in the graph and in the checkpoint.
                common = set(graph_vars) & {
                    key + ":0" for key in save_keys.keys()
                }
                # Only restore variables whose saved shape matches the graph.
                restorable = {
                    name for name in common
                    if graph_vars[name].get_shape() == save_keys[name[:-2]]
                }

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in restorable
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report what was actually restored, not merely name-matched.
                print("Loaded variables:" + str(restorable))

            except Exception as e:
                # Best effort: fall back to fresh weights, but say why and do
                # not swallow KeyboardInterrupt/SystemExit.
                print("Error: " + str(e))
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            best_acc = 0
            best_f1_score = 0
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                loss, acc, uidx = 0.0, 0.0, 0
                # Fresh random minibatch partition for every epoch.
                kf = get_minibatches_idx(len(train_q),
                                         opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    # Gather this batch's examples by index.
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels_array = np.array(
                        [train_lab[t] for t in train_index])
                    # One-hot encode by indexing rows of an identity matrix
                    # (a plain reshape cannot turn (n,) into (n, category)).
                    x_labels = np.eye(opt.category)[x_labels_array]

                    # prepare_data_for_emb returns a (batch, mask) pair;
                    # presumably padded index sequences plus a padding mask
                    # -- TODO confirm against its definition.
                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(
                        sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(
                        sents_2, opt)
                    weights = get_weights_for_current_batch(
                        list(x_labels_array), weights_dict)

                    _, curr_loss, curr_accuracy = sess.run(
                        [train_op_, loss_, accuracy_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            opt.weights_label: weights,
                            keep_prob: opt.dropout_ratio
                        })
                    loss, acc = loss + curr_loss, acc + curr_accuracy
                    if uidx % 100 == 0:
                        print(
                            "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\t" %
                            (epoch, uidx, loss / float(uidx),
                             acc / float(uidx)))

                if epoch % 1 == 0:  # evaluation interval, in epochs
                    # NOTE(review): this "validation" pass is fed the
                    # training split; the original author flagged the
                    # do_eval arguments as still to be changed.
                    (eval_loss, eval_accc, f1_scoree, precision, recall,
                     weights_label) = do_eval(sess, train_q, train_a,
                                              train_lab)
                    weights_dict = get_weights_label_as_standard_dict(
                        weights_label)
                    print(
                        "【Validation】Epoch %d\t Loss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                        % (epoch, eval_loss, eval_accc, f1_scoree, precision,
                           recall))
                    # Save a checkpoint only when both metrics improve.
                    if eval_accc > best_acc and f1_scoree > best_f1_score:
                        save_path = opt.ckpt_dir + "/model.ckpt"
                        print("going to save model. eval_f1_score:", f1_scoree,
                              ";previous best f1 score:", best_f1_score,
                              ";eval_acc", str(eval_accc),
                              ";previous best_acc:", str(best_acc))
                        saver.save(sess, save_path, global_step=epoch)
                        best_acc = eval_accc
                        best_f1_score = f1_scoree
            (test_loss, acc_t, f1_score_t, precision, recall,
             weights_label) = do_eval(sess, test_q, test_a, test_lab)
            print(
                "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:"
                % (test_loss, acc_t, f1_score_t, precision, recall))

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #8
0
def main():
    """Train and evaluate the sentence-pair classifier on the SNLI data.

    Steps, in order:

    1. Load the pickled splits from ``./data/snli.p`` and convert the label
       lists to ``float32`` arrays.
    2. Optionally keep only a random ``opt.portion`` of the training set
       (``opt.part_data``).
    3. Try to load a saved embedding matrix from ``./data/snli_emb.p``; on a
       shape mismatch or a missing file ``opt.fix_emb`` is switched off.
    4. Build the graph through ``auto_encoder`` and run ``opt.max_epochs``
       epochs.  Every ``opt.valid_freq`` updates the accuracy is measured on
       a slice of the training set and on the validation set; the test set is
       scored only when the validation accuracy reaches a new maximum.

    Ctrl-C during training is caught and the best test accuracy so far is
    printed.
    """
    loadpath = "./data/snli.p"
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # you trust.  The context manager closes the handle deterministically.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)

    train, val, test = x[0], x[1], x[2]
    wordtoix, ixtoword = x[4], x[5]

    # Each split is indexed as (first sentences, second sentences, labels).
    train_q, train_a, train_lab = train[0], train[1], train[2]
    val_q, val_a, val_lab = val[0], val[1], val[2]
    test_q, test_a, test_lab = test[0], test[1], test[2]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.n_words = len(ixtoword)

    del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    if opt.part_data:
        # Fixed seed so the same subset is drawn on every run.
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q), int(len(train_q) * opt.portion), replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' + str(params[0].shape) + ' opt: ' + str(
                (opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        # One (token ids, mask) placeholder pair per sentence of the pair.
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        accuracy_, loss_, train_op_, W_emb_ = auto_encoder(x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()  # not used further in this function

    def evaluate(sess, sents_q, sents_a, labels, n_samples):
        """Return the accuracy over minibatches of the given sentence pairs.

        Batches come from ``get_minibatches_idx(n_samples, opt.batch_size,
        shuffle=True)`` and are run with ``keep_prob`` = 1.0 (no dropout).
        The per-batch accuracy is weighted by the batch length and the sum is
        divided by the number of samples that were actually scored, so the
        result stays correct if the batching helper yields fewer than
        ``n_samples`` indices (the placeholders have a fixed batch dimension,
        so presumably a trailing partial batch is dropped -- TODO confirm).
        Returns 0.0 when no batch is produced.
        """
        correct = 0.0
        seen = 0
        for _, index in get_minibatches_idx(n_samples, opt.batch_size, shuffle=True):
            batch_q = [sents_q[t] for t in index]
            batch_a = [sents_a[t] for t in index]
            batch_lab = np.array([labels[t] for t in index])
            batch_lab = batch_lab.reshape((len(batch_lab), opt.category))
            x_b_1, x_m_1 = prepare_data_for_emb(batch_q, opt)
            x_b_2, x_m_2 = prepare_data_for_emb(batch_a, opt)

            batch_accuracy = sess.run(accuracy_, feed_dict={x_1_: x_b_1, x_2_: x_b_2,
                                                            x_mask_1_: x_m_1, x_mask_2_: x_m_2,
                                                            y_: batch_lab, keep_prob: 1.0})
            correct += batch_accuracy * len(index)
            seen += len(index)
        return correct / seen if seen else 0.0

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    # Training accuracy is estimated on at most 3070 samples; the cap keeps
    # the indices valid when opt.part_data leaves fewer training samples.
    train_eval_size = min(3070, len(train_q))
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()  # not used further in this function

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)

                # Variables present both in the graph and in the checkpoint.
                ss = set([var.name for var in t_vars]) & set([s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}

                # only restore variables with correct shape
                ss_right_shape = set([s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[var for var in t_vars if var.name in ss_right_shape])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report the set that was really restored, not the unfiltered one.
                print("Loaded variables:" + str(ss_right_shape))

            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # propagate while any restore failure falls back to a fresh
                # initialisation.
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train_q), opt.batch_size, shuffle=True)
                for _, train_index in kf:

                    uidx += 1
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels = [train_lab[t] for t in train_index]
                    x_labels = np.array(x_labels)
                    x_labels = x_labels.reshape((len(x_labels), opt.category))

                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(sents_2, opt)

                    _, loss = sess.run([train_op_, loss_], feed_dict={x_1_: x_batch_1, x_2_: x_batch_2,
                                       x_mask_1_: x_batch_mask_1, x_mask_2_: x_batch_mask_2, y_: x_labels, keep_prob: opt.dropout_ratio})

                    if uidx % opt.valid_freq == 0:
                        train_accuracy = evaluate(sess, train_q, train_a, train_lab, train_eval_size)

                        # print("Iteration %d: Training loss %f, dis loss %f, rec loss %f" % (uidx,
                        #                                                                     loss, dis_loss, rec_loss))
                        print("Train accuracy %f " % train_accuracy)

                        val_accuracy = evaluate(sess, val_q, val_a, val_lab, len(val_q))
                        print("Validation accuracy %f " % val_accuracy)

                        # The test set is only scored on a new validation
                        # maximum, so max_test_accuracy is the test accuracy
                        # at the best validation point.
                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy

                            test_accuracy = evaluate(sess, test_q, test_a, test_lab, len(test_q))
                            print("Test accuracy %f " % test_accuracy)

                            max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #9
0
def main():
    """Train and evaluate the sentence-pair classifier on the SNLI data.

    Steps, in order:

    1. Load the pickled splits from ``./data/snli.p`` and convert the label
       lists to ``float32`` arrays.
    2. Optionally keep only a random ``opt.portion`` of the training set
       (``opt.part_data``).
    3. Try to load a saved embedding matrix from ``./data/snli_emb.p``; on a
       shape mismatch or a missing file ``opt.fix_emb`` is switched off.
    4. Build the graph through ``auto_encoder`` and run ``opt.max_epochs``
       epochs.  Every ``opt.valid_freq`` updates the accuracy is measured on
       a slice of the training set and on the validation set; the test set is
       scored only when the validation accuracy reaches a new maximum.

    Ctrl-C during training is caught and the best test accuracy so far is
    printed.
    """
    loadpath = "./data/snli.p"
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # you trust.  The context manager closes the handle deterministically.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)

    train, val, test = x[0], x[1], x[2]
    wordtoix, ixtoword = x[4], x[5]

    # Each split is indexed as (first sentences, second sentences, labels).
    train_q, train_a, train_lab = train[0], train[1], train[2]
    val_q, val_a, val_lab = val[0], val[1], val[2]
    test_q, test_a, test_lab = test[0], test[1], test[2]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.n_words = len(ixtoword)

    del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # When part_data is set, only a random subset of the training set is
    # kept; opt.portion is the fraction of samples retained.  The seed is
    # fixed so the same subset is drawn on every run.
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Use the saved embedding only if its shape matches the vocabulary size
    # and the configured embedding size.
    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params[0].shape) + ' opt: ' +
                  str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        # The input is a pair of sentences, so the token-id and mask
        # placeholders are defined in pairs.
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        # auto_encoder builds the model and hands back the tensors/ops that
        # are later passed to sess.run.
        accuracy_, loss_, train_op_, W_emb_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()  # not used further in this function

    def evaluate(sess, sents_q, sents_a, labels, n_samples):
        """Return the accuracy over minibatches of the given sentence pairs.

        Batches come from ``get_minibatches_idx(n_samples, opt.batch_size,
        shuffle=True)`` and are run with ``keep_prob`` = 1.0 (no dropout).
        The per-batch accuracy is weighted by the batch length and the sum is
        divided by the number of samples that were actually scored, so the
        result stays correct if the batching helper yields fewer than
        ``n_samples`` indices (the placeholders have a fixed batch dimension,
        so presumably a trailing partial batch is dropped -- TODO confirm).
        Returns 0.0 when no batch is produced.
        """
        correct = 0.0
        seen = 0
        for _, index in get_minibatches_idx(n_samples,
                                            opt.batch_size,
                                            shuffle=True):
            batch_q = [sents_q[t] for t in index]
            batch_a = [sents_a[t] for t in index]
            batch_lab = np.array([labels[t] for t in index])
            batch_lab = batch_lab.reshape((len(batch_lab), opt.category))
            x_b_1, x_m_1 = prepare_data_for_emb(batch_q, opt)
            x_b_2, x_m_2 = prepare_data_for_emb(batch_a, opt)

            batch_accuracy = sess.run(accuracy_,
                                      feed_dict={
                                          x_1_: x_b_1,
                                          x_2_: x_b_2,
                                          x_mask_1_: x_m_1,
                                          x_mask_2_: x_m_2,
                                          y_: batch_lab,
                                          keep_prob: 1.0
                                      })
            correct += batch_accuracy * len(index)
            seen += len(index)
        return correct / seen if seen else 0.0

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    # Training accuracy is estimated on at most 3070 samples; the cap keeps
    # the indices valid when opt.part_data leaves fewer training samples.
    train_eval_size = min(3070, len(train_q))
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()  # not used further in this function

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:  # start from previously saved parameters
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)

                # Variables present both in the graph and in the checkpoint.
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}

                # only restore variables with correct shape
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                # Report the set that was really restored, not the unfiltered one.
                print("Loaded variables:" + str(ss_right_shape))

            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # propagate while any restore failure falls back to a fresh
                # initialisation.
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                # Shuffled minibatch index lists over the training set.
                kf = get_minibatches_idx(len(train_q),
                                         opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:

                    uidx += 1
                    # Pull the samples selected by this minibatch's indices.
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels = [train_lab[t] for t in train_index]
                    x_labels = np.array(x_labels)
                    print("x_labels:", x_labels.shape)
                    # NOTE(review): presumably the labels are already one-hot
                    # rows of width opt.category -- confirm against the data.
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.category))

                    # NOTE(review): prepare_data_for_emb presumably turns the
                    # sentences into a fixed-size id matrix plus a mask --
                    # confirm against its definition.
                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(
                        sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(
                        sents_2, opt)

                    _, loss = sess.run(
                        [train_op_, loss_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            keep_prob: opt.dropout_ratio
                        })

                    # Every valid_freq minibatches, measure accuracy on the
                    # training slice and the validation set, and refresh the
                    # best test accuracy when validation improves.
                    if uidx % opt.valid_freq == 0:
                        train_accuracy = evaluate(sess, train_q, train_a,
                                                  train_lab, train_eval_size)

                        # print("Iteration %d: Training loss %f, dis loss %f, rec loss %f" % (uidx,
                        #                                                                     loss, dis_loss, rec_loss))
                        print("Train accuracy %f " % train_accuracy)

                        val_accuracy = evaluate(sess, val_q, val_a, val_lab,
                                                len(val_q))
                        print("Validation accuracy %f " % val_accuracy)

                        # The test set is only scored on a new validation
                        # maximum, so max_test_accuracy is the test accuracy
                        # at the best validation point.
                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy

                            test_accuracy = evaluate(sess, test_q, test_a,
                                                     test_lab, len(test_q))
                            print("Test accuracy %f " % test_accuracy)

                            max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #10
0
def main():
    # Prepare training and testing data
    opt = Options()
    main_Path = '/home/dell/桌面/GG/TDD/keyword/Our_method/dataset/'
    # load data
    if opt.dataset == 'yahoo':
        loadpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yahoo.p"
        embpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yahoo_glove.p"
        load_G_path = '/home/dell/PycharmProjects/NLP/Idea-1/Results/yahoo/08/cnn/yahoo_G.pickle'
        opt.num_class = 10
        opt.class_name = ['Society Culture',
                          'Science Mathematics',
                          'Health',
                          'Education Reference',
                          'Computers Internet',
                          'Sports',
                          'Business Finance',
                          'Entertainment Music',
                          'Family Relationships',
                          'Politics Government']
    elif opt.dataset == 'agnews':
        loadpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/ag_news.pickle"
        embpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/ag_news_glove.pickle"
        load_G_path = '/home/dell/PycharmProjects/NLP/Idea-1/Results/ag_news/08/cnn/ag_news_G.pickle'
        opt.num_class = 4
        opt.class_name = ['World',
                          'Sports',
                          'Business',
                          'Science']
    elif opt.dataset == 'dbpedia':
        loadpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/dbpedia.pickle"
        embpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/dbpedia_glove.pickle"
        load_G_path = '/home/dell/PycharmProjects/NLP/Idea-1/Results/dbpedia/08/cnn/dbpedia_G.pickle'
        opt.num_class = 14
        opt.class_name = ['Company',
                          'Educational Institution',
                          'Artist',
                          'Athlete',
                          'Office Holder',
                          'Mean Of Transportation',
                          'Building',
                          'Natural Place',
                          'Village',
                          'Animal',
                          'Plant',
                          'Album',
                          'Film',
                          'Written Work',
                          ]
    elif opt.dataset == 'yelp_full':
        loadpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp_full.pickle"
        embpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp_full_glove.pickle"
        load_G_path = '/home/dell/PycharmProjects/NLP/Idea-1/Results/yelp_full/08/cnn/yelp_full_G.pickle'
        opt.num_class = 5
        opt.class_name = ['worst',
                          'bad',
                          'middle',
                          'good',
                          'best']
    elif opt.dataset == 'yelp':
        loadpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp.pickle"
        embpath = "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp_glove.pickle"
        load_G_path = '/home/dell/PycharmProjects/NLP/Idea-1/Results/yelp/08/cnn/yelp_G.pickle'
        opt.num_class = 2
        opt.class_name = ['bad',
                          'good']
    x = pickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2] #将单词由数字表示,已做了分词工作,且句子长度尚未统一
    train_lab, val_lab, test_lab = x[3], x[4], x[5]#label 采用one-hot编码形式表示
    wordtoix, ixtoword = x[6], x[7]

    #加载权重G
    G_train, G_val, G_test = pickle.load(open(load_G_path, "rb"))

    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')    
    opt.n_words = len(ixtoword)
    if opt.part_data:
        #np.random.seed(123)
        train_ind = np.random.choice(len(train), int(len(train)*opt.portion), replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]
    
    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        opt.W_emb = np.array(pickle.load(open(embpath, 'rb')),dtype='float32')
        opt.W_class_emb = load_class_embedding( wordtoix, opt)
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen],name='x_')
        x_mask_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen],name='x_mask_')
        keep_prob = tf.placeholder(tf.float32,name='keep_prob')
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.num_class],name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        G_our = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen, opt.num_class], name='G_our')
        seq_len = tf.placeholder(tf.int32, shape=[opt.batch_size], name='sque_sentence_num')
        accuracy_, loss_, train_op, W_norm_, global_step = emb_classifier(x_, x_mask_, y_, keep_prob, opt, class_penalty_, G_our, seq_len)
    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    val_acc = 0.

    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True, )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set([s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set([s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[var for var in t_vars if var.name in ss_right_shape])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))

            except:
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    G1 = [G_train[t] for t in train_index]
                    x_labels = [train_lab[t] for t in train_index]
                    x_labels = np.array(x_labels)
                    x_labels = x_labels.reshape((len(x_labels), opt.num_class))
                    x_batch, x_batch_mask, G_batch, seq_len_batch = prepare_data_for_emb(sents, G1, opt)

                    _, loss, step,  = sess.run([train_op, loss_, global_step], feed_dict={x_: x_batch, x_mask_: x_batch_mask, y_: x_labels, keep_prob: opt.dropout, class_penalty_:opt.class_penalty, G_our:G_batch, seq_len:seq_len_batch})

                    if uidx % opt.valid_freq == 0:
                        train_correct = 0.0
                        # sample evaluate accuaccy on 500 sample data
                        kf_train = get_minibatches_idx(500, opt.batch_size, shuffle=True)
                        for _, train_index in kf_train:
                            train_sents = [train[t] for t in train_index]
                            train_G = [G_train[t] for t in train_index]
                            train_labels = [train_lab[t] for t in train_index]
                            train_labels = np.array(train_labels)
                            train_labels = train_labels.reshape((len(train_labels), opt.num_class))
                            x_train_batch, x_train_batch_mask, G_train_batch, x_train_seq_len = prepare_data_for_emb(train_sents, train_G, opt)
                            train_accuracy = sess.run(accuracy_, feed_dict={x_: x_train_batch, x_mask_: x_train_batch_mask, y_: train_labels, keep_prob: 1.0, class_penalty_:0.0, G_our:G_train_batch, seq_len:x_train_seq_len})

                            train_correct += train_accuracy * len(train_index)

                        train_accuracy = train_correct / 500

                        print("Iteration %d: Training loss %f " % (uidx, loss))
                        print("Train accuracy %f " % train_accuracy)

                        if not os.path.exists(opt.dataset + '_Train_message.csv'):
                            with open(opt.dataset + '_Train_message.csv', 'a', newline='') as out:
                                # 设定写入模式
                                csv_write = csv.writer(out, dialect='excel')
                                # Write the header row, then the first data row
                                csv_write.writerow(["epoch", "Training loss", "Train accuracy"])
                                csv_write.writerow([epoch, loss, train_accuracy])
                        else:
                            with open(opt.dataset + '_Train_message.csv', 'a', newline='') as out:
                                # Create a CSV writer using the Excel dialect
                                csv_write = csv.writer(out, dialect='excel')
                                csv_write.writerow([epoch, loss, train_accuracy])


                        val_correct = 0.0
                        kf_val = get_minibatches_idx(len(val), opt.batch_size, shuffle=True)
                        for _, val_index in kf_val:
                            val_sents = [val[t] for t in val_index]
                            val_Gs = [G_val[t] for t in val_index]
                            val_labels = [val_lab[t] for t in val_index]
                            val_labels = np.array(val_labels)
                            val_labels = val_labels.reshape((len(val_labels), opt.num_class))
                            x_val_batch, x_val_batch_mask, G_val_batch, x_val_seq_len = prepare_data_for_emb(val_sents, val_Gs, opt)
                            val_accuracy = sess.run(accuracy_, feed_dict={x_: x_val_batch, x_mask_: x_val_batch_mask, y_: val_labels, keep_prob: 1.0, class_penalty_:0.0, G_our:G_val_batch, seq_len:x_val_seq_len})

                            val_correct += val_accuracy * len(val_index)

                        val_accuracy = val_correct / len(val)
                        print("Validation accuracy %f " % val_accuracy)

                        # Evaluate the network on the test set
                        test_correct = 0.0
                        kf_test = get_minibatches_idx(len(test), opt.batch_size, shuffle=True)
                        for _, test_index in kf_test:
                            test_sents = [test[t] for t in test_index]
                            test_Gs = [G_test[t] for t in test_index]
                            test_labels = [test_lab[t] for t in test_index]
                            test_labels = np.array(test_labels)
                            test_labels = test_labels.reshape((len(test_labels), opt.num_class))
                            x_test_batch, x_test_batch_mask, G_test_batch, x_test_seq_len = prepare_data_for_emb(test_sents, test_Gs,
                                                                                                 opt)

                            test_accuracy = sess.run(accuracy_, feed_dict={x_: x_test_batch, x_mask_: x_test_batch_mask, y_: test_labels, keep_prob: 1.0, class_penalty_: 0.0, G_our: G_test_batch, seq_len:x_test_seq_len})

                            test_correct += test_accuracy * len(test_index)
                        test_accuracy = test_correct / len(test)
                        print("Test accuracy %f " % test_accuracy)
                        # max_test_accuracy = test_accuracy
                        if test_accuracy > max_test_accuracy:
                            max_test_accuracy = test_accuracy
                            val_acc = val_accuracy
#                        max_test_accuracy = max(test_accuracy, max_test_accuracy)
#                        val_acc = val_accuracy

                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy
                            test_acc = test_accuracy

                        if not os.path.exists(opt.dataset + '_Classification_Results.csv'):
                            with open(opt.dataset + '_Classification_Results.csv', 'a', newline='') as out:
                                # Create a CSV writer using the Excel dialect
                                csv_write = csv.writer(out, dialect='excel')
                                # Write the header row, then the first data row
                                csv_write.writerow(["epoch", "val_accuracy", "test_accuracy"])
                                csv_write.writerow([epoch, val_accuracy, test_accuracy])
                        else:
                            with open(opt.dataset + '_Classification_Results.csv', 'a', newline='') as out:
                                # Create a CSV writer using the Excel dialect
                                csv_write = csv.writer(out, dialect='excel')
                                csv_write.writerow([epoch, val_accuracy, test_accuracy])

                print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))
                saver.save(sess, opt.save_path, global_step=epoch)
                
            print("Max Test accuracy %f , val accuracy %f " % (max_test_accuracy, val_acc))
            print("Max val accuracy %f , test accuracy %f" % (max_val_accuracy, test_acc))
            with open(opt.dataset + '_Classification_Results.csv', 'a', newline='') as out:
                # Create a CSV writer using the Excel dialect
                csv_write = csv.writer(out, dialect='excel')
                csv_write.writerow(['Max Test accuracy:', max_test_accuracy, 'val accuracy', val_acc])
                csv_write.writerow(['Max val accuracy:', max_val_accuracy, 'test accuracy', test_acc])
        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
            with open(opt.dataset + '_Classification_Results.csv', 'a', newline='') as out:
                # Create a CSV writer using the Excel dialect
                csv_write = csv.writer(out, dialect='excel')
                csv_write.writerow(['Max Test accuracy:', max_test_accuracy, 'val accuracy', val_acc])
                csv_write.writerow(['Max val accuracy:', max_val_accuracy, 'test accuracy', test_acc])