# NOTE: this snippet assumes the enclosing project defines DATA_PATH and
# REP_PATH plus the datasets, data_utils, and MajorityClusterBaseline
# helpers; LogisticRegression comes from scikit-learn.
def load_setup(model_type, trn_name, dev_name):
    print("Loading data")
    trn_data = datasets.StanceDataBoW(
        DATA_PATH + trn_name,
        text_vocab_file=REP_PATH + 'text_vocab_top10000.txt',
        topic_vocab_file=REP_PATH + 'topic_vocab.txt')

    trn_datasampler = data_utils.DataSampler(trn_data,
                                             batch_size=len(trn_data))

    dev_data = datasets.StanceDataBoW(
        DATA_PATH + dev_name,
        text_vocab_file=REP_PATH + 'text_vocab_top10000.txt',
        topic_vocab_file=REP_PATH + 'topic_vocab.txt')

    dev_datasampler = data_utils.DataSampler(dev_data,
                                             batch_size=len(dev_data))

    print("Initializing model")
    #########
    # MODEL #
    #########
    if model_type == 'bowv':
        model = LogisticRegression(solver='lbfgs',
                                   class_weight='balanced',
                                   multi_class='multinomial',
                                   max_iter=600)
    elif model_type == 'cmaj':
        model = MajorityClusterBaseline(trn_data)
    else:
        raise ValueError('Unknown model_type: {}'.format(model_type))

    return model, trn_datasampler, dev_datasampler
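
# Usage sketch (hypothetical file names): assuming train/dev CSVs live under
# DATA_PATH, the setup above is used roughly as:
#
#   model, trn_sampler, dev_sampler = load_setup('bowv', 'train.csv', 'dev.csv')
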
def main(args):
    """ parameters """
    RESULTS_DIR = args.results_path
    fname = open(RESULTS_DIR + '/PRR_mse.csv', 'ab')  # per-epoch MSE log, appended in binary mode

    # network architecture
    ADD_NOISE = args.add_noise

    n_hidden = args.n_hidden
    dim_img = IMAGE_SIZE_MNIST  # number of pixels for a MNIST image
    dim_z = args.dim_z

    # train
    n_epochs = args.num_epochs
    batch_size = args.batch_size
    learn_rate = args.learn_rate

    # Plot
    PRR = args.PRR  # Plot Reproduce Result
    PRR_n_img_x = args.PRR_n_img_x  # number of images along x-axis in a canvas
    PRR_resize_factor = args.PRR_resize_factor  # resize factor for each image in a canvas

    PMLR = args.PMLR  # Plot Manifold Learning Result
    PMLR_n_img_x = args.PMLR_n_img_x  # number of images along x-axis in a canvas
    PMLR_resize_factor = args.PMLR_resize_factor  # resize factor for each image in a canvas
    PMLR_z_range = args.PMLR_z_range  # range for random latent vector
    PMLR_n_samples = args.PMLR_n_samples  # number of labeled samples to plot a map from input data space to the latent space
    """ prepare MNIST data """

    #train_total_data, train_size, _, _, test_data, test_labels = mnist_data.prepare_MNIST_data()
    #n_samples = train_size

    # project-specific sampler standing in for the commented-out MNIST loader;
    # it exposes the xtrain/xtest/ytest arrays and n_samples used below
    xs = data_utils.DataSampler()
    train_total_data = xs.xtrain
    train_size = xs.n_samples
    n_samples = train_size
    test_data = xs.xtest
    test_labels = xs.ytest
    """ build graph """

    # input placeholders
    # In denoising-autoencoder, x_hat == x + noise, otherwise x_hat == x
    x_hat = tf.placeholder(tf.float32, shape=[None, dim_img], name='input_img')
    x = tf.placeholder(tf.float32, shape=[None, dim_img], name='target_img')

    # dropout
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
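    # keep_prob is fed as 1.0 in every sess.run below, so dropout is
    # effectively disabled in this script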

    # input for PMLR
    z_in = tf.placeholder(tf.float32,
                          shape=[None, dim_z],
                          name='latent_variable')

    # network architecture
    y, z, loss, neg_marginal_likelihood, KL_divergence = vae.autoencoder(
        x_hat, x, dim_img, dim_z, n_hidden, keep_prob)
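    # y: reconstruction of x, z: latent code; the returned loss is the VAE
    # objective, with its reconstruction and KL terms exposed separately for logging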

    # optimization
    train_op = tf.train.AdamOptimizer(learn_rate).minimize(loss)
    """ training """

    # Plot for reproduce performance
    if PRR:
        PRR = plot_utils.Plot_Reproduce_Performance(RESULTS_DIR, PRR_n_img_x,
                                                    IMAGE_SIZE_MNIST,
                                                    PRR_resize_factor)

        x_PRR = test_data[0:PRR.n_tot_imgs, :]

        #x_PRR_img = x_PRR.reshape(PRR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
        PRR.save_samples(x_PRR, name='input.csv')

        if ADD_NOISE:
            x_PRR = x_PRR * np.random.randint(2, size=x_PRR.shape)
            x_PRR += np.random.randint(2, size=x_PRR.shape)
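            # mask-then-add salt & pepper noise; note the summed pixels can
            # exceed 1 when a salt hit lands on an unmasked pixel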

            x_PRR_img = x_PRR.reshape(PRR.n_tot_imgs, IMAGE_SIZE_MNIST,
                                      IMAGE_SIZE_MNIST)
            PRR.save_images(x_PRR_img, name='input_noise.csv')

    # Plot for manifold learning result
    if PMLR and dim_z == 2:

        PMLR = plot_utils.Plot_Manifold_Learning_Result(
            RESULTS_DIR, PMLR_n_img_x, IMAGE_SIZE_MNIST, PMLR_resize_factor,
            PMLR_z_range)

        x_PMLR = test_data[0:PMLR_n_samples, :]
        id_PMLR = test_labels[0:PMLR_n_samples, :]

        if ADD_NOISE:
            x_PMLR = x_PMLR * np.random.randint(2, size=x_PMLR.shape)
            x_PMLR += np.random.randint(2, size=x_PMLR.shape)

        decoded = vae.decoder(z_in, dim_img, n_hidden)
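        # decoder reapplied to the z_in placeholder so the latent grid can be
        # mapped back to image space during training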

    # train
    total_batch = int(n_samples / batch_size)
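    # integer division: any final partial batch is dropped each epoch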
    min_tot_loss = 1e99

    saver = tf.train.Saver()

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer(), feed_dict={keep_prob: 1.0})

        for epoch in range(n_epochs):

            # Random shuffling
            np.random.shuffle(train_total_data)
            #train_data_ = train_total_data[:, :-mnist_data.NUM_LABELS]
            train_data_ = train_total_data

            # Loop over all batches
            for i in range(total_batch):
                # Compute the offset of the current minibatch in the data.
                offset = (i * batch_size) % (n_samples)
                batch_xs_input = train_data_[offset:(offset + batch_size), :]

                batch_xs_target = batch_xs_input

                # add salt & pepper noise
                if ADD_NOISE:
                    batch_xs_input = batch_xs_input * np.random.randint(
                        2, size=batch_xs_input.shape)
                    batch_xs_input += np.random.randint(
                        2, size=batch_xs_input.shape)

                _, tot_loss, loss_likelihood, loss_divergence = sess.run(
                    (train_op, loss, neg_marginal_likelihood, KL_divergence),
                    feed_dict={
                        x_hat: batch_xs_input,
                        x: batch_xs_target,
                        keep_prob: 1.0
                    })

            # print cost every epoch
            print(
                "epoch %d: L_tot %03.2f Neg L_likelihood %03.2f KL_divergence %03.2f"
                % (epoch, tot_loss, loss_likelihood, loss_divergence))
            saver.save(sess, RESULTS_DIR + '/reg_vae', global_step=epoch)

            # if minimum loss is updated or final epoch, plot results
            if min_tot_loss > tot_loss or epoch + 1 == n_epochs:
                min_tot_loss = tot_loss
                # Plot for reproduce performance
                if PRR:
                    y_PRR = sess.run(y, feed_dict={x_hat: x_PRR, keep_prob: 1})
                    #y_PRR_img = y_PRR.reshape(PRR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
                    PRR.save_samples(y_PRR,
                                     name="/PRR_epoch_%02d.csv" % epoch)
                    mse = np.mean(np.linalg.norm(x_PRR - y_PRR, 2, 1)**2)
                    PRR.save_mse(np.asarray([[mse, loss_likelihood]]), fname)

                # Plot for manifold learning result
                if PMLR and dim_z == 2:
                    y_PMLR = sess.run(decoded,
                                      feed_dict={
                                          z_in: PMLR.z,
                                          keep_prob: 1
                                      })
                    #y_PMLR_img = y_PMLR.reshape(PMLR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
                    PMLR.save_samples(y_PMLR,
                                      name="/PMLR_epoch_%02d.csv" % epoch)

                    # plot distribution of labeled images
                    z_PMLR = sess.run(z,
                                      feed_dict={
                                          x_hat: x_PMLR,
                                          keep_prob: 1
                                      })
                    PMLR.save_scattered_image(z_PMLR,
                                              id_PMLR,
                                              name="/PMLR_map_epoch_%02d.jpg" % epoch)

    fname.close()
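
# Minimal entry-point sketch (hypothetical flag defaults; the real script
# parses many more options than shown):
#
#   if __name__ == '__main__':
#       parser = argparse.ArgumentParser()
#       parser.add_argument('--results_path', default='results')
#       parser.add_argument('--num_epochs', type=int, default=20)
#       main(parser.parse_args())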

#############
# EXAMPLE 3 #
#############

    # (opening condition reconstructed to mirror the dev-data branch below)
    if 'bert' not in config and 'bert' not in config['name']:
        vecs = data_utils.load_vectors('../resources/{}.vectors.npy'.format(vec_name),
                                       dim=vec_dim, seed=SEED)
        vocab_name = '../resources/{}.vocab.pkl'.format(vec_name)
        data = datasets.StanceData(args['trn_data'], vocab_name, pad_val=len(vecs) - 1,
                                   max_tok_len=int(config.get('max_tok_len', '200')),
                                   max_sen_len=int(config.get('max_sen_len', '10')),
                                   keep_sen=('keep_sen' in config),
                                   **trn_data_kwargs)
    else:
        data = datasets.StanceData(args['trn_data'], None, max_tok_len=config['max_tok_len'],
                                   max_top_len=config['max_top_len'], is_bert=True,
                                   add_special_tokens=(config.get('together_in', '0') == '0'),
                                   **trn_data_kwargs)

    dataloader = data_utils.DataSampler(data, batch_size=int(config['b']))

    if 'bert' not in config and 'bert' not in config['name']:
        dev_data = datasets.StanceData(args['dev_data'], vocab_name, pad_val=len(vecs) - 1,
                                       max_tok_len=int(config.get('max_tok_len', '200')),
                                       max_sen_len=int(config.get('max_sen_len', '10')),
                                       keep_sen=('keep_sen' in config),
                                       **dev_data_kwargs)
    else:
        dev_data = datasets.StanceData(args['dev_data'], None, max_tok_len=config['max_tok_len'],
                                       max_top_len=config['max_top_len'], is_bert=True,
                                       add_special_tokens=(config.get('together_in', '0') == '0'),
                                       **dev_data_kwargs)

    dev_dataloader = data_utils.DataSampler(dev_data, batch_size=int(config['b']), shuffle=False)
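
# (separate snippet from the same example set: a search script that parses
# its own arguments and loads BERT-style stance data with fixed settings)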
    parser.add_argument('-r',
                        '--num_trials',
                        help='Number of trials for search')
    args = vars(parser.parse_args())

    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True

    data = datasets.StanceData(args['trn_data'],
                               None,
                               max_tok_len=200,
                               max_top_len=5,
                               is_bert=True,
                               add_special_tokens=True)
    dataloader = data_utils.DataSampler(data, batch_size=64, shuffle=False)

    dev_data = datasets.StanceData(args['dev_data'],
                                   None,
                                   max_tok_len=200,
                                   max_top_len=5,
                                   is_bert=True,
                                   add_special_tokens=True)
    dev_dataloader = data_utils.DataSampler(dev_data,
                                            batch_size=64,
                                            shuffle=False)

    if args['test_data'] is not None:
        test_data = datasets.StanceData(args['test_data'],
                                        None,
                                        max_tok_len=200,