def load_setup(model_type, trn_name, dev_name):
    print("Loading data")
    trn_data = datasets.StanceDataBoW(DATA_PATH + trn_name,
                                      text_vocab_file=REP_PATH + 'text_vocab_top10000.txt',
                                      topic_vocab_file=REP_PATH + 'topic_vocab.txt')
    trn_datasampler = data_utils.DataSampler(trn_data, batch_size=len(trn_data))

    dev_data = datasets.StanceDataBoW(DATA_PATH + dev_name,
                                      text_vocab_file=REP_PATH + 'text_vocab_top10000.txt',
                                      topic_vocab_file=REP_PATH + 'topic_vocab.txt')
    dev_datasampler = data_utils.DataSampler(dev_data, batch_size=len(dev_data))

    print("Initializing model")
    #########
    # MODEL #
    #########
    if model_type == 'bowv':
        model = LogisticRegression(solver='lbfgs', class_weight='balanced',
                                   multi_class='multinomial', max_iter=600)
    elif model_type == 'cmaj':
        model = MajorityClusterBaseline(trn_data)
    else:
        # guard against an unbound `model` on unknown types
        raise ValueError('Unknown model_type: {}'.format(model_type))
    return model, trn_datasampler, dev_datasampler
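# A minimal usage sketch for load_setup(), under stated assumptions: that
# data_utils.DataSampler is iterable and yields (features, labels) pairs
# (batch_size=len(data) above implies a single full-dataset batch), and that
# 'trn.csv' / 'dev.csv' are placeholder file names. The fit()/score() calls
# are standard scikit-learn API and apply only to the 'bowv' baseline.
def _example_run():
    model, trn_sampler, dev_sampler = load_setup('bowv', 'trn.csv', 'dev.csv')
    X_trn, y_trn = next(iter(trn_sampler))   # one full-size training batch
    model.fit(X_trn, y_trn)                  # train the BoW logistic-regression baseline
    X_dev, y_dev = next(iter(dev_sampler))
    print('dev accuracy:', model.score(X_dev, y_dev))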
def main(args):
    """ parameters """
    RESULTS_DIR = args.results_path
    fname = open(RESULTS_DIR + '/PRR_mse.csv', 'ab')

    # network architecture
    ADD_NOISE = args.add_noise
    n_hidden = args.n_hidden
    dim_img = IMAGE_SIZE_MNIST  # number of pixels for a MNIST image
    dim_z = args.dim_z

    # train
    n_epochs = args.num_epochs
    batch_size = args.batch_size
    learn_rate = args.learn_rate

    # Plot
    PRR = args.PRR                                # Plot Reproduce Result
    PRR_n_img_x = args.PRR_n_img_x                # number of images along x-axis in a canvas
    PRR_resize_factor = args.PRR_resize_factor    # resize factor for each image in a canvas
    PMLR = args.PMLR                              # Plot Manifold Learning Result
    PMLR_n_img_x = args.PMLR_n_img_x              # number of images along x-axis in a canvas
    PMLR_resize_factor = args.PMLR_resize_factor  # resize factor for each image in a canvas
    PMLR_z_range = args.PMLR_z_range              # range for random latent vector
    PMLR_n_samples = args.PMLR_n_samples          # number of labeled samples to plot a map from input data space to the latent space

    """ prepare MNIST data """
    #train_total_data, train_size, _, _, test_data, test_labels = mnist_data.prepare_MNIST_data()
    #n_samples = train_size
    xs = data_utils.DataSampler()
    train_total_data = xs.xtrain
    train_size = xs.n_samples
    n_samples = train_size
    test_data = xs.xtest
    test_labels = xs.ytest

    """ build graph """
    # input placeholders
    # In denoising-autoencoder, x_hat == x + noise, otherwise x_hat == x
    x_hat = tf.placeholder(tf.float32, shape=[None, dim_img], name='input_img')
    x = tf.placeholder(tf.float32, shape=[None, dim_img], name='target_img')

    # dropout
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # input for PMLR
    z_in = tf.placeholder(tf.float32, shape=[None, dim_z], name='latent_variable')

    # network architecture
    y, z, loss, neg_marginal_likelihood, KL_divergence = vae.autoencoder(
        x_hat, x, dim_img, dim_z, n_hidden, keep_prob)

    # optimization
    train_op = tf.train.AdamOptimizer(learn_rate).minimize(loss)

    """ training """
    # Plot for reproduce performance
    if PRR:
        PRR = plot_utils.Plot_Reproduce_Performance(RESULTS_DIR, PRR_n_img_x,
                                                    IMAGE_SIZE_MNIST, PRR_resize_factor)
        x_PRR = test_data[0:PRR.n_tot_imgs, :]
        #x_PRR_img = x_PRR.reshape(PRR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
        PRR.save_samples(x_PRR, name='input.csv')

        if ADD_NOISE:
            x_PRR = x_PRR * np.random.randint(2, size=x_PRR.shape)
            x_PRR += np.random.randint(2, size=x_PRR.shape)
            x_PRR_img = x_PRR.reshape(PRR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
            PRR.save_images(x_PRR_img, name='input_noise.csv')

    # Plot for manifold learning result
    if PMLR and dim_z == 2:
        PMLR = plot_utils.Plot_Manifold_Learning_Result(
            RESULTS_DIR, PMLR_n_img_x, IMAGE_SIZE_MNIST, PMLR_resize_factor, PMLR_z_range)
        x_PMLR = test_data[0:PMLR_n_samples, :]
        id_PMLR = test_labels[0:PMLR_n_samples, :]

        if ADD_NOISE:
            x_PMLR = x_PMLR * np.random.randint(2, size=x_PMLR.shape)
            x_PMLR += np.random.randint(2, size=x_PMLR.shape)

        decoded = vae.decoder(z_in, dim_img, n_hidden)

    # train
    total_batch = int(n_samples / batch_size)
    min_tot_loss = 1e99
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer(), feed_dict={keep_prob: 1.0})

        for epoch in range(n_epochs):
            # Random shuffling
            np.random.shuffle(train_total_data)
            #train_data_ = train_total_data[:, :-mnist_data.NUM_LABELS]
            train_data_ = train_total_data

            # Loop over all batches
            for i in range(total_batch):
                # Compute the offset of the current minibatch in the data.
                offset = (i * batch_size) % (n_samples)
                batch_xs_input = train_data_[offset:(offset + batch_size), :]
                batch_xs_target = batch_xs_input

                # add salt & pepper noise
                if ADD_NOISE:
                    batch_xs_input = batch_xs_input * np.random.randint(2, size=batch_xs_input.shape)
                    batch_xs_input += np.random.randint(2, size=batch_xs_input.shape)

                _, tot_loss, loss_likelihood, loss_divergence = sess.run(
                    (train_op, loss, neg_marginal_likelihood, KL_divergence),
                    feed_dict={x_hat: batch_xs_input, x: batch_xs_target, keep_prob: 1.0})

            # print cost every epoch
            print("epoch %d: L_tot %03.2f Neg L_likelihood %03.2f KL_divergence %03.2f"
                  % (epoch, tot_loss, loss_likelihood, loss_divergence))
            saver.save(sess, RESULTS_DIR + '/reg_vae', global_step=epoch)

            # if minimum loss is updated or final epoch, plot results
            if min_tot_loss > tot_loss or epoch + 1 == n_epochs:
                min_tot_loss = tot_loss

                # Plot for reproduce performance
                if PRR:
                    y_PRR = sess.run(y, feed_dict={x_hat: x_PRR, keep_prob: 1})
                    #y_PRR_img = y_PRR.reshape(PRR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
                    PRR.save_samples(y_PRR, name="/PRR_epoch_%02d" % (epoch) + ".csv")
                    PRR.save_mse(
                        np.reshape(np.asarray([
                            np.mean(np.linalg.norm(x_PRR - y_PRR, 2, 1) ** 2, axis=0),
                            loss_likelihood]), [1, -1]),
                        fname)

                # Plot for manifold learning result
                if PMLR and dim_z == 2:
                    y_PMLR = sess.run(decoded, feed_dict={z_in: PMLR.z, keep_prob: 1})
                    #y_PMLR_img = y_PMLR.reshape(PMLR.n_tot_imgs, IMAGE_SIZE_MNIST, IMAGE_SIZE_MNIST)
                    PMLR.save_samples(y_PMLR, name="/PMLR_epoch_%02d" % (epoch) + ".csv")

                    # plot distribution of labeled images
                    z_PMLR = sess.run(z, feed_dict={x_hat: x_PMLR, keep_prob: 1})
                    PMLR.save_scattered_image(z_PMLR, id_PMLR,
                                              name="/PMLR_map_epoch_%02d" % (epoch) + ".jpg")

    fname.close()
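# A minimal restore sketch, assuming the graph-building code in main() has
# been executed again in the current process (so y, x_hat, and keep_prob
# exist) and RESULTS_DIR contains the 'reg_vae' checkpoints written by
# saver.save() above; tf.train.latest_checkpoint() picks the newest epoch.
def _example_restore(sess, results_dir, y, x_hat, keep_prob, test_data):
    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(results_dir)
    saver.restore(sess, ckpt)
    # reconstruct the first 16 test images with dropout disabled
    return sess.run(y, feed_dict={x_hat: test_data[:16], keep_prob: 1.0})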
if 'bert' not in config and 'bert' not in config['name']:
    vecs = data_utils.load_vectors('../resources/{}.vectors.npy'.format(vec_name),
                                   dim=vec_dim, seed=SEED)
    vocab_name = '../resources/{}.vocab.pkl'.format(vec_name)
    data = datasets.StanceData(args['trn_data'], vocab_name,
                               pad_val=len(vecs) - 1,
                               max_tok_len=int(config.get('max_tok_len', '200')),
                               max_sen_len=int(config.get('max_sen_len', '10')),
                               keep_sen=('keep_sen' in config),
                               **trn_data_kwargs)
else:
    data = datasets.StanceData(args['trn_data'], None,
                               max_tok_len=config['max_tok_len'],
                               max_top_len=config['max_top_len'],
                               is_bert=True,
                               add_special_tokens=(config.get('together_in', '0') == '0'),
                               **trn_data_kwargs)

dataloader = data_utils.DataSampler(data, batch_size=int(config['b']))

if 'bert' not in config and 'bert' not in config['name']:
    dev_data = datasets.StanceData(args['dev_data'], vocab_name,
                                   pad_val=len(vecs) - 1,
                                   max_tok_len=int(config.get('max_tok_len', '200')),
                                   max_sen_len=int(config.get('max_sen_len', '10')),
                                   keep_sen=('keep_sen' in config),
                                   **dev_data_kwargs)
else:
    dev_data = datasets.StanceData(args['dev_data'], None,
                                   max_tok_len=config['max_tok_len'],
                                   max_top_len=config['max_top_len'],
                                   is_bert=True,
                                   add_special_tokens=(config.get('together_in', '0') == '0'),
                                   **dev_data_kwargs)

dev_dataloader = data_utils.DataSampler(dev_data, batch_size=int(config['b']), shuffle=False)
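# Hypothetical config sketches showing the keys this block actually reads;
# the values are illustrative only, not documented defaults. The word-vector
# branch is taken when neither the config keys nor its 'name' mention 'bert';
# string values mirror the int(...) casts above.
example_wordvec_config = {
    'name': 'bilstm_baseline',   # no 'bert' anywhere -> word-vector branch
    'b': '64',                   # batch size, cast with int()
    'max_tok_len': '200',
    'max_sen_len': '10',
    # 'keep_sen': '1',           # mere presence of the key toggles keep_sen
}
example_bert_config = {
    'name': 'bert_baseline',     # 'bert' in the name -> BERT branch
    'b': '16',
    'max_tok_len': 200,          # passed through unconverted in the BERT branch
    'max_top_len': 5,
    'together_in': '0',          # '0' -> add_special_tokens=True
}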
parser.add_argument('-r', '--num_trials', help='Number of trials for search')
args = vars(parser.parse_args())

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

data = datasets.StanceData(args['trn_data'], None, max_tok_len=200, max_top_len=5,
                           is_bert=True, add_special_tokens=True)
dataloader = data_utils.DataSampler(data, batch_size=64, shuffle=False)

dev_data = datasets.StanceData(args['dev_data'], None, max_tok_len=200, max_top_len=5,
                               is_bert=True, add_special_tokens=True)
dev_dataloader = data_utils.DataSampler(dev_data, batch_size=64, shuffle=False)

if args['test_data'] is not None:
    test_data = datasets.StanceData(args['test_data'], None, max_tok_len=200,