def setUp(self):
    stopwords = "stop".split()
    keywords = "information agency retrieval".split()
    # documents = [
    #     ("Document 1", "information retrieval information retrieval"),
    #     ("Document 2", "retrieval retrieval retrieval retrieval"),
    #     ("Document 3", "agency information retrieval agency"),
    #     ("Document 4", "retrieval agency retrieval agency"),
    # ]
    documents = Loader.load_documents("data/documents-lab1.txt")
    self.s = TFIDF(keywords, documents, Cleaner(stopwords))
def test(args):
    data_loader = Loader('data/test/')
    assert not args.gpu or torch.cuda.is_available()
    test_data = data_loader.data
    with open(join('models', args.model, 'params.json'), 'r') as f:
        dicts = json.load(f)
    char_vocab = dicts['chars']
    tag_vocab = dicts['tags']
    model = torch.load(join('models', args.model, 'model.pt'))
    model.eval()
    print('Test samples:', len(test_data))
    print(char_vocab)
    print(tag_vocab)
    # hard-coded samples override the test set loaded above
    test_data = [('$sqrt[8]{x^{8}}$', 'irration_fun'),
                 ('$sqrt[11]{x^{11}}$', 'ration_fun'),
                 ('$sqrt[462]{x^{462}}$', 'irration_fun'),
                 ('$sqrt[1131]{x^{1131}}$', 'ration_fun')]
    evaluate_test_set(model, test_data, char_vocab, tag_vocab, args.gpu)
def test_from_json(self):
    json_obj = [{'source': [1, 2, 3], 'target': [4, 5, 6]},
                {'source': [1, 2, 3], 'target': [4, 5, 6]}]
    file_path = '/path/to/json_file'
    import json  # noqa: F401
    open_patcher = patch('data.open')
    json_load_patcher = patch('json.load', return_value=json_obj)
    open_mock = open_patcher.start()
    json_load_mock = json_load_patcher.start()
    source = Field()
    target = Field()
    examples = Loader.from_json(file_path, {'source': source, 'target': target})
    open_mock.assert_called_once_with(file_path)
    json_load_mock.assert_called_once_with(open_mock(file_path).__enter__())
    self.assertEqual(len(examples['source']), 2)
    self.assertEqual(len(examples['target']), 2)
    self.assertListEqual(examples['source'].data, [[1, 2, 3], [1, 2, 3]])
    self.assertListEqual(examples['target'].data, [[4, 5, 6], [4, 5, 6]])
    open_patcher.stop()
    json_load_patcher.stop()
u"Copyright:\tThis is free software: you are free to change and " u"redistribute it.\n\t\tThere is NO WARRANTY, to the extent " u"permitted by law.")) parser.add_argument('-k', '--keywords', help="Keywords file path", default="data/keywords-2.txt") parser.add_argument('-s', '--stopwords', help="Stopwords file path", default="data/stopwords.txt") parser.add_argument('-d', '--documents', help="Documents file path", default="data/documents-2.txt") parser.add_argument('-n', '--noresults', help="Number of displayed results", default="5") parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.3') args = parser.parse_args() keywords = Loader.load_keywords(args.keywords) stopwords = Loader.load_stopwords(args.stopwords) documents = Loader.load_documents(args.documents) n = int(args.noresults) cleaner = Cleaner(stopwords) tfidf = TFIDF(keywords, documents, cleaner) question = raw_input("Enter search string or \"exit()\" and press enter: ") while question != "exit()": found = tfidf.search(question) for title, similarity, index in found[:n]: print "{0:4f}\t{1}".format(similarity, title) groups = tfidf.group_kmeans(9, 10) for i, group in enumerate(groups): print "\nGroup {0}:\n".format(i)
def main(args):
    validSet = Loader(os.path.join(FLAGS.data_dir, 'test.csv'), FLAGS.vocab_path,
                      FLAGS.pn_batch_size, FLAGS.ctr_batch_size, FLAGS.seq_length)
    G = Generator()
    G(validSet.vocab_size)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint('models/pre_G'))
        graph = tf.get_default_graph()
        encoder_inputs = graph.get_tensor_by_name("G_inputs/encoder_inputs:0")
        user_inputs = graph.get_tensor_by_name("G_inputs/user_inputs:0")
        input_lengths = graph.get_tensor_by_name("G_inputs/input_lengths:0")
        pointer_labels = graph.get_tensor_by_name("G_outputs/pointer_labels:0")
        pointer_hot_labels = graph.get_tensor_by_name(
            "G_outputs/pointer_hot_labels:0")
        # loss = graph.get_tensor_by_name("G_loss/loss:0")
        pointer_prob = graph.get_tensor_by_name("G_loss/pointer_prob:0")
        rank_pointers = graph.get_tensor_by_name("G_pointers/rank_pointers:0")
        print('finish loading model!')

        # test
        G_val_acc0, G_val_loss0 = 0, 0
        for itr in range(validSet.n_batches):
            x_raw, x_batch, u_batch, x_lengths, y_batch, y_hot_batch = validSet.next_pn_batch()
            test_dict = {
                encoder_inputs: x_batch,
                user_inputs: u_batch,
                input_lengths: x_lengths,
                pointer_labels: y_batch,
                pointer_hot_labels: y_hot_batch
            }
            output_prob, pre_labels = sess.run([pointer_prob, rank_pointers],
                                               feed_dict=test_dict)
            # jishu counts predicted pointers that appear in the ground-truth labels
            jishu = 0
            for j, line in enumerate(pre_labels):
                # print u_batch[j]
                for word in line:
                    if word in y_batch[j]:
                        jishu = jishu + 1
            acc = jishu * 1.0 / (FLAGS.pn_batch_size * 5)
            G_val_acc0 += acc
            print(pre_labels)
            print(y_batch)
            if itr == 0:
                for i in range(FLAGS.pn_batch_size):
                    print(i)
                    origin = ''
                    predict = ''
                    for j in range(20):
                        if j in y_batch[i]:
                            origin += x_raw[i, j]
                    for j in range(20):
                        if j in pre_labels[i]:
                            predict += x_raw[i, j]
                    print(i, origin)
                    print(i, predict)
        print("Test Generator: test_acc:{:.2f}".format(
            G_val_acc0 / validSet.n_batches))
def train(args):
    # load data
    vocab_path = os.path.join(args.data_dir, 'vocab.json')
    training = Loader(os.path.join(args.data_dir, 'train.txt'), vocab_path,
                      args.batch_size, 45)
    validation = Loader(os.path.join(args.data_dir, 'validate.txt'), vocab_path,
                        args.batch_size, 45)

    # create TensorFlow graph
    ptr_net = PointerNet(batch_size=args.batch_size,
                         learning_rate=args.learning_rate)
    saver = tf.train.Saver()
    best_val_acc = 0

    # record training loss & accuracy
    train_losses = []
    train_accuracies = []

    # initialize graph
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for ep in tqdm(range(args.n_epochs)):
            tr_loss, tr_acc = 0, 0
            for itr in range(training.n_batches):
                x_batch, x_lengths, y_batch = training.next_batch()
                train_dict = {
                    ptr_net.encoder_inputs: x_batch,
                    ptr_net.input_lengths: x_lengths,
                    ptr_net.pointer_labels: y_batch
                }
                loss, acc, _ = sess.run(
                    [ptr_net.loss, ptr_net.exact_match, ptr_net.train_step],
                    feed_dict=train_dict)
                tr_loss += loss
                tr_acc += acc
            train_losses.append(tr_loss / training.n_batches)
            train_accuracies.append(tr_acc / training.n_batches)

            # check validation accuracy every 10 epochs
            if ep % 10 == 0:
                val_acc = 0
                for itr in range(validation.n_batches):
                    x_batch, x_lengths, y_batch = validation.next_batch()
                    val_dict = {
                        ptr_net.encoder_inputs: x_batch,
                        ptr_net.input_lengths: x_lengths,
                        ptr_net.pointer_labels: y_batch
                    }
                    val_acc += sess.run(ptr_net.exact_match, feed_dict=val_dict)
                val_acc = val_acc / validation.n_batches
                print('epoch {:3d}, loss={:.2f}'.format(
                    ep, tr_loss / training.n_batches))
                print('Train EM: {:.2f}, Validation EM: {:.2f}'.format(
                    tr_acc / training.n_batches, val_acc))

                # save model
                if val_acc >= best_val_acc:
                    print('Validation accuracy increased. Saving model.')
                    saver.save(sess, os.path.join(args.save_dir, 'ptr_net.ckpt'))
                    best_val_acc = val_acc
                else:
                    print('Validation accuracy decreased. Restoring model.')
                    saver.restore(sess,
                                  os.path.join(args.save_dir, 'ptr_net.ckpt'))

    print('Training complete.')
    print('Best Validation EM: {:.2f}'.format(best_val_acc))
import tensorflow as tf

from config import get_config
from trainer import Trainer
from data import Loader, FeatureManager

sess = tf.Session()
config, _ = get_config()
data_loader = Loader(config.dataset, config.restrict)
feature_manager = FeatureManager(data_loader.restrictor)
feature_manager.generate_data(data_loader.melodies)
trainer = Trainer(config, feature_manager)
trainer.train()
import os

from data import Loader
from word import Cleaner
from search import TFIDF
from guess import Guesses
import expander

from flask import Flask, render_template, request, jsonify

keywords_path = "data/keywords-2.txt"
stopwords_path = "data/stopwords.txt"
documents_path = "data/documents-2.txt"

keywords = Loader.load_keywords(keywords_path)
stopwords = Loader.load_stopwords(stopwords_path)
documents = Loader.load_documents(documents_path, categories=True)

cleaner = Cleaner(stopwords)
tfidf = TFIDF(keywords, documents, cleaner)
autocomplete = Guesses(tfidf.get_term_document_matrix(), tfidf.keywords,
                       tfidf.keywords_lookup)

app = Flask(__name__)


@app.route('/')
def home():
    found_extended = None
    question = ""
    if 'search' in request.args:
def main():
    opts = optparser.parse_args()[0]
    train_loader = Loader(opts.train)
    opts.vocab_len = len(train_loader._char_to_id)
    opts.pos_len = len(train_loader._pos_to_id)
    opts.max_pos_len = train_loader._pos_max_len
    opts.max_target_len = train_loader._char_max_len
    opts.use_cuda = opts.use_cuda == 1
    opts.eval = opts.eval == 1
    opts.data_size = train_loader.get_data_size()
    if not torch.cuda.is_available():
        opts.use_cuda = False
    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    if not opts.eval:
        # weights for paddings, set to 0
        loss_weights = torch.ones(opts.vocab_len)
        loss_weights[0] = 0
        criterion = nn.NLLLoss(loss_weights, size_average=False)
        c2i, i2c, p2i, i2p = train_loader.get_mappings()
        dev_loader = Loader(opts.dev, c2i, i2c, p2i, i2p)
        if dev_loader._pos_max_len > opts.max_pos_len:
            opts.max_pos_len = dev_loader._pos_max_len
        model = Module(opts)
        if opts.model_path != '':
            model = torch.load(opts.model_path)
        train_batcher = Batcher(opts.batch_size, train_loader.get_data(),
                                opts.max_pos_len, opts.eval)
        dev_batcher = Batcher(decode_batch, dev_loader.get_data(),
                              opts.max_pos_len, True)
        print model
        start_train(model, criterion, opts, train_batcher, dev_batcher)
    else:
        model = torch.load(opts.model_path)
        model.eval()
        # print model
        c2i, i2c, p2i, i2p = train_loader.get_mappings()
        test_loader = Loader(opts.test, c2i, i2c, p2i, i2p)
        if test_loader._pos_max_len > opts.max_pos_len:
            opts.max_pos_len = test_loader._pos_max_len
        test_batcher = Batcher(1, test_loader.get_data(), opts.max_pos_len,
                               opts.eval)
        opts.data_size = test_loader.get_data_size()
        decode(model, opts, test_batcher, i2c, i2p)
def main(args):
    # load data
    click1_Set = Loader(os.path.join(FLAGS.data_dir, 'click1.csv'),
                        FLAGS.vocab_path, FLAGS.pn_batch_size,
                        FLAGS.ctr_batch_size, FLAGS.seq_length)
    click0_Set = Loader(os.path.join(FLAGS.data_dir, 'click0.csv'),
                        FLAGS.vocab_path, FLAGS.pn_batch_size,
                        FLAGS.ctr_batch_size, FLAGS.seq_length)

    # pretrain graph
    generator = Generator(click1_Set.vocab_size)
    pre_optimize = tf.train.AdamOptimizer(FLAGS.learning_rate)
    pre_train_step = pre_optimize.minimize(generator.loss,
                                           var_list=generator.vars)
    saver = tf.train.Saver()
    title_inputs = tf.placeholder(tf.int32, [None, FLAGS.seq_length],
                                  name='title_inputs')
    title_distribution = tf.placeholder(tf.float32, [None, FLAGS.seq_length],
                                        name='title_distribution')

    # formal train graph
    generator_fake = Generator(click0_Set.vocab_size)
    real_discriminator = Discriminator(click1_Set.vocab_size, title_inputs,
                                       title_distribution)
    fake_discriminator = Discriminator(click0_Set.vocab_size,
                                       generator_fake.encoder_inputs,
                                       generator_fake.pointer_prob)
    # L1-regularize the kernel/W1/W2 weight variables
    reg = tc.layers.apply_regularization(
        tc.layers.l1_regularizer(2.5e-5),
        weights_list=[
            var for var in tf.global_variables()
            if any(key in var.name for key in ('kernel', 'W1', 'W2'))
        ])
    D_real_loss = tf.reduce_mean(real_discriminator.predictions)
    D_fake_loss = tf.reduce_mean(fake_discriminator.predictions)
    D_loss = D_fake_loss - D_real_loss
    D_loss_reg = D_loss + reg
    D_optimize = tf.train.RMSPropOptimizer(FLAGS.learning_rate)

    # WGAN lipschitz-penalty
    alpha = tf.random_uniform(shape=[tf.shape(title_distribution)[0], 1, 1],
                              minval=0., maxval=1.)
    differences = generator_fake.pointer_prob_hot - title_distribution
    interpolates = title_distribution + (alpha * differences)
    gradients = tf.gradients(
        Discriminator(click0_Set.vocab_size, generator_fake.encoder_inputs,
                      interpolates).predictions, [interpolates])[0]
    slopes = tf.sqrt(
        tf.reduce_sum(tf.square(gradients), reduction_indices=[1, 2]))
    gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
    # reassigned: the gradient penalty replaces the L1 term in the critic loss
    D_loss_reg = D_loss + 10 * gradient_penalty
    D_train_step = D_optimize.minimize(D_loss_reg,
                                       var_list=fake_discriminator.vars)

    G_loss = -tf.reduce_mean(fake_discriminator.predictions)
    G_loss_reg = G_loss + reg
    G_optimize = tf.train.RMSPropOptimizer(FLAGS.learning_rate)
    G_train_step = G_optimize.minimize(G_loss_reg,
                                       var_list=generator_fake.vars)
    saver_G = tf.train.Saver(var_list=generator.vars)
    saver_D = tf.train.Saver(var_list=real_discriminator.vars)
    # for var in tf.global_variables():
    #     print(var.name)

    # training process
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        pretrain_G(sess, saver, generator, click1_Set,
                   FLAGS.G_pretrain_epochs, pre_train_step)
        print('Start training...')
        for i in range(FLAGS.train_epochs):
            d_iters = FLAGS.d_iters
            g_iters = FLAGS.g_iters
            # discriminator (critic) updates
            for _ in range(0, d_iters):
                x_batch_fake, u_batch_fake, x_lengths_fake, y_batch_fake, y_hot_batch_fake = click0_Set.next_pn_batch()
                x_batch_real, u_batch_real, x_lengths_real, y_batch_real, y_hot_batch_real = click1_Set.next_pn_batch()
                D_dict = {
                    generator_fake.encoder_inputs: x_batch_fake,
                    generator_fake.user_inputs: u_batch_fake,
                    generator_fake.input_lengths: x_lengths_fake,
                    generator_fake.pointer_labels: y_batch_fake,
                    generator_fake.pointer_hot_labels: y_hot_batch_fake,
                    title_inputs: x_batch_real,
                    title_distribution: y_hot_batch_real,
                    real_discriminator.user_inputs: u_batch_real,
                    fake_discriminator.user_inputs: u_batch_fake
                }
                sess.run(fake_discriminator.d_clip)
                loss_Real, loss_Fake, lossD, _ = sess.run(
                    [D_real_loss, D_fake_loss, D_loss, D_train_step],
                    feed_dict=D_dict)
            # generator updates
            for _ in range(0, g_iters):
                x_batch_fake, u_batch_fake, x_lengths_fake, y_batch_fake, y_hot_batch_fake = click0_Set.next_pn_batch()
                x_batch_real, u_batch_real, x_lengths_real, y_batch_real, y_hot_batch_real = click1_Set.next_pn_batch()
                D_dict = {
                    generator_fake.encoder_inputs: x_batch_fake,
                    generator_fake.user_inputs: u_batch_fake,
                    generator_fake.input_lengths: x_lengths_fake,
                    generator_fake.pointer_labels: y_batch_fake,
                    generator_fake.pointer_hot_labels: y_hot_batch_fake,
                    title_inputs: x_batch_real,
                    title_distribution: y_hot_batch_real,
                    real_discriminator.user_inputs: u_batch_real,
                    fake_discriminator.user_inputs: u_batch_fake
                }
                lossG, _ = sess.run([G_loss, G_train_step], feed_dict=D_dict)
            print("epoch:{}, D_loss:{:.2f}, G_loss:{:.2f}, loss_Real:{:.2f}, "
                  "loss_Fake:{:.2f}, Sum_loss:{:.2f}".format(
                      i, lossD, lossG, loss_Real, loss_Fake, lossD + lossG))
        saver_G.save(sess, os.path.join(FLAGS.save_dir, 'G/train_generator'))
        saver_D.save(sess, os.path.join(FLAGS.save_dir, 'D/train_discriminator'))