Example 1
def setUp(self):
    stopwords = "stop".split()
    keywords = "information agency retrieval".split()
    # documents = [
    #        ("Document 1", "information retrieval information retrieval"),
    #        ("Document 2", "retrieval retrieval retrieval retrieval"),
    #        ("Document 3", "agency information retrieval agency"),
    #        ("Document 4", "retrieval agency retrieval agency"),
    #    ]
    documents = Loader.load_documents("data/documents-lab1.txt")
    self.s = TFIDF(keywords, documents, Cleaner(stopwords))
Example 2
def test(args):
    data_loader = Loader('data/test/')
    assert not args.gpu or (args.gpu and torch.cuda.is_available())
    test_data = data_loader.data
    with open(join('models', args.model, 'params.json'), 'r') as f:
        dicts = json.load(f)
        char_vocab = dicts['chars']
        tag_vocab = dicts['tags']
    model = torch.load(join('models', args.model, 'model.pt'))
    model.eval()
    print('Test samples:', len(test_data))
    print(char_vocab)
    print(tag_vocab)
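    # replace the loaded test set with a small hand-written set of expressions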
    test_data = [('$sqrt[8]{x^{8}}$', 'irration_fun'),
                 ('$sqrt[11]{x^{11}}$', 'ration_fun'),
                 ('$sqrt[462]{x^{462}}$', 'irration_fun'),
                 ('$sqrt[1131]{x^{1131}}$', 'ration_fun')]
    evaluate_test_set(model, test_data, char_vocab, tag_vocab, args.gpu)
Example 3
    def test_from_json(self):
        json_obj = [{'source': [1, 2, 3], 'target': [4, 5, 6]},
                    {'source': [1, 2, 3], 'target': [4, 5, 6]}]
        file_path = '/path/to/json_file'
        import json  # noqa: F401
        open_patcher = patch('data.open')
        json_load_patcher = patch('json.load', return_value=json_obj)
        open_mock = open_patcher.start()
        json_load_mock = json_load_patcher.start()

        source = Field()
        target = Field()
        examples = Loader.from_json(file_path, {'source': source, 'target': target})

        open_mock.assert_called_once_with(file_path)
        json_load_mock.assert_called_once_with(open_mock(file_path).__enter__())

        self.assertEqual(len(examples['source']), 2)
        self.assertEqual(len(examples['target']), 2)
        self.assertListEqual(examples['source'].data, [[1, 2, 3], [1, 2, 3]])
        self.assertListEqual(examples['target'].data, [[4, 5, 6], [4, 5, 6]])

        open_patcher.stop()
        json_load_patcher.stop()
Example 4
          u"Copyright:\tThis is free software: you are free to change and "
          u"redistribute it.\n\t\tThere is NO WARRANTY, to the extent "
          u"permitted by law."))
    parser.add_argument('-k', '--keywords', help="Keywords file path",
            default="data/keywords-2.txt")
    parser.add_argument('-s', '--stopwords', help="Stopwords file path",
            default="data/stopwords.txt")
    parser.add_argument('-d', '--documents', help="Documents file path",
            default="data/documents-2.txt")
    parser.add_argument('-n', '--noresults',
            help="Number of displayed results", default="5")
    parser.add_argument('-v', '--version', action='version',
            version='%(prog)s 0.3')
    args = parser.parse_args()

    keywords = Loader.load_keywords(args.keywords)
    stopwords = Loader.load_stopwords(args.stopwords)
    documents = Loader.load_documents(args.documents)
    n = int(args.noresults)

    cleaner = Cleaner(stopwords)
    tfidf = TFIDF(keywords, documents, cleaner)

    question = raw_input("Enter search string or \"exit()\" and press enter: ")
    while question != "exit()":
        found = tfidf.search(question)
        for title, similarity, index in found[:n]:
            print "{0:4f}\t{1}".format(similarity, title)
        groups = tfidf.group_kmeans(9, 10)
        for i, group in enumerate(groups):
            print "\nGroup {0}:\n".format(i)
Example 5
def main(args):
    validSet = Loader(os.path.join(FLAGS.data_dir, 'test.csv'),
                      FLAGS.vocab_path, FLAGS.pn_batch_size,
                      FLAGS.ctr_batch_size, FLAGS.seq_length)
    G = Generator()
    G(validSet.vocab_size)

    with tf.Session() as sess:

        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint('models/pre_G'))

        graph = tf.get_default_graph()
        encoder_inputs = graph.get_tensor_by_name("G_inputs/encoder_inputs:0")
        user_inputs = graph.get_tensor_by_name("G_inputs/user_inputs:0")
        input_lengths = graph.get_tensor_by_name("G_inputs/input_lengths:0")
        pointer_labels = graph.get_tensor_by_name("G_outputs/pointer_labels:0")
        pointer_hot_labels = graph.get_tensor_by_name(
            "G_outputs/pointer_hot_labels:0")

        # loss = graph.get_tensor_by_name("G_loss/loss:0")
        pointer_prob = graph.get_tensor_by_name("G_loss/pointer_prob:0")
        rank_pointers = graph.get_tensor_by_name("G_pointers/rank_pointers:0")

        print('Finished loading model.')
        # test
        G_val_acc0, G_val_loss0 = 0, 0
        for itr in range(validSet.n_batches):
            x_raw, x_batch, u_batch, x_lengths, y_batch, y_hot_batch = validSet.next_pn_batch(
            )
            test_dict = {
                encoder_inputs: x_batch,
                user_inputs: u_batch,
                input_lengths: x_lengths,
                pointer_labels: y_batch,
                pointer_hot_labels: y_hot_batch
            }
            output_prob, pre_labels = sess.run([pointer_prob, rank_pointers],
                                               feed_dict=test_dict)
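            # jishu ("count"): number of predicted pointer indices that appear in the gold labels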
            jishu = 0
            for j, line in enumerate(pre_labels):
                # print u_batch[j]
                for word in line:
                    if word in y_batch[j]:
                        jishu = jishu + 1
            acc = jishu * 1.0 / (FLAGS.pn_batch_size * 5)
            G_val_acc0 += acc
            print(pre_labels)
            print(y_batch)
            if itr == 0:
                for i in range(FLAGS.pn_batch_size):
                    print(i)
                    origin = ''
                    predict = ''
                    for j in range(20):
                        if j in y_batch[i]:
                            origin += x_raw[i, j]
                    for j in range(20):
                        if j in pre_labels[i]:
                            predict += x_raw[i, j]
                    print(i, origin)
                    print(i, predict)

        print("Test Generator: test_acc:{:.2f}".format(G_val_acc0 /
                                                       validSet.n_batches))
Example 6
def train(args):
    # load data
    vocab_path = os.path.join(args.data_dir, 'vocab.json')
    training = Loader(os.path.join(args.data_dir, 'train.txt'), vocab_path,
                      args.batch_size, 45)
    validation = Loader(os.path.join(args.data_dir, 'validate.txt'),
                        vocab_path, args.batch_size, 45)

    # create TensorFlow graph
    ptr_net = PointerNet(batch_size=args.batch_size,
                         learning_rate=args.learning_rate)
    saver = tf.train.Saver()
    best_val_acc = 0

    # record training loss & accuracy
    train_losses = []
    train_accuracies = []

    # initialize graph
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for ep in tqdm(range(args.n_epochs)):
            tr_loss, tr_acc = 0, 0
            for itr in range(training.n_batches):
                x_batch, x_lengths, y_batch = training.next_batch()
                train_dict = {
                    ptr_net.encoder_inputs: x_batch,
                    ptr_net.input_lengths: x_lengths,
                    ptr_net.pointer_labels: y_batch
                }
                loss, acc, _ = sess.run(
                    [ptr_net.loss, ptr_net.exact_match, ptr_net.train_step],
                    feed_dict=train_dict)
                tr_loss += loss
                tr_acc += acc

            train_losses.append(tr_loss / training.n_batches)
            train_accuracies.append(tr_acc / training.n_batches)

            # check validation accuracy every 10 epochs
            if ep % 10 == 0:
                val_acc = 0
                for itr in range(validation.n_batches):
                    x_batch, x_lengths, y_batch = validation.next_batch()
                    val_dict = {
                        ptr_net.encoder_inputs: x_batch,
                        ptr_net.input_lengths: x_lengths,
                        ptr_net.pointer_labels: y_batch
                    }
                    val_acc += sess.run(ptr_net.exact_match,
                                        feed_dict=val_dict)
                val_acc = val_acc / validation.n_batches

                print('epoch {:3d}, loss={:.2f}'.format(
                    ep, tr_loss / training.n_batches))
                print('Train EM: {:.2f}, Validation EM: {:.2f}'.format(
                    tr_acc / training.n_batches, val_acc))

                # save model
                if val_acc >= best_val_acc:
                    print('Validation accuracy increased. Saving model.')
                    saver.save(sess, os.path.join(args.save_dir,
                                                  'ptr_net.ckpt'))
                    best_val_acc = val_acc
                else:
                    print('Validation accuracy decreased. Restoring model.')
                    saver.restore(sess,
                                  os.path.join(args.save_dir, 'ptr_net.ckpt'))

        print('Training complete.')
        print('Best Validation EM: {:.2f}'.format(best_val_acc))
Example 7
import tensorflow as tf
from config import get_config
from trainer import Trainer

from data import Loader, FeatureManager

sess = tf.Session()
config, _ = get_config()

data_loader = Loader(config.dataset, config.restrict)
feature_manager = FeatureManager(data_loader.restrictor)
feature_manager.generate_data(data_loader.melodies)

trainer = Trainer(config, feature_manager)

trainer.train()
Example 8
import os

from data import Loader
from word import Cleaner
from search import TFIDF
from guess import Guesses
import expander

from flask import Flask, render_template, request, jsonify

keywords_path = "data/keywords-2.txt"
stopwords_path = "data/stopwords.txt"
documents_path = "data/documents-2.txt"

keywords = Loader.load_keywords(keywords_path)
stopwords = Loader.load_stopwords(stopwords_path)
documents = Loader.load_documents(documents_path, categories=True)

cleaner = Cleaner(stopwords)
tfidf = TFIDF(keywords, documents, cleaner)
autocomplete = Guesses(tfidf.get_term_document_matrix(), tfidf.keywords, tfidf.keywords_lookup)

app = Flask(__name__)


@app.route('/')
def home():
    found_extended = None
    question = ""
    if 'search' in request.args:
Example 9
def main():
    opts = optparser.parse_args()[0]

    train_loader = Loader(opts.train)

    opts.vocab_len = len(train_loader._char_to_id)
    opts.pos_len = len(train_loader._pos_to_id)
    opts.max_pos_len = train_loader._pos_max_len
    opts.max_target_len = train_loader._char_max_len
    opts.use_cuda = opts.use_cuda == 1
    opts.eval = opts.eval == 1
    opts.data_size = train_loader.get_data_size()

    if not torch.cuda.is_available():
        opts.use_cuda = False
    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)

    if not opts.eval:
        # weights for paddings, set to 0
        loss_weights = torch.ones(opts.vocab_len)
        loss_weights[0] = 0
        criterion = nn.NLLLoss(loss_weights, size_average=False)

        c2i, i2c, p2i, i2p = train_loader.get_mappings()
        dev_loader = Loader(opts.dev, c2i, i2c, p2i, i2p)
        if dev_loader._pos_max_len > opts.max_pos_len:
            opts.max_pos_len = dev_loader._pos_max_len

        model = Module(opts)
        if opts.model_path != '':
            model = torch.load(opts.model_path)

        train_batcher = Batcher(opts.batch_size, train_loader.get_data(),
                                opts.max_pos_len, opts.eval)

        dev_batcher = Batcher(decode_batch, dev_loader.get_data(),
                              opts.max_pos_len, True)

        print model
        start_train(model, criterion, opts, train_batcher, dev_batcher)
    else:
        model = torch.load(opts.model_path)
        model.eval()
        #print model

        c2i, i2c, p2i, i2p = train_loader.get_mappings()

        test_loader = Loader(opts.test, c2i, i2c, p2i, i2p)
        if test_loader._pos_max_len > opts.max_pos_len:
            opts.max_pos_len = test_loader._pos_max_len
        test_batcher = Batcher(1, test_loader.get_data(), opts.max_pos_len,
                               opts.eval)

        opts.data_size = test_loader.get_data_size()
        decode(model, opts, test_batcher, i2c, i2p)
Example 10
def main(args):
    # load data
    click1_Set = Loader(os.path.join(FLAGS.data_dir, 'click1.csv'),
                        FLAGS.vocab_path, FLAGS.pn_batch_size,
                        FLAGS.ctr_batch_size, FLAGS.seq_length)
    click0_Set = Loader(os.path.join(FLAGS.data_dir, 'click0.csv'),
                        FLAGS.vocab_path, FLAGS.pn_batch_size,
                        FLAGS.ctr_batch_size, FLAGS.seq_length)

    # pretrain graph
    generator = Generator(click1_Set.vocab_size)
    pre_optimize = tf.train.AdamOptimizer(FLAGS.learning_rate)
    pre_train_step = pre_optimize.minimize(generator.loss,
                                           var_list=generator.vars)
    saver = tf.train.Saver()
    title_inputs = tf.placeholder(tf.int32, [None, FLAGS.seq_length],
                                  name='title_inputs')
    title_distribution = tf.placeholder(tf.float32, [None, FLAGS.seq_length],
                                        name='title_distribution')

    # formal train graph
    generator_fake = Generator(click0_Set.vocab_size)
    real_discriminator = Discriminator(click1_Set.vocab_size, title_inputs,
                                       title_distribution)
    fake_discriminator = Discriminator(click0_Set.vocab_size,
                                       generator_fake.encoder_inputs,
                                       generator_fake.pointer_prob)
    reg = tc.layers.apply_regularization(
        tc.layers.l1_regularizer(2.5e-5),
        weights_list=[
            var for var in tf.global_variables()
            if any(key in var.name for key in ('kernel', 'W1', 'W2'))
        ])
    D_real_loss = tf.reduce_mean(real_discriminator.predictions)
    D_fake_loss = tf.reduce_mean(fake_discriminator.predictions)
    D_loss = D_fake_loss - D_real_loss
    D_loss_reg = D_loss + reg
    D_optimize = tf.train.RMSPropOptimizer(FLAGS.learning_rate)

    # WGAN lipschitz-penalty
    alpha = tf.random_uniform(shape=[tf.shape(title_distribution)[0], 1, 1],
                              minval=0.,
                              maxval=1.)
    differences = generator_fake.pointer_prob_hot - title_distribution
    interpolates = title_distribution + (alpha * differences)
    gradients = tf.gradients(
        Discriminator(click0_Set.vocab_size, generator_fake.encoder_inputs,
                      interpolates).predictions, [interpolates])[0]
    slopes = tf.sqrt(
        tf.reduce_sum(tf.square(gradients), reduction_indices=[1, 2]))
    gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
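    # overwrites the L1-regularized D_loss_reg above: only the gradient-penalty term is used from here on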
    D_loss_reg = D_loss + 10 * gradient_penalty

    D_train_step = D_optimize.minimize(D_loss_reg,
                                       var_list=fake_discriminator.vars)

    G_loss = -tf.reduce_mean(fake_discriminator.predictions)
    G_loss_reg = G_loss + reg
    G_optimize = tf.train.RMSPropOptimizer(FLAGS.learning_rate)
    G_train_step = G_optimize.minimize(G_loss_reg,
                                       var_list=generator_fake.vars)

    saver_G = tf.train.Saver(var_list=generator.vars)
    saver_D = tf.train.Saver(var_list=real_discriminator.vars)

    # for var in tf.global_variables():
    #     print(var.name)
    # training process
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        pretrain_G(sess, saver, generator, click1_Set, FLAGS.G_pretrain_epochs,
                   pre_train_step)
        print('Start training...')
        for i in range(FLAGS.train_epochs):
            d_iters = FLAGS.d_iters
            g_iters = FLAGS.g_iters
            for _ in range(0, d_iters):
                x_batch_fake, u_batch_fake, x_lengths_fake, y_batch_fake, y_hot_batch_fake = click0_Set.next_pn_batch(
                )
                x_batch_real, u_batch_real, x_lengths_real, y_batch_real, y_hot_batch_real = click1_Set.next_pn_batch(
                )
                D_dict = {
                    generator_fake.encoder_inputs: x_batch_fake,
                    generator_fake.user_inputs: u_batch_fake,
                    generator_fake.input_lengths: x_lengths_fake,
                    generator_fake.pointer_labels: y_batch_fake,
                    generator_fake.pointer_hot_labels: y_hot_batch_fake,
                    title_inputs: x_batch_real,
                    title_distribution: y_hot_batch_real,
                    real_discriminator.user_inputs: u_batch_real,
                    fake_discriminator.user_inputs: u_batch_fake
                }
                sess.run(fake_discriminator.d_clip)
                loss_Real, loss_Fake, lossD, _ = sess.run(
                    [D_real_loss, D_fake_loss, D_loss, D_train_step],
                    feed_dict=D_dict)

            for _ in range(0, g_iters):
                x_batch_fake, u_batch_fake, x_lengths_fake, y_batch_fake, y_hot_batch_fake = click0_Set.next_pn_batch(
                )
                x_batch_real, u_batch_real, x_lengths_real, y_batch_real, y_hot_batch_real = click1_Set.next_pn_batch(
                )
                D_dict = {
                    generator_fake.encoder_inputs: x_batch_fake,
                    generator_fake.user_inputs: u_batch_fake,
                    generator_fake.input_lengths: x_lengths_fake,
                    generator_fake.pointer_labels: y_batch_fake,
                    generator_fake.pointer_hot_labels: y_hot_batch_fake,
                    title_inputs: x_batch_real,
                    title_distribution: y_hot_batch_real,
                    real_discriminator.user_inputs: u_batch_real,
                    fake_discriminator.user_inputs: u_batch_fake
                }
                lossG, _ = sess.run([G_loss, G_train_step], feed_dict=D_dict)

            print(
                "epoch:{}, D_loss:{:.2f}, G_loss:{:.2f}, loss_Real:{:.2f}, loss_Fake:{:.2f}, Sum_loss:{:.2f}"
                .format(i, lossD, lossG, loss_Real, loss_Fake, lossD + lossG))

        saver_G.save(sess, os.path.join(FLAGS.save_dir, 'G/train_generator'))
        saver_D.save(sess, os.path.join(FLAGS.save_dir,
                                        'D/train_discriminator'))