Esempio n. 1
0
def main(_):
    """Train the QA system on the preprocessed data under FLAGS.data_dir.

    Builds the encoder, decoder and QASystem from the command-line flags,
    attaches a ``log.txt`` file handler to the root logger, writes the flag
    values to ``flags.json`` in FLAGS.log_dir and then trains inside a
    TensorFlow session.
    """
    train_data, val_data = load_preprocess_data(
        FLAGS.data_dir, FLAGS.max_context_len, FLAGS.max_question_len)

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")

    _vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding_matrix = load_embeddings(embed_path)
    embeddings = tf.constant(embedding_matrix, tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.summary_flag,
                      FLAGS.max_context_len, FLAGS.max_question_len)
    decoder = Decoder(FLAGS.state_size, FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embeddings, rev_vocab)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        # The same FLAGS.train_dir is used for restoring and for saving.
        load_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
Esempio n. 2
0
def main(_):
    """Train the QA system on the 'train' split and evaluate it on 'val'.

    The model is restored from FLAGS.load_train_dir when that flag is set
    (otherwise from FLAGS.train_dir), trained, and finally evaluated on the
    validation split with ``sample=1000``.
    """
    train_set = initialize_datasets(FLAGS.data_dir, 'train')
    val_set = initialize_datasets(FLAGS.data_dir, 'val')

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    _vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_set, save_train_dir)

        qa.evaluate_answer(sess, val_set, sample=1000, log=True)
Esempio n. 3
0
def main(args):
    """Train the SQuAD QA system and evaluate it on the validation split.

    Loads the 'glove' embedding matrix, the train/val data and the
    vocabulary.  When FLAGS.verify_only is set, samples are printed via
    ``print_samples`` and the function returns without training.
    Otherwise the training directory is exposed through a fixed symlink,
    logging and ``flags.json`` are set up, and the model is trained and
    evaluated inside a TensorFlow session.

    Args:
        args: Arguments forwarded by the launcher; not used by this
            function.
    """
    embed_path = FLAGS.embed_path or "data/squad/glove.trimmed.{}.npz".format(
        FLAGS.embedding_size)
    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; the context manager closes it even if the 'glove' lookup fails.
    with np.load(embed_path) as embeddingz:
        embeddings = embeddingz['glove']
    assert embeddings.shape[1] == FLAGS.embedding_size

    vocab_len = embeddings.shape[0]

    train = load_squad(FLAGS.data_dir,
                       "train",
                       max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                       max_samples=FLAGS.max_train_samples)
    val = load_squad(FLAGS.data_dir,
                     "val",
                     max_vocab=vocab_len if FLAGS.check_embeddings else 0,
                     max_samples=FLAGS.max_val_samples)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if FLAGS.verify_only:
        print_samples(train, FLAGS.verify_only, rev_vocab)

        return

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    #
    # lexists() is used instead of exists(): exists() follows the link and
    # reports False for a dangling symlink left by an earlier run, which
    # would make os.symlink() below fail because the link name is taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    qa = QASystem(train_dir, embeddings)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Persist the flag values next to the log.
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)

        qa.train(sess, train)

        qa.evaluate_answer(sess, qa.preprocess_sequence_data(val), log=True)
Esempio n. 4
0
def main(_):
    """Train the QA system from the SQuAD id files under ./data/squad.

    The training files are not loaded here: their paths, together with the
    batch size, max length and output size flags, are bundled into ``info``
    and passed to ``qa.train`` along with the ``batch_generator`` callable.
    The validation data is loaded up front via ``load_and_pad_val_data``.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # load all in once, maybe better to try batch by batch
    question_path = "./data/squad/train.ids.question"
    context_path = "./data/squad/train.ids.context"
    answer_path = "./data/squad/train.span"

    val_q = "./data/squad/val.ids.question"
    val_c = "./data/squad/val.ids.context"
    val_a = "./data/squad/val.span"

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")

    # embeddings is a matrix of shape [vocab_size, embedding_size]
    # np.load on an .npz keeps the archive open until closed, so read the
    # array inside a context manager to release the file handle.
    with np.load(embed_path) as npz:
        embeddings = npz['glove'].astype(np.float32)
    val_data = load_and_pad_val_data(val_q, val_c, val_a)

    # vocab is the mapping from word -> token id
    # rev_vocab is the reverse mapping, from id -> word
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # someone posted that the max length of question is 766
    info = (question_path, context_path, answer_path, FLAGS.batch_size,
            FLAGS.max_length, FLAGS.output_size)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, batch_generator, info, save_train_dir, val_data,
                 rev_vocab)
Esempio n. 5
0
def main(FLAGS):
    """Build, train and evaluate the QA system, printing progress banners.

    Args:
        FLAGS: Object exposing the configuration attributes read below
            (embed_path, vocab_path, data_dir, embedding_size, output_size,
            log_dir, load_train_dir, train_dir, evaluate).  It shadows any
            module-level ``FLAGS`` inside this function.
    """
    print(80 * "=")
    print("INITIALIZING")
    print(80 * "=")

    if not os.path.exists('./data/weights/'):
        os.makedirs('./data/weights/')
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    print("Loading Embedding Matrix")
    # np.load on an .npz keeps the archive open until closed, so read the
    # array inside a context manager to release the file handle.
    with np.load(embed_path) as npz:
        embeddings = npz['glove']

    encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size)

    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        print("Building Network ... ")
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        print("Load Training Data")
        dataset = initialize_datasets(FLAGS.data_dir,
                                      dataset='train',
                                      debugMode=True)
        print(80 * "=")
        print("Training")
        print(80 * "=")
        qa.train(sess, dataset, save_train_dir)
        print("Finished Training")
        print("Load Validation Data")
        # The training split is no longer needed; reuse the name for 'val'.
        dataset = initialize_datasets(FLAGS.data_dir,
                                      dataset='val',
                                      debugMode=True)
        print(80 * "=")
        print("Evaluation")
        print(80 * "=")
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def run_func(model_name):
    """Train the QASystem built for ``model_name`` on the SQuAD train/dev data.

    Args:
        model_name: Value passed straight to the ``QASystem`` constructor.
    """
    train_split = SquadData.load(config.SQUAD_TRAIN_PREFIX,
                                 size=config.TRAIN_SIZE)
    dev_split = SquadData.load(config.SQUAD_DEV_PREFIX,
                               size=config.EVAL_SIZE)

    system = QASystem(model_name)

    with tf.Session() as session:
        # Per the original note: loads a pretrained model if one exists,
        # otherwise creates a new one.
        system.initialize_model(session)
        system.train(session, [train_split, dev_split])
def main(_):
    """Load the train/val data, build the QA system and train it.

    The TensorFlow session is opened before the QASystem is constructed and
    stays open for the whole run.  Leaving a ``with tf.Session()`` block
    closes the session, so everything that needs ``sess`` must live inside
    the block rather than after it.
    """
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session opened upfront so the ifgpu flag is set before QASystem is
    # built; the context manager closes it when training finishes or fails.
    with tf.Session() as sess:
        # Flag GPU availability if any local device is a GPU.
        if any(device.device_type == 'GPU'
               for device in device_lib.list_local_devices()):
            FLAGS.ifgpu = True

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        # Send log records to <log_dir>/log.txt in addition to existing
        # handlers.
        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Persist the flag values next to the log.
        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
Esempio n. 8
0
def _read_id_rows(path):
    """Return the file at ``path`` as a list of int lists, one per line."""
    with open(path) as f:
        # A list comprehension (not map) so each row is a real, reusable
        # list on Python 3 as well as Python 2.
        return [[int(token) for token in line.split()]
                for line in f.read().splitlines()]


def main(_):
    """Load the train/val id files, then train and evaluate the QA system.

    For each of 'train' and 'val' the context, question and span files in
    FLAGS.data_dir are read into lists of int lists and stored in
    ``dataset[split]`` as a ``(contexts, questions, spans)`` tuple.
    """
    dataset = dict()
    for dataset_type in ['train', 'val']:
        data_context = _read_id_rows(
            os.path.join(FLAGS.data_dir, "%s.ids.context" % dataset_type))
        data_question = _read_id_rows(
            os.path.join(FLAGS.data_dir, "%s.ids.question" % dataset_type))
        data_span = _read_id_rows(
            os.path.join(FLAGS.data_dir, "%s.span" % dataset_type))
        dataset[dataset_type] = (data_context, data_question, data_span)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # `config` is not defined in this function; it is resolved from the
    # enclosing module scope.
    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)

    qa = QASystem(encoder, decoder, config=config)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
Esempio n. 9
0
def main(_):
    """Run training on the small 'test' split as a sanity check.

    Pads the test questions and contexts to fixed maximum lengths (40 and
    600), loads the answer spans, builds the QA system from the flags and
    calls ``qa.train`` with ``small_data_test=True``.
    """
    question_max_len = 40
    context_max_len = 600

    # Questions: padded ids plus the per-example sequence lengths.
    q_file = pjoin(FLAGS.data_dir, "test.ids.question")
    test_q_data, test_q_seq_len = pad_sentences(q_file, question_max_len)
    q_too_long = test_q_seq_len > test_q_data.shape[1]
    assert not any(
        q_too_long
    ), 'Some questions have length greater than max question length'

    # Contexts: same treatment with the context length limit.
    c_file = pjoin(FLAGS.data_dir, "test.ids.context")
    test_c_data, test_c_seq_len = pad_sentences(c_file, context_max_len)
    c_too_long = test_c_seq_len > test_c_data.shape[1]
    assert not any(
        c_too_long
    ), 'Some contexts have length greater than max context length'

    span_file = pjoin(FLAGS.data_dir, "test.span")
    test_s_e_id = get_answer_span(span_file, context_max_len)

    dataset = {
        'test': [
            test_q_data, test_q_seq_len, test_c_data, test_c_seq_len,
            test_s_e_id
        ]
    }

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    _vocab, _rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer, FLAGS.max_gradient_norm)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        qa.train(sess, dataset['test'], save_train_dir,
                 small_data_test=True)
Esempio n. 10
0
def main(_):
    """Train the QA system on the 'train' split, evaluating on 'val'.

    Sets up file logging, loads the vocabulary and the train/val/dev
    splits, exposes FLAGS.train_dir through a fixed symlink, writes
    ``flags.json`` and then builds and trains the model inside a fresh
    graph and session.
    """
    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    logger = logging.getLogger()

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Materialize the zipped examples: on Python 3 zip() returns a one-shot
    # iterator that can be neither concatenated nor traversed twice.
    train_data = list(zip(*load_data(FLAGS.data_dir, "train")))
    val_data = list(zip(*load_data(FLAGS.data_dir, "val")))
    # Loaded but not currently fed into training (see the option below).
    dev_data = list(zip(*load_data(FLAGS.data_dir, "dev")))

    #model_train_data = train_data + val_data + dev_data
    model_train_data = train_data
    model_eval_data = val_data

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    #
    # lexists() is used instead of exists(): exists() follows the link and
    # reports False for a dangling symlink left by an earlier run, which
    # would make os.symlink() below fail because the link name is taken.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Graph().as_default():
        with tf.Session() as sess:
            logger.info("Loading embeddings")
            # np.load on an .npz keeps the archive open until closed, so
            # read the array inside a context manager.
            with np.load(FLAGS.data_dir + '/glove.trimmed.' +
                         str(FLAGS.embedding_size) + '.npz') as embeddings:
                pretrained_embeddings = embeddings['glove']
            logger.info("Embeddings loaded with shape: %s %s" %
                        (pretrained_embeddings.shape))

            qa = QASystem(FLAGS,
                          pretrained_embeddings,
                          vocab_dim=len(vocab))
            initialize_model(sess, qa, train_dir)
            qa.train(sess, model_train_data, model_eval_data)
Esempio n. 11
0
def main(_):
    """Train the QA system on the tokenized train set, validating on dev.

    Both datasets are loaded with ``load_dataset`` using FLAGS.batch_size.
    FLAGS.embed_path is filled in with a default when unset, because the
    QASystem receives the whole FLAGS object.
    """
    # Training files.
    question_path = pjoin(FLAGS.data_dir, "data_train/train_tokenH.txt")
    paragraph_path = pjoin(FLAGS.data_dir, "data_train/train_tokenP.txt")
    answer_path = pjoin(FLAGS.data_dir, "data_train/train_index.txt")

    # Validation files.
    val_question_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenH.txt")
    val_paragraph_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenP.txt")
    val_answer_path = pjoin(FLAGS.data_dir, "data_dev/dev_index.txt")

    batch_size = FLAGS.batch_size
    dataset = load_dataset(question_path, paragraph_path, answer_path,
                           batch_size)
    val_dataset = load_dataset(val_question_path, val_paragraph_path,
                               val_answer_path, FLAGS.batch_size)

    # Default embedding location is written back onto FLAGS.
    FLAGS.embed_path = FLAGS.embed_path or pjoin("data", "sgns.merge.word.npz")
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.txt")
    # Per the original note: one of the two results is a list, the other a
    # dict.
    _vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, FLAGS)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, val_dataset, save_train_dir, rev_vocab)
Esempio n. 12
0
def main(_):
    """Train the QA system, or only validate it.

    Validation-only mode is selected when the first command-line argument
    is the literal ``val_only``; in that mode the training set is not
    loaded and its size is reported to the QASystem as 0.
    """
    test_device_placement()

    # Change FLAGS.sample_data_prepend to an empty string to train on the
    # full data set.
    prepend = FLAGS.sample_data_prepend

    val_only = len(sys.argv) > 1 and sys.argv[1] == 'val_only'

    print('Reading data')
    print('==================')

    val_set = fetch_data_set(prepend, 'val')
    if not val_only:
        tr_set = fetch_data_set(prepend, 'train')
        tr_set_size = tr_set[0].shape[0]
    else:
        tr_set_size = 0

    print('Finished reading data')
    print('==================')

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")

    # Per the original note: vocab maps words to indices and rev_vocab is
    # the word list in reverse frequency order.
    vocab, rev_vocab = initialize_vocab(vocab_path)
    idx_word = data_utils.invert_map(vocab)

    # Only the inverted map is needed from here on.
    del vocab, rev_vocab

    qa = QASystem(FLAGS, embed_path, idx_word, False, tr_set_size, False)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        if not val_only:
            qa.train(sess, tr_set, val_set, FLAGS.train_dir)
        else:
            print('Running validation only')
            qa.validate(sess, val_set, 'validation')
Esempio n. 13
0
File: train.py Progetto: truncs/QA
def main(_):
    """Train and evaluate the QA system using hparams derived from FLAGS.

    The dataset is passed around as a tuple of file paths
    ``(context, question, span)`` for the training split rather than as
    loaded data.  The embedding matrix is loaded as float32 and handed to
    ``qa.train``.
    """
    # TODO maybe pass as loaded dataset abstraction instead of
    # file_paths?

    default_hparams = create_hparams(FLAGS)

    context_file_path = FLAGS.data_dir + '/train.ids.context'
    question_file_path = FLAGS.data_dir + '/train.ids.question'
    span_file_path = FLAGS.data_dir + '/train.span'
    dataset = (context_file_path, question_file_path, span_file_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    default_hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, default_hparams)

    # Setup embeddings
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    # np.load on an .npz keeps the archive open until closed, so read the
    # array inside a context manager to release the file handle.
    with np.load(embed_path) as npz:
        np_embeddings = np.float32(npz['glove'])

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)

        # NOTE(review): evaluation runs on the same training-file tuple
        # that was used for training - confirm this is intended.
        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Esempio n. 14
0
def main(_):
    """Train on train+val data and evaluate on the val data.

    The validation examples are appended to the training list before
    training.  Model weights are saved under a timestamped
    ``results/<YYYYmmdd_HHMMSS>/model.weights/`` directory rather than under
    FLAGS.train_dir.
    """
    train_examples = initialize_datasets(FLAGS.data_dir,
                                         'train.',
                                         debugMode=False)
    val_examples = initialize_datasets(FLAGS.data_dir, 'val.',
                                       debugMode=False)
    train_examples.extend(val_examples)

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    _vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Model construction is slow, so report how long it takes.
    setup_started = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - setup_started)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        # Directory that receives the saved model for this run.
        results_path = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = results_path + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)

        qa.train(sess, train_examples, save_train_dir)

        qa.evaluate_answer(sess, val_examples, rev_vocab, sample=1000,
                           log=True)
Esempio n. 15
0
def main(_):
    """Train the QA system, passing hyper-parameters as keyword arguments.

    The QASystem is configured through explicit keyword arguments taken
    from FLAGS instead of receiving the FLAGS object; the dropout flag is
    converted to a keep probability (``1.0 - FLAGS.dropout``).
    """
    dataset = load_dataset()

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    _vocab, _rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)

    system_kwargs = {
        "embed_path": embed_path,
        "embedding_size": FLAGS.embedding_size,
        "output_size": FLAGS.output_size,
        "optimizer": FLAGS.optimizer,
        "learning_rate": FLAGS.learning_rate,
        "epochs": FLAGS.epochs,
        "batch_size": FLAGS.batch_size,
        "max_gradient_norm": FLAGS.max_gradient_norm,
        "dropout_keep_prob": 1.0 - FLAGS.dropout,
        "train_dir": FLAGS.train_dir,
        "state_size": FLAGS.state_size,
    }
    qa = QASystem(encoder, decoder, **system_kwargs)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    # Record the flag values both on stdout and on disk.
    print(vars(FLAGS))
    flags_json = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_json, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file)

    with tf.Session() as sess:
        restore_from = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(restore_from)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
Esempio n. 16
0
def main(_):
    """Train and evaluate the QA system on the SQuAD training id files.

    Reads ``train.ids.context``, ``train.ids.question`` and ``train.span``
    from ``data/squad`` into numpy arrays and packs them into a dict with
    the keys 'context', 'question', 'answer_span_start' and
    'answer_span_end'.  In this variant flags are not dumped to
    ``flags.json`` and FLAGS.train_dir is used without normalization.
    """

    def read_id_rows(handle):
        # One row of ints per line of the open file.
        return np.array([[int(token) for token in row.split()]
                         for row in handle.readlines()])

    context_file = pjoin("data", "squad", "train.ids.context")
    question_file = pjoin("data", "squad", "train.ids.question")
    span_file = pjoin("data", "squad", "train.span")
    with open(context_file, encoding='utf-8') as fcontext, \
            open(question_file, encoding='utf-8') as fquestion, \
            open(span_file, encoding='utf-8') as fspan:
        context = read_id_rows(fcontext)
        question = read_id_rows(fquestion)
        ans = read_id_rows(fspan)
        # First span column is the answer start, second the answer end.
        span_start = ans[:, 0]
        span_end = ans[:, 1]
        dataset = {
            'context': context,
            'question': question,
            'answer_span_start': span_start,
            'answer_span_end': span_end,
        }

    # An explicit flag wins; otherwise fall back to the default locations.
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            "data", "squad",
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path
    if not vocab_path:
        vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    _vocab, _rev_vocab = initialize_vocab(vocab_path)

    embeddings = load_embeddings(embed_path)
    qa = QASystem(embeddings, FLAGS)

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)

    with tf.Session() as sess:
        # The raw train_dir serves both for restoring and for saving.
        load_train_dir = FLAGS.train_dir
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = FLAGS.train_dir
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)
Esempio n. 17
0
def main(_):
    """Train the QA system with fixed random seeds.

    Loads the training and validation data, seeds TensorFlow and NumPy,
    builds the model with a ``SimpleLinearDecoder`` and trains it.  When the
    ``CS224N_CRIS`` environment variable is set, the session is created
    with ``intra_op_parallelism_threads=1``.
    """
    logging.info("Loading training data")
    dataset_train = load_train_data(FLAGS.data_dir, isValidation=False)
    logging.info("Loading validation data")
    dataset_val = load_train_data(FLAGS.data_dir, isValidation=True)

    logging.info("Building Model Graph")
    tf.set_random_seed(42)
    np.random.seed(43)

    select_test(0)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = SimpleLinearDecoder()  # AnswerPointerDecoder()

    qa = QASystem(encoder, decoder, len(dataset_train[0]))

    # Send log records to <log_dir>/log.txt in addition to existing handlers.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values in the log and on disk.
    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    cris_flag = os.environ.get('CS224N_CRIS')

    if cris_flag is not None:
        logging.info('hi cris')
        sess = tf.Session(
            config=tf.ConfigProto(intra_op_parallelism_threads=1))
    else:
        sess = tf.Session()

    # as_default() does not close the session on exit, so close it
    # explicitly; the finally clause covers failures during training too.
    try:
        with sess.as_default():
            load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                      or FLAGS.train_dir)
            initialize_model(sess, qa, load_train_dir)

            save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
            qa.train(sess, dataset_train, dataset_val, save_train_dir)
    finally:
        sess.close()
Esempio n. 18
0
def main(_):
    """Build the encoder/matcher/decoder QA system, train it and evaluate it.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {
        "train": load_data(FLAGS.data_dir, mode="train"),
        "val": load_data(FLAGS.data_dir, mode="val"),
    }

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    matcher = Matcher(perspective_dim=25,
                      input_size=FLAGS.state_size)  # add flag
    decoder = Decoder(output_size=FLAGS.output_size,
                      state_size=FLAGS.state_size,
                      n_perspective_dim=50 * 2)  # add flag

    qa = QASystem(encoder, matcher, decoder,
                  vocab=vocab, vocab_dim=FLAGS.embedding_size,
                  rev_vocab=rev_vocab, embed_path=embed_path)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        # Run the global initializer BEFORE initialize_model: running it
        # afterwards would reset any weights that initialize_model restored.
        # NOTE(review): initialize_model is defined elsewhere; this assumes it
        # may load a checkpoint from load_train_dir -- confirm.
        tf.global_variables_initializer().run()
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        # Write the graph definition to "qa-graph" and close the writer so the
        # event file is flushed and its handle released.
        graph_writer = tf.summary.FileWriter("qa-graph")
        graph_writer.add_graph(sess.graph)
        graph_writer.close()

        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, 500, log=True)
Esempio n. 19
0
def main(_):
    """Build the encoder/decoder QA system, train it and evaluate it.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Read the 'glove' array inside a context manager so the .npz archive is
    # closed as soon as the matrix has been loaded.
    with np.load(embed_path) as embedding_archive:
        pretrained_embeddings = embedding_archive['glove']

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      pretrained_embeddings=pretrained_embeddings,
                      max_question_length=FLAGS.max_question_length,
                      max_context_length=FLAGS.max_context_length)
    decoder = Decoder(output_size=FLAGS.output_size,
                      size=FLAGS.state_size,
                      max_context_length=FLAGS.max_context_length)

    qa = QASystem(encoder, decoder)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Esempio n. 20
0
def main(_):
    """Load the train/val id files, build the QA system and train it.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    train_files = [
        "embedding/train.ids.question", "embedding/train.ids.context",
        "data/train.span"
    ]
    val_files = [
        "embedding/val.ids.question", "embedding/val.ids.context",
        "data/val.span"
    ]
    # dataset[0] is the training split, dataset[1] the validation split.
    dataset = [load_dataset(train_files), load_dataset(val_files)]

    glove_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = pjoin("embedding", glove_name)
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Start from an empty default graph before creating the model components.
    tf.reset_default_graph()
    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)
    embedding = np.load(embed_path)["glove"]

    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    # Create the checkpoint and log directories when they are missing.
    for directory in (FLAGS.train_dir, FLAGS.log_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Mirror log records into <log_dir>/log.txt.
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)
Esempio n. 21
0
def main(_):
    """Build the QA system, canonicalize the train directory, train and evaluate.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    #
    # lexists() (not exists()) is required here: exists() follows the link and
    # reports False for a dangling symlink, which would leave the stale link in
    # place and make os.symlink() below fail because the path already exists.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)

        qa.train(sess, dataset)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
Esempio n. 22
0
def main(_):
    """Load embeddings and data, build the QA system, train it and evaluate it.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Only the array read needs the archive open; the matrix is converted to a
    # float32 array here so the file can be closed before training starts.
    # np.asarray(..., dtype=np.float32) replaces np.asfarray, which was removed
    # in NumPy 2.0 and is equivalent for a floating-point dtype.
    with np.load(embed_path) as data:
        glove_embeddings = np.asarray(data["glove"], dtype=np.float32)

    dataset = load_and_preprocess_data()

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size, config=FLAGS)
    decoder = Decoder(output_size=FLAGS.output_size, config=FLAGS)

    qa = QASystem(encoder, decoder, embeddings=glove_embeddings, config=FLAGS, vocab=(vocab, rev_vocab))

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, FLAGS.evaluate, log=True)
Esempio n. 23
0
def run_func():
    """Build the QA system from a fresh ``Config`` and train it on train/dev data."""
    config = Config()

    # Training and development splits, each built from question/context/answer inputs.
    train = squad_dataset(config.question_train,
                          config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev,
                        config.context_dev,
                        config.answer_dev)

    glove_file = config.embed_path
    vocab_file = config.vocab_path
    vocab, rev_vocab = initialize_vocab(vocab_file)

    embeddings = get_trimmed_glove_vectors(glove_file)

    hidden_size = config.hidden_state_size
    encoder = Encoder(hidden_size)
    decoder = Decoder(hidden_size)

    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        # Load a pretrained model if one exists, otherwise create a new one.
        qa.initialize_model(sess, config.train_dir)
        splits = [train, dev]
        qa.train(sess, splits, config.train_dir)
def main(_):
    """Load the dataset, build the QA system and train it.

    Reads its configuration from the module-level ``FLAGS`` and writes the
    resolved embedding path back to ``FLAGS.embed_path``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = get_dataset(FLAGS.data_dir,
                          FLAGS.max_question_size,
                          FLAGS.max_paragraph_size)

    default_embed_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", default_embed_name)
    # Store the resolved path back on FLAGS.
    FLAGS.embed_path = embed_path
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size, FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)

    qa = QASystem(encoder, decoder, FLAGS)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        # NOTE: the raw (un-normalized) directories are used here; switch both
        # back to get_normalized_train_dir(...) for the final submission.
        load_train_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", load_train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = FLAGS.train_dir
        print("save_train_dir: ", save_train_dir)
        qa.train(sess, dataset, save_train_dir, rev_vocab)
Esempio n. 25
0
def main(_):
    """Build the coattention encoder / dynamic decoder QA system and train it.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    default_embed_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", default_embed_name)
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Training and validation paths both point at the same directory.
    train_path = 'data/squad'
    val_path = 'data/squad'

    config = Config(embed_path, train_path, val_path)

    encoder = EncoderCoattention(config)
    decoder = DecoderDynamic(config)

    qa = QASystem(encoder, decoder, config)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # The initializer op is created before the session and run first inside it.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        load_source = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(load_source)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, save_train_dir)
def main(_):
    """Collect the train/val data file paths, build the QA system and train it.

    Reads its configuration from the module-level ``FLAGS``. The positional
    argument is ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    # The dataset is a list of six file paths: train question/context/span
    # followed by val question/context/span.
    data_files = (
        "train.ids.question",
        "train.ids.context",
        "train.span",
        "val.ids.question",
        "val.ids.context",
        "val.span",
    )
    dataset = [pjoin(FLAGS.data_dir, file_name) for file_name in data_files]

    default_embed_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", default_embed_name)
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.paragraph_output_size)

    qa = QASystem(encoder, decoder)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_source = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(load_source)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
Esempio n. 27
0
def main(_):
    """Load embeddings and SQuAD splits, train the QA system and evaluate it.

    Configuration comes from a fresh ``Config`` instance. The positional
    argument is ignored.
    """
    config = Config()

    embed_path = config.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(config.embed_size))
    # The context manager closes the .npz archive even if the 'glove' lookup
    # raises, which the previous manual close() did not guarantee.
    with np.load(embed_path) as embedding_archive:
        embeddings = embedding_archive['glove']

    # Number of rows in the embedding matrix, passed to load_squad below.
    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir, max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir, max_samples=config.max_val_samples)

    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # print_samples(train,1, rev_vocab) #train is triplets of (context,question,answer)
    encoder = Encoder(state_size=config.hidden_size, embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size, embedding_size=config.embed_size)

    qa = QASystem(encoder, decoder)
    with tf.Session() as sess:
        load_train_dir = (config.load_train_dir or config.train_dir)  # put "" here if you want to build a new model
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = config.train_dir
        ds_train = qa.pad_sequences(train)
        # Unpacking is kept from the original; the three names are not used
        # afterwards, but it requires ds_train to hold exactly three items.
        ret_q, ret_p, ret_labels = ds_train
        qa.train(sess, ds_train, save_train_dir)
        ds_val = qa.pad_sequences(val)

        print('train error')
        qa.evaluate_answer(sess, ds_train, log=True)

        print('val error')
        qa.evaluate_answer(sess, ds_val, log=True)
Esempio n. 28
0
def main(_):
    """Select a numbered configuration, then train and evaluate the QA system.

    The configuration number is taken from ``sys.argv[1]`` and stored on
    ``FLAGS.config``; it also suffixes the log and flags file names. The
    positional argument is ignored.
    """
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_data(FLAGS.data_dir)  # ((question, context), answer)
    train_data = preprocess_dataset(
        dataset['train'], FLAGS.output_size, FLAGS.question_size)
    val_data = preprocess_dataset(
        dataset['val'], FLAGS.output_size, FLAGS.question_size)

    default_embed_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", default_embed_name)
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    # Per-configuration suffix shared by the log file and the flags dump.
    config_suffix = '_config_' + str(FLAGS.config)

    log_name = "log" + config_suffix + ".txt"
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, log_name))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    flags_name = "flags" + config_suffix + ".json"
    with open(os.path.join(FLAGS.log_dir, flags_name), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_source = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(load_source)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
        qa.evaluate_answer(sess, train_data, val_data, FLAGS.evaluate, log=True)
def main(_):
    """Configure the run from command-line arguments, then train and evaluate.

    Settings come from the module-level ``cfg`` after ``update_config``. When
    ``args.test`` is set, the debugger is entered first and training is called
    with ``debug_num=100``. The positional argument is ignored.
    """
    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    args = parse_arg()
    update_config(args, c_time)
    logging.info(cfg)
    if args.test:
        pdb.set_trace()

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    answers = read_answers(data_dir)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # Only the path is built here; it is handed to QASystem below.
    embed_path = pjoin(data_dir, "glove.trimmed." + str(cfg.embed_size) + ".npz")

    # Create the log, cache and figure directories when they are missing.
    for directory in (cfg.log_dir, cfg.cache_dir, cfg.fig_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Mirror log records into a timestamped file under cfg.log_dir.
    log_file = pjoin(cfg.log_dir, 'log' + c_time + '.txt')
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)

    print_parameters()

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    hidden_width = 2 * cfg.lstm_num_hidden
    encoder = Encoder(size=hidden_width)
    decoder = Decoder(output_size=hidden_width)
    qa = QASystem(encoder, decoder, embed_path)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(cfg.train_dir)

        logging.info('=========== trainable varaibles ============')
        for variable in tf.trainable_variables():
            logging.info(variable.name)
        logging.info('=========== regularized varaibles ============')
        for loss_term in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
            logging.info(loss_term.name)

        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(cfg.train_dir)

        # Both modes share the same call; test mode only adds debug_num.
        train_kwargs = {'raw_answers': raw_answers}
        if args.test:
            train_kwargs['debug_num'] = 100
        train_kwargs['rev_vocab'] = rev_vocab
        qa.train(cfg.start_lr, sess, dataset, answers, save_train_dir,
                 **train_kwargs)

        qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                           log=True,
                           training=True,
                           sample=4000)
Esempio n. 30
0
def main(_):
    """Pad the train/val id files, build the QA system, then train and evaluate per epoch.

    Reads its configuration from the module-level ``FLAGS``. Questions are
    padded to 40 entries and contexts to 600. The positional argument is
    ignored.
    """
    # Do what you need to load datasets from FLAGS.data_dir
    question_max_len = 40
    context_max_len = 600

    # ---- training split ----
    train_q_data, train_q_seq_len = pad_sentences(
        pjoin(FLAGS.data_dir, "train.ids.question"), question_max_len)
    assert not any(
        train_q_seq_len > train_q_data.shape[1]
    ), 'Some questions have length greater than max question length'

    train_c_data, train_c_seq_len = pad_sentences(
        pjoin(FLAGS.data_dir, "train.ids.context"), context_max_len)
    assert not any(
        train_c_seq_len > train_c_data.shape[1]
    ), 'Some contexts have length greater than max context length'

    train_s_e_id = get_answer_span(
        pjoin(FLAGS.data_dir, "train.span"), context_max_len)

    # ---- validation split (no length assertions here) ----
    val_q_data, val_q_seq_len = pad_sentences(
        pjoin(FLAGS.data_dir, "val.ids.question"), question_max_len)
    val_c_data, val_c_seq_len = pad_sentences(
        pjoin(FLAGS.data_dir, "val.ids.context"), context_max_len)
    val_s_e_id = get_answer_span(
        pjoin(FLAGS.data_dir, "val.span"), context_max_len)

    # Each split is [questions, question lengths, contexts, context lengths, spans].
    dataset = {
        'train': [
            train_q_data, train_q_seq_len, train_c_data, train_c_seq_len,
            train_s_e_id
        ],
        'val': [
            val_q_data, val_q_seq_len, val_c_data, val_c_seq_len, val_s_e_id
        ],
    }

    default_embed_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", default_embed_name)
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer,
                  FLAGS.max_gradient_norm)

    # Mirror log records into <log_dir>/log.txt.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    log_file = pjoin(FLAGS.log_dir, "log.txt")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)

    # Record the flag values used for this run.
    print(vars(FLAGS))
    flags_file = os.path.join(FLAGS.log_dir, "flags.json")
    with open(flags_file, 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_source = FLAGS.load_train_dir or FLAGS.train_dir
        load_train_dir = get_normalized_train_dir(load_source)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        # One training call followed by one validation evaluation per epoch.
        for epoch in range(FLAGS.epochs):
            qa.train(sess, dataset['train'], save_train_dir)
            print('Finish training epoch {}'.format(epoch))
            qa.evaluate_answer(sess, dataset['val'])
Esempio n. 31
0
def main(_):

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    context_ids_path = pjoin(FLAGS.data_dir, "train.ids.context")
    question_ids_path = pjoin(FLAGS.data_dir, "train.ids.question")
    answer_span_path = pjoin(FLAGS.data_dir, "train.span")
    val_context_ids_path = pjoin(FLAGS.data_dir, "val.ids.context")
    val_question_ids_path = pjoin(FLAGS.data_dir, "val.ids.question")
    val_answer_span_path = pjoin(FLAGS.data_dir, "val.span")
    context_path = pjoin(FLAGS.data_dir, "train.context")
    val_context_path = pjoin(FLAGS.data_dir, "val.context")

    context_ids = initialize_data(context_ids_path)
    question_ids = initialize_data(question_ids_path)
    answer_spans = initialize_data(answer_span_path)
    context = initialize_data(context_path, keep_as_string=True)
    val_context_ids = initialize_data(val_context_ids_path)
    val_question_ids = initialize_data(val_question_ids_path)
    val_answer_spans = initialize_data(val_answer_span_path)
    val_context = initialize_data(val_context_path, keep_as_string=True)

    # TODO: check this clipping, especially the answer

    # Reducing context length to the specified max in FLAGS.output_size

    paragraph_lengths = []
    # question_lengths = []
    for i in range(0, len(context_ids)):
        paragraph_lengths.append(len(context_ids[i]))
        context_ids[i] = context_ids[i][:FLAGS.output_size]
        context[i] = context[i][:FLAGS.output_size]
        answer_spans[i] = np.clip(answer_spans[i], 0, FLAGS.output_size - 1)
        question_ids[i] = question_ids[i][:FLAGS.question_size]
    for j in range(0, len(val_context_ids)):
        paragraph_lengths.append(len(val_context_ids[j]))
        val_context_ids[j] = val_context_ids[j][:FLAGS.output_size]
        val_context[j] = val_context[j][:FLAGS.output_size]
        val_answer_spans[j] = np.clip(val_answer_spans[j], 0,
                                      FLAGS.output_size - 1)
        val_question_ids[j] = val_question_ids[j][:FLAGS.question_size]

    embeddings = initialize_embeddings(embed_path)

    max_ctx_len = max(max(map(len, context_ids)),
                      max(map(len, val_context_ids)))
    max_q_len = max(max(map(len, question_ids)),
                    max(map(len, val_question_ids)))

    assert max_ctx_len == FLAGS.output_size, "MISMATCH BETWEEN MAX_CTX_LEN AND FLAGS.OUTPUT_SIZE: " + str(
        max_ctx_len) + ", " + str(FLAGS.output_size)

    context_ids, ctx_mask = pad(context_ids, FLAGS.output_size)
    question_ids, q_mask = pad(question_ids, FLAGS.question_size)
    val_context_ids, val_ctx_mask = pad(val_context_ids, FLAGS.output_size)
    val_question_ids, val_q_mask = pad(val_question_ids, FLAGS.question_size)

    context_ids = np.array(context_ids)
    question_ids = np.array(question_ids)
    answer_spans = np.array(answer_spans)
    ctx_mask = np.array(ctx_mask)
    q_mask = np.array(q_mask)

    val_context_ids = np.array(val_context_ids)
    val_question_ids = np.array(val_question_ids)
    val_answer_spans = np.array(val_answer_spans)
    val_ctx_mask = np.array(val_ctx_mask)
    val_q_mask = np.array(val_q_mask)

    check_pad(context_ids, ctx_mask)
    print("CONTEXT IDS PADDED AND CHECKED")
    check_pad(question_ids, q_mask)
    print("QUESTION IDS PADDED AND CHECKED")
    check_pad(val_context_ids, val_ctx_mask)
    print("VAL CONTEXT IDS PADDED AND CHECKED")
    check_pad(val_question_ids, val_q_mask)
    print("VAL QUESTION IDS PADDED AND CHECKED")

    dataset = [
        context_ids, question_ids, answer_spans, ctx_mask, q_mask, context
    ]
    val_dataset = [
        val_context_ids, val_question_ids, val_answer_spans, val_ctx_mask,
        val_q_mask, val_context
    ]
    assert len(vocab) == embeddings.shape[
        0], "Mismatch between embedding shape and vocab length"
    assert embeddings.shape[
        1] == FLAGS.embedding_size, "Mismatch between embedding shape and FLAGS"
    assert len(context_ids) == len(question_ids) == len(
        answer_spans
    ), "Mismatch between context, questions, and answer lengths"

    print("Using model type : {}".format(FLAGS.model_type))

    qa = QASystem(pretrained_embeddings=embeddings, flags=FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        saver = tf.train.Saver()

        qa.train(session=sess,
                 dataset=dataset,
                 val_dataset=val_dataset,
                 train_dir=save_train_dir)