def run_func2(dataset, config):
    """Run the trained QA model on *dataset* and write one answer per line.

    Args:
        dataset: iterable of (question, context, answer) triples (token-id lists).
        config: project Config providing vocab_path, embed_path,
            hidden_state_size, and train_dir.

    Output goes to 'dev-prediction.txt' in the working directory, one decoded
    answer per line.
    """
    vocab, rev_vocab = initialize_vocab(config.vocab_path)
    # Transpose the triples into three parallel tuples (questions, contexts,
    # answers); the original wrapped this in a redundant list comprehension.
    q, c, a = zip(*dataset)
    dataset = [q, c, a]

    embeddings = get_trimmed_glove_vectors(config.embed_path)
    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    # Positional indices stand in for question uuids here.
    question_uuid_data = list(xrange(len(a)))

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, answers_canonical = generate_answers(
            sess, qa, dataset, question_uuid_data, rev_vocab)

    # Write one decoded answer per line (plain text, not JSON).
    with io.open('dev-prediction.txt', 'w', encoding='utf-8') as f:
        for i in xrange(len(a)):
            f.write("%s\n" % unicode(answers[i], "utf-8"))
def run_func():
    """Load the fuse dev set, run the trained QA model, and dump answers as JSON.

    Reads 'data/squad/fuse.json', generates an answer for every question, and
    writes the result to 'temp/fuse-answer.json'.
    """
    config = Config()

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    vocab, rev_vocab = initialize_vocab(config.vocab_path)
    dev_path = "data/squad/fuse.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # Placeholder (start, end) spans; real answers come from the model below.
    answers = [[0, 0] for _ in xrange(len(question_data))]
    dataset = [question_data, context_data, answers]

    embeddings = get_trimmed_glove_vectors(config.embed_path)
    qa = QASystem(Encoder(config.hidden_state_size),
                  Decoder(config.hidden_state_size),
                  embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset,
                                      question_uuid_data, rev_vocab)
        # write to json file to root dir
        with io.open('temp/fuse-answer.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def run_func():
    """Run the trained QA model on the test set and write submission.csv.

    Output format: a header row "Id,Answer" followed by one
    "<uuid>,<normalized answer>" row per question.
    """
    config = Config()
    vocab, rev_vocab = initialize_vocab(config.vocab_path)
    dev_path = "download/squad/test.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    # Placeholder (start, end) spans; the model fills in real answers below.
    answers = [[0, 0] for _ in xrange(len(question_data))]
    dataset = [question_data, context_data, answers]

    embeddings = get_trimmed_glove_vectors(config.embed_path)
    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        # answers maps question uuid -> answer text (indexed by key below).
        answers, _ = generate_answers(sess, qa, dataset,
                                      question_uuid_data, rev_vocab)
        # Build rows in a list and join once (linear) instead of the original
        # quadratic `data += ...` string accumulation.
        rows = ["Id,Answer"]
        for uuid, ans in answers.items():
            # .replace(" s ", "s ") re-attaches a possessive/plural 's'
            # that tokenization split off.
            rows.append(uuid + "," + normalize_answer(ans).replace(" s ", "s "))
        # 'out' instead of 'file': don't shadow the builtin.
        with open('submission.csv', 'wb') as out:
            out.write("\n".join(rows) + "\n")
def predict(model_name):
    """Interactive loop: read questions from stdin and print the model's answers.

    Args:
        model_name: name of the saved model to load into QASystem.

    Each answer is printed as "<confidence>:\t<answer> (<doc>)" for the
    top 10 candidates. Loops until interrupted.
    """
    qa = QASystem(model_name)
    with tf.Session() as sess:
        qa.initialize_model(sess)
        while True:
            question = input("Ask a question: ")
            results = answer_question(qa, sess, question, best_n=10)
            for answer, confidence, doc in results:
                print('{:.2f}:\t{} ({})'.format(confidence, answer, doc))
def run_func(model_name):
    """Train the QA model *model_name* on the SQuAD train/dev splits."""
    train = SquadData.load(config.SQUAD_TRAIN_PREFIX, size=config.TRAIN_SIZE)
    dev = SquadData.load(config.SQUAD_DEV_PREFIX, size=config.EVAL_SIZE)

    qa = QASystem(model_name)
    with tf.Session() as sess:
        # Load a pretrained model if it exists, otherwise create a new one.
        qa.initialize_model(sess)
        qa.train(sess, [train, dev])
def run_func():
    """Interactive QA: read a question and a context, print the predicted span.

    Type 'exit' at either prompt to quit. Tokens missing from the vocabulary
    are mapped to id 2 (presumably the <unk> token — confirm against the
    vocab file).
    """
    config = Config()
    # Datasets are loaded as in the training path; training/eval calls below
    # are intentionally left disabled.
    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)

    vocab, rev_vocab = initialize_vocab(config.vocab_path)
    embeddings = get_trimmed_glove_vectors(config.embed_path)

    encoder = Encoder(config.hidden_size)
    decoder = Decoder(config.hidden_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    UNK_ID = 2  # id used for out-of-vocabulary tokens

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)
        # train process
        # qa.train(sess, [train, dev], config.train_dir)
        # em = qa.evaluate_model(sess, dev)
        # run process
        while True:
            question = input('please input question: ')
            if question == 'exit':
                break
            raw_context = input('please input context: ')
            if raw_context == 'exit':
                break
            # dict.get replaces `vocab[x] if x in vocab.keys() else 2`:
            # one lookup instead of two, and no O(n) keys() scan (Py2).
            question_ids = [vocab.get(x, UNK_ID) for x in question.split()]
            context_ids = [vocab.get(x, UNK_ID) for x in raw_context.split()]
            test = [[question_ids], [context_ids], [[1, 2]]]
            a_s, a_e = qa.answer(sess, test)
            tokens = raw_context.split()  # hoisted: was re-split per print
            if a_e == a_s:
                print("answer: ", tokens[a_s[0]])
            else:
                print("answer: ", ' '.join(tokens[a_s[0]:a_e[0] + 1]))
def evaluate(model_name, n=None):
    """Evaluate *model_name* on TREC questions; print top-1/5/10 match rates.

    Args:
        model_name: name of the saved model directory under config.MODELS_DIR.
        n: number of questions to evaluate; defaults to all available.

    Each question's candidate answers are written to
    <MODELS_DIR>/<model_name>/trec.csv, and an answer counts as correct when
    it matches the question's regex answer pattern.
    """
    # NOTE(review): the first 199 rows are skipped — presumably a held-out or
    # header region of the TREC file; confirm against the data layout.
    data = []
    with open(config.TREC_PATH, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in list(reader)[199:]:
            data.append((row[2].strip(), row[3].strip()))
    if not n:
        n = len(data)

    qa = QASystem(model_name)
    top_count = 0
    top_5_count = 0
    top_10_count = 0

    with tf.Session() as sess:
        qa.initialize_model(sess)
        out_path = os.path.join(config.MODELS_DIR, model_name, 'trec.csv')
        with open(out_path, 'w') as f:
            writer = csv.writer(f)
            # enumerate(start=1) replaces the hand-rolled `i = 0; i += 1`.
            for i, (question, answer_pattern) in enumerate(tqdm(data[:n]), 1):
                answers = [answer for answer, confidence, doc
                           in answer_question(qa, sess, question, 10)]
                writer.writerow(answers)
                correct = [bool(re.search(answer_pattern, answer))
                           for answer in answers]
                # any() on a slice replaces `True in correct[:k]` and is safe
                # even if fewer than k answers came back.
                if any(correct[:1]):
                    top_count += 1
                if any(correct[:5]):
                    top_5_count += 1
                if any(correct[:10]):
                    top_10_count += 1
                # Running accuracies after i questions.
                print('{}: {}, {}, {}'.format(i, float(top_count) / i,
                                              float(top_5_count) / i,
                                              float(top_10_count) / i))

    print('Top match: {}'.format(float(top_count) / n))
    print('Top 5 match: {}'.format(float(top_5_count) / n))
    print('Top 10 match: {}'.format(float(top_10_count) / n))
def run_func():
    """Train the QA system on the SQuAD train/dev splits from Config."""
    config = Config()

    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)

    vocab, rev_vocab = initialize_vocab(config.vocab_path)
    embeddings = get_trimmed_glove_vectors(config.embed_path)

    qa = QASystem(Encoder(config.hidden_state_size),
                  Decoder(config.hidden_state_size),
                  embeddings, config)

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)
        qa.train(sess, [train, dev], config.train_dir)