Esempio n. 1
0
    def get_q(self, ctxt, ans, ans_pos):
        """Build a question for the answer ``ans`` found in context ``ctxt``.

        The context is first narrowed with ``preprocessing.filter_context`` and
        parsed with ``self.nlp`` (presumably a spaCy pipeline -- the tokens are
        read through ``pos_``, ``tag_``, ``ent_type_`` and ``ent_iob_``). The
        question is assembled by ``self.format_q`` from:

        * the most frequent entity type among the answer's tokens,
        * the nearest non-answer noun or named entity (extended over any
          following ``'I'``-tagged tokens), or a generic "thing" if none exists,
        * the nearest non-answer ``VBG``/``VBN`` verb, or ``'is'`` if none exists.

        Args:
            ctxt: Context text containing the answer.
            ans: Answer text.
            ans_pos: Character position of the answer within ``ctxt``.

        Returns:
            Whatever ``self.format_q`` returns for the selected pieces.
        """
        ctxt_filt, ans_pos = preprocessing.filter_context(ctxt, ans_pos, 0, 30)
        ans_toks = preprocessing.tokenise(ans, asbytes=False)
        doc = self.nlp(ctxt_filt)
        ctxt_toks = [str(tok).lower() for tok in doc]

        # Prefer a direct token match for the start of the answer; fall back to
        # mapping the character offset onto a token index.
        first_ans_tok = ans_toks[0]
        if first_ans_tok in ctxt_toks:
            ans_ix = ctxt_toks.index(first_ans_tok)
        else:
            ans_ix = preprocessing.char_pos_to_word(ctxt_filt, ctxt_toks, ans_pos, asbytes=False)

        n_ans = len(ans_toks)
        n_doc = len(doc)

        # Majority entity type over the tokens covered by the answer.
        ans_span_types = [doc[i].ent_type_ for i in range(ans_ix, min(ans_ix + n_ans, n_doc))]
        ans_type = Counter(ans_span_types).most_common()[0][0]

        type_distances = []
        verb_distances = []
        for offset, tok_lower in enumerate(ctxt_toks):
            if tok_lower in ans_toks:
                # Tokens that appear in the answer are never candidates.
                continue

            tok = doc[offset]
            # Token distance to the answer span: measured from the last answer
            # token for tokens after it, from the first for tokens before it.
            dist = max(offset - ans_ix - n_ans + 1, ans_ix - offset)

            if tok.pos_ == 'NOUN':
                type_distances.append((dist, 'THING', tok, offset))

            if tok.ent_type_ != '':
                following = ctxt_toks[min(offset + 1, n_doc - 1)]
                # Skip an entity-initial token that runs straight into the answer.
                leads_into_answer = tok.ent_iob_ == 'B' and following in ans_toks
                if not leads_into_answer and self.type_translate(tok.ent_type_) != 'CARDINAL':
                    type_distances.append((dist, tok.ent_type_, tok, offset))

            if tok.tag_ in ('VBG', 'VBN'):
                verb_distances.append((dist, tok.tag_, tok, offset))

        if verb_distances:
            nearest_verb = min(verb_distances, key=lambda cand: cand[0])
        else:
            nearest_verb = (0, 'VBG', 'is', 0)

        if type_distances:
            _, entity_type, _, first_ix = min(type_distances, key=lambda cand: cand[0])
            # Grow the span over the continuation tokens of the same entity.
            last_ix = first_ix
            while last_ix + 1 < n_doc and doc[last_ix + 1].ent_iob_ == 'I':
                last_ix += 1
            entity_toks = [str(tok) for tok in doc[first_ix:last_ix + 1]]
        else:
            entity_toks = ["thing"]
            entity_type = "THING"

        return self.format_q(self.type_translate(ans_type), self.type_translate(entity_type), entity_toks, nearest_verb[2])
Esempio n. 2
0
def main(_):
    """Evaluate a saved QANet checkpoint on SQuAD dev, optionally training first.

    Loads the SQuAD train and dev triples from ``./data/``, keeps only the
    examples whose tokenised context and question are shorter than
    ``FLAGS.test_para_limit`` / ``FLAGS.test_ques_limit``, restores the model
    from ``./models/saved/qanet2/`` and prints exact-match and F1 over the dev
    set. Dev examples carry a list of reference answers (``ans_list=True``);
    each prediction is scored against its best-matching reference.

    When the local ``trainable`` switch is on, the model is trained for
    ``FLAGS.qa_num_epochs`` passes first, logging the loss every 50 steps and
    checkpointing every 1000 steps whenever dev F1 improves.

    Evaluation runs on whole batches only, so up to ``FLAGS.batch_size - 1``
    trailing dev examples are not scored.

    Args:
        _: Unused positional argument (presumably argv from ``tf.app.run``).
    """
    from tqdm import tqdm
    FLAGS = tf.app.flags.FLAGS

    # Flip to True to fine-tune before the final evaluation.
    trainable = False

    squad_train_full = loader.load_squad_triples(path="./data/")
    squad_dev_full = loader.load_squad_triples(path="./data/",
                                               dev=True,
                                               ans_list=True)

    para_limit = FLAGS.test_para_limit
    ques_limit = FLAGS.test_ques_limit
    batch_size = FLAGS.batch_size

    def within_limits(example):
        """Return True when context and question are under the token limits."""
        return (len(word_tokenize(example[0])) < para_limit
                and len(word_tokenize(example[1])) < ques_limit)

    qa = QANetInstance()
    qa.load_from_chkpt("./models/saved/qanet2/", trainable=trainable)

    squad_train = [x for x in squad_train_full if within_limits(x)]
    squad_dev = [x for x in squad_dev_full if within_limits(x)]

    num_train_steps = len(squad_train) // batch_size
    num_eval_steps = len(squad_dev) // batch_size

    def score_dev(show_first_batch=False):
        """Return per-example (best F1, best EM) lists over the dev batches."""
        f1s = []
        ems = []
        for step in tqdm(range(num_eval_steps)):
            batch = squad_dev[step * batch_size:(step + 1) * batch_size]
            spans = qa.get_ans([x[0] for x in batch], [x[1] for x in batch])

            for b, example in enumerate(batch):
                pred = metrics.normalize_answer(spans[b])
                # example[2] is the list of reference answers; take the best.
                golds = [metrics.normalize_answer(gold) for gold in example[2]]
                f1s.append(max([metrics.f1(gold, pred) for gold in golds]))
                ems.append(max([1.0 * (gold == pred) for gold in golds]))

            if show_first_batch and step == 0:
                print(f1s, ems)
                print(batch[0])
                print(spans[0])
        return f1s, ems

    best_f1 = 0
    if trainable:
        run_id = str(int(time.time()))
        chkpt_path = FLAGS.model_dir + 'qanet/' + run_id
        os.makedirs(chkpt_path, exist_ok=True)

        summary_writer = tf.summary.FileWriter(
            FLAGS.log_directory + 'qanet/' + run_id, qa.model.graph)

        # Rows of the identity matrix serve as one-hot position targets.
        one_hot = np.eye(para_limit)

        for i in tqdm(range(FLAGS.qa_num_epochs * num_train_steps)):
            # ``i`` counts steps across all epochs; batches must be indexed by
            # the position inside the current epoch or every slice after the
            # first epoch would be empty.
            step_in_epoch = i % num_train_steps
            if step_in_epoch == 0:
                print('Shuffling training set')
                np.random.shuffle(squad_train)

            this_batch = squad_train[step_in_epoch * batch_size:
                                     (step_in_epoch + 1) * batch_size]
            batch_contexts, batch_questions, batch_ans_text, batch_ans_charpos = zip(
                *this_batch)

            batch_answers = []
            for j, ctxt in enumerate(batch_contexts):
                start = char_pos_to_word(
                    ctxt.encode(), [t.encode() for t in word_tokenize(ctxt)],
                    batch_ans_charpos[j])
                # Inclusive index of the last answer token.
                end = start + len(word_tokenize(batch_ans_text[j])) - 1
                batch_answers.append((one_hot[start], one_hot[end]))

            this_loss = qa.train_step(batch_contexts, batch_questions,
                                      batch_answers)

            if i % 50 == 0:
                losssummary = tf.Summary(value=[
                    tf.Summary.Value(tag="train_loss/loss",
                                     simple_value=np.mean(this_loss))
                ])
                summary_writer.add_summary(losssummary, global_step=i)

            if i > 0 and i % 1000 == 0:
                dev_f1 = np.mean(score_dev()[0])

                f1summary = tf.Summary(value=[
                    tf.Summary.Value(tag="dev_perf/f1", simple_value=dev_f1)
                ])
                summary_writer.add_summary(f1summary, global_step=i)

                if dev_f1 > best_f1:
                    print("New best F1! ", dev_f1, " Saving...")
                    best_f1 = dev_f1
                    qa.saver.save(qa.sess, chkpt_path + '/model.checkpoint')

        # Make sure buffered events reach disk before the final evaluation.
        summary_writer.close()

    qa_f1s, qa_em = score_dev(show_first_batch=True)

    print('EM: ', np.mean(qa_em))
    print('F1: ', np.mean(qa_f1s))
def main(_):
    """Evaluate a saved MpcmQa checkpoint on the first 500 SQuAD dev triples.

    Restores the vocabulary and weights from ``FLAGS.model_dir + 'saved/qatest'``,
    runs the model over the dev triples in batches of ``FLAGS.batch_size``,
    logs the model's eval summary, and prints the mean F1 and exact-match.
    Every ``FLAGS.eval_freq`` batches the current batch is dumped as an HTML
    report to ``FLAGS.log_dir + 'out_qa_eval.htm'`` (overwriting the previous
    report).

    Only whole batches are evaluated, so up to ``FLAGS.batch_size - 1``
    trailing examples are skipped.

    Args:
        _: Unused positional argument (presumably argv from ``tf.app.run``).
    """
    # The training set is loaded only to report its size.
    train_data = loader.load_squad_triples(FLAGS.data_path, False)
    dev_data = loader.load_squad_triples(FLAGS.data_path, True)[:500]

    chkpt_path = FLAGS.model_dir + 'saved/qatest'

    print('Loaded SQuAD with ', len(train_data), ' triples')
    dev_contexts, dev_qs, dev_as, dev_a_pos = zip(*dev_data)

    with open(chkpt_path + '/vocab.json') as f:
        vocab = json.load(f)

    model = MpcmQa(vocab, training_mode=False)
    with model.graph.as_default():
        saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_limit)
    with tf.Session(graph=model.graph,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        summary_writer = tf.summary.FileWriter(
            FLAGS.log_dir + 'qa/' + str(int(time.time())), sess.graph)

        saver.restore(sess, chkpt_path + '/model.checkpoint')

        batch_size = FLAGS.batch_size
        num_steps = len(dev_data) // batch_size

        f1s = []
        exactmatches = []
        for i in tqdm(range(num_steps), desc='Epoch 0'):
            lo, hi = i * batch_size, (i + 1) * batch_size
            batch_contexts = dev_contexts[lo:hi]
            batch_questions = dev_qs[lo:hi]
            batch_ans_text = dev_as[lo:hi]
            batch_answer_charpos = dev_a_pos[lo:hi]

            # Tokenise each context once; the tokens are reused for the gold
            # span lookup, the answer strings and the HTML report.
            batch_ctxt_toks = [
                tokenise(ctxt, asbytes=False) for ctxt in batch_contexts
            ]

            # NOTE(review): gold spans here are (start, end) with an EXCLUSIVE
            # end, and predictions are sliced the same way below. Confirm this
            # matches the span convention the checkpoint was trained with.
            batch_answers = []
            for j, ctxt in enumerate(batch_contexts):
                start = char_pos_to_word(
                    ctxt.encode(), [t.encode() for t in batch_ctxt_toks[j]],
                    batch_answer_charpos[j])
                end = start + len(tokenise(batch_ans_text[j], asbytes=False))
                batch_answers.append((start, end))

            summ, pred = sess.run(
                [model.eval_summary, model.pred_span],
                feed_dict={
                    model.context_in:
                    get_padded_batch(batch_contexts, vocab),
                    model.question_in:
                    get_padded_batch(batch_questions, vocab),
                    model.answer_spans_in: batch_answers,
                    model.is_training: False
                })

            summary_writer.add_summary(summ, global_step=i)

            gold_toks = [
                batch_ctxt_toks[b][batch_answers[b][0]:batch_answers[b][1]]
                for b in range(batch_size)
            ]
            pred_toks = [
                batch_ctxt_toks[b][pred[b][0]:pred[b][1]]
                for b in range(batch_size)
            ]
            gold_str = [" ".join(toks) for toks in gold_toks]
            pred_str = [" ".join(toks) for toks in pred_toks]

            batch_f1s = [
                f1(gold_str[b], pred_str[b]) for b in range(batch_size)
            ]
            # np.prod over the element-wise comparison is 1 only when both the
            # start and the end index agree.
            batch_ems = [
                np.prod(pred[b] == batch_answers[b]) * 1.0
                for b in range(batch_size)
            ]
            f1s.extend(batch_f1s)
            exactmatches.extend(batch_ems)

            if i % FLAGS.eval_freq == 0:
                parts = ["<h1>" + "Eval - Dev set" + "</h1>"]
                for b in range(batch_size):
                    parts.append(batch_contexts[b] + '<br/>')
                    parts.append(batch_questions[b] + '<br/>')
                    parts.append(
                        str(batch_answers[b]) + str(gold_toks[b]) + '<br/>')
                    parts.append(str(pred[b]) + str(pred_toks[b]) + '<br/>')
                    parts.append(batch_ans_text[b] + '<br/>')
                    parts.append(pred_str[b] + '<br/>')
                    parts.append("F1: " + str(batch_f1s[b]) + '<br/>')
                    parts.append("EM: " + str(batch_ems[b]))
                    parts.append("<hr/>")
                with open(FLAGS.log_dir + 'out_qa_eval.htm', 'w') as fp:
                    fp.write("".join(parts))

        # Flush buffered summary events to disk before leaving the session.
        summary_writer.close()

        print("F1: ", np.mean(f1s))
        print("EM: ", np.mean(exactmatches))
Esempio n. 4
0
def _gold_word_spans(contexts, contexts_toks, answers, char_positions):
    """Map character-level answer starts to inclusive (start, end) token spans."""
    spans = []
    for ctxt, toks, ans, char_pos in zip(contexts, contexts_toks, answers,
                                         char_positions):
        start = char_pos_to_word(ctxt.encode(), [t.encode() for t in toks],
                                 char_pos)
        spans.append((start, start + len(tokenise(ans, asbytes=False)) - 1))
    return spans


def main(_):
    """Train an MpcmQa model on SQuAD, checkpointing on the best dev NLL.

    A fresh run directory ``FLAGS.model_dir + 'qa/' + <timestamp>`` receives the
    vocabulary and the best checkpoint. With ``FLAGS.restore`` the vocabulary
    and weights are loaded from the hard-coded ``restore_path`` and epoch
    numbering continues from 40; otherwise the vocabulary is built from the
    training contexts and questions and variables are freshly initialised.

    Each epoch trains over the shuffled, filtered training set (logging
    training F1/EM on the current batch every ``FLAGS.eval_freq`` steps) and
    then scores a random subset of the dev set. Dev examples carry a list of
    reference answers: the first one supplies the gold span fed to the model,
    and F1/EM take the best match over all references.

    With ``FLAGS.testing`` the model sizes, batch size, training set and dev
    subset are all shrunk for a quick smoke run.

    Args:
        _: Unused positional argument (presumably argv from ``tf.app.run``).
    """
    if FLAGS.testing:
        print('TEST MODE - reducing model size')
        FLAGS.qa_encoder_units = 32
        FLAGS.qa_match_units = 32
        FLAGS.qa_batch_size = 16
        FLAGS.embedding_size = 50

    run_id = str(int(time.time()))

    chkpt_path = FLAGS.model_dir + 'qa/' + run_id
    restore_path = FLAGS.model_dir + 'qa/1529056867'

    os.makedirs(chkpt_path, exist_ok=True)

    train_data = loader.load_squad_triples(FLAGS.data_path, False)
    dev_data = loader.load_squad_triples(FLAGS.data_path, dev=True, ans_list=True)

    train_data = filter_squad(train_data, window_size=FLAGS.filter_window_size, max_tokens=FLAGS.filter_max_tokens)

    if FLAGS.testing:
        train_data = train_data[:1000]
        num_dev_samples = 100
    else:
        num_dev_samples = 3000
    # Never plan more dev batches than the dev set can fill.
    num_dev_samples = min(num_dev_samples, len(dev_data))

    print('Loaded SQuAD with ',len(train_data),' triples')

    if FLAGS.restore:
        with open(restore_path + '/vocab.json') as f:
            vocab = json.load(f)
    else:
        train_contexts, train_qs, _, _ = zip(*train_data)
        vocab = loader.get_vocab(train_contexts + train_qs, tf.app.flags.FLAGS.qa_vocab_size)
    # Always store the vocabulary next to the checkpoints of this run, so the
    # run directory is self-contained even when the vocabulary was restored.
    with open(chkpt_path + '/vocab.json', 'w') as outfile:
        json.dump(vocab, outfile)

    model = MpcmQa(vocab)
    with model.graph.as_default():
        saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_limit, allow_growth=True)
    with tf.Session(graph=model.graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        summary_writer = tf.summary.FileWriter(FLAGS.log_directory + 'qa/' + run_id, sess.graph)

        if FLAGS.restore:
            saver.restore(sess, restore_path + '/model.checkpoint')
            # Hard-coded epoch offset for the restored run.
            start_e = 40
            print('Loaded model')
        else:
            print("Building graph, loading glove")
            start_e = 0
            sess.run(tf.global_variables_initializer())

        batch_size = FLAGS.qa_batch_size
        num_steps_train = len(train_data) // batch_size
        num_steps_dev = num_dev_samples // batch_size

        # Seed the dev curves with a zero point at the starting step.
        f1summary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/f1",
                                                       simple_value=0.0)])
        emsummary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/em",
                                                       simple_value=0.0)])
        summary_writer.add_summary(f1summary, global_step=start_e * num_steps_train)
        summary_writer.add_summary(emsummary, global_step=start_e * num_steps_train)

        best_oos_nll = 1e6

        for e in range(start_e, start_e + FLAGS.qa_num_epochs):
            np.random.shuffle(train_data)
            train_contexts, train_qs, train_as, train_a_pos = zip(*train_data)

            for i in tqdm(range(num_steps_train), desc='Epoch ' + str(e)):
                lo, hi = i * batch_size, (i + 1) * batch_size
                batch_contexts = train_contexts[lo:hi]
                batch_questions = train_qs[lo:hi]
                batch_ans_text = train_as[lo:hi]
                batch_answer_charpos = train_a_pos[lo:hi]

                # Tokenise each context once per batch and reuse the tokens.
                batch_ctxt_toks = [tokenise(ctxt, asbytes=False) for ctxt in batch_contexts]
                batch_answers = _gold_word_spans(batch_contexts, batch_ctxt_toks,
                                                 batch_ans_text, batch_answer_charpos)

                _, summ, pred = sess.run([model.optimizer, model.train_summary, model.pred_span],
                        feed_dict={model.context_in: get_padded_batch(batch_contexts, vocab),
                                model.question_in: get_padded_batch(batch_questions, vocab),
                                model.answer_spans_in: batch_answers,
                                model.is_training: True})

                global_step = e * num_steps_train + i
                summary_writer.add_summary(summ, global_step=global_step)

                if i % FLAGS.eval_freq == 0:
                    f1s = []
                    exactmatches = []
                    for b in range(batch_size):
                        toks = batch_ctxt_toks[b]
                        # Spans are inclusive, hence the +1 on the slice end.
                        gold = " ".join(toks[batch_answers[b][0]:batch_answers[b][1] + 1])
                        guess = " ".join(toks[pred[b][0]:pred[b][1] + 1])
                        f1s.append(f1(gold, guess))
                        exactmatches.append(np.prod(pred[b] == batch_answers[b]) * 1.0)

                    f1summary = tf.Summary(value=[tf.Summary.Value(tag="train_perf/f1",
                                                     simple_value=sum(f1s) / len(f1s))])
                    emsummary = tf.Summary(value=[tf.Summary.Value(tag="train_perf/em",
                                              simple_value=sum(exactmatches) / len(exactmatches))])

                    summary_writer.add_summary(f1summary, global_step=global_step)
                    summary_writer.add_summary(emsummary, global_step=global_step)

            f1s = []
            exactmatches = []
            nlls = []

            np.random.shuffle(dev_data)
            dev_subset = dev_data[:num_dev_samples]
            dev_contexts, dev_qs, dev_as, dev_a_pos = zip(*dev_subset)
            for i in tqdm(range(num_steps_dev), desc='Eval ' + str(e)):
                lo, hi = i * batch_size, (i + 1) * batch_size
                batch_contexts = dev_contexts[lo:hi]
                batch_questions = dev_qs[lo:hi]
                batch_ans_text = dev_as[lo:hi]
                batch_answer_charpos = dev_a_pos[lo:hi]

                batch_ctxt_toks = [tokenise(ctxt, asbytes=False) for ctxt in batch_contexts]
                # The first reference answer defines the gold span fed to the model.
                batch_answers = _gold_word_spans(
                    batch_contexts, batch_ctxt_toks,
                    [answers[0] for answers in batch_ans_text],
                    [positions[0] for positions in batch_answer_charpos])

                pred, nll = sess.run([model.pred_span, model.nll],
                        feed_dict={model.context_in: get_padded_batch(batch_contexts, vocab),
                                model.question_in: get_padded_batch(batch_questions, vocab),
                                model.answer_spans_in: batch_answers,
                                model.is_training: False})

                for b in range(batch_size):
                    guess = normalize_answer(
                        " ".join(batch_ctxt_toks[b][pred[b][0]:pred[b][1] + 1]))
                    golds = [normalize_answer(ans) for ans in batch_ans_text[b]]
                    # Score against the best-matching reference answer.
                    f1s.append(max([f1(gold, guess) for gold in golds]))
                    exactmatches.append(max([1.0 * (gold == guess) for gold in golds]))
                nlls.extend(nll.tolist())

            mean_nll = np.mean(nlls)
            f1summary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/f1",
                                             simple_value=sum(f1s) / len(f1s))])
            emsummary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/em",
                                      simple_value=sum(exactmatches) / len(exactmatches))])
            nllsummary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/nll",
                                      simple_value=mean_nll)])

            epoch_end_step = (e + 1) * num_steps_train
            summary_writer.add_summary(f1summary, global_step=epoch_end_step)
            summary_writer.add_summary(emsummary, global_step=epoch_end_step)
            summary_writer.add_summary(nllsummary, global_step=epoch_end_step)

            if mean_nll < best_oos_nll:
                print("New best NLL! ", mean_nll, " Saving... F1: ", np.mean(f1s))
                best_oos_nll = mean_nll
                saver.save(sess, chkpt_path + '/model.checkpoint')
            else:
                print("NLL not improved ", mean_nll)

        # Flush buffered summary events to disk before leaving the session.
        summary_writer.close()