Example #1
    def evaluate_answer(self, session, dataset, sample=100, log=False):
        f1 = 0.
        em = 0.

        num_iter = 0
        flag = False
        for q_id, p_id, a_span, paragraph in load_dataset(dataset):
            q_batch, m_q_batch = zip(
                *pad_sequences(q_id, FLAGS.question_output_size))
            p_batch, m_p_batch = zip(
                *pad_sequences(p_id, FLAGS.paragraph_output_size))
            a_s, a_e = self.answer(session, q_batch, p_batch, m_q_batch,
                                   m_p_batch)
            for i in range(len(a_span)):
                answer = paragraph[i][a_s[i]:a_e[i] + 1]
                true_answer = paragraph[i][a_span[i][0]:a_span[i][1] + 1]
                f1 += f1_score(answer, true_answer)
                em += exact_match_score(answer, true_answer)
                num_iter += 1
                if num_iter >= sample:
                    flag = True
                    break
            if flag:
                break

        f1 = f1 / sample
        em = em / sample

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))

        return f1, em
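
All of these snippets call `f1_score` and `exact_match_score` helpers imported from an evaluation module (one snippet below notes they live in evaluate.py). As a point of reference, here is a minimal sketch of those helpers in the style of the official SQuAD v1.1 evaluation script; the exact normalization used by each repository above may differ.

import re
import string
from collections import Counter


def normalize_answer(s):
    # Lowercase, strip punctuation and articles, and collapse whitespace.
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())


def f1_score(prediction, ground_truth):
    # Token-level F1: harmonic mean of precision and recall over shared tokens.
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.
    precision = num_same / float(len(pred_tokens))
    recall = num_same / float(len(gt_tokens))
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    # True iff the normalized strings match exactly.
    return normalize_answer(prediction) == normalize_answer(ground_truth)
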
Example #2
    def evaluate_answer(self, session, dataset, sample=100, log=False):
        """
								Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
								with the set of true answer labels

								This step actually takes quite some time. So we can only sample 100 examples
								from either training or testing set.

								:param session: session should always be centrally managed in train.py
								:param dataset: a representation of our data, in some implementations, you can
																								pass in multiple components (arguments) of one dataset to this function
								:param sample: how many examples in dataset we look at
								:param log: whether we print to std out stream
								:return:
								"""
        batch = dataset
        if sample is None:
            sample = len(dataset)
        else:  # If we only select a subset of the data
            random_indices = [
                random.randint(0, len(dataset) - 1) for _ in range(sample)
            ]
            batch = [dataset[idx] for idx in random_indices]
        question_batch, context_batch, question_mask_batch, context_mask_batch, start_answer_batch, end_answer_batch = zip(
            *batch)

        a_s, a_e = self.answer(
            session, batch)  # These are both arrays of length sample size
        true_a_s = np.argmax(start_answer_batch, axis=1)
        true_a_e = np.argmax(end_answer_batch, axis=1)
        print("predicted a_s: ", a_s)
        print("predicted a_e: ", a_e)
        print("true start answer: ", true_a_s)
        print("true end answer: ", true_a_e)
        answers = [
            context_batch[i][a_s[i]:a_e[i] + 1] for i in range(len(a_s))
        ]
        true_answers = [
            context_batch[i][true_a_s[i]:true_a_e[i] + 1]
            for i in range(len(true_a_s))
        ]

        f1s = []
        ems = []
        for i in range(len(true_answers)):
            answer = answers[i]
            true_answer = true_answers[i]
            f1_one_example = f1_score(answer, true_answer)
            f1s.append(f1_one_example)
            em_one_example = exact_match_score(answer, true_answer)
            ems.append(em_one_example)

        f1 = np.sum(f1s) / float(sample)
        em = np.sum(ems) / float(sample)

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))

        return f1s, ems
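
Note that the `random.randint`-based sampling above draws indices with replacement, so the same example can be scored twice; if distinct examples are wanted, `random.sample` is a simpler choice. A hypothetical drop-in for the index selection (assuming `sample <= len(dataset)`):

            # Draw `sample` distinct indices in [0, len(dataset) - 1].
            random_indices = random.sample(range(len(dataset)), sample)
            batch = [dataset[idx] for idx in random_indices]
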
Example #3
    def evaluate_answer(self, session, dataset, save=False):
        # Evaluate the model's performance using the harmonic mean of F1 and
        # Exact Match (EM) with the set of true answer labels.

        res = []
        prob = []
        for j in range(0, len(dataset)):
            sample = create_minibatch(dataset, 1, j)
            s, e, p1, p2 = self.answer(session, sample)
            _, p, a, _, _ = sample
            idx = list(p[0])
            res.append((idx[s[0]:e[0] + 1], idx[a[0][0]:a[0][1] + 1]))

            # save prediction probability for future use
            if save:
                prob.append((p1.tolist(), p2.tolist(), p.tolist(), a.tolist()))

        f1 = 0.
        em = 0.
        for p, g in res:
            text_p = " ".join(str(i) for i in p)
            text_g = " ".join(str(i) for i in g)
            f1 += f1_score(text_p, text_g)
            em += exact_match_score(text_p, text_g)

        return f1 / len(dataset), em / len(dataset), prob
Example #4
    def get_spans(self, session, context_path, qn_path, ans_path, dataset, num_samples=0):
        """
        Sample from the provided (train/dev) set.
        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
        Returns:
          total_start_dists, total_end_dists: the predicted start/end probability distributions for the sampled examples.
          f1_em_scores: array of per-example (F1, EM) pairs.
        """
        total_start_dists = []
        total_end_dists = []
        f1_em_scores = []
        example_num = 0
        for batch in get_batch_generator(
            self.word2id,
            context_path,
            qn_path,
            ans_path,
            self.FLAGS.batch_size,
            context_len=self.FLAGS.context_len,
            question_len=self.FLAGS.question_len,
            discard_long=False,
            random=False):

            pred_start_dists, pred_end_dists = self.get_prob_dists(session, batch)
            pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

            # Convert the start and end positions to lists length batch_size
            pred_start_pos = pred_start_pos.tolist() # list length batch_size
            pred_end_pos = pred_end_pos.tolist() # list length batch_size
            pred_start_dists = pred_start_dists.tolist() # list length batch_size
            pred_end_dists = pred_end_dists.tolist() # list length batch_size

            for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start : pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_em_scores.append((f1,em))
                # print_example(self.word2id, batch.context_tokens[ex_idx], batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0], batch.ans_span[ex_idx, 1], pred_ans_start, pred_ans_end, true_answer, pred_answer, f1, em)
                if num_samples != 0 and example_num >= num_samples:
                    break

            # Convert the start and end positions to lists length batch_size
            total_end_dists += pred_end_dists
            total_start_dists += pred_start_dists
            if num_samples != 0 and example_num >= num_samples:
                break
        return np.asarray(total_start_dists), np.asarray(total_end_dists), np.asarray(f1_em_scores)
Example #5
    def evaluate_answer(self, session, dataset, vocab, sample=400, log=False):
        f1 = 0.
        em = 0.

        N = len(dataset)
        sampleIndices = np.random.choice(N, sample, replace=False)
        evaluate_set = [dataset[i] for i in sampleIndices]
        predicts = self.predict_on_batch(session, evaluate_set)

        for example, (start, end) in zip(evaluate_set, predicts):
            q, _, c, _, (true_s, true_e) = example
            # print (start, end, true_s, true_e)
            context_words = [vocab[w] for w in c]

            true_answer = ' '.join(context_words[true_s:true_e + 1])
            if start <= end:
                predict_answer = ' '.join(context_words[start:end + 1])
            else:
                predict_answer = ''
            f1 += f1_score(predict_answer, true_answer)
            em += exact_match_score(predict_answer, true_answer)

        f1 = 100 * f1 / sample
        em = 100 * em / sample

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))

        return f1, em
Example #6
    def get_eval(self, sess, dataset, batch_size, sample=True):
        ''' if sample, take first batch only '''
        f1 = em = total = 0
        for i, batch in enumerate(
                get_minibatches(dataset, batch_size, shuffle=True)):
            p, q, p_len, q_len, a_s, a_e, p_raw = zip(*batch)
            loss, norm, ys, ye = self.eval_batch(sess, p, q, p_len, q_len, a_s,
                                                 a_e)
            a_s_pred = np.argmax(ys, axis=1)
            a_e_pred = np.argmax(ye, axis=1)
            for i in range(len(batch)):
                #predicted a_s and a_e
                s_pred = a_s_pred[i]
                e_pred = a_e_pred[i]

                #ground truth labels
                a_raw = ' '.join(p_raw[i][a_s[i]:a_e[i] + 1])
                pred_raw = ' '.join(p_raw[i][s_pred:e_pred + 1])

                f1 += f1_score(pred_raw, a_raw)
                em += exact_match_score(pred_raw, a_raw)
                total += 1
            if sample:
                break

        em = 100.0 * em / total
        f1 = 100.0 * f1 / total
        return (f1, em, loss, norm)
Example #7
    def evaluate_answer(self,
                        session,
                        dataset,
                        sample=100,
                        log=False,
                        datatype='val'):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels
        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.
        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        f1 = 0.
        em = 0.
        fname = "../.."
        with open(
                os.path.join(self.config.flag.data_dir,
                             "%s.context" % datatype)) as f:
            data_paragraph = [line.split() for line in f.read().splitlines()]
        with open(
                os.path.join(self.config.flag.data_dir,
                             "%s.answer" % datatype)) as f:
            data_answer = [line.split() for line in f.read().splitlines()]
        ground_truth = (data_paragraph, data_answer)

        i = 0
        while i < sample:
            preds = self.answer(
                session,
                (dataset[datatype][0][i:i + self.config.flag.batch_size],
                 dataset[datatype][1][i:i + self.config.flag.batch_size],
                 dataset[datatype][2][i:i + self.config.flag.batch_size]))
            for j in range(len(preds[0])):
                prediction = ' '.join(
                    ground_truth[0][i][preds[0][j]:(preds[1][j] + 1)])
                gt = ' '.join(ground_truth[1][i])
                f1_instance = f1_score(prediction, gt)
                em_instance = exact_match_score(prediction, gt)
                em = em + em_instance
                f1 = f1 + f1_instance
                i += 1
        em = 100 * em / float(sample)
        f1 = 100 * f1 / float(sample)

        if log:
            logging.info(
                "Output for '{}' dataset - F1: {}, EM: {}, for {} samples".
                format(datatype, f1, em, sample))

        return f1, em
Example #8
    def check_f1_em(self,
                    context_path,
                    qn_path,
                    ans_path,
                    dataset,
                    num_samples=1000):
        f1_total = 0.
        em_total = 0.
        example_num = 0

        for batch in data_batcher.get_batch_generator(self.word2id,
                                                      self.id2idf,
                                                      context_path,
                                                      qn_path,
                                                      ans_path,
                                                      self.batch_size,
                                                      context_len=300,
                                                      question_len=30,
                                                      discard_long=False):

            pred_start_pos, pred_end_pos = self.get_predictions(batch)

            # Convert the start and end positions to lists length batch_size
            pred_start_pos = pred_start_pos.tolist()  # list length batch_size
            pred_end_pos = pred_end_pos.tolist()  # list length batch_size

            for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in \
                    enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][
                    pred_ans_start:pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        return f1_total, em_total
Example #9
    def check_f1_em(self, model, dataset, num_samples=100, print_to_screen=False):
        logging.info("Calculating F1/EM for %s examples in %s set..." % (str(num_samples) if num_samples != 0 else "all", dataset))

        if dataset == "train":
            context_path, qn_path, ans_path = self.train_context_path, self.train_qn_path, self.train_ans_path
        elif dataset == "dev":
            context_path, qn_path, ans_path = self.dev_context_path, self.dev_qn_path, self.dev_ans_path
        else:
            raise ValueError('dataset is not defined')

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        for batch in get_batch_generator(self.word2id, context_path, qn_path, ans_path, config.batch_size,
                                         context_len=config.context_len, question_len=config.question_len,
                                         discard_long=False):

            pred_start_pos, pred_end_pos = self.test_one_batch(batch, model)

            pred_start_pos = pred_start_pos.tolist()
            pred_end_pos = pred_end_pos.tolist()

            for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) \
                    in enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
                example_num += 1
                pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start : pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                true_answer = " ".join(true_ans_tokens)

                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx], batch.qn_tokens[ex_idx],
                                  batch.ans_span[ex_idx, 0], batch.ans_span[ex_idx, 1], pred_ans_start,
                                  pred_ans_end, true_answer, pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" % (example_num, dataset, toc-tic))

        return f1_total, em_total
Example #10
def argmax_eval(data):
    em = 0.
    f1 = 0.
    for start, end, paragraph, answer in data:
        text_p = " ".join(paragraph[np.argmax(start):np.argmax(end) + 1])
        text_g = " ".join(paragraph[answer[0]:answer[1] + 1])
        f1 += f1_score(text_p, text_g)
        em += exact_match_score(text_p, text_g)
    print("argmax EM: {:.5f}, F1: {:.5f}".format(em / len(data),
                                                 f1 / len(data)))
Example #11
    def eval_sentence(self, preds_ind, gold_ind, sentence):
        pred_vecs = [s for s, p in zip(sentence, preds_ind) if p]
        gold_vecs = [s for s, g in zip(sentence, gold_ind) if g]

        pred_sent = ' '.join(self.vocab[i] for i in pred_vecs)
        gold_sent = ' '.join(self.vocab[i] for i in gold_vecs)

        f1 = new_f1_score(pred_sent, gold_sent)
        em = exact_match_score(pred_sent, gold_sent)
        return f1, em, pred_sent, gold_sent
Example #12
    def evaluate_answer(self,
                        session,
                        dataset,
                        context,
                        sample=100,
                        log=False,
                        eval_set='train'):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        if sample is None:
            sampled = dataset
            sample = len(dataset)
        else:
            #np.random.seed(0)
            sampled = dataset[np.random.choice(dataset.shape[0], sample)]

        a_s, a_e = self.answer(session, sampled)

        f1 = []
        em = []
        #embed()
        sampled = sampled.T
        for i in range(len(sampled[0])):
            pred_words = ' '.join(context[i][a_s[i]:a_e[i] + 1])
            actual_words = ' '.join(
                context[i][sampled[2][i][0]:sampled[2][i][1] + 1])
            # print('I:',i)
            # print ("INDICES",a_s[i],a_e[i])
            # print ("PRED_WORDS:",pred_words)
            # print ("ACTUAL WORD",actual_words)
            f1.append(f1_score(pred_words, actual_words))
            # print('F1:',f1)
            em.append(exact_match_score(pred_words, actual_words))
            # print('EM:',em)
            # print (" ")

        if log:
            logging.info("{},F1: {}, EM: {}, for {} samples".format(
                eval_set, np.mean(f1), None, sample))
        # f1=sum(f1)/len(f1)
        # em=sum(em)/len(em)
        return f1, em
Example #13
def EM_F1(pos_scores, batch_target):
    pos = [np.argmax(x, axis=1) for x in pos_scores]
    predict_ans = normalize_ans(pos)
    ans = normalize_ans(batch_target)
    em = f1 = 0
    for prediction, ground_truth in zip(predict_ans, ans):
        em += exact_match_score(prediction, ground_truth)
        f1 += f1_score(prediction, ground_truth)
    em = 100.0 * em / len(ans)
    f1 = 100.0 * f1 / len(ans)
    return em, f1
Example #14
    def evaluate_answer(self,
                        session,
                        dataset,
                        rev_vocab,
                        sample=20,
                        log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """
        sample = min(sample, len(dataset))
        overall_f1 = 0.
        overall_em = 0.
        minibatch_size = 100
        num_batches = (sample + minibatch_size - 1) // minibatch_size
        for batch in range(0, num_batches):
            start = batch * minibatch_size
            end = min(sample, len(dataset), start + minibatch_size)
            h_s, h_e, _ = self.decode(session, dataset[start:end])
            for i in range(end - start):
                a_s = np.argmax(h_s[i])
                a_e = np.argmax(h_e[i])
                if a_s > a_e:
                    k = a_e
                    a_e = a_s
                    a_s = k

                sample_dataset = dataset[start + i]
                context = sample_dataset[0]
                (a_s_true, a_e_true) = sample_dataset[6]
                predicted_answer = self.formulate_answer(
                    context, rev_vocab, a_s, a_e)
                true_answer = self.formulate_answer(context, rev_vocab,
                                                    a_s_true, a_e_true)
                f1 = f1_score(predicted_answer, true_answer)
                overall_f1 += f1
                if exact_match_score(predicted_answer, true_answer):
                    overall_em += 1

        average_f1 = overall_f1 / sample
        overall_em = overall_em / sample
        logging.info("F1: {}, EM: {}, for {} samples\n".format(
            average_f1, overall_em, sample))
        return overall_f1, overall_em
Example #15
    def evaluate_answer(self, session, qs, cs, sample=100, log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        print("Evaluating Answers")

        f1 = 0.
        em = 0.

        text_file = open("./data/squad/train.context", "r")
        inputs_c = text_file.read().split("\n")
        #print(inputs_c)

        context = []
        text_file.close()
        for i in range(sample):
            words = inputs_c[i].split()
            context.append(words)

        prediction = []
        #need to define self.true somewhere
        ground_truth = []

        self.a_s, self.a_e = self.answer(session, qs, cs)

        #these functions are defined in evaluate.py. They are already written and should not be changed
        #Not sure if these indices are the best way to access these
        for i in range(sample):
            prediction.append(' '.join(context[i][self.a_s[i]:self.a_e[i] + 1]))
            ground_truth.append(' '.join(context[i][self.true_s[i]:self.true_e[i] + 1]))
            f1 = f1 + f1_score(prediction[i], ground_truth[i]) / sample
            em = em + exact_match_score(prediction[i],
                                        ground_truth[i]) / sample

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))
            print("With Print, F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))  #Might be redundant

        return f1, em
Example #16
def eval_text():
    '''figure out how evaluation works.'''

    with open(an_path) as f:
        raw_data = [line.strip() for line in f.readlines()]
    print(raw_data[:10])

    l = ['Corpus','Juris', 'canonici']
    s = ' '.join(l)
    print(s)
    print(f1_score(s, raw_data[0]))
    print(exact_match_score(s, raw_data[0]) / 1.0)
Example #17
    def evaluate_answer(self,
                        session,
                        dataset,
                        context,
                        sample=100,
                        log=False,
                        eval_set='train'):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        if sample is None:
            sampled = dataset
            sample = len(dataset[0])
        else:
            #np.random.seed(0)
            inds = np.random.choice(len(dataset[0]), sample)

            sampled = [elem[inds] for elem in dataset]
            context = [context[i] for i in inds]

        a_s, a_e = self.answer(session, sampled)

        context_ids, question_ids, answer_spans, ctx_mask, q_mask = sampled

        f1 = []
        em = []
        # #embed()
        for i in range(len(sampled[0])):
            pred_words = ' '.join(context[i][a_s[i]:a_e[i] + 1])
            actual_words = ' '.join(
                context[i][answer_spans[i][0]:answer_spans[i][1] + 1])
            f1.append(f1_score(pred_words, actual_words))
            cur_em = exact_match_score(pred_words, actual_words)
            em.append(float(cur_em))

        if log:
            logging.info("{},F1: {}, EM: {}, for {} samples".format(
                eval_set, np.mean(f1), np.mean(em), sample))

        return np.mean(f1), np.mean(em)
Example #18
    def check_f1_em(self, session, context_path, qn_path, ans_path, dataset, num_samples=100, print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.
        """
        logging.info("Calculating F1/EM for %s examples in %s set..." % (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        for batch in get_batch_generator(self.word2id, context_path, qn_path, ans_path, self.FLAGS.batch_size, context_len=self.FLAGS.context_len, question_len=self.FLAGS.question_len, discard_long=False):

            pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

            for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start : pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx], batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0], batch.ans_span[ex_idx, 1], pred_ans_start, pred_ans_end, true_answer, pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" % (example_num, dataset, toc-tic))

        return f1_total, em_total
Example #19
    def evaluate_answer(self, session, dataset, rev_vocab, sample=100, log=False):
        """
        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """
        
        our_answers = []
        their_answers = []
        eval_set = random.sample(dataset, sample)

        batches, num_batches = get_batches(eval_set, self.FLAGS.batch_size)

        #for question, question_mask, paragraph, paragraph_mask, span, true_answer in eval_set:
        for batch in batches:
            val_questions, val_question_masks, val_paragraphs, val_paragraph_masks, _, val_true_answers = zip(*batch)
            a_s, a_e = self.answer(session, val_questions, val_paragraphs, val_question_masks, val_paragraph_masks)
            for s, e, paragraph in zip(a_s, a_e, val_paragraphs):
                token_answer = paragraph[s : e + 1]      #The slice of the context paragraph that is our answer

                sentence = [rev_vocab[token] for token in token_answer]
                our_answer = ' '.join(word for word in sentence)
                our_answers.append(our_answer)

            for true_answer in val_true_answers:
                their_answer = ' '.join(word for word in true_answer)
                their_answers.append(their_answer)

        assert(len(our_answers) == len(their_answers))

        f1 = exact_match = total = 0
        answer_tuples = zip(their_answers, our_answers)
        for ground_truth, prediction in answer_tuples:
            total += 1
            exact_match += exact_match_score(prediction, ground_truth)
            f1 += f1_score(prediction, ground_truth)

        exact_match = 100.0 * exact_match / total
        f1 = 100.0 * f1 / total

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(f1, exact_match, sample))
            logging.info("Samples:")
            for i in xrange(min(10, sample)):
                ground_truth, our_answer = answer_tuples[i]
                logging.info("Ground Truth: {}, Our Answer: {}".format(ground_truth, our_answer))

        return f1, exact_match
Example #20
    def evaluate(self, answers, gold):
        '''
        calculates f1 and em, given a batch of guesses and gold data
        '''
        num = len(answers)
        assert num == len(gold)
        f1 = 0.
        em = 0.
        for i in xrange(num):
            f1 += f1_score(answers[i], gold[i])
            emm = exact_match_score(answers[i], gold[i])
            em += emm
            #print(i, str(emm)[0], '|', answers[i], '|', gold[i])
        return (f1/num, em/num)
Example #21
    def evaluate_answer(self, session, dataset_address, sample=100, log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        ######### the f1_score and exact_match_score functions defined work only with strings
        ######### need to write new ones that work with lists like below

        f1 = 0.
        em = 0.
        dataset, num_samples = get_sample(
            dataset_address, self.FLAGS.context_paragraph_max_length, sample)
        test_questions, test_paragraphs, test_start_answers, test_end_answers = dataset
        predictions = self.answer(session, test_paragraphs, test_questions)
        for i in range(num_samples):
            answer_beg = test_start_answers[i][0]  # this is a list of length 1
            answer_end = test_end_answers[i][0]  # same
            answer_str_list = [
                str(test_paragraphs[i][j])
                for j in range(answer_beg, answer_end + 1)
            ]
            true_answer = ' '.join(answer_str_list)
            prediction_str_list = [
                str(test_paragraphs[i][j])
                for j in range(predictions[i][0], predictions[i][1] + 1)
            ]
            prediction_string = ' '.join(prediction_str_list)
            f1 += f1_score(prediction_string, true_answer)
            em += exact_match_score(prediction_string, true_answer)
        f1 = 1.0 * f1 / num_samples
        em = 1.0 * em / num_samples

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, num_samples))

        return f1, em
Example #22
File: rnet.py Project: wykxyz/rnet
def evaluate(args):
    opt = json.load(open('models/config.json', 'r'))['rnet']
    config = tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    saved_model = args.model_path
    
    EM = 0.0
    F1 = 0.0
    with sess.as_default():
        print('Reading data')
        dp = preprocess.read_data('dev', opt)
        it, enqueue_op = dp.provide(sess)
        rnet_model = model.RNet(opt)
        loss, pt, accu = rnet_model.build_model(it)
        dequeued_p, asi, aei = it['p'], it['asi'], it['aei']
        
         # restore model
        print('restoring model...')
        saver = tf.train.Saver()
        saver.restore(sess, saved_model)

        # start feeding threads
        coord = tf.train.Coordinator()

        threads = []
        for i in range(opt['num_threads']):
            t = Thread(target=feeder, args=(dp, sess, enqueue_op, coord, i, args.debug))
            t.start()
            threads.append(t)
        # start prediction
        print('Prediction starts')
        num_batch = int(dp.num_sample/dp.batch_size)
        for j in tqdm(range(num_batch)):
            pt_val, p_batch, asi_batch, aei_batch = sess.run([pt, dequeued_p, asi, aei])
            f1, em = 0.0, 0.0
            for k in range(len(p_batch)):
                paragraph = p_batch[k][0].decode('utf8').split(' ')
                true_start, true_end = asi_batch[k][0], aei_batch[k][0]
                pred_start, pred_end = pt_val[k][0], pt_val[k][1]
                pred_tokens = paragraph[pred_start:(pred_end+1)]
                true_tokens = paragraph[true_start:(true_end+1)]
                f1 += f1_score(' '.join(pred_tokens), ' '.join(true_tokens))
                em += exact_match_score(' '.join(pred_tokens), ' '.join(true_tokens))
            print('{}th batch | f1: {} | em: {}'.format(j, f1/len(p_batch), em/len(p_batch)))
            F1 += f1
            EM += em
        print('Evaluation complete, F1 score: {}, EM score: {}'.format(F1/dp.num_sample, EM/dp.num_sample))
Example #23
    def evaluate_answer(self,
                        sess,
                        dataset,
                        sample=100,
                        log=False,
                        mode="val"):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels
        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.
        :param sess: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """
        f1 = 0.
        em = 0.

        len_data = len(dataset[mode][-1])
        indexer = random.sample(xrange(len_data), sample)
        # 1. Pad data
        p, q, span, pw, qw, ans = [[component[i] for i in indexer]
                                   for component in dataset[mode]]
        p, mask_p, actual_p = pad_input(p, Config.max_p_len)
        q, mask_q, actual_q = pad_input(q, Config.max_q_len)
        begin, end = zip(*span)
        # get answer
        a_s, a_e = self.answer_all(sess,
                                   [p, mask_p, actual_p, q, mask_q, actual_q])

        for i in range(sample):
            # ground truth
            gt = ' '.join(ans[i])
            # prediction
            pred = ' '.join(pw[i][a_s[i]:a_e[i] + 1])
            em += exact_match_score(pred, gt)
            f1 += f1_score(pred, gt)
        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1 / sample, em / sample, sample))
        for a, b in zip(zip(begin, end), zip(a_s, a_e))[:6]:
            print("Actual: {} Predicted: {}".format(a, b))
        return f1 / sample, em / sample
Example #24
    def evaluate_answer(self, session, dataset, vocab, sample=100, log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        f1 = 0.
        em = 0.

        totExamples = len(dataset)
        examplesToEvaluate = np.random.choice(totExamples, sample)

        for i in examplesToEvaluate:
            true_a_s = int(dataset[i]["span"][0])
            true_a_e = int(dataset[i]["span"][1])
            predicted_a_s, predicted_a_e = self.answer(session, dataset[i])

            paragraphWords = [vocab[j] for j in dataset[i]["context"]]
            ground_truth = paragraphWords[true_a_s:true_a_e + 1]
            prediction = paragraphWords[predicted_a_s:predicted_a_e + 1]

            # Turn into a sentence
            ground_truth = ' '.join(ground_truth)
            prediction = ' '.join(prediction)

            # Evaluate
            em += float(exact_match_score(prediction, ground_truth))
            f1 += f1_score(prediction, ground_truth)
        f1 /= sample
        em /= sample

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))

        return f1, em
Example #25
    def evaluate_answer(self, session, data, rev_vocab, sample_num=200):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :return:
        """

        overall_f1 = 0.
        overall_em = 0.

        eval_batch = [
            data[i]
            for i in np.random.choice(len(data), sample_num, replace=False)
        ]
        eval_batch = list(zip(*eval_batch))  # unzip the list

        a_s_vec, a_e_vec = self.answer(session, eval_batch)
        for (a_s, a_e, context, a_true) in zip(a_s_vec, a_e_vec, eval_batch[0],
                                               eval_batch[6]):
            if a_s > a_e:
                tmp = a_s
                a_s = a_e
                a_e = tmp
            predicted_answer = self.formulate_answer(context, rev_vocab, a_s,
                                                     a_e)
            true_answer = self.formulate_answer(context, rev_vocab, a_true[0],
                                                a_true[1])
            f1 = f1_score(predicted_answer, true_answer)
            overall_f1 += f1
            if exact_match_score(predicted_answer, true_answer):
                overall_em += 1

        average_f1 = overall_f1 / sample_num
        overall_em /= sample_num
        # logging.info("F1: {}, EM: {}, for {} samples\n".format(average_f1, overall_em, sample_num))

        return average_f1, overall_em
Example #26
    def evaluate_answer(self, session, dataset, log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """
        f1 = 0.
        em = 0.
        (question, par, labels) = dataset

        num_sample = len(labels)
        # why not batch
        for index in range(0, num_sample):
            a_s, a_e = self.answer(session, question[index], par[index], labels[index])
            # print(a_s,a_e)
            answers = par[index][0][a_s: a_e + 1]
            p_s, p_e = labels[index]
            # print(p_s,p_e)
            true_answer = par[index][0][p_s: p_e + 1]

            answers = " ".join(str(a) for a in answers)
            true_answer = " ".join(str(ta) for ta in true_answer)

            # print(answers)
            # print(true_answer)
            f1 += f1_score(answers, true_answer)
            # print('@@@@@@@@@@@@@@@@@@@')
            em += exact_match_score(answers, true_answer)
            # logging.info("answers %s, true_answer %s" % (answers, true_answer))
        f1 /= num_sample
        em /= num_sample

        if log:
            logging.info("F1: {:.2%}, EM: {:.2%}, for {} samples".format(f1, em, num_sample))

        return f1, em
Example #27
    def evaluate_answer(self, session, dataset, samples=100, log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param log: whether we print to std out stream
        :return:
        """
        samples = min(samples, len(dataset[0]))

        c_ids, c_len, q_ids, q_len, span = dataset

        f1 = 0.
        em = 0.

        for index in range(samples):
            a_s, a_e = self.answer(
                session,
                (c_ids[index], c_len[index], q_ids[index], q_len[index]))
            answers = c_ids[index][a_s:a_e + 1]
            p_s, p_e = span[index]
            true_answer = c_ids[index][p_s:p_e + 1]

            answers = " ".join(str(a) for a in answers)
            true_answer = " ".join(str(ta) for ta in true_answer)

            f1 += f1_score(answers, true_answer)
            em += exact_match_score(answers, true_answer)
            #logging.info("answers %s, true_answer %s" % (answers, true_answer))

        f1 /= samples
        em /= samples

        if log:
            logging.info("F1: {:.2%}, EM: {:.2%}, for {} samples".format(
                f1, em, samples))

        return f1, em
Example #28
    def evaluate_answer(self, session, dataset, sample=100, log=False):
        """
        Our dataset format: a list of (context, question, begin, end)


        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        if len(dataset) > sample:
            dataset = random.sample(dataset, sample)

        f1, em = 0., 0.
        for context, question, begin, end in dataset:
            a_s, a_e = self.answer(session, context, question)
            a_s = min(a_s, len(context) - 1)
            a_e = min(a_e, len(context) - 1)
            if a_s > a_e:
                a_s, a_e = a_e, a_s
            prediction = context[a_s:(a_e + 1)]
            prediction = ' '.join([str(x) for x in prediction])
            ground_truth = context[begin:(end + 1)]
            ground_truth = ' '.join([str(x) for x in ground_truth])
            f1 += f1_score(prediction, ground_truth)
            em += exact_match_score(prediction, ground_truth)

        f1 = f1 * 100 / len(dataset)
        em = em * 100 / len(dataset)

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))

        return f1, em
Example #29
    def evaluate_answer(self, session, dataset, sample=100, log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        f1 = 0.
        em = 0.

        q_batch, q_lens, p_batch, p_lens, s_label_batch, e_label_batch = make_eval_batch(
            dataset, sample)

        test_x = (q_batch, q_lens, p_batch, p_lens)
        pred_s, pred_e = self.answer(session, test_x)

        f1 = [
            f1_score(p_batch[i][pred_s[i]:pred_e[i]],
                     p_batch[i][s_label_batch[i]:e_label_batch[i]])
            for i in range(sample)
        ]
        em = [
            exact_match_score(p_batch[i][pred_s[i]:pred_e[i]],
                              p_batch[i][s_label_batch[i]:e_label_batch[i]])
            for i in range(sample)
        ]

        if log:
            logging.info("F1: {}, EM: {}, for {} samples".format(
                f1, em, sample))

        return f1, em
Example #30
    def evaluate_answer(self,
                        session,
                        dataset_train,
                        dataset_val,
                        sample=100,
                        log=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """
        f1, em = 0., 0.
        # Sample each for half of total samples
        feed_data, ground_truth = get_sampled_data(dataset_train,
                                                   dataset_val,
                                                   self.context_length,
                                                   self.question_length,
                                                   sample=sample)

        for i, d in enumerate(feed_data):
            a_s, a_e = self.answer(session, (d[0], d[1]))
            answer = d[0][0].flatten()[int(a_s):int(a_e) + 1].tolist()
            truth = ' '.join([str(s) for s in ground_truth[i]])
            ans = ' '.join([str(s) for s in answer])
            f1 += f1_score(ans, truth) / sample
            if exact_match_score(ans, truth):
                em += 1. / sample

        if log:
            logging.info("F1: {}, EM: {}%, for {} samples".format(
                f1, em * 100, sample))
        return f1, em
Example #31
def search_eval(data, max_span=15, op="+"):
    em = 0.
    f1 = 0.
    for start, end, paragraph, answer in data:
        s, e, prob = 0, 0, 0
        for i in range(len(start)):
            for j in range(min(max_span, len(end) - i)):
                if op == "+":
                    if start[i] + end[i + j] > prob:
                        prob = start[i] + end[i + j]
                        s, e = i, i + j
                if op == "*":
                    if start[i] * end[i + j] > prob:
                        prob = start[i] * end[i + j]
                        s, e = i, i + j
        text_p = " ".join(paragraph[s:e + 1])
        text_g = " ".join(paragraph[answer[0]:answer[1] + 1])
        f1 += f1_score(text_p, text_g)
        em += exact_match_score(text_p, text_g)
    print("search EM: {:.5f}, F1: {:.5f} (max_span={}, op={})".format(
        em / len(data), f1 / len(data), max_span, op))
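
The nested search above can also be vectorized; a minimal NumPy sketch (the `best_span` helper is illustrative and not part of any of these repositories), equivalent to the `op="*"` branch:

import numpy as np


def best_span(start_probs, end_probs, max_span=15):
    # Score every (i, j) pair with i <= j < i + max_span and take the argmax.
    scores = np.outer(start_probs, end_probs)
    keep = np.triu(np.ones_like(scores), k=0) - np.triu(np.ones_like(scores), k=max_span)
    s, e = np.unravel_index(np.argmax(scores * keep), scores.shape)
    return s, e
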
Example #32
    def check_f1_em(self, session, context_path, qn_path, ans_path, dataset, num_samples=100, print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Note: This function is not quite the same as the F1/EM numbers you get from "official_eval" mode.
        This function uses the pre-processed version of the e.g. dev set for speed,
        whereas "official_eval" mode uses the original JSON. Therefore:
          1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
            whereas this function compares to just the first answer (which is what's saved in the preprocessed data)
          2. Our preprocessed version of the dev set is missing some examples
            due to tokenization issues (see squad_preprocess.py).
            "official_eval" includes all examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info("Calculating F1/EM for %s examples in %s set..." % (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        for batch in get_batch_generator(self.word2id, context_path, qn_path, ans_path, self.FLAGS.batch_size, context_len=self.FLAGS.context_len, question_len=self.FLAGS.question_len, discard_long=False):

            pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

            # Convert the start and end positions to lists length batch_size
            pred_start_pos = pred_start_pos.tolist() # list length batch_size
            pred_end_pos = pred_end_pos.tolist() # list length batch_size

            for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start : pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx], batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0], batch.ans_span[ex_idx, 1], pred_ans_start, pred_ans_end, true_answer, pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" % (example_num, dataset, toc-tic))

        return f1_total, em_total
Example #33
    def evaluate_answer(self, session, dataset, answers, rev_vocab,
                        set_name='val', training=False, log=False,
                        sample=(100, 100), sendin=None, ensemble=False):
        """
        Evaluate the model's performance using the harmonic mean of F1 and Exact Match (EM)
        with the set of true answer labels

        This step actually takes quite some time. So we can only sample 100 examples
        from either training or testing set.

        :param session: session should always be centrally managed in train.py
        :param dataset: a representation of our data, in some implementations, you can
                        pass in multiple components (arguments) of one dataset to this function
        :param sample: how many examples in dataset we look at
        :param log: whether we print to std out stream
        :return:
        """

        if not isinstance(rev_vocab, np.ndarray):
            rev_vocab = np.array(rev_vocab)

        if not isinstance(sample, tuple):
            sample = (sample, sample)

        input_batch_size = 100

        if training:
            train_context = dataset['train_context'][:sample[0]]
            train_question = dataset['train_question'][:sample[0]]
            train_answer = answers['raw_train_answer'][:sample[0]]
            train_len = len(train_context)

            if sendin and len(sendin) > 2:
                train_a_s, train_a_e = sendin[0:2]
            else:
                train_a_e = np.array([], dtype=np.int32)
                train_a_s = np.array([], dtype=np.int32)

                for i in tqdm(range(train_len // input_batch_size), desc='training set'):
                    # sys.stdout.write('>>> %d / %d \r'%(i, train_len // input_batch_size))
                    # sys.stdout.flush()
                    train_as, train_ae = self.answer(session,
                                                     train_context[i * input_batch_size:(i + 1) * input_batch_size],
                                                     train_question[i * input_batch_size:(i + 1) * input_batch_size])
                    train_a_s = np.concatenate((train_a_s, train_as), axis=0)
                    train_a_e = np.concatenate((train_a_e, train_ae), axis=0)

            tf1 = 0.
            tem = 0.
            for i, con in enumerate(train_context):
                sys.stdout.write('>>> %d / %d \r' % (i, train_len))
                sys.stdout.flush()
                prediction_ids = con[0][train_a_s[i]: train_a_e[i] + 1]
                prediction = rev_vocab[prediction_ids]
                prediction = ' '.join(prediction)
                # if i < 10:
                #     print('context: {}'.format(con[0]))
                #     print('prediction: {}'.format( prediction))
                #     print(' g-truth:   {}'.format( train_answer[i]))
                #     print('f1_score: {}'.format(f1_score(prediction, train_answer[i])))

                tf1 += f1_score(prediction, train_answer[i])
                tem += exact_match_score(prediction, train_answer[i])

            if log:
                logging.info("Training set ==> F1: {}, EM: {}, for {} samples".
                             format(tf1 / train_len, tem / train_len, train_len))

        # it was set to 1.0
        f1 = 0.0
        em = 0.0
        val_context = dataset[set_name + '_context'][:sample[1]]
        val_question = dataset[set_name + '_question'][:sample[1]]
        # ['Corpus Juris Canonici', 'the Northside', 'Naples', ...]
        val_answer = answers['raw_val_answer'][:sample[1]]

        val_len = len(val_context)
        # logging.info('calculating the validation set predictions.')

        if sendin and len(sendin) > 2:
            val_a_s, val_a_e = sendin[-2:]
        elif sendin:
            val_a_s, val_a_e = sendin
        else:
            val_a_s = np.array([], dtype=np.int32)
            val_a_e = np.array([], dtype=np.int32)
            for i in tqdm(range(val_len // input_batch_size), desc='validation   '):
                # sys.stdout.write('>>> %d / %d \r'%(i, val_len // input_batch_size))
                # sys.stdout.flush()
                a_s, a_e = self.answer(session, val_context[i * input_batch_size:(i + 1) * input_batch_size],
                                       val_question[i * input_batch_size:(i + 1) * input_batch_size])
                val_a_s = np.concatenate((val_a_s, a_s), axis=0)
                val_a_e = np.concatenate((val_a_e, a_e), axis=0)

        # logging.info('getting scores of dev set.')
        for i, con in enumerate(val_context):
            sys.stdout.write('>>> %d / %d \r' % (i, val_len))
            sys.stdout.flush()
            prediction_ids = con[0][val_a_s[i]: val_a_e[i] + 1]
            prediction = rev_vocab[prediction_ids]
            prediction = ' '.join(prediction)
            # if i < 10:
            #     print('context : {}'.format(con[0]))
            #     print('prediction: {}'.format( prediction))
            #     print(' g-truth:   {}'.format( val_answer[i]))
            #     print('f1_score: {}'.format(f1_score(prediction, val_answer[i])))
            f1 += f1_score(prediction, val_answer[i])
            em += exact_match_score(prediction, val_answer[i])

        if log:
            logging.info("Validation   ==> F1: {}, EM: {}, for {} samples".
                         format(f1 / val_len, em / val_len, val_len))
        # pdb.set_trace()

        if ensemble and training:
            return train_a_s, train_a_e, val_a_s, val_a_e
        elif ensemble:
            return val_a_s, val_a_e
        # else:
        #    return , train_a_e, val_a_s, val_a_e
        elif training:
            # tf1/tem only exist when the training-set pass above actually ran
            return tf1 / train_len, tem / train_len, f1 / val_len, em / val_len
        else:
            return f1 / val_len, em / val_len