Code Example #1
        for qa in paragraph['qas']:
            id_, question = qa['id'], qa['question']
            if args.no_answer:
                answers = ['', '', '']
            else:
                answers = qa['answers']
                answers = [a['text'] for a in answers]
            rows.append((id_, context, question, answers))
    return rows
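
# flatten_json, used just below, is not defined in this snippet. A minimal
# sketch of its assumed behavior -- load a SQuAD/QuAC-style JSON file
# ({'data': [article, ...]}) and collect the rows produced by the per-article
# proc function (later examples use a variant that also returns the contexts):
import json

def flatten_json(data_file, proc_func):
    with open(data_file, encoding='utf8') as f:
        data = json.load(f)['data']
    rows = []
    for ith, article in enumerate(data):
        rows.extend(proc_func(ith, article))
    return rows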


test = flatten_json(test_file, proc_test)
test = pd.DataFrame(test, columns=['id', 'context', 'question', 'answers'])
log.info('test json data flattened.')

testC_iter = (pre_proc(c) for c in test.context)
testQ_iter = (pre_proc(q) for q in test.question)
testC_docs = [
    doc for doc in nlp.pipe(testC_iter, batch_size=64, n_threads=args.threads)
]
testQ_docs = [
    doc for doc in nlp.pipe(testQ_iter, batch_size=64, n_threads=args.threads)
]
testC_unnorm_tokens = [[w.text for w in doc] for doc in testC_docs]
log.info('unnormalized tokens for test is obtained.')
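
# get_context_span, used just below, is another helper that is external to this
# snippet. A minimal sketch of its assumed behavior: locate each unnormalized
# token in the raw context and return its (start, end) character offsets:
def get_context_span(context, context_tokens):
    spans, p = [], 0
    for token in context_tokens:
        start = context.find(token, p)
        if start < 0:
            # alignment failure; the real helper may log or skip such tokens
            spans.append((None, None))
            continue
        end = start + len(token)
        spans.append((start, end))
        p = end
    return spans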

test['context_span'] = [
    get_context_span(a, b) for a, b in zip(test.context, testC_unnorm_tokens)
]
log.info('context span for test is generated.')
Code Example #2
        if j > 0:
            q_text = article['answers'][j - 1]['input_text'] + " // " + q_text

        rows.append(
            (ith, q_text, answer, answer_start, answer_end, rationale, rationale_start, rationale_end, answer_choice))
    return rows, context


train, train_context = flatten_json(trn_file, proc_train)
train = pd.DataFrame(train, columns=['context_idx', 'question', 'answer', 'answer_start', 'answer_end', 'rationale',
                                     'rationale_start', 'rationale_end', 'answer_choice'])
log.info('train json data flattened.')

# print(train)

trC_iter = (pre_proc(c) for c in train_context)
trQ_iter = (pre_proc(q) for q in train.question)
trC_docs = [doc for doc in nlp.pipe(trC_iter, batch_size=64, n_threads=args.threads)]
trQ_docs = [doc for doc in nlp.pipe(trQ_iter, batch_size=64, n_threads=args.threads)]

# tokens
trC_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc] for doc in trC_docs]
trQ_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc] for doc in trQ_docs]
trC_unnorm_tokens = [[re.sub(r'_', ' ', w.text) for w in doc] for doc in trC_docs]
log.info('All tokens for training are obtained.')

train_context_span = [get_context_span(a, b) for a, b in zip(train_context, trC_unnorm_tokens)]
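
# find_answer_span, used in the loop below, is also external to this snippet.
# A minimal sketch under the assumption that it maps character offsets onto
# token indices via the (start, end) spans produced by get_context_span,
# returning None for answers that cannot be aligned (later examples drop such
# rows with dropna):
def find_answer_span(context_span, ans_st, ans_end):
    if ans_st < 0 or ans_end < 0:  # e.g. CANNOTANSWER rows
        return -1, -1
    start_token, end_token = None, None
    for i, (st, en) in enumerate(context_span):
        if st is None:
            continue
        if start_token is None and st <= ans_st < en:
            start_token = i
        if st < ans_end <= en:
            end_token = i
    return start_token, end_token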

ans_st_token_ls, ans_end_token_ls = [], []
for ans_st, ans_end, idx in zip(train.answer_start, train.answer_end, train.context_idx):
    ans_st_token, ans_end_token = find_answer_span(train_context_span[idx], ans_st, ans_end)
Code Example #3
                    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)

log.info('start data preparing... (using {} threads)'.format(args.threads))

glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
log.info('glove loaded.')

#===============================================================
#=================== Work on training data =====================
#===============================================================

train = process_jsonlines(trn_file)
log.info('train jsonline data flattened.')
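
# pre_proc and normalize_text are shared preprocessing helpers that these
# snippets import from elsewhere. A minimal sketch of what they are assumed to
# do: pre_proc performs light, length-preserving character cleanup before spaCy
# tokenization, and normalize_text applies unicode normalization per token:
import re
import unicodedata

def pre_proc(text):
    # replace hyphen-like characters with a space of the same width so that
    # character offsets into the original text remain usable
    return re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', ' ', text)

def normalize_text(text):
    return unicodedata.normalize('NFD', text)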

trP_iter = (pre_proc(p) for p in train.P)
trH_iter = (pre_proc(h) for h in train.H)
trP_docs = [
    doc for doc in nlp.pipe(trP_iter, batch_size=64, n_threads=args.threads)
]
trH_docs = [
    doc for doc in nlp.pipe(trH_iter, batch_size=64, n_threads=args.threads)
]

# tokens
trP_tokens = [[normalize_text(w.text) for w in doc] for doc in trP_docs]
trH_tokens = [[normalize_text(w.text) for w in doc] for doc in trH_docs]
log.info('All tokens for training are obtained.')

# features
trP_tags, trP_ents, trP_features = feature_gen(trP_docs, trH_docs)
Code Example #4
def preprocess_eval_data(filename, output_msgpack):
    EvalData = process_jsonlines(filename)

    filename = os.path.basename(filename)
    log.info(filename + ' flattened.')

    EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
    EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
    EvalDataP_docs = [
        doc for doc in nlp.pipe(
            EvalDataP_iter, batch_size=64, n_threads=args.threads)
    ]
    EvalDataH_docs = [
        doc for doc in nlp.pipe(
            EvalDataH_iter, batch_size=64, n_threads=args.threads)
    ]

    # tokens
    EvalDataP_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataP_docs]
    EvalDataH_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataH_docs]
    log.info('All tokens for ' + filename + ' are obtained.')

    # features
    EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(
        EvalDataP_docs, EvalDataH_docs)
    EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(
        EvalDataH_docs, EvalDataP_docs)
    log.info('features for ' + filename + ' is generated.')

    def build_EvalData_vocab(A, B):  # most vocabulary comes from tr_vocab
        existing_vocab = set(tr_vocab)
        new_vocab = list(
            set([
                w for doc in A + B for w in doc
                if w not in existing_vocab and w in glove_vocab
            ]))
        vocab = tr_vocab + new_vocab
        log.info('train vocab {0}, total vocab {1}'.format(
            len(tr_vocab), len(vocab)))
        return vocab

    # vocab
    EvalData_vocab = build_EvalData_vocab(
        EvalDataP_tokens,
        EvalDataH_tokens)  # tr_vocab is a subset of EvalData_vocab
    EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
    EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)

    # tags
    EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
    EvalDataH_tag_ids = token2id(EvalDataH_tags,
                                 vocab_tag)  # vocab_tag same as training

    # entities
    EvalDataP_ent_ids = token2id(EvalDataP_ents,
                                 vocab_ent)  # vocab_ent same as training
    EvalDataH_ent_ids = token2id(EvalDataH_ents,
                                 vocab_ent)  # vocab_ent same as training
    log.info('vocabulary for ' + filename + ' is built.')

    EvalData_embedding = build_embedding(
        wv_file, EvalData_vocab,
        wv_dim)  # tr_embedding is a submatrix of EvalData_embedding
    log.info('got embedding matrix for ' + filename)

    result = {
        'premise_ids': EvalDataP_ids,
        'premise_features': EvalDataP_features,  # exact match, tf
        'premise_tags': EvalDataP_tag_ids,  # POS tagging
        'premise_ents': EvalDataP_ent_ids,  # Entity recognition
        'hypothesis_ids': EvalDataH_ids,
        'hypothesis_features': EvalDataH_features,  # exact match, tf
        'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
        'hypothesis_ents': EvalDataH_ent_ids,  # Entity recognition
        'vocab': EvalData_vocab,
        'embedding': EvalData_embedding.tolist(),
        'answers': EvalData.label
    }
    with open(output_msgpack, 'wb') as f:
        msgpack.dump(result, f)

    log.info('saved ' + output_msgpack + ' to disk.')
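
# token2id is a shared helper as well. A minimal sketch, assuming it maps
# nested token lists to vocabulary indices and uses `unk_id` for
# out-of-vocabulary tokens:
def token2id(docs, vocab, unk_id=None):
    w2id = {w: i for i, w in enumerate(vocab)}
    return [[w2id.get(w, unk_id) for w in doc] for doc in docs]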
Code Example #5
    # tokens_tensor = torch.tensor(tokens_tensor)
    return tokens_tensor


# train holds all the QA pairs, len=83568 (rows from every article are joined with extend); train_context holds the corresponding contexts, len=11567
train, train_context = flatten_json(trn_file, proc_train)

train = pd.DataFrame(train,
                     columns=[
                         'context_idx', 'question', 'answer', 'answer_start',
                         'answer_end', 'answer_choice'
                     ])

log.info('train json data flattened.')

trC_iter = (pre_proc(c) for c in train_context)
trQ_iter = (pre_proc(q) for q in train.question)
trC_docs = [
    doc for doc in nlp.pipe(trC_iter, batch_size=64, n_threads=args.threads)
]
trQ_docs = [
    doc for doc in nlp.pipe(trQ_iter, batch_size=64, n_threads=args.threads)
]
trC_docs_forbert = [pre_proc(c) for c in train_context]
trQ_docs_forbert = [pre_proc(q) for q in train.question]
# tokens
trC_tokens = [[normalize_text(w.text) for w in doc] for doc in trC_docs]
trQ_tokens = [[normalize_text(w.text) for w in doc] for doc in trQ_docs]
trC_unnorm_tokens = [[w.text for w in doc] for doc in trC_docs]
log.info('All tokens for training are obtained.')
# get the character-level (start, end) span of every word and punctuation token in the context (everything except whitespace)
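
# The truncated function at the top of this example returns `tokens_tensor`,
# and trC_docs_forbert / trQ_docs_forbert keep the pre-processed raw strings
# for BERT. A sketch of how such a tensor is commonly built (the helper name
# and the use of pytorch-pretrained-bert's BertTokenizer are assumptions):
import torch
from pytorch_pretrained_bert import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def text_to_bert_tensor(text):
    tokens = ['[CLS]'] + bert_tokenizer.tokenize(text) + ['[SEP]']
    ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    return torch.tensor([ids])  # shape: (1, seq_len)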
Code Example #6
def proc_train(ith, article, graph_arr):
    rows = []

    for paragraph in article['paragraphs']:
        # this loop only runs once (one paragraph per article)
        context = paragraph['context']
        node_num = len(graph_arr['nodes'])
        # for g in graph_arr['']:
        #     node_num += len(g['nodes'])

        node_arr = []
        assert len(graph_arr['nodes']) == len(graph_arr['edges'])
        edge_arr = [[0 for ei in range(node_num)] for w0 in range(node_num)]
        for node_index, node in enumerate(graph_arr['nodes']):
            tmp_edge = graph_arr['edges'][node_index]

            node_arr.append(nlp(pre_proc(node['word'])))

            for ed_idx, edge in enumerate(tmp_edge):

                if (edge == '' or edge == 'SELF'):
                    continue
                else:
                    edge_arr[node_index][ed_idx] = 1
        new_node_arr = node_arr
        new_edge_arr = edge_arr

        # for ei, e in enumerate(edge_arr):
        #     if np.sum(e) == 0:
        #         continue
        #     else:
        #         new_node_arr.append(node_arr[ei])
        #         new_edge_arr.append(e)
        # graph=dict()
        # current_ptr=len(graph_arr)
        # node_num=0
        # for g in graph_arr:
        #     node_num+=len(g['nodes'])
        # node_arr=[]
        # edge_arr=[[0 for ei in range(node_num)] for w0 in range(node_num)]
        # for curr in range(current_ptr):
        #     curr_graph=graph_arr[curr]
        #     curr_node = curr_graph['nodes']
        #     curr_node_num=len(curr_node)
        #     acc_curr_node_idx=[len(node_arr)+id for id in range(curr_node_num)]
        #     curr_edges=curr_graph['edges'] #arr[[]]
        #     for node_detail in curr_node:
        #         node_arr.append(nlp(pre_proc(node_detail['word'])))
        #     for ed_idx, edge in enumerate(curr_edges):
        #         #edge[]
        #         acc_edge_idx=acc_curr_node_idx[ed_idx]  # the actual index of this node
        #         for ci in range(curr_node_num):
        #             if(edge[ci]=='' or edge[ci]=='SELF' ):
        #                 continue
        #             else:
        #                 edge_arr[acc_edge_idx][acc_curr_node_idx[ci]]=1

        # graph['nodes']=node_arr
        # graph['edges'] = edge_arr
        for qa in paragraph['qas']:
            question = qa['question']
            answers = qa['orig_answer']

            answer = answers['text']
            answer_start = answers['answer_start']
            answer_end = answers['answer_start'] + len(answers['text'])
            answer_choice = 0 if answer == 'CANNOTANSWER' else\
                            1 if qa['yesno'] == 'y' else\
                            2 if qa['yesno'] == 'n' else\
                            3 # Not a yes/no question
            if answer_choice != 0:
                """
                0: Do not ask a follow up question!
                1: Definitely ask a follow up question!
                2: Not too important, but you can ask a follow up.
                """
                answer_choice += 10 * (0 if qa['followup'] == "n" else\
                                       1 if qa['followup'] == "y" else\
                                       2)
            else:
                answer_start, answer_end = -1, -1
            rows.append((ith, question, answer, answer_start, answer_end,
                         answer_choice))  # append this QA example as one row tuple
    return rows, context, new_node_arr, new_edge_arr
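
# answer_choice packs two labels into one integer: (answer_choice % 10) is the
# answer type (0 = CANNOTANSWER, 1 = yes, 2 = no, 3 = span answer) and
# (answer_choice // 10) is the follow-up label (0 = "n", 1 = "y", 2 = other).
# A small decoding sketch:
def decode_answer_choice(answer_choice):
    answer_type = answer_choice % 10
    followup = answer_choice // 10
    return answer_type, followup

# e.g. decode_answer_choice(13) == (3, 1): a span answer whose turn was marked
# "definitely ask a follow up question".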
Code Example #7
def proc_dev(ith, article, graph_arr):
    rows = []

    for paragraph in article['paragraphs']:
        context = paragraph['context']
        # current_ptr = len(graph_arr)
        node_num = len(graph_arr['nodes'])
        # for g in graph_arr['']:
        #     node_num += len(g['nodes'])

        node_arr = []
        assert len(graph_arr['nodes']) == len(graph_arr['edges'])
        edge_arr = [[0 for ei in range(node_num)] for w0 in range(node_num)]
        for node_index, node in enumerate(graph_arr['nodes']):
            node_arr.append(nlp(pre_proc(node['word'])))
            tmp_edge = graph_arr['edges'][node_index]
            for ed_idx, edge in enumerate(tmp_edge):

                if (edge == '' or edge == 'SELF'):
                    continue
                else:
                    edge_arr[node_index][ed_idx] = 1
        new_node_arr = node_arr
        new_edge_arr = edge_arr
        # for ei, e in enumerate(edge_arr):
        #     if np.sum(e) == 0:
        #         continue
        #     else:
        #         new_node_arr.append(node_arr[ei])
        #         new_edge_arr.append(e)

        for qa in paragraph['qas']:
            question = qa['question']
            answers = qa['orig_answer']

            answer = answers['text']
            answer_start = answers['answer_start']
            answer_end = answers['answer_start'] + len(answers['text'])
            answer_choice = 0 if answer == 'CANNOTANSWER' else\
                            1 if qa['yesno'] == 'y' else\
                            2 if qa['yesno'] == 'n' else\
                            3 # Not a yes/no question
            if answer_choice != 0:
                """
                0: Do not ask a follow up question!
                1: Definitely ask a follow up question!
                2: Not too important, but you can ask a follow up.
                """
                answer_choice += 10 * (0 if qa['followup'] == "n" else\
                                       1 if qa['followup'] == "y" else\
                                       2)
            else:
                answer_start, answer_end = -1, -1

            ans_ls = []
            for ans in qa['answers']:
                ans_ls.append(ans['text'])

            rows.append((ith, question, answer, answer_start, answer_end,
                         answer_choice, ans_ls))
    return rows, context, new_node_arr, new_edge_arr
Code Example #8
File: test_e2e.py  Project: Yash-5/FlowQA
def preprocess_data(dev_file):
    dev, dev_context = flatten_json(dev_file, proc_dev)

    dev = pd.DataFrame(dev,
                       columns=[
                           'context_idx', 'question', 'answer', 'answer_start',
                           'answer_end', 'answer_choice', 'all_answer', 'qid'
                       ])
    print('dev json data flattened.')

    devC_iter = (pre_proc(c) for c in dev_context)
    devQ_iter = (pre_proc(q) for q in dev.question)
    nlp = spacy.load('en', disable=['parser'])
    devC_docs = [
        doc for doc in nlp.pipe(
            devC_iter, batch_size=64, n_threads=multiprocessing.cpu_count())
    ]
    devQ_docs = [
        doc for doc in nlp.pipe(
            devQ_iter, batch_size=64, n_threads=multiprocessing.cpu_count())
    ]
    del nlp

    devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs]
    devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs]
    devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs]
    print('All tokens for dev are obtained.')

    dev_context_span = [
        get_context_span(a, b) for a, b in zip(dev_context, devC_unnorm_tokens)
    ]
    print('context span for dev is generated.')

    ans_st_token_ls, ans_end_token_ls = [], []
    for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end,
                                    dev.context_idx):
        ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx],
                                                       ans_st, ans_end)
        ans_st_token_ls.append(ans_st_token)
        ans_end_token_ls.append(ans_end_token)

    dev['answer_start_token'], dev[
        'answer_end_token'] = ans_st_token_ls, ans_end_token_ls
    initial_len = len(dev)
    dev.dropna(inplace=True)  # modify self DataFrame
    print('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev),
                                                      initial_len))
    print('answer span for dev is generated.')

    devC_tags, devC_ents, devC_features = feature_gen(devC_docs,
                                                      dev.context_idx,
                                                      devQ_docs, False)
    print('features for dev is generated: {}, {}, {}'.format(
        len(devC_tags), len(devC_ents), len(devC_features)))

    dev_vocab = build_dev_vocab(
        devQ_tokens, devC_tokens)  # tr_vocab is a subset of dev_vocab
    devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
    devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
    devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]
    devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]

    # BERT stuff
    devC_bert_tokens = tokenize(devC_tokens)
    devC_bert_ids = [bert_tokens_to_ids(x) for x in devC_bert_tokens]
    devQ_bert_tokens = tokenize(devQ_tokens)
    devQ_bert_ids = [bert_tokens_to_ids(x) for x in devQ_bert_tokens]

    devC_bert_spans = [
        calc_bert_spans(b, t) for b, t in zip(devC_bert_tokens, devC_tokens)
    ]
    devQ_bert_spans = [
        calc_bert_spans(b, t) for b, t in zip(devQ_bert_tokens, devQ_tokens)
    ]

    vocab_tag = pickle.load(open('./vocab_tag.pkl', 'rb'))
    vocab_ent = pickle.load(open('./vocab_ent.pkl', 'rb'))

    devC_tag_ids = token2id(devC_tags, vocab_tag)  # vocab_tag same as training
    # entities
    devC_ent_ids = token2id(devC_ents, vocab_ent,
                            unk_id=0)  # vocab_ent same as training
    print('vocabulary for dev is built.')

    dev_embedding = build_embedding('glove/glove.840B.300d.txt', dev_vocab,
                                    300)

    meta = {'vocab': dev_vocab, 'embedding': dev_embedding.tolist()}

    prev_CID, first_question = -1, []
    for i, CID in enumerate(dev.context_idx):
        if not (CID == prev_CID):
            first_question.append(i)
        prev_CID = CID

    result = {
        'qids': dev.qid.tolist(),
        'question_ids': devQ_ids,
        'context_ids': devC_ids,
        'context_features': devC_features,  # exact match, tf
        'context_tags': devC_tag_ids,  # POS tagging
        'context_ents': devC_ent_ids,  # Entity recognition
        'context': dev_context,
        'context_span': dev_context_span,
        '1st_question': first_question,
        'question_CID': dev.context_idx.tolist(),
        'question': dev.question.tolist(),
        'answer': dev.answer.tolist(),
        'answer_start': dev.answer_start_token.tolist(),
        'answer_end': dev.answer_end_token.tolist(),
        'answer_choice': dev.answer_choice.tolist(),
        'all_answer': dev.all_answer.tolist(),
        'context_tokenized': devC_tokens,
        'question_tokenized': devQ_tokens,
        'context_bertidx': devC_bert_ids,
        'context_bert_spans': devC_bert_spans,
        'question_bertidx': devQ_bert_ids,
        'question_bert_spans': devQ_bert_spans
    }

    return meta, result
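
# tokenize, bert_tokens_to_ids and calc_bert_spans wrap a BERT word-piece
# tokenizer; they are not defined in this snippet. A minimal sketch of the span
# computation, under the assumption that the word-piece sequence is built token
# by token, so each original token maps to a contiguous range of word-piece
# indices (bert_tok is a hypothetical BertTokenizer instance):
def calc_bert_spans_sketch(tokens, bert_tok):
    pieces, spans, ptr = [], [], 0
    for tok in tokens:
        wp = bert_tok.tokenize(tok) or ['[UNK]']
        pieces.extend(wp)
        spans.append((ptr, ptr + len(wp)))
        ptr += len(wp)
    return pieces, spans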
Code Example #9
def build_test_data(opt, dev_file, vocab):

    # random.seed(args.seed)
    # np.random.seed(args.seed)

    # logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG,
    #                     datefmt='%m/%d/%Y %I:%M:%S')
    log = logging.getLogger(__name__)
    # tags
    vocab_tag = [''] + list(nlp.tagger.labels)

    # entities
    # log.info('start data preparing... (using {} threads)'.format(args.threads))

    # glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
    # log.info('glove loaded.')

    def proc_dev(ith, article):
        rows = []
        context = article['story']

        for j, (question, answers) in enumerate(
                zip(article['questions'], article['answers'])):
            gold_answer = answers['input_text']
            span_answer = answers['span_text']

            answer, char_i, char_j = free_text_to_span(gold_answer,
                                                       span_answer)
            answer_choice = 0 if answer == '__NA__' else \
                1 if answer == '__YES__' else \
                    2 if answer == '__NO__' else \
                        3  # Not a yes/no question

            if answer_choice == 3:
                answer_start = answers['span_start'] + char_i
                answer_end = answers['span_start'] + char_j
            else:
                answer_start, answer_end = -1, -1

            rationale = answers['span_text']
            rationale_start = answers['span_start']
            rationale_end = answers['span_end']

            q_text = question['input_text']
            if j > 0:
                q_text = article['answers'][j -
                                            1]['input_text'] + " // " + q_text

            rows.append(
                (ith, q_text, answer, answer_start, answer_end, rationale,
                 rationale_start, rationale_end, answer_choice))
        return rows, context

    dev, dev_context = flatten_json(dev_file, proc_dev)
    dev = pd.DataFrame(dev,
                       columns=[
                           'context_idx', 'question', 'answer', 'answer_start',
                           'answer_end', 'rationale', 'rationale_start',
                           'rationale_end', 'answer_choice'
                       ])
    # log.info('dev json data flattened.')

    # print(dev)

    devC_iter = (pre_proc(c) for c in dev_context)
    devQ_iter = (pre_proc(q) for q in dev.question)
    devC_docs = [
        doc
        for doc in nlp.pipe(devC_iter, batch_size=64, n_threads=args.threads)
    ]
    devQ_docs = [
        doc
        for doc in nlp.pipe(devQ_iter, batch_size=64, n_threads=args.threads)
    ]

    # tokens
    devC_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc]
                   for doc in devC_docs]
    devQ_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc]
                   for doc in devQ_docs]
    devC_unnorm_tokens = [[re.sub(r'_', ' ', w.text) for w in doc]
                          for doc in devC_docs]
    # log.info('All tokens for dev are obtained.')

    dev_context_span = [
        get_context_span(a, b) for a, b in zip(dev_context, devC_unnorm_tokens)
    ]
    # log.info('context span for dev is generated.')

    ans_st_token_ls, ans_end_token_ls = [], []
    for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end,
                                    dev.context_idx):
        ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx],
                                                       ans_st, ans_end)
        ans_st_token_ls.append(ans_st_token)
        ans_end_token_ls.append(ans_end_token)

    ration_st_token_ls, ration_end_token_ls = [], []
    for ration_st, ration_end, idx in zip(dev.rationale_start,
                                          dev.rationale_end, dev.context_idx):
        ration_st_token, ration_end_token = find_answer_span(
            dev_context_span[idx], ration_st, ration_end)
        ration_st_token_ls.append(ration_st_token)
        ration_end_token_ls.append(ration_end_token)

    dev['answer_start_token'], dev[
        'answer_end_token'] = ans_st_token_ls, ans_end_token_ls
    dev['rationale_start_token'], dev[
        'rationale_end_token'] = ration_st_token_ls, ration_end_token_ls

    initial_len = len(dev)
    dev.dropna(inplace=True)  # modify self DataFrame
    # log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len))
    # log.info('answer span for dev is generated.')

    # features
    devC_tags, devC_ents, devC_features = feature_gen(devC_docs,
                                                      dev.context_idx,
                                                      devQ_docs, args.no_match)
    # log.info('features for dev is generated: {}, {}, {}'.format(len(devC_tags), len(devC_ents), len(devC_features)))
    vocab_ent = list(set([ent for sent in devC_ents for ent in sent]))

    # vocab
    dev_vocab = vocab  # tr_vocab is a subset of dev_vocab
    devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
    devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
    devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]
    devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]
    # print(devQ_ids[:10])
    # tags
    devC_tag_ids = token2id(devC_tags, vocab_tag)  # vocab_tag same as training
    # entities
    devC_ent_ids = token2id(devC_ents, vocab_ent,
                            unk_id=0)  # vocab_ent same as training
    # log.info('vocabulary for dev is built.')

    prev_CID, first_question = -1, []
    for i, CID in enumerate(dev.context_idx):
        if not (CID == prev_CID):
            first_question.append(i)
        prev_CID = CID

    data = {
        'question_ids': devQ_ids,
        'context_ids': devC_ids,
        'context_features': devC_features,  # exact match, tf
        'context_tags': devC_tag_ids,  # POS tagging
        'context_ents': devC_ent_ids,  # Entity recognition
        'context': dev_context,
        'context_span': dev_context_span,
        '1st_question': first_question,
        'question_CID': dev.context_idx.tolist(),
        'question': dev.question.tolist(),
        'answer': dev.answer.tolist(),
        'answer_start': dev.answer_start_token.tolist(),
        'answer_end': dev.answer_end_token.tolist(),
        'rationale_start': dev.rationale_start_token.tolist(),
        'rationale_end': dev.rationale_end_token.tolist(),
        'answer_choice': dev.answer_choice.tolist(),
        'context_tokenized': devC_tokens,
        'question_tokenized': devQ_tokens
    }
    # with open('CoQA/test_data.msgpack', 'wb') as f:
    #     msgpack.dump(result, f)

    # log.info('saved test to disk.')
    dev = {
        'context':
        list(
            zip(data['context_ids'], data['context_tags'],
                data['context_ents'], data['context'], data['context_span'],
                data['1st_question'], data['context_tokenized'])),
        'qa':
        list(
            zip(data['question_CID'], data['question_ids'],
                data['context_features'], data['answer_start'],
                data['answer_end'], data['rationale_start'],
                data['rationale_end'], data['answer_choice'], data['question'],
                data['answer'], data['question_tokenized']))
    }
    print("test_data built")
    # embedding = torch.Tensor(meta['embedding'])
    return dev
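
# free_text_to_span (used in proc_dev above) is defined elsewhere in the
# project. In CoQA-style preprocessing it is expected to find, inside the
# rationale span_text, the sub-span that best matches the free-form gold answer
# and return (answer_text, char_start, char_end); the comparisons just above
# suggest it can also pass through sentinels such as '__NA__' / '__YES__' /
# '__NO__'. A rough sketch of the matching step using token-overlap F1
# (quadratic in the rationale length):
import re
from collections import Counter

def free_text_to_span_sketch(free_text, full_text):
    free_tokens = free_text.lower().split()
    matches = list(re.finditer(r'\S+', full_text))
    best, best_f1 = (full_text, 0, len(full_text)), 0.0
    for i in range(len(matches)):
        for j in range(i, len(matches)):
            cand = [m.group().lower() for m in matches[i:j + 1]]
            common = sum((Counter(cand) & Counter(free_tokens)).values())
            if common == 0:
                continue
            p, r = common / len(cand), common / max(len(free_tokens), 1)
            f1 = 2 * p * r / (p + r)
            if f1 > best_f1:
                st, en = matches[i].start(), matches[j].end()
                best, best_f1 = (full_text[st:en], st, en), f1
    return best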