def label_mp(split):
    """ process the data split with multi-processing"""
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    with mp.Pool() as pool:
        list(pool.imap_unordered(process(split),
                                 list(range(n_data)), chunksize=1024))
    print('finished in {}'.format(timedelta(seconds=time()-start)))
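
# A minimal, self-contained sketch of the worker pattern label_mp relies on:
# Pool.imap_unordered needs a one-argument callable, so process(split) above is
# expected to return one (e.g. process is curried). functools.partial gives the
# same effect here; the names below are illustrative stand-ins, not the
# project's actual helpers.
import multiprocessing as mp
from functools import partial

def _process_one(split, i):
    """Toy stand-in for the real per-example worker."""
    return (split, i * i)

if __name__ == '__main__':
    with mp.Pool() as pool:
        results = list(pool.imap_unordered(partial(_process_one, 'train'),
                                           range(8), chunksize=4))
    print(sorted(results))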
Example #2
def dump(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    dump_dir = join(DATA_DIR, 'refs', split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        abs_sents = data['abstract']
        with open(join(dump_dir, '{}.ref'.format(i)), 'w') as f:
            f.write(make_html_safe('\n'.join(abs_sents)))
    print('finished in {}'.format(timedelta(seconds=time()-start)))
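
# make_html_safe above is not shown in this snippet; it is assumed to escape
# angle brackets so the dumped .ref files do not break ROUGE's HTML-based
# toolkit. A minimal sketch of such a helper under that assumption:
def _make_html_safe_sketch(s):
    """Escape '<' and '>' so text can be embedded in ROUGE's HTML files."""
    return s.replace('<', '&lt;').replace('>', '&gt;')
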
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data,
                                                     100 * i / n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i)), encoding='utf-8') as f:
            data = json.load(f)
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w',
                  encoding='utf-8') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time() - start)))
Example #5
def __init__(self):
    self._path = join(DATA_DIR, 'train')
    self._n_data = count_data(self._path)
Example #6
def main(args):
    print('not using BERT')
    os.makedirs(AFTER_DIR)
    # meta = json.load(open(join(DATA_DIR, 'meta.json')))
    # nargs = meta['net_args']
    # ckpt = load_best_ckpt(DATA_DIR)
    # net = BertMatcher(**nargs)
    # net.load_state_dict(ckpt)
    # if args.cuda:
    #     net = net.cuda()
    # net.eval()
    # tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')
    stopwords = stopwordlist()
    context_path = 'data/class/context'
    context_data = count_data(context_path)
    corpus = []
    new_docid_arr = []
    for i in range(context_data):
        with open(join('data/class/context', '{}.json'.format(i + 1))) as f:
            js_data = json.load(f)
            text = filter_text(js_data['text'].replace(' ', '').replace(
                '&ensp;', '').replace('&rbsp;', '').replace('&mbsp;', ''))
            new_docid = js_data['new_docid']
        data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
        remove = lambda token: False if token in stopwords else True
        data = list(filter(remove, data))
        print(new_docid)
        corpus.append(data)
        new_docid_arr.append(new_docid)
    dictionary = corpora.Dictionary(corpus)
    bm25Model = bm25.BM25(corpus)

    with torch.no_grad():
        for index in range(1643):  # one pass over the 1,643 files in original_test_sample

            with open(
                    join(join('data/final', 'original_test_sample'),
                         '{}.json'.format(index + 1))) as f:
                js_data = json.load(f)
                print('loading: {}'.format(index + 1))
                id, question_text, ques_id = (js_data['id'],
                                              js_data['question'],
                                              js_data['question_id'])

            remove = lambda token: False if token in stopwords else True
            q_data = list(
                jieba.lcut(filter_text(question_text), cut_all=False,
                           HMM=True))
            q_data = list(filter(remove, q_data))
            scores = bm25Model.get_scores(q_data)
            max_num_index_list = map(scores.index, heapq.nlargest(10, scores))
            max_num_index_list = list(max_num_index_list)
            arr = []
            for m in max_num_index_list:
                idx = m
                fname = new_docid_arr[idx]
                arr.append(fname)

            new_corpus = []
            new_new_docid_arr = []

            for con in arr:

                with open(
                        join(join(DATASET_DIR, 'context'),
                             '{}.json'.format(con))) as c:
                    cn_data = json.load(c)
                    co_docid, docid, text = (cn_data['new_docid'],
                                             cn_data['docid'], cn_data['text'])
                    data = list(
                        jieba.lcut(filter_text(text), cut_all=False, HMM=True))
                    remove = lambda token: False if token in stopwords else True
                    data = list(filter(remove, data))

                    new_corpus.append(data)
                    new_new_docid_arr.append(co_docid)

            new_bm25Model = bm25.BM25(new_corpus)
            new_scores = new_bm25Model.get_scores(q_data)
            max_num_index_list = map(new_scores.index,
                                     heapq.nlargest(1, new_scores))
            max_num_index_list = list(max_num_index_list)
            final_docid = new_new_docid_arr[max_num_index_list[0]]
            with open(join('data/class/context',
                           '{}.json'.format(final_docid))) as l:
                cn_data = json.load(l)
                f_new_docid, f_docid, f_text = (cn_data['new_docid'],
                                                cn_data['docid'],
                                                cn_data['text'])

                # text_tok = tokenizer.tokenize(text)
                # text_id = tokenizer.convert_tokens_to_ids(text_tok)
                # text_len = len(text_id)
                #
                # question_len = len(ques_id)
                # if (question_len + text_len <= 512):
                #     concat_text=ques_id+text_id
                #
                #
                #     token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize([concat_text], args.cuda)
                #
                #     fw_args = (token_tensor, segment_tensor, mask_tensor)
                #     net_out = net(*fw_args)
                #
                #     if (net_out[0][0].item() > highest_score[-1]) :
                #         highest_score.clear()
                #         highest_score.append(net_out[0][0].item())
                #         context_new_id.clear()
                #         context_new_id.append(new_docid)
                #         context_id.clear()
                #         context_id.append(docid)
                #         context_content.clear()
                #         context_content.append(text)
                #
                # else:
                #     sp = 0
                #     ep = 412
                #     scores_arr=[]
                #     while (True):
                #         if (ep >= text_len and sp < text_len):
                #             sub_text = text_id[sp:text_len]
                #             concat_text = ques_id + sub_text
                #             token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize([concat_text], args.cuda)
                #
                #             fw_args = (token_tensor, segment_tensor, mask_tensor)
                #             net_out = net(*fw_args)
                #             scores_arr.append(net_out[0][0].item())
                #             sp += 312
                #             ep += 312
                #         else:
                #             if (ep > text_len):
                #                 break
                #             else:
                #                 sub_text = text_id[sp:ep]
                #                 concat_text = ques_id + sub_text
                #                 token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize([concat_text],
                #                                                                                 args.cuda)
                #
                #                 fw_args = (token_tensor, segment_tensor, mask_tensor)
                #                 net_out = net(*fw_args)
                #                 scores_arr.append(net_out[0][0].item())
                #                 sp += 312
                #                 ep += 312
                #     if (max(scores_arr)>highest_score[-1]):
                #         highest_score.clear()
                #         highest_score.append(net_out[0][0].item())
                #         context_new_id.clear()
                #         context_new_id.append(new_docid)
                #         context_id.clear()
                #         context_id.append(docid)
                #         context_content.clear()
                #         context_content.append(text)
            tmp_dict = {}
            tmp_dict['index'] = index + 1
            tmp_dict['id'] = id
            tmp_dict['question'] = question_text
            tmp_dict['new_docid'] = final_docid
            tmp_dict['docid'] = f_docid
            tmp_dict['text'] = f_text
            with open(join(AFTER_DIR, '{}.json'.format(index + 1)),
                      'w',
                      encoding='utf-8') as p:
                json.dump(tmp_dict, p, ensure_ascii=False)

            print('finish processing {}'.format(index + 1))
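
# The retrieval above maps BM25 scores back to document indices with
# map(scores.index, heapq.nlargest(k, scores)). Note that list.index returns
# the first occurrence, so tied scores all collapse to the same index. A
# self-contained sketch of a tie-safe variant (scores below are made up):
import heapq

def _top_k_indices(scores, k):
    """Indices of the k largest scores, keeping distinct positions on ties."""
    return [i for i, _ in heapq.nlargest(k, enumerate(scores), key=lambda p: p[1])]

if __name__ == '__main__':
    _scores = [0.1, 0.9, 0.9, 0.3]
    print(list(map(_scores.index, heapq.nlargest(2, _scores))))  # [1, 1] -- tie collapses
    print(_top_k_indices(_scores, 2))                            # [1, 2]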
Example #7
def main(args):
    print('./MRC_pretrain')
    os.makedirs(AFTER_DIR)
    meta = json.load(open(join(DATA_DIR, 'meta.json')))
    nargs = meta['net_args']
    ckpt = load_best_ckpt(DATA_DIR)
    net = BertMatcher(**nargs)
    net.load_state_dict(ckpt)
    if args.cuda:
        net = net.cuda()
    net.eval()
    tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')
    stopwords = stopwordlist()
    context_path = 'data/class/context'
    context_data = count_data(context_path)
    corpus = []
    new_docid_arr = []
    for i in range(context_data):
        with open(join('data/class/context', '{}.json'.format(i + 1))) as f:
            js_data = json.load(f)
            text = filter_text(js_data['text'].replace(' ', '').replace(
                '&ensp;', '').replace('&rbsp;', '').replace('&mbsp;', ''))
            new_docid = js_data['new_docid']
        data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
        remove = lambda token: False if token in stopwords else True
        data = list(filter(remove, data))
        print(new_docid)
        corpus.append(data)
        new_docid_arr.append(new_docid)
    dictionary = corpora.Dictionary(corpus)
    bm25Model = bm25.BM25(corpus)

    with torch.no_grad():
        for index in range(1643):  # one pass over the 1,643 files in original_test_sample

            with open(
                    join(join('data/final', 'original_test_sample'),
                         '{}.json'.format(index + 1))) as f:
                js_data = json.load(f)
                print('loading: {}'.format(index + 1))
                id, question_text, ques_id = (js_data['id'],
                                              js_data['question'],
                                              js_data['question_id'])

            remove = lambda token: False if token in stopwords else True
            q_data = list(
                jieba.lcut(filter_text(question_text), cut_all=False,
                           HMM=True))
            q_data = list(filter(remove, q_data))
            scores = bm25Model.get_scores(q_data)
            max_num_index_list = map(scores.index, heapq.nlargest(5, scores))
            max_num_index_list = list(max_num_index_list)
            arr = []
            for m in max_num_index_list:
                idx = m
                fname = new_docid_arr[idx]
                arr.append(fname)

            highest_score = []
            context_new_id = []
            context_id = []
            context_content = []
            for con in arr:

                with open(
                        join(join(DATASET_DIR, 'context'),
                             '{}.json'.format(con))) as c:
                    cn_data = json.load(c)
                    new_docid, docid, text = (cn_data['new_docid'],
                                              cn_data['docid'],
                                              cn_data['text'])

                text_tok = tokenizer.tokenize(text)
                text_id = tokenizer.convert_tokens_to_ids(text_tok)
                text_len = len(text_id)

                question_len = len(ques_id)
                if (question_len + text_len <= 512):
                    concat_text = ques_id + text_id

                    token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                        [concat_text], args.cuda)

                    fw_args = (token_tensor, segment_tensor, mask_tensor)
                    net_out = net(*fw_args)

                    # if (net_out[0][0].item() > highest_score[-1]) :
                    if (True):

                        highest_score.append(net_out[0][0].item())

                        context_new_id.append(new_docid)

                        context_id.append(docid)

                        context_content.append(text)

                else:
                    sp = 0    # window start within the passage token ids
                    ep = 412  # 412-token window; the question is assumed to fit in the remaining 100 of BERT's 512
                    scores_arr = []
                    while (True):
                        if (ep >= text_len and sp < text_len):
                            sub_text = text_id[sp:text_len]
                            concat_text = ques_id + sub_text
                            token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                                [concat_text], args.cuda)

                            fw_args = (token_tensor, segment_tensor,
                                       mask_tensor)
                            net_out = net(*fw_args)
                            # scores_arr.append(net_out[0][0].item())
                            output = ''
                            text_tok_arr = text_tok[sp:text_len]
                            for tok in text_tok_arr:
                                output += tok if (tok != '[UNK]') else ''
                            output = output.replace('##', '')
                            print(output)
                            highest_score.append(net_out[0][0].item())
                            context_new_id.append(new_docid)
                            context_id.append(docid)
                            context_content.append(output)
                            sp += 312
                            ep += 312
                        else:
                            if (ep > text_len):
                                break
                            else:
                                sub_text = text_id[sp:ep]
                                concat_text = ques_id + sub_text
                                token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                                    [concat_text], args.cuda)

                                fw_args = (token_tensor, segment_tensor,
                                           mask_tensor)
                                net_out = net(*fw_args)
                                # scores_arr.append(net_out[0][0].item())
                                output = ''
                                text_tok_arr = text_tok[sp:ep]
                                for tok in text_tok_arr:
                                    output += tok if (tok != '[UNK]') else ''
                                output = output.replace('##', '')
                                print(output)
                                highest_score.append(net_out[0][0].item())
                                context_new_id.append(new_docid)
                                context_id.append(docid)
                                context_content.append(output)
                                sp += 312
                                ep += 312
                    # if (max(scores_arr)>highest_score[-1]):
            ranking_index = map(highest_score.index,
                                heapq.nlargest(5, highest_score))
            ranking_index = list(ranking_index)
            fi = ''
            for cnm in ranking_index:
                fi += context_content[cnm]
            tmp_dict = {}
            tmp_dict['index'] = index + 1
            tmp_dict['id'] = id
            tmp_dict['question'] = question_text
            tmp_dict['new_docid'] = context_new_id[0]
            tmp_dict['docid'] = context_id[0]
            tmp_dict['text'] = fi
            with open(join(AFTER_DIR, '{}.json'.format(index + 1)),
                      'w',
                      encoding='utf-8') as f:
                json.dump(tmp_dict, f, ensure_ascii=False)

            print('finish processing {}'.format(index + 1))
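
# The long-passage branch above slides a 412-token window over the passage with
# a 312-token step (so adjacent windows overlap by 100 tokens), keeping
# question + text within BERT's 512-token limit. A simplified, self-contained
# sketch of that windowing idea (not a line-for-line port of the loop above):
def _window_spans(n_tokens, window=412, stride=312):
    """Yield (start, end) spans covering n_tokens with overlapping windows."""
    sp, ep = 0, window
    while sp < n_tokens:
        yield sp, min(ep, n_tokens)
        if ep >= n_tokens:
            break
        sp += stride
        ep += stride

if __name__ == '__main__':
    print(list(_window_spans(1000)))  # [(0, 412), (312, 724), (624, 1000)]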
Example #8
def process_positive_example_v1():
    stopwords = stopwordlist()
    os.makedirs('data/class/pos')
    os.makedirs('data/class/neg')
    context_path = 'data/class/context'
    context_data = count_data(context_path)
    corpus = []
    new_docid_arr = []
    for i in range(context_data):
        with open(join('data/class/context', '{}.json'.format(i + 1))) as f:
            js_data = json.load(f)
            text = filter_text(
                js_data['text'].replace(' ', '').replace('&ensp;', '').replace('&rbsp;', '').replace('&mbsp;', ''))
            new_docid = js_data['new_docid']
        data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
        remove = lambda token: False if token in stopwords else True
        data = list(filter(remove, data))
        print(new_docid)
        corpus.append(data)
        new_docid_arr.append(new_docid)
    dictionary = corpora.Dictionary(corpus)
    bm25Model = bm25.BM25(corpus)

    csv_reader = csv.reader(open(TRAIN_DIR), delimiter='\t')
    rows = [row for row in csv_reader]
    docid_name = rows[0][1]
    question_name = rows[0][2]
    answer_name = rows[0][3]
    json_positive_dirs = join(CLASSIFICATION_DIR, 'positive_sample')
    # if not exists(json_positive_dirs):
    #     os.makedirs(json_positive_dirs)
    #     print('Dir used for positive samples Created ')
    with open(REALATE_DIR, 'rb') as v:
        relation_dict = pickle.load(v)
    sample_rows = rows[:1582] + rows[1583:1955] + rows[1956:3781] + rows[3782:]

    maxlen = 0
    count = 1
    ncount = 1
    right = 0
    for i, sample_raw in enumerate(sample_rows):
        print('loading {}'.format(i))
        if (i == 0):
            continue
        else:
            tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')
            new_docid = relation_dict[sample_raw[1]]
            remove = lambda token: False if token in stopwords else True
            question = filter_text(sample_raw[2].replace(' ', '').replace('&ensp;', ''))
            q_data = list(jieba.lcut(filter_text(question), cut_all=False, HMM=True))
            q_data = list(filter(remove, q_data))
            scores = bm25Model.get_scores(q_data)
            max_num_index_list = map(scores.index, heapq.nlargest(3, scores))
            max_num_index_list = list(max_num_index_list)
            arr = []
            for m in max_num_index_list:
                idx = m
                fname = new_docid_arr[idx]
                arr.append(fname)
            if (not(new_docid in arr)):
                continue
            else:
                right += 1

                for con in arr:
                    with open(join('data/class/context', '{}.json'.format(con))) as c:
                        cn_data = json.load(c)
                        cont_docid, docid, text = (cn_data['new_docid'], cn_data['docid'], cn_data['text'])

                    ques_tok = tokenizer.tokenize("[CLS] " + question + " [SEP]")
                    ques_id = tokenizer.convert_tokens_to_ids(ques_tok)
                    question_len = len(ques_id)

                    text_tok = tokenizer.tokenize(text)
                    text_id = tokenizer.convert_tokens_to_ids(text_tok)
                    text_len = len(text_id)
                    if (con == new_docid):
                        tmp_dict = {}
                        tmp_dict['is_related'] = 1
                        tmp_dict['new_docid'] = new_docid

                        answer = filter_text(sample_raw[3].replace(' ', '').replace('&ensp;', ''))

                        ans_tok = tokenizer.tokenize(answer)
                        ans_id = tokenizer.convert_tokens_to_ids(ans_tok)
                        ans_len = len(ans_id)
                        suppose_start = []  # candidate start positions for the answer span
                        for pos in range(text_len):
                            if (text_id[pos] == ans_id[0]):
                                suppose_start.append(pos)

                        s = 0
                        e = 0
                        if (len(suppose_start) <= 0):
                            continue

                        else:
                            for t in range(len(suppose_start)):
                                start = suppose_start[t]
                                end = suppose_start[t]
                                for m in range(ans_len):
                                    if (m + start >= text_len):
                                        break
                                    elif (ans_id[m] == text_id[m + start]):
                                        end += 1
                                    else:
                                        break
                                if (end - start != ans_len):
                                    continue
                                else:
                                    s = suppose_start[t]
                                    e = end
                                    break
                        if (s == 0 and e == 0):
                            continue
                        else:
                            span_arr = [0] * (s - 0) + [1] * (e - s) + [0] * (text_len - e)

                        if (question_len + text_len <= 512):

                            tmp_dict['question'] = ques_id

                            tmp_dict['text'] = text_id

                            with open(join('data/class/pos', '{}.json'.format(count)), 'w', encoding='utf-8') as f:
                                json.dump(tmp_dict, f, ensure_ascii=False)
                                count += 1
                        else:
                            sp = 0
                            ep = 412
                            assert question_len <= 100 and text_len >= 412
                            while (True):
                                if (ep >= text_len and sp < text_len):

                                    sub_text = text_id[sp:text_len]
                                    tmp_dict['question'] = ques_id

                                    tmp_dict['text'] = sub_text

                                    assert question_len + text_len - sp <= 512

                                    with open(join('data/class/pos', '{}.json'.format(count)), 'w',
                                              encoding='utf-8') as f:
                                        json.dump(tmp_dict, f, ensure_ascii=False)
                                        count += 1

                                    sp += 312
                                    ep += 312
                                # else:
                                #         break
                                else:
                                    if (ep > text_len):
                                        break
                                    else:
                                        sub_text = text_id[sp:ep]
                                        tmp_dict['question'] = ques_id

                                        tmp_dict['text'] = sub_text

                                        assert question_len + ep - sp <= 512

                                        with open(join('data/class/pos', '{}.json'.format(count)), 'w',
                                                  encoding='utf-8') as f:
                                            json.dump(tmp_dict, f, ensure_ascii=False)
                                            count += 1

                                        sp += 312
                                        ep += 312
                    else:
                        tmp_dict = {}  # fresh record; avoid reusing keys left over from a positive sample
                        tmp_dict['is_related'] = 0
                        tmp_dict['new_docid'] = con
                        tmp_dict['question'] = ques_id
                        if (question_len + text_len <= 512):

                            tmp_dict['text'] = text_id

                            with open(join('data/class/neg', '{}.json'.format(ncount)), 'w', encoding='utf-8') as f:
                                json.dump(tmp_dict, f, ensure_ascii=False)
                                ncount += 1

                        else:

                            sp = 0
                            ep = 412
                            assert question_len <= 100 and text_len >= 412
                            while (True):
                                if (ep >= text_len and sp < text_len):

                                    sub_text = text_id[sp:text_len]
                                    tmp_dict['question'] = ques_id

                                    tmp_dict['text'] = sub_text

                                    assert question_len + text_len - sp <= 512

                                    with open(join('data/class/neg', '{}.json'.format(ncount)), 'w',
                                              encoding='utf-8') as f:
                                        json.dump(tmp_dict, f, ensure_ascii=False)
                                        ncount += 1

                                    sp += 312
                                    ep += 312
                                # else:
                                #         break
                                else:
                                    if (ep > text_len):
                                        break
                                    else:
                                        sub_text = text_id[sp:ep]
                                        tmp_dict['question'] = ques_id

                                        tmp_dict['text'] = sub_text

                                        assert question_len + ep - sp <= 512

                                        with open(join('data/class/neg', '{}.json'.format(ncount)), 'w',
                                                  encoding='utf-8') as f:
                                            json.dump(tmp_dict, f, ensure_ascii=False)
                                            ncount += 1

                                        sp += 312
                                        ep += 312

    print('Finished pre-processing {} positive samples'.format(right))
    print(len(sample_rows))
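
# The span search above treats every position where the first answer id occurs
# as a candidate start and accepts it only if the full answer id sequence
# matches from there. A compact, self-contained sketch of the same idea with
# toy ids:
def _find_answer_span(text_id, ans_id):
    """Return (start, end) of the first exact match of ans_id in text_id, else None."""
    n, m = len(text_id), len(ans_id)
    for s in range(n - m + 1):
        if text_id[s:s + m] == ans_id:
            return s, s + m
    return None

if __name__ == '__main__':
    print(_find_answer_span([5, 8, 3, 9, 4, 3, 9], [3, 9, 4]))  # (2, 5)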
Example #9
def __init__(self):
    self._path = join(DATA_DIR, 'train')
    self._n_data = count_data(self._path)
Example #10
def __init__(self, data_dir):
    self._path = os.path.join(data_dir, 'train')
    self._n_data = count_data(self._path)
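
# The __init__ fragments above look like the constructor of a simple dataset
# that stores one numbered .json file per example. A minimal, self-contained
# sketch of such a class under that assumption (count_data and the exact file
# layout are stand-ins for the project's own helpers):
import json
import os

class _JsonDirDataset:
    """Reads {i}.json files from a directory, indexed from 0."""

    def __init__(self, data_dir):
        self._path = data_dir
        self._n_data = len([f for f in os.listdir(data_dir) if f.endswith('.json')])

    def __len__(self):
        return self._n_data

    def __getitem__(self, i):
        with open(os.path.join(self._path, '{}.json'.format(i)), encoding='utf-8') as f:
            return json.load(f)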