Example #1
def generate_doc_data(path, files):
    paths = [w.strip() for w in open(files).readlines()]
    docs = []
    done_num = 0
    for p in paths:
        if p.strip().endswith("DS_Store"): continue
        done_num += 1
        file_name = p.strip()
        if file_name.endswith("onf"):
            if args.reduced == 1 and done_num >= 30: break
            doc = get_info_from_file(file_name, 2)
            docs.append(doc)
    return docs
def generate_vector(path, files):
    read_f = open("./data/emb", "rb")
    embedding, words, wd = cPickle.load(read_f)
    read_f.close()

    paths = [w.strip() for w in open(files).readlines()]

    #paths = utils.get_file_name(path,[])

    total_sentence_num = 0
    vectorized_sentences = []
    zp_info = []

    startt = timeit.default_timer()
    done_num = 0
    for p in paths:
        if p.strip().endswith("DS_Store"): continue
        done_num += 1
        file_name = p.strip()
        if file_name.endswith("onf"):

            if args.reduced == 1 and done_num >= 3: break

            zps, azps, candi, nodes_info = get_info_from_file(file_name, 2)
            anaphorics = []
            ana_zps = []
            for (zp_sentence_index, zp_index, antecedents, coref_id) in azps:
                for (candi_sentence_index, begin_word_index, end_word_index,
                     coref_id) in antecedents:
                    anaphorics.append(
                        (zp_sentence_index, zp_index, candi_sentence_index,
                         begin_word_index, end_word_index))
                    ana_zps.append((zp_sentence_index, zp_index))

            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                vectorize_words = list_vectorize(wl, words)
                vectorized_sentences.append(vectorize_words)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1

            for (sentence_index, zp_index) in zps:
                ana = 0
                if (sentence_index, zp_index) in ana_zps:
                    ana = 1
                index_in_file = si2reali[sentence_index]
                zp = (index_in_file, sentence_index, zp_index, ana)
                zp_nl, zp_wl = nodes_info[sentence_index]

                candi_info = []
                if ana == 1:
                    for ci in range(max(0, sentence_index - 2),
                                    sentence_index + 1):
                        candi_sentence_index = ci
                        candi_nl, candi_wl = nodes_info[candi_sentence_index]

                        for (candi_begin,
                             candi_end) in candi[candi_sentence_index]:
                            if ci == sentence_index and candi_end > zp_index:
                                continue
                            res = 0
                            if (sentence_index, zp_index, candi_sentence_index,
                                    candi_begin, candi_end) in anaphorics:
                                res = 1
                            candi_index_in_file = si2reali[
                                candi_sentence_index]

                            ifl = get_fl(
                                (sentence_index, zp_index),
                                (candi_sentence_index, candi_begin, candi_end),
                                zp_wl, candi_wl, wd)

                            candidate = (candi_index_in_file,
                                         candi_sentence_index, candi_begin,
                                         candi_end, res, -res, ifl)
                            candi_info.append(candidate)
                zp_info.append((zp, candi_info))

    endt = timeit.default_timer()
    print >> sys.stderr
    print >> sys.stderr, "Total use %.3f seconds for Data Generating" % (
        endt - startt)
    vectorized_sentences = numpy.array(vectorized_sentences)
    return zp_info, vectorized_sentences
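A minimal usage sketch, not part of the original snippet: the file paths are placeholders, and args, get_info_from_file, list_vectorize and get_fl are assumed to be provided by the surrounding project. It only illustrates the shape of the returned zp_info and vectorized_sentences.

# Hypothetical usage; the paths and the surrounding helpers are assumptions.
zp_info, vectorized_sentences = generate_vector("./data/train/", "./data/train_list")
for zp, candi_info in zp_info:
    index_in_file, sentence_index, zp_index, ana = zp  # ana == 1 for anaphoric ZPs
    for (candi_index_in_file, candi_sentence_index, candi_begin, candi_end,
         res, neg_res, ifl) in candi_info:
        if res == 1:
            # gold antecedent span (word indices into the candidate sentence)
            pass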
def generate_data(files):
    paths = [w.strip() for w in open(files).readlines()]

    total_sentence_num = 0
    sentences = []
    sentences_ori = []
    noun_phrases = []
    zp_info = defaultdict(list)

    azp_in_np, azp_total = 0.0, 0.0
    zp_anaph, zp_total = 0.0, 0.0

    startt = timeit.default_timer()
    for p in paths:
        if p.strip().endswith("DS_Store"): continue
        file_name = p.strip()
        if file_name.endswith("onf"):
            #file_name += "_autotree"
            zps, azps, nps, nodes = get_info_from_file(file_name, 2)

            # generate mappings, store sentences
            senti2globalsenti = {}  # sentence id mapping from local file to global
            wi2realwi = {}  # for each sentence k, word id mapping from with-ZP to without-ZP indices
            for k in nodes.keys():
                senti2globalsenti[k] = total_sentence_num
                total_sentence_num += 1
                nl, wl = nodes[k]
                wi2realwi[total_sentence_num - 1] = {}
                realwl = []
                i2 = 0
                for i1, w in enumerate(wl):
                    w = w.word
                    if not is_zp(w):
                        wi2realwi[total_sentence_num - 1][i1] = i2
                        i2 += 1
                        realwl.append(w)
                sentences.append(realwl)
                sentences_ori.append([w.word for w in wl])

            # generate NP information
            for k in nps.keys():
                nps_new = []
                cur_sentence_num = senti2globalsenti[k]
                # A B *pro* [ *OP* C D *pro* ] *OP* E F
                for (st_index, ed_index) in nps[k]:
                    #print ' '.join(sentences_ori[cur_sentence_num][st_index:ed_index+1]).decode('utf-8')
                    st = get_prev_index(st_index,
                                        wi2realwi[cur_sentence_num]) + 1
                    ed = get_prev_index(ed_index + 1,
                                        wi2realwi[cur_sentence_num])
                    #print ' '.join(sentences[cur_sentence_num][st:ed+1]).decode('utf-8')
                    #print '====='
                    nps_new.append((st, ed))
                noun_phrases.append(nps_new)

            # generate zp information
            zp2ana = {}  # (zp-sent, zp) ==> list of (candi-sent, candi-begin, candi-end)
            for (zp_sent_index, zp_index, antecedents, coref_id) in azps:
                zp_sent_index = senti2globalsenti[zp_sent_index]
                zp_index = get_prev_index(zp_index,
                                          wi2realwi[zp_sent_index]) + 1
                #A = ' '.join(sentences[zp_sent_index][:zp_index])
                #B = ' '.join(sentences[zp_sent_index][zp_index:])
                #print (A + ' *pro* ' + B).decode('utf-8')
                # if there are no antecedents, we consider it matched
                is_match = not len(antecedents)
                zp2ana[(zp_sent_index, zp_index)] = []
                for (candi_sent_index, candi_begin_index, candi_end_index,
                     coref_id) in antecedents:
                    candi_sent_index = senti2globalsenti[candi_sent_index]
                    #print ' '.join(sentences_ori[candi_sent_index][candi_begin_index:candi_end_index+1]).decode('utf8')
                    candi_begin_index = get_prev_index(
                        candi_begin_index, wi2realwi[candi_sent_index]) + 1
                    candi_end_index = get_prev_index(
                        candi_end_index + 1, wi2realwi[candi_sent_index])
                    #print ' '.join(sentences[candi_sent_index][candi_begin_index:candi_end_index+1]).decode('utf8')
                    #print '====='
                    # previous two sentences, or same but before zp_index
                    if zp_sent_index - 3 < candi_sent_index < zp_sent_index or \
                            (candi_sent_index == zp_sent_index and candi_end_index < zp_index):
                        is_match |= (
                            candi_begin_index,
                            candi_end_index) in noun_phrases[candi_sent_index]
                        zp2ana[(zp_sent_index, zp_index)].append(
                            (candi_sent_index, candi_begin_index,
                             candi_end_index))
                azp_in_np += is_match
                azp_total += 1.0

            for (zp_sent_index, zp_index) in zps:
                zp_sent_index = senti2globalsenti[zp_sent_index]
                zp_index = get_prev_index(zp_index,
                                          wi2realwi[zp_sent_index]) + 1
                if (zp_sent_index, zp_index) not in zp2ana:
                    zp2ana[(zp_sent_index, zp_index)] = []

            for k, v in zp2ana.items():
                zp_total += 1.0
                zp_anaph += len(v) > 0

            # store zp information
            for k, v in zp2ana.items():
                zp_sent_index, zp_index = k
                v = sorted(v)
                zp_info[zp_sent_index].append({
                    'zp_index': zp_index,
                    'ana_spans': v
                })
    print('AZP percent in NP: {}, {}, {}'.format(azp_in_np / azp_total,
                                                 azp_in_np, azp_total))
    print('Anaphora percent in ZPs: {}, {}, {} '.format(
        zp_anaph / zp_total, zp_anaph, zp_total))

    for i in range(len(sentences)):
        sentences[i] = (' '.join(sentences[i])).decode('utf-8')
        sentences_ori[i] = (' '.join(sentences_ori[i])).decode('utf-8')

    endt = timeit.default_timer()
    print >> sys.stderr
    print >> sys.stderr, "Total use %.3f seconds for Data Generating" % (
        endt - startt)
    return zp_info, sentences, noun_phrases
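The get_prev_index helper is not shown on this page. A tiny illustration of the index remapping it is used for, under the assumption that it returns the remapped index of the nearest real word before the given position (or -1 if there is none):

# Illustration only; get_prev_index itself is defined elsewhere in the project.
wl = ["我", "*pro*", "喜欢", "它"]  # ZP-annotated sentence, indices 0..3
wi2realwi = {0: 0, 2: 1, 3: 2}      # the *pro* at index 1 has no image
# A ZP at annotated index 1 lands at get_prev_index(1, wi2realwi) + 1 == 1
# in the ZP-free sentence ["我", "喜欢", "它"], i.e. right before "喜欢".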
Example #4
def generate_vector(path, files):
    read_f = open('./data/emb', "rb")
    embedding, words, wd = cPickle.load(read_f)
    read_f.close()

    paths = [w.strip() for w in open(files).readlines()]

    total_sentence_num = 0
    vectorized_sentences = []
    zp_info = []

    startt = timeit.default_timer()
    is_test = 'test' in path
    for p in paths:
        if p.strip().endswith("DS_Store"):
            continue
        file_name = p.strip()
        if file_name.endswith('onf'):
            print 'Processing', file_name
            zps, azps, candi, nodes_info = get_info_from_file(file_name)

            anaphorics = []
            ana_zps = []
            for (zp_sentence_index, zp_begin_index, zp_end_index, antecedents, coref_id, is_real) in azps:
                for (candi_sentence_index, begin_word_index, end_word_index, coref_id) in antecedents:
                    anaphorics.append(
                        (zp_sentence_index, zp_begin_index, zp_end_index, candi_sentence_index, begin_word_index,
                         end_word_index))
                    ana_zps.append((zp_sentence_index, zp_begin_index, zp_end_index, is_real))

            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                vectorize_words = list_vectorize(wl, words)
                vectorized_sentences.append(vectorize_words)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1

            for (sentence_index, zp_begin_index, zp_end_index, antecedents, coref_id, is_real) in azps:
                index_in_file = si2reali[sentence_index]
                zp = (index_in_file, sentence_index, zp_begin_index, zp_end_index)
                zp_nl, zp_wl = nodes_info[sentence_index]

                if (sentence_index, zp_begin_index, zp_end_index, is_real) not in ana_zps:
                    continue

                if is_test and is_real == 0:
                    continue

                candi_info = []
                for ci in range(max(0, sentence_index - 2), sentence_index + 1):
                    candi_sentence_index = ci
                    candi_nl, candi_wl = nodes_info[candi_sentence_index]

                    for (candi_begin, candi_end) in candi[candi_sentence_index]:
                        if ci == sentence_index and candi_end > zp_begin_index:
                            continue
                        res = 0
                        if (sentence_index, zp_begin_index, zp_end_index, candi_sentence_index, candi_begin,
                            candi_end) in anaphorics:
                            res = 1
                        candi_index_in_file = si2reali[candi_sentence_index]

                        ifl = get_fl((sentence_index, zp_begin_index, zp_end_index),
                                     (candi_sentence_index, candi_begin, candi_end),
                                     zp_wl, candi_wl, wd)

                        candidate = (
                            candi_index_in_file, candi_sentence_index, candi_begin, candi_end, res, -res, ifl)
                        candi_info.append(candidate)
                zp_info.append((zp, candi_info))

    endt = timeit.default_timer()
    print >> sys.stderr, "Total use %.3f seconds for Data Generating" % (endt - startt)
    vectorized_sentences = numpy.array(vectorized_sentences)
    return zp_info, vectorized_sentences
def generate_vector(path, files):
    read_f = open(args.data + "emb", "rb")
    _, _, wd = pickle.load(read_f, encoding='latin1')
    read_f.close()
    f = open(args.data + 'vocab_attention.json', 'r')
    words = json.load(f)
    f.close()

    tokenizer = BertTokenizer.from_pretrained(args.bert_dir + 'vocab.txt')
    orig_to_tok_maps_bert = []
    # vectorized_sentences_bert = []
    vectorized_sentences_bert_idx = []
    # mask_sentences_bert = []

    paths = [w.strip() for w in open(files).readlines()]
    #paths = utils.get_file_name(path,[])
    total_sentence_num = 0
    vectorized_sentences = []

    zp_info = []

    startt = timeit.default_timer()
    done_num = 0
    for p in paths:
        if p.strip().endswith("DS_Store"): continue
        done_num += 1
        file_name = args.data + p.strip()
        if file_name.endswith("onf"):

            if args.reduced == 1 and done_num >= 3: break

            zps, azps, candi, nodes_info = get_info_from_file(file_name, 2)
            anaphorics = []
            ana_zps = []
            for (zp_sentence_index, zp_index, antecedents, coref_id) in azps:
                for (candi_sentence_index, begin_word_index, end_word_index,
                     coref_id) in antecedents:
                    anaphorics.append(
                        (zp_sentence_index, zp_index, candi_sentence_index,
                         begin_word_index, end_word_index))
                    ana_zps.append((zp_sentence_index, zp_index))

            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                vectorize_words = list_vectorize(wl, words)
                vectorized_sentences.append(vectorize_words)
                bert_tokens = []
                orig_to_tok_map = []
                orig_tokens = [w.word for w in wl]
                # bert_tokens.append("[CLS]")
                for i, orig_token in enumerate(orig_tokens):
                    orig_to_tok_map.append(len(bert_tokens))
                    if "*pro*" in orig_token:
                        bert_tokens.extend(["[MASK]"])
                    else:
                        bert_tokens.extend(tokenizer.tokenize(orig_token))
                # bert_tokens.append("[SEP]")
                # example: orig_tokens = ['什么样', '的', '记忆', '?']
                #          bert_tokens = ['什', '么', '样', '的', '记', '忆', '?']
                #          orig_to_tok_map = [0, 3, 4, 6]
                #          indexed_tokens = [784, 720, 3416, 4638, 6381, 2554, 8043]
                orig_to_tok_maps_bert.append(orig_to_tok_map)
                indexed_tokens = tokenizer.convert_tokens_to_ids(bert_tokens)
                vectorized_sentences_bert_idx.append(indexed_tokens)

                # max_index_bert = len(indexed_tokens)
                # indexed_tokens=indexed_tokens[:min(args.max_sent_len,max_index_bert)]
                # sent_bert_mask = (len(indexed_tokens) * [1] + (args.max_sent_len - len(indexed_tokens)) * [0])
                # indexed_tokens = (indexed_tokens  + (args.max_sent_len - len(indexed_tokens)) * [0])

                # vectorized_sentences_bert.append(indexed_tokens)
                # mask_sentences_bert.append(sent_bert_mask)

                si2reali[k] = total_sentence_num
                total_sentence_num += 1

            for (sentence_index, zp_index) in zps:
                ana = 0
                if (sentence_index, zp_index) in ana_zps:
                    ana = 1
                index_in_file = si2reali[sentence_index]
                zp = (index_in_file, sentence_index, zp_index, ana)
                zp_nl, zp_wl = nodes_info[sentence_index]

                candi_info = []
                if ana == 1:
                    for ci in range(max(0, sentence_index - 2),
                                    sentence_index + 1):
                        candi_sentence_index = ci
                        candi_nl, candi_wl = nodes_info[candi_sentence_index]

                        for (candi_begin,
                             candi_end) in candi[candi_sentence_index]:
                            if ci == sentence_index and candi_end > zp_index:
                                continue
                            res = 0
                            if (sentence_index, zp_index, candi_sentence_index,
                                    candi_begin, candi_end) in anaphorics:
                                res = 1
                            candi_index_in_file = si2reali[
                                candi_sentence_index]

                            ifl = get_fl(
                                (sentence_index, zp_index),
                                (candi_sentence_index, candi_begin, candi_end),
                                zp_wl, candi_wl, wd)

                            candidate = (candi_index_in_file,
                                         candi_sentence_index, candi_begin,
                                         candi_end, res, -res, ifl)
                            candi_info.append(candidate)
                zp_info.append((zp, candi_info))

    endt = timeit.default_timer()
    print(file=sys.stderr)
    print("Total use %.3f seconds for Data Generating" % (endt - startt),
          file=sys.stderr)
    vectorized_sentences = numpy.array(vectorized_sentences)
    # vectorized_sentences_bert = numpy.array(vectorized_sentences_bert)
    vectorized_sentences_bert_idx = numpy.array(vectorized_sentences_bert_idx)
    # mask_sentences_bert = numpy.array(mask_sentences_bert)
    orig_to_tok_maps_bert = numpy.array(orig_to_tok_maps_bert)
    # return zp_info,vectorized_sentences,vectorized_sentences_bert,orig_to_tok_maps_bert,mask_sentences_bert,vectorized_sentences_bert_idx
    return zp_info, vectorized_sentences, orig_to_tok_maps_bert, vectorized_sentences_bert_idx
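A self-contained sketch of the word-to-wordpiece alignment that orig_to_tok_map encodes above; the transformers package and the bert-base-chinese checkpoint are assumptions, since the original only shows a local vocab.txt path.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # assumed checkpoint
orig_tokens = ["什么样", "的", "记忆", "?"]
bert_tokens, orig_to_tok_map = [], []
for token in orig_tokens:
    orig_to_tok_map.append(len(bert_tokens))  # wordpiece index of this word's first piece
    bert_tokens.extend(["[MASK]"] if "*pro*" in token else tokenizer.tokenize(token))
# orig_to_tok_map == [0, 3, 4, 6]; bert_tokens == ['什', '么', '样', '的', '记', '忆', '?']
indexed_tokens = tokenizer.convert_tokens_to_ids(bert_tokens)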
Example #6
def preprocess(file_path, wd, mode='train'):
    """
    param file_path: 存储训练/测试文档地址的文件
    param wd: 特征字典
    param mode: 训练/测试模式
    """
    paths = [
        line.strip() for line in open(file_path, encoding='utf-8').readlines()
    ]

    total_sentence_num = 0
    all_words = []
    zps_info = []

    is_test = 'test' in mode

    for path in paths:
        file_name = path.strip()
        if file_name.endswith('onf'):
            print('Processing', file_name)

            zps, azps, cands, nodes_info = get_info_from_file(file_name)

            anaphorics = []
            ana_zps = []

            for (zp_sent_idx, zp_begin_idx, zp_end_idx, antecedents, coref_id,
                 is_real) in azps:
                for (cand_sent_idx, cand_begin_idx, cand_end_idx,
                     coref_id) in antecedents:
                    item_1 = (zp_sent_idx, zp_begin_idx, zp_end_idx,
                              cand_sent_idx, cand_begin_idx, cand_end_idx)
                    anaphorics.append(item_1)
                    item_2 = (zp_sent_idx, zp_begin_idx, zp_end_idx, is_real)
                    ana_zps.append(item_2)

            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                words = get_words(wl)
                all_words.append(words)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1

            for (zp_sent_idx, zp_begin_idx, zp_end_idx, antecedents, coref_id,
                 is_real) in azps:
                real_zp_sent_idx = si2reali[zp_sent_idx]
                zp = (real_zp_sent_idx, zp_sent_idx, zp_begin_idx, zp_end_idx)
                zp_nl, zp_wl = nodes_info[zp_sent_idx]

                if (zp_sent_idx, zp_begin_idx, zp_end_idx,
                        is_real) not in ana_zps:
                    continue

                if is_test and is_real == 0:
                    continue

                cands_info = []

                for cand_sent_idx in range(max(0, zp_sent_idx - 2),
                                           zp_sent_idx + 1):
                    cand_nl, cand_wl = nodes_info[cand_sent_idx]
                    for (cand_begin_idx, cand_end_idx) in cands[cand_sent_idx]:
                        if cand_sent_idx == zp_sent_idx and cand_end_idx > zp_begin_idx:
                            continue

                        res = 0
                        if (zp_sent_idx, zp_begin_idx, zp_end_idx,
                                cand_sent_idx, cand_begin_idx,
                                cand_end_idx) in anaphorics:
                            res = 1
                        real_cand_sent_idx = si2reali[cand_sent_idx]

                        ifl = get_fl(
                            (zp_sent_idx, zp_begin_idx, zp_end_idx),
                            (cand_sent_idx, cand_begin_idx, cand_end_idx),
                            zp_wl, cand_wl, wd)

                        cand = (real_cand_sent_idx, cand_sent_idx,
                                cand_begin_idx, cand_end_idx, res, -res, ifl)

                        cands_info.append(cand)

                zps_info.append((zp, cands_info))

    return zps_info, all_words
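A minimal usage sketch; the file name and the feature dictionary wd are placeholders, and get_info_from_file, get_words and get_fl are assumed to come from the surrounding project.

# Hypothetical call: 'test_files.txt' would list .onf document paths, one per line.
wd = {}  # placeholder -- the real feature dictionary is built elsewhere
zps_info, all_words = preprocess("test_files.txt", wd, mode="test")
for zp, cands_info in zps_info:
    real_zp_sent_idx, zp_sent_idx, zp_begin_idx, zp_end_idx = zp
    # each candidate: (real_cand_sent_idx, cand_sent_idx, begin, end, res, -res, ifl)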