def generate_doc_data(path, files):
    paths = [w.strip() for w in open(files).readlines()]
    docs = []
    done_num = 0
    for p in paths:
        if p.strip().endswith("DS_Store"):
            continue
        done_num += 1
        file_name = p.strip()
        if file_name.endswith("onf"):
            if args.reduced == 1 and done_num >= 30:
                break
            doc = get_info_from_file(file_name, 2)
            docs.append(doc)
    return docs
def generate_vector(path, files):
    """Vectorize every sentence of the listed *.onf files with the pre-trained
    embedding vocabulary and build one (zp, candidate list) instance per ZP."""
    read_f = file("./data/emb", "rb")
    embedding, words, wd = cPickle.load(read_f)
    read_f.close()
    paths = [w.strip() for w in open(files).readlines()]
    #paths = utils.get_file_name(path,[])
    total_sentence_num = 0
    vectorized_sentences = []
    zp_info = []
    startt = timeit.default_timer()
    done_num = 0
    for p in paths:
        if p.strip().endswith("DS_Store"):
            continue
        done_num += 1
        file_name = p.strip()
        if file_name.endswith("onf"):
            if args.reduced == 1 and done_num >= 3:
                break
            zps, azps, candi, nodes_info = get_info_from_file(file_name, 2)
            anaphorics = []
            ana_zps = []
            for (zp_sentence_index, zp_index, antecedents, coref_id) in azps:
                for (candi_sentence_index, begin_word_index, end_word_index, coref_id) in antecedents:
                    anaphorics.append((zp_sentence_index, zp_index, candi_sentence_index,
                                       begin_word_index, end_word_index))
                    ana_zps.append((zp_sentence_index, zp_index))
            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                vectorize_words = list_vectorize(wl, words)
                vectorized_sentences.append(vectorize_words)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1
            for (sentence_index, zp_index) in zps:
                ana = 0
                if (sentence_index, zp_index) in ana_zps:
                    ana = 1
                index_in_file = si2reali[sentence_index]
                zp = (index_in_file, sentence_index, zp_index, ana)
                zp_nl, zp_wl = nodes_info[sentence_index]
                candi_info = []
                if ana == 1:
                    for ci in range(max(0, sentence_index - 2), sentence_index + 1):
                        candi_sentence_index = ci
                        candi_nl, candi_wl = nodes_info[candi_sentence_index]
                        for (candi_begin, candi_end) in candi[candi_sentence_index]:
                            if ci == sentence_index and candi_end > zp_index:
                                continue
                            res = 0
                            if (sentence_index, zp_index, candi_sentence_index,
                                    candi_begin, candi_end) in anaphorics:
                                res = 1
                            candi_index_in_file = si2reali[candi_sentence_index]
                            ifl = get_fl((sentence_index, zp_index),
                                         (candi_sentence_index, candi_begin, candi_end),
                                         zp_wl, candi_wl, wd)
                            candidate = (candi_index_in_file, candi_sentence_index,
                                         candi_begin, candi_end, res, -res, ifl)
                            candi_info.append(candidate)
                zp_info.append((zp, candi_info))
    endt = timeit.default_timer()
    print >> sys.stderr
    print >> sys.stderr, "Total use %.3f seconds for Data Generating" % (endt - startt)
    vectorized_sentences = numpy.array(vectorized_sentences)
    return zp_info, vectorized_sentences
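# Illustrative sketch (not part of the original pipeline; the helper name is an
# assumption): each element of zp_info above is a (zp, candi_info) pair, where
# zp = (index_in_file, sentence_index, zp_index, ana) and each candidate is
# (candi_index_in_file, candi_sentence_index, candi_begin, candi_end, res, -res, ifl).
# A minimal consumer that counts anaphoric ZPs and their gold candidates could be:
def count_anaphoric_zps(zp_info):
    n_azp, n_gold = 0, 0
    for (zp, candi_info) in zp_info:
        if zp[3] == 1:  # ana flag
            n_azp += 1
            n_gold += sum(1 for candidate in candi_info if candidate[4] == 1)  # res flag
    return n_azp, n_gold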
def generate_data(files):
    """Read the *.onf files listed in `files`, strip zero pronouns from the
    sentences, remap word indices accordingly, and collect NP spans plus
    ZP/antecedent information; also report how often gold antecedents fall
    inside an extracted NP."""
    paths = [w.strip() for w in open(files).readlines()]
    total_sentence_num = 0
    sentences = []
    sentences_ori = []
    noun_phrases = []
    zp_info = defaultdict(list)
    azp_in_np, azp_total = 0.0, 0.0
    zp_anaph, zp_total = 0.0, 0.0
    startt = timeit.default_timer()
    for p in paths:
        if p.strip().endswith("DS_Store"):
            continue
        file_name = p.strip()
        if file_name.endswith("onf"):
            #file_name += "_autotree"
            zps, azps, nps, nodes = get_info_from_file(file_name, 2)

            # generate mappings, store sentences
            senti2globalsenti = {}  # sentence id mapping from local file to global
            wi2realwi = {}  # for each k, word id mapping from with ZP to without ZP
            for k in nodes.keys():
                senti2globalsenti[k] = total_sentence_num
                total_sentence_num += 1
                nl, wl = nodes[k]
                wi2realwi[total_sentence_num - 1] = {}
                realwl = []
                i2 = 0
                for i1, w in enumerate(wl):
                    w = w.word
                    if is_zp(w) == False:
                        wi2realwi[total_sentence_num - 1][i1] = i2
                        i2 += 1
                        realwl.append(w)
                sentences.append(realwl)
                sentences_ori.append([w.word for w in wl])

            # generate NP information
            for k in nps.keys():
                nps_new = []
                cur_sentence_num = senti2globalsenti[k]
                # A B *pro* [ *OP* C D *pro* ] *OP* E F
                for (st_index, ed_index) in nps[k]:
                    #print ' '.join(sentences_ori[cur_sentence_num][st_index:ed_index+1]).decode('utf-8')
                    st = get_prev_index(st_index, wi2realwi[cur_sentence_num]) + 1
                    ed = get_prev_index(ed_index + 1, wi2realwi[cur_sentence_num])
                    #print ' '.join(sentences[cur_sentence_num][st:ed+1]).decode('utf-8')
                    #print '====='
                    nps_new.append((st, ed))
                noun_phrases.append(nps_new)

            # generate zp information
            zp2ana = {}  # (zp-sent, zp) ==> list of (candi-sent, candi-begin, candi-end)
            for (zp_sent_index, zp_index, antecedents, coref_id) in azps:
                zp_sent_index = senti2globalsenti[zp_sent_index]
                zp_index = get_prev_index(zp_index, wi2realwi[zp_sent_index]) + 1
                #A = ' '.join(sentences[zp_sent_index][:zp_index])
                #B = ' '.join(sentences[zp_sent_index][zp_index:])
                #print (A + ' *pro* ' + B).decode('utf-8')
                is_match = not len(antecedents)  # if no antecedents, then we consider it matched
                zp2ana[(zp_sent_index, zp_index)] = []
                for (candi_sent_index, candi_begin_index, candi_end_index, coref_id) in antecedents:
                    candi_sent_index = senti2globalsenti[candi_sent_index]
                    #print ' '.join(sentences_ori[candi_sent_index][candi_begin_index:candi_end_index+1]).decode('utf8')
                    candi_begin_index = get_prev_index(candi_begin_index, wi2realwi[candi_sent_index]) + 1
                    candi_end_index = get_prev_index(candi_end_index + 1, wi2realwi[candi_sent_index])
                    #print ' '.join(sentences[candi_sent_index][candi_begin_index:candi_end_index+1]).decode('utf8')
                    #print '====='
                    # previous two sentences, or same sentence but before zp_index
                    if zp_sent_index - 3 < candi_sent_index < zp_sent_index or \
                            (candi_sent_index == zp_sent_index and candi_end_index < zp_index):
                        is_match |= (candi_begin_index, candi_end_index) in noun_phrases[candi_sent_index]
                        zp2ana[(zp_sent_index, zp_index)].append(
                            (candi_sent_index, candi_begin_index, candi_end_index))
                azp_in_np += is_match
                azp_total += 1.0
            for (zp_sent_index, zp_index) in zps:
                zp_sent_index = senti2globalsenti[zp_sent_index]
                zp_index = get_prev_index(zp_index, wi2realwi[zp_sent_index]) + 1
                if (zp_sent_index, zp_index) not in zp2ana:
                    zp2ana[(zp_sent_index, zp_index)] = []
            for k, v in zp2ana.items():
                zp_total += 1.0
                zp_anaph += len(v) > 0

            # store zp information
            for k, v in zp2ana.items():
                zp_sent_index, zp_index = k
                v = sorted(v)
                zp_info[zp_sent_index].append({'zp_index': zp_index, 'ana_spans': v})

    print('AZP percent in NP: {}, {}, {}'.format(azp_in_np / azp_total, azp_in_np, azp_total))
    print('Anaphora percent in ZPs: {}, {}, {}'.format(zp_anaph / zp_total, zp_anaph, zp_total))
    for i in range(len(sentences)):
        sentences[i] = (' '.join(sentences[i])).decode('utf-8')
        sentences_ori[i] = (' '.join(sentences_ori[i])).decode('utf-8')
    endt = timeit.default_timer()
    print >> sys.stderr
    print >> sys.stderr, "Total use %.3f seconds for Data Generating" % (endt - startt)
    return zp_info, sentences, noun_phrases
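# Illustrative sketch (an assumption, not original code): generate_data returns
# zp_info as a defaultdict mapping a global sentence index to ZP records of the
# form {'zp_index': int, 'ana_spans': [(sent_index, begin, end), ...]}, with
# word indices mapped onto the ZP-free sentences. A minimal consumer:
def iter_anaphoric_zps(zp_info):
    """Yield (sentence_index, zp_index, ana_spans) for ZPs that have antecedent spans."""
    for sent_index, records in zp_info.items():
        for record in records:
            if record['ana_spans']:
                yield sent_index, record['zp_index'], record['ana_spans']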
def generate_vector(path, files):
    """Variant of generate_vector for ZPs given as (begin, end) spans: only
    anaphoric ZPs (and, at test time, only real ones) yield instances."""
    read_f = file('./data/emb', "rb")
    embedding, words, wd = cPickle.load(read_f)
    read_f.close()
    paths = [w.strip() for w in open(files).readlines()]
    total_sentence_num = 0
    vectorized_sentences = []
    zp_info = []
    startt = timeit.default_timer()
    is_test = True if 'test' in path else False
    for p in paths:
        if p.strip().endswith("DS_Store"):
            continue
        file_name = p.strip()
        if file_name.endswith('onf'):
            print 'Processing', file_name
            zps, azps, candi, nodes_info = get_info_from_file(file_name)
            anaphorics = []
            ana_zps = []
            for (zp_sentence_index, zp_begin_index, zp_end_index, antecedents, coref_id, is_real) in azps:
                for (candi_sentence_index, begin_word_index, end_word_index, coref_id) in antecedents:
                    anaphorics.append((zp_sentence_index, zp_begin_index, zp_end_index,
                                       candi_sentence_index, begin_word_index, end_word_index))
                    ana_zps.append((zp_sentence_index, zp_begin_index, zp_end_index, is_real))
            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                vectorize_words = list_vectorize(wl, words)
                vectorized_sentences.append(vectorize_words)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1
            for (sentence_index, zp_begin_index, zp_end_index, antecedents, coref_id, is_real) in azps:
                index_in_file = si2reali[sentence_index]
                zp = (index_in_file, sentence_index, zp_begin_index, zp_end_index)
                zp_nl, zp_wl = nodes_info[sentence_index]
                if (sentence_index, zp_begin_index, zp_end_index, is_real) not in ana_zps:
                    continue
                if is_test and is_real == 0:
                    continue
                candi_info = []
                for ci in range(max(0, sentence_index - 2), sentence_index + 1):
                    candi_sentence_index = ci
                    candi_nl, candi_wl = nodes_info[candi_sentence_index]
                    for (candi_begin, candi_end) in candi[candi_sentence_index]:
                        if ci == sentence_index and candi_end > zp_begin_index:
                            continue
                        res = 0
                        if (sentence_index, zp_begin_index, zp_end_index,
                                candi_sentence_index, candi_begin, candi_end) in anaphorics:
                            res = 1
                        candi_index_in_file = si2reali[candi_sentence_index]
                        ifl = get_fl((sentence_index, zp_begin_index, zp_end_index),
                                     (candi_sentence_index, candi_begin, candi_end),
                                     zp_wl, candi_wl, wd)
                        candidate = (candi_index_in_file, candi_sentence_index,
                                     candi_begin, candi_end, res, -res, ifl)
                        candi_info.append(candidate)
                zp_info.append((zp, candi_info))
    endt = timeit.default_timer()
    print >> sys.stderr, "Total use %.3f seconds for Data Generating" % (endt - startt)
    vectorized_sentences = numpy.array(vectorized_sentences)
    return zp_info, vectorized_sentences
def generate_vector(path, files):
    """Like generate_vector above, but additionally produce BERT wordpiece ids
    and an original-word-to-wordpiece alignment map per sentence; zero pronouns
    (*pro*) are tokenized as [MASK]."""
    read_f = open(args.data + "emb", "rb")
    _, _, wd = pickle.load(read_f, encoding='latin1')
    read_f.close()
    f = open(args.data + 'vocab_attention.json', 'r')
    words = json.load(f)
    f.close()
    tokenizer = BertTokenizer.from_pretrained(args.bert_dir + 'vocab.txt')
    orig_to_tok_maps_bert = []
    # vectorized_sentences_bert = []
    vectorized_sentences_bert_idx = []
    # mask_sentences_bert = []
    paths = [w.strip() for w in open(files).readlines()]
    #paths = utils.get_file_name(path,[])
    total_sentence_num = 0
    vectorized_sentences = []
    zp_info = []
    startt = timeit.default_timer()
    done_num = 0
    for p in paths:
        if p.strip().endswith("DS_Store"):
            continue
        done_num += 1
        file_name = args.data + p.strip()
        if file_name.endswith("onf"):
            if args.reduced == 1 and done_num >= 3:
                break
            zps, azps, candi, nodes_info = get_info_from_file(file_name, 2)
            anaphorics = []
            ana_zps = []
            for (zp_sentence_index, zp_index, antecedents, coref_id) in azps:
                for (candi_sentence_index, begin_word_index, end_word_index, coref_id) in antecedents:
                    anaphorics.append((zp_sentence_index, zp_index, candi_sentence_index,
                                       begin_word_index, end_word_index))
                    ana_zps.append((zp_sentence_index, zp_index))
            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                vectorize_words = list_vectorize(wl, words)
                vectorized_sentences.append(vectorize_words)

                bert_tokens = []
                orig_to_tok_map = []
                orig_tokens = [w.word for w in wl]
                # bert_tokens.append("[CLS]")
                for i, orig_token in enumerate(orig_tokens):
                    orig_to_tok_map.append(len(bert_tokens))
                    if "*pro*" in orig_token:
                        bert_tokens.extend(["[MASK]"])
                    else:
                        bert_tokens.extend(tokenizer.tokenize(orig_token))
                # bert_tokens.append("[SEP]")
                # orig_tokens=['什么样', '的', '记忆', '?']
                orig_to_tok_maps_bert.append(orig_to_tok_map)  # orig_to_tok_map=[0,3,4,6]
                indexed_tokens = tokenizer.convert_tokens_to_ids(bert_tokens)  # bert_tokens=['什', '么', '样', '的', '记', '忆', '?']
                vectorized_sentences_bert_idx.append(indexed_tokens)  # indexed_tokens=[784, 720, 3416, 4638, 6381, 2554, 8043]
                # max_index_bert = len(indexed_tokens)
                # indexed_tokens = indexed_tokens[:min(args.max_sent_len, max_index_bert)]
                # sent_bert_mask = (len(indexed_tokens) * [1] + (args.max_sent_len - len(indexed_tokens)) * [0])
                # indexed_tokens = (indexed_tokens + (args.max_sent_len - len(indexed_tokens)) * [0])
                # vectorized_sentences_bert.append(indexed_tokens)
                # mask_sentences_bert.append(sent_bert_mask)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1
            for (sentence_index, zp_index) in zps:
                ana = 0
                if (sentence_index, zp_index) in ana_zps:
                    ana = 1
                index_in_file = si2reali[sentence_index]
                zp = (index_in_file, sentence_index, zp_index, ana)
                zp_nl, zp_wl = nodes_info[sentence_index]
                candi_info = []
                if ana == 1:
                    for ci in range(max(0, sentence_index - 2), sentence_index + 1):
                        candi_sentence_index = ci
                        candi_nl, candi_wl = nodes_info[candi_sentence_index]
                        for (candi_begin, candi_end) in candi[candi_sentence_index]:
                            if ci == sentence_index and candi_end > zp_index:
                                continue
                            res = 0
                            if (sentence_index, zp_index, candi_sentence_index,
                                    candi_begin, candi_end) in anaphorics:
                                res = 1
                            candi_index_in_file = si2reali[candi_sentence_index]
                            ifl = get_fl((sentence_index, zp_index),
                                         (candi_sentence_index, candi_begin, candi_end),
                                         zp_wl, candi_wl, wd)
                            candidate = (candi_index_in_file, candi_sentence_index,
                                         candi_begin, candi_end, res, -res, ifl)
                            candi_info.append(candidate)
                zp_info.append((zp, candi_info))
    endt = timeit.default_timer()
    print(file=sys.stderr)
    print("Total use %.3f seconds for Data Generating" % (endt - startt), file=sys.stderr)
    vectorized_sentences = numpy.array(vectorized_sentences)
    # vectorized_sentences_bert = numpy.array(vectorized_sentences_bert)
    vectorized_sentences_bert_idx = numpy.array(vectorized_sentences_bert_idx)
    # mask_sentences_bert = numpy.array(mask_sentences_bert)
    orig_to_tok_maps_bert = numpy.array(orig_to_tok_maps_bert)
    # return zp_info, vectorized_sentences, vectorized_sentences_bert, orig_to_tok_maps_bert, mask_sentences_bert, vectorized_sentences_bert_idx
    return zp_info, vectorized_sentences, orig_to_tok_maps_bert, vectorized_sentences_bert_idx
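# Illustrative sketch (the helper is an assumption): orig_to_tok_map built above
# records, for each original word, the index of its first wordpiece in the
# BERT-tokenized sentence (e.g. ['什么样', '的', '记忆', '?'] -> [0, 3, 4, 6]).
# A span over original words (begin, end) can therefore be projected onto
# wordpiece indices like this:
def project_span_to_bert(orig_to_tok_map, bert_len, begin, end):
    """Map an inclusive original-word span to an inclusive wordpiece span."""
    bert_begin = orig_to_tok_map[begin]
    # the span ends right before the next word's first wordpiece (or at the sentence end)
    bert_end = (orig_to_tok_map[end + 1] - 1) if end + 1 < len(orig_to_tok_map) else bert_len - 1
    return bert_begin, bert_end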
def preprocess(file_path, wd, mode='train'):
    """
    param file_path: file listing the paths of the training/test documents
    param wd: feature dictionary
    param mode: 'train' or 'test' mode
    """
    paths = [line.strip() for line in open(file_path, encoding='utf-8').readlines()]
    total_sentence_num = 0
    all_words = []
    zps_info = []
    is_test = True if 'test' in mode else False
    for path in paths:
        file_name = path.strip()
        if file_name.endswith('onf'):
            print('Processing', file_name)
            zps, azps, cands, nodes_info = get_info_from_file(file_name)
            anaphorics = []
            ana_zps = []
            for (zp_sent_idx, zp_begin_idx, zp_end_idx, antecedents, coref_id, is_real) in azps:
                for (cand_sent_idx, cand_begin_idx, cand_end_idx, coref_id) in antecedents:
                    item_1 = (zp_sent_idx, zp_begin_idx, zp_end_idx,
                              cand_sent_idx, cand_begin_idx, cand_end_idx)
                    anaphorics.append(item_1)
                    item_2 = (zp_sent_idx, zp_begin_idx, zp_end_idx, is_real)
                    ana_zps.append(item_2)
            si2reali = {}
            for k in nodes_info:
                nl, wl = nodes_info[k]
                words = get_words(wl)
                all_words.append(words)
                si2reali[k] = total_sentence_num
                total_sentence_num += 1
            for (zp_sent_idx, zp_begin_idx, zp_end_idx, antecedents, coref_id, is_real) in azps:
                real_zp_sent_idx = si2reali[zp_sent_idx]
                zp = (real_zp_sent_idx, zp_sent_idx, zp_begin_idx, zp_end_idx)
                zp_nl, zp_wl = nodes_info[zp_sent_idx]
                if (zp_sent_idx, zp_begin_idx, zp_end_idx, is_real) not in ana_zps:
                    continue
                if is_test and is_real == 0:
                    continue
                cands_info = []
                for cand_sent_idx in range(max(0, zp_sent_idx - 2), zp_sent_idx + 1):
                    cand_nl, cand_wl = nodes_info[cand_sent_idx]
                    for (cand_begin_idx, cand_end_idx) in cands[cand_sent_idx]:
                        if cand_sent_idx == zp_sent_idx and cand_end_idx > zp_begin_idx:
                            continue
                        res = 0
                        if (zp_sent_idx, zp_begin_idx, zp_end_idx,
                                cand_sent_idx, cand_begin_idx, cand_end_idx) in anaphorics:
                            res = 1
                        real_cand_sent_idx = si2reali[cand_sent_idx]
                        ifl = get_fl((zp_sent_idx, zp_begin_idx, zp_end_idx),
                                     (cand_sent_idx, cand_begin_idx, cand_end_idx),
                                     zp_wl, cand_wl, wd)
                        cand = (real_cand_sent_idx, cand_sent_idx, cand_begin_idx,
                                cand_end_idx, res, -res, ifl)
                        cands_info.append(cand)
                zps_info.append((zp, cands_info))
    return zps_info, all_words
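# Hypothetical usage sketch (the file name 'test_list' is an assumption; the
# embedding pickle path and layout follow the other loaders in this code): load
# the feature dictionary wd, then preprocess a list of test documents.
if __name__ == '__main__':
    import pickle
    with open('./data/emb', 'rb') as read_f:
        _, _, wd = pickle.load(read_f, encoding='latin1')
    zps_info, all_words = preprocess('./data/test_list', wd, mode='test')
    print('Loaded {} ZP instances from {} sentences'.format(len(zps_info), len(all_words)))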