def get_disambi_title(self, infile):
    disambi_title = {}
    for line in LoadFile.readline(infile):
        words = line.strip().split("\",\"")
        title_tmp = Clean.clean_word(words[1], clean_level="title")
        disambi_tmp = Clean.clean_word(words[0], clean_level="disambi")
        # title_tmp = title_tmp.strip().strip("\"")
        disambi_title[disambi_tmp] = title_tmp
    return disambi_title
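# LoadFile is not defined in this snapshot. A minimal sketch of the assumed
# interface (hypothetical, for reference only): readline() yields one decoded
# line at a time, so large CSV dumps are never read into memory at once.
class LoadFile:
    @staticmethod
    def readline(infile):
        with open(infile, "r", encoding="utf-8") as inf:
            for line in inf:
                yield line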
def main(): with open("./410_baidu/410_disambi_infobox.csv", 'r', encoding='UTF-8') as inf: lines = inf.readlines() f = open("./410_baidu/410_disambi_infobox_out.csv", "w", encoding='utf-8') list_attr = [] title_list = get_word_list("./410_baidu/410_title.csv") err_count = 0 counts = {} for line in tqdm(lines): words = line.strip().split(",") disambi = Clean.clean_word(words[0], clean_level='disambi') infobox = ",".join(words[1:]) try: info_dict = json.loads(json.loads(infobox)) for attr in info_dict.keys(): clean_attr = Clean.clean_word(attr) info_dict[clean_attr] = info_dict.pop(attr) value = info_dict[clean_attr] clean_attr = clean_attr counts[clean_attr] = counts.setdefault(clean_attr, 0) + 1 list_attr.append(clean_attr) value_split = re.split(u"[,。、,/]", value.strip()) for v in value_split: v = Clean.clean_word(v).strip(u"等").strip(u"收起") title_list.append(v) f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + v + "\"" + "\r\n") except Exception as e: print(e) err_count += 1 title_list = [t.strip(u"\\") for t in title_list] title_list = list(set(title_list)) list_attr = list(set(list_attr)) sort_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True) with open("./sort_counts.txt", "w", encoding='utf-8') as ouf: for i in sort_counts: ouf.write(str(i) + "\n") with open("./all_attr.txt", "w", encoding='utf-8') as ouf: for word_counts in sort_counts: if word_counts[1] >= 10: ouf.write(str(word_counts[0]) + "\n") with open("./410_baidu/410_title_new.csv", "w", encoding='utf-8') as ouf: for i in title_list: ouf.write("\"" + i + "\"\r\n") with open("./410_baidu/all_attr.txt", "w", encoding='utf-8') as ouf: for i in list_attr: ouf.write(i + "\n") print("err_count: ", err_count)
def clean_title_disambi(infile="title_disambi.csv", outfile="title_disambi_out.csv"):
    with open(infile, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
    err_counts = 0
    with open(outfile, "w", encoding='utf-8') as ouf:
        for line in tqdm(lines):
            words = line.strip().split("\",\"")
            if len(words) != 2:
                err_counts += 1
                continue
            title = Clean.clean_word(words[0], clean_level='title')
            disambi = Clean.clean_word(words[1], clean_level='disambi')
            ouf.write("\"" + title + "\",\"" + disambi + "\"\r\n")
    print("err_counts for title_disambi: ", err_counts)
def clean_disambi_redirect(infile="source", outfile="target"):
    with open(infile, encoding='utf-8') as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w", encoding='utf-8') as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                redirect = Clean.clean_word(line[1], clean_level='redirect')
                ouf.write("\"" + disambi + "\",\"" + redirect + "\"\n")
    print("err_counts for disambi_redirect:%d" % (err_counts))
def get_word_list(filename):
    with open(filename, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
    lines = [
        Clean.clean_word(line, clean_level='title') for line in lines
    ]
    return lines
def get_title(infile):
    all_title = set([])
    for line in LoadFile.readline(infile):
        title_tmp = Clean.clean_word(line.strip(), clean_level="title")
        title_tmp = title_tmp.strip().strip("\"")
        if title_tmp == "":
            continue
        all_title.add(title_tmp)
    return all_title
def clean_disambi_subject(infile="disambi_subject.csv", outfile="disambi_subject_out.csv"):
    with open(infile, encoding='utf-8') as inf:
        lines = inf.readlines()
    err_counts = 0
    with open(outfile, "w", encoding='utf-8') as ouf:
        for line in tqdm(lines):
            words = line.strip().split("\",\"")
            if len(words) != 2:
                err_counts += 1
                continue
            disambi = Clean.clean_word(words[0], clean_level='disambi')
            subject = Clean.clean_word(words[1], clean_level='subject')
            ouf.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
    print("err_counts for disambi_subject: ", err_counts)
def clean_disambi_literal(infile="source", outfile="target"):
    with open(infile, encoding='utf-8') as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w", encoding='utf-8') as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                literal = Clean.clean_word(line[1], clean_level='literal')
                if literal != '' and disambi != '':
                    # Special case: the lemma 快乐[朱槿品种] ("Joy", a hibiscus
                    # cultivar) loses its bracket suffix during cleaning, so
                    # restore it by hand.
                    if '[朱槿品种]' in disambi:
                        literal = '快乐'
                        disambi = '快乐[[朱槿品种]]'
                    # Escape embedded quotes CSV-style and drop stray backslashes.
                    if '"' in literal:
                        literal = literal.replace('"', '""')
                    if '\\' in literal:
                        literal = literal.replace('\\', '')
                    if '"' in disambi:
                        disambi = disambi.replace('"', '""')
                    ouf.write("\"" + disambi + "\",\"" + literal + "\"\n")
    print("err_counts for disambi_literal:%d" % (err_counts))
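# The manual '"' -> '""' doubling above re-implements CSV quoting by hand.
# A sketch of the same behaviour using the standard library's csv.writer
# (an alternative approach, not the original author's code): QUOTE_ALL wraps
# every field in quotes and doubles embedded quotes automatically.
import csv

def write_pairs(outfile, pairs):
    # pairs: iterable of (disambi, literal) tuples
    with open(outfile, "w", encoding="utf-8", newline="") as ouf:
        writer = csv.writer(ouf, quoting=csv.QUOTE_ALL)
        writer.writerows(pairs)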
def main(): with open("source/disambi.csv") as in_f_disambi, open("source/infobox.csv", "r") as in_f_infobox,\ open('source/literal.csv') as in_f_literal, open("target/disambi_infobox.csv", "w") as out_f: literal_list = get_word_list(in_f_literal) disambi_reader = csv.reader(in_f_disambi) info_lines = in_f_infobox.readlines() list_attr = [] list_value = [] err_count = 0 attr_counts = {} for (disambi, infobox) in tqdm(zip(disambi_reader, info_lines)): disambi = Clean.clean_word(disambi[0], clean_level='disambi') if '"' in disambi: disambi = disambi.replace('"', '""') if infobox != '{}': try: #print(json.loads(infobox)) info_dict = json.loads(json.loads(infobox).replace("\\", r"\\")) clean_info_dict = {} for attr in info_dict.keys(): clean_attr = Clean.clean_word(attr, clean_level='others') if clean_attr not in clean_info_dict.keys(): clean_info_dict[clean_attr] = info_dict[attr] for clean_attr in clean_info_dict.keys(): value = str(','.join(clean_info_dict[clean_attr])) if clean_info_dict[clean_attr] != [] else None if value: #value = value.replace('\"','').replace("\\",'').replace('"','""') value = value.replace('"','""') attr_counts[clean_attr] = attr_counts.setdefault(clean_attr, 0) + 1 # Collect Attr. Frequency list_attr.append(clean_attr) ####### #literal_list.append(value) list_value.append(value) out_f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + value + "\"" + "\n") ####### #value_split = re.split(u"[,。、,/]", value.strip()) #for v in value_split: #v = Clean.clean_word(v).strip(u"等").strip(u"收起") #v = v.strip(u"等").strip(u"收起") #if len(v) > 0: #literal_list.append(v) #list_value.append(v) #out_f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + v + "\"" + "\r\n") except Exception as e: print(f'Error:{e},Disambi:{disambi},Infobox:{infobox}') err_count += 1 #break literal_list = [t.replace('\"','').replace("\\",'').replace('"','""') for t in literal_list] literal_list = list(set(literal_list)) list_attr = list(set(list_attr)) list_value = list(set(list_value)) sort_counts = sorted(attr_counts.items(),key = lambda x:x[1],reverse = True) with open("target/sorted_all_attr.txt", "w") as ouf: for i in sort_counts: ouf.write(str(i) + "\n") with open("target/sorted_filerted_attr.txt", "w") as ouf: for word_counts in sort_counts: if word_counts[1] >= 10: ouf.write(str(word_counts[0]) + "\n") with open("target/literal.csv", "w") as ouf: for i in literal_list: ouf.write("\"" + i + "\"\n") with open("target/attr.txt", "w") as ouf: for i in list_attr: ouf.write(i + "\n") with open("target/value.csv", "w") as ouf: for i in list_value: ouf.write("\"" + i + "\"\n") print("err_count: ", err_count)
def get_word_list(in_f):
    reader = csv.reader(in_f)
    lines = [Clean.clean_word(line[0], clean_level='literal') for line in reader if line]
    # Filter out all empty strings; list.remove('') would only strip the first
    # one and raises ValueError when none is present.
    return [l for l in lines if l != '']
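# Clean.clean_word comes from the repo's clean module, which is not shown in
# this snapshot. A minimal sketch of the assumed interface (hypothetical):
# it strips wrapping quotes and whitespace, and clean_level selects extra
# rules for titles, disambi names, subjects, topics, and so on.
class Clean:
    @staticmethod
    def clean_word(word, clean_level='title'):
        word = word.strip().strip('"').strip()
        if clean_level == 'disambi':
            # e.g. collapse internal whitespace in disambiguation names
            word = ''.join(word.split())
        return word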
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv", "r", encoding='utf-8') as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        # Malformed rows are only counted, not skipped.
        if len(words) != 4:
            count += 1
        clean_disambi = Clean.clean_word(words[0], 'disambi')
        title_dict[clean_disambi] = words[1:]
    print("Error lines: ", count)

with open("./410_baidu/410_disambi_new.csv", "w", encoding='utf-8') as ouf:
    for i in title_dict.keys():
        ouf.write("\"" + i + "\",\"" + "\",\"".join(title_dict[i]) + "\r\n")
'''
Clean only the disambi name; leave the other columns unchanged.
'''
import re
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_attrs.csv") as inf:
    title_dict = {}
    err_count = 0
    reader = csv.reader(inf)
    for line in tqdm(reader):
        curLink = line[-2]
        exterLink = line[-1]
        clean_disambi = Clean.clean_word(line[0], 'disambi')
        if '"' in clean_disambi:
            clean_disambi = clean_disambi.replace('"', '""')
        if curLink == 'http://www.baike.com/wiki/%22':
            clean_disambi = '""[标点符号]'
        if len(line) < 5:
            print(f'\n{line},{len(line)}')
            err_count += 1
            literal = '""'
            abstract = Clean.clean_word(line[1], 'others').strip()
        else:
            literal = Clean.clean_word(line[0], 'title')
            abstract = line[2] if len(line) == 5 else ''.join(line[2:-2])
        abstract = abstract.replace('编辑摘要 ', '').replace('"', "'").strip()
        title_dict[clean_disambi] = [literal, abstract, curLink, exterLink]
    print("Error count:%d" % (err_count))
#!/usr/bin/env python
# coding=utf-8
from collections import defaultdict
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv", "r", encoding='utf-8') as inf:
    lines = inf.readlines()

# all_subject = defaultdict(list)
total_subject = []
f = open("./410_baidu/disambi_subject.csv", "w", encoding='utf-8')
for line in tqdm(lines):
    words = line.strip().split(",")
    disambi = Clean.clean_word(words[0], clean_level='disambi')
    subjects = words[1:]
    subjects = [Clean.clean_word(s, clean_level="subject") for s in subjects]
    total_subject.extend(subjects)
    for subject in subjects:
        if subject == "":
            continue
        f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
    # all_subject[disambi].append(subjects)
f.close()

total_subject = list(set(total_subject))
print("Total subjects: ", len(total_subject))
with open("./410_baidu/all_subject.csv", "w", encoding='utf-8') as ouf:
    # Write the deduplicated subject list, one quoted subject per row.
    for subject in total_subject:
        ouf.write("\"" + subject + "\"\r\n")
def build_entity_relation(in_file, train_file, test_file, rfile, tfile=args.title_path):
    with open(in_file, encoding="utf-8") as inf, open(train_file, "w") as trf, open(test_file, "w") as tef:
        disambi_dict, tt_pair_set = load_dict(rfile, regen=True)
        all_title = get_title(tfile)
        title_id = {k: n for n, k in enumerate(all_title)}
        disambi_id = {k: n for n, k in enumerate(disambi_dict.keys())}
        # Count input lines up front so tqdm can show real progress.
        total_lines = int(subprocess.getoutput("awk 'END{print NR}' %s" % (in_file)))
        error_counts = 0
        RESID = 0
        RES_list = []
        count_re = 0
        count_na = 0
        total_sentence_used = 0
        re_set = set([])
        # Each input line holds the full text of one lemma.
        for line_num in tqdm(range(total_lines)):
            re_in_lemma = 0
            if count_na + count_re > args.max_sentence:
                continue
            all_info = inf.readline().strip()
            title_disambi_text = all_info.split(",")
            if len(title_disambi_text) != 3:
                error_counts += 1
                continue
            text_title = Clean.clean_word(title_disambi_text[0], clean_level="title")
            text_disambi = Clean.clean_word(title_disambi_text[1], clean_level="disambi")
            all_text = title_disambi_text[2].replace("\"", "")
            try:
                relation_tuple = disambi_dict[text_disambi]
            except KeyError:
                continue
            if relation_tuple == "":
                error_counts += 1
                continue
            lines = re.split(u"[。?;!!?]", all_text)
            # Positive examples: both entities of a known relation co-occur
            # in the same sentence.
            for r_tuple in relation_tuple:
                if un_ner(text_title) or un_ner(r_tuple[1]):
                    continue
                sentence_used = False
                for line in lines:
                    RES_sentence_dict = {}
                    RES_head_dict = {}
                    RES_tail_dict = {}
                    line = line.strip().replace(" ", "") + u"。"
                    seg_line = " ".join(jieba.cut(line))
                    pos1 = seg_line.find(text_title)
                    pos2 = seg_line.find(r_tuple[1])
                    if pos1 != -1 and pos2 != -1 and pos1 != pos2:
                        if count_re > args.max_sentence * 0.25:
                            continue
                        if r_tuple[0] != "":
                            count_re += 1
                            re_in_lemma += 1
                            sentence_used = True
                            RES_head_dict['word'] = text_title
                            RES_head_dict['id'] = str(disambi_id[text_disambi])
                            RES_head_dict['type'] = "None"
                            RES_tail_dict['word'] = r_tuple[1]
                            RES_tail_dict['id'] = str(title_id[r_tuple[1]])
                            RES_tail_dict['type'] = "None"
                            RES_sentence_dict['sentence'] = seg_line
                            RES_sentence_dict['head'] = RES_head_dict
                            RES_sentence_dict['tail'] = RES_tail_dict
                            RES_sentence_dict['relation'] = r_tuple[0]
                            RES_list.append(RES_sentence_dict)
                            RESID += 1
                            re_set.add(r_tuple[0])
            # Negative (NA) examples: entity pairs that co-occur in a sentence
            # but have no known relation between them.
            for line in lines:
                if count_na > args.max_sentence * 0.75 or re_in_lemma > args.max_NA_in_lemmas:
                    break
                sentence_used = False
                relation_in_sentence = 0
                RES_sentence_dict = {}
                RES_head_dict = {}
                RES_tail_dict = {}
                line = line.strip().replace(" ", "") + u"。"
                seg_line = " ".join(jieba.cut(line))
                words = seg_line.split()
                words = [Clean.clean_word(word, clean_level="others") for word in words]
                enti_pos = []
                for i in range(len(words)):
                    if words[i] != u'' and words[i] in all_title and not un_ner(words[i]):
                        enti_pos.append(i)
                enti_pair_pos = []
                for i in enti_pos:
                    for j in enti_pos:
                        if i != j and not ((words[i] + "#" + words[j]) in tt_pair_set):
                            enti_pair_pos.append((words[i], words[j]))
                if enti_pair_pos == []:
                    continue
                for enti_pair in enti_pair_pos:
                    if relation_in_sentence > args.max_relation_in_sentence or re_in_lemma > args.max_NA_in_lemmas:
                        break
                    count_na += 1
                    relation_in_sentence += 1
                    re_in_lemma += 1
                    sentence_used = True
                    RES_head_dict['word'] = enti_pair[0]
                    RES_head_dict['id'] = str(title_id[enti_pair[0]])
                    RES_head_dict['type'] = "None"
                    RES_tail_dict['word'] = enti_pair[1]
                    RES_tail_dict['id'] = str(title_id[enti_pair[1]])
                    RES_tail_dict['type'] = "None"
                    RES_sentence_dict['sentence'] = seg_line
                    RES_sentence_dict['head'] = RES_head_dict
                    RES_sentence_dict['tail'] = RES_tail_dict
                    RES_sentence_dict['relation'] = "NA"
                    RES_list.append(RES_sentence_dict)
                    RESID += 1
                if sentence_used:
                    total_sentence_used += 1
        print("Total Relation: ", count_re + count_na, "\t Current RE: ", count_re, "\t Current NA: ", count_na)
        with open("rel2id.json", "w") as rf:
            relation_id = {k: v + 1 for v, k in enumerate(re_set)}
            relation_id['NA'] = 0
            rf.write(json.dumps(relation_id))
        print("count_re: ", count_re, "\t count_na: ", count_na, "\t count_total: ", count_re + count_na)
        print("total_sentence_used: ", total_sentence_used)
        total_len = count_re + count_na if count_re + count_na < args.max_sentence else args.max_sentence
        train_list = RES_list[:int(0.8 * total_len)]
        test_list = RES_list[int(0.8 * total_len) + 1:]
        json.dump(train_list, trf)
        json.dump(test_list, tef)
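# Each element of RES_list is a dict like the following (illustrative values,
# not real dataset output); this is the record shape the train/test JSON
# files contain:
example_record = {
    "sentence": "鲁迅 创作 了 呐喊 。",           # jieba-segmented sentence
    "head": {"word": "鲁迅", "id": "12", "type": "None"},
    "tail": {"word": "呐喊", "id": "345", "type": "None"},
    "relation": "代表作品",                        # or "NA" for negative pairs
}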
from collections import defaultdict
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_topic.csv") as in_f, open("target/disambi_topic.csv", "w") as out_f:
    reader = csv.reader(in_f)
    total_topic = []
    for line in tqdm(reader):
        disambi = line[0]
        topics = []
        for i in line[1].split(','):
            topics.extend(i.split())
        disambi = Clean.clean_word(disambi, clean_level='disambi')
        topics = [Clean.clean_word(s, clean_level="topic") for s in topics]
        total_topic.extend(topics)
        for topic in topics:
            if topic == "":
                continue
            # Same hand-patched lemma as in clean_disambi_literal.
            if '[朱槿品种]' in disambi:
                disambi = '快乐[[朱槿品种]]'
            if '"' in disambi:
                disambi = disambi.replace('"', '""')
            if '"' in topic:
                topic = topic.replace('"', '""')
            out_f.write("\"" + disambi + "\",\"" + topic + "\"\n")
    total_topic = list(set(total_topic))
    print("Total topics:%d " % (len(total_topic)))