def collect_team_suffix(suffix_out_path):
    Print("collect team suffix, write to [%s]" % suffix_out_path)
    ename_title_map = Resource.get_singleton().get_baike_ename_title()
    baike_info_map = Resource.get_singleton().get_baike_info()
    ltp = Resource.get_singleton().get_ltp()
    suffix_cnt = {}

    Print("collect suffix")
    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        e_types = baike_info_map[bk_url].types
        if not is_org(e_types):
            continue
        enames = ename_title_map[bk_url]
        for name in enames:
            words = ltp.cut(name)
            ed = len(words)
            for st in range(1, ed):
                suffix = "".join(words[st:])
                if not suffix in suffix_cnt:
                    suffix_cnt[suffix] = 0
                suffix_cnt[suffix] += 1

    threshold = 10
    outf = file(suffix_out_path, 'w')
    for key in sorted(suffix_cnt, key=lambda x: suffix_cnt[x], reverse=True):
        cnt = suffix_cnt[key]
        if cnt < threshold:
            continue
        outf.write("%s\t%d\n" % (key, cnt))
    outf.close()
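# Illustrative note only (the segmentation below is invented, not an actual LTP run):
# if ltp.cut(u'某市足球俱乐部') returned [u'某市', u'足球', u'俱乐部'], the suffix loop
# above would count the word-boundary tails u'足球俱乐部' and u'俱乐部'; only suffixes
# seen at least `threshold` (10) times across all organization names are written out.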
def merge_summary_and_infobox(summary_path, infobox_path, out_path):
    Print("load summary from [%s]" % summary_path)
    summary_map = {}
    for line in tqdm(file(summary_path, 'r'), total=nb_lines_of(summary_path)):
        p = line.split('\t')
        key = p[0]
        summary = json.loads(p[1])['summary']
        # summary = filter_bad_summary(summary)
        summary_map[key] = summary.encode('utf-8')

    Print('add infobox value to summary, path is [%s]' % infobox_path)
    for line in tqdm(file(infobox_path), total=nb_lines_of(infobox_path)):
        p = line.split('\t')
        key = p[0]
        info_values = list()
        info = json.loads(p[1])['info']
        for value_list in info.values():
            for value in value_list:
                info_values.append(value)
        if len(info_values) == 0:
            continue
        text = u"。" + u"#".join(info_values)
        text = text.encode('utf-8')
        if not key in summary_map:
            summary_map[key] = text
        else:
            summary_map[key] = summary_map[key] + text

    Print("write summary and infobox to [%s]" % out_path)
    outf = file(out_path, 'w')
    for bk_url in tqdm(sorted(summary_map.keys()), total=len(summary_map)):
        summary = {'summary': summary_map[bk_url]}
        outf.write('%s\t%s\n' % (bk_url, json.dumps(summary, ensure_ascii=False)))
    outf.close()
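# Sketch of the line formats merge_summary_and_infobox() assumes (URL and values
# are made up for illustration; both inputs are tab-separated URL + JSON):
#   summary file : <bk_url>\t{"summary": "<summary text>"}
#   infobox file : <bk_url>\t{"info": {"<predicate>": ["<value>", ...], ...}}
# The merged line appends "。" plus the '#'-joined infobox values to the summary:
#   <bk_url>\t{"summary": "<summary text>。<value1>#<value2>"}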
def generate_data_from_doc(doc_path, bk2fb, fb_uris, outpath):
    resource = Resource.get_singleton()
    fb_rels_map = resource.get_half_named_fb_info()
    ner = NamedEntityReg()
    e_linker = PageMemoryEntityLinker()
    doc_processor = DocProcessor(ner)
    url2names = resource.get_url2names()
    bk_info_map = resource.get_baike_info()
    important_domains = resource.get_important_domains()
    rel_extractor = VerbRelationExtractor()
    schema = resource.get_schema()

    Print('generate data from [%s]' % os.path.basename(doc_path))
    outf = file(outpath, 'w')
    cnt = 0
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        bk_url, doc = line.split('\t')
        if bk_url not in bk2fb:
            continue
        fb_uri = bk2fb[bk_url]
        if fb_uri not in fb_rels_map:
            continue
        fb_rels = fb_rels_map[fb_uri]
        cnt += 1
        if cnt % 100 == 0:
            print ""
            Print('cnt = %d' % cnt)

        # Print('parsing %s' % bk_url)
        # outf.write('##start parsing %s\n' % bk_url)
        bk_info = bk_info_map[bk_url]
        # skip low-popularity entities (pop < 9)
        if bk_info.pop < 4 + 5:
            continue
        types = bk_info.types
        names = url2names[bk_url]
        page_info = PageInfo(names[-1], names, bk_url,
                             get_url_domains(types, important_domains), types)
        e_linker.start_new_page(bk_url)

        # summary = [json.loads(summary)['summary']]
        # chapter_title = 'intro_summary'
        doc = json.loads(doc)
        for chapter_title, chapter in doc:
            chapter = [para for para in chapter if para.find('</table>') == -1]
            if len(chapter) == 0:
                continue
            generate_data_from_chapter(chapter_title, chapter, page_info,
                                       doc_processor, fb_rels, rel_extractor,
                                       outf, e_linker, schema)

    outf.close()
def train_extract_summary_name(summary_path, out_path):
    outf = file(out_path, 'w')
    url2names = Resource.get_singleton().get_url2names()
    extor = SummaryNameExtractor()
    Print('train summary extra name')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')
        summary = json.loads(summary)['summary']
        names = url2names[url]
        names = [x.decode('utf-8') for x in names]

        ret = extor.find_name_sentence(summary, names)
        if ret is None:
            continue
        sent, first_name = ret

        ret = extor.find_extra_name(sent, first_name, names)
        if ret is None:
            continue
        first_name, sent, second_name = ret
        outs = [url, first_name, sent, second_name]
        outf.write('%s\n' % ('\t'.join(outs)))
    outf.close()
def collect_table_cnt(cnt_path, outpath, local_info):
    Print("collect table cols cnt from [%s], write to [%s]" %
          (os.path.basename(cnt_path), os.path.basename(outpath)))
    cols_cnt, cols_type_cnt, cols_title_cnt = collect_tables(cnt_path, local_info)
    outf = file(outpath, 'w')
    useful_cols = []
    total = 0
    for cols in sorted(cols_cnt.keys(), key=lambda x: (len(x), x), reverse=True):
        cols_obj = cols.split(" # ")
        if not check_in(useful_cols, cols_obj):
            if cols_cnt[cols] < 20:
                continue
            total += cols_cnt[cols]
            useful_cols.append(cols_obj)
            types_cnt = topk_keys(cols_type_cnt[cols], 8)
            titles_cnt = topk_keys(cols_title_cnt[cols], 4)
            types_str = " ".join([fb_type + "#" + str(cnt) for fb_type, cnt in types_cnt])
            titles_str = " ".join([title + "#" + str(cnt) for title, cnt in titles_cnt])
            outf.write("%s\t%d\n" % (cols, cols_cnt[cols]))
            for fb_type, cnt in types_cnt:
                outf.write("\t%s\t%d\n" % (fb_type, cnt))
            for title, cnt in titles_cnt:
                outf.write('\t%s\t%d\n' % (title, cnt))
        else:
            total += cols_cnt[cols]
    print total
    outf.close()
def init(self, paths=None):
    if paths is None:
        rules_dir = os.path.join(table_dir, 'rules')
        paths = glob.glob(rules_dir + "/*rule")
    path_str = " ".join([os.path.basename(x) for x in paths])
    Print('load table rule from [%s]' % path_str)
    for path in paths:
        self.load_from_file(path)
def filter_summary(ori_path, new_path):
    Print('filter summary from [%s] to [%s]' %
          (os.path.basename(ori_path), os.path.basename(new_path)))
    outf = file(new_path, 'w')
    for line in tqdm(file(ori_path), total=nb_lines_of(ori_path)):
        bk_url, summary = line.split('\t')
        summary = json.loads(summary)['summary']
        new_summary = filter_bad_summary(summary)
        new_summary = {'summary': new_summary}
        outf.write("%s\t%s\n" % (bk_url, json.dumps(new_summary, ensure_ascii=False)))
    outf.close()
def save(self, filepath):
    Print("save team dict to [%s]" % filepath)
    outf = file(filepath, 'w')
    for suffix in self.dicts:
        team_dict = self.dicts[suffix]
        for bk_url in team_dict:
            out = "%s\t%s\t%s" % (suffix, bk_url, "\t".join(team_dict[bk_url]))
            outf.write(out.rstrip() + '\n')
    outf.close()
def gen_name_map(extractor):
    baike_ename_title = Resource.get_singleton().get_baike_ename_title()
    url2names = Resource.get_singleton().get_url2names()
    bk_static_info = Resource.get_singleton().get_baike_info()
    location_dict = Resource.get_singleton().get_location_dict()
    location_dict = set([x.decode('utf-8') for x in location_dict])

    all_names = set()
    for bk_url in url2names:
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue
        enames = url2names[bk_url]
        is_son = False
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(ename)
            if parent_name:
                is_son = True
        if is_son:
            continue
        for ename in enames:
            all_names.add(ename)

    name_map = {}
    Print("extract parent name")
    for bk_url in tqdm(baike_ename_title, total=len(baike_ename_title)):
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue
        enames = baike_ename_title[bk_url]
        for ename in enames:
            # try_extract_parent_name() returns unicode or None
            parent_name = extractor.try_extract_parent_name(ename)
            if not parent_name:
                continue
            if parent_name.encode('utf-8') in all_names:
                add_to_dict_list(name_map, parent_name, ename.decode('utf-8'))
            second_parent_name = del_loc_prefix(parent_name, location_dict)
            if second_parent_name and second_parent_name.encode('utf-8') in all_names:
                add_to_dict_list(name_map, second_parent_name, ename.decode('utf-8'))
    return name_map
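# Minimal sketch (assumed, not from the source) of the extractor contract that
# gen_name_map() relies on: try_extract_parent_name(ename) should return the
# parent name embedded in an ename as unicode, or None when no parent is found.
class DummyParentNameExtractor(object):
    def try_extract_parent_name(self, ename):
        # e.g. an ename like u'XX大学附属中学' would yield the parent u'XX大学';
        # this stub simply reports "no parent" for every input.
        return None

# name_map = gen_name_map(DummyParentNameExtractor())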
def load_extra_table(self, path=None):
    if path is None:
        path = os.path.join(table_dir, 'rules/extra_table.tsv')
    Print('load extra table from [%s]' % os.path.basename(path))
    for line in file(path):
        p = line.strip().split("\t")
        table = p[0]
        rule_names = p[1:]
        for rule in rule_names:
            find = False
            for table_rule in self.table_rules:
                if table_rule.name == rule:
                    find = True
                    table_rule.register_table(table)
                    break
            assert find
def gen_province_dict():
    Print('generate province dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()
    out_path = os.path.join(dict_dir, 'province.txt')

    province_names = set()
    error_domains = get_error_domains()
    for bk_url in tqdm(ename_title_map, total=len(ename_title_map)):
        enames = ename_title_map[bk_url]
        if not bk_url in baike_info_map:
            continue
        bk_info = baike_info_map[bk_url]
        bk_types = bk_info.types
        if not "fb:location.location" in bk_types:
            continue
        is_province = False
        for bk_type in bk_types:
            if get_domain(bk_type) == 'fb:location' and (
                    'state' in bk_type or "province" in bk_type):
                print "province bk_type: %s" % bk_url
                is_province = True
        for ename in enames:
            ename = ename.decode('utf-8')
            if len(ename) > 2 and (ename.endswith(u'省') or ename.endswith(u"州")):
                print "province ename: %s %s" % (ename, bk_url)
                is_province = True

        # if is_province:
        #     for bk_type in bk_types:
        #         if get_domain(bk_type) in error_domains:
        #             is_province = False
        #             print "province error type: %s" % bk_url

        if is_province:
            province_names.update(enames)

    outf = file(out_path, 'w')
    for name in province_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % name)
    outf.close()
def extract_table_columns():
    outpath = os.path.join(table_dir, 'table_column_count.tsv')
    outf = file(outpath, 'w')
    doc_path = os.path.join(rel_ext_dir, 'baike_doc.json')
    Print('count table\'s columns')
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        bk_url, doc = line.split('\t')
        doc = json.loads(doc)
        for title, chapter in doc:
            if type(chapter) is not unicode:
                continue
            try:
                tables = parse_tables_from_html(chapter)
                for table in tables:
                    outf.write('%s\t%s\t%s\n' %
                               (bk_url, title, " # ".join(table['columns'])))
            except Exception, e:
                print 'error at parse %s title = %s' % (bk_url, title)
def process(inpath, outpath, name_map, fb_uris):
    schema = Resource.get_singleton().get_schema()
    error_props = load_error_property()

    Print('process %s' % inpath)
    outf = file(outpath, 'w')
    error_outf = file('log/error.log', 'w')
    for line in tqdm(file(inpath), total=nb_lines_of(inpath)):
        fb_key, rels = line.split('\t')
        if not fb_key in fb_uris:
            continue
        rels = json.loads(rels)
        new_rels = {}
        for fb_property, obj in rels:
            if schema.reverse_property(fb_property) == fb_property:
                continue
            if fb_property in error_props:
                continue
            if obj in name_map:
                names = name_map[obj]
            else:
                literal = process_fb_value(obj)
                if literal.startswith('fb:m.'):
                    # error_outf.write('error property %s, entity %s\n' % (fb_property, fb_key))
                    names = []
                else:
                    names = [process_fb_value(obj)]
            if len(names) == 0:
                continue
            if not fb_property in new_rels:
                new_rels[fb_property] = []
            new_rels[fb_property].extend(names)

        big = False
        for fb_property in new_rels:
            new_rels[fb_property] = list(set(new_rels[fb_property]))
            if len(new_rels[fb_property]) > 300:
                error_outf.write(
                    'big size property of url = %s, property = %s, size = %d\n' %
                    (fb_key, fb_property, len(new_rels[fb_property])))
        outf.write("%s\t%s\n" % (fb_key, json.dumps(new_rels, ensure_ascii=False)))
    outf.close()
    error_outf.close()
def gen_citytown_dict():
    Print('generate citytown dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()

    citytown_names = set()
    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        if not bk_url in baike_info_map or not bk_url in ename_title_map:
            continue
        bk_types = baike_info_map[bk_url].types
        if not 'fb:location.location' in bk_types:
            continue
        if not "fb:location.citytown" in bk_types:
            continue
        if 'fb:people.person' in bk_types:
            continue
        enames = ename_title_map[bk_url]

        # is_error_name = False
        # error_suffix = ['乡', "镇", '村', '街道', '道路']
        # for ename in enames:
        #     for suffix in error_suffix:
        #         if ename.endswith(suffix):
        #             is_error_name = True
        # if is_error_name:
        #     continue

        citytown_names.update(enames)

    out_path = os.path.join(dict_dir, 'citytown.txt')
    outf = file(out_path, 'w')
    for name in citytown_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % name)
    outf.close()
prof_dict_path = os.path.join(dict_dir, 'profession.txt')
prof_url_dict_path = os.path.join(dict_dir, 'profession_url.txt')
prof_dict = set(load_file(prof_dict_path))

candidate_profs = set()
for line in file(prof_cnt_path):
    p = line.split('\t')
    if len(p) == 2:
        prof = p[0]
        cnt = int(p[1])
        if cnt >= prof_cnt_threshold and is_chinese(prof) \
                and prof not in prof_dict and len(prof.decode('utf-8')) >= 2:
            candidate_profs.add(prof)
Print("#candidate name = %d" % len(candidate_profs))

resource = Resource.get_singleton()
baike_ename_title_map = resource.get_baike_ename_title()
prof2bk = {}
for bk_url in baike_ename_title_map:
    enames = baike_ename_title_map[bk_url]
    for ename in enames:
        if ename in candidate_profs:
            if not ename in prof2bk:
                prof2bk[ename] = set()
            prof2bk[ename].add(bk_url)
Print("#hit candidate name = %d" % len(prof2bk))

extra_prof_out_path = os.path.join(dict_dir, 'extra_profession.txt')
extra_prof_url_out_path = os.path.join(dict_dir, 'extra_profession_url.txt')
        return True
    return False


if __name__ == "__main__":
    name2bk = Resource.get_singleton().get_name2bk()
    keys = sorted(name2bk.keys())
    year_pattern = re.compile(ur'(公元前|公元)?\d{1,4}年$')
    re_digit = re.compile(r'^[0-9+\-=!?]+$')
    re_eng = re.compile(r"^[a-zA-Z]+$")

    valid_func = is_vertical_domain
    out_path = os.path.join(dict_dir, 'vertical_domain_baike_dict.txt')
    Print('use valid_func: is_vertical_domain')

    bk_info_map = Resource.get_singleton().get_baike_info()
    outf = file(out_path, 'w')
    Print('write dict to %s' % out_path)
    for name in tqdm(keys, total=len(keys)):
        if not is_valid_dict_name(name):
            continue
        # if has_punc_eng(name):
        #     continue
        bks = name2bk[name]
        # pop = 0
        valid = False
        for bk_url in bks:
def adjust_pop_by_summary(self):
    Print('adjust entity popularity according to its summary length')
    for bk_url in tqdm(self.bk_info_map, total=len(self.bk_info_map)):
        # every full 100 characters of summary adds 2 popularity, capped at +10
        summary_length = len(self.summary_map.get(bk_url, "")) / 100
        self.bk_info_map[bk_url].pop += min(summary_length * 2, 10)
def extract_team_extra_name_from_summary(summary_path, out_path):
    resource = Resource.get_singleton()
    url2names = resource.get_url2names()
    ename_title_map = resource.get_baike_ename_title()
    baike_info_map = resource.get_baike_info()
    location_dict = resource.get_location_dict()
    ltp = resource.get_ltp()
    ner = NamedEntityReg()

    team_suffixes = load_team_suffix()
    team_suffixes = [x.decode('utf-8') for x in team_suffixes]
    team_suffixes = set(team_suffixes)

    Print('extract org\'s extra name from summary [%s]' % summary_path)
    Print("result write to [%s]" % out_path)
    outf = file(out_path, 'w')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        bk_url, summary = line.split('\t')
        types = baike_info_map[bk_url].types
        if not is_team(types):
            continue
        enames = ename_title_map[bk_url]
        enames = [x.decode('utf-8') for x in enames]
        ori_names = url2names[bk_url]
        ori_names = set([x.decode('utf-8') for x in ori_names])

        summary = json.loads(summary)['summary']
        sentences = split_sentences(summary)
        parsed_names = []
        j_names_cnt = {}
        for sentence in sentences:
            names, j_names = parse_entity(sentence, ltp, ner, location_dict)
            parsed_names.extend(names)
            for j_name in j_names:
                if not j_name in j_names_cnt:
                    j_names_cnt[j_name] = 0
                j_names_cnt[j_name] += 1

        succeed_names = set()
        suffixes = []
        for ename in enames:
            if "fb:sports.sports_team" in types:
                suffix = try_get_suffix(ename, team_suffixes)
            else:
                suffix = u''
            suffixes.append(suffix)
            if has_eng_digit(ename):
                continue
            if len(suffix) > 0:
                new_name = ename[:len(ename) - len(suffix)]
                if not is_location(new_name, location_dict):
                    succeed_names.add(new_name)
                    succeed_names.add(new_name + u"队")

        parsed_names = set(parsed_names)
        for parsed_name in parsed_names:
            valid = False
            for ename, suffix in zip(enames, suffixes):
                if has_eng_digit(ename):
                    continue
                if is_good_sub_seq(parsed_name, ename, suffix):
                    valid = True
            if valid:
                succeed_names.add(parsed_name)

        for j_name in j_names_cnt:
            if j_names_cnt[j_name] >= 2:
                valid = False
                for ename, suffix in zip(enames, suffixes):
                    if has_eng_digit(ename):
                        continue
                    if j_name not in ename and is_good_sub_seq(j_name, ename, suffix):
                        valid = True
                if valid:
                    succeed_names.add(j_name)

        succeed_names = [
            new_name for new_name in succeed_names if not new_name in ori_names
        ]
        succeed_names = [
            new_name for new_name in succeed_names if not has_strange_punc(new_name)
        ]
        # succeed_names = [new_name for new_name in succeed_names
        #                  if not is_location(new_name, location_dict)]
        if len(succeed_names) > 0:
            succeed_names = set(succeed_names)
            outf.write('%s\t%s\n' % (bk_url, "\t".join(succeed_names)))
    outf.close()
def gen_title_rel_dict(fb_type, count_filepath, out_path, cnt_threshold,
                       extra_name_filepath=None, error_func=None, url_path=None):
    Print('gen dict by type [%s]' % fb_type)
    candidate_urls = set()
    resource = Resource.get_singleton()
    baike_static_info_map = resource.get_baike_info()

    Print("gen candidate baike_url")
    for bk_url in tqdm(baike_static_info_map, total=len(baike_static_info_map)):
        types = baike_static_info_map[bk_url].types
        if fb_type in types:
            candidate_urls.add(bk_url)

    candidate_names = set()
    if count_filepath is not None:
        for line in file(count_filepath):
            p = line.strip().split('\t')
            if len(p) == 2:
                name, cnt = p
                cnt = int(cnt)
                if cnt >= cnt_threshold:
                    candidate_names.add(name)
    Print('#candidate urls = %d, #candidate names = %d' %
          (len(candidate_urls), len(candidate_names)))

    # ename_title_map = resource.get_baike_ename_title()
    url2names = resource.get_url2names()
    title_names = set()
    title2url = {}
    for candidate_url in candidate_urls:
        enames = url2names[candidate_url]
        for ename in enames:
            if ename in candidate_names or count_filepath is None:
                # assert ename not in title_names
                title_names.add(ename)
                if ename in title2url:
                    pre_pop = baike_static_info_map[title2url[ename]].pop
                    pop = baike_static_info_map[candidate_url].pop
                    if pre_pop > pop:
                        title_url = title2url[ename]
                    else:
                        title_url = candidate_url
                else:
                    title_url = candidate_url
                title2url[ename] = title_url
            else:
                print "%s: miss name: %s" % (fb_type, ename)

    if extra_name_filepath is not None:
        Print("add extra name from [%s]" % extra_name_filepath)
        for line in file(extra_name_filepath):
            title_names.add(line.rstrip())

    outf = file(out_path, 'w')
    if url_path:
        url_outf = file(url_path, 'w')
    for title_name in sorted(title_names):
        if title_name == '无':
            continue
        if error_func is not None and error_func(title_name):
            print "%s: error func name: %s" % (fb_type, title_name)
            continue
        if len(title_name.decode('utf-8')) < 2:
            print "%s: short name: %s" % (fb_type, title_name)
            continue
        if is_chinese(title_name):
            outf.write(title_name + '\n')
            if url_path:
                url_outf.write("%s\t%s\n" % (title_name, title2url[title_name]))
    outf.close()
    if url_path:
        url_outf.close()
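# Hypothetical invocation (the type, paths and threshold are illustrative only):
# build a dictionary of Chinese names for entities typed fb:sports.sports_team,
# keeping only names that appear at least 10 times in the count file.
#
# gen_title_rel_dict('fb:sports.sports_team',
#                    count_filepath=os.path.join(rel_ext_dir, 'team_name_cnt.tsv'),
#                    out_path=os.path.join(dict_dir, 'team.txt'),
#                    cnt_threshold=10)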
def count_predicate(inpath, outpath, predicate):
    cnt_map = {}
    for line in tqdm(file(inpath), total=nb_lines_of(inpath)):
        _, obj = line.split('\t')
        info = json.loads(obj)['info']
        values = info.get(predicate, [])
        for value in values:
            if not value in cnt_map:
                cnt_map[value] = 1
            else:
                cnt_map[value] += 1

    outf = file(outpath, 'w')
    for key in sorted(cnt_map.keys(), key=lambda x: cnt_map[x], reverse=True):
        if is_chinese(key):
            outf.write("%s\t%s\n" % (key, cnt_map[key]))
    outf.close()


if __name__ == "__main__":
    base_dir = os.path.join(rel_ext_dir, 'infobox_count')
    predicates = [u'职业', u'国籍']
    infobox_path = os.path.join(result_dir, '360/360_entity_info_processed.json')
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
    for predicate in predicates:
        Print("process predicate %s" % predicate)
        outpath = os.path.join(base_dir, '%s_cnt.tsv' % predicate)
        count_predicate(infobox_path, outpath, predicate)
            main_pred = predicate.split("#")[0]
            if len(main_pred.decode('utf-8')) < 2:
                continue
            pred_map = self.map[predicate]
            for prop in pred_map:
                prop_cnt = pred_map[prop]
                self.add(main_pred, [prop], prop_cnt)
        for predicate in error_preds:
            self.map.pop(predicate)


if __name__ == '__main__':
    inpath = sys.argv[1]
    outpath = sys.argv[2]
    Print('collect from [%s] write to [%s]' % (inpath, outpath))
    pred_maps = PredicateMaps()
    for line in file(inpath):
        if not line.startswith('\t'):
            continue
        l = line.strip()
        p = l.split('\t')
        predicate = p[0]
        predicate = predicate.strip("'\" :#")
        if len(predicate) == 0:
            continue
        if len(predicate.decode('utf-8')) < 2 and predicate != '是':
            continue
        props = p[1:]
        pred_maps.add(predicate, props)
def extract_summary_name(summary_path, keywords, outpath, bracket_name_outpath):
    Print('extract extra name from [%s]' % summary_path)
    # url2names = Resource.get_singleton().get_url2names()
    url2names = load_url2names()
    bk_info_map = Resource.get_singleton().get_baike_info()
    error_domains = ['fb:chemistry']
    ext = SummaryNameExtractor()
    outf = file(outpath, 'w')
    bracket_name_outf = file(bracket_name_outpath, 'w')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')

        types = bk_info_map[url].types
        in_error_domain = False
        for bk_type in types:
            if get_domain(bk_type) in error_domains:
                in_error_domain = True
        if in_error_domain:
            continue

        summary = json.loads(summary)['summary']
        summary = summary.replace(u'(', u'(').replace(u')', u')')
        names = url2names[url]
        names = [x.decode('utf-8') for x in names]

        ret = ext.find_name_sentence(summary, names)
        if ret is None:
            extra_name = None
            sentences = split_sentences(summary)
            if len(sentences) > 0:
                first_sentence = sentences[0]
                no_subj = True
                for name in names:
                    if name in first_sentence:
                        no_subj = False
                if no_subj:
                    extra_name = ext.find_no_subj_name(summary, keywords)
        else:
            rest_sentence, first_name = ret
            extra_name = ext.find_new_extra_name(rest_sentence, keywords)

        if extra_name is not None:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            succeed_names = []
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                        and not too_long_name(extra_name, names) \
                        and not extra_name in names \
                        and not error_bracket_name(extra_name, names) \
                        and not too_short_name(extra_name) \
                        and not is_error_name(extra_name) \
                        and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
            if len(succeed_names) > 0:
                succeed_names = list(set(succeed_names))
                outf.write('%s\t%s\n' % (url, "\t".join(succeed_names)))
                names.extend(succeed_names)

        # extract bracket name
        extra_bracket_names = ext.extract_bracket_names(summary, keywords, names)
        succeed_names = []
        for extra_name in extra_bracket_names:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                        and not too_long_name(extra_name, names) \
                        and not extra_name in names \
                        and not too_short_name(extra_name) \
                        and not is_error_name(extra_name) \
                        and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
        if len(succeed_names) > 0:
            succeed_names = list(set(succeed_names))
            bracket_name_outf.write('%s\t%s\n' % (url, "\t".join(succeed_names)))

    outf.close()
    bracket_name_outf.close()
import os
import json

from tqdm import tqdm

from src.IOUtil import Print, rel_ext_dir, result_dir, nb_lines_of


def load_baike_ename_title():
    path = os.path.join(result_dir, '360/360_entity_info_processed.json')
    Print('load baike\'s ename and title from [%s]' % path)
    ename_title_map = {}
    for line in tqdm(file(path), total=nb_lines_of(path)):
        bk_url, obj = line.split('\t')
        obj = json.loads(obj)
        ename, title = obj['ename'].encode('utf-8'), obj['title'].encode('utf-8')
        if title != ename:
            ename_title_map[bk_url] = [ename, title]
        else:
            ename_title_map[bk_url] = [ename]
    return ename_title_map


if __name__ == "__main__":
    ename_title_map = load_baike_ename_title()
    out_path = os.path.join(rel_ext_dir, 'baike_ename_title.tsv')
    Print("write to [%s]" % out_path)
    outf = file(out_path, 'w')
    for bk_url in tqdm(sorted(ename_title_map.keys()), total=len(ename_title_map)):
        outf.write('%s\t%s\n' % (bk_url, "\t".join(ename_title_map[bk_url])))
    outf.close()