def begin_filter_with_chinese(fname):
    entities = set()
    for line in read_file_iter(fname):
        if regchinese.match(line.decode('utf-8')):
            entities.add(line)
    write_file('entities_chinese_2_10.txt', entities)
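# --- Assumed module-level setup (hypothetical; not part of the original code) ---
# The functions in this file rely on a few helpers and pre-compiled regexes that
# are defined elsewhere in the project. write_file is imported from hzlib.libfile
# in the original functions; read_file and read_file_iter presumably live in the
# same helper module. The regex patterns below are only guesses at what the
# originals match, reconstructed from how they are used (e.g. the output file
# name 'entities_chinese_2_10.txt' suggests 2-10 Chinese characters).
import codecs
import json
import os
import re
import string
from collections import Counter

import lxml.html

from hzlib.libfile import write_file                 # as imported in the original code
from hzlib.libfile import read_file, read_file_iter  # assumed to come from the same module

regchinese = re.compile(u'^[\u4e00-\u9fa5]{2,10}$')          # guess: 2-10 Chinese chars only
regdropbrackets = re.compile(u'^(.+?)\uff08.+?\uff09$')      # guess: drop a fullwidth-bracket suffix
regdisambiguation = re.compile(u'^(.+?)\s*[(\uff08].+?[)\uff09]$')  # guess: drop a (qualifier) suffix
# regentityfilt, regrmlabel and zgdbk_parse_entity are likewise assumed to be defined elsewhere.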
def get_fudanperiod_entity(entity_dict=False):
    # get fudaninc entities stored in periodCache
    from hzlib.libfile import write_file
    saved_dir = '/data/hproject/2016/fudaninc-20160825'
    entities = set()
    failed = 0
    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            try:
                data = json.load(fd)
            except ValueError:
                # log files that are not valid JSON and skip them
                failed += 1
                fd.seek(0)
                with open('failed.txt', 'a') as out:
                    out.write(fd.read() + '\n')
                print(failed)
                continue
            for entity, dic in data.iteritems():
                if entity_dict:
                    m = regdropbrackets.match(entity)
                    entities.add(m.group(1) if m else entity)
                else:
                    entities.add(entity)
    if entity_dict:
        write_file('fudankg_entities_dict.txt', list(entities))
    else:
        write_file('fudankg_entities.txt', list(entities))
def wiki_title_entity(fname, persistent=False):
    entities = set()
    for line in read_file_iter(fname):
        m = regdisambiguation.match(line.strip().decode('utf-8'))
        item = m.group(1).encode('utf-8') if m else line.strip()
        if not item.startswith('\xee'):  # skip human-unreadable strings
            entities.add(item.strip())
    print('wiki title entities length: ', len(entities))
    if persistent:
        write_file('entities/{}_title'.format(fname), entities)
    return entities
def dbpedia_extract_entity(fname, persistent=False):
    entities = set()
    for jsn in read_file_iter(fname, jsn=True):
        key, value = jsn.items()[0]
        key = value[u'resource_label'].strip()
        m = regdisambiguation.match(key)
        entity = m.group(1) if m else key
        entities.add(entity.encode('utf-8'))
    print('dbpedia entities length: ', len(entities))
    if persistent:
        write_file('entities/dbpedia_entities.txt', entities)
    return entities
def bdbk_extract_entity(ifilename, persistent=False):
    entities = set()
    last_line = '</>'
    for line in read_file_iter(ifilename):
        if last_line == '</>':
            entities.add(line)
        elif line.startswith('@@@LINK='):
            entities.add(line[8:])
        last_line = line
    print('bdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/{}_entities.txt'.format(ifilename), entities)
    return entities
def wiki_extract_entity(fname, persistent=False):
    entities = set()
    for jsn in read_file_iter(fname, jsn=True):
        m = regdisambiguation.match(jsn[u'chinese_label'])
        item = m.group(1) if m else jsn[u'chinese_label']
        entities.add(item.encode('utf-8').strip())
        if u'chinese_aliases' in jsn:
            entities.update(
                map(string.strip,
                    map(lambda x: x.encode('utf-8'), jsn[u'chinese_aliases'])))
    print('wiki entities length: ', len(entities))
    if persistent:
        write_file('entities/wiki_entities.txt', entities)
    return entities
def zgdbk_extract_entity(infilename, persistent=False):
    entities = set()
    re_entity = re.compile('<span id="span2" class="STYLE2">(.+)</span')
    for line in read_file_iter(infilename):
        m = re_entity.match(line)
        if m:
            # regrmlabel and zgdbk_parse_entity are helpers defined elsewhere
            # that clean up the raw title
            entity = regrmlabel(m.group(1))
            entity = zgdbk_parse_entity(entity)
            if entity:
                entities.add(entity.strip())
    print('zgdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/zgdbk_entities.txt', entities)
    return entities
def zgdbk_extract_info():
    pairs = []
    content = read_file('zgdbk.txt')
    # '</SRIPT>' is kept as-is: it matches the markup as it appears in the dump
    infos = re.findall(
        '<span id="span2" class="STYLE2">(.+?)</span.+?'
        '<span id="span14".+?</SRIPT>(.+?)</table>',
        content, re.S)
    for i in infos:
        entity = zgdbk_parse_entity(i[0])
        if entity is None:
            continue
        info = i[1][:i[1].rfind('</span>')]
        info = lxml.html.fromstring(info.decode('utf-8')).text_content()
        pairs.append(json.dumps({entity: info}))
    write_file('zgdbk_result.txt', pairs)
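# Illustration only (invented sample, hypothetical markup): roughly the shape of
# HTML that the findall pattern in zgdbk_extract_info() appears to target -- the
# entry title inside the "span2" element, and the entry body sitting between the
# (misspelled) </SRIPT> tag and the closing </table>.
def _demo_zgdbk_pattern():
    import re
    sample = ('<span id="span2" class="STYLE2">Great Wall</span>...'
              '<span id="span14">...</SRIPT><span>entry body here</span></table>')
    # expected: [('Great Wall', '<span>entry body here</span>')]
    return re.findall(
        '<span id="span2" class="STYLE2">(.+?)</span.+?'
        '<span id="span14".+?</SRIPT>(.+?)</table>',
        sample, re.S)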
def information_exist_proportion():
    from hzlib.libfile import write_file
    saved_dir = '/data/crawler_file_cache/fudankg_saved'
    gcounter = Counter()
    entities_info_not_exist = set()
    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            for entity, dic in json.load(fd).iteritems():
                if u'Information' in dic:
                    gcounter['exist'] += 1
                else:
                    entities_info_not_exist.add(entity)
                gcounter['total'] += 1
    # float() avoids Python 2 integer division truncating the ratio to 0
    print(float(gcounter['exist']) / gcounter['total'])
    write_file('fudankg_entities_info_not_exist.txt',
               list(entities_info_not_exist))
def get_fudankg_entity(entity_dict=False):
    from hzlib.libfile import write_file
    saved_dir = '/data/crawler_file_cache/fudankg_saved'
    entities = set()
    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            for entity, dic in json.load(fd).iteritems():
                if entity_dict:
                    m = regdropbrackets.match(entity)
                    entities.add(m.group(1) if m else entity)
                else:
                    entities.add(entity)
    if entity_dict:
        write_file('fudankg_entities_dict.txt', list(entities))
    else:
        write_file('fudankg_entities.txt', list(entities))
def extract_bdbk_with_alias(ifilename):
    with open('{}_entities.txt'.format(ifilename), 'w') as wfd, \
            codecs.open(ifilename) as rfd:
        last_line = '</>'
        for i, line in enumerate(rfd):
            line = line.strip()
            if last_line == '</>':
                # first line after a record separator is a headword
                if i == 0:
                    wfd.write(line)
                else:
                    wfd.write('\n' + line)
            elif line.startswith('@@@LINK='):
                # @@@LINK line: append its target after a tab
                wfd.write('\t' + line[8:])
            last_line = line
    entity_alias = {}
    with open('{}_entities.txt'.format(ifilename)) as fd:
        for line in fd:
            if '\t' in line:
                # NOTE: assumes exactly one tab-separated alias per line
                entity_alias.update(dict([line.strip().split('\t')]))
    write_file('bdbk_entity_alias.json', entity_alias, jsn=True)
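# Illustration only (invented sample): the record format the two bdbk parsers
# above appear to assume -- '</>' separates records, the line right after a
# separator is the headword, and a '@@@LINK=<target>' line marks an alias
# record pointing at its canonical headword. None of this sample data is real.
def _demo_bdbk_record_format():
    sample = [
        'Beijing',            # headword of a normal record
        'capital of China',   # definition body (ignored by the parsers)
        '</>',                # record separator
        'Peking',             # headword of an alias record
        '@@@LINK=Beijing',    # alias body: points at the canonical headword
        '</>',
    ]
    entities, last_line = set(), '</>'
    for line in sample:
        if last_line == '</>':
            entities.add(line)
        elif line.startswith('@@@LINK='):
            entities.add(line[8:])
        last_line = line
    return entities  # -> set(['Beijing', 'Peking'])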
def comic_song_extract_entity(fname, persistent=False):
    entities = set()
    for line in read_file_iter(fname):
        m = regdropbrackets.match(line.decode('utf-8'))
        entity = m.group(1).encode('utf-8') if m else line
        entities.add(entity)
    print('comic song entities length: ', len(entities))
    if persistent:
        write_file('entities/comic_song_entities.txt', entities)
    return entities


if __name__ == '__main__':
    entities = set()
    entities.update(zgdbk_extract_entity('zgdbk.txt', True))
    entities.update(bdbk_extract_entity('vbk2012.txt', True))
    entities.update(bdbk_extract_entity('vbk2012_ext.txt', True))
    entities.update(wiki_extract_entity('wikidata_zh_simplified.json', True))
    entities.update(
        dbpedia_extract_entity('merge_step_5_simplified.json', True))
    entities.update(
        wiki_title_entity('zhwiki-20160601-all-titles-in-ns2.txt', True))
    entities.update(comic_song_extract_entity('ertong.txt', True))
    write_file('entities/entities_0630_raw.txt', entities)
def begin_filter_with_lower(fname):
    entities = set()
    for line in read_file_iter(fname):
        entities.add(line.lower())
    return entities


def begin_filter_with_search(fname):
    entities = set()
    for line in read_file_iter(fname):
        m = regentityfilt.match(line.decode('utf-8'))
        if m:
            entities.add(m.group(1))
    return entities


if __name__ == '__main__':
    entities = set()
    # entities.update(begin_filter_with_search('entities/first_order_entities.txt'))
    entities.update(
        begin_filter_with_search('entities/second_order_entities.txt'))
    entities.update(
        begin_filter_with_search('entities/third_order_entities.txt'))
    print('length of first order filtered entities', len(entities))
    write_file('entities_for_fudankg_search.txt', entities)
    # (tail of load_db_data_bd: cache the three entity sets on the function)
    setattr(load_db_data_bd, '_cache', (dbpedia, wikidata, bdbk))
    return load_db_data_bd._cache


def intersection_of3(dbpedia, wikidata, bdbk):
    if not hasattr(intersection_of3, '_cache'):
        inter_temp = dbpedia.intersection(wikidata)
        inter3 = inter_temp.intersection(bdbk)
        setattr(intersection_of3, '_cache',
                (inter3, inter_temp - inter3, inter_temp))
    return intersection_of3._cache


def get_dbpedia_wikidata():
    dbpedia, wikidata, bdbk = load_db_data_bd()
    _, _, db_wiki_each = intersection_of3(dbpedia, wikidata, bdbk)
    return list(dbpedia - db_wiki_each) + list(wikidata - db_wiki_each)


def get_bdbk():
    dbpedia, wikidata, bdbk = load_db_data_bd()
    return list(bdbk.difference(dbpedia).difference(wikidata))


if __name__ == '__main__':
    first = run()
    write_file('entities/first_order_entities.txt', first)
    write_file('entities/second_order_entities.txt', get_dbpedia_wikidata())
    write_file('entities/third_order_entities.txt', get_bdbk())