def read_jsn(self, data_dir):
    """Two-pass load of every JSON file under *data_dir*.

    Pass 1 feeds each record to parse_info; pass 2 re-lists the directory
    and feeds each record to parse_subcompany and parse_alias — presumably
    so all base info is registered before sub-company/alias links are
    resolved (TODO confirm with the parser implementations).
    """
    # First pass: base info for every record.
    for entry in os.listdir(data_dir):
        path = os.path.join(data_dir, entry)
        for record in libfile.read_file_iter(path, jsn=True):
            self.parse_info(record)
    # Second pass: sub-companies and aliases.
    for entry in os.listdir(data_dir):
        path = os.path.join(data_dir, entry)
        for record in libfile.read_file_iter(path, jsn=True):
            self.parse_subcompany(record)
            self.parse_alias(record)
def begin_filter_with_chinese(fname):
    """Keep the lines of *fname* whose start matches regchinese and
    persist the surviving set to 'entities_chinese_2_10.txt'."""
    kept = set()
    for raw in read_file_iter(fname):
        # Lines are byte strings; the pattern is applied to their UTF-8
        # decoding, but the original (encoded) line is what gets kept.
        if regchinese.match(raw.decode('utf-8')):
            kept.add(raw)
    write_file('entities_chinese_2_10.txt', kept)
def read_jsn(self, data_dir):
    """Parse every JSON record in the files under *data_dir*.

    A failure while parsing one record is logged and skipped so a single
    bad record does not abort the whole load.
    """
    for fname in os.listdir(data_dir):
        for js in libfile.read_file_iter(os.path.join(data_dir, fname), jsn=True):
            try:
                self.parse(js)
            # BUG FIX: was `except Exception, e` (Python 2-only syntax) and
            # printed `e.message`, which is deprecated, removed in Python 3,
            # and absent on many exception classes (would raise
            # AttributeError inside the handler). `as e` + `e` is safe on
            # Python 2.6+ and 3.
            except Exception as e:
                print('{}: {}'.format(type(e), e))
def begin_filter_with_search(fname):
    """Return group(1) of every line in *fname* matched by regentityfilt."""
    matched = set()
    for raw in read_file_iter(fname):
        hit = regentityfilt.match(raw.decode('utf-8'))
        if hit:
            matched.add(hit.group(1))
    return matched
def load_zhwiki_alias(dirname, fname):
    """Map each zhwiki record's chinese_label to its chinese_aliases.

    Records lacking a 'chinese_aliases' key are skipped entirely.
    """
    alias_by_label = {}
    path = os.path.join(dirname, fname)
    for record in read_file_iter(path, jsn=True):
        if u'chinese_aliases' not in record:
            continue
        alias_by_label[record[u'chinese_label']] = record[u'chinese_aliases']
    return alias_by_label
def comic_song_extract_entity(fname, persistent=False):
    """Extract comic-song entity names from *fname*.

    When regdropbrackets matches a decoded line, group(1) (re-encoded to
    UTF-8) is kept; otherwise the raw line is kept unchanged. Writes the
    set to disk only when *persistent* is true.
    """
    found = set()
    for raw in read_file_iter(fname):
        hit = regdropbrackets.match(raw.decode('utf-8'))
        if hit:
            found.add(hit.group(1).encode('utf-8'))
        else:
            found.add(raw)
    print('comic song entities length: ', len(found))
    if persistent:
        write_file('entities/comic_song_entities.txt', found)
    return found
def wiki_title_entity(fname, persistent=False):
    """Extract wiki title entities from *fname*.

    Disambiguation suffixes are stripped via regdisambiguation; items
    beginning with '\xee' (private-use glyphs, human-unreadable) are
    dropped. The set is written to disk only when *persistent* is true.
    """
    entities = set()
    for line in read_file_iter(fname):
        m = regdisambiguation.match(line.strip().decode('utf-8'))
        item = m.group(1).encode('utf-8') if m else line.strip()
        if not item.startswith('\xee'):  # human unreadable string
            entities.add(item.strip())
    print('wiki title entities length: ', len(entities))
    # BUG FIX: write_file was unconditional while the length print was
    # gated on `persistent` -- inverted relative to every sibling
    # *_extract_entity helper in this file (print always, write only
    # when persistent).
    if persistent:
        write_file('entities/{}_title'.format(fname), entities)
    return entities
def dbpedia_extract_entity(fname, persistent=False):
    """Extract entity labels from a dbpedia JSON dump.

    Each record is a one-entry mapping; only the single value is used.
    Its resource_label is stripped and any disambiguation suffix removed
    via regdisambiguation. Writes the set only when *persistent* is true.
    """
    entities = set()
    for jsn in read_file_iter(fname, jsn=True):
        # FIX: was `key, value = jsn.items()[0]` -- indexing items() is
        # Python 2-only (views are not subscriptable in Py3) and the
        # unpacked `key` was a dead variable, immediately overwritten.
        value = next(iter(jsn.values()))
        label = value[u'resource_label'].strip()
        m = regdisambiguation.match(label)
        entity = m.group(1) if m else label
        entities.add(entity.encode('utf-8'))
    print('dbpedia entities length: ', len(entities))
    if persistent:
        write_file('entities/dbpedia_entities.txt', entities)
    return entities
def bdbk_extract_entity(ifilename, persistent=False):
    """Extract entity names from an MDX-style bdbk dump.

    A line counts as an entity headword when it directly follows a '</>'
    record separator; '@@@LINK=' redirect lines contribute their target
    (the text after the 8-character prefix) instead. The sentinel '</>'
    as the initial previous-line value makes the very first line a
    headword too.
    """
    entities = set()
    prev = '</>'
    for line in read_file_iter(ifilename):
        if prev == '</>':
            entities.add(line)
        elif line.startswith('@@@LINK='):
            entities.add(line[8:])
        prev = line
    print('bdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/{}_entities.txt'.format(ifilename), entities)
    return entities
def load_dbpedia():
    """Collect dbpedia short abstracts as definitions, push them to ES,
    and return the {entity: {'definition': ...}} mapping.

    NOTE(review): another `load_dbpedia` exists later in this file; if
    both live in one module the later definition shadows this one.
    """
    definitions = {}
    source = DIR + 'merge_step_5_simplified.json'
    for record in read_file_iter(source, jsn=True):
        for _, value in record.items():
            label = value[u'resource_label']
            definitions[label] = {}
            if u'short_abstract' in value:
                definitions[label]['definition'] = value[u'short_abstract']
    send_definition_to_es(definitions, 'definition')
    return definitions
def load_merge_step5_wiki_simplified(dirname, fname):
    """Map resource labels to their resource_alias lists.

    When a label matches regdropbrackets, its bracket-stripped form
    (group 1) is registered with the same alias list as well. Records
    without 'resource_alias' contribute nothing.
    """
    aliases = {}
    path = os.path.join(dirname, fname)
    for record in read_file_iter(path, jsn=True):
        for _, value in record.iteritems():
            label = value[u'resource_label']
            if u'resource_alias' not in value:
                continue
            aliases[label] = value[u'resource_alias']
            stripped = regdropbrackets.match(label)
            if stripped:
                aliases[stripped.group(1)] = value[u'resource_alias']
    return aliases
def load_wikidata():
    """Load wikidata_zh_simplified.json into a dict keyed by entity name.

    Each entry holds {'aliases': [...]} (when present in the record) and
    'attributes' (the remaining record, with aliases popped out).

    NOTE: this function costs too much memory -- the whole dump is held
    in one dict.
    """
    data = {}
    for jsn in read_file_iter('wikidata_zh_simplified.json', jsn=True):
        m = regdisambiguation.match(jsn[u'chinese_label'])
        item = m.group(1) if m else jsn[u'chinese_label']
        entity = item.strip().encode('utf-8')
        data[entity] = {}
        if u'chinese_aliases' in jsn:
            data[entity]['aliases'] = jsn[u'chinese_aliases']
            jsn.pop(u'chinese_aliases')
        data[entity]['attributes'] = jsn
    # BUG FIX: the mapping was built but never returned, so callers
    # always received None (every sibling loader returns its mapping).
    return data
def wiki_extract_entity(fname, persistent=False):
    """Extract entity names (labels plus aliases) from a wiki JSON dump.

    Labels get their disambiguation suffix stripped via regdisambiguation;
    aliases are taken verbatim. Everything is UTF-8 encoded and stripped.
    Writes the set to disk only when *persistent* is true.
    """
    entities = set()
    for jsn in read_file_iter(fname, jsn=True):
        m = regdisambiguation.match(jsn[u'chinese_label'])
        item = m.group(1) if m else jsn[u'chinese_label']
        entities.add(item.encode('utf-8').strip())
        if u'chinese_aliases' in jsn:
            # FIX: was map(string.strip, map(lambda x: x.encode(...), ...));
            # the `string.strip` module function is long-deprecated and
            # removed in Python 3, and the double map obscured the intent.
            # A generator of str methods produces identical results.
            entities.update(
                alias.encode('utf-8').strip()
                for alias in jsn[u'chinese_aliases'])
    print('wiki entities length: ', len(entities))
    if persistent:
        write_file('entities/wiki_entities.txt', entities)
    return entities
def zgdbk_extract_entity(infilename, persistent=False):
    """Extract entity names from a zgdbk HTML dump.

    Lines whose start matches the span2/STYLE2 <span> pattern yield a
    candidate, which is then cleaned by regrmlabel and zgdbk_parse_entity;
    falsy results are discarded. Writes only when *persistent* is true.
    """
    span_pattern = re.compile('<span id="span2" class="STYLE2">(.+)</span')
    entities = set()
    for line in read_file_iter(infilename):
        hit = span_pattern.match(line)
        if not hit:
            continue
        # NOTE(review): regrmlabel is invoked as a callable here (not via
        # .sub/.match) -- confirm it is a function, not a compiled regex.
        cleaned = regrmlabel(hit.group(1))
        cleaned = zgdbk_parse_entity(cleaned)
        if cleaned:
            entities.add(cleaned.strip())
    print('zgdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/zgdbk_entities.txt', entities)
    return entities
def load_dbpedia():
    """Load merge_step_5_simplified.json into a per-entity dict.

    Each entity maps to {'definition': ..., 'aliases': ..., 'attributes': ...};
    definition and aliases are popped out of the record so 'attributes'
    holds only the remaining fields.
    """
    data = {}
    for line in read_file_iter('merge_step_5_simplified.json', jsn=True):
        for _, value in line.items():
            entity = value[u'resource_label'].encode('utf-8')
            data[entity] = {}
            if u'short_abstract' in value:
                data[entity]['definition'] = value[u'short_abstract']
                value.pop(u'short_abstract')
            if u'resource_alias' in value:
                data[entity]['aliases'] = value[u'resource_alias']
                value.pop(u'resource_alias')
            # Whatever remains in the record is kept as raw attributes.
            data[entity]['attributes'] = value
    # BUG FIX: the mapping was built but never returned, so callers
    # always received None (compare the other load_* helpers, which
    # return their result).
    return data
def begin_filter_with_lower(fname):
    """Return the set of lines in *fname*, lower-cased."""
    return {line.lower() for line in read_file_iter(fname)}
def read_jsn(self, data_dir):
    """Feed every JSON record found in the files under *data_dir* to
    self.parse."""
    for entry in os.listdir(data_dir):
        path = os.path.join(data_dir, entry)
        for record in libfile.read_file_iter(path, jsn=True):
            self.parse(record)
def load_zgdbk_info(dirname='.'):
    """Stream the records of zgdbk_entity_info.txt (JSON per line) straight
    into ES via send_definition_to_es with field=None."""
    path = os.path.join(dirname, 'zgdbk_entity_info.txt')
    send_definition_to_es(read_file_iter(path, jsn=True), field=None)