Code example #1
def begin_filter_with_chinese(fname):
    # Keep only lines whose decoded text starts with a Chinese character.
    entities = set()
    for line in read_file_iter(fname):
        if regchinese.match(line.decode('utf-8')):
            entities.add(line)

    write_file('entities_chinese_2_10.txt', entities)
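
All of the snippets in this listing lean on helpers from hzlib.libfile (read_file_iter, read_file, write_file) and on module-level compiled regexes (regchinese, regdropbrackets, regdisambiguation, regentityfilt) that the excerpts never define; module-level imports of os, json, re, codecs, lxml.html, and collections.Counter are likewise assumed. Below is a minimal Python 2 sketch of plausible definitions, inferred purely from how the names are used, to make the examples self-contained. The regex semantics are assumptions, not the project's actual patterns.

# -*- coding: utf-8 -*-
# Hypothetical definitions of the shared helpers and regexes; the real
# ones live in hzlib.libfile and at module scope in the original project.
import json
import re

# Assumed: a line counts as Chinese if it starts with a CJK ideograph.
regchinese = re.compile(u'^[\u4e00-\u9fff]')
# Assumed: capture the name before a trailing full-width bracketed
# qualifier, e.g. u'刘德华（歌手）' -> u'刘德华'.
regdropbrackets = re.compile(u'^(.+?)\uff08.+\uff09$')
# Assumed: strip a wiki-style disambiguation suffix such as u'苹果 (公司)'.
regdisambiguation = re.compile(u'^(.+?)\s*[(\uff08].+[)\uff09]$')
# Assumed: keep only entities made entirely of word/CJK characters.
regentityfilt = re.compile(u'^([\w\u4e00-\u9fff]+)$', re.U)


def read_file_iter(fname, jsn=False):
    # Yield stripped lines; decode each line as a JSON object when jsn=True.
    with open(fname) as fd:
        for line in fd:
            line = line.strip()
            if line:
                yield json.loads(line) if jsn else line


def read_file(fname):
    # Slurp the whole file as one byte string.
    with open(fname) as fd:
        return fd.read()


def write_file(fname, data, jsn=False):
    # Persist an iterable of byte strings one per line, or a JSON blob.
    with open(fname, 'w') as fd:
        if jsn:
            json.dump(data, fd)
        else:
            fd.write('\n'.join(data))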
Code example #2
def get_fudanperiod_entity(
        entity_dict=False):  # get fudaninc entities stored in periodCache
    from hzlib.libfile import write_file
    saved_dir = '/data/hproject/2016/fudaninc-20160825'
    entities = set()
    failed = 0
    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            try:
                data = json.load(fd)
            except ValueError:
                # Log the unparsable file and skip it; json.load has already
                # consumed the handle, so rewind before re-reading.
                failed += 1
                fd.seek(0)
                with open('failed.txt', 'a') as out:
                    out.write(fd.read() + '\n')
                print(failed)
                continue
            for entity, dic in data.iteritems():
                if entity_dict:
                    # Drop a trailing bracketed qualifier if present.
                    m = regdropbrackets.match(entity)
                    entities.add(m.group(1) if m else entity)
                else:
                    entities.add(entity)
    if entity_dict:
        write_file('fudankg_entities_dict.txt', list(entities))
    else:
        write_file('fudankg_entities.txt', list(entities))
Code example #3
def comic_song_extract_entity(fname, persistent=False):
    entities = set()

    for line in read_file_iter(fname):
        # Strip a trailing bracketed qualifier from the song/entity name.
        m = regdropbrackets.match(line.decode('utf-8'))
        entity = m.group(1).encode('utf-8') if m else line
        entities.add(entity)

    print('comic song entities length: ', len(entities))
    if persistent:
        write_file('entities/comic_song_entities.txt', entities)
    return entities
Code example #4
def wiki_title_entity(fname, persistent=False):
    entities = set()

    for line in read_file_iter(fname):
        m = regdisambiguation.match(line.strip().decode('utf-8'))
        item = m.group(1).encode('utf-8') if m else line.strip()
        if not item.startswith('\xee'):  # skip private-use glyphs (human-unreadable)
            entities.add(item.strip())

    print('wiki title entities length: ', len(entities))
    if persistent:
        write_file('entities/{}_title'.format(fname), entities)
    return entities
Code example #5
def dbpedia_extract_entity(fname, persistent=False):
    entities = set()

    for jsn in read_file_iter(fname, jsn=True):
        # Each line holds a single {resource: attributes} pair.
        _, value = jsn.items()[0]
        label = value[u'resource_label'].strip()

        m = regdisambiguation.match(label)
        entity = m.group(1) if m else label
        entities.add(entity.encode('utf-8'))

    print('dbpedia entities length: ', len(entities))
    if persistent:
        write_file('entities/dbpedia_entities.txt', entities)
    return entities
Code example #6
def bdbk_extract_entity(ifilename, persistent=False):
    entities = set()
    last_line = '</>'  # '</>' terminates each record in the dump

    for line in read_file_iter(ifilename):
        if last_line == '</>':
            # The first line after a record terminator is a headword.
            entities.add(line)
        elif line.startswith('@@@LINK='):
            # Alias record: take the target entry name after '@@@LINK='.
            entities.add(line[8:])
        last_line = line

    print('bdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/{}_entities.txt'.format(ifilename), entities)
    return entities
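
The branch structure above implies an MDX-style dictionary dump as input: '</>' terminates each record, the first line after a terminator is the headword, and '@@@LINK=target' marks an alias that redirects to another entry. A hypothetical fragment of the expected vbk2012.txt input (invented sample data):

刘德华
<html>...entry body...</html>
</>
華仔
@@@LINK=刘德华
</>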
Code example #7
def wiki_extract_entity(fname, persistent=False):
    entities = set()

    for jsn in read_file_iter(fname, jsn=True):
        m = regdisambiguation.match(jsn[u'chinese_label'])
        item = m.group(1) if m else jsn[u'chinese_label']
        entities.add(item.encode('utf-8').strip())
        if u'chinese_aliases' in jsn:
            entities.update(
                alias.encode('utf-8').strip()
                for alias in jsn[u'chinese_aliases'])

    print('wiki entities length: ', len(entities))
    if persistent:
        write_file('entities/wiki_entities.txt', entities)
    return entities
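
Each line consumed here is a standalone JSON object carrying a chinese_label and optional chinese_aliases. A hypothetical input line consistent with the code (invented values):

{"chinese_label": "苹果 (公司)", "chinese_aliases": ["蘋果公司", "苹果公司"]}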
Code example #8
def zgdbk_extract_entity(infilename, persistent=False):
    entities = set()
    re_entity = re.compile('<span id="span2" class="STYLE2">(.+)</span')

    for line in read_file_iter(infilename):
        m = re_entity.match(line)
        if m:
            # regrmlabel and zgdbk_parse_entity are project helpers (not
            # shown here) that clean markup labels out of the raw title.
            entity = regrmlabel(m.group(1))
            entity = zgdbk_parse_entity(entity)
            if entity:
                entities.add(entity.strip())

    print('zgdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/zgdbk_entities.txt', entities)
    return entities
Code example #9
def zgdbk_extract_info():
    pairs = []

    content = read_file('zgdbk.txt')
    # Grab (title span, body up to the closing table) pairs from the raw
    # HTML; '</SRIPT>' mirrors a typo in the source markup itself.
    infos = re.findall(
        '<span id="span2" class="STYLE2">(.+?)</span.+?<span id="span14".+?</SRIPT>(.+?)</table>',
        content, re.S)
    for i in infos:
        entity = zgdbk_parse_entity(i[0])
        if entity is None:
            continue

        # Cut the trailing '</span>' and flatten the markup to plain text.
        info = i[1][:i[1].rfind('</span>')]
        info = lxml.html.fromstring(info.decode('utf-8')).text_content()
        pairs.append(json.dumps({entity: info}))

    write_file('zgdbk_result.txt', pairs)
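
The lxml.html.fromstring(...).text_content() step flattens whatever markup survives the regex capture into plain text. A quick self-contained illustration of that call, with invented sample markup:

import lxml.html

fragment = lxml.html.fromstring(u'<td><b>长城</b>位于<a href="#">中国</a>北方。</td>')
print(fragment.text_content())  # prints: 长城位于中国北方。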
Code example #10
def information_exist_proportion():
    from hzlib.libfile import write_file
    saved_dir = '/data/crawler_file_cache/fudankg_saved'
    gcounter = Counter()
    entities_info_not_exist = set()

    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            for entity, dic in json.load(fd).iteritems():
                if u'Information' in dic:
                    gcounter['exist'] += 1
                else:
                    entities_info_not_exist.add(entity)
                gcounter['total'] += 1

    # Force float division; under Python 2 the plain '/' would truncate to 0.
    print(gcounter['exist'] / float(gcounter['total']))
    write_file('fudankg_entities_info_not_exist.txt',
               list(entities_info_not_exist))
Code example #11
def get_fudankg_entity(entity_dict=False):
    from hzlib.libfile import write_file
    saved_dir = '/data/crawler_file_cache/fudankg_saved'
    entities = set()

    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            for entity, dic in json.load(fd).iteritems():
                if entity_dict:
                    # Drop a trailing bracketed qualifier if present.
                    m = regdropbrackets.match(entity)
                    entities.add(m.group(1) if m else entity)
                else:
                    entities.add(entity)
    if entity_dict:
        write_file('fudankg_entities_dict.txt', list(entities))
    else:
        write_file('fudankg_entities.txt', list(entities))
Code example #12
def extract_bdbk_with_alias(ifilename):
    # Pass 1: emit one record per line; a headword starts a new line, and an
    # '@@@LINK=' alias record appends its target tab-separated to that line.
    with open('{}_entities.txt'.format(ifilename),
              'w') as wfd, codecs.open(ifilename) as rfd:
        last_line = '</>'

        for i, line in enumerate(rfd):
            line = line.strip()
            if last_line == '</>':
                if i == 0:
                    wfd.write(line)
                else:
                    wfd.write('\n' + line)
            elif line.startswith('@@@LINK='):
                wfd.write('\t' + line[8:])
            last_line = line

    # Pass 2: lines of the form 'alias<TAB>target' become an alias map.
    entity_alias = {}
    with open('{}_entities.txt'.format(ifilename)) as fd:
        for line in fd:
            if '\t' in line:
                alias, target = line.strip().split('\t', 1)
                entity_alias[alias] = target

    write_file('bdbk_entity_alias.json', entity_alias, jsn=True)
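
For reference, the intermediate '{}_entities.txt' file from pass 1 mixes plain headword lines with 'alias<TAB>target' lines, and only the latter feed entity_alias. A hypothetical two-line excerpt (invented data):

刘德华
華仔	刘德华

With that input, bdbk_entity_alias.json would end up containing {"華仔": "刘德华"}.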
Code example #13

if __name__ == '__main__':
    # Merge entities from every source and persist the raw union.
    entities = set()

    entities.update(zgdbk_extract_entity('zgdbk.txt', True))
    entities.update(bdbk_extract_entity('vbk2012.txt', True))
    entities.update(bdbk_extract_entity('vbk2012_ext.txt', True))
    entities.update(wiki_extract_entity('wikidata_zh_simplified.json', True))
    entities.update(
        dbpedia_extract_entity('merge_step_5_simplified.json', True))
    entities.update(
        wiki_title_entity('zhwiki-20160601-all-titles-in-ns2.txt', True))
    entities.update(comic_song_extract_entity('ertong.txt', True))

    write_file('entities/entities_0630_raw.txt', entities)
Code example #14
def begin_filter_with_lower(fname):
    # Lower-case each line (only affects ASCII letters in byte strings).
    entities = set()
    for line in read_file_iter(fname):
        entities.add(line.lower())
    return entities


def begin_filter_with_search(fname):
    entities = set()

    for line in read_file_iter(fname):
        m = regentityfilt.match(line.decode('utf-8'))
        if m:
            entities.add(m.group(1))

    return entities


if __name__ == '__main__':
    entities = set()
    #    entities.update( begin_filter_with_search('entities/first_order_entities.txt') )

    entities.update(
        begin_filter_with_search('entities/second_order_entities.txt'))
    entities.update(
        begin_filter_with_search('entities/third_order_entities.txt'))

    print('length of filtered entities', len(entities))
    write_file('entities_for_fudankg_search.txt', entities)
Code example #15
        setattr(load_db_data_bd, '_cache', (dbpedia, wikidata, bdbk))
    return load_db_data_bd._cache


def intersection_of3(dbpedia, wikidata, bdbk):
    # Memoize (three-way intersection, dbpedia&wikidata minus bdbk,
    # dbpedia&wikidata) on the function object, like the loader above.
    if not hasattr(intersection_of3, '_cache'):
        inter_temp = dbpedia.intersection(wikidata)
        inter3 = inter_temp.intersection(bdbk)
        setattr(intersection_of3, '_cache',
                (inter3, inter_temp - inter3, inter_temp))
    return intersection_of3._cache


def get_dbpedia_wikidata():
    dbpedia, wikidata, bdbk = load_db_data_bd()
    _, _, db_wiki_each = intersection_of3(dbpedia, wikidata, bdbk)
    return list(dbpedia - db_wiki_each) + list(wikidata - db_wiki_each)


def get_bdbk():
    dbpedia, wikidata, bdbk = load_db_data_bd()
    return list(bdbk.difference(dbpedia).difference(wikidata))


if __name__ == '__main__':
    first = run()
    write_file('entities/first_order_entities.txt', first)

    write_file('entities/second_order_entities.txt', get_dbpedia_wikidata())
    write_file('entities/third_order_entities.txt', get_bdbk())
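
Example #15 is cut off mid-function: the opening of load_db_data_bd is missing, and run() is not shown. A plausible reconstruction of the loader, assuming it reads the three entity sets persisted by the earlier extractors and memoizes them on the function object exactly as intersection_of3 does; the file names here are guesses, not the originals:

def load_db_data_bd():
    # Hypothetical reconstruction of the truncated loader: read the three
    # persisted entity files once and cache the sets on the function object.
    if not hasattr(load_db_data_bd, '_cache'):
        dbpedia = set(read_file_iter('entities/dbpedia_entities.txt'))
        wikidata = set(read_file_iter('entities/wiki_entities.txt'))
        bdbk = set(read_file_iter('entities/vbk2012.txt_entities.txt'))
        setattr(load_db_data_bd, '_cache', (dbpedia, wikidata, bdbk))
    return load_db_data_bd._cache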