Esempio n. 1
0
def __getOtherNames():
    """Collect (head, tail) pairs for entities that have no '@en' tail.

    The triples of ``FB_movies_name`` are read twice through
    ``osUtils.readTriple``: the first pass records every head that has at
    least one tail containing '@en'; the second pass keeps the rows whose
    tail lacks '@en' and whose head was not recorded in the first pass.

    Returns:
        A list of ``(h, t)`` tuples in the order the triples were read.
    """
    heads_with_en = {
        head
        for head, _rel, tail in tqdm(osUtils.readTriple(FB_movies_name))
        if '@en' in tail
    }

    return [
        (head, tail)
        for head, _rel, tail in tqdm(osUtils.readTriple(FB_movies_name))
        if '@en' not in tail and head not in heads_with_en
    ]
Esempio n. 2
0
def getPairsCount(kg_file=e_movies_tsv,count_file=paris_count_tsv):
    """Write per-relation statistics, listing equal-count relations as pairs.

    For every relation in ``kg_file`` the number of triples, the number of
    distinct heads and the number of distinct tails are computed. The
    statistics are written to ``count_file`` one relation per line as
    ``<relation>\\tcount\\t<n>\\th\\t<heads>\\tt\\t<tails>``.

    Relations are first matched greedily: each not-yet-written relation is
    paired with the first other not-yet-written relation that has the same
    triple count, and the two lines are written next to each other. A
    relation joins at most one pair. All remaining relations are written
    afterwards.
    """
    print('get pairs count')

    def stats_line(name, info):
        # One tab-separated output row for a single relation.
        return '\t'.join(
            (name, 'count', str(info['count']),
             'h', str(info['h']), 't', str(info['t']))
        ) + '\n'

    # First pass: triple count plus the distinct heads/tails per relation.
    raw_stats = {}
    for head, rel, tail in tqdm(osUtils.readTriple(kg_file)):
        entry = raw_stats.get(rel)
        if entry is None:
            entry = {'count': 0, 'h': set(), 't': set()}
            raw_stats[rel] = entry
        entry['count'] += 1
        entry['h'].add(head)
        entry['t'].add(tail)

    # Collapse the head/tail sets to their sizes.
    summary = {
        rel: {'count': entry['count'],
              'h': len(entry['h']),
              't': len(entry['t'])}
        for rel, entry in raw_stats.items()
    }

    emitted = set()
    with open(count_file, 'w+', encoding='utf-8') as out:
        for first in summary:
            if first in emitted:
                continue
            for second in summary:
                if second == first or second in emitted:
                    continue
                if summary[first]['count'] != summary[second]['count']:
                    continue
                out.write(stats_line(first, summary[first]))
                out.write(stats_line(second, summary[second]))
                emitted.add(first)
                emitted.add(second)
                # `first` is now written, so no further partner may be taken.
                break

        # Relations that found no equal-count partner.
        for rel in summary:
            if rel not in emitted:
                out.write(stats_line(rel, summary[rel]))
Esempio n. 3
0
def __getDeletionMovies():
    """Return the triples of ``FB_movies_tsv`` whose relation is kept.

    A triple is dropped when its relation is contained in the collection
    returned by ``__getDeleteRelations()`` or equals the hard-coded
    ``film_regional_release_date.film_regional_debut_venue`` relation.

    Returns:
        A list of ``(h, r, t)`` tuples in the order they were read.
    """
    excluded = __getDeleteRelations()
    always_skipped = 'film_regional_release_date.film_regional_debut_venue'
    return [
        (h, r, t)
        for h, r, t in tqdm(osUtils.readTriple(FB_movies_tsv))
        if not (r in excluded or r == always_skipped)
    ]
Esempio n. 4
0
def readGraphData( path = fp.Ml_100K.KG ):
    """Load a graph as a node list and a de-duplicated edge list.

    Each triple yielded by ``osUtils.readTriple(path)`` contributes its first
    and third fields, converted with ``int``; the middle field is ignored.

    Returns:
        ``(nodes, edges)`` where ``nodes`` is a list of the distinct integer
        ids and ``edges`` is a list of the distinct ``(head, tail)`` tuples.
    """
    print('读取图数据...')
    nodes = set()
    edges = []
    for head, _, tail in tqdm(osUtils.readTriple(path)):
        src, dst = int(head), int(tail)
        nodes.add(src)
        nodes.add(dst)
        edges.append((src, dst))
    # Going through a set removes repeated (head, tail) pairs.
    unique_edges = set(edges)
    return list(nodes), list(unique_edges)
Esempio n. 5
0
def getKgfile():
    """Filter ``FB_movies`` down to the chosen relations and linked movies.

    A triple is written to ``Kg_tsv`` (via ``osUtils.writeTripleLine``) when
    its relation is among the first fields of the rows read from
    ``ALL_relations_chose`` and its head or tail is one of the values of the
    JSON mapping loaded from ``Link_json``.
    """
    print('scan_kg')
    chosen_relations = set()
    for row in osUtils.readFile(ALL_relations_chose):
        chosen_relations.add(row[0])

    linked_movies = set(osUtils.getJson(Link_json).values())
    with open(Kg_tsv, 'w+') as out:
        for head, rel, tail in tqdm(osUtils.readTriple(FB_movies)):
            if rel not in chosen_relations:
                continue
            if head not in linked_movies and tail not in linked_movies:
                continue
            osUtils.writeTripleLine(head, rel, tail, out)
Esempio n. 6
0
def readData(path):
    """Read triples and return the entities, relations and triples found.

    Values are kept exactly as ``osUtils.readTriple(path)`` yields them; no
    type conversion is applied.

    Returns:
        ``(entities, relations, triples)``: a list of distinct heads/tails,
        a list of distinct relations, and a list of ``[h, r, t]`` lists in
        read order.
    """
    triples = [[h, r, t] for h, r, t in tqdm(osUtils.readTriple(path))]

    entities, relations = set(), set()
    for h, r, t in triples:
        entities.add(h)
        entities.add(t)
        relations.add(r)

    # Return the entity list, the relation list, and the triple list.
    return list(entities), list(relations), triples
Esempio n. 7
0
def __getOnlyEnNames():
    """Return the first English name of every entity in ``FB_movies_name``.

    A tail counts as an English name when it contains a quoted literal of
    the form ``"<name>"@en``. Only the first such row per head is used.

    Returns:
        A list of ``(h, name)`` tuples in the order they were read, where
        ``name`` is the text between the quotes.
    """
    en_literal = re.compile(r'"(.*)"@en')
    seen_heads = set()
    en_names = []
    for h, _r, t in osUtils.readTriple(FB_movies_name):
        if '@en' not in t or h in seen_heads:
            continue
        match = en_literal.search(t)
        if match is None:
            # '@en' occurs in the tail but not as a quoted English literal
            # (for example inside the text of another language's name).
            # Indexing the empty findall() result used to raise IndexError
            # here; such rows are not English names, so skip them.
            continue
        en_names.append((h, match.group(1)))
        seen_heads.add(h)
    return en_names
Esempio n. 8
0
def readKgData(path):
    """Read integer knowledge-graph triples from ``path``.

    Every field of each triple yielded by ``osUtils.readTriple(path)`` is
    converted with ``int``.

    Returns:
        ``(entities, relations, triples)``: a list of distinct head/tail
        ids, a list of distinct relation ids, and a list of ``[h, r, t]``
        integer lists in read order.
    """
    print('读取知识图谱三元组...')
    entities, relations = set(), set()
    triples = []
    for raw_h, raw_r, raw_t in tqdm(osUtils.readTriple(path)):
        head, tail, rel = int(raw_h), int(raw_t), int(raw_r)
        entities.add(head)
        entities.add(tail)
        relations.add(rel)
        triples.append([head, rel, tail])
    # Return the entity list, the relation list, and the triple list.
    return list(entities), list(relations), triples
Esempio n. 9
0
def readKGData( path = fp.Ml_100K.KG ):
    """Load integer knowledge-graph triples from ``path``.

    Each field of every triple yielded by ``ou.readTriple(path)`` is
    converted with ``int``.

    Returns:
        ``(entities, relations, triples)``: the distinct head/tail ids as a
        list, the distinct relation ids as a list, and all triples as
        ``[h, r, t]`` integer lists in read order.
    """
    print('读取知识图谱数据...')
    seen_entities = set()
    seen_relations = set()
    converted = []
    for fields in ou.readTriple(path):
        raw_h, raw_r, raw_t = fields
        head = int(raw_h)
        tail = int(raw_t)
        rel = int(raw_r)
        seen_entities.add(head)
        seen_entities.add(tail)
        seen_relations.add(rel)
        converted.append([head, rel, tail])
    return list(seen_entities), list(seen_relations), converted
Esempio n. 10
0
def getTypes():
    """Derive a type for every entity from relation names and write them out.

    For each triple in ``e_movies_tsv`` the relation name is split on dots:
    the head is assigned the text before the last dot, and the tail is
    assigned the text after the first dot. When an entity occurs in several
    triples, the assignment from the last one read wins. The result is
    written to ``e_types_tsv`` as ``<entity>\\t<type>`` lines.

    Raises:
        IndexError: if a relation name contains no dot.
    """
    print('getTypes')
    # Raw strings: '\.' in a normal string literal is an invalid escape
    # sequence. Compiled once rather than on every triple.
    head_type_re = re.compile(r'(.*)\.')
    tail_type_re = re.compile(r'\.(.*)')

    entitys = dict()
    for h, r, t in tqdm(osUtils.readTriple(e_movies_tsv)):
        head_types = head_type_re.findall(r)
        tail_types = tail_type_re.findall(r)
        entitys[h] = head_types[0]
        entitys[t] = tail_types[0]

    with open(e_types_tsv, 'w+', encoding='utf-8') as f:
        for e, e_type in entitys.items():
            f.write(e + '\t' + e_type + '\n')
Esempio n. 11
0
def readRecData( path = fp.Ml_100K.RATING, test_ratio = 0.1 ):
    """Read (user, item, rating) triples and split them into train and test.

    All three fields of each triple from ``osUtils.readTriple(path)`` are
    converted with ``int``. ``int(len(triples) * test_ratio)`` triples are
    drawn with ``random.sample`` as the test set. The training set is the
    set difference between all triples and the test set, so it contains no
    duplicates and its order is not the read order.

    Returns:
        ``(users, items, train_set, test_set)``: lists of distinct user ids
        and item ids, followed by the two lists of ``(u, i, r)`` tuples.
    """
    print( '读取用户评分三元组...' )
    ratings = [
        (int(u), int(i), int(r))
        for u, i, r in tqdm(osUtils.readTriple(path))
    ]

    users, items = set(), set()
    for user, item, _rating in ratings:
        users.add(user)
        items.add(item)

    test_size = int(len(ratings) * test_ratio)
    test_set = random.sample(ratings, test_size)
    train_set = list(set(ratings) - set(test_set))
    # Return the user list, the item list, and the train/test rating triples.
    return list(users), list(items), train_set, test_set
Esempio n. 12
0
def dealKg():
    """Re-index the knowledge graph in ``Kg_tsv`` with consecutive integers.

    Heads and tails are first translated through the second mapping returned
    by ``getMid2EidDict()`` (ids missing from it are kept unchanged).
    Entities whose id string is numeric receive the lowest indexes, all other
    entities follow; relations are numbered separately from zero. Both index
    maps are saved with ``osUtils.dumpJson`` (``Eid2index_json``,
    ``Rid2index_json``) and the re-indexed triples are written to
    ``Kg_index`` as tab-separated lines.
    """
    _, eid2mid = getMid2EidDict()

    entities, relations = set(), set()
    triples = []
    for h, r, t in tqdm(osUtils.readTriple(Kg_tsv)):
        h = eid2mid.get(h, h)
        t = eid2mid.get(t, t)
        triples.append((h, r, t))
        entities.add(h)
        entities.add(t)
        relations.add(r)

    # Numeric ids are numbered first so they occupy the range [0, count).
    numeric_ids = [e for e in entities if e.isnumeric()]
    other_ids = [e for e in entities if not e.isnumeric()]
    eid2eindex = {e: idx for idx, e in enumerate(numeric_ids + other_ids)}
    rid2rindex = {r: idx for idx, r in enumerate(relations)}

    osUtils.dumpJson(eid2eindex, Eid2index_json)
    osUtils.dumpJson(rid2rindex, Rid2index_json)

    with open(Kg_index, 'w+') as out:
        for h, r, t in triples:
            fields = (str(eid2eindex[h]), str(rid2rindex[r]), str(eid2eindex[t]))
            out.write('\t'.join(fields) + '\n')
Esempio n. 13
0
def scan_entity_propertits():
    """Extract the ``type.object.name`` rows of known entities from the dump.

    The entities are the heads and tails of the triples in ``FBMoives``.
    The gzip file ``FBorginal`` is read line by line; each line is decoded as
    UTF-8 and split on tabs. A line is kept when its second field is the
    Freebase ``type.object.name`` predicate and its first field contains an
    ``/m.<id>>`` identifier whose ``m.<id>`` form is a known entity. Kept
    lines are written to ``Names`` as ``<head>\\tname\\t<third field>``.
    """
    # Raw strings avoid the invalid '\.' escape sequence; compiling once
    # keeps the work out of the per-line loop.
    relation_re = re.compile(
        r'<http://rdf\.freebase\.com/ns/type\.object\.(.*)>')
    head_re = re.compile(r'/m\.(.*)>')

    allEntitys = set()
    for h, r, t in tqdm(osUtils.readTriple(FBMoives)):
        allEntitys.add(h)
        allEntitys.add(t)

    # Both handles are managed by `with` so the gzip stream is closed even
    # if decoding or parsing a line raises.
    with gzip.GzipFile(FBorginal, "r") as fb, \
            open(Names, 'w+', encoding='utf-8') as f:
        for line in tqdm(fb):
            lines = line.decode('utf-8').strip().split("\t")
            relation_match = relation_re.search(lines[1])
            if relation_match is None:
                continue
            relation = relation_match.group(1)
            if relation != 'name':
                continue
            head_match = head_re.search(lines[0])
            if head_match is None:
                continue
            head = 'm.' + head_match.group(1)
            if head not in allEntitys:
                continue
            f.write(head + '\t' + relation + '\t' + lines[2] + '\n')
Esempio n. 14
0
def scanEntitys():
    """Write the names and types of every entity that occurs in ``Kg_tsv``.

    Names are looked up in the two-column rows of ``ALL_names`` and types in
    those of ``ALL_types``. For each head or tail found in ``Kg_tsv``, a
    ``<entity>\\t<name>`` line goes to ``E_names`` and a
    ``<entity>\\t<type>`` line goes to ``E_types``; entities with a missing
    or empty name/type are left out of the respective file.
    """
    print('write_entitys')
    a_names = {h: n for h, n in osUtils.readFile(ALL_names)}
    a_types = {h: t for h, t in osUtils.readFile(ALL_types)}

    entitys = set()
    for h, r, t in tqdm(osUtils.readTriple(Kg_tsv)):
        entitys.add(h)
        entitys.add(t)

    # The output files were previously opened and never closed, so buffered
    # writes were not guaranteed to reach disk; `with` closes both.
    with open(E_names, 'w+', encoding='utf-8') as name_file, \
            open(E_types, 'w+', encoding='utf-8') as type_file:
        for e in tqdm(entitys):
            name = a_names.get(e)
            if name:
                name_file.write(e + '\t' + name + '\n')
            # Named `entity_type` so the builtin `type` is not shadowed.
            entity_type = a_types.get(e)
            if entity_type:
                type_file.write(e + '\t' + entity_type + '\n')
Esempio n. 15
0
def __processEntitysAndRelationSets(entitys, relations, path):
    """Add the entities and relations found in ``path`` to the given sets.

    Args:
        entitys: collection (with ``add``) that receives every head and tail.
        relations: collection (with ``add``) that receives every relation.
        path: passed to ``osUtils.readTriple`` to obtain the triples.

    Both collections are modified in place; nothing is returned.
    """
    for triple in osUtils.readTriple(path):
        head, rel, tail = triple
        entitys.add(head)
        entitys.add(tail)
        relations.add(rel)
Esempio n. 16
0
def __generateTrainningDate(eid2index, rid2index, opath, path):
    """Write the triples of ``opath`` to ``path`` with ids replaced by indexes.

    Args:
        eid2index: mapping from entity id to its index.
        rid2index: mapping from relation id to its index.
        opath: passed to ``osUtils.readTriple`` to obtain the source triples.
        path: output file; one ``<head>\\t<relation>\\t<tail>`` line per triple.

    Raises:
        KeyError: if a head, relation or tail is missing from its mapping.
    """
    with open(path, 'w+') as f:
        for h, r, t in osUtils.readTriple(opath):
            # str() makes integer index values writable; concatenating them
            # directly with '\t' raised TypeError. String values are
            # unaffected.
            fields = (str(eid2index[h]), str(rid2index[r]), str(eid2index[t]))
            f.write('\t'.join(fields) + '\n')