def __getOtherNames():
    """Collect (head, tail) pairs for heads that never carry an ``@en`` tail.

    ``FB_movies_name`` is scanned twice: the first pass gathers every head
    that has at least one tail containing ``'@en'``; the second pass keeps
    the ``(head, tail)`` pairs of all other heads.

    Returns:
        list: ``(head, tail)`` tuples in file order.
    """
    # Pass 1: heads that own at least one '@en' tail.
    heads_with_english = {
        head
        for head, _, tail in tqdm(osUtils.readTriple(FB_movies_name))
        if '@en' in tail
    }
    # Pass 2: every remaining (head, tail) pair.
    return [
        (head, tail)
        for head, _, tail in tqdm(osUtils.readTriple(FB_movies_name))
        if '@en' not in tail and head not in heads_with_english
    ]
def getPairsCount(kg_file=e_movies_tsv, count_file=paris_count_tsv):
    """Write per-relation statistics, placing equal-count relations together.

    For every relation in ``kg_file`` this counts the triples and the number
    of distinct heads and tails. Relations are then greedily paired: each
    not-yet-written relation is matched with the first other unwritten
    relation that has the same triple count, and both lines are written one
    after the other. Relations left unpaired are written at the end.

    Each output line holds tab-separated fields: the relation, the literal
    ``count`` and the triple count, the literal ``h`` and the distinct-head
    count, the literal ``t`` and the distinct-tail count.

    Args:
        kg_file: Triple file passed to ``osUtils.readTriple``.
        count_file: Path of the statistics file to (over)write.
    """
    print('get pairs count')

    # Gather raw statistics: triple count plus the sets of heads and tails.
    raw_stats = {}
    for head, rel, tail in tqdm(osUtils.readTriple(kg_file)):
        entry = raw_stats.get(rel)
        if entry is None:
            entry = raw_stats[rel] = {'count': 0, 'h': set(), 't': set()}
        entry['count'] += 1
        entry['h'].add(head)
        entry['t'].add(tail)

    # Collapse the head/tail sets into their sizes.
    summary = {
        rel: {'count': entry['count'], 'h': len(entry['h']), 't': len(entry['t'])}
        for rel, entry in raw_stats.items()
    }

    def as_line(rel):
        info = summary[rel]
        return (rel + '\tcount\t' + str(info['count'])
                + '\th\t' + str(info['h'])
                + '\tt\t' + str(info['t']) + '\n')

    emitted = set()
    with open(count_file, 'w+', encoding='utf-8') as f:
        for first in summary:
            if first in emitted:
                continue
            for second in summary:
                if second in emitted or second == first:
                    continue
                if summary[first]['count'] == summary[second]['count']:
                    f.write(as_line(first))
                    f.write(as_line(second))
                    emitted.add(first)
                    emitted.add(second)
                    # `first` is now paired; no further partner can be taken.
                    break
        # Finally, the relations that found no partner.
        for rel in summary:
            if rel not in emitted:
                f.write(as_line(rel))
def __getDeletionMovies():
    """Return the triples of ``FB_movies_tsv`` whose relation is kept.

    A triple is dropped when its relation is contained in the collection
    returned by ``__getDeleteRelations()`` or equals
    ``film_regional_release_date.film_regional_debut_venue``.

    Returns:
        list: surviving ``(head, relation, tail)`` tuples in file order.
    """
    excluded = __getDeleteRelations()
    venue_relation = 'film_regional_release_date.film_regional_debut_venue'
    return [
        (head, rel, tail)
        for head, rel, tail in tqdm(osUtils.readTriple(FB_movies_tsv))
        if not (rel in excluded or rel == venue_relation)
    ]
def readGraphData(path=fp.Ml_100K.KG):
    """Load a triple file as an undecorated graph of integer node ids.

    The relation column is ignored; heads and tails are converted with
    ``int``.

    Args:
        path: Triple file passed to ``osUtils.readTriple``.

    Returns:
        tuple: ``(nodes, edges)`` where ``nodes`` is a list of the distinct
        node ids and ``edges`` is a list of the distinct ``(head, tail)``
        pairs.
    """
    # Progress message: "Reading graph data..."
    print('读取图数据...')
    nodes = set()
    edges = []
    for head, _, tail in tqdm(osUtils.readTriple(path)):
        src = int(head)
        dst = int(tail)
        nodes.add(src)
        nodes.add(dst)
        edges.append((src, dst))
    unique_edges = set(edges)
    return list(nodes), list(unique_edges)
def getKgfile():
    """Filter ``FB_movies`` down to the chosen relations and linked movies.

    A triple is written to ``Kg_tsv`` when its relation appears in the first
    column of ``ALL_relations_chose`` and its head or tail is one of the
    values stored in the ``Link_json`` mapping.
    """
    print('scan_kg')
    wanted_relations = {row[0] for row in osUtils.readFile(ALL_relations_chose)}
    linked_movies = set(osUtils.getJson(Link_json).values())
    with open(Kg_tsv, 'w+') as out:
        for head, rel, tail in tqdm(osUtils.readTriple(FB_movies)):
            touches_movie = head in linked_movies or tail in linked_movies
            if rel in wanted_relations and touches_movie:
                osUtils.writeTripleLine(head, rel, tail, out)
def readData(path):
    """Read a triple file, keeping the fields exactly as read.

    Args:
        path: Triple file passed to ``osUtils.readTriple``.

    Returns:
        tuple: ``(entities, relations, triples)`` — a list of the distinct
        heads/tails, a list of the distinct relations, and the list of
        ``[head, relation, tail]`` triples in file order.
    """
    entities = set()
    relations = set()
    triples = []
    for head, rel, tail in tqdm(osUtils.readTriple(path)):
        entities.add(head)
        entities.add(tail)
        relations.add(rel)
        triples.append([head, rel, tail])
    # Return the entity list, the relation list and the triple list.
    return list(entities), list(relations), triples
def __getOnlyEnNames():
    """Collect the first English name of every head in ``FB_movies_name``.

    A tail counts as English when it contains ``'@en'``; the name is the
    text captured by the pattern ``"(.*)"@en``. Only the first usable name
    per head is kept.

    A tail that contains ``'@en'`` but does not match the quoted-literal
    pattern is skipped instead of raising ``IndexError``. Its head is not
    marked as seen, so a later well-formed line can still supply the name.

    Returns:
        list: ``(head, name)`` tuples in file order.
    """
    seen_heads = set()
    en_names = []
    for head, _, tail in osUtils.readTriple(FB_movies_name):
        if '@en' not in tail or head in seen_heads:
            continue
        found = re.findall('"(.*)"@en', tail)
        if not found:
            # '@en' is present but not as a quoted literal; nothing to extract.
            continue
        en_names.append((head, found[0]))
        seen_heads.add(head)
    return en_names
def readKgData(path):
    """Read a knowledge-graph triple file and convert every field to ``int``.

    Args:
        path: Triple file passed to ``osUtils.readTriple``.

    Returns:
        tuple: ``(entities, relations, triples)`` — a list of the distinct
        entity ids, a list of the distinct relation ids, and the list of
        ``[head, relation, tail]`` integer triples in file order.
    """
    # Progress message: "Reading knowledge-graph triples..."
    print('读取知识图谱三元组...')
    entity_ids = set()
    relation_ids = set()
    triples = []
    for h, r, t in tqdm(osUtils.readTriple(path)):
        head = int(h)
        tail = int(t)
        rel = int(r)
        entity_ids.add(head)
        entity_ids.add(tail)
        relation_ids.add(rel)
        triples.append([head, rel, tail])
    # Return the entity list, the relation list and the triple list.
    return list(entity_ids), list(relation_ids), triples
def readKGData(path=fp.Ml_100K.KG):
    """Read a knowledge-graph triple file and convert every field to ``int``.

    Args:
        path: Triple file passed to ``ou.readTriple``.

    Returns:
        tuple: ``(entities, relations, triples)`` — a list of the distinct
        entity ids, a list of the distinct relation ids, and the list of
        ``[head, relation, tail]`` integer triples in file order.
    """
    # Progress message: "Reading knowledge-graph data..."
    print('读取知识图谱数据...')
    entity_ids, relation_ids = set(), set()
    triples = []
    for h, r, t in ou.readTriple(path):
        head = int(h)
        tail = int(t)
        rel = int(r)
        entity_ids.add(head)
        entity_ids.add(tail)
        relation_ids.add(rel)
        triples.append([head, rel, tail])
    return list(entity_ids), list(relation_ids), triples
def getTypes():
    """Derive a type for every entity from relation names and write them out.

    For each triple in ``e_movies_tsv`` the head is assigned the part of the
    relation before its last ``.`` and the tail is assigned the part after
    its first ``.``. A later triple overwrites an earlier assignment for the
    same entity. The result is written to ``e_types_tsv`` as tab-separated
    ``entity``/``type`` lines.

    Raises:
        IndexError: if a relation contains no ``.``.
    """
    print('getTypes')
    # Raw strings: '\.' in a plain literal is an invalid escape sequence.
    head_type_re = re.compile(r'(.*)\.')
    tail_type_re = re.compile(r'\.(.*)')

    entity_types = {}
    for head, rel, tail in tqdm(osUtils.readTriple(e_movies_tsv)):
        head_types = head_type_re.findall(rel)
        tail_types = tail_type_re.findall(rel)
        entity_types[head] = head_types[0]
        entity_types[tail] = tail_types[0]

    with open(e_types_tsv, 'w+', encoding='utf-8') as f:
        for entity, type_name in entity_types.items():
            f.write(entity + '\t' + type_name + '\n')
def readRecData(path=fp.Ml_100K.RATING, test_ratio=0.1):
    """Read (user, item, rating) triples and split them into train and test.

    All three fields are converted with ``int``. The test set is drawn with
    ``random.sample``; the training set is the set difference between all
    triples and the test triples, so it contains no duplicates.

    Args:
        path: Triple file passed to ``osUtils.readTriple``.
        test_ratio: Fraction of the triples sampled into the test set.

    Returns:
        tuple: ``(users, items, train_set, test_set)`` — lists of the
        distinct user ids and item ids, followed by the two lists of
        ``(user, item, rating)`` tuples.
    """
    # Progress message: "Reading user rating triples..."
    print( '读取用户评分三元组...' )
    users = set()
    items = set()
    ratings = []
    for u, i, r in tqdm(osUtils.readTriple(path)):
        user = int(u)
        item = int(i)
        users.add(user)
        items.add(item)
        ratings.append((user, item, int(r)))

    n_test = int(len(ratings) * test_ratio)
    test_set = random.sample(ratings, n_test)
    train_set = list(set(ratings) - set(test_set))
    return list(users), list(items), train_set, test_set
def dealKg():
    """Re-index the knowledge graph in ``Kg_tsv`` with contiguous integers.

    Heads and tails are first translated through the second mapping returned
    by ``getMid2EidDict()`` (ids missing from it are kept unchanged).
    Entities whose id string is numeric receive the lowest indices, followed
    by all other entities; relations are indexed separately. Within each
    group the order follows set iteration order.

    Side effects:
        * dumps the entity index map to ``Eid2index_json``;
        * dumps the relation index map to ``Rid2index_json``;
        * writes the re-indexed, tab-separated triples to ``Kg_index``.
    """
    # Only the second mapping is needed here.
    _, eid2midDict = getMid2EidDict()

    triples = []
    entities, relations = set(), set()
    for h, r, t in tqdm(osUtils.readTriple(Kg_tsv)):
        h = eid2midDict.get(h, h)
        t = eid2midDict.get(t, t)
        triples.append((h, r, t))
        entities.add(h)
        entities.add(t)
        relations.add(r)

    # Numeric ids come first so that they occupy the index range [0, k).
    numeric_entities = [e for e in entities if e.isnumeric()]
    other_entities = [e for e in entities if not e.isnumeric()]
    eid2eindex = {
        e: index for index, e in enumerate(numeric_entities + other_entities)
    }
    rid2rindex = {r: index for index, r in enumerate(relations)}

    osUtils.dumpJson(eid2eindex, Eid2index_json)
    osUtils.dumpJson(rid2rindex, Rid2index_json)

    with open(Kg_index, 'w+') as f:
        for h, r, t in triples:
            fields = (str(eid2eindex[h]), str(rid2rindex[r]), str(eid2eindex[t]))
            f.write('\t'.join(fields))
            f.write('\n')
def scan_entity_propertits():
    """Extract ``type.object.name`` lines for known entities from the dump.

    Every head and tail in ``FBMoives`` is collected first. The gzip file
    ``FBorginal`` is then streamed line by line; a line is kept when its
    second tab-separated field is the ``type.object.name`` predicate and its
    first field holds an ``/m.<id>`` identifier belonging to a collected
    entity. Kept lines are written to ``Names`` as tab-separated
    ``m.<id>``, ``name`` and the third field of the line.

    Raises:
        IndexError: if a line has fewer tab-separated fields than accessed.
    """
    allEntitys = set()
    for h, _, t in tqdm(osUtils.readTriple(FBMoives)):
        allEntitys.add(h)
        allEntitys.add(t)

    # Compiled once; raw strings avoid the invalid '\.' escape sequences.
    predicate_re = re.compile(
        r'<http://rdf\.freebase\.com/ns/type\.object\.(.*)>')
    head_re = re.compile(r'/m\.(.*)>')

    # Both handles are released even if the scan fails part-way through.
    with gzip.GzipFile(FBorginal, "r") as fb, \
            open(Names, 'w+', encoding='utf-8') as f:
        for line in tqdm(fb):
            fields = line.decode('utf-8').strip().split("\t")
            relation = predicate_re.findall(fields[1])
            if not relation or relation[0] != 'name':
                continue
            head = head_re.findall(fields[0])
            if not head:
                continue
            mid = 'm.' + head[0]
            if mid not in allEntitys:
                continue
            f.write(mid + '\t' + relation[0] + '\t' + fields[2] + '\n')
def scanEntitys():
    """Write the names and types of every entity that occurs in ``Kg_tsv``.

    Names are looked up in ``ALL_names`` and types in ``ALL_types`` (both
    read as two-column rows; a later row overrides an earlier one for the
    same key). For each head or tail found in ``Kg_tsv``, a tab-separated
    line is written to ``E_names`` and/or ``E_types`` when a non-empty value
    is available.
    """
    print('write_entitys')
    a_names = {h: n for h, n in osUtils.readFile(ALL_names)}
    a_types = {h: t for h, t in osUtils.readFile(ALL_types)}

    entitys = set()
    for h, _, t in tqdm(osUtils.readTriple(Kg_tsv)):
        entitys.add(h)
        entitys.add(t)

    # Context managers guarantee both outputs are flushed and closed.
    with open(E_names, 'w+', encoding='utf-8') as name_file, \
            open(E_types, 'w+', encoding='utf-8') as type_file:
        for e in tqdm(entitys):
            name = a_names.get(e)
            if name:
                name_file.write(e + '\t' + name + '\n')
            type_name = a_types.get(e)
            if type_name:
                type_file.write(e + '\t' + type_name + '\n')
def __processEntitysAndRelationSets(entitys, relations, path):
    """Add the entities and relations found in ``path`` to the given sets.

    Both collections are modified in place through their ``add`` method;
    nothing is returned.

    Args:
        entitys: Collection receiving every head and tail.
        relations: Collection receiving every relation.
        path: Triple file passed to ``osUtils.readTriple``.
    """
    for triple in osUtils.readTriple(path):
        head, rel, tail = triple
        entitys.add(head)
        entitys.add(tail)
        relations.add(rel)
def __generateTrainningDate(eid2index, rid2index, opath, path):
    """Rewrite the triples of ``opath`` as tab-separated index triples.

    Index values are passed through ``str`` before joining, so both string
    and integer indices are accepted. Output for string values is unchanged.

    Args:
        eid2index: Mapping from entity id to its index.
        rid2index: Mapping from relation id to its index.
        opath: Source triple file passed to ``osUtils.readTriple``.
        path: Destination file; it is overwritten.

    Raises:
        KeyError: if a head, relation or tail is missing from its mapping.
    """
    with open(path, 'w+') as f:
        for h, r, t in osUtils.readTriple(opath):
            fields = (str(eid2index[h]), str(rid2index[r]), str(eid2index[t]))
            f.write('\t'.join(fields) + '\n')