def get_lda_topics(args):
    '''
    @Desc: build a UserLDA model on one train/test split and print every
           topic whose leading weight is positive.
    @params[in] args: sequence, [set_level, train_prob, topic_num];
                      topic_num is converted with int().
    '''
    set_level = args[0]
    train_prob = args[1]
    topic_num = int(args[2])

    # Placeholders: set_level, type ('train' / 'test'), train_prob
    file_template = './song_dataset/user_dataset_%s_%s_%s'
    train_file = file_template % (set_level, 'train', train_prob)
    test_file = file_template % (set_level, 'test', train_prob)

    dataset = BaseDataSet()
    dataset.build_data(train_file, test_file)

    recommender = UserLDA()
    recommender.build_model(dataset.train_data, topic_num)

    for idx, distrib in enumerate(recommender.model.print_topics(1000)):
        # Presumably each entry looks like "weight*term + weight*term ...";
        # take the text before the first '*' of the first token as the
        # leading weight -- TODO confirm against the model's output format.
        dist0 = distrib.split()[0].split('*')[0]
        if float(dist0) > 0:
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is also valid Python 3 syntax.
            print("Topic#%s\t%s" % (idx, distrib))
class HybirdModel_UB(BaseModel):
    '''
    @Desc: user-based hybrid recommender. Mixes the per-user similarity
           lists held by self.userCF with those of self.userTag or
           self.userLda, then recommends songs owned by the most similar
           users.
    '''

    # Number of most-similar users kept per user by hybird_user_sim.
    MAX_SIMILAR_USERS = 400
    # Size of the candidate pool handed to reorder_withItemTag.
    REORDER_POOL_SIZE = 500

    def __init__(self):
        BaseModel.__init__(self)
        self.user_similarity = defaultdict(dict)
        self.userCF = UserCF()
        self.userTag = UserTagCF()
        self.userLda = UserLDA()

    def hybird_user_sim(self, user_songs, user_sim_file, hybird_sim_file,
                        hybird_type='tag', theta=0.5, mix_type=0):
        '''
        @Desc: rebuild self.user_similarity by mixing the CF similarity with
               a second similarity source.
        @params[in] user_songs: dict keyed by uid; only the keys are used
        @params[in] user_sim_file: passed to self.userCF.load_user_similarity
        @params[in] hybird_sim_file: passed to the second model's
                    load_user_similarity
        @params[in] hybird_type: 'tag' -> self.userTag, 'lda' -> self.userLda;
                    any other value leaves only the CF contribution
        @params[in] theta: weight of the CF similarity; the second source is
                    weighted by (1 - theta)
        @params[in] mix_type: falsy  -> cf*theta + other*(1-theta)
                              truthy -> (cf*theta + 1) * (1 + other*(1-theta))
        '''
        time_st = time.time()
        self.userCF.load_user_similarity(user_sim_file, norm=1)

        if hybird_type == 'tag':
            secondary = self.userTag
        elif hybird_type == 'lda':
            secondary = self.userLda
        else:
            secondary = None
        if secondary is not None:
            secondary.load_user_similarity(hybird_sim_file, norm=1)

        # Rebuild user_similarity matrix.
        # Original author's note: the multiplicative mix
        #   user_sim = user_tag_sim*theta*(1+user_lda_sim*(1-theta))
        # is "greater than" the additive mix
        #   user_sim = user_tag_sim*theta + user_lda_sim*(1-theta)
        for uid in user_songs.keys():
            candidate_user = defaultdict(float)
            for (vid, sim) in self.userCF.user_similarity[uid]:
                if mix_type:
                    candidate_user[vid] += sim * theta + 1
                else:
                    candidate_user[vid] += sim * theta
            if secondary is not None:
                for (vid, sim) in secondary.user_similarity[uid]:
                    if mix_type:
                        # A vid that had no CF entry starts at 0.0 and
                        # therefore stays 0.0 after the multiplication.
                        candidate_user[vid] *= (1 + sim * (1 - theta))
                    else:
                        candidate_user[vid] += sim * (1 - theta)
            # Sort similar users, best first, and keep the head of the list.
            sorted_sim_user = sorted(candidate_user.items(),
                                     key=lambda x: x[1], reverse=True)
            self.user_similarity[uid] = \
                sorted_sim_user[:self.MAX_SIMILAR_USERS]
        time_ed = time.time()
        logging.info('Rebuild user-similarity matrix cost:%s'
                     % (time_ed - time_st))

    def _collect_candidates(self, user_songs, uid, *neighbour_lists):
        '''Sum neighbour similarities per song that uid does not own yet.'''
        # Loop-invariant: uid's own songs are the same for every neighbour.
        own_songs = set(user_songs[uid])
        candidate_songs = defaultdict(float)
        for neighbours in neighbour_lists:
            for (vid, sim) in neighbours:
                for song in set(user_songs[vid]) - own_songs:
                    candidate_songs[song] += sim
        return candidate_songs

    def _rank(self, candidate_songs, limit):
        '''Return the `limit` highest-scored (song, score) pairs, best first.'''
        return sorted(candidate_songs.items(),
                      key=lambda x: x[1], reverse=True)[:limit]

    def recommend(self, user_songs, user_tags, item_tags, user_k, top_n,
                  reorder=0):
        '''
        @Desc: print "uid<TAB>json list of song ids" for every user in
               user_songs, using the similarity lists in self.user_similarity.
               Sets self.cost_time to the elapsed wall-clock time.
        @params[in] user_songs: dict, uid -> iterable of songs
        @params[in] user_tags: indexed by uid, passed to reorder_withItemTag
                    (only read when reorder is truthy)
        @params[in] item_tags: passed to reorder_withItemTag
        @params[in] user_k: number of most similar users to use
        @params[in] top_n: number of songs to output per user
        @params[in] reorder: truthy -> re-rank the best REORDER_POOL_SIZE
                    candidates with reorder_withItemTag before cutting to top_n
        '''
        time_st = time.time()
        for uid in user_songs.keys():
            # user_similarity defaults to a dict, which cannot be sliced; a
            # uid without a rebuilt list simply has no neighbours.
            top_k_users = (self.user_similarity.get(uid) or [])[:user_k]
            candidate_songs = self._collect_candidates(user_songs, uid,
                                                       top_k_users)
            if reorder:
                # Switch top_n*4 to 500/2015.3.8
                top_n_songs = self._rank(candidate_songs,
                                         self.REORDER_POOL_SIZE)
                top_n_songs = self.reorder_withItemTag(
                    user_tags[uid], item_tags, top_n_songs)[:top_n]
            else:
                top_n_songs = self._rank(candidate_songs, top_n)
            top_n_songs = [song[0] for song in top_n_songs]
            # Emit the top_n recommendation result (written to stdout;
            # presumably redirected to a file by the caller).
            print("%s\t%s" % (uid, json.dumps(top_n_songs)))
        time_ed = time.time()
        self.cost_time = time_ed - time_st

    def hybird_recommend_result(self, user_songs, user_k, top_n):
        '''
        @Desc: fill self.result[uid] with the top_n songs scored by the
               summed similarities of the user_k nearest LDA neighbours and
               the user_k nearest tag neighbours. Sets self.cost_time.
        @params[in] user_songs: dict, uid -> iterable of songs
        @params[in] user_k: neighbours taken from each similarity source
        @params[in] top_n: number of songs stored per user
        '''
        time_st = time.time()
        for uid in user_songs.keys():
            candidate_songs = self._collect_candidates(
                user_songs, uid,
                self.userLda.user_similarity[uid][:user_k],
                self.userTag.user_similarity[uid][:user_k])
            top_n_songs = self._rank(candidate_songs, top_n)
            self.result[uid] = [song[0] for song in top_n_songs]
        time_ed = time.time()
        self.cost_time = time_ed - time_st

    def hybird_result_withReorder(self, user_songs, user_tags, item_tags,
                                  user_k, top_n):
        '''
        @Desc: same candidate scoring as hybird_recommend_result, but the
               best REORDER_POOL_SIZE candidates are re-ranked with
               reorder_withItemTag before the top_n are stored in
               self.result[uid]. Sets self.cost_time.
        @params[in] user_songs: dict, uid -> iterable of songs
        @params[in] user_tags: indexed by uid, passed to reorder_withItemTag
        @params[in] item_tags: passed to reorder_withItemTag
        @params[in] user_k: neighbours taken from each similarity source
        @params[in] top_n: number of songs stored per user
        '''
        time_st = time.time()
        for uid in user_songs.keys():
            candidate_songs = self._collect_candidates(
                user_songs, uid,
                self.userLda.user_similarity[uid][:user_k],
                self.userTag.user_similarity[uid][:user_k])
            top_n_songs = self._rank(candidate_songs, self.REORDER_POOL_SIZE)
            top_n_songs = self.reorder_withItemTag(
                user_tags[uid], item_tags, top_n_songs)[:top_n]
            self.result[uid] = [song[0] for song in top_n_songs]
        time_ed = time.time()
        self.cost_time = time_ed - time_st

    def reorder_withItemTag(self, user_tag_distrib, items_tag_distrib,
                            top_n_songs):
        '''
        @Desc: re-rank candidates by multiplying each score with
               (1 + cosine similarity between the user's tag vector and the
               song's tag vector). Songs without shared tags, without a tag
               entry, or with a zero norm keep their original score.
        @params[in] user_tag_distrib: dict, {tag:freq}
        @params[in] items_tag_distrib: dict, {sid:{tag:freq}}
        @params[in] top_n_songs: [(sid,score),]
        @return: [(sid,new_score),] sorted by new_score, highest first
        '''
        user_norm = sum([freq ** 2 for freq in user_tag_distrib.values()])
        user_tags = set(user_tag_distrib.keys())
        user_song_match = {}
        for sid in set([song[0] for song in top_n_songs]):
            # .get avoids a KeyError for songs that have no tag entry.
            song_tags = items_tag_distrib.get(sid)
            if not song_tags:
                continue
            inter_tag = user_tags & set(song_tags.keys())
            if len(inter_tag) == 0:
                continue
            song_norm = sum([freq ** 2 for freq in song_tags.values()])
            denominator = (user_norm * song_norm) ** 0.5
            if denominator == 0:
                # All-zero frequencies: cosine is undefined, treat as no match.
                continue
            dot = 0.0
            for tag in inter_tag:
                dot += song_tags[tag] * user_tag_distrib[tag]
            user_song_match[sid] = dot / denominator
        n_top_n_songs = sorted(
            [(song[0], song[1] * (1 + user_song_match.get(song[0], 0.0)))
             for song in top_n_songs],
            key=lambda x: x[1], reverse=True)
        return n_top_n_songs
def __init__(self):
    '''
    @Desc: run the BaseModel initialiser, then create an empty
           user-similarity table and one instance of each component model.
    '''
    BaseModel.__init__(self)
    # Component models, constructed in the same order as before.
    self.userCF, self.userTag, self.userLda = UserCF(), UserTagCF(), UserLDA()
    # Looking up a missing key yields (and stores) an empty dict.
    self.user_similarity = defaultdict(dict)
class HybirdModel_UB(BaseModel):
    '''
    @Desc: user-based hybrid recommender. Mixes the per-user similarity
           lists held by self.userCF with those of self.userTag or
           self.userLda, then recommends songs owned by the most similar
           users.
    '''

    # Number of most-similar users kept per user by hybird_user_sim.
    MAX_SIMILAR_USERS = 400
    # Size of the candidate pool handed to reorder_withItemTag.
    REORDER_POOL_SIZE = 500

    def __init__(self):
        BaseModel.__init__(self)
        self.user_similarity = defaultdict(dict)
        self.userCF = UserCF()
        self.userTag = UserTagCF()
        self.userLda = UserLDA()

    def hybird_user_sim(self, user_songs, user_sim_file, hybird_sim_file,
                        hybird_type='tag', theta=0.5, mix_type=0):
        '''
        @Desc: rebuild self.user_similarity by mixing the CF similarity with
               a second similarity source.
        @params[in] user_songs: dict keyed by uid; only the keys are used
        @params[in] user_sim_file: passed to self.userCF.load_user_similarity
        @params[in] hybird_sim_file: passed to the second model's
                    load_user_similarity
        @params[in] hybird_type: 'tag' -> self.userTag, 'lda' -> self.userLda;
                    any other value leaves only the CF contribution
        @params[in] theta: weight of the CF similarity; the second source is
                    weighted by (1 - theta)
        @params[in] mix_type: falsy  -> cf*theta + other*(1-theta)
                              truthy -> (cf*theta + 1) * (1 + other*(1-theta))
        '''
        time_st = time.time()
        self.userCF.load_user_similarity(user_sim_file, norm=1)

        if hybird_type == 'tag':
            secondary = self.userTag
        elif hybird_type == 'lda':
            secondary = self.userLda
        else:
            secondary = None
        if secondary is not None:
            secondary.load_user_similarity(hybird_sim_file, norm=1)

        # Rebuild user_similarity matrix.
        # Original author's note: the multiplicative mix
        #   user_sim = user_tag_sim*theta*(1+user_lda_sim*(1-theta))
        # is "greater than" the additive mix
        #   user_sim = user_tag_sim*theta + user_lda_sim*(1-theta)
        for uid in user_songs.keys():
            candidate_user = defaultdict(float)
            for (vid, sim) in self.userCF.user_similarity[uid]:
                if mix_type:
                    candidate_user[vid] += sim * theta + 1
                else:
                    candidate_user[vid] += sim * theta
            if secondary is not None:
                for (vid, sim) in secondary.user_similarity[uid]:
                    if mix_type:
                        # A vid that had no CF entry starts at 0.0 and
                        # therefore stays 0.0 after the multiplication.
                        candidate_user[vid] *= (1 + sim * (1 - theta))
                    else:
                        candidate_user[vid] += sim * (1 - theta)
            # Sort similar users, best first, and keep the head of the list.
            sorted_sim_user = sorted(candidate_user.items(),
                                     key=lambda x: x[1], reverse=True)
            self.user_similarity[uid] = \
                sorted_sim_user[:self.MAX_SIMILAR_USERS]
        time_ed = time.time()
        logging.info('Rebuild user-similarity matrix cost:%s'
                     % (time_ed - time_st))

    def _collect_candidates(self, user_songs, uid, *neighbour_lists):
        '''Sum neighbour similarities per song that uid does not own yet.'''
        # Loop-invariant: uid's own songs are the same for every neighbour.
        own_songs = set(user_songs[uid])
        candidate_songs = defaultdict(float)
        for neighbours in neighbour_lists:
            for (vid, sim) in neighbours:
                for song in set(user_songs[vid]) - own_songs:
                    candidate_songs[song] += sim
        return candidate_songs

    def _rank(self, candidate_songs, limit):
        '''Return the `limit` highest-scored (song, score) pairs, best first.'''
        return sorted(candidate_songs.items(),
                      key=lambda x: x[1], reverse=True)[:limit]

    def recommend(self, user_songs, user_tags, item_tags, user_k, top_n,
                  reorder=0):
        '''
        @Desc: print "uid<TAB>json list of song ids" for every user in
               user_songs, using the similarity lists in self.user_similarity.
               Sets self.cost_time to the elapsed wall-clock time.
        @params[in] user_songs: dict, uid -> iterable of songs
        @params[in] user_tags: indexed by uid, passed to reorder_withItemTag
                    (only read when reorder is truthy)
        @params[in] item_tags: passed to reorder_withItemTag
        @params[in] user_k: number of most similar users to use
        @params[in] top_n: number of songs to output per user
        @params[in] reorder: truthy -> re-rank the best REORDER_POOL_SIZE
                    candidates with reorder_withItemTag before cutting to top_n
        '''
        time_st = time.time()
        for uid in user_songs.keys():
            # user_similarity defaults to a dict, which cannot be sliced; a
            # uid without a rebuilt list simply has no neighbours.
            top_k_users = (self.user_similarity.get(uid) or [])[:user_k]
            candidate_songs = self._collect_candidates(user_songs, uid,
                                                       top_k_users)
            if reorder:
                # Switch top_n*4 to 500/2015.3.8
                top_n_songs = self._rank(candidate_songs,
                                         self.REORDER_POOL_SIZE)
                top_n_songs = self.reorder_withItemTag(
                    user_tags[uid], item_tags, top_n_songs)[:top_n]
            else:
                top_n_songs = self._rank(candidate_songs, top_n)
            top_n_songs = [song[0] for song in top_n_songs]
            # Emit the top_n recommendation result (written to stdout;
            # presumably redirected to a file by the caller).
            print("%s\t%s" % (uid, json.dumps(top_n_songs)))
        time_ed = time.time()
        self.cost_time = time_ed - time_st

    def hybird_recommend_result(self, user_songs, user_k, top_n):
        '''
        @Desc: fill self.result[uid] with the top_n songs scored by the
               summed similarities of the user_k nearest LDA neighbours and
               the user_k nearest tag neighbours. Sets self.cost_time.
        @params[in] user_songs: dict, uid -> iterable of songs
        @params[in] user_k: neighbours taken from each similarity source
        @params[in] top_n: number of songs stored per user
        '''
        time_st = time.time()
        for uid in user_songs.keys():
            candidate_songs = self._collect_candidates(
                user_songs, uid,
                self.userLda.user_similarity[uid][:user_k],
                self.userTag.user_similarity[uid][:user_k])
            top_n_songs = self._rank(candidate_songs, top_n)
            self.result[uid] = [song[0] for song in top_n_songs]
        time_ed = time.time()
        self.cost_time = time_ed - time_st

    def hybird_result_withReorder(self, user_songs, user_tags, item_tags,
                                  user_k, top_n):
        '''
        @Desc: same candidate scoring as hybird_recommend_result, but the
               best REORDER_POOL_SIZE candidates are re-ranked with
               reorder_withItemTag before the top_n are stored in
               self.result[uid]. Sets self.cost_time.
        @params[in] user_songs: dict, uid -> iterable of songs
        @params[in] user_tags: indexed by uid, passed to reorder_withItemTag
        @params[in] item_tags: passed to reorder_withItemTag
        @params[in] user_k: neighbours taken from each similarity source
        @params[in] top_n: number of songs stored per user
        '''
        time_st = time.time()
        for uid in user_songs.keys():
            candidate_songs = self._collect_candidates(
                user_songs, uid,
                self.userLda.user_similarity[uid][:user_k],
                self.userTag.user_similarity[uid][:user_k])
            top_n_songs = self._rank(candidate_songs, self.REORDER_POOL_SIZE)
            top_n_songs = self.reorder_withItemTag(
                user_tags[uid], item_tags, top_n_songs)[:top_n]
            self.result[uid] = [song[0] for song in top_n_songs]
        time_ed = time.time()
        self.cost_time = time_ed - time_st

    def reorder_withItemTag(self, user_tag_distrib, items_tag_distrib,
                            top_n_songs):
        '''
        @Desc: re-rank candidates by multiplying each score with
               (1 + cosine similarity between the user's tag vector and the
               song's tag vector). Songs without shared tags, without a tag
               entry, or with a zero norm keep their original score.
        @params[in] user_tag_distrib: dict, {tag:freq}
        @params[in] items_tag_distrib: dict, {sid:{tag:freq}}
        @params[in] top_n_songs: [(sid,score),]
        @return: [(sid,new_score),] sorted by new_score, highest first
        '''
        user_norm = sum([freq ** 2 for freq in user_tag_distrib.values()])
        user_tags = set(user_tag_distrib.keys())
        user_song_match = {}
        for sid in set([song[0] for song in top_n_songs]):
            # .get avoids a KeyError for songs that have no tag entry.
            song_tags = items_tag_distrib.get(sid)
            if not song_tags:
                continue
            inter_tag = user_tags & set(song_tags.keys())
            if len(inter_tag) == 0:
                continue
            song_norm = sum([freq ** 2 for freq in song_tags.values()])
            denominator = (user_norm * song_norm) ** 0.5
            if denominator == 0:
                # All-zero frequencies: cosine is undefined, treat as no match.
                continue
            dot = 0.0
            for tag in inter_tag:
                dot += song_tags[tag] * user_tag_distrib[tag]
            user_song_match[sid] = dot / denominator
        n_top_n_songs = sorted(
            [(song[0], song[1] * (1 + user_song_match.get(song[0], 0.0)))
             for song in top_n_songs],
            key=lambda x: x[1], reverse=True)
        return n_top_n_songs