Exemple #1
0
def get_lda_topics(args):
	'''
	@Desc: build a dataset from one train/test file pair, fit a UserLDA model
		on the training data and print the topics whose leading weight is positive
	@params[in] args: sequence, (set_level, train_prob, topic_num);
		set_level and train_prob are only substituted into the file names,
		topic_num is converted with int()
	'''
	set_level, train_prob = args[0], args[1]
	topic_num = int(args[2])

	# Placeholders are filled with: set_level, type ('train' / 'test'), train_prob
	file_template = './song_dataset/user_dataset_%s_%s_%s'
	train_file = file_template % (set_level, 'train', train_prob)
	test_file = file_template % (set_level, 'test', train_prob)

	dataset = BaseDataSet()
	dataset.build_data(train_file, test_file)

	recommender = UserLDA()
	recommender.build_model(dataset.train_data, topic_num)
	for idx, distrib in enumerate(recommender.model.print_topics(1000)):
		# The text before the first '*' of the first whitespace-separated token
		# is parsed as a number; presumably it is the weight of the topic's
		# leading term -- TODO confirm against the model's print_topics format.
		leading_weight = float(distrib.split()[0].split('*')[0])
		if leading_weight > 0:
			# Single-argument parenthesised form: prints the same text on
			# Python 2 and is valid syntax on Python 3.
			print("Topic#%s\t%s" % (idx, distrib))
Exemple #2
0
class HybirdModel_UB(BaseModel):
	'''
	@Desc: user-based hybrid recommender. Blends the similarity list held by a
		UserCF instance with the list held by a UserTagCF or UserLDA instance,
		then recommends songs owned by the most similar users.
	'''

	# Number of blended neighbours kept per user by hybird_user_sim.
	MAX_SIMILAR_USERS = 400
	# Size of the candidate pool handed to reorder_withItemTag before the
	# final top_n cut.
	REORDER_POOL_SIZE = 500

	def __init__(self):
		BaseModel.__init__(self)
		# hybird_user_sim stores uid -> [(vid, score), ...] here, sorted by
		# descending score.
		self.user_similarity = defaultdict(dict)
		self.userCF = UserCF()
		self.userTag = UserTagCF()
		self.userLda = UserLDA()

	def hybird_user_sim(self,user_songs, user_sim_file, hybird_sim_file, hybird_type='tag',theta=0.5,mix_type=0):
		'''
		@Desc: rebuild self.user_similarity by blending two similarity lists
		@params[in] user_songs: dict keyed by uid; only its keys are used here
		@params[in] user_sim_file: passed to self.userCF.load_user_similarity
		@params[in] hybird_sim_file: passed to the tag / lda model's load_user_similarity
		@params[in] hybird_type: 'tag' or 'lda'; any other value keeps the CF term only
		@params[in] theta: weight of the CF similarity, (1-theta) weights the second one
		@params[in] mix_type: truthy -> multiplicative blend, falsy -> additive blend
		'''
		time_st = time.time()
		self.userCF.load_user_similarity(user_sim_file,norm=1)

		# Pick the second similarity source once instead of duplicating the
		# per-type branches inside the user loop.
		if hybird_type == 'tag':
			hybrid_model = self.userTag
		elif hybird_type == 'lda':
			hybrid_model = self.userLda
		else:
			hybrid_model = None
		if hybrid_model is not None:
			hybrid_model.load_user_similarity(hybird_sim_file,norm=1)

		# Blending rules implemented below:
		#   multiplicative: (cf_sim*theta + 1) * (1 + other_sim*(1-theta))
		#   additive:        cf_sim*theta + other_sim*(1-theta)
		# A vid present only in the second list has no CF term, so in
		# multiplicative mode its score stays 0.0 (0.0 * factor).
		# NOTE(review): the original inline note read as "the multiplicative
		# form scores higher than the additive one" -- not verified here.
		for uid in user_songs:
			candidate_user = defaultdict(float)
			for (vid,sim) in self.userCF.user_similarity[uid]:
				if mix_type:
					candidate_user[vid] += sim * theta + 1
				else:
					candidate_user[vid] += sim * theta
			if hybrid_model is not None:
				for (vid,sim) in hybrid_model.user_similarity[uid]:
					if mix_type:
						candidate_user[vid] *= (1 + sim * (1-theta))
					else:
						candidate_user[vid] += sim * (1-theta)

			# Keep only the strongest neighbours, best first.
			sorted_sim_user = sorted(candidate_user.items(),key=lambda x:x[1],reverse=True)
			self.user_similarity[uid] = sorted_sim_user[:self.MAX_SIMILAR_USERS]
		logging.info('Rebuild user-similarity matrix cost:%s', time.time()-time_st)

	def _collect_candidate_songs(self,user_songs,uid,*neighbour_lists):
		'''
		@Desc: sum neighbour similarities over the songs the neighbours have and uid lacks
		@params[in] user_songs: dict, {uid: iterable of songs}
		@params[in] uid: the user being recommended to
		@params[in] neighbour_lists: one or more [(vid,sim),] lists, processed in order
		@return: defaultdict(float), {song: accumulated sim}
		'''
		# Built once per user instead of once per neighbour.
		owned = set(user_songs[uid])
		candidate_songs = defaultdict(float)
		for neighbours in neighbour_lists:
			for (vid,sim) in neighbours:
				for song in set(user_songs[vid]) - owned:
					candidate_songs[song] += sim
		return candidate_songs

	def recommend(self,user_songs,user_tags,item_tags,user_k,top_n,reorder=0):
		'''
		@Desc: print one "uid<TAB>json list of songs" line per user, using the
			blended similarities in self.user_similarity; stores the elapsed
			time in self.cost_time
		@params[in] user_songs: dict, {uid: iterable of songs}
		@params[in] user_tags: dict, {uid: {tag:freq}}, read only when reorder is truthy
		@params[in] item_tags: dict, {sid:{tag:freq}}, read only when reorder is truthy
		@params[in] user_k: number of most similar users to use
		@params[in] top_n: number of songs to output per user
		@params[in] reorder: truthy -> re-rank the top REORDER_POOL_SIZE candidates by tag match
		'''
		time_st = time.time()
		for uid in user_songs:
			candidate_songs = self._collect_candidate_songs(
				user_songs,uid,self.user_similarity[uid][:user_k])
			ranked = sorted(candidate_songs.items(),key=lambda x:x[1], reverse=True)
			if reorder:
				# Pool size was switched from top_n*4 to a fixed 500 (2015.3.8).
				ranked = self.reorder_withItemTag(
					user_tags[uid],item_tags,ranked[:self.REORDER_POOL_SIZE])

			top_n_songs = [song[0] for song in ranked[:top_n]]
			# Emit the top_n recommendations on stdout (presumably redirected
			# to a file by the caller). The parenthesised form is valid on
			# Python 2 and Python 3.
			print("%s\t%s"%(uid,json.dumps(top_n_songs)))
		self.cost_time = time.time() - time_st

	def hybird_recommend_result(self,user_songs,user_k,top_n):
		'''
		@Desc: fill self.result[uid] with the top_n songs scored by the summed
			similarities of the top user_k LDA neighbours and top user_k tag
			neighbours; stores the elapsed time in self.cost_time
		@params[in] user_songs: dict, {uid: iterable of songs}
		@params[in] user_k: neighbours taken from each similarity list
		@params[in] top_n: number of songs kept per user
		'''
		time_st = time.time()
		for uid in user_songs:
			candidate_songs = self._collect_candidate_songs(
				user_songs,uid,
				self.userLda.user_similarity[uid][:user_k],
				self.userTag.user_similarity[uid][:user_k])
			top_n_songs = sorted(candidate_songs.items(),key=lambda x:x[1], reverse=True)[:top_n]
			self.result[uid] = [song[0] for song in top_n_songs]
		self.cost_time = time.time() - time_st

	def hybird_result_withReorder(self,user_songs,user_tags,item_tags,user_k,top_n):
		'''
		@Desc: same candidate scoring as hybird_recommend_result, but the top
			REORDER_POOL_SIZE candidates are re-ranked by reorder_withItemTag
			before the top_n cut
		@params[in] user_songs: dict, {uid: iterable of songs}
		@params[in] user_tags: dict, {uid: {tag:freq}}
		@params[in] item_tags: dict, {sid:{tag:freq}}
		@params[in] user_k: neighbours taken from each similarity list
		@params[in] top_n: number of songs kept per user
		'''
		time_st = time.time()
		for uid in user_songs:
			candidate_songs = self._collect_candidate_songs(
				user_songs,uid,
				self.userLda.user_similarity[uid][:user_k],
				self.userTag.user_similarity[uid][:user_k])
			pool = sorted(candidate_songs.items(),key=lambda x:x[1], reverse=True)[:self.REORDER_POOL_SIZE]
			top_n_songs = self.reorder_withItemTag(user_tags[uid],item_tags,pool)[:top_n]
			self.result[uid] = [song[0] for song in top_n_songs]
		self.cost_time = time.time() - time_st

	def reorder_withItemTag(self,user_tag_distrib,items_tag_distrib,top_n_songs):
		'''
		@Desc: re-rank songs by score * (1 + cosine similarity between the
			user's tag vector and the song's tag vector). Songs with no tag
			entry, no shared tag, or a zero norm keep their original score.
		@params[in] user_tag_distrib: dict, {tag:freq}
		@params[in] items_tag_distrib: dict, {sid:{tag:freq}}
		@params[in] top_n_songs: [(sid,score),]
		@return: [(sid,new_score),] sorted by descending new_score
		'''
		user_norm = sum(freq**2 for freq in user_tag_distrib.values())
		user_tags = set(user_tag_distrib)
		user_song_match = {}
		for sid in set(song[0] for song in top_n_songs):
			# .get: a song without any tag entry is treated as "no match"
			# instead of raising KeyError.
			song_tags = items_tag_distrib.get(sid)
			if not song_tags:
				continue
			inter_tag = user_tags & set(song_tags)
			if not inter_tag:
				continue
			# The norm is only needed once a shared tag is known to exist.
			song_norm = sum(freq**2 for freq in song_tags.values())
			denom = (user_norm*song_norm)**0.5
			if not denom:
				# All-zero frequencies: skip rather than divide by zero.
				continue
			dot = sum(song_tags[tag] * user_tag_distrib[tag] for tag in inter_tag)
			user_song_match[sid] = dot / denom

		n_top_n_songs = sorted(
			[(song[0],song[1]*(1+user_song_match.get(song[0],0.0))) for song in top_n_songs],
			key=lambda x:x[1],reverse=True)
		return n_top_n_songs
Exemple #3
0
	def __init__(self):
		'''
		@Desc: run BaseModel's initialiser, then create the similarity table
			and one instance of each of the three sub-models
		'''
		BaseModel.__init__(self)
		# defaultdict(dict): an unseen key yields an empty dict on first access.
		self.user_similarity = defaultdict(dict)
		# Sub-model instances; what each class loads or computes is defined
		# outside this view.
		self.userCF = UserCF()
		self.userTag = UserTagCF()
		self.userLda = UserLDA()
Exemple #4
0
 def __init__(self):
     '''
     @Desc: run BaseModel's initialiser, then create the similarity table
         and one instance of each of the three sub-models
     '''
     BaseModel.__init__(self)
     # defaultdict(dict): an unseen key yields an empty dict on first access.
     self.user_similarity = defaultdict(dict)
     # Sub-model instances; what each class loads or computes is defined
     # outside this view.
     self.userCF = UserCF()
     self.userTag = UserTagCF()
     self.userLda = UserLDA()
Exemple #5
0
class HybirdModel_UB(BaseModel):
    '''
    @Desc: user-based hybrid recommender. Blends the similarity list held by a
        UserCF instance with the list held by a UserTagCF or UserLDA instance,
        then recommends songs owned by the most similar users.
    '''

    # Number of blended neighbours kept per user by hybird_user_sim.
    MAX_SIMILAR_USERS = 400
    # Size of the candidate pool handed to reorder_withItemTag before the
    # final top_n cut.
    REORDER_POOL_SIZE = 500

    def __init__(self):
        BaseModel.__init__(self)
        # hybird_user_sim stores uid -> [(vid, score), ...] here, sorted by
        # descending score.
        self.user_similarity = defaultdict(dict)
        self.userCF = UserCF()
        self.userTag = UserTagCF()
        self.userLda = UserLDA()

    def hybird_user_sim(self,
                        user_songs,
                        user_sim_file,
                        hybird_sim_file,
                        hybird_type='tag',
                        theta=0.5,
                        mix_type=0):
        '''
        @Desc: rebuild self.user_similarity by blending two similarity lists
        @params[in] user_songs: dict keyed by uid; only its keys are used here
        @params[in] user_sim_file: passed to self.userCF.load_user_similarity
        @params[in] hybird_sim_file: passed to the tag / lda model's load_user_similarity
        @params[in] hybird_type: 'tag' or 'lda'; any other value keeps the CF term only
        @params[in] theta: weight of the CF similarity, (1-theta) weights the second one
        @params[in] mix_type: truthy -> multiplicative blend, falsy -> additive blend
        '''
        time_st = time.time()
        self.userCF.load_user_similarity(user_sim_file, norm=1)

        # Pick the second similarity source once instead of duplicating the
        # per-type branches inside the user loop.
        if hybird_type == 'tag':
            hybrid_model = self.userTag
        elif hybird_type == 'lda':
            hybrid_model = self.userLda
        else:
            hybrid_model = None
        if hybrid_model is not None:
            hybrid_model.load_user_similarity(hybird_sim_file, norm=1)

        # Blending rules implemented below:
        #   multiplicative: (cf_sim*theta + 1) * (1 + other_sim*(1-theta))
        #   additive:        cf_sim*theta + other_sim*(1-theta)
        # A vid present only in the second list has no CF term, so in
        # multiplicative mode its score stays 0.0 (0.0 * factor).
        # NOTE(review): the original inline note read as "the multiplicative
        # form scores higher than the additive one" -- not verified here.
        for uid in user_songs:
            candidate_user = defaultdict(float)
            for (vid, sim) in self.userCF.user_similarity[uid]:
                if mix_type:
                    candidate_user[vid] += sim * theta + 1
                else:
                    candidate_user[vid] += sim * theta
            if hybrid_model is not None:
                for (vid, sim) in hybrid_model.user_similarity[uid]:
                    if mix_type:
                        candidate_user[vid] *= (1 + sim * (1 - theta))
                    else:
                        candidate_user[vid] += sim * (1 - theta)

            # Keep only the strongest neighbours, best first.
            sorted_sim_user = sorted(candidate_user.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
            self.user_similarity[uid] = sorted_sim_user[:self.MAX_SIMILAR_USERS]
        logging.info('Rebuild user-similarity matrix cost:%s',
                     time.time() - time_st)

    def _collect_candidate_songs(self, user_songs, uid, *neighbour_lists):
        '''
        @Desc: sum neighbour similarities over the songs the neighbours have and uid lacks
        @params[in] user_songs: dict, {uid: iterable of songs}
        @params[in] uid: the user being recommended to
        @params[in] neighbour_lists: one or more [(vid,sim),] lists, processed in order
        @return: defaultdict(float), {song: accumulated sim}
        '''
        # Built once per user instead of once per neighbour.
        owned = set(user_songs[uid])
        candidate_songs = defaultdict(float)
        for neighbours in neighbour_lists:
            for (vid, sim) in neighbours:
                for song in set(user_songs[vid]) - owned:
                    candidate_songs[song] += sim
        return candidate_songs

    def recommend(self,
                  user_songs,
                  user_tags,
                  item_tags,
                  user_k,
                  top_n,
                  reorder=0):
        '''
        @Desc: print one "uid<TAB>json list of songs" line per user, using the
            blended similarities in self.user_similarity; stores the elapsed
            time in self.cost_time
        @params[in] user_songs: dict, {uid: iterable of songs}
        @params[in] user_tags: dict, {uid: {tag:freq}}, read only when reorder is truthy
        @params[in] item_tags: dict, {sid:{tag:freq}}, read only when reorder is truthy
        @params[in] user_k: number of most similar users to use
        @params[in] top_n: number of songs to output per user
        @params[in] reorder: truthy -> re-rank the top REORDER_POOL_SIZE candidates by tag match
        '''
        time_st = time.time()
        for uid in user_songs:
            candidate_songs = self._collect_candidate_songs(
                user_songs, uid, self.user_similarity[uid][:user_k])
            ranked = sorted(candidate_songs.items(),
                            key=lambda x: x[1],
                            reverse=True)
            if reorder:
                # Pool size was switched from top_n*4 to a fixed 500 (2015.3.8).
                ranked = self.reorder_withItemTag(
                    user_tags[uid], item_tags,
                    ranked[:self.REORDER_POOL_SIZE])

            top_n_songs = [song[0] for song in ranked[:top_n]]
            # Emit the top_n recommendations on stdout (presumably redirected
            # to a file by the caller). The parenthesised form is valid on
            # Python 2 and Python 3.
            print("%s\t%s" % (uid, json.dumps(top_n_songs)))
        self.cost_time = time.time() - time_st

    def hybird_recommend_result(self, user_songs, user_k, top_n):
        '''
        @Desc: fill self.result[uid] with the top_n songs scored by the summed
            similarities of the top user_k LDA neighbours and top user_k tag
            neighbours; stores the elapsed time in self.cost_time
        @params[in] user_songs: dict, {uid: iterable of songs}
        @params[in] user_k: neighbours taken from each similarity list
        @params[in] top_n: number of songs kept per user
        '''
        time_st = time.time()
        for uid in user_songs:
            candidate_songs = self._collect_candidate_songs(
                user_songs, uid,
                self.userLda.user_similarity[uid][:user_k],
                self.userTag.user_similarity[uid][:user_k])
            top_n_songs = sorted(candidate_songs.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[:top_n]
            self.result[uid] = [song[0] for song in top_n_songs]
        self.cost_time = time.time() - time_st

    def hybird_result_withReorder(self, user_songs, user_tags, item_tags,
                                  user_k, top_n):
        '''
        @Desc: same candidate scoring as hybird_recommend_result, but the top
            REORDER_POOL_SIZE candidates are re-ranked by reorder_withItemTag
            before the top_n cut
        @params[in] user_songs: dict, {uid: iterable of songs}
        @params[in] user_tags: dict, {uid: {tag:freq}}
        @params[in] item_tags: dict, {sid:{tag:freq}}
        @params[in] user_k: neighbours taken from each similarity list
        @params[in] top_n: number of songs kept per user
        '''
        time_st = time.time()
        for uid in user_songs:
            candidate_songs = self._collect_candidate_songs(
                user_songs, uid,
                self.userLda.user_similarity[uid][:user_k],
                self.userTag.user_similarity[uid][:user_k])
            pool = sorted(candidate_songs.items(),
                          key=lambda x: x[1],
                          reverse=True)[:self.REORDER_POOL_SIZE]
            top_n_songs = self.reorder_withItemTag(user_tags[uid], item_tags,
                                                   pool)[:top_n]
            self.result[uid] = [song[0] for song in top_n_songs]
        self.cost_time = time.time() - time_st

    def reorder_withItemTag(self, user_tag_distrib, items_tag_distrib,
                            top_n_songs):
        '''
        @Desc: re-rank songs by score * (1 + cosine similarity between the
            user's tag vector and the song's tag vector). Songs with no tag
            entry, no shared tag, or a zero norm keep their original score.
        @params[in] user_tag_distrib: dict, {tag:freq}
        @params[in] items_tag_distrib: dict, {sid:{tag:freq}}
        @params[in] top_n_songs: [(sid,score),]
        @return: [(sid,new_score),] sorted by descending new_score
        '''
        user_norm = sum(freq**2 for freq in user_tag_distrib.values())
        user_tags = set(user_tag_distrib)
        user_song_match = {}
        for sid in set(song[0] for song in top_n_songs):
            # .get: a song without any tag entry is treated as "no match"
            # instead of raising KeyError.
            song_tags = items_tag_distrib.get(sid)
            if not song_tags:
                continue
            inter_tag = user_tags & set(song_tags)
            if not inter_tag:
                continue
            # The norm is only needed once a shared tag is known to exist.
            song_norm = sum(freq**2 for freq in song_tags.values())
            denom = (user_norm * song_norm)**0.5
            if not denom:
                # All-zero frequencies: skip rather than divide by zero.
                continue
            dot = sum(song_tags[tag] * user_tag_distrib[tag]
                      for tag in inter_tag)
            user_song_match[sid] = dot / denom

        n_top_n_songs = sorted(
            [(song[0], song[1] * (1 + user_song_match.get(song[0], 0.0)))
             for song in top_n_songs],
            key=lambda x: x[1],
            reverse=True)
        return n_top_n_songs