def build_ITDistribution_with_singer(item_tag_file):
    '''
    @Desc: build item-tag distribution from database, add every song's
           singer as one more tag with count 1, and write one line per
           song to item_tag_file: song id, a tab, then the JSON dump of
           that song's tag counts
    @params[in]: item_tag_file: path of output
    '''
    song_tag_distrib = defaultdict(dict)
    time_st = time.time()
    db = db_connection()

    #Build item-tag distribution: count how often each tag appears on
    #the playlists that contain a song
    for record in db['playlist_info'].find():
        songs = record['songs'].split(',')
        tags = record['tags'].encode('utf8')
        if tags == "none":
            continue
        tag_list = tags.split(',')
        for song in songs:
            tag_counts = song_tag_distrib[song]
            for tag in tag_list:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

    #Add artist info into item-tag file
    song_info = db['song_info']
    # 'with' guarantees the output file is flushed and closed, even when
    # a lookup or the JSON dump raises half-way through.
    with open(item_tag_file, 'wb') as fout:
        for count, (songid, tag_dist) in enumerate(song_tag_distrib.items(), 1):
            record = song_info.find_one({"_id": songid})
            logging.info("Processing cnt:%s", count)
            try:
                tag_dist[record['singer_name'].encode('utf8')] = 1
            except Exception:
                # Best effort: no usable record in the collection, so fall
                # back to the page lookup. 'Exception' (not a bare except)
                # keeps Ctrl-C / SystemExit working.
                logging.info('RecordMiss, songid:%s. Get from page.', songid)
                try:
                    tag_dist[get_singer_from_page(songid)] = 1
                except Exception:
                    logging.info("Get singer failed.Songid:%s", songid)
            fout.write("%s\t%s\n" % (songid, json.dumps(tag_dist)))

    time_ed = time.time()
    logging.info('Build item-tag distribution cost:%s', time_ed - time_st)
def build_ITDistribution_with_singer(item_tag_file):
    '''
    @Desc: build item-tag distribution from database, add every song's
           singer as one more tag with count 1, and write one line per
           song to item_tag_file: song id, a tab, then the JSON dump of
           that song's tag counts
    @params[in]: item_tag_file: path of output
    '''
    song_tag_distrib = defaultdict(dict)
    time_st = time.time()
    db = db_connection()

    #Build item-tag distribution: count how often each tag appears on
    #the playlists that contain a song
    for record in db['playlist_info'].find():
        songs = record['songs'].split(',')
        tags = record['tags'].encode('utf8')
        if tags == "none":
            continue
        tag_list = tags.split(',')
        for song in songs:
            tag_counts = song_tag_distrib[song]
            for tag in tag_list:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

    #Add artist info into item-tag file
    song_info = db['song_info']
    # 'with' guarantees the output file is flushed and closed, even when
    # a lookup or the JSON dump raises half-way through.
    with open(item_tag_file, 'wb') as fout:
        for count, (songid, tag_dist) in enumerate(song_tag_distrib.items(), 1):
            record = song_info.find_one({"_id": songid})
            logging.info("Processing cnt:%s", count)
            try:
                tag_dist[record['singer_name'].encode('utf8')] = 1
            except Exception:
                # Best effort: no usable record in the collection, so fall
                # back to the page lookup. 'Exception' (not a bare except)
                # keeps Ctrl-C / SystemExit working.
                logging.info('RecordMiss, songid:%s. Get from page.', songid)
                try:
                    tag_dist[get_singer_from_page(songid)] = 1
                except Exception:
                    logging.info("Get singer failed.Songid:%s", songid)
            fout.write("%s\t%s\n" % (songid, json.dumps(tag_dist)))

    time_ed = time.time()
    logging.info('Build item-tag distribution cost:%s', time_ed - time_st)
def build_dataset(filepath, max_num, datatype):
    '''
    @Desc: randomly select max_num of user for training and testing and
           print, for every (uid, favorites) pair returned by
           select_uid_reservoir, the uid, a tab and the comma-joined
           favorites
    @params[in] filepath: path of file include all user_id
    @params[in] max_num: int, amount of selected users
    @params[in] datatype: 'song' selects get_favorSong_with_id,
                'playlist' selects get_favor_playlist_with_id; for any
                other value None is passed on as get_method
    '''
    logging.info("Dataset building process >> Begin")
    db = db_connection()

    #Pick the getter that is handed over to the sampler
    if datatype == 'song':
        favor_getter = get_favorSong_with_id
    elif datatype == 'playlist':
        favor_getter = get_favor_playlist_with_id
    else:
        favor_getter = None

    sampled = select_uid_reservoir(filepath, max_num, db, get_method=favor_getter)
    for uid, favorites in sampled:
        joined = ','.join(favorites)
        print("%s\t%s" % (uid, joined))

    logging.info("Dataset building process >> Complete")
def get_favor_playlist_with_id(userid, database=None, table_name='user_info'):
    '''
    @desc: get user's favor playlists from database
    @params[in] userid: value looked up as the record's '_id'
    @params[in] database: pymongo.db; when None a connection is obtained
                from db_connection()
    @params[in] table_name: name of collection
    @return: the record's 'favor_playlist' string split on ',' (a list),
             or None when no record matches userid or the field is
             missing / None
    '''
    #Connect to db ('is None': identity test, never compares or
    #truth-tests the database object itself)
    db = db_connection() if database is None else database

    #Get record from collection
    record = db[table_name].find_one({'_id': userid})
    if record is None:
        # Unknown user: report "no favorites" instead of failing with a
        # TypeError on record[...]
        return None

    favor_playlist = record.get('favor_playlist')
    if favor_playlist is None:
        return None
    return favor_playlist.split(',')
def get_favorSong_with_id(userid, database=None, table_name='user_favor_20141215'):
    '''
    @desc: get user's favor songs from database
    @params[in] userid: value looked up as the record's '_id'
    @params[in] database: pymongo.db; when None a connection is obtained
                from db_connection()
    @params[in] table_name: name of collection
    @return: the record's 'favor_songs' string split on ',' (a list), or
             None when no record matches userid or the field is
             missing / None
    '''
    #Connect to db ('is None': identity test, never compares or
    #truth-tests the database object itself)
    db = db_connection() if database is None else database

    #Get record from collection
    record = db[table_name].find_one({'_id': userid})
    if record is None:
        # Unknown user: report "no favorites" instead of failing with a
        # TypeError on record[...]
        return None

    favor_songs = record.get('favor_songs')
    if favor_songs is None:
        return None
    return favor_songs.split(',')
def get_favor_playlist_with_id(userid, database=None, table_name='user_info'):
    '''
    @desc: get user's favor playlists from database
    @params[in] userid: value looked up as the record's '_id'
    @params[in] database: pymongo.db; when None a connection is obtained
                from db_connection()
    @params[in] table_name: name of collection
    @return: the record's 'favor_playlist' string split on ',' (a list),
             or None when no record matches userid or the field is
             missing / None
    '''
    #Connect to db (identity test against None; the database object is
    #never compared or truth-tested)
    db = database
    if db is None:
        db = db_connection()

    #select collection
    collection = db[table_name]

    #Get record from collection; find_one yields None for an unknown id,
    #which is reported as "no favorites" rather than a TypeError
    record = collection.find_one({'_id': userid})
    if record is None:
        return None

    favor_playlist = record.get('favor_playlist')
    if favor_playlist is not None:
        favor_playlist = favor_playlist.split(',')
    return favor_playlist
def build_ITDistribution_withWeight(item_tag_file):
    '''
    @Desc: build item-tag distribution from database, where every
           playlist contributes its tags to its songs with weight
           play_times / (average play_times over all playlists), and
           write one line per song to item_tag_file: song id, a tab,
           then the JSON dump of that song's tag weights
    @params[in]: item_tag_file: path of output
    '''
    song_tag_distrib = defaultdict(dict)
    db = db_connection()
    collection = db['playlist_info']

    #First pass: average play times over all playlists
    count = 0
    all_times = 0
    for record in collection.find():
        all_times += int(record['play_times'])
        count += 1

    if count == 0 or all_times == 0:
        # No playlists, or a zero total: the average would be undefined
        # or 0 and every weight a division by zero. Nothing can get a
        # positive weight, so the distribution stays empty.
        logging.warning("No play_times to average, item-tag distribution is empty")
        records = []
    else:
        # float() keeps the fractional part: under Python 2 int / int
        # truncates, which skewed the weights and could round the
        # average down to 0.
        average_playtimes = float(all_times) / count
        records = collection.find()

    #Second pass: build item-tag distribution
    for record in records:
        songs = record['songs'].split(',')
        tags = record['tags'].encode('utf8')
        weight = int(record['play_times']) / average_playtimes
        if tags == "none" or weight <= 0:
            continue
        tag_list = tags.split(',')
        for song in songs:
            tag_weights = song_tag_distrib[song]
            for tag in tag_list:
                tag_weights[tag] = tag_weights.get(tag, 0) + weight

    #Dump item-tag distribution to file
    logging.info("Dumping item-tag distribution to file:%s", item_tag_file)
    with open(item_tag_file, 'wb') as fout:
        for sid, tag_dist in song_tag_distrib.items():
            fout.write("%s\t%s\n" % (sid, json.dumps(tag_dist)))
    logging.info("Dumping process done..")
def build_ITDistribution_withWeight(item_tag_file):
    '''
    @Desc: build item-tag distribution from database, where every
           playlist contributes its tags to its songs with weight
           play_times / (average play_times over all playlists), and
           write one line per song to item_tag_file: song id, a tab,
           then the JSON dump of that song's tag weights
    @params[in]: item_tag_file: path of output
    '''
    song_tag_distrib = defaultdict(dict)
    db = db_connection()
    collection = db['playlist_info']

    #First pass: average play times over all playlists
    count = 0
    all_times = 0
    for record in collection.find():
        all_times += int(record['play_times'])
        count += 1

    if count == 0 or all_times == 0:
        # No playlists, or a zero total: the average would be undefined
        # or 0 and every weight a division by zero. Nothing can get a
        # positive weight, so the distribution stays empty.
        logging.warning("No play_times to average, item-tag distribution is empty")
        records = []
    else:
        # float() keeps the fractional part: under Python 2 int / int
        # truncates, which skewed the weights and could round the
        # average down to 0.
        average_playtimes = float(all_times) / count
        records = collection.find()

    #Second pass: build item-tag distribution
    for record in records:
        songs = record['songs'].split(',')
        tags = record['tags'].encode('utf8')
        weight = int(record['play_times']) / average_playtimes
        if tags == "none" or weight <= 0:
            continue
        tag_list = tags.split(',')
        for song in songs:
            tag_weights = song_tag_distrib[song]
            for tag in tag_list:
                tag_weights[tag] = tag_weights.get(tag, 0) + weight

    #Dump item-tag distribution to file
    logging.info("Dumping item-tag distribution to file:%s", item_tag_file)
    with open(item_tag_file, 'wb') as fout:
        for sid, tag_dist in song_tag_distrib.items():
            fout.write("%s\t%s\n" % (sid, json.dumps(tag_dist)))
    logging.info("Dumping process done..")
def build_dataset(filepath, max_num, datatype):
    '''
    @Desc: randomly select max_num of user for training and testing; each
           (uid, favorites) pair coming back from select_uid_reservoir is
           printed as the uid, a tab and the comma-joined favorites
    @params[in] filepath: path of file include all user_id
    @params[in] max_num: int, amount of selected users
    @params[in] datatype: 'song' -> get_favorSong_with_id,
                'playlist' -> get_favor_playlist_with_id, anything else
                -> None is handed over as get_method
    '''
    logging.info("Dataset building process >> Begin")
    connection = db_connection()

    #Resolve the favorites getter for the requested data type
    get_method = (
        get_favorSong_with_id if datatype == 'song'
        else get_favor_playlist_with_id if datatype == 'playlist'
        else None
    )

    for uid, favor_items in select_uid_reservoir(
            filepath, max_num, connection, get_method=get_method):
        print("%s\t%s" % (uid, ','.join(favor_items)))

    logging.info("Dataset building process >> Complete")
def get_favorSong_with_id(userid, database=None, table_name='user_favor_20141215'):
    '''
    @desc: get user's favor songs from database
    @params[in] userid: value looked up as the record's '_id'
    @params[in] database: pymongo.db; when None a connection is obtained
                from db_connection()
    @params[in] table_name: name of collection
    @return: the record's 'favor_songs' string split on ',' (a list), or
             None when no record matches userid or the field is
             missing / None
    '''
    #Connect to db (identity test against None; the database object is
    #never compared or truth-tested)
    db = database
    if db is None:
        db = db_connection()

    #Select collection
    collection = db[table_name]

    #Get record from collection; find_one yields None for an unknown id,
    #which is reported as "no favorites" rather than a TypeError
    record = collection.find_one({'_id': userid})
    if record is None:
        return None

    favor_songs = record.get('favor_songs')
    if favor_songs is not None:
        favor_songs = favor_songs.split(',')
    return favor_songs
def test():
    '''
    @desc: manual check - look up the hard-coded song id '684231' in the
           'song_info' collection and print its singer_name as utf8
    '''
    song_info = db_connection()['song_info']
    found = song_info.find_one({"_id": '684231'})
    singer_name = found['singer_name']
    print(singer_name.encode('utf8'))
def test():
    '''
    @desc: manual check - fetch the 'song_info' record whose _id is the
           hard-coded '684231' and print its singer_name encoded as utf8
    '''
    query = {"_id": '684231'}
    connection = db_connection()
    song_record = connection['song_info'].find_one(query)
    print(song_record['singer_name'].encode('utf8'))