Esempio n. 1
0
def build_ITDistribution_with_singer(item_tag_file):
    '''
    @Desc: build item-tag distribution from database, add the singer of
           every song as one more tag and dump the result to a file
    @params[in]: item_tag_file: path of output. One line is written per
                 song: "<songid><TAB><json dict of tag -> count>"
    '''
    song_tag_distrib = defaultdict(dict)
    time_st = time.time()
    db = db_connection()

    # Build item-tag distribution: for each song, count the playlists
    # that carry each tag.
    for record in db['playlist_info'].find():
        songs = record['songs'].split(',')
        tags = record['tags'].encode('utf8')
        if tags == "none":
            # records whose tags field is the literal "none" are skipped
            continue
        tag_list = tags.split(',')
        for song in songs:
            tag_counts = song_tag_distrib[song]
            for tag in tag_list:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

    # Add artist info into item-tag file
    song_info = db['song_info']
    # The context manager guarantees the output is flushed and closed,
    # even if a lookup or the JSON dump raises midway.
    with open(item_tag_file, 'wb') as fout:
        for count, (songid, tag_dist) in enumerate(song_tag_distrib.items(), 1):
            record = song_info.find_one({"_id": songid})
            logging.info("Processing cnt:%s", count)
            try:
                tag_dist[record['singer_name'].encode('utf8')] = 1
            except Exception:
                # Best effort: any failure to read the singer from the
                # record (no record, missing field, bad value) falls back
                # to the page lookup. `Exception` (not a bare except) lets
                # KeyboardInterrupt / SystemExit stop the run.
                logging.info('RecordMiss, songid:%s. Get from page.', songid)
                try:
                    tag_dist[get_singer_from_page(songid)] = 1
                except Exception:
                    # The song is still written, just without a singer tag.
                    logging.info("Get singer failed.Songid:%s", songid)
            fout.write("%s\t%s\n" % (songid, json.dumps(tag_dist)))

    time_ed = time.time()
    logging.info('Build item-tag distribution cost:%s', time_ed - time_st)
Esempio n. 2
0
def build_ITDistribution_with_singer(item_tag_file):
    '''
    @Desc: build item-tag distribution from database; the singer of each
           song is added to that song's tags before the result is dumped
    @params[in]: item_tag_file: path of output, one
                 "<songid><TAB><json dict of tag -> count>" line per song
    '''
    time_st = time.time()
    song_tag_distrib = defaultdict(dict)
    db = db_connection()

    # Step 1: per song, count how many playlist records carry each tag.
    playlists = db['playlist_info']
    for record in playlists.find():
        songs = record['songs'].split(',')
        tags = record['tags'].encode('utf8')
        if tags == "none":
            # a tags value of exactly "none" contributes nothing
            continue
        for song in songs:
            per_song = song_tag_distrib[song]
            for tag in tags.split(','):
                per_song[tag] = per_song.get(tag, 0) + 1

    # Step 2: attach the singer and write every song out. `with` closes
    # the file on every exit path; the handle used to be left open.
    singers = db['song_info']
    count = 0
    with open(item_tag_file, 'wb') as out_file:
        for songid, tag_dist in song_tag_distrib.items():
            record = singers.find_one({"_id": songid})
            count += 1
            logging.info("Processing cnt:%s", count)
            try:
                singer = record['singer_name'].encode('utf8')
            except Exception:
                # Record unusable for any reason -> try the page lookup.
                # Catching Exception instead of everything keeps Ctrl-C
                # and SystemExit able to abort this long loop.
                logging.info('RecordMiss, songid:%s. Get from page.', songid)
                try:
                    singer = get_singer_from_page(songid)
                except Exception:
                    logging.info("Get singer failed.Songid:%s", songid)
                else:
                    tag_dist[singer] = 1
            else:
                tag_dist[singer] = 1
            out_file.write("%s\t%s\n" % (songid, json.dumps(tag_dist)))

    time_ed = time.time()
    logging.info('Build item-tag distribution cost:%s', time_ed - time_st)
Esempio n. 3
0
def build_dataset(filepath, max_num, datatype):
    '''
    @Desc: select users through select_uid_reservoir (a random sample of
           max_num users for training and testing) and print every
           selected user with its favourites to stdout
    @params[in] filepath: path of file include all user_id
    @params[in] max_num: int, amount of selected users
    @params[in] datatype: 'song' -> favourites come from
                get_favorSong_with_id, 'playlist' -> from
                get_favor_playlist_with_id; any other value hands
                get_method=None to select_uid_reservoir
    '''
    logging.info("Dataset building process >> Begin")
    db = db_connection()
    get_method = (
        get_favorSong_with_id if datatype == 'song'
        else get_favor_playlist_with_id if datatype == 'playlist'
        else None
    )
    selected = select_uid_reservoir(filepath, max_num, db, get_method=get_method)
    for uid, favorites in selected:
        # single-argument print(...) behaves the same as the print statement
        print("%s\t%s" % (uid, ','.join(favorites)))
    logging.info("Dataset building process >> Complete")
Esempio n. 4
0
def get_favor_playlist_with_id(userid, database=None, table_name='user_info'):
    '''
    @desc: get user's favor playlists from database
    @params[in] userid: value matched against the '_id' field
    @params[in] database: pymongo.db; when None, a connection is obtained
                from db_connection()
    @params[in] table_name: name of collection
    @return: list of the comma-separated entries of 'favor_playlist', or
             None when the user has no record or the field is None
    '''
    # Identity test: `== None` would go through the object's own __eq__.
    db = db_connection() if database is None else database
    record = db[table_name].find_one({'_id': userid})
    if record is None:
        # find_one found nothing for this user; report "no playlists"
        # instead of failing with a TypeError on None['favor_playlist'].
        return None
    favor_playlist = record['favor_playlist']
    if favor_playlist is None:
        return None
    return favor_playlist.split(',')
Esempio n. 5
0
def get_favorSong_with_id(userid, database=None, table_name='user_favor_20141215'):
    '''
    @desc: get user's favor songs from database
    @params[in] userid: value matched against the '_id' field
    @params[in] database: pymongo.db; when None, a connection is obtained
                from db_connection()
    @params[in] table_name: name of collection
    @return: list of the comma-separated entries of 'favor_songs', or
             None when the user has no record or the field is None
    '''
    # Identity test: `== None` would go through the object's own __eq__.
    db = db_connection() if database is None else database
    record = db[table_name].find_one({'_id': userid})
    if record is None:
        # Unknown user: return None like the "no songs" case rather than
        # failing with a TypeError on None['favor_songs'].
        return None
    favor_songs = record['favor_songs']
    if favor_songs is None:
        return None
    return favor_songs.split(',')
Esempio n. 6
0
def get_favor_playlist_with_id(userid, database=None, table_name='user_info'):
    '''
    @desc: get user's favor playlists from database
    @params[in] userid: looked up as the '_id' of the record
    @params[in] database: pymongo.db (db_connection() is called when None)
    @params[in] table_name: name of collection
    @return: 'favor_playlist' split on ',' as a list; None if there is no
             record for userid or the stored value is None
    '''
    # Connect to db only when the caller did not hand one in. `is None`
    # avoids invoking a custom __eq__ on the database object.
    db = database
    if db is None:
        db = db_connection()
    collection = db[table_name]

    record = collection.find_one({'_id': userid})
    # No matching record: previously this crashed with a TypeError when
    # indexing None; treat it as "nothing favoured".
    favor_playlist = None if record is None else record['favor_playlist']
    if favor_playlist is not None:
        favor_playlist = favor_playlist.split(',')
    return favor_playlist
Esempio n. 7
0
def build_ITDistribution_withWeight(item_tag_file):
    '''
    @Desc: build item-tag distribution from database, weighting every
           playlist by play_times / (average play_times of all playlists)
    @params[in]: item_tag_file: path of output. One line is written per
                 song: "<songid><TAB><json dict of tag -> summed weight>"
    '''
    song_tag_distrib = defaultdict(dict)
    db = db_connection()
    collection = db['playlist_info']

    # First pass: total and number of play counts, for the average.
    count = 0
    all_times = 0
    for record in collection.find():
        all_times += int(record['play_times'])
        count += 1

    if count == 0 or all_times == 0:
        # No playlists, or play counts summing to zero: no weight can be
        # computed (this used to raise ZeroDivisionError). The output
        # file is still written, empty.
        logging.warning("No usable play_times in playlist_info; "
                        "item-tag distribution is empty")
    else:
        # True division: Python 2 integer division truncated the average
        # (and could truncate it to 0), which distorted every weight.
        average_playtimes = float(all_times) / count

        # Second pass: accumulate the weight of each playlist onto every
        # (song, tag) pair it contains.
        for record in collection.find():
            songs = record['songs'].split(',')
            tags = record['tags'].encode('utf8')
            weight = int(record['play_times']) / average_playtimes
            if tags == "none" or weight <= 0:
                continue
            tag_list = tags.split(',')
            for song in songs:
                tag_weights = song_tag_distrib[song]
                for tag in tag_list:
                    tag_weights[tag] = tag_weights.get(tag, 0) + weight

    # Dump item-tag distribution to file
    logging.info("Dumping item-tag distribution to file:%s", item_tag_file)
    with open(item_tag_file, 'wb') as fout:
        for sid, tag_dist in song_tag_distrib.items():
            fout.write("%s\t%s\n" % (sid, json.dumps(tag_dist)))
    logging.info("Dumping process done..")
Esempio n. 8
0
def build_ITDistribution_withWeight(item_tag_file):
    '''
    @Desc: build item-tag distribution from database; each playlist
           contributes its play_times divided by the average play_times
           of all playlists instead of a plain count of 1
    @params[in]: item_tag_file: path of output, one
                 "<songid><TAB><json dict of tag -> summed weight>" line
                 per song
    '''
    song_tag_distrib = defaultdict(dict)
    collection = db_connection()['playlist_info']

    # Pass 1: gather what is needed for the average play count.
    play_counts = [int(record['play_times']) for record in collection.find()]
    all_times = sum(play_counts)

    # An empty collection or a zero total leaves no defined weight; the
    # old code died with ZeroDivisionError here. Skip pass 2 in that case
    # and write an empty file.
    if play_counts and all_times != 0:
        # float() forces true division; under Python 2 the average used
        # to be truncated to an integer, skewing all weights.
        average_playtimes = float(all_times) / len(play_counts)

        # Pass 2: add each playlist's weight to all its (song, tag) pairs.
        for record in collection.find():
            songs = record['songs'].split(',')
            tags = record['tags'].encode('utf8')
            weight = int(record['play_times']) / average_playtimes
            if tags == "none" or weight <= 0:
                continue
            for song in songs:
                weights = song_tag_distrib[song]
                for tag in tags.split(','):
                    weights[tag] = weights.get(tag, 0) + weight
    else:
        logging.warning("playlist_info has no usable play_times; "
                        "writing an empty item-tag distribution")

    # Dump item-tag distribution to file
    logging.info("Dumping item-tag distribution to file:%s", item_tag_file)
    with open(item_tag_file, 'wb') as out_file:
        for sid, tag_dist in song_tag_distrib.items():
            data_in_json = json.dumps(tag_dist)
            out_file.write("%s\t%s\n" % (sid, data_in_json))
    logging.info("Dumping process done..")
Esempio n. 9
0
def build_dataset(filepath, max_num, datatype):
    '''
    @Desc: randomly select max_num of user for training and testing
           (selection is delegated to select_uid_reservoir) and print
           "<uid><TAB><comma-joined favourites>" for each of them
    @params[in] filepath: path of file include all user_id
    @params[in] max_num: int, amount of selected users
    @params[in] datatype: 'song' or 'playlist', picks the function used
                to load a user's favourites; other values pass None
    '''
    logging.info("Dataset building process >> Begin")
    db = db_connection()

    # datatype -> loader of a user's favourites
    loaders = {
        'song': get_favorSong_with_id,
        'playlist': get_favor_playlist_with_id,
    }
    loader = loaders.get(datatype)

    for uid, favor_songs in select_uid_reservoir(filepath, max_num, db,
                                                 get_method=loader):
        joined = ','.join(favor_songs)
        # print(...) with one argument is equivalent to the print statement
        print("%s\t%s" % (uid, joined))

    logging.info("Dataset building process >> Complete")
Esempio n. 10
0
def get_favorSong_with_id(userid,
                          database=None,
                          table_name='user_favor_20141215'):
    '''
    @desc: get user's favor songs from database
    @params[in] userid: looked up as the '_id' of the record
    @params[in] database: pymongo.db (db_connection() is called when None)
    @params[in] table_name: name of collection
    @return: 'favor_songs' split on ',' as a list; None if there is no
             record for userid or the stored value is None
    '''
    # Connect to db only when none was supplied. `is None` avoids invoking
    # a custom __eq__ on the database object.
    db = database
    if db is None:
        db = db_connection()
    collection = db[table_name]

    record = collection.find_one({'_id': userid})
    # A user without a record used to crash with a TypeError when
    # indexing None; treat it the same as "no favourite songs".
    favor_songs = None if record is None else record['favor_songs']
    if favor_songs is not None:
        favor_songs = favor_songs.split(',')
    return favor_songs
Esempio n. 11
0
def test():
    '''
    @Desc: manual check: fetch song '684231' from the 'song_info'
           collection and print its utf8-encoded singer name
    '''
    song_id = '684231'
    song_info = db_connection()['song_info']
    singer_name = song_info.find_one({"_id": song_id})['singer_name']
    # single-argument print(...) behaves the same as the print statement
    print(singer_name.encode('utf8'))
Esempio n. 12
0
def test():
    '''
    @Desc: quick manual lookup of one hard-coded song ('684231') in
           'song_info'; prints the singer name encoded as utf8
    '''
    query = {"_id": '684231'}
    record = db_connection()['song_info'].find_one(query)
    encoded_name = record['singer_name'].encode('utf8')
    # print(...) with one argument is equivalent to the print statement
    print(encoded_name)