def update_userinfo():
    """
    Temporary maintenance script: recompute follow_count and fan_count
    for every user in the UserInfos collection from the stored id lists.
    Returns:
    """
    DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    uids = DAO_inst.db_inst.distinct('userId')
    count = 0
    for uid in uids:
        userinfo = DAO_inst.db_inst.find_one({'userId': uid})
        userinfo['follow_count'] = len(userinfo['follow_ids'])
        userinfo['fan_count'] = len(userinfo['fan_ids'])
        DAO_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True)
        data_process_logger.info('No.%s %s-%s' % (count, userinfo['userId'], userinfo['nickname']))
        count += 1
    print 'done'

def prepare_song_dict(tag=''):
    """
    Iterate over the playlists in the database and prepare the training data for song2vec.
    Args:
        tag: tag string appended to the output file names
    Returns:
        the gensim Dictionary built from the song-name sequences
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    # same filter is used for the count and for the actual cursor
    query = {'trackCount': {'$gte': 3, '$lte': 1000}, 'playCount': {'$gte': 1}}
    projection = {'tracks': 1, 'name': 1}
    print playlist_dao_inst.db_inst.find(query, projection).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(query, projection).limit(100000)
    # combine the song names of each playlist into one sequence per playlist
    total_song_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # keep the song-name sequence of this playlist
        song_seq = []
        for song in item['tracks']:
            sname = song['name']
            song_seq.append(sname.lower())
        total_song_set.append(song_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    song_dictionary = corpora.Dictionary(total_song_set)
    print 'playlist count', song_dictionary.num_docs
    print 'song count', song_dictionary.num_pos
    data_process_logger.info('start saving datas')
    song_dictionary.save('../datas/song_dictionary_%s.dict' % tag)
    with open('../datas/songs_seq_%s.dat' % tag, 'wb') as fout:
        pickle.dump(total_song_set, fout)
    return song_dictionary

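# The sequences saved by prepare_song_dict can be fed directly into gensim's Word2Vec
# to train the "song2vec" embeddings the docstring refers to. The sketch below is an
# assumption, not part of the original pipeline: the function name, the output path and
# the hyper-parameters (size/window/min_count/workers) are illustrative only. Note that
# `size` is the pre-gensim-4.0 parameter name (renamed `vector_size` in gensim 4).
def train_song2vec_sketch(tag=''):
    from gensim.models import Word2Vec
    with open('../datas/songs_seq_%s.dat' % tag, 'rb') as fin:
        song_seqs = pickle.load(fin)
    # each playlist is treated as one "sentence" of lower-cased song names
    model = Word2Vec(song_seqs, size=100, window=5, min_count=3, workers=4)
    model.save('../datas/song2vec_%s.model' % tag)
    return model
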
def get_user_playlist(uid):
    """
    Fetch and cache the playlists of a single user
    (same per-user logic as one iteration of fetch_playlist).
    Args:
        uid: the userId whose playlists should be fetched
    Returns:
    """
    user_dao_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    song_dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    userinfo = user_dao_inst.db_inst.find_one({"userId": uid})
    # fetch playlist ids
    user_playlists = user_playlist(uid, limit=2000)
    data_process_logger.info(
        'processing the playlist of %s\nTotal playlist = %s' % (userinfo['nickname'], len(user_playlists)))
    if len(user_playlists):
        for i in range(len(user_playlists)):
            pl_info = user_playlists[i]
            data_process_logger.info(
                'processing %s No.%s playlist: %s, total song: %s' % (
                    userinfo['nickname'], i, pl_info['name'], pl_info['trackCount']))
            # fetch playlist details
            # first check whether the playlist is already in the database
            pl_obj = playlist_dao_inst.db_inst.find_one({'id': pl_info['id']})
            if not pl_obj:
                try:
                    pl_obj = playlist_detail(pl_info['id'])
                    pl_song_ids = []
                    if pl_obj != -1:
                        for song in pl_obj['tracks']:
                            song_dao_inst.save_unique_item(song, primary_key='id')
                            pl_song_ids.append(song['id'])
                        # store only the track ids in the playlist document
                        user_playlists[i]['tracks_ids'] = pl_song_ids
                        pl_obj['tracks_ids'] = pl_song_ids
                        playlist_dao_inst.save_unique_item(pl_obj, primary_key='id', is_inform=True)
                        slp = random.random() * 2 + 1
                        time.sleep(slp)
                    else:
                        data_process_logger.error('cannot fetch %s %s' % (pl_info['id'], pl_info['name']))
                except Exception, e:
                    print e
            else:
                user_playlists[i]['tracks_ids'] = pl_obj['tracks_ids']

def fill_song_comments():
    """
    Fill in the comment details for songs that do not have them yet.
    Returns:
    """
    dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    find_result = dao_inst.db_inst.find({'commentInfo': {'$exists': False}})
    count = 0
    for song_item in find_result:
        comm_data = song_comments(song_item['commentThreadId'], limit=10)
        if comm_data:
            # only store the comment details if they were fetched correctly
            del comm_data['code']
            song_item['commentInfo'] = comm_data
            song_item['commentCount'] = comm_data['total']
            dao_inst.db_inst.save(song_item)
            data_process_logger.info(
                'No.%s %s, comments: %s done' % (count, song_item['name'], song_item['commentCount']))
            count += 1
            slp = random.random() * 2 + 1
            data_process_logger.info('sleep %s sec' % slp)
            time.sleep(slp)

def prepare_artist_dict(tag=''):
    """
    Iterate over the playlists in the database and prepare the artist-name training sequences
    (the artist counterpart of prepare_song_dict).
    Args:
        tag: tag string appended to the output file names
    Returns:
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 10, '$lte': 600}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # combine the artist names of each playlist into one sequence per playlist
    total_artists_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # keep the artist-name sequence of this playlist
        artists_seq = []
        for song in item['tracks']:
            sname = song['artists'][0]['name']
            artists_seq.append(sname.lower())
        total_artists_set.append(artists_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    artist_dictionary = corpora.Dictionary(total_artists_set)
    print 'playlist count', artist_dictionary.num_docs
    try:
        print 'artist count', len(artist_dictionary.token2id)
    except Exception, e:
        print 'error = %s' % e

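# prepare_artist_dict stops after printing the counts, whereas prepare_song_dict also
# persists its results. The sketch below shows the matching save step; it is an
# assumption modelled on prepare_song_dict, and the function name and file names are
# hypothetical, not taken from this project.
def save_artist_dict_sketch(artist_dictionary, total_artists_set, tag=''):
    data_process_logger.info('start saving datas')
    artist_dictionary.save('../datas/artist_dictionary_%s.dict' % tag)
    with open('../datas/artists_seq_%s.dat' % tag, 'wb') as fout:
        pickle.dump(total_artists_set, fout)
    return artist_dictionary
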
def fetch_user_networks(start_id=None, max_user_count=5000):
    """
    Crawl user profiles by walking the follow/fan network from a seed user.
    Args:
        max_user_count: maximum number of users to crawl in this run
        start_id: seed user id; if not given, an arbitrary one is taken from the database
    Returns:
    """
    db_userinfo = get_db_inst('MusicTaster', 'UserInfos')
    DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    if not start_id:
        start_id = db_userinfo.find_one()['userId']
    idlist = set()
    idlist.add(start_id)
    # seed the frontier with the follows of the start user
    cur_id = start_id
    followlist = user_follows(cur_id)
    for i in followlist:
        idlist.add(i['userId'])
    user_count = 0
    while len(idlist) > 0 and user_count < max_user_count and cur_id:
        if db_userinfo.find({'userId': cur_id}).count() != 0:
            # user already stored, skip it
            data_process_logger.info('[SKIP] No.%s User %s skip!' % (user_count, cur_id))
            user_count += 1
            cur_id = idlist.pop()
            continue
        u_profile = user_profile(cur_id)
        followlist = user_follows(cur_id)
        fanlist = user_fans(cur_id)
        u_profile['follows'] = followlist
        u_profile['fans'] = fanlist
        followids = []
        fanids = []
        for userinfo in followlist:
            int_id = userinfo['userId']
            followids.append(int_id)
            idlist.add(int_id)
        for userinfo in fanlist:
            int_id = userinfo['userId']
            fanids.append(int_id)
            idlist.add(int_id)
        u_profile['follow_ids'] = followids
        u_profile['follow_count'] = len(followids)
        u_profile['fan_ids'] = fanids
        u_profile['fan_count'] = len(fanids)
        DAO_inst.save_unique_item(u_profile)
        data_process_logger.info('[OK] No.%s User %s, nickname = %s ok! %s users left' % (
            user_count, cur_id, u_profile['nickname'], len(idlist)))
        slp = random.random() * 2 + 1
        data_process_logger.info('sleep %s sec' % slp)
        time.sleep(slp)
        cur_id = idlist.pop()
        user_count += 1
    print 'done'

def fetch_playlist(max_user_count=100):
    """
    Crawl users' playlists, updating the UserInfos, SongInfos and Playlists collections.
    Args:
        max_user_count: maximum number of users to process
    Returns:
        None
    """
    user_dao_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    song_dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    userid_list = user_dao_inst.db_inst.find({"playlists": {'$exists': False}}).distinct('userId')
    count = 0
    for uid in userid_list[:max_user_count]:
        userinfo = user_dao_inst.db_inst.find_one({"userId": uid})
        # fetch playlist ids
        user_playlists = user_playlist(uid, limit=2000)
        data_process_logger.info(
            'processing the playlist of %s\nTotal playlist = %s' % (userinfo['nickname'], len(user_playlists)))
        if len(user_playlists):
            for i in range(len(user_playlists)):
                pl_info = user_playlists[i]
                data_process_logger.info(
                    'processing %s No.%s playlist: %s, total song: %s' % (
                        userinfo['nickname'], i, pl_info['name'], pl_info['trackCount']))
                # fetch playlist details
                # first check whether the playlist is already in the database
                pl_obj = playlist_dao_inst.db_inst.find_one({'id': pl_info['id']})
                if not pl_obj:
                    try:
                        pl_obj = playlist_detail(pl_info['id'])
                        pl_song_ids = []
                        if pl_obj != -1:
                            for song in pl_obj['tracks']:
                                song_dao_inst.save_unique_item(song, primary_key='id')
                                pl_song_ids.append(song['id'])
                            # store only the track ids in the playlist document
                            user_playlists[i]['tracks_ids'] = pl_song_ids
                            pl_obj['tracks_ids'] = pl_song_ids
                            playlist_dao_inst.save_unique_item(pl_obj, primary_key='id', is_inform=True)
                            slp = random.random() * 2 + 1
                            time.sleep(slp)
                        else:
                            data_process_logger.error('cannot fetch %s %s' % (pl_info['id'], pl_info['name']))
                    except Exception, e:
                        print e
                else:
                    user_playlists[i]['tracks_ids'] = pl_obj['tracks_ids']
        # store the playlist info back into the user document
        userinfo['playlists'] = user_playlists
        user_dao_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True, is_inform=True)
        data_process_logger.info('No.%s %s playlist handled!' % (count, userinfo['nickname']))
        slp = random.random() * 2 + 1
        data_process_logger.info('sleep %s sec' % slp)
        time.sleep(slp)
        count += 1

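# A minimal sketch of how these steps could be chained when running the module directly.
# This __main__ block is an assumption, not part of the original module; the argument
# values and the tag are arbitrary examples.
if __name__ == '__main__':
    fetch_user_networks(max_user_count=1000)  # crawl user profiles via follow/fan links
    fetch_playlist(max_user_count=100)        # crawl the playlists of the stored users
    fill_song_comments()                      # backfill comment details for stored songs
    prepare_song_dict(tag='demo')             # build and save the song-name training data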