Esempio n. 1
0
def update_userinfo():
    """
    One-off maintenance script that refreshes the stored follow/fan counts.

    For every distinct ``userId`` in the ``UserInfos`` collection the user
    document is loaded, ``follow_count`` and ``fan_count`` are recomputed from
    the lengths of ``follow_ids`` and ``fan_ids``, and the document is written
    back through ``save_unique_item`` with overwrite enabled.

    Returns:
        None
    """
    user_dao = CloudMusicDAO('MusicTaster', 'UserInfos')
    all_user_ids = user_dao.db_inst.distinct('userId')
    for position, user_id in enumerate(all_user_ids):
        user_doc = user_dao.db_inst.find_one({'userId': user_id})
        # The counts are derived purely from the id lists already stored on the document.
        user_doc['follow_count'] = len(user_doc['follow_ids'])
        user_doc['fan_count'] = len(user_doc['fan_ids'])
        user_dao.save_unique_item(user_doc, primary_key='userId', is_overwrite=True)
        progress_line = 'No.%s %s-%s' % (position, user_doc['userId'], user_doc['nickname'])
        data_process_logger.info(progress_line)
    print('done')
Esempio n. 2
0
def prepare_song_dict(tag=''):
    """
    Walk the stored playlists and prepare the training data for song2vec.

    Playlists with 3 to 1000 tracks and a play count of at least 1 are read
    from the ``Playlists`` collection (the cursor is limited to 100000
    documents). Each playlist contributes one sequence made of the lower-cased
    names of its tracks. A ``corpora.Dictionary`` is built from all sequences;
    the dictionary and the raw sequences are then written under ``../datas/``.

    Args:
        tag: Label inserted into the two output file names.

    Returns:
        The ``corpora.Dictionary`` built from the song-name sequences.
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    # Defined once so the count query and the fetch query cannot drift apart.
    playlist_filter = {
        'trackCount': {'$gte': 3, '$lte': 1000},
        'playCount': {'$gte': 1},
    }
    wanted_fields = {'tracks': 1, 'name': 1}
    print(playlist_dao_inst.db_inst.find(playlist_filter, wanted_fields).limit(100000).count())
    find_result = playlist_dao_inst.db_inst.find(playlist_filter, wanted_fields).limit(100000)
    # Turn every playlist into a sequence of (lower-cased) song names.
    total_song_set = []
    for count, item in enumerate(find_result):
        data_process_logger.info('No.%s %s' % (count, item['name']))
        total_song_set.append([song['name'].lower() for song in item['tracks']])
    data_process_logger.info('start building dictionary')
    song_dictionary = corpora.Dictionary(total_song_set)
    # Single-argument print calls produce the same output under Python 2 and 3.
    print(u'%s %s' % (u'歌单数', song_dictionary.num_docs))
    # NOTE(review): the label reads "number of songs", but gensim documents
    # num_pos as the total number of processed tokens; confirm whether the
    # distinct count (len(song_dictionary)) was intended.
    print(u'%s %s' % (u'歌曲数', song_dictionary.num_pos))
    data_process_logger.info('start saving datas')
    song_dictionary.save('../datas/song_dictionary_%s.dict' % tag)
    # Use a context manager so the output file is flushed and closed even if
    # pickling fails part-way through.
    with open('../datas/songs_seq_%s.dat' % tag, 'wb') as seq_file:
        pickle.dump(total_song_set, seq_file)
    return song_dictionary
Esempio n. 3
0
def get_user_playlist(uid):
    """
    Fetch one user's playlists and make sure each playlist and its songs are stored.

    The playlist list is requested with ``user_playlist``. For every playlist
    the ``Playlists`` collection is checked first. When the playlist is not
    stored yet, its details are requested with ``playlist_detail``, every track
    is saved to ``SongInfos`` and the playlist is saved to ``Playlists``
    together with the list of its track ids. In both cases the track ids are
    attached to the in-memory playlist entry under ``tracks_ids``.

    Args:
        uid: ``userId`` of the user whose playlists are processed.

    Returns:
        None
    """
    user_dao_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    song_dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    userinfo = user_dao_inst.db_inst.find_one({"userId": uid})
    # Ask the remote API for the user's playlist summaries.
    user_playlists = user_playlist(uid, limit=2000)
    data_process_logger.info(
        'processing the playlist of %s\nTotal playlist = %s' % (userinfo['nickname'], len(user_playlists)))
    for index, summary in enumerate(user_playlists):
        data_process_logger.info(
            'processing %s No.%s playlist: %s, total song: %s' % (
                userinfo['nickname'], index, summary['name'], summary['trackCount']))
        # Look in the database first; only call the API for unknown playlists.
        stored = playlist_dao_inst.db_inst.find_one({'id': summary['id']})
        if stored:
            summary['tracks_ids'] = stored['tracks_ids']
            continue
        try:
            detail = playlist_detail(summary['id'])
            track_ids = []
            if detail == -1:
                data_process_logger.error('cannot fetch %s %s' % (summary['id'], summary['name']))
            else:
                for track in detail['tracks']:
                    song_dao_inst.save_unique_item(track, primary_key='id')
                    track_ids.append(track['id'])
                # Only the track ids are kept alongside the playlist.
                summary['tracks_ids'] = track_ids
                detail['tracks_ids'] = track_ids
                playlist_dao_inst.save_unique_item(detail, primary_key='id', is_inform=True)
                # Random 1-3 second pause between playlist detail requests.
                pause = random.random() * 2 + 1
                time.sleep(pause)
        except Exception as e:
            print(e)
Esempio n. 4
0
def fill_song_comments():
    """
    Fill in the comment details of songs that do not have them yet.

    Every document in ``SongInfos`` without a ``commentInfo`` field is
    processed: its comments are requested with ``song_comments`` (limit 10).
    When a truthy result comes back, the ``code`` key is removed, the result is
    stored as ``commentInfo``, its ``total`` is copied to ``commentCount`` and
    the document is saved. When the request yields nothing, the document is
    left untouched and an error is logged. The function pauses for a random
    1-3 seconds after every song.

    Returns:
        None
    """
    dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    find_result = dao_inst.db_inst.find({'commentInfo': {'$exists': False}})
    for count, song_item in enumerate(find_result):
        comm_data = song_comments(song_item['commentThreadId'], limit=10)
        if comm_data:  # only persist when the comment details were actually read
            del comm_data['code']
            song_item['commentInfo'] = comm_data
            song_item['commentCount'] = comm_data['total']
            dao_inst.db_inst.save(song_item)
            # The success log lives in this branch because 'commentCount' is
            # only set by this function once a fetch has succeeded; reading it
            # after a failed fetch could raise KeyError and abort the run.
            data_process_logger.info(
                'No.%s %s, comments: %s done' % (count, song_item['name'], song_item['commentCount']))
        else:
            # Nothing changed on the document, so there is nothing to save.
            data_process_logger.error(
                'No.%s %s, cannot fetch comments' % (count, song_item['name']))
        slp = random.random() * 2 + 1
        data_process_logger.info('sleep %s sec' % slp)
        time.sleep(slp)
Esempio n. 5
0
def prepare_artist_dict(tag=''):
    """
    Build a dictionary of artist names from the stored playlists and print its size.

    Playlists with 10 to 600 tracks and a play count of at least 5 are read
    from the ``Playlists`` collection (the cursor is limited to 100000
    documents). Each playlist contributes one sequence made of the lower-cased
    name of the first artist of every track. A ``corpora.Dictionary`` is built
    from the sequences and its document count and token count are printed.

    Args:
        tag: Not used by this function body.

    Returns:
        None
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    selection = {
        'trackCount': {'$gte': 10, '$lte': 600},
        'playCount': {'$gte': 5},
    }
    fields = {'tracks': 1, 'name': 1}
    cursor = playlist_dao_inst.db_inst.find(selection, fields).limit(100000)
    # One artist-name sequence per playlist.
    total_artists_set = []
    for index, playlist in enumerate(cursor):
        data_process_logger.info('No.%s %s' % (index, playlist['name']))
        # Only the first listed artist of each track is used.
        first_artists = [track['artists'][0]['name'].lower() for track in playlist['tracks']]
        total_artists_set.append(first_artists)
    data_process_logger.info('start building dictionary')
    artist_dictionary = corpora.Dictionary(total_artists_set)
    print(u'%s %s' % (u'歌单数', artist_dictionary.num_docs))
    try:
        print(u'%s %s' % (u'歌手数', len(artist_dictionary.token2id)))
    except Exception as e:
        print('error = %s' % e)
Esempio n. 6
0
def fetch_user_networks(start_id=None, max_user_count=5000):
    """
    Crawl user profiles by walking the follow/fan graph from an entry user.

    Starting at ``start_id``, each visited user that is not yet present in the
    ``UserInfos`` collection has their profile, follow list and fan list
    fetched; the profile is stored together with the id lists and their
    counts. All followed and following user ids are added to the pool of
    pending ids. Users already present in the collection are skipped but still
    count towards ``max_user_count``.

    Args:
        start_id: Entry user id. When falsy, the ``userId`` of an arbitrary
            document returned by ``find_one()`` on ``UserInfos`` is used.
        max_user_count: Maximum number of users visited (stored or skipped)
            in this run.

    Returns:
        None
    """
    db_userinfo = get_db_inst('MusicTaster', 'UserInfos')
    DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    if not start_id:
        start_id = db_userinfo.find_one()['userId']
    # Pool of ids still to visit. It is seeded with the entry user and the
    # users they follow, so the crawl can expand even when the entry user is
    # already stored and therefore skipped below.
    idlist = set()
    idlist.add(start_id)
    cur_id = start_id
    idlist.update(followed['userId'] for followed in user_follows(cur_id))
    user_count = 0
    # The loop is driven by cur_id rather than by the pool size: the next id
    # is popped at the end of each pass, so testing the pool here would drop
    # the final popped id without ever processing it.
    while cur_id and user_count < max_user_count:
        if db_userinfo.find({'userId': cur_id}).count() != 0:
            data_process_logger.info('[SKIP] No.%s User %s skip!' % (user_count, cur_id))
        else:
            u_profile = user_profile(cur_id)
            followlist = user_follows(cur_id)
            fanlist = user_fans(cur_id)
            u_profile['follows'] = followlist
            u_profile['fans'] = fanlist
            followids = [followed['userId'] for followed in followlist]
            fanids = [fan['userId'] for fan in fanlist]
            # Every neighbour becomes a candidate for a later visit.
            idlist.update(followids)
            idlist.update(fanids)
            u_profile['follow_ids'] = followids
            u_profile['follow_count'] = len(followids)
            u_profile['fan_ids'] = fanids
            u_profile['fan_count'] = len(fanids)
            DAO_inst.save_unique_item(u_profile)
            data_process_logger.info('[OK] No.%s User %s, nickname = %s ok! %s users left' % (
                user_count, cur_id, u_profile['nickname'], len(idlist)))
            # Random 1-3 second pause between users that required API calls.
            slp = random.random() * 2 + 1
            data_process_logger.info('sleep %s sec' % slp)
            time.sleep(slp)
        user_count += 1
        # set.pop() raises KeyError on an empty set, so stop cleanly instead.
        cur_id = idlist.pop() if idlist else None
    print('done')
Esempio n. 7
0
def fetch_playlist(max_user_count=100):
    """
    Crawl the playlists of stored users, updating UserInfos, SongInfos and Playlists.

    Users whose ``UserInfos`` document has no ``playlists`` field are selected
    and at most ``max_user_count`` of them are processed. For each user the
    playlist list is requested with ``user_playlist``. Playlists not yet
    present in ``Playlists`` are fetched with ``playlist_detail``, their tracks
    are saved to ``SongInfos`` and the playlist itself is saved with its list
    of track ids. Finally the playlist list is stored on the user document.

    Args:
        max_user_count: Maximum number of users to process.

    Returns:
        None
    """
    user_dao_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    song_dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    pending_users = user_dao_inst.db_inst.find({"playlists": {'$exists': False}})
    userid_list = pending_users.distinct('userId')
    for count, uid in enumerate(userid_list[:max_user_count]):
        userinfo = user_dao_inst.db_inst.find_one({"userId": uid})
        # Ask the remote API for the user's playlist summaries.
        user_playlists = user_playlist(uid, limit=2000)
        data_process_logger.info(
            'processing the playlist of %s\nTotal playlist = %s' % (userinfo['nickname'], len(user_playlists)))
        for index, summary in enumerate(user_playlists):
            data_process_logger.info(
                'processing %s No.%s playlist: %s, total song: %s' % (
                    userinfo['nickname'], index, summary['name'], summary['trackCount']))
            # Look in the database first; only call the API for unknown playlists.
            stored = playlist_dao_inst.db_inst.find_one({'id': summary['id']})
            if stored:
                summary['tracks_ids'] = stored['tracks_ids']
                continue
            try:
                detail = playlist_detail(summary['id'])
                track_ids = []
                if detail == -1:
                    data_process_logger.error('cannot fetch %s %s' % (summary['id'], summary['name']))
                else:
                    for track in detail['tracks']:
                        song_dao_inst.save_unique_item(track, primary_key='id')
                        track_ids.append(track['id'])
                    # Only the track ids are kept alongside the playlist.
                    summary['tracks_ids'] = track_ids
                    detail['tracks_ids'] = track_ids
                    playlist_dao_inst.save_unique_item(detail, primary_key='id', is_inform=True)
                    # Random 1-3 second pause between playlist detail requests.
                    pause = random.random() * 2 + 1
                    time.sleep(pause)
            except Exception as e:
                print(e)

        # Store the playlist list on the user document.
        userinfo['playlists'] = user_playlists
        user_dao_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True, is_inform=True)
        data_process_logger.info('No.%s %s playlist handled!' % (count, userinfo['nickname']))
        user_pause = random.random() * 2 + 1
        data_process_logger.info('sleep %s sec' % user_pause)
        time.sleep(user_pause)