Example #1
    def test_get_url_from_web(self):
        from page_get import user as user_get
        normal_user = user_get.get_profile('1195908387')
        self.assertEqual(normal_user.name, '日_推')
        writer = user_get.get_profile('1191258123')
        self.assertEqual(writer.description, '韩寒')
        enterprise_user = user_get.get_profile('1839256234')
        self.assertEqual(enterprise_user.level, 36)
Example #2
    def test_get_url_from_web(self):
        """
        test crawling different kind of users
        """
        from page_get import user as user_get

        normal_user, _ = user_get.get_profile('1195908387')
        self.assertEqual(normal_user.name, '日_推')
        writer, _ = user_get.get_profile('1191258123')
        self.assertEqual(writer.description, '韩寒')
        enterprise_user, _ = user_get.get_profile('1839256234')
        self.assertEqual(enterprise_user.level, 36)
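
Note that this version unpacks the return value of get_profile into two names, so it presumably returns a (user, is_crawled) tuple rather than a bare user object as in Example #1. A minimal usage sketch under that assumption, reusing a uid from the test above:

# Minimal usage sketch, assuming page_get.user.get_profile returns a
# (user, is_crawled) tuple as the unpacking in the test above suggests.
from page_get import user as user_get

user, is_crawled = user_get.get_profile('1195908387')
if user is not None and not is_crawled:
    # attribute names taken from the assertions in the tests above
    print(user.name, user.description, user.level)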
Example #3
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = user_get.get_profile(uid)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # when it comes to errors, set the args to default(root)
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
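
The two-step limit adjustment above (cap plus one, then clamp to total_page plus one) is equivalent to a min(); a small standalone sketch of that equivalence, with both inputs stubbed since they normally come from the crawler's config and the parsed first page:

# Standalone sketch of the page-limit logic above; both inputs are stubbed
# here (in the real code they come from get_max_repost_page() and
# repost.get_total_page()), so the values are illustrative only.
configured_cap = 50      # stand-in for get_max_repost_page()
total_page = 7           # stand-in for repost.get_total_page(first_repost_data[0])

limit = min(configured_cap, total_page) + 1
print(list(range(2, limit)))   # pages 2 .. min(cap, total_page), i.e. 2..7 here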
Example #4
def _get_current_source(url, wb_mid):
    """
    :param url: 当前微博url
    :param wb_mid: 当前微博mid
    :return: 转发数,微博用户id,用户名
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts = parse_status.get_repostcounts(html)
    comments = parse_status.get_commentcounts(html)

    # Update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid,
                                          reposts=reposts,
                                          comments=comments)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    comments_count = parse_status.get_commentcounts(html)
    reposts_count = parse_status.get_repostcounts(html)
    root_user = user.get_profile(user_id)
    # Save the original (root) weibo's related info
    spread_original_dao.save(root_user, wb_mid, post_time, device,
                             reposts_count, comments_count, root_url)

    crawler.info('This weibo has {counts} reposts'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
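
_get_current_source returns None for an empty or 404 page and a 3-tuple otherwise, so callers should guard before unpacking. A hedged calling sketch with placeholder arguments:

# Calling sketch with placeholder arguments; _get_current_source returns None
# for an empty or 404 page, otherwise (reposts_count, user_id, user_name).
result = _get_current_source('http://weibo.com/placeholder_status_url', 'placeholder_mid')
if result is not None:
    reposts_count, user_id, user_name = result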
Example #5
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    wb_data.set_weibo_repost_crawled(mid)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user = user_get.get_profile(uid)

    if total_page < limit:
        limit = total_page + 1
    # TODO: weigh whether the network call here is really necessary
    for page_num in range(2, limit):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
        #               routing_key='comment_page_info')
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    # Fill in the parent user_id to make visualization easier
    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # set them to the root user's uid and name
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
Example #6
def crawl_person_infos(uid):
    """
    根据用户id来爬取用户相关资料和用户的关注数和粉丝数(由于微博服务端限制,默认爬取前五页,企业号的关注和粉丝也不能查看)
    :param uid: 用户id
    :return: 
    """
    if not uid:
        return

    # The data table is shared with other tasks, so check whether the user is already in the database before crawling
    user = user_get.get_profile(uid)
    # Skip enterprise accounts
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    seed = get_seed_by_id(uid)
    if seed.other_crawled == 0:
        rs = user_get.get_fans_or_followers_ids(uid, 1)
        rs.extend(user_get.get_fans_or_followers_ids(uid, 2))
        datas = set(rs)
        # Duplicate data is skipped on insert
        if datas:
            insert_seeds(datas)
        set_seed_other_crawled(uid)
Example #7
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to limits on Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans',
                      args=(uid, domain),
                      queue='fans_followers',
                      routing_key='for_fans_followers')
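
app.send_task dispatches by task name, so the worker code defining tasks.user.crawl_follower_fans does not need to be importable from the crawler process. Below is a minimal Celery wiring sketch of that call; the app name, broker URL, and argument values are assumptions, only the task name, queue, and routing key come from the example above.

# Minimal Celery wiring sketch; the app name, broker URL and argument values
# are placeholders (sending requires a reachable broker). Only the task name,
# queue and routing_key come from the example above.
from celery import Celery

app = Celery('weibo_crawler', broker='redis://localhost:6379/0')

app.send_task('tasks.user.crawl_follower_fans',
              args=('1195908387', 'placeholder_domain'),   # (uid, domain)
              queue='fans_followers',
              routing_key='for_fans_followers')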
Example #8
def get_status_info(url, user_id, name, mid=''):
    soc = SpreadOtherCache()
    print('Current repost weibo url: ' + url)
    repost_cont = get_page(url)

    if not is_404(repost_cont):
        repost_user_id = parse_status.get_userid(repost_cont)
        if repost_user_id == '':
            return None

        repost_user_name = parse_status.get_username(repost_cont)
        soc.set_id(repost_user_id)
        soc.set_name(repost_user_name)

        so = SpreadOther()
        so.id = repost_user_id
        so.screen_name = repost_user_name
        so.upper_user_name = parse_status.get_upperusername(repost_cont, name)
        cur_user = user.get_profile(repost_user_id)
        try:
            so.province = cur_user.province
            so.city = cur_user.city
            so.location = cur_user.location
            so.description = cur_user.description
            so.domain_name = cur_user.domain_name
            so.blog_url = cur_user.blog_url
            so.gender = cur_user.gender
            so.headimg_url = cur_user.headimg_url
            so.followers_count = cur_user.followers_count
            so.friends_count = cur_user.friends_count
            so.status_count = cur_user.status_count
            so.verify_type = cur_user.verify_type
            so.verify_info = cur_user.verify_info
            so.register_time = cur_user.register_time

            if so.screen_name == name:
                so.id = user_id

            so.mid = parse_status.get_mid(repost_cont)
            so.status_post_time = parse_status.get_statustime(repost_cont)
            so.device = parse_status.get_statussource(repost_cont)
            if mid:
                so.original_status_id = mid
            else:
                so.original_status_id = parse_status.get_orignalmid(repost_cont)
            so.comments_count = parse_status.get_commentcounts(repost_cont)
            so.reposts_count = parse_status.get_repostcounts(repost_cont)
            so.like_count = parse_status.get_likecounts(repost_cont)
            so.status_url = url
        except AttributeError as e:
            # TODO: figure out what goes wrong here
            logging.info('Failed to parse {user_id}, error: {e}'.format(user_id=user_id, e=e))
            logging.info('Source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
            return None
        else:
            return SpreadOtherAndCache(so, soc)
    else:
        return None
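
The except AttributeError branch above effectively treats a failed or empty profile fetch as "log it and skip this repost". A tiny standalone illustration of the failure mode it guards against, using a None stand-in for cur_user:

# Standalone illustration of the failure the except AttributeError guards
# against: if the profile fetch comes back empty (None stand-in here),
# any attribute access raises AttributeError.
cur_user = None
try:
    province = cur_user.province
except AttributeError as e:
    print('skipping this repost, profile unavailable:', e)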
Example #9
    def test_get_user_from_web(self):
        from wblogin.login import get_session
        from page_get.user import get_profile
        from headers import headers

        user_id = '2674334272'
        sc = get_session()
        if sc:
            session = sc.get('session', '')

            if session:
                # data already in the database
                user = get_profile(user_id, session, headers)
                self.assertNotEqual(user.description, '')
                # data not yet in the database
                user2 = get_profile('3614046244', session, headers)
                self.assertEqual(user2.status_count, 35)
        else:
            raise Exception('simulated login failed')
Example #10
def crawl_person_infos(uid):
    """
    根据用户id来爬取用户相关资料和用户的关注数和粉丝数(由于微博服务端限制,默认爬取前五页,企业号的关注和粉丝也不能查看)
    :param uid: 用户id
    :return: 
    """
    if not uid:
        return

    # The data table is shared with other tasks, so check whether the user is already in the database before crawling
    user = user_get.get_profile(uid)
    # Skip enterprise accounts
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                  routing_key='for_fans_followers')
Example #11
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to limits on Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user, is_crawled = user_get.get_profile(uid)
    # If it's enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                      routing_key='for_fans_followers')
Example #12
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to limits on Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user = user_get.get_profile(uid)
    # If it's enterprise user, just skip it
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    celery.send_task('celery_tasks.weibo.user.crawl_follower_fans',
                     args=(uid, ),
                     queue='fans_followers',
                     routing_key='for_fans_followers')
Example #13
def _get_current_reposts(url, session, weibo_mid):
    """
    Revised main crawling routine. Weibo's rate limits are strict, so for now only the current weibo and its child reposts are crawled, not the source weibo.
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)

    # Update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid,
                                          reposts=reposts,
                                          comments=comments)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device,
                                 reposts_count, comments_count, root_url)

        crawler.info('This weibo has {counts} reposts'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)

            crawler.info('Repost info url for this request: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('Exception while parsing repost info as JSON from {url}, details: {why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error(
                            'Exception while parsing repost info as JSON from {url}, details: {why}'.format(
                                url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # ordering logic for the repost nodes
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(
                                repost_url, session, user_id, user_name,
                                headers, mid)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('Currently on page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                        else:
                            so.upper_user_id = user_id

                spread_others = list(set(spread_others))

                spread_other_dao.save(spread_others)
                crawler.info('Fetched {num} repost records in total; repost crawling for this weibo is done'.format(
                    num=len(spread_others)))
    else:
        crawler.info('{url} is a 404 page'.format(url=url))
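
The repost AJAX response above is read twice, once for data.page.totalpage and once for data.html. A small standalone sketch of that parsing against a mocked payload; the payload shape is inferred from the parsing code above, not from any official Weibo API documentation:

import json

# Mocked response body shaped the way the code above expects
# ({"data": {"page": {"totalpage": ...}, "html": ...}}); the shape is an
# inference from the parsing code, not an official API contract.
source = json.dumps({'data': {'page': {'totalpage': 3}, 'html': '<div>reposts...</div>'}})

repost_json = json.loads(source)
total_page = int(repost_json['data']['page']['totalpage'])
repost_html = repost_json['data']['html']
print(total_page, repost_html)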