Esempio n. 1
0
    def test_get_home_page_right(self):
        """
        Verify that the right-hand side of a home page (the weibo-post
        section) parses correctly for both enterprise and personal pages.
        """
        from page_parse import home
        # Same fixtures, same order as before: enterprise page first.
        for fixture in ('tests/enterprisehome.html', 'tests/personhome.html'):
            with open(fixture, encoding='utf-8') as fp:
                page_source = fp.read()
            parsed_posts = home.get_wbdata_fromweb(page_source)
            self.assertEqual(len(parsed_posts), 15)
Esempio n. 2
0
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of user *uid*, page by page.

    Walks pages up to the configured maximum, stores each page's posts,
    and fans the two ajax sub-pages of every page out to the
    'ajax_home_crawler' queue. Stops early when a page yields no data or
    when the real page count (learned on page 1) is below the limit.
    """
    page_limit = get_max_home_page()
    page_no = 1
    while page_no <= page_limit:
        page_html = get_page(home_url.format(uid, page_no))
        posts = get_wbdata_fromweb(page_html)

        if not posts:
            crawler.warning('用户id为{}的用户主页微博数据未采集成功,请检查原因'.format(uid))
            return

        insert_weibo_datas(posts)

        user_domain = public.get_userdomain(page_html)
        first_ajax = ajax_url.format(user_domain, 0, user_domain, uid, page_no, page_no)
        second_ajax = ajax_url.format(user_domain, 1, user_domain, uid, page_no, page_no)

        if page_no == 1:
            # Synchronous local call so the real page count is known up front.
            total_page = get_total_page(crawl_ajax_page(second_ajax))

        if total_page < page_limit:
            page_limit = total_page

        page_no += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(first_ajax,),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(second_ajax,),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
Esempio n. 3
0
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of user *uid* and mark the seed done.

    Pages are fetched sequentially; each page's posts are stored and its two
    timestamped ajax sub-pages are dispatched to the 'ajax_home_crawler'
    queue. The loop shrinks its limit to the real page count discovered on
    page 1, and the seed is flagged as crawled once all pages are visited.
    """
    page_limit = get_max_home_page()
    page_no = 1
    while page_no <= page_limit:
        page_html = get_page(home_url.format(uid, page_no))
        posts = get_wbdata_fromweb(page_html)

        if not posts:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(posts)

        user_domain = public.get_userdomain(page_html)
        now_ms = int(time.time() * 1000)
        first_ajax = ajax_url.format(user_domain, 0, user_domain, uid,
                                     page_no, page_no, now_ms)
        second_ajax = ajax_url.format(user_domain, 1, user_domain, uid,
                                      page_no, page_no, now_ms + 100)

        if page_no == 1:
            # Synchronous local call to learn the real total page count.
            total_page = get_total_page(crawl_ajax_page(second_ajax))

        if total_page < page_limit:
            page_limit = total_page

        page_no += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(first_ajax,),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(second_ajax,),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
Esempio n. 4
0
    def test_get_home_page_right(self):
        """
        Verify that the right-hand side of a home page (the weibo-post
        section) parses correctly, fetching fixtures from the test server.
        """
        from page_parse import home
        # Same fixtures, same order as before: enterprise page first.
        for page_name in ('enterprisehome.html', 'personhome.html'):
            response = requests.get(TEST_SERVER + page_name)
            response.encoding = 'utf-8'
            parsed_posts = home.get_wbdata_fromweb(response.text)
            self.assertEqual(len(parsed_posts), 15)
Esempio n. 5
0
    def test_get_home_page_right(self):
        """
        Verify that the right-hand side of a home page (the weibo-post
        section) parses correctly for both fixture pages on the test server.
        """
        from page_parse import home

        def fetch(name):
            # Fetch one fixture page and return its UTF-8 decoded body.
            resp = requests.get(TEST_SERVER + name)
            resp.encoding = 'utf-8'
            return resp.text

        self.assertEqual(len(home.get_wbdata_fromweb(fetch('enterprisehome.html'))), 15)
        self.assertEqual(len(home.get_wbdata_fromweb(fetch('personhome.html'))), 15)
Esempio n. 6
0
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of user *uid* and mark the seed done.

    Each page's posts are stored, and the page's two ajax sub-pages
    (built from the page id plus a millisecond timestamp) are dispatched
    to the 'ajax_home_crawler' queue. The loop limit shrinks to the real
    page count discovered on page 1; the seed is flagged once all pages
    have been visited.
    """
    page_limit = get_max_home_page()
    page_no = 1
    while page_no <= page_limit:
        page_html = get_page(home_url.format(uid, page_no))
        posts = get_wbdata_fromweb(page_html)

        if not posts:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(posts)

        user_domain = public.get_userdomain(page_html)
        pid = public.get_pageid(page_html)
        now_ms = int(time.time() * 1000)
        first_ajax = ajax_url.format(user_domain, 0, pid, uid,
                                     page_no, page_no, now_ms)
        second_ajax = ajax_url.format(user_domain, 1, pid, uid,
                                      page_no, page_no, now_ms + 100)

        if page_no == 1:
            # Synchronous local call to learn the real total page count.
            total_page = get_total_page(crawl_ajax_page(second_ajax))

        if total_page < page_limit:
            page_limit = total_page

        page_no += 1
        for target_url in (first_ajax, second_ajax):
            app.send_task('tasks.home.crawl_ajax_page',
                          args=(target_url, ),
                          queue='ajax_home_crawler',
                          routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
Esempio n. 7
0
def crawl_weibo_datas(uid):
    """Crawl weibo posts and photos from the home page of user *uid*.

    Only personal accounts are crawled: for any other domain type the seed
    is flagged with code 2 and the task stops. Crawling is hard-capped at
    10 pages (~45 posts per page, so roughly 450 posts) to avoid spending
    too long on photo-heavy accounts. Posts and photo records are stored,
    ajax sub-pages are queued, and the seed is flagged once every page has
    been visited.
    """
    page_limit = get_max_home_page()
    page_no = 1

    # Hard cap on pages crawled per account: some accounts have thousands
    # of photos, and ~10 pages is a reasonable trade-off.
    hard_cap = 10

    while page_no <= page_limit:
        if page_no > hard_cap:
            break

        page_html = get_page(home_url.format(uid, page_no))

        user_domain = public.get_userdomain(page_html)
        # Only personal users are crawled; skip government/organisation
        # accounts and flag the seed accordingly.
        if user_domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return

        posts, photo_records = get_wbdata_fromweb(page_html)

        if not posts:
            crawler.warning('用户id为{}的用户主页微博数据未采集成功,请检查原因'.format(uid))
            return

        insert_weibo_datas(posts)

        # Photo records are optional; store them only when present.
        if photo_records:
            insert_weibo_pics(photo_records)

        now_ms = int(time.time() * 1000)
        first_ajax = ajax_url.format(user_domain, 0, user_domain, uid,
                                     page_no, page_no, now_ms)
        second_ajax = ajax_url.format(user_domain, 1, user_domain, uid,
                                      page_no, page_no, now_ms + 100)

        if page_no == 1:
            # Synchronous local call to learn the real total page count.
            total_page = get_total_page(crawl_ajax_page(second_ajax))

        if total_page < page_limit:
            page_limit = total_page

        page_no += 1
        for target_url in (first_ajax, second_ajax):
            app.send_task('tasks.home.crawl_ajax_page',
                          args=(target_url, ),
                          queue='ajax_home_crawler',
                          routing_key='ajax_home_info')

    # Flag the seed after the loop: it means every page was visited, not
    # that every page succeeded. May later be moved into a callback.
    set_seed_home_crawled(uid)