Code Example #1
File: home.py Project: dittoyi/weibospider
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
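
The app.send_task calls above (and in the later examples) dispatch by task name rather than by importing the task function, so they assume a Celery worker that registers a task named 'tasks.home.crawl_ajax_page' and consumes the 'ajax_home_crawler' queue. A minimal sketch of that consumer-side wiring is shown below; the broker URL, queue declaration and task body are assumptions for illustration, not code from the project.

# Consumer-side sketch for the send_task calls above (assumptions noted inline).
from celery import Celery
from kombu import Queue

app = Celery('weibo_tasks', broker='redis://localhost:6379/0')  # assumed broker

# Bind the 'ajax_home_info' routing key to the 'ajax_home_crawler' queue,
# matching the queue/routing_key pair passed to send_task above.
app.conf.task_queues = (Queue('ajax_home_crawler', routing_key='ajax_home_info'),)

@app.task(name='tasks.home.crawl_ajax_page')
def crawl_ajax_page(url, auth_level=2):
    # Placeholder body: the real task fetches the AJAX page and stores the
    # parsed weibo data; only the task name and queue binding matter here.
    pass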
Code Example #2
File: home.py Project: sizeof1234/WeiboSpider
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user id {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
Code Example #3
    def test_get_total_home_page(self):
        """
        Test fetching the total number of home pages.
        :return:
        """
        from page_parse import home
        with open('tests/asyncpersonhome.html', encoding='utf-8') as f:
            html = f.read()
        num = home.get_total_page(html)
        self.assertEqual(num, 18)
Code Example #4
File: home.py Project: Doraying1230/Python-Study
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time given in spider.yaml
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        length_weibo_datas = len(weibo_datas)
        for i in range(0, len(weibo_datas)):
            weibo_time = time.mktime(
                time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
            if weibo_time < timeafter:
                weibo_datas = weibo_datas[0:i]
                break

        WbDataOper.add_all(weibo_datas)

        # If the weibo wasn't created after the given time, jump out of the loop
        if i != length_weibo_datas - 1:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
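
Code Examples #9 and #10 below replace the inline truncation loop above with a determine(weibo_datum, timeafter) predicate used in a list comprehension. Reconstructed from the comparison in this example, such a predicate might look like the sketch below; the project's actual helper may differ.

import time

def determine(weibo_datum, timeafter):
    # Keep a weibo only if it was created at or after `timeafter` (a Unix
    # timestamp); create_time is assumed to use the '%Y-%m-%d %H:%M' format,
    # as in the loop above.
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    return weibo_time >= timeafter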
Code Example #5
File: test_wbspider.py Project: zqy1/weibospider
    def test_get_total_home_page(self):
        """
        Test fetching the total number of home pages.
        :return: 
        """
        from page_parse import home

        url = TEST_SERVER + 'asyncpersonhome.html'
        resp = requests.get(url)
        resp.encoding = 'utf-8'
        html = resp.text
        num = home.get_total_page(html)
        self.assertEqual(num, 18)
Code Example #6
File: test_wbspider.py Project: dittoyi/weibospider
    def test_get_total_home_page(self):
        """
        Test fetching the total number of home pages.
        :return: 
        """
        from page_parse import home

        url = TEST_SERVER + 'asyncpersonhome.html'
        resp = requests.get(url)
        resp.encoding = 'utf-8'
        html = resp.text
        num = home.get_total_page(html)
        self.assertEqual(num, 18)
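
Both HTTP-based tests assume TEST_SERVER is the base URL of a server exposing fixture files such as asyncpersonhome.html. One way to provide that during a test run, sketched with the standard library only (the port and directory are assumptions; the project may serve its fixtures differently):

import threading
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler

# Serve the tests/ directory on localhost:8000 in a background thread so the
# fixture HTML pages are reachable over HTTP during the test run.
handler = partial(SimpleHTTPRequestHandler, directory='tests')
server = HTTPServer(('localhost', 8000), handler)
threading.Thread(target=server.serve_forever, daemon=True).start()

TEST_SERVER = 'http://localhost:8000/'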
Code Example #7
File: home.py Project: dougaoyang/weibospider
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
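
Unlike the earlier variants, this example builds the AJAX URLs from a page_id returned by public.get_pageid(html) instead of reusing the domain. A sketch of how such an extractor could work, assuming the home page embeds a $CONFIG JavaScript block the way weibo.com pages commonly do (the exact markup, and the project's real implementation, may differ):

import re

def get_pageid(html):
    # Look for a statement like  $CONFIG['page_id']='1005051234567890';
    # inside the embedded script block; return '' if it is absent.
    match = re.search(r"\$CONFIG\['page_id'\]='(\w+)'", html)
    return match.group(1) if match else ''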
Code Example #8
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        WbDataOper.add_all(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #9
File: home.py Project: 402730243/spider-weibo
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time given in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If some weibo wasn't created after the given time, jump out of the loop
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax of page 1 has already been crawled
            # in the code above and has been stored in the database,
            # we only have to crawl the first ajax of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #10
File: home.py Project: ResolveWang/WeiboSpider
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time given in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If some weibo wasn't created after the given time, jump out of the loop
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax of page 1 has already been crawled
            # in the code above and has been stored in the database,
            # we only have to crawl the first ajax of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #11
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1

    # Custom cap on the maximum number of pages to crawl
    max_page = 10
    # end

    while cur_page <= limit:

        # Some weibo accounts have as many as two or three thousand photos; crawling
        # them all wastes time, so for now we simply cap the crawl by the current page
        # number. Around 10 pages turned out to be a reasonable figure: the desktop
        # version of Weibo shows 45 weibos per page, so that is about 450 per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)

        domain = public.get_userdomain(html)
        # Only crawl photos from personal Weibo accounts; skip non-personal accounts
        # (government, organizations, etc.).
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user id {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # If weibo_pics is not empty, insert it into the database
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # Set the flag after iterating over all pages. Placing it here only means every page
    # has been visited; it does not guarantee the flag is set only after a fully successful
    # crawl. This may be optimized later, e.g. by setting it from a callback.
    set_seed_home_crawled(uid)
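
This variant also passes a second argument to set_seed_home_crawled (2 for skipped non-personal accounts, the default for a completed crawl), which suggests the helper writes a status code into the seed table. Purely as an illustration of that idea, and not the project's actual data layer, such a helper might look like the sketch below (the table name, column names and connection string are assumptions):

from sqlalchemy import create_engine, text

engine = create_engine('mysql+pymysql://user:password@localhost/weibo')  # assumed DSN

def set_seed_home_crawled(uid, status=1):
    # Mark the seed's home page as crawled; a status of 2 could flag
    # accounts that were skipped, as with the non-personal users above.
    with engine.begin() as conn:
        conn.execute(
            text('UPDATE seed_ids SET home_crawled = :status WHERE uid = :uid'),
            {'status': status, 'uid': uid},
        )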