Code Example #1
File: repost.py Project: cptBTptpwbct/weibo
def crawl_repost_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        WbDataOper.set_weibo_repost_crawled(mid)
    return html, repost_datas
Code Example #2
File: praise.py Project: 402730243/spider-weibo
def crawl_praise_page(mid):
    # Call locally here so that we get the result back immediately
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)

    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error(
            'fail to get praise page 2 ext_param, mid is {mid}'.format(
                mid=mid))
        return

    # Why we fall back to sequential local calls instead of app.send_task:
    # Weibo praise pages now require a max_id parameter, and a request
    # without it returns something different from what a normal browser gets.

    # should work after 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param mainly max_id will be updated each time and be used next time
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return
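The comments above describe sequential, ext_param-driven pagination: each request must carry the max_id obtained from the previous response. Below is a minimal sketch of the page-level helper that loop relies on, written as an illustration rather than the project's actual code; it assumes praise.get_praise_list returns the parsed items together with the ext_param for the next request, and PAGE_URL is a hypothetical url template taking mid and the previous ext_param (note that Code Example #12 shows an older variant that takes a page number instead).

def crawl_praise_by_page(mid, ext_param):
    # PAGE_URL (hypothetical) embeds mid and the previous ext_param, which mainly carries max_id
    cur_url = PAGE_URL.format(mid, ext_param)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    # assumption: the parser returns (items, ext_param_for_the_next_page)
    praise_data, next_ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    # hand the updated ext_param back so the caller can request the next page
    return html, praise_data, next_ext_param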
Code Example #3
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # Currently login is required; crawling page one without login may be added later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has already been stored in mysql,
        # we need not crawl the same keyword again in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)

            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
Code Example #4
File: repost.py Project: ResolveWang/WeiboSpider
def crawl_repost_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        WbDataOper.set_weibo_repost_crawled(mid)
    return html, repost_datas
Code Example #5
def crawl_comment_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    comment_datas = comment.get_comment_list(html, mid)
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
Code Example #6
File: dialogue.py Project: ResolveWang/WeiboSpider
def crawl_dialogue_by_comment_page(mid, page_num):
    comment_url = COMMENT_URL.format(mid, page_num)
    html = get_page(comment_url, auth_level=1, is_ajax=True)
    comment_ids = dialogue.get_comment_id(html, mid)
    for cid in comment_ids:
        crawl_dialogue_by_comment_id(cid, mid)

    if page_num == 1:
        WbDataOper.set_weibo_dialogue_crawled(mid)
    return html
Code Example #7
File: dialogue.py Project: lan1tian/weibospider
def crawl_dialogue_by_comment_page(mid, page_num):
    comment_url = COMMENT_URL.format(mid, page_num)
    html = get_page(comment_url, auth_level=1, is_ajax=True)
    comment_ids = dialogue.get_comment_id(html, mid)
    for cid in comment_ids:
        crawl_dialogue_by_comment_id(cid, mid)

    if page_num == 1:
        WbDataOper.set_weibo_dialogue_crawled(mid)
    return html
Code Example #8
def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]

    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(
            keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # Currently login is required; crawling page one without login may be added later
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break
            if not search_page:
                crawler.warning(
                    'No search result for keyword {}, the source page is {}'.
                    format(keyword, search_page))
                cur_page += 1
                continue
                # return

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has already been stored in mysql,
            # we don't need to crawl the same keyword again in this turn
            for wb_data in search_list:
                # print(wb_data)
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # todo incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(
                        wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos',
                                  args=(wb_data.uid, ),
                                  queue='user_crawler',
                                  routing_key='for_user_info')
Code Example #9
File: home.py Project: Doraying1230/Python-Study
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time configured in spider.yaml
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        length_weibo_datas = len(weibo_datas)
        for i in range(0, len(weibo_datas)):
            weibo_time = time.mktime(
                time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
            if weibo_time < timeafter:
                weibo_datas = weibo_datas[0:i]
                break

        WbDataOper.add_all(weibo_datas)

        # If the weibo isn't created after the given time, jump out of the loop
        if i != length_weibo_datas - 1:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #10
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdatas = get_ajax_data(ajax_html)
    if not ajax_wbdatas:
        return ''

    WbDataOper.add_all(ajax_wbdatas)
    return ajax_html
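A hedged usage sketch of the auth_level values described in the docstring above; ajax_url_0 and ajax_url_1 stand in for the AJAX urls built in the home-page examples elsewhere in this listing.

# auth_level=1: no login, but fake cookies are needed (used for the first home page)
html_0 = crawl_ajax_page(ajax_url_0, 1)
# auth_level=2: a logged-in session is required (used for the later pages)
html_1 = crawl_ajax_page(ajax_url_1, 2)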
Code Example #11
File: comment.py Project: cptBTptpwbct/weibo
def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # retry once and return here so the code below never sees undefined results
        return crawl_comment_by_page(mid, page_num, session)
    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas
Code Example #12
def crawl_praise_by_page(mid, page_num):
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "praise SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # retry once and return here so that praise_datas below is never undefined
        return crawl_praise_by_page(mid, page_num)
    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas
Code Example #13
    def test_weibodata_oper(self):
        db_session.execute("insert into {} ({}.weibo_id) values ('".format(
            weibo_data.name, weibo_data.name) + FAKE_ID + "')")
        assert WbDataOper.get_wb_by_mid(FAKE_ID) is not None
        assert len(WbDataOper.get_weibo_comment_not_crawled()) == 1
        assert len(WbDataOper.get_weibo_repost_not_crawled()) == 1

        WbDataOper.set_weibo_comment_crawled(FAKE_ID)
        WbDataOper.set_weibo_repost_crawled(FAKE_ID)

        assert len(WbDataOper.get_weibo_comment_not_crawled()) == 0
        assert len(WbDataOper.get_weibo_repost_not_crawled()) == 0
Code Example #14
    def test_weibodata_oper(self):
        db_session.execute("insert into {} ({}.weibo_id) values ('".format(weibo_data.name, weibo_data.name)
                           + FAKE_ID + "')")
        assert WbDataOper.get_wb_by_mid(FAKE_ID) is not None
        assert len(WbDataOper.get_weibo_comment_not_crawled()) == 1
        assert len(WbDataOper.get_weibo_repost_not_crawled()) == 1

        WbDataOper.set_weibo_comment_crawled(FAKE_ID)
        WbDataOper.set_weibo_repost_crawled(FAKE_ID)

        assert len(WbDataOper.get_weibo_comment_not_crawled()) == 0
        assert len(WbDataOper.get_weibo_repost_not_crawled()) == 0
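As a side note, here is a hedged alternative to the string-concatenated insert used in the two tests above: the same row written with a bound parameter, assuming db_session is a SQLAlchemy session and weibo_data is the table object these tests already use.

from sqlalchemy import text

# same insert as above, but with the id passed as a bound parameter instead of string concatenation
db_session.execute(
    text("insert into {} (weibo_id) values (:mid)".format(weibo_data.name)),
    {"mid": FAKE_ID})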
Code Example #15
File: praise.py Project: 402730243/spider-weibo
def execute_praise_task():
    weibo_datas = WbDataOper.get_weibo_praise_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.praise.crawl_praise_page',
                      args=(weibo_data.weibo_id, ),
                      queue='praise_crawler',
                      routing_key='praise_info')
Code Example #16
File: home.py Project: 402730243/spider-weibo
def determine(weibo_datum, timeafter):
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    if weibo_time < timeafter:
        return False
    if WbDataOper.get_wb_by_mid(weibo_datum.weibo_id):
        return False
    return True
Code Example #17
File: dialogue.py Project: ResolveWang/WeiboSpider
def execute_dialogue_task():
    weibo_datas = WbDataOper.get_weibo_dialogue_not_crawled()
    for weibo_data in weibo_datas:
        # crawl_dialogue(weibo_data.weibo_id)
        app.send_task('tasks.dialogue.crawl_dialogue',
                      args=(weibo_data.weibo_id,),
                      queue='dialogue_crawler',
                      routing_key='dialogue_info')
Code Example #18
File: comment.py Project: 402730243/spider-weibo
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        app.send_task('tasks.comment.crawl_comment_by_page',
                      args=(mid, page_num),
                      queue='comment_page_crawler',
                      routing_key='comment_page_info')
        # the retry task has been queued; return early so comment_datas is never used undefined
        return None, None
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
Code Example #19
File: home.py Project: ResolveWang/WeiboSpider
def determine(weibo_datum, timeafter):
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    if weibo_time < timeafter:
        return False
    if WbDataOper.get_wb_by_mid(weibo_datum.weibo_id):
        return False
    return True
Code Example #20
File: repost.py Project: Doraying1230/Python-Study
def execute_repost_task():
    # Treat the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
Code Example #21
File: repost.py Project: cptBTptpwbct/weibo
def execute_repost_task():
    # Treat the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(
        len(weibo_datas)))

    for weibo_data in weibo_datas:
        crawl_repost_page(weibo_data.weibo_id, weibo_data.uid)
Code Example #22
File: dialogue.py Project: lan1tian/weibospider
def execute_dialogue_task():
    weibo_datas = WbDataOper.get_weibo_dialogue_not_crawled()
    for weibo_data in weibo_datas:
        # crawl_dialogue(weibo_data.weibo_id)
        app.send_task('tasks.dialogue.crawl_dialogue',
                      args=(weibo_data.weibo_id, ),
                      queue='dialogue_crawler',
                      routing_key='dialogue_info')
Code Example #23
def execute_comment_task():
    # Only root comments are parsed; comments under the root comments are not crawled. Adjust the code if you need them.
    weibo_datas = WbDataOper.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.comment.crawl_comment_page',
                      args=(weibo_data.weibo_id, ),
                      queue='comment_crawler',
                      routing_key='comment_info')
Code Example #24
File: comment.py Project: ResolveWang/WeiboSpider
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        app.send_task(
            'tasks.comment.crawl_comment_by_page',
            args=(mid, page_num),
            queue='comment_page_crawler',
            routing_key='comment_page_info')
        # the retry task has been queued; return early so comment_datas is never used undefined
        return None, None
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
Code Example #25
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning(
                'No result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return
Code Example #26
File: home.py Project: 402730243/spider-weibo
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdata = get_ajax_data(ajax_html)
    if not ajax_wbdata:
        return ''

    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    ajax_wbdata = [
        ajax_wbdatum for ajax_wbdatum in ajax_wbdata
        if determine(ajax_wbdatum, timeafter)
    ]

    WbDataOper.add_all(ajax_wbdata)
    return ajax_html
Code Example #27
File: home.py Project: ResolveWang/WeiboSpider
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdata = get_ajax_data(ajax_html)
    if not ajax_wbdata:
        return ''

    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    ajax_wbdata = [
        ajax_wbdatum for ajax_wbdatum in ajax_wbdata
        if determine(ajax_wbdatum, timeafter)
    ]

    WbDataOper.add_all(ajax_wbdata)
    return ajax_html
Code Example #28
File: home.py Project: Doraying1230/Python-Study
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdatas = get_ajax_data(ajax_html)
    if not ajax_wbdatas:
        return ''

    timeafter = time.mktime(time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    for i in range(0,len(ajax_wbdatas)):
        weibo_time = time.mktime(time.strptime(ajax_wbdatas[i].create_time, '%Y-%m-%d %H:%M'))
        if weibo_time < timeafter:
            ajax_wbdatas = ajax_wbdatas[0:i]
            break

    WbDataOper.add_all(ajax_wbdatas)
    return ajax_html
Code Example #29
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        WbDataOper.add_all(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #30
File: topic.py Project: OneCodeMonkey/WeiboCrawler
def search_keyword_topic(keyword, keyword_id, start_time='', end_time=''):
    crawler.info(
        'We are crawling weibo topic content with keyword "{}"'.format(
            keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, start_time, end_time, cur_page)
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.info(
                'No such result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_topic.get_search_info(search_page)
        if cur_page == 1:
            cur_page += 1
        elif '您可以尝试更换关键词' not in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return

        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(
                    wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
Code Example #31
File: search.py Project: ResolveWang/WeiboSpider
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # Currently login is required; crawling page one without login may be added later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has already been stored in mysql,
        # we don't need to crawl the same keyword again in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
Code Example #32
File: praise.py Project: ResolveWang/WeiboSpider
def crawl_praise_page(mid):
    # Call locally here so that we get the result back immediately
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    
    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error('fail to get praise page 2 ext_param, mid is {mid}'.format(mid=mid))
        return

    # Why we fall back to sequential local calls instead of app.send_task:
    # Weibo praise pages now require a max_id parameter, and a request
    # without it returns something different from what a normal browser gets.

    # should work after 5
    # TODO: retry or return depending on ext_param
    for __ in range(2,5):
        # ext_param mainly max_id will be updated each time and be used next time
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return
Code Example #33
File: home.py Project: ResolveWang/WeiboSpider
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time configured in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If the weibo isn't created after the given time, jump out of the loop
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax request of page 1 has already been crawled
            # by the code above and stored in the database,
            # we only have to crawl the first ajax request of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #34
File: praise.py Project: ResolveWang/WeiboSpider
def execute_praise_task():
    weibo_datas = WbDataOper.get_weibo_praise_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.praise.crawl_praise_page', args=(weibo_data.weibo_id,), queue='praise_crawler',
                      routing_key='praise_info')
Code Example #35
def execute_praise_task():
    weibo_datas = WbDataOper.get_weibo_praise_not_crawled()
    for weibo_data in weibo_datas:
        crawl_praise_page(weibo_data.weibo_id)
Code Example #36
File: home.py Project: 402730243/spider-weibo
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time configured in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If the weibo isn't created after the given time, jump out of the loop
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax request of page 1 has already been crawled
            # by the code above and stored in the database,
            # we only have to crawl the first ajax request of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Code Example #37
File: comment.py Project: ResolveWang/WeiboSpider
def execute_comment_task():
    # Only root comments are parsed; comments under the root comments are not crawled. Adjust the code if you need them.
    weibo_datas = WbDataOper.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data.weibo_id,), queue='comment_crawler',
                      routing_key='comment_info')
Code Example #38
File: comment.py Project: cptBTptpwbct/weibo
def execute_comment_task():
    weibo_datas = WbDataOper.get_weibo_comment_not_crawled(db_session)
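    # note: weibo_datas is fetched but not used here; the call below crawls a single hard-coded weibo id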
    crawl_comment_page(4253637545362266)
Code Example #39
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        #if cur_page == 1:
        #    html = get_page(url, auth_level=1)
        #else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count = retry_count + 1
                #time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return


        # Check whether the weibo was created after the time configured in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo isn't created after the given time, jump out the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        # if cur_page == 1:
        #     # here we use local call to get total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        #if total_page < limit:
        #    limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page{}".format(uid));

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)