Example #1
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Keep only weibo created after the time configured in spider.yaml
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        length_weibo_datas = len(weibo_datas)
        for i in range(0, len(weibo_datas)):
            weibo_time = time.mktime(
                time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
            if weibo_time < timeafter:
                weibo_datas = weibo_datas[0:i]
                break

        WbDataOper.add_all(weibo_datas)

        # If any weibo was created before the given time, stop crawling further pages
        if i != length_weibo_datas - 1:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
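
The page loop in Example #1 truncates weibo_datas at the first item older than the cutoff read from spider.yaml (results are newest-first, so everything after that item is older too). Below is a minimal, self-contained sketch of that filtering step factored into a helper; the names truncate_before and WeiboItem, and the item shape, are illustrative assumptions, not part of the project.

import time
from typing import List

# Hypothetical minimal item type; the real weibo_datas elements come from get_data(html)
# and are assumed to expose a create_time string formatted '%Y-%m-%d %H:%M'.
class WeiboItem:
    def __init__(self, create_time: str):
        self.create_time = create_time

def truncate_before(items: List[WeiboItem], time_after: str) -> List[WeiboItem]:
    """Keep the leading items created at or after time_after ('%Y-%m-%d %H:%M:%S').

    Mirrors the loop in crawl_weibo_datas: results are newest-first, so we can
    stop at the first item older than the cutoff.
    """
    cutoff = time.mktime(time.strptime(time_after, '%Y-%m-%d %H:%M:%S'))
    for i, item in enumerate(items):
        created = time.mktime(time.strptime(item.create_time, '%Y-%m-%d %H:%M'))
        if created < cutoff:
            return items[:i]
    return items

if __name__ == '__main__':
    items = [WeiboItem('2018-06-02 12:00'), WeiboItem('2018-05-01 08:30')]
    kept = truncate_before(items, '2018-06-01 00:00:00')
    print(len(kept))  # 1: only the item newer than the cutoff survives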
Example #2
def crawl_praise_page(mid):
    # Use a local call here so that we get the result immediately
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)

    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error(
            'fail to get praise page 2 ext_param, mid is {mid}'.format(
                mid=mid))
        return

    # Why we don't use app.send_task and fall back to sequential execution:
    # weibo praise pages now require a parameter called max_id, and a request
    # without it returns something different from what a normal browser gets.

    # should work after 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param (mainly max_id) is updated on each call and used for the next request
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return
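
The TODO in Example #2 leaves open whether the loop over pages 2-4 should retry or return based on ext_param. Here is a hedged sketch of one way to exit early, assuming an empty ext_param means there are no further praise pages; crawl_praise_pages, FetchPage and the toy fetcher are illustrative names, not part of the project.

from typing import Callable, Tuple

# Hypothetical signature mirroring crawl_praise_by_page(mid, ext_param), which
# returns (html, praise_data, ext_param); an empty ext_param is taken to mean
# "no further pages" -- an assumption, not something the original code states.
FetchPage = Callable[[str, str], Tuple[str, list, str]]

def crawl_praise_pages(mid: str, ext_param: str, fetch_page: FetchPage,
                       start: int = 2, stop: int = 5) -> int:
    """Sequentially crawl praise pages [start, stop), stopping early when ext_param runs out."""
    crawled = 0
    for _ in range(start, stop):
        if not ext_param:
            break  # max_id exhausted: nothing left to request
        _html, _praise_data, ext_param = fetch_page(mid, ext_param)
        crawled += 1
    return crawled

if __name__ == '__main__':
    # Toy fetcher: pretends there are exactly two more pages of praise data.
    state = {'pages_left': 2}
    def fake_fetch(mid: str, ext_param: str):
        state['pages_left'] -= 1
        next_param = 'max_id=123' if state['pages_left'] > 0 else ''
        return '<html/>', [], next_param
    print(crawl_praise_pages('42', 'max_id=1', fake_fetch))  # 2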
Example #3
def crawl_repost_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        WbDataOper.set_weibo_repost_crawled(mid)
    return html, repost_datas
Example #4
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently login is required; we may crawl page one without login later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has already been
        # stored in MySQL, we don't need to crawl the same keyword further in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)

            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
Example #5
def crawl_repost_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        WbDataOper.set_weibo_repost_crawled(mid)
    return html, repost_datas
Example #6
def crawl_praise_by_page(mid, ext_param):
    cur_time = int(time.time() * 1000)
    cur_url = PAGE_URL.format(ext_param, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    return html, praise_data, ext_param
Example #7
def crawl_praise_by_page(mid, ext_param):
    cur_time = int(time.time() * 1000)
    cur_url = PAGE_URL.format(ext_param, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    return html, praise_data, ext_param
Example #8
def test_crawl_first_search_page():
    url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page=1'
    encode_keyword = url_parse.quote('火影')
    cur_url = url.format(encode_keyword, 1)
    search_page = get_page(cur_url, auth_level=1)
    assert "['islogin']" in search_page
    time.sleep(REQUEST_INTERNAL)
Example #9
def crawl_comment_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    comment_datas = comment.get_comment_list(html, mid)
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
Example #10
def crawl_dialogue_by_comment_id(cid, mid):
    cur_time = int(time.time() * 1000)

    dialogue_url = AJAX_URL.format(cid, cur_time)

    html = get_page(dialogue_url, auth_level=2, is_ajax=True)
    dialogue_data = dialogue.get_dialogue(html, mid, cid)

    CommonOper.add_one(dialogue_data)
Example #11
def crawl_dialogue_by_comment_page(mid, page_num):
    comment_url = COMMENT_URL.format(mid, page_num)
    html = get_page(comment_url, auth_level=1, is_ajax=True)
    comment_ids = dialogue.get_comment_id(html, mid)
    for cid in comment_ids:
        crawl_dialogue_by_comment_id(cid, mid)

    if page_num == 1:
        WbDataOper.set_weibo_dialogue_crawled(mid)
    return html
Example #12
def crawl_dialogue_by_comment_page(mid, page_num):
    comment_url = COMMENT_URL.format(mid, page_num)
    html = get_page(comment_url, auth_level=1, is_ajax=True)
    comment_ids = dialogue.get_comment_id(html, mid)
    for cid in comment_ids:
        crawl_dialogue_by_comment_id(cid, mid)

    if page_num == 1:
        WbDataOper.set_weibo_dialogue_crawled(mid)
    return html
Example #13
def test_crawl_first_home_page():
    from page_parse.home import get_ajax_data
    url = 'http://weibo.com/u/1800822823?is_ori=1&is_tag=0&profile_ftype=1&page=1'
    content = get_page(url, auth_level=1)
    assert "['islogin']" in content
    time.sleep(REQUEST_INTERNAL)
    cur_time = int(time.time() * 1000)
    ajax_url_0 = HOME_AJAX_URL.format('100505', 0, '100505', '1800822823', 1, 1, cur_time)
    ajax_url_1 = HOME_AJAX_URL.format('100505', 0, '100505', '1800822823', 1, 1, cur_time + 100)

    content = get_page(ajax_url_0, auth_level=1, is_ajax=True)
    assert 'Sina Visitor System' not in content
    assert len(get_ajax_data(content)) > 0
    time.sleep(REQUEST_INTERNAL)

    content = get_page(ajax_url_1, auth_level=1, is_ajax=True)
    assert 'Sina Visitor System' not in content
    assert len(get_ajax_data(content)) > 0
    time.sleep(REQUEST_INTERNAL)
Example #14
def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]

    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(
            keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # currently login is required; we may crawl page one without login later
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break
            if not search_page:
                crawler.warning(
                    'No search result for keyword {}, the source page is {}'.
                    format(keyword, search_page))
                cur_page += 1
                continue
                # return

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has already been
            # stored in MySQL, we don't need to crawl the same keyword further in this turn
            for wb_data in search_list:
                # print(wb_data)
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # todo incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(
                        wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos',
                                  args=(wb_data.uid, ),
                                  queue='user_crawler',
                                  routing_key='for_user_info')
Example #15
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning(
                'No result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has already been
        # stored in MySQL, we don't need to crawl the same keyword further in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return
Example #16
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        WbDataOper.add_all(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #17
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 means no login but fake cookies are needed, 2 means login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdatas = get_ajax_data(ajax_html)
    if not ajax_wbdatas:
        return ''

    WbDataOper.add_all(ajax_wbdatas)
    return ajax_html
Example #18
def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # Retry the same page and propagate its result
        return crawl_comment_by_page(mid, page_num, session)
    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas
Example #19
def crawl_praise_by_page(mid, page_num):
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "praise SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # Retry the same page and return its result so praise_datas is never left undefined
        return crawl_praise_by_page(mid, page_num)
    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas
Example #20
def crawl_dialogue_by_comment_id(cid, mid):
    cur_time = int(time.time() * 1000)

    dialogue_url = AJAX_URL.format(cid, cur_time)

    html = get_page(dialogue_url, auth_level=2, is_ajax=True)
    dialogue_data, uids = dialogue.get_dialogue(html, mid, cid)
    if dialogue_data:
        CommonOper.add_one(dialogue_data)

    if uids:
        for uid in uids:
            # crawl_person_infos_not_in_seed_ids(uid)
            app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                          args=(uid, ),
                          queue='user_crawler',
                          routing_key='for_user_info')
Example #21
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        app.send_task('tasks.comment.crawl_comment_by_page',
                      args=(mid, page_num),
                      queue='comment_page_crawler',
                      routing_key='comment_page_info')
        # The retry has been queued; return early since comment_datas was never set
        return None, []
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
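
Examples #18, #19, #21 and #23 catch SoftTimeLimitExceeded, which Celery raises inside a task body when the task's soft time limit expires, and re-enqueue the same page rather than losing it. Below is a minimal sketch of how such a task could be declared; the broker URL, limit values, crawl_one_page and slow_fetch are illustrative assumptions, not the project's actual configuration.

from celery import Celery
from celery.exceptions import SoftTimeLimitExceeded

# Illustrative broker URL and limits; the real project configures its own Celery app.
app = Celery('tasks', broker='redis://localhost:6379/0')

@app.task(soft_time_limit=60, time_limit=90)  # the soft limit raises SoftTimeLimitExceeded inside the task
def crawl_one_page(mid, page_num):
    try:
        html = slow_fetch(mid, page_num)  # hypothetical helper that may take too long
    except SoftTimeLimitExceeded:
        # Re-enqueue the same work instead of dropping it, as the crawler tasks above do.
        crawl_one_page.apply_async(args=(mid, page_num), queue='comment_page_crawler',
                                   routing_key='comment_page_info')
        return None
    return html

def slow_fetch(mid, page_num):
    # Stand-in for get_page(...); a real implementation would perform the HTTP request.
    return '<html/>'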
Example #22
def crawl_dialogue_by_comment_id(cid, mid):
    cur_time = int(time.time() * 1000)

    dialogue_url = AJAX_URL.format(cid, cur_time)

    html = get_page(dialogue_url, auth_level=2, is_ajax=True)
    dialogue_data, uids = dialogue.get_dialogue(html, mid, cid)
    if dialogue_data:
        CommonOper.add_one(dialogue_data)

    if uids:
        for uid in uids:
            # crawl_person_infos_not_in_seed_ids(uid)
            app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                          args=(uid,),
                          queue='user_crawler',
                          routing_key='for_user_info')
Example #23
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        app.send_task(
            'tasks.comment.crawl_comment_by_page',
            args=(mid, page_num),
            queue='comment_page_crawler',
            routing_key='comment_page_info')
        # The retry has been queued; return early since comment_datas was never set
        return None, []
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
Example #24
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 means no login but fake cookies are needed, 2 means login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdatas = get_ajax_data(ajax_html)
    if not ajax_wbdatas:
        return ''

    timeafter = time.mktime(time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    for i in range(0, len(ajax_wbdatas)):
        weibo_time = time.mktime(time.strptime(ajax_wbdatas[i].create_time, '%Y-%m-%d %H:%M'))
        if weibo_time < timeafter:
            ajax_wbdatas = ajax_wbdatas[0:i]
            break

    WbDataOper.add_all(ajax_wbdatas)
    return ajax_html
Example #25
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 means no login but fake cookies are needed, 2 means login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdata = get_ajax_data(ajax_html)
    if not ajax_wbdata:
        return ''

    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    ajax_wbdata = [
        ajax_wbdatum for ajax_wbdatum in ajax_wbdata
        if determine(ajax_wbdatum, timeafter)
    ]

    WbDataOper.add_all(ajax_wbdata)
    return ajax_html
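
Examples #25, #26, #30 and #32 filter parsed items through a determine(weibo_datum, timeafter) helper that is not shown in this listing. Judging from the index-based variant in Example #24, it presumably returns True when the item's create_time is not older than the cutoff; the following is a minimal sketch under that assumption, not the project's actual implementation.

import time

def determine(weibo_datum, timeafter):
    """Return True if the weibo was created at or after the cutoff timestamp.

    Sketch of the helper used in crawl_ajax_page / crawl_weibo_datas, assuming
    create_time is a '%Y-%m-%d %H:%M' string and timeafter is a Unix timestamp
    (as produced by time.mktime in the callers).
    """
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    return weibo_time >= timeafter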
Example #26
def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 means no login but fake cookies are needed, 2 means login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdata = get_ajax_data(ajax_html)
    if not ajax_wbdata:
        return ''

    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    ajax_wbdata = [
        ajax_wbdatum for ajax_wbdatum in ajax_wbdata
        if determine(ajax_wbdatum, timeafter)
    ]

    WbDataOper.add_all(ajax_wbdata)
    return ajax_html
Example #27
def search_keyword_topic(keyword, keyword_id, start_time='', end_time=''):
    crawler.info(
        'We are crawling weibo topic content with keyword "{}"'.format(
            keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, start_time, end_time, cur_page)
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.info(
                'No such result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_topic.get_search_info(search_page)
        if cur_page == 1:
            cur_page += 1
        elif '您可以尝试更换关键词' not in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return

        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(
                    wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
Example #28
def crawl_praise_page(mid):
    # Use a local call here so that we get the result immediately
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    
    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error('fail to get praise page 2 ext_param, mid is {mid}'.format(mid=mid))
        return

    # Why we don't use app.send_task and fall back to sequential execution:
    # weibo praise pages now require a parameter called max_id, and a request
    # without it returns something different from what a normal browser gets.

    # should work after 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param (mainly max_id) is updated on each call and used for the next request
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return
Example #29
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently login is required; we may crawl page one without login later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has already been
        # stored in MySQL, we don't need to crawl the same keyword further in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
Example #30
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Keep only weibo created after the time configured in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If any weibo was created before the given time, stop crawling further pages
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax request of page 1 has already been crawled
            # in the code above and stored in the database,
            # we only have to crawl the first ajax request of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #31
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        #if cur_page == 1:
        #    html = get_page(url, auth_level=1)
        #else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count = retry_count + 1
                #time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return


        # Check whether the weibo was created after the time in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo isn't created after the given time, jump out the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        # if cur_page == 1:
        #     # here we use local call to get total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        #if total_page < limit:
        #    limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page{}".format(uid))

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #32
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Keep only weibo created after the time configured in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If any weibo was created before the given time, stop crawling further pages
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax request of page 1 has already been crawled
            # in the code above and stored in the database,
            # we only have to crawl the first ajax request of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)