Example #1
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user's domain is 100505, the url is just 100505 + userid;
    if the domain is 103505 or 100306, we need one more request to get their info;
    if the user type is enterprise or service, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers(special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None

    else:
        return None
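As a usage illustration, here is a minimal sketch of driving get_url_from_web over a batch of seed ids. The import path is an assumption; the example above does not show which module the function lives in.

from page_get.user import get_url_from_web  # hypothetical import path, not shown in the example

def fetch_seed_users(seed_ids):
    """Fetch a batch of seed user ids, skipping empty ids, 404 pages and unparsable profiles."""
    users = []
    for uid in seed_ids:
        user = get_url_from_web(uid)
        if user is None:
            continue  # get_url_from_web returns None for empty ids, 404 pages and nameless users
        users.append(user)
    return users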
Example #2
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
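The app.send_task calls above only resolve if a Celery task is registered under the name 'tasks.home.crawl_ajax_page' and a worker listens on the ajax_home_crawler queue. A minimal sketch of that wiring, assuming a Redis broker and a tasks.workers module (both are assumptions; the project's actual configuration is not shown in these examples):

from celery import Celery

app = Celery('weibo_tasks', broker='redis://127.0.0.1:6379/5')  # assumed broker URL

@app.task(name='tasks.home.crawl_ajax_page')
def crawl_ajax_page(url):
    # body as in the crawl_ajax_page examples further down
    ...

# a worker bound to the ajax home queue would be started roughly like this
# (the -A module path is an assumption):
#   celery -A tasks.workers worker -Q ajax_home_crawler -l info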
Example #3
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, once we hit a result that is
        # already stored in mysql, we can stop crawling this keyword for this run
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
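url_parse here is presumably urllib.parse. A quick sketch of how the keyword gets percent-encoded before being substituted into the search url template; the template below is simplified from Example #38 and is only illustrative:

from urllib import parse as url_parse

url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}'  # simplified, assumed template
encode_keyword = url_parse.quote('深度学习')
print(url.format(encode_keyword, 1))
# http://s.weibo.com/weibo/%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0&scope=ori&suball=1&page=1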
Example #4
def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time()*1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('Failed to crawl comments of weibo {}, please check the reason'.format(mid))
            return

        save_comments(comment_datas)
        # Each page depends on the previous response, so a network (task) call is not suitable here (mainly because it would be cumbersome)
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('Comment crawling for weibo {} is finished'.format(mid))
            return
        cur_page += 1
Example #5
def crawl_ajax_page(url):
    ajax_html_0 = get_page(url)
    ajax_wbdatas_0 = get_home_wbdata_byajax(ajax_html_0)
    if not ajax_wbdatas_0:
        return

    insert_weibo_datas(ajax_wbdatas_0)
Example #6
def get_fans_or_followers_ids(user_id, crawl_type):
    """
    Get followers or fans
    :param user_id: user id
    :param crawl_type: 1 stands for fans, 2 stands for the users the target follows
    :return: list of fan or following user ids
    """

    # todo: check fans and followings of special users, such as writers
    # todo: handle the case where fans or followings span more than 5 pages
    if crawl_type == 1:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?relate=fans&page={}#Pl_Official_HisRelation__60'
    else:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?page={}#Pl_Official_HisRelation__60'

    cur_page = 1
    max_page = 6
    user_ids = list()
    while cur_page < max_page:
        url = fans_or_follows_url.format(user_id, cur_page)
        page = get_page(url)
        if cur_page == 1:
            urls_length = public.get_max_crawl_pages(page)
            if max_page > urls_length:
                max_page = urls_length + 1
        # get ids and store relations
        user_ids.extend(public.get_fans_or_follows(page, user_id, crawl_type))

        cur_page += 1

    return user_ids
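The loop above starts from a hard ceiling of 5 pages and shrinks it once page 1 reveals how many pages actually exist. The same pattern extracted into a small generic helper (a sketch; the fetch/parse callables stand in for get_page and the public.* parsers):

def crawl_paged(fetch_page, parse_page, get_total_pages, hard_limit=5):
    """Crawl at most hard_limit pages, shrinking the limit after reading page 1."""
    results = []
    cur_page, max_page = 1, hard_limit + 1
    while cur_page < max_page:
        html = fetch_page(cur_page)
        if cur_page == 1:
            max_page = min(max_page, get_total_pages(html) + 1)
        results.extend(parse_page(html))
        cur_page += 1
    return results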
Example #7
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to the limits of weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise user's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans',
                      args=(uid, domain),
                      queue='fans_followers',
                      routing_key='for_fans_followers')
Example #8
def crawl_repost_by_page(mid, page_num):
    cur_url = base_url.format(mid, page_num)
    html = get_page(cur_url, user_verify=False)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        wb_data.set_weibo_repost_crawled(mid)
    return html, repost_datas
Example #9
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
Example #10
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    #crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        #crawler.info(search_page)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, once we hit a result that is
        # already stored in mysql, we can stop crawling this keyword for this run
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                #continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example #11
def get_fans_or_followers_ids(user_id, crawl_type):
    """
    Get a user's fans or followings
    :param user_id: user id
    :param crawl_type: 1 stands for fans, 2 stands for followings
    :return: list of fan or following ids
    """

    # todo: check whether fans and followings of special users such as writers are handled; deal with the case of more than 5 pages of fans or followings
    if crawl_type == 1:
        ff_url = 'http://weibo.com/p/100505{}/follow?relate=fans&page={}#Pl_Official_HisRelation__60'
    else:
        ff_url = 'http://weibo.com/p/100505{}/follow?page={}#Pl_Official_HisRelation__60'

    cur_page = 1
    max_page = 6
    user_ids = list()
    while cur_page < max_page:
        url = ff_url.format(user_id, cur_page)
        page = get_page(url)
        if cur_page == 1:
            user_ids.extend(public.get_fans_or_follows(page))
            urls_length = public.get_max_crawl_pages(page)
            if max_page > urls_length:
                max_page = urls_length + 1

        cur_page += 1

    return user_ids
Example #12
def get_fans_or_followers_ids(user_id, crawl_type):
    """
    Get followers or fans
    :param user_id: user id
    :param crawl_type: 1 stands for fans, 2 stands for the users the target follows
    :return: list of fan or following user ids
    """

    # todo: check fans and followings of special users, such as writers
    # todo: handle the case where fans or followings span more than 5 pages
    if crawl_type == 1:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?relate=fans&page={}#Pl_Official_HisRelation__60'
    else:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?page={}#Pl_Official_HisRelation__60'

    cur_page = 1
    max_page = 6
    user_ids = list()
    while cur_page < max_page:
        url = fans_or_follows_url.format(user_id, cur_page)
        page = get_page(url)
        if cur_page == 1:
            urls_length = public.get_max_crawl_pages(page)
            if max_page > urls_length:
                max_page = urls_length + 1
        # get ids and store relations
        user_ids.extend(public.get_fans_or_follows(page, user_id, crawl_type))

        cur_page += 1

    return user_ids
Example #13
def _crawl_loop(page, page_counter, mid, uid, user_name,
                spread_other_and_caches, spread_others, spread_other_caches):
    while page > 0 and page_counter < page_max:
        ajax_url = base_url.format(mid=mid, currpage=page)
        repost_info = get_page(ajax_url, False)
        try:
            repost_json = json.loads(repost_info)
            repost_html = repost_json['data']['html']
        except Exception as why:
            # If an exception occurs, skip the weibo info behind this ajax_url by default
            parser.error('Failed to parse repost info as JSON for {url}, error: {why}'.format(
                url=ajax_url, why=why))
        else:
            repost_urls = parse_status.get_reposturls(repost_html)

            # ordering logic for repost nodes
            # todo: fetch the reposted weibo info without going through repost_urls and check whether the spread result is the same
            for repost_url in repost_urls:
                repost_cont = status.get_status_info(repost_url, uid,
                                                     user_name, mid)
                if repost_cont is not None:
                    spread_other_and_caches.append(repost_cont)

            for soac in spread_other_and_caches:
                if soac.get_so().id != '':
                    spread_others.append(soac.get_so())
                    spread_other_caches.append(soac.get_soc())
        finally:
            print('Currently on page {}'.format(page))
            page -= 1
            page_counter += 1
Example #14
def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo
    :param wb_mid: mid of the current weibo
    :return: repost count, weibo user id, user name
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts = parse_status.get_repostcounts(html)
    comments = parse_status.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid,
                                          reposts=reposts,
                                          comments=comments)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    comments_count = parse_status.get_commentcounts(html)
    reposts_count = parse_status.get_repostcounts(html)
    root_user = user.get_profile(user_id)
    # store info about the original (source) weibo
    spread_original_dao.save(root_user, wb_mid, post_time, device,
                             reposts_count, comments_count, root_url)

    crawler.info('This weibo has {counts} reposts'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
Example #15
def get_fans_or_followers_names(name, crawl_type):
    """
    Crawl the users someone follows and their fans
    :param name: user name
    :param crawl_type: crawl type; 'followees' for followings, 'followers' for fans
    :return:
    """
    LIMIT = 20
    page = 1
    is_end = False
    max_follow_page = get_max_follow_page()

    while (not is_end) and (page < max_follow_page):
        url = FOLLOW_URL.format(name, crawl_type, (page - 1) * LIMIT, LIMIT)
        html = get_page(url)
        user_names, is_end = get_fans_or_follows(html, name)
        storage.info(
            f"get {name} {crawl_type}: user_names: {user_names}, is_end:{is_end}"
        )
        SeedUser.insert_many(user_names)

        page += 1

        storage.info(
            f"get {name} page={page}, max_follow_page={max_follow_page}, is_end={is_end}"
        )
Example #16
def crawl_repost_by_page(mid, page_num):
    cur_url = base_url.format(mid, page_num)
    html = get_page(cur_url, user_verify=False)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        wb_data.set_weibo_repost_crawled(mid)
    return html, repost_datas
Example #17
def get_mpproxy_to_db():
	url = 'http://proxy.mimvp.com/api/fetch.php?orderid=860170808163932696&num=100&country_group=1&http_type=2&anonymous=3,5&result_fields=1,2&result_format=json&ping_time=1'
	html = get_page(url, user_verify=False, need_login=False)
	proxy_dict = parse_json_to_dict(html)
	proxies = proxy_dict.get('result')
	proxy_list = []
	if proxies:
		for proxy in proxies:
			data = proxy.get('ip:port')
			data = data.split(':') if data else []
			# str.split always returns a non-empty list, so check explicitly
			# for the expected ip/port pair instead of a bare `if data`
			if len(data) == 2:
				ip = data[0]
				port = data[1]
			else:
				return False
			new_proxy = Proxys()
			new_proxy.ip = ip
			new_proxy.port = port
			new_proxy.types = 2
			new_proxy.protocol = 2
			new_proxy.country = '国内'
			new_proxy.area = '米扑代理'
			new_proxy.speed = 0.00
			new_proxy.score = 5
			proxy_list.append(new_proxy)
	if proxy_list:
		insert_proxy(proxy_list)
		return True
	else:
		return False
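parse_json_to_dict is not shown in these examples; from the way it is used, it is presumably a thin, error-tolerant wrapper over json.loads, roughly like this sketch:

import json

def parse_json_to_dict(html):
    """Presumed behaviour: decode the proxy API response, returning {} when it is not valid JSON."""
    try:
        return json.loads(html)
    except (TypeError, ValueError):
        return {}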
Example #18
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No weibo found for keyword {} this time, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo is already in the database; if so, it has already been crawled (results are sorted by time by default), so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All newly updated weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # For now use a network (task) call rather than a local call; the trade-offs of the two approaches still need weighing
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} is finished'.format(keyword))
            return
Example #19
 def test_page_get(self):
     """
     test crawling pages
     """
     from page_get import basic
     test_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
     text = basic.get_page(test_url)
     self.assertIn('深扒娱乐热点', text)
Example #20
def crawl_comment_by_page(mid, page_num):
    cur_time = int(time.time() * 1000)
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    wb_data.set_weibo_comment_crawled(mid)
    return html
Example #21
 def test_page_get(self):
     """
     test the page crawling function
     """
     from page_get import basic
     test_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
     text = basic.get_page(test_url)
     self.assertIn('深扒娱乐热点', text)
Example #22
def crawl_ajax_page(url):
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''

    insert_weibo_datas(ajax_wbdatas)
    return ajax_html
Example #23
def get_status_info(url, user_id, name, mid=''):
    soc = SpreadOtherCache()
    print('Current repost weibo url: ' + url)
    repost_cont = get_page(url)

    if not is_404(repost_cont):
        repost_user_id = parse_status.get_userid(repost_cont)
        if repost_user_id == '':
            return None

        repost_user_name = parse_status.get_username(repost_cont)
        soc.set_id(repost_user_id)
        soc.set_name(repost_user_name)

        so = SpreadOther()
        so.id = repost_user_id
        so.screen_name = repost_user_name
        so.upper_user_name = parse_status.get_upperusername(repost_cont, name)
        cur_user = user.get_profile(repost_user_id)
        try:
            so.province = cur_user.province
            so.city = cur_user.city
            so.location = cur_user.location
            so.description = cur_user.description
            so.domain_name = cur_user.domain_name
            so.blog_url = cur_user.blog_url
            so.gender = cur_user.gender
            so.headimg_url = cur_user.headimg_url
            so.followers_count = cur_user.followers_count
            so.friends_count = cur_user.friends_count
            so.status_count = cur_user.status_count
            so.verify_type = cur_user.verify_type
            so.verify_info = cur_user.verify_info
            so.register_time = cur_user.register_time

            if so.screen_name == name:
                so.id = user_id

            so.mid = parse_status.get_mid(repost_cont)
            so.status_post_time = parse_status.get_statustime(repost_cont)
            so.device = parse_status.get_statussource(repost_cont)
            if mid:
                so.original_status_id = mid
            else:
                so.original_status_id = parse_status.get_orignalmid(repost_cont)
            so.comments_count = parse_status.get_commentcounts(repost_cont)
            so.reposts_count = parse_status.get_repostcounts(repost_cont)
            so.like_count = parse_status.get_likecounts(repost_cont)
            so.status_url = url
        except AttributeError as e:
            # todo: find out what goes wrong here
            logging.info('Failed to parse {user_id}, error: {e}'.format(user_id=user_id, e=e))
            logging.info(r'Source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
            return None
        else:
            return SpreadOtherAndCache(so, soc)
    else:
        return None
Example #24
def get_url_from_web(user_id):
    """
    Get user info by user id. If the user's domain is 100505, the detailed profile is returned directly;
    if it is 103505 or 100306, one more request is needed, because base_url only locates the user's home
    page rather than the detail page; enterprise and service accounts are also redirected to their home
    page by base_url, and since their detail pages are of little value, we do not request them.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # default: enterprise or service accounts
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        # save user info to the database
        save_user(user)
        storage.info('Successfully saved info of user {id}'.format(id=user_id))

        return user
    else:
        return None
Example #25
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user's domain is 100505, the url is just 100505 + userid;
    if the domain is 103505 or 100306, we need one more request to get their info;
    if the user type is enterprise or service, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers(special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        save_user(user)
        storage.info('Successfully saved info of user {id}'.format(id=user_id))

        return user
    else:
        return None
Example #26
def crawl_comment_by_page(mid, page_num):
    cur_time = int(time.time() * 1000)
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    if page_num == 1:
        wb_data.set_weibo_comment_crawled(mid)
    return html
Example #27
def search_one(keyword, session):
    url = 'http://s.weibo.com/weibo/' + keyword + '&Refer=STopic_box'
    search_page = get_page(url, session, headers)
    if search_page:
        search_list = search_parse.get_search_info(search_page)
        for s in search_list:
            s.keyword = keyword
            s.mk_primary = '_'.join([str(s.mid), keyword])
        add_search_cont(search_list)
    else:
        print('No search result parsed: {page}'.format(page=search_page))
Example #28
def crawl_ajax_page(url):
    """
    :param url: user home ajax url
    :return: resp.text
    """
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''

    insert_weibo_datas(ajax_wbdatas)
    return ajax_html
Example #29
def get_fans_list_return(uid, page):
	fans_wb_temp_url = 'https://m.weibo.cn/api/container/getIndex?containerid={}_-_followers_-_{}&luicode={}&lfid={}&featurecode={}&type=uid&value={}&page={}'
	
	containerid = '231051'
	luicode = '10000011'
	lfid = '100505' + str(uid)
	featurecode = '20000320'
	value = str(uid)

	url = fans_wb_temp_url.format(containerid, uid, luicode, lfid, featurecode, value, page)
	html = get_page(url, user_verify=False, need_login=False)
	return url, html
Example #30
def crawl_ajax_page(url):
    """
    :param url: user home ajax url
    :return: resp.text
    """
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''

    insert_weibo_datas(ajax_wbdatas)
    return ajax_html
Example #31
def crawl_ajax_page(url):
    """
    The return value is mainly for the first, local call (to get the total page count); network calls ignore it
    :param url: 
    :return: 
    """
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''

    insert_weibo_datas(ajax_wbdatas)
    return ajax_html
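In practice the two call styles the docstring mentions look like this (a sketch; ajax_url_0, ajax_url_1 and get_total_page are taken from the crawl_weibo_datas examples above and assumed to be in scope):

# local call: we need the returned html to find out how many pages exist
total_page = get_total_page(crawl_ajax_page(ajax_url_1))

# network call: the worker stores the data itself, so the return value is ignored
app.send_task('tasks.home.crawl_ajax_page',
              args=(ajax_url_0,),
              queue='ajax_home_crawler',
              routing_key='ajax_home_info')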
Example #32
def get_hot_list_from_web(title):
    if not title:
        return None

    url = HOT_LIST_URL.format(title)
    html = get_page(url)

    all_lists = parse_hot_list(title, html)
    if all_lists:
        CommonOperate.add_all(all_lists)
        storage.info(f"Has stored hot_list {title} info successfully")

    return all_lists
Example #33
def get_cont_of_weibo(mid):
    """
    :param mid: weibo's mid
    :return: all cont of the weibo
    """
    url = base_url.format(mid)
    html = get_page(url, user_verify=False)

    if html:
        try:
            html = json.loads(html).get('data').get('html')
            cont = filters.text_filter(html)
        except AttributeError:
            cont = ''
        return cont
Example #35
def get_user_info_from_web(user_name):
    """从网络抓取用户信息
    :param: user_name 用户名
    :return: user entiry
    """
    if not user_name:
        return None

    url = USER_HOME_URL.format(user_name)
    html = get_page(url)

    user = get_user_detail(user_name, html)
    if user:
        CommonOperate.add_one(user)
        storage.info(f"Has stored user {user_name} info successfully")

    return user
Example #36
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('Failed to crawl repost url {}'.format(ajax_url))
        return 0

    crawler.info('Repost info url for this round: {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('Failed to parse repost info as JSON for {url}, error: {why}'.format(url=ajax_url, why=why))
        return 0
    else:
        return total_page
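The response shape this parser expects can be inferred from the lookup path; a tiny check against a hand-made sample (the sample itself is an assumption based on that path, not documented by the example):

import json

sample = '{"data": {"page": {"totalpage": 17}}}'
print(int(json.loads(sample)['data']['page']['totalpage']))  # 17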
Example #37
def parse_xdaili_return(url):
	html = get_page(url, user_verify=False, need_login=False)
	proxy_dict = parse_json_to_dict(html)
	proxies = proxy_dict.get('RESULT')
	err_code = int(proxy_dict.get('ERRORCODE'))
	proxy_list = []
	if proxies and err_code == 0:
		for proxy in proxies:
			port = proxy.get('port')
			ip = proxy.get('ip')
			new_proxy = Proxys()
			new_proxy.ip = ip
			new_proxy.port = port
			new_proxy.types = 2
			new_proxy.protocol = 2
			new_proxy.country = '国内'
			new_proxy.area = '讯代理'
			new_proxy.speed = 0.00
			new_proxy.score = 5
			proxy_list.append(new_proxy)
	return proxy_list
Example #38
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning(
                'No weibo found for keyword {} this time, the page source is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo is already in the database; if so, it has already been crawled (results are sorted by time by default), so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All newly updated weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # For now use a network (task) call rather than a local call; the trade-offs of the two approaches still need weighing
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} is finished'.format(keyword))
            return
Example #39
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)