Example #1
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Due to limits on weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans',
                      args=(uid, domain),
                      queue='fans_followers',
                      routing_key='for_fans_followers')
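
send_task dispatches this work purely by task name, so a worker elsewhere has to register 'tasks.user.crawl_follower_fans' and consume the 'fans_followers' queue. A minimal sketch of what that receiving side might look like; the Celery app setup, broker URL and task body here are assumptions for illustration rather than the project's actual code:

# worker side -- illustrative sketch only
from celery import Celery

app = Celery('weibo_tasks', broker='redis://localhost:6379/0')  # assumed broker URL

@app.task(name='tasks.user.crawl_follower_fans')
def crawl_follower_fans(uid, domain):
    # fetch and store the first pages of followers and fans for this uid (body omitted)
    pass

# a worker consuming the queue targeted above:
#   celery -A <worker_module> worker -Q fans_followers -l info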
Example #2
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('This round of simulated login is starting')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password))
        time.sleep(10)
Example #3
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user with id {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
Example #4
def excute_home_task():
    # There are many possible strategies for crawling users' home pages; here we pick uids
    # from table seed_ids whose home_crawl is 0
    id_objs = get_home_ids()
    for id_obj in id_objs:
        app.send_task('tasks.home.crawl_weibo_datas', args=(id_obj.uid,), queue='home_crawler',
                      routing_key='home_info')
Example #5
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
Example #6
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No weibos related to keyword {} were fetched this time, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo already exists in the database; if it does, it has
        # already been crawled (results are sorted by time by default), so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All newly updated weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # For now we use a network call instead of a local call; weigh the trade-offs of the two approaches
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} is finished'.format(keyword))
            return
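
The comment above leaves open the choice between a network call and a local call for crawling the author's profile. A minimal sketch of the two alternatives for this step, assuming crawl_person_infos is importable from tasks.user as in the other examples; the remote form returns immediately and lets workers on the user_crawler queue absorb the load, while the local form blocks the search task until the profile crawl finishes:

# Option 1: remote dispatch -- non-blocking, handled by workers on the user_crawler queue
app.send_task('tasks.user.crawl_person_infos',
              args=(wb_data.uid, ),
              queue='user_crawler',
              routing_key='for_user_info')

# Option 2: local call -- runs inside this worker process and blocks until it finishes
from tasks.user import crawl_person_infos  # assumed import path
crawl_person_infos(wb_data.uid)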
Example #7
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example #8
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    #crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        #crawler.info(search_page)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                #continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example #9
def get_answer_comments():
    comment_ids = get_zhihu_answer_comment_not_crawled()
    for comment_id in comment_ids:
        app.send_task('tasks.answer_comment.get_one_answer_comments',
                      args=(comment_id[0], ),
                      queue='answer_comment_crawler',
                      routing_key='for_search_info')
Example #10
def excute_search_task():
    keywords = get_search_keywords()
    for each in keywords:
        app.send_task('tasks.search.search_keyword',
                      args=(each[0], each[1]),
                      queue='search_crawler',
                      routing_key='for_search_info')
Example #11
def excute_pic_task():
    # The strategy here is up to you: you can crawl home pages of existing users or specify
    # some users yourself; here I simply pick uids from the seed database
    id_objs = get_home_ids(0)
    for id_obj in id_objs:
        app.send_task('tasks.pic.crawl_weibo_pics', args=(id_obj.uid,), queue='pic_crawler',
                      routing_key='pic_info')
        # crawl_weibo_pics(id_obj.uid)
Example #12
def execute_start_request():
    meta = {}
    start_url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python"
    meta["start_url"] = start_url
    app.send_task("tasks.downloader.downloader",
                  args=(start_url, meta),
                  queue="downloader_queue",
                  routing_key="for_download")
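
The meta dict travels through the message broker, so with Celery's default JSON serializer it has to stay JSON-serializable. A hedged sketch of the consumer side this dispatch assumes; only the task name 'tasks.downloader.downloader' comes from the call above, while the module layout, broker URL and task body are illustrative guesses:

# tasks/downloader.py (worker side) -- illustrative sketch only
from celery import Celery
import requests

app = Celery('zhaopin_tasks', broker='redis://localhost:6379/1')  # assumed broker URL

@app.task(name='tasks.downloader.downloader')
def downloader(url, meta):
    # meta arrives as a plain dict after JSON (de)serialization
    resp = requests.get(url, timeout=10)
    # downstream parsing/storage is project-specific and omitted here
    return {'start_url': meta.get('start_url'), 'status': resp.status_code}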
Example #13
def excute_comment_task():
    #weibo_datas = wb_data.get_weibo_comment_not_crawled()
    weibo_data = '4079144788308403'
    # for weibo_data in weibo_datas:
    #     app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data.weibo_id,), queue='comment_crawler',
    #                   routing_key='comment_info')
    app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data,), queue='comment_crawler',
                  routing_key='comment_info')
Example #14
def execute_user_task():
    seeds, is_exists = SeedUser.get_seed_names()
    if is_exists:
        for seed in seeds:
            crawler.info(f"send task crawl_user_info {seed.name}")
            app.send_task("tasks.user.crawl_user_info", args=(seed.name, ))
    else:
        crawler.info("find no user, abort")
Example #15
def excute_repost_task():
    # Analyse with the current weibo as the source weibo, without tracing upwards; if you need to
    # trace upwards, check yourself whether the weibo is the root weibo
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} weibos whose repost info needs to be crawled in this round'.format(len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
Example #16
def excute_repost_task():
    datas = weibosearch_dao.get_crawl_urls()
    crawler.info('Fetched {len} weibos to crawl in total'.format(len=len(datas)))
    # Distribute the crawl tasks to the worker machines
    for data in datas:
        app.send_task('tasks.repost.get_current_reposts', args=(data['url'], data['mid']))

    crawler.info('Task distribution for this round is complete')
Example #17
def excute_user_task():
    seeds = get_seed_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.user.crawl_person_infos',
                          args=(seed.uid, ),
                          queue='user_crawler',
                          routing_key='for_user_info')
Example #18
def excute_comment_task():
    # Only root comments are parsed; comments under root comments are not crawled.
    # Modify as needed if you want them.
    weibo_datas = wb_data.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.comment.crawl_comment_page',
                      args=(weibo_data.weibo_id, ),
                      queue='comment_crawler',
                      routing_key='comment_info')
Example #19
def excute_repost_task():
    # regard the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
Example #20
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('This round of simulated login is starting')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password, info.need_verify),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
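
The time.sleep(10) spaces the login dispatches out on the producer side. An alternative sketch, assuming the same app and login_info objects, that uses send_task's countdown option so the broker/worker handles the delay instead of this loop blocking:

def excute_login_task_staggered():
    infos = login_info.get_login_info()
    log.crawler.info('This round of simulated login is starting')
    for i, info in enumerate(infos):
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password, info.need_verify),
                      queue='login_queue',
                      routing_key='for_login',
                      countdown=10 * i)  # execute roughly 10 seconds apart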
Example #21
def excute_home_task():
    # The strategy here is up to you: you can crawl home pages of existing users or specify
    # some users yourself; here I simply pick uids from the seed database
    uids = get_home_ids()

    for uid in uids:
        app.send_task('tasks.home.crawl_weibo_datas',
                      args=(uid[0], ),
                      queue='home_crawler',
                      routing_key='home_info')
Example #22
def excute_home_task():
    # There are many possible strategies for crawling users' home pages; here we pick uids
    # from table seed_ids whose home_crawl is 0
    id_objs = get_home_ids()
    for id_obj in id_objs:
        app.send_task('tasks.home.crawl_weibo_datas',
                      args=(id_obj.uid, ),
                      queue='home_crawler',
                      routing_key='home_info')
Example #23
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each login
    Cookies.check_login_task()
    log.crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password), queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #24
def excute_repost_task():
    # regard the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(
        len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page',
                      args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler',
                      routing_key='repost_info')
Example #25
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each login
    Cookies.check_login_task()
    log.crawler.info('This round of simulated login is starting')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #26
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each login
    Cookies.check_login_task()
    log.crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #27
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Use a local call here so we get the return value immediately
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
                      routing_key='comment_page_info')
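
This pattern relies on crawl_comment_by_page being both an importable function (so page 1 can be called locally for its return value) and a Celery task registered as 'tasks.comment.crawl_comment_by_page' (so pages 2..limit can be dispatched by name). A minimal sketch of such a definition; get_page, comment_url and the persistence helper are assumed from the project rather than shown here:

# tasks/comment.py -- sketch only, helper names are assumptions
@app.task(name='tasks.comment.crawl_comment_by_page')
def crawl_comment_by_page(mid, page_num):
    html = get_page(comment_url.format(mid, page_num))  # assumed page fetcher and url template
    save_comments(mid, html)                            # assumed persistence helper
    return html  # returning the html lets crawl_comment_page read the total page count locally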
Example #28
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Use a local call here so we get the return value immediately
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
                      routing_key='comment_page_info')
Example #29
def get_people_and_follows(people_id, selector):
    try:
        people = People()
        people.people_id = people_id
        people.name = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/h1/text()')[0].strip()
        people.desc = "".join(
            selector.xpath(
                '//div[@class="aw-user-center"]/div[1]/div/span/text()'))
        if selector.xpath('//i[contains(@class,"i-user-locate")]'):
            user_locate = selector.xpath(
                '//i[contains(@class,"i-user-locate")]')[0].getparent()
            people.province = "".join(user_locate.xpath('a[1]/text()'))
            people.city = "".join(user_locate.xpath('a[2]/text()'))
        if selector.xpath('//i[contains(@class,"i-user-post")]'):
            user_post = selector.xpath(
                '//i[contains(@class,"i-user-post")]')[0].getparent()
            people.post = "".join(user_post.xpath('text()')).strip()
        if selector.xpath('//i[contains(@class,"i-user-visits")]'):
            user_visits = selector.xpath(
                '//i[contains(@class,"i-user-visits")]')[0].getparent()
            user_visits_str = "".join(user_visits.xpath('text()'))
            people.home_visit_num = re.findall(r'(\d+)', user_visits_str)[0]
        people_type_spans = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/p[3]/span')
        people.user_type = people_type_spans[0].xpath(
            'a/em/text()')[0].replace("»", "").strip()
        people.weiwang_num = people_type_spans[1].xpath('em/text()')[0]
        people.agree_num = people_type_spans[2].xpath('em/text()')[0]
        people.thanks_num = people_type_spans[3].xpath('em/text()')[0]
        people.gold_num = people_type_spans[4].xpath('em/text()')[0]
        if '+' in people.gold_num:
            people.gold_num = 100
        if selector.xpath('//span[contains(text(),"最后活跃")]'):
            last_active_time_str = selector.xpath(
                '//span[contains(text(),"最后活跃")]')[0].getparent().getnext(
                ).xpath('text()')[0]
            people.last_active_time = str2datetime(last_active_time_str)
        CommonOper.add_one(people)
        CommonOper.add_filter_key("people_id", people_id)
    except Exception as e:
        jsl_log.warning(
            "get people info error,people_id:{},here are details {}".format(
                people_id,
                format_tb(e.__traceback__)[0]))
    app.send_task("tasks.people.do_follow",
                  args=(
                      people_id,
                      0,
                  ),
                  queue="people_queue",
                  routing_key="people")
Example #30
def crawl_user_info(name):
    """Crawl the user's home page info
    :param name: user name
    :return: None
    """
    if not name:
        return None

    crawler.info(f"received task crawl_user_info {name}")
    user, other_crawled = get_profile(name)
    if not other_crawled:
        crawler.info(f"send task crawl_follower_fans {user.name}")
        app.send_task("tasks.user.crawl_follower_fans", args=(user.name, ))
Example #31
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('This round of simulated login is starting')
    for info in infos:
        try:
            rs = Cookies.check_login_task(info.name)
        except KeyError:
            log.crawler.warning('Please check whether the worker has been started and login_queue has been specified')
        else:
            if not rs:
                app.send_task('tasks.login.login_task',
                              args=(info.name, info.password,
                                    info.need_verify),
                              queue='login_queue',
                              routing_key='for_login')
Example #32
def get_douban_subject_id_list(tag, sort, page_start, page_limit):
    """
    Get the douban movie id list by subject
    :param tag:
    :param sort:
    :param page_start:
    :param page_limit:
    :return:
    """
    app.send_task('tasks.movie.douban_get_subject_id_list',
                  args=(
                      tag,
                      sort,
                      page_start,
                      page_limit,
                  ))
Example #33
def task_filter(task_type, param):
    if task_type == 'question':
        if not CommonOper.is_exist("question_id", param):
            app.send_task('tasks.question.do_question',
                          args=(param, ),
                          queue='question_queue',
                          routing_key='question')
        else:
            jsl_log.info("The question already exists, question_id: {}".format(param))
    elif task_type == 'people':
        if not CommonOper.is_exist("people_id", param):
            app.send_task('tasks.people.do_people',
                          args=(param, ),
                          queue='people_queue',
                          routing_key='people')
        else:
            jsl_log.info("The people record already exists, people_id: {}".format(param))
Example #34
def excute_start_crawler(parameter):
    crawler_info.info("Task started!")
    result = app.send_task('tasks.start_task.start_crawler',
                           args=(parameter["id"], ),
                           queue='crawler_queue',
                           routing_key='for_crawler')
    crawler_info.info(result.task_id)
    return result.task_id
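
send_task returns an AsyncResult, which is where result.task_id above comes from. A short sketch of how a caller could check on that task later, assuming app has a result backend configured:

result = app.AsyncResult(task_id)  # rebuild a handle from the stored task_id
if result.ready():
    crawler_info.info('crawler task {} finished with state {}'.format(task_id, result.state))
else:
    crawler_info.info('crawler task {} is still pending'.format(task_id))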
Example #35
def crawl_person_infos(uid):
    """
    Crawl the user's profile and their follow and fan counts by user id.
    Due to weibo server-side limits, only the first five pages are crawled by default,
    and an enterprise account's follows and fans cannot be viewed.
    :param uid: user id
    :return: None
    """
    if not uid:
        return

    # Since the data table is shared with other tasks, first check whether the user info
    # already exists in the database before crawling
    user = user_get.get_profile(uid)
    # Don't crawl enterprise accounts
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                  routing_key='for_fans_followers')
Example #36
def start_crawler(parameter):
    parameters = {}
    parameters["id"] = parameter
    params = spider_task.start_task(parameters)
    for item in params:
        crawler = Crawleruning()
        crawler.set_parameter(item)
        crawler.start()
        target_url = crawler.process()

        for sub_url in target_url:
            item["url"] = sub_url
            app.send_task('tasks.start_task.parse_url',
                          args=(item, ),
                          queue='crawler_queue',
                          routing_key='for_crawler')
    parameters["status"] = 0
    spider_task.update_status(parameters)
Example #37
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Due to limits on weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user, is_crawled = user_get.get_profile(uid)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                      routing_key='for_fans_followers')
Example #38
def excute_comment_task():
    # Only root comments are parsed; comments under root comments are not crawled.
    # Modify as needed if you want them.
    weibo_datas = wb_data.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data.weibo_id,), queue='comment_crawler',
                      routing_key='comment_info')
Example #39
def excute_search_task():
    keywords = get_search_keywords()
    for each in keywords:
        app.send_task('tasks.search.search_keyword', args=(each[0], each[1]), queue='search_crawler',
                      routing_key='for_search_info')
Example #40
def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # todo: no test data found for video yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&amp;", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # todo: format the date; there is noise such as "今日 XX:XX" (today) and "X分钟前" (X minutes ago)
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:'+each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    return wb_data, is_all_cont
Example #41
def excute_user_task():
    seeds = get_seed_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,), queue='user_crawler',
                          routing_key='for_user_info')