Example 1
def excute_search_task():
    keywords = get_search_keywords()
    for each in keywords:
        celery.send_task('celery_tasks.weibo.search.search_keyword',
                         args=(each[0], each[1]),
                         queue='search_crawler',
                         routing_key='for_search_info')
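All of these dispatcher functions address tasks by their dotted name, so a worker has to register a task under exactly that name. Below is a minimal sketch of the consumer side for Example 1, assuming the shared Celery app is the one exposed by apps.celery_init; the decorator and placeholder body are illustrative, not the project's actual module (the real crawling logic for this task appears in Example 10).

from apps.celery_init import celery

@celery.task(name='celery_tasks.weibo.search.search_keyword')
def search_keyword(keyword, keyword_id):
    # Actual crawling logic lives in the project; see Example 10
    ...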
Example 2
def excute_user_task():
    seeds = get_seed_ids()
    if seeds:
        for seed in seeds:
            celery.send_task('celery_tasks.weibo.user.crawl_person_infos',
                             args=(seed.uid, ),
                             queue='user_crawler',
                             routing_key='for_user_info')
Example 3
def excute_comment_task():
    # Only root comments are parsed here; replies under a root comment are not crawled. Adjust this if you need them.
    weibo_datas = wb_data.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        celery.send_task('celery_tasks.weibo.comment.crawl_comment_page',
                         args=(weibo_data.weibo_id, ),
                         queue='comment_crawler',
                         routing_key='comment_info')
Example 4
def excute_home_task():
    # The crawling strategy here is up to you: you can crawl the home pages of users you already have, or specify particular users. Here I simply use the uids from the seed database.
    id_objs = get_home_ids()
    for id_obj in id_objs:
        celery.send_task('celery_tasks.weibo.home.crawl_weibo_datas',
                         args=(id_obj.uid, ),
                         queue='home_crawler',
                         routing_key='home_info')
Example 5
def excute_user_personal_adver_task(adv_message):
    seeds = get_seed()
    if seeds:
        for seed in seeds:
            sleep(random.randint(1, 6))
            print(seed.uid)
            celery.send_task('celery_tasks.weibo.user.excute_personal_adver',
                             args=(seed.uid, adv_message),
                             queue='personal_adver',
                             routing_key='for_adver')
Example 6
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login celery_tasks before each login
    Cookies.check_login_task()
    log.crawler.info('The excute_login_task is starting...')
    for info in infos:
        celery.send_task('celery_tasks.weibo.login.login_task',
                         args=(info.name, info.password, info.source),
                         queue='login_task',
                         routing_key='login_task')
        time.sleep(10)
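Every send_task call above names a queue and a routing key, so those queues must be declared in the Celery configuration before a worker can consume them. A minimal sketch, assuming direct exchanges and Celery 4-style lowercase settings; the exchange names are assumptions, not the project's actual config:

from kombu import Exchange, Queue
from apps.celery_init import celery

celery.conf.task_queues = (
    Queue('login_task', exchange=Exchange('login_task', type='direct'), routing_key='login_task'),
    Queue('search_crawler', exchange=Exchange('search_crawler', type='direct'), routing_key='for_search_info'),
    Queue('user_crawler', exchange=Exchange('user_crawler', type='direct'), routing_key='for_user_info'),
)

A worker then subscribes only to the queues it should handle, for example: celery -A apps.celery_init worker -Q login_task,user_crawler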
Example 7
def excute_repost_task():
    # Treat the current weibo url as the original url; you could also analyse starting from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(
        len(weibo_datas)))

    for weibo_data in weibo_datas:
        celery.send_task('celery_tasks.weibo.repost.crawl_repost_page',
                         args=(weibo_data.weibo_id, weibo_data.uid),
                         queue='repost_crawler',
                         routing_key='repost_info')
Example 8
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Call this page locally (rather than as a task) so the result comes back immediately
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        celery.send_task('celery_tasks.weibo.comment.crawl_comment_by_page',
                         args=(mid, page_num),
                         queue='comment_page_crawler',
                         routing_key='comment_page_info')
Example 9
def crawl_person_infos(uid):
    """
    Crawl user info along with their fans and followers.
    Due to the limits of Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user = user_get.get_profile(uid)
    # If it's enterprise user, just skip it
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    celery.send_task('celery_tasks.weibo.user.crawl_follower_fans',
                     args=(uid, ),
                     queue='fans_followers',
                     routing_key='for_fans_followers')
Example 10
def search_keyword(keyword, keyword_id):
    cur_page = 1
    # `url` (the search URL template) and `limit` (max search pages) are module-level settings in the source file
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning(
                'No result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info(
                    'keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                celery.send_task('celery_tasks.weibo.user.crawl_person_infos',
                                 args=(wb_data.uid, ),
                                 queue='user_crawler',
                                 routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info(
                'keyword {} has been crawled in this turn'.format(keyword))
            return
Example 11
def jd_seckill_timer_relogin(self):
    task_id = self.request.id

    logger = cl.getLogger('jd_seckill_timer_relogin')
    logger.info('Re-login task started, task_id: ' + task_id)

    ppool = ProxyStore.get_proxyPoolstores()
    mongdb_conn = DBStore.get_datastores()
    mydb = mongdb_conn['JD']
    jd_users = mydb.Users.find({"status": 1})
    for jd_user in jd_users:
        fetch_result = celery.send_task("celery_tasks.jd_seckill.jd_seckill.jd_seckill_relogin_task",
                                        queue='jd_seckill_presell',
                                        args=(jd_user["username"], jd_user["password"], 4099139, ppool.getProxy()))
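The AsyncResult returned by send_task is stored in fetch_result but never used. If a result backend (for example Redis) were configured for this Celery app, the loop could wait for each re-login to finish; a sketch under that assumption:

    for jd_user in jd_users:
        fetch_result = celery.send_task("celery_tasks.jd_seckill.jd_seckill.jd_seckill_relogin_task",
                                        queue='jd_seckill_presell',
                                        args=(jd_user["username"], jd_user["password"], 4099139, ppool.getProxy()))
        # Blocks until the worker returns; requires a configured result backend, otherwise .get() fails
        logger.info('relogin result for %s: %s' % (jd_user["username"], fetch_result.get(timeout=120)))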
Example 12
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user id {}, please check why'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        celery.send_task('celery_tasks.weibo.home.crawl_ajax_page',
                         args=(ajax_url_0, ),
                         queue='ajax_home_crawler',
                         routing_key='ajax_home_info')

        celery.send_task('celery_tasks.weibo.home.crawl_ajax_page',
                         args=(ajax_url_1, ),
                         queue='ajax_home_crawler',
                         routing_key='ajax_home_info')
Example 13
# coding:utf-8
from apps.celery_init import celery
from celery_tasks.weibo import login

if __name__ == '__main__':
    # You should run this file manually, because the celery timer only executes the login task after a delay
    # login.excute_login_task()
    celery.send_task('apps.celery_init.start_add_task',
                     args=(1, 2),
                     queue='start_add_task',
                     routing_key='start_add_task')
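The commented-out hint above refers to the celery timer. Below is a hedged sketch of how a periodic dispatcher such as excute_login_task could be wired to celery beat; the registered task name, interval, queue and routing key are assumptions, not the project's actual schedule.

from datetime import timedelta
from apps.celery_init import celery

celery.conf.beat_schedule = {
    'dispatch-login-tasks': {
        'task': 'celery_tasks.weibo.login.excute_login_task',   # assumed registered task name
        'schedule': timedelta(hours=20),                        # assumed interval
        'options': {'queue': 'login_queue', 'routing_key': 'for_login'},   # assumed routing
    },
}

The scheduler itself is started separately, e.g. celery -A apps.celery_init beat, alongside the workers.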