Example #1
def execute_info_7x24():
    crawler.info('The info_7x24 task is starting...')
    for au_message724_field in parse_info_7x24_list():
        app.send_task('tasks.info_7x24.save_info_7x24_list',
                      args=(au_message724_field, ),
                      queue='info_7x24',
                      routing_key='for_info_7x24')
Example #2
def save_info_7x24_list(au_message724_field):
    crawler.info('The info_7x24 list task is starting...')
    AuMessage724.add(au_message724_field)

    app.send_task('tasks.info_7x24.save_info_7x24_content',
                  args=(au_message724_field['ArticleID'], ),
                  queue='info_7x24',
                  routing_key='for_info_7x24')
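Example #1 dispatches this task by its string name 'tasks.info_7x24.save_info_7x24_list', and Example #2 shows the function that handles it, but neither shows how the name is bound to the function. Below is a minimal sketch of how that binding could look in a standard Celery setup; the broker URL and module layout are assumptions, not taken from the source project.

from celery import Celery

app = Celery('crawler', broker='redis://localhost:6379/0')  # assumed broker URL

@app.task(name='tasks.info_7x24.save_info_7x24_list')
def save_info_7x24_list(au_message724_field):
    # body as in Example #2: persist the item, then chain the content task
    ...

# A worker bound to the queue used above would then be started with:
#   celery -A tasks worker -Q info_7x24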
Example #3
def execute_repost_task():
    # treat the current weibo url as the original url; you could also analyse it from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
Example #4
def execute_repost_task():
    # treat the current weibo url as the original url; you could also analyse it from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(
        len(weibo_datas)))

    for weibo_data in weibo_datas:
        crawl_repost_page(weibo_data.weibo_id, weibo_data.uid)
Example #5
def execute_market():
    crawler.info('The market task is starting...')
    for b_market_field in parse_market_list():

        app.send_task('tasks.market.save_market',
                      args=(b_market_field,),
                      queue='market',
                      routing_key='for_market')
Example #6
def save_trade_info_list(a_information_field):
    crawler.info('The trade info list task is starting...')
    BDxtInformation.add(a_information_field)

    app.send_task('tasks.trade_info.save_trade_info_content',
                  args=(a_information_field['information_id'],),
                  queue='trade_info',
                  routing_key='for_trade_info')
Example #7
def execute_user_task():
    seeds, is_exists = SeedUser.get_seed_names()
    if is_exists:
        for seed in seeds:
            crawler.info(f"send task crawl_user_info {seed.name}")
            app.send_task("tasks.user.crawl_user_info", args=(seed.name, ))
    else:
        crawler.info("find no user, abort")
Example #8
def execute_info_trade():
    crawler.info('The trade info task is starting...')
    for a_information_field in parse_info_trade_list():

        app.send_task('tasks.trade_info.save_trade_info_list',
                      args=(a_information_field,),
                      queue='trade_info',
                      routing_key='for_trade_info')
Example #9
def execute_login_task():
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each time for login
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
Example #10
def get_page(url):
    crawler.info("the crawling url is {url}".format(url=url))
    # proxies = get_proxy()
    time.sleep(REQUEST_INTERVAL)
    crawler.info(f"sleep {REQUEST_INTERVAL}")

    resp = requests_retry_session().get(url,
                                        headers=headers,
                                        timeout=REQUEST_TIMEOUT)

    return resp.text
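The requests_retry_session() helper used above is not shown in this example. Below is a minimal sketch of the common pattern such a helper typically follows, namely a requests.Session with an urllib3 Retry adapter mounted on it; the retry count, backoff factor, and status list are assumptions.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.5,
                           status_forcelist=(500, 502, 503, 504),
                           session=None):
    # reuse an existing session if one is passed in
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session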
Example #11
def crawl_user_info(name):
    """抓取用户首页的信息
    :param name: 用户名
    :return: None
    """
    if not name:
        return None

    crawler.info(f"received task crawl_user_info {name}")
    user, other_crawled = get_profile(name)
    if not other_crawled:
        crawler.info(f"send task crawl_follower_fans {user.name}")
        app.send_task("tasks.user.crawl_follower_fans", args=(user.name, ))
Example #12
def crawl(mid):
    # gu = BiliUser(mid)
    crawl_code = -1
    try:
        res = BiliUser.store(mid)
        if res:
            crawl_code = 0
            crawler.info('%s_%d' % (mid, crawl_code))
    except Exception:
        crawler.info('%s_%d' % (mid, crawl_code))
        crawl_code = -1

    # time.sleep(1)
    return crawl_code
Example #13
def execute_login_task():
    # fetch all the weibo accounts that need to log in
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each time for login
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        # send args to the task named below so that it starts running;
        # the queue parameter specifies the queue used to route the task,
        # and the routing key is given by the routing_key parameter
        # (a worker-side queue declaration sketch follows this example)
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
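The comments above describe how the queue and routing_key parameters route the login task. The worker-side queue declaration is not part of the example; this is a minimal sketch of what it could look like in a standard Celery configuration, with the broker URL and exchange name as assumptions.

from celery import Celery
from kombu import Exchange, Queue

app = Celery('crawler', broker='redis://localhost:6379/0')  # assumed broker URL

# Declare the queue so that messages sent with routing_key='for_login'
# are delivered to 'login_queue'.
app.conf.task_queues = (
    Queue('login_queue', Exchange('login_queue'), routing_key='for_login'),
)
# A worker then consumes it with: celery -A <module> worker -Q login_queue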
Example #14
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning(
                'No result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return
Example #15
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # current only for login, maybe later crawling page one without login
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # We need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)

            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
Example #16
def save_info_7x24_content(article_id):
    crawler.info('The info_7x24 content task is starting...')
    crawler.info('The info_7x24 content article id is :{}'.format(article_id))
    res_div = parse_content(article_id)
    crawler.info('The info_7x24 content is :{}...'.format(res_div[:20]))

    AuMessage724.objects(ArticleID=article_id).update(ArticleContent=res_div)
Example #17
def save_trade_info_content(information_id):
    crawler.info('The trade info content list task is starting...')
    crawler.info('The trade info content info id is :{}'.format(information_id))
    res_div = parse_content(information_id)

    crawler.info('The trade info content is :{}...'.format(res_div[:20]))
    BDxtInformation.objects(information_id=information_id).update(
        content=res_div)
Example #18
def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]

    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(
            keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # current only for login, maybe later crawling page one without login
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break
            if not search_page:
                crawler.warning(
                    'No search result for keyword {}, the source page is {}'.
                    format(keyword, search_page))
                cur_page += 1
                continue
                # return

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has been stored in mysql,
            # We don't need to crawl the same keyword in this turn
            for wb_data in search_list:
                # print(wb_data)
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # todo incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(
                        wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos',
                                  args=(wb_data.uid, ),
                                  queue='user_crawler',
                                  routing_key='for_user_info')
Example #19
def search_keyword_topic(keyword, keyword_id, start_time='', end_time=''):
    crawler.info(
        'We are crawling weibo topic content with keyword "{}"'.format(
            keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, start_time, end_time, cur_page)
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.info(
                'No such result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_topic.get_search_info(search_page)
        if cur_page == 1:
            cur_page += 1
        # the "try another keyword" hint is absent, so more results remain
        elif '您可以尝试更换关键词' not in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return

        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(
                    wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
Example #20
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # current only for login, maybe later crawling page one without login
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # We don't need to crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
Example #21
start_time = datetime.datetime.strptime(STARTTIME, "%Y-%m-%d %H:%M:%S")
end_time = datetime.datetime.strptime(ENDTIME, "%Y-%m-%d %H:%M:%S")

OneHour = datetime.timedelta(hours=1)

time1 = start_time
time2 = start_time + OneHour


def get_topic_data(keyword, start_time='', end_time=''):
    from tasks.topic import search_keyword_topic
    search_keyword_topic(keyword, 33, start_time, end_time)


while 1:
    while time1 < end_time:
        start_time_str = time1.strftime("%Y-%m-%d-%H")
        end_time_str = time2.strftime("%Y-%m-%d-%H")
        print(start_time_str, '--', end_time_str)

        crawler.info(
            "we are crawling keyword:{}, timerange {}:{} content".format(
                KEYWORD, start_time_str, end_time_str))
        get_topic_data(KEYWORD, start_time_str, end_time_str)
        time1 = time1 + OneHour
        time2 = time2 + OneHour

    time1 = start_time
    time2 = start_time + OneHour
Example #22
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means nothing is needed, 1 means no login but cookies are needed, 2 means login is required
    :param is_ajax: whether the request is an ajax request
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; when an exception is raised, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired'
                )
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https address.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=COOKIES,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'Exceptions are raised when crawling {}. Here are details: {}'.
                format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(
                    name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
Example #23
# -*- coding: utf8 -*-
"""
test logger
"""

import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from logger import crawler, storage

crawler.info('crawler')
storage.info('database connect error')
Example #24
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means nothing is needed, 1 means no login but cookies are needed, 2 means login is required
    :param is_ajax: whether the request is an ajax request
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; when an exception is raised, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https address.
            proxy = {'http': name_cookies[2], 'https': name_cookies[2], }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')
        
        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT, verify=False, proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT, verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exceptions are raised when crawling {}. Here are details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
Example #25
def execute_hot_list_task():
    crawler.info(f"send task hot_list")
    hot_list_title = ["total", "science", "digital", "sport", "fashion", "film"]
    for title in hot_list_title:
        app.send_task("tasks.hot_list.crawl_hot_list", args=(title,))
Example #26
def crawl_hot_list(title):
    if not title:
        return None

    crawler.info(f"received task crawl_hot_list {title}")
    get_hot_list(title)
Example #27
def save_market(b_market_field):
    crawler.info('The save market task is starting...')
    BDxtMarketHouse.add(b_market_field)