Example #1
from urllib import parse as url_parse

from logger import crawler
from .workers import app
from page_get import get_page
from config import get_max_search_page
from page_parse import search as parse_search
from db.dao import (KeywordsOper, KeywordsDataOper, WbDataOper)

# This URL is only for original (non-reposted) weibos.
# For other kinds of search, change the URL below.
URL = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}'
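# For example, keyword '测试' on page 1 formats to the following
# (url_parse.quote percent-encodes the keyword's UTF-8 bytes):
# http://s.weibo.com/weibo/%E6%B5%8B%E8%AF%95&scope=ori&suball=1&page=1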
LIMIT = get_max_search_page() + 1
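# LIMIT is one past the last page, so `while cur_page < LIMIT` below
# crawls pages 1 .. get_max_search_page() inclusive.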


@app.task(ignore_result=True)
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning(
                'No result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return
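        # Minimal continuation sketch: the example is cut off above, and
        # without it the while loop would never advance. The helper methods
        # used here (get_search_info, insert_keyword_wbid, get_wb_by_mid,
        # add_one) are assumed from the page_parse/db.dao imports and are
        # not shown in this snippet.
        search_list = parse_search.get_search_info(search_page)
        for wb_data in search_list:
            # Record the keyword-weibo relation, then store unseen weibos.
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if not WbDataOper.get_wb_by_mid(wb_data.weibo_id):
                WbDataOper.add_one(wb_data)
        cur_page += 1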
Example #2
from urllib import parse as url_parse

from logger import crawler
from .workers import app
from page_get import get_page
from config import get_max_search_page
from page_parse import search as parse_search
from db.dao import (
    KeywordsOper, KeywordsDataOper, WbDataOper)


# This URL is only for original (non-reposted) weibos.
# For other kinds of search, change the URL below, but note that doing
# so may also require rewriting part of the parsing code.
URL = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}'
# Use this URL instead if the search returns too few results
# URL = 'http://s.weibo.com/weibo/{}&nodup=1&page={}'
LIMIT = get_max_search_page() + 1


@app.task(ignore_result=True)
def search_keyword(keyword, keyword_id):
    crawler.info('Searching for keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # Currently every page is fetched with login (auth_level=2);
        # crawling page one without login may be supported later.
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning(
                'No search result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return
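        # Same minimal continuation sketch as in Example #1 (the helper
        # method names are assumed from the imports, not shown here).
        search_list = parse_search.get_search_info(search_page)
        for wb_data in search_list:
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if not WbDataOper.get_wb_by_mid(wb_data.weibo_id):
                WbDataOper.add_one(wb_data)
        cur_page += 1

Since search_keyword is registered as a Celery task on app, it can be called
synchronously for debugging or enqueued through the broker; ignore_result=True
means Celery discards the return value instead of storing it in a result
backend. The keyword and id values below are illustrative:

search_keyword('iphone', 1)          # direct call, no broker needed
search_keyword.delay('iphone', 1)    # enqueue via the configured broker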