Example #1
import os

from grab import Grab


class GrabEngine(SearchEngine):
    def __init__(self, log_file, address, log_dir='', debug=False,
                 language='ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
                 charset='utf-8', sleep_max=4):
        log_path = os.path.join(log_dir, log_file) if log_dir else log_file
        self.crawler = Grab(log_file=log_path)
        self.crawler.setup(debug=debug)  # `debug` enables saving of outgoing request headers
        self.crawler.setup(headers={'Accept-Language': language,
                                    'Accept-Charset': charset})
        #self.crawler.setup(proxy='113.122.35.15:8909', proxy_type='http', connect_timeout=25, timeout=25)
        self.sleep_max = sleep_max if sleep_max > 1 else 1  # enforce at least a 1 second pause
        self.address = address

    def sleep(self):
        # Pause for a random interval of 1..sleep_max seconds.
        self.crawler.sleep(limit2=self.sleep_max)

    def check_captcha(self):
        # `captcha_selector` is an XPath query expected to be set by subclasses.
        if self.crawler.xpath_list(self.captcha_selector):
            raise CaptchaException()
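
A minimal usage sketch follows. `SearchEngine` and `CaptchaException` come from the surrounding project and are not shown above; the subclass name and the XPath below are hypothetical placeholders.

# Hypothetical subclass; real engines would define their own selector.
class ExampleEngine(GrabEngine):
    captcha_selector = '//img[contains(@src, "captcha")]'

engine = ExampleEngine(log_file='search.log',
                       address='https://example.com/search', debug=True)
engine.crawler.go(engine.address)  # fetch a page through the wrapped Grab instance
engine.check_captcha()             # raises CaptchaException if the selector matches
engine.sleep()                     # polite random pause before the next request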
Example #2
import logging

from grab import Grab


def search(query, grab=None, limit=None, per_page=None):
    if not grab:
        grab = Grab()
    stop = False
    total = 0

    grab.clear_cookies()
    if grab.proxylist:
        grab.change_proxy()

    for page in xrange(1, 9999):
        if stop:
            break
        url = build_search_url(query, page, per_page=per_page)
        # go() returns the response object, not the Grab instance,
        # so do not reassign its result to `grab`
        grab.go(url)
        #grab = google_request(url, grab=grab)

        count = 0
        for item in parse_search_results(grab):
            yield item  # dict with url, title and index_size keys
            count += 1
            total += 1

        if not count:
            stop = True

        if is_last_page(grab):
            logging.debug('Last page found')
            stop = True

        if limit is not None and total >= limit:
            logging.debug('Limit %d reached', limit)
            stop = True

        grab.sleep(3, 5)
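
A short driver for the generator above, assuming `build_search_url`, `parse_search_results` and `is_last_page` are helpers defined in the same module (they are not shown here); each yielded item is the dict described by the inline comment.

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    for result in search('python grab', limit=20, per_page=10):
        print result['url'], result['title']  # index_size may be None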