import logging
import os

from grab import Grab

# SearchEngine and CaptchaException are assumed to be defined elsewhere in the project.


class GrabEngine(SearchEngine):
    def __init__(self, log_file, address, log_dir='', debug=False,
                 language='ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
                 charset='utf-8', sleep_max=4):
        log_path = os.path.join(log_dir, log_file) if log_dir else log_file
        self.crawler = Grab(log_file=log_path)
        # Option `debug` enables saving of outgoing request headers
        self.crawler.setup(debug=debug)
        self.crawler.setup(headers={'Accept-Language': language,
                                    'Accept-Charset': charset})
        #self.crawler.setup(proxy='113.122.35.15:8909', proxy_type='http',
        #                   connect_timeout=25, timeout=25)
        self.sleep_max = sleep_max if sleep_max > 1 else 1
        self.address = address

    def sleep(self):
        # Random pause of up to `sleep_max` seconds between requests
        self.crawler.sleep(limit2=self.sleep_max)

    def check_captcha(self):
        # Raise if the captcha selector matches anything on the current page
        if self.crawler.xpath_list(self.captcha_selector):
            raise CaptchaException()
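# A minimal usage sketch, not part of the original module: it assumes a concrete
# subclass supplies `captcha_selector` (the XPath below is purely illustrative)
# and that the target site is reachable.
class ExampleEngine(GrabEngine):
    captcha_selector = '//form[contains(@action, "captcha")]'


def run_example_engine():
    engine = ExampleEngine(log_file='requests.log', address='http://example.com',
                           log_dir='/tmp', debug=True, sleep_max=6)
    engine.crawler.go(engine.address)  # fetch the start page into engine.crawler
    engine.check_captcha()             # raises CaptchaException if a captcha form is found
    engine.sleep()                     # random pause of up to sleep_max seconds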
def search(query, grab=None, limit=None, per_page=None):
    if not grab:
        grab = Grab()
    stop = False
    count = 0
    grab.clear_cookies()
    if grab.proxylist:
        grab.change_proxy()
    for page in xrange(1, 9999):
        if stop:
            break
        url = build_search_url(query, page, per_page=per_page)
        index_size = None
        # go() fetches the URL into the same Grab object, so no rebinding is needed
        grab.go(url)
        #grab = google_request(url, grab=grab)
        count = 0  # number of results parsed on the current page
        for item in parse_search_results(grab):
            yield item  # {url, title, index_size}
            count += 1
        if not count:
            stop = True
        if is_last_page(grab):
            logging.debug('Last page found')
            stop = True
        if limit is not None and count >= limit:
            logging.debug('Limit %d reached' % limit)
            stop = True
        grab.sleep(3, 5)  # random pause of 3-5 seconds between result pages
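# A minimal usage sketch, not part of the original module: the query and limit
# values are hypothetical, and it relies on the build_search_url /
# parse_search_results / is_last_page helpers used by search() above.
def run_example_search():
    logging.basicConfig(level=logging.DEBUG)
    for item in search(u'site:example.com', grab=Grab(), limit=10, per_page=10):
        print item['url'], item['title']  # each item also carries `index_size`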