def solve_captcha(solver, grab, url=None, recognition_delay=5,
                  recognition_time=120, **kwargs):
    """
    Submit a captcha image to a solving service and poll for the solution.

    :param solver: CaptchaService object
    :param grab: grab object with captcha image in body
    :param url: optional URL to fetch the captcha image from first
    :param recognition_delay: seconds to wait between solution polls
    :param recognition_time: total seconds to keep polling
    :return: grab object with captcha solution

    The function is subroutine that must be used in the inline task:

        class Bot(Spider):
            def task_generator(self):
                grab = self.create_grab_instance()
                url = 'http://captcha.ru/'
                grab.setup(url=url)
                yield Task('foo', grab=grab)

            @inline_task
            def task_foo(self, grab, task):
                solver = CaptchaSolver('antigate', 'grab',
                                       api_key='some api key')
                captcha_grab = grab.clone()
                url = 'http://captcha.ru/captcha2/'
                captcha_grab.setup(url=url)
                captcha_grab = yield Task(grab=captcha_grab)
                solution_grab = yield solve_captcha(solver, captcha_grab)
                request_data = {'code': solution_grab.response.code,
                                'body': solution_grab.response.body}
                solution = solver.captcha_backend\
                    .parse_check_solution_response(request_data)

        b = Bot()
        b.run()
    """
    if url:
        # An explicit URL was given: fetch the captcha image first.
        grab = grab.clone()
        grab.setup(url=url)
        grab = yield Task(grab=grab)
    logger.debug('Got captcha image')
    data = solver.captcha_backend\
        .get_submit_captcha_request_data(grab.response.body, **kwargs)
    antigate_grab = solver.network_backend.make_grab_instance(**data)
    antigate_grab = yield Task(grab=antigate_grab)
    captcha_id = solver.captcha_backend\
        .parse_submit_captcha_response(response_to_dict(antigate_grab))
    data = solver.captcha_backend.get_check_solution_request_data(captcha_id)
    antigate_grab = solver.network_backend.make_grab_instance(**data)
    # Normalize the delay: a zero/None recognition_delay would make
    # range() raise ValueError (step must not be zero).
    delay = recognition_delay or 1
    for _ in range(0, recognition_time, delay):
        # BUGFIX: poll with the normalized `delay`, not the raw
        # `recognition_delay`, so a 0/None value cannot produce a
        # zero-delay busy poll while range() assumes step `delay`.
        antigate_grab = yield Task(grab=antigate_grab, delay=delay)
        try:
            solver.captcha_backend\
                .parse_check_solution_response(response_to_dict(antigate_grab))
        except SolutionNotReady:
            logger.debug('Solution is not ready')
        else:
            # Solution parsed successfully: stop polling.
            return
    # Timed out without a solution; caller receives no solved grab.
    logger.debug('Captcha was not solved in %d seconds', recognition_time)
def process_links(self, grab, task_name, xpath,
                  resolve_base=False, limit=None, **kwargs):
    """
    Generate a new task for every unique URL matched by an xpath.

    :param grab: Grab instance
    :param task_name: name of task to generate
    :param xpath: xpath expression which calculates list of URLS
    :param resolve_base: resolve relative URLs against the document base
    :param limit: maximum number of tasks to generate (None = unlimited)

    Example::

        self.process_links(grab, 'topic', '//div[@class="topic"]/a/@href')
    """
    urls = set()
    count = 0
    for url in grab.xpath_list(xpath):
        url = grab.make_url_absolute(url, resolve_base=resolve_base)
        # Idiomatic membership test (was: `not url in urls`)
        if url not in urls:
            urls.add(url)
            g2 = grab.clone(url=url)
            self.add_task(Task(task_name, grab=g2, **kwargs))
            count += 1
            if limit is not None and count >= limit:
                break
def process_object_image(self, task_name, collection, obj, image_field,
                         image_url, base_dir, ext='jpg', skip_existing=True):
    """
    Record the local path of an image on the database object, or
    schedule a download task when the file is not on disk yet.
    """
    target_path = os.path.join(base_dir, hashed_path(image_url, ext=ext))
    if os.path.exists(target_path) and skip_existing:
        # Image already downloaded: just refresh the stored path/url.
        update_spec = {
            '$set': {
                '%s_path' % image_field: target_path,
                '%s_url' % image_field: image_url,
            }
        }
        collection.update({'_id': obj['_id']}, update_spec)
    else:
        download_task = Task(task_name, url=image_url, obj=obj,
                             disable_cache=True, image_field=image_field,
                             collection=collection, base_dir=base_dir,
                             ext=ext)
        self.add_task(download_task)
def follow_links(self, grab, xpath, task_name, task=None):
    """
    DEPRECATED, WILL BE REMOVED

    Use :meth:`process_links` instead.

    Args:
        :xpath: xpath expression which calculates list of URLS

    Example::

        self.follow_links(grab, '//div[@class="topic"]/a/@href', 'topic')
    """
    logger.error(
        'Method follow_links is deprecated. Use process_links method instead.'
    )
    urls = []
    for url in grab.xpath_list(xpath):
        # Resolve relative links against the URL of the current document.
        url = urljoin(grab.config['url'], url)
        # Idiomatic membership test (was: `not url in urls`)
        if url not in urls:
            urls.append(url)
            g2 = grab.clone()
            g2.setup(url=url)
            self.add_task(Task(task_name, grab=g2))
def task_submit_captcha(self, grab, task):
    """Parse the captcha id from the submit response and schedule polling."""
    backend = self.solver.captcha_backend
    captcha_id = backend.parse_submit_captcha_response(response_to_dict(grab))
    request_data = backend.get_check_solution_request_data(captcha_id)
    poll_grab = self.solver.network_backend.make_grab_instance(**request_data)
    poll_delay = task.meta.get('delay', 5)
    yield Task('check_solution', grab=poll_grab, delay=poll_delay,
               meta=task.meta)
def load_initial_urls(self):
    """
    Create initial tasks from `self.initial_urls`.

    Tasks are created with name "initial".
    """
    for initial_url in (self.initial_urls or []):
        self.add_task(Task('initial', url=initial_url))
def start_task_generator(self):
    """
    Process `self.initial_urls` list and `self.task_generator` method.

    Generate first portion of tasks.
    """
    logger_verbose.debug('Processing initial urls')
    for initial_url in (self.initial_urls or []):
        self.add_task(Task('initial', url=initial_url))
    self.task_generator_object = self.task_generator()
    self.task_generator_enabled = True
    # Prime the queue with tasks before the spider starts working.
    self.process_task_generator()
def start_task_generators(self):
    """
    Process `self.initial_urls` list and `self.task_generator` method.
    """
    logger_verbose.debug('Processing initial urls')
    if self.initial_urls:
        # pylint: disable=not-an-iterable
        for initial_url in self.initial_urls:
            self.add_task(Task('initial', url=initial_url))
    # Run the task generator in a daemon thread so it cannot keep
    # the process alive after the spider shuts down.
    wrapper = TaskGeneratorWrapperThread(self.task_generator(), self)
    wrapper.daemon = True
    wrapper.start()
    self._task_generator_list = [wrapper]
def start_task_generators(self):
    """
    Process `self.initial_urls` list and `self.task_generator` method.
    """
    logger_verbose.debug('Processing initial urls')
    for initial_url in (self.initial_urls or []):
        self.add_task(Task('initial', url=initial_url))
    # Run the generator wrapper in a daemon thread so it cannot keep
    # the process alive after the spider shuts down.
    worker = Thread(target=self.task_generator_thread_wrapper,
                    args=[self.task_generator()])
    worker.daemon = True
    worker.start()
    self._task_generator_list = [worker]
def handler(self, collection, obj, set_field, base_dir, task_args=None,
            grab_args=None, callback=None):
    """
    Ensure every image listed in ``obj[set_field]`` exists on disk.

    For images already downloaded only the stored path is refreshed in
    the database; missing images produce download tasks.

    :param collection: name of the mongo collection holding `obj`
    :param obj: document containing the image set
    :param set_field: name of the field holding the list of image dicts
    :param base_dir: root directory for downloaded images
    :param task_args: extra kwargs copied into each generated Task
    :param grab_args: extra kwargs applied to each Grab instance
    :param callback: task callback (defaults to `image_set_handler`)
    """
    from database import db
    for image in obj.get(set_field, []):
        path = hashed_path(image['url'], base_dir=base_dir)
        if os.path.exists(path):
            # BUGFIX: use .get() so an image record without a stored
            # `path` key does not raise KeyError (consistent with the
            # single-image handler, which uses obj.get(path_field)).
            if path != image.get('path'):
                db[collection].update(
                    {
                        '_id': obj['_id'],
                        ('%s.url' % set_field): image['url']
                    },
                    {'$set': {
                        ('%s.$.path' % set_field): path
                    }})
        else:
            kwargs = {}
            if task_args:
                # Copy so shared task_args are never mutated downstream.
                kwargs = deepcopy(task_args)
            g = Grab()
            g.setup(url=image['url'])
            g.setup(referer=build_image_hosting_referer(image['url']))
            if grab_args:
                g.setup(**grab_args)
            yield Task(callback=callback or image_set_handler,
                       grab=g,
                       collection=collection,
                       path=path,
                       obj=obj,
                       image=image,
                       set_field=set_field,
                       disable_cache=True,
                       backup=g.dump_config(),
                       **kwargs)
def handler(self, url, collection, obj, path_field, base_dir,
            task_args=None, grab_args=None, callback=None):
    """
    Download a single image unless it is already stored on disk.

    When the file exists, only the stored path on the database object
    is refreshed (if stale); otherwise a download task is yielded.
    """
    from database import db
    path = hashed_path(url, base_dir=base_dir)
    if not os.path.exists(path):
        # File missing: schedule a download task.
        extra = deepcopy(task_args) if task_args else {}
        g = Grab()
        g.setup(url=url)
        g.setup(referer=build_image_hosting_referer(url))
        if grab_args:
            g.setup(**grab_args)
        yield Task(callback=callback or image_handler,
                   grab=g,
                   collection=collection,
                   path=path,
                   obj=obj,
                   path_field=path_field,
                   disable_cache=True,
                   backup=g.dump_config(),
                   **extra)
    elif path != obj.get(path_field, None):
        # File present but the stored path is stale: sync it.
        db[collection].update({'_id': obj['_id']},
                              {'$set': {path_field: path}})
def process_initial_urls(self):
    """Schedule an "initial" task for every URL in `self.initial_urls`."""
    for initial_url in (self.initial_urls or []):
        self.add_task(Task('initial', url=initial_url))
def task_download_captcha(self, grab, task):
    """Forward the downloaded captcha image to the solving service."""
    logger.debug('Got captcha image')
    submit_grab = self.solver.backend.get_submit_captcha_request(
        grab.response.body)
    yield Task('submit_captcha', grab=submit_grab, meta=task.meta)
def task_submit_captcha(self, grab, task):
    """Parse the captcha id and schedule a solution-polling task."""
    captcha_id = self.solver.backend.parse_submit_captcha_response(
        grab.response)
    check_grab = self.solver.backend.get_check_solution_request(captcha_id)
    yield Task('check_solution', grab=check_grab, delay=5, meta=task.meta)
def task_download_captcha(self, grab, task):
    """Build and schedule the captcha-submit request for the image body."""
    logger.debug('Got captcha image')
    request_data = self.solver.captcha_backend \
        .get_submit_captcha_request_data(grab.response.body)
    submit_grab = self.solver.network_backend.make_grab_instance(
        **request_data)
    yield Task('submit_captcha', grab=submit_grab, meta=task.meta)