def solve_captcha(solver,
                  grab,
                  url=None,
                  recognition_delay=5,
                  recognition_time=120,
                  **kwargs):
    """
    Submit a captcha image to a solving service and poll for the solution.

    This is a generator-based subroutine driven by the spider's inline-task
    machinery: each ``yield Task(...)`` hands a network request to the
    spider and the generator resumes with the completed grab object.

    :param solver: CaptchaService object
    :param grab: grab object with captcha image in body
    :param url: optional captcha-image URL; when given, ``grab`` is cloned
        and the image is downloaded first instead of using ``grab``'s body
    :param recognition_delay: seconds to wait between solution polls
    :param recognition_time: total seconds to keep polling before giving up
    :param kwargs: extra options forwarded to the captcha backend's
        submit-request builder
    :return: grab object with captcha solution

    The function is subroutine that must be used in the inline task:

    class Bot(Spider):
        def task_generator(self):
            grab = self.create_grab_instance()
            url = 'http://captcha.ru/'
            grab.setup(url=url)
            yield Task('foo', grab=grab)

        @inline_task
        def task_foo(self, grab, task):
            solver = CaptchaSolver('antigate', 'grab', api_key='some api key')
            captcha_grab = grab.clone()
            url = 'http://captcha.ru/captcha2/'
            captcha_grab.setup(url=url)
            captcha_grab = yield Task(grab=captcha_grab)
            solution_grab = yield solve_captcha(solver, captcha_grab)
            request_data = {'code': solution_grab.response.code,
                            'body': solution_grab.response.body}
            solution = solver.captcha_backend\
                .parse_check_solution_response(request_data)
    b = Bot()
    b.run()
    """
    if url:
        # An explicit URL was given: download the captcha image first.
        grab = grab.clone()
        grab.setup(url=url)
        grab = yield Task(grab=grab)
    logger.debug('Got captcha image')
    # Build and send the "submit captcha" request to the solving service.
    data = solver.captcha_backend\
        .get_submit_captcha_request_data(grab.response.body, **kwargs)
    antigate_grab = solver.network_backend.make_grab_instance(**data)
    antigate_grab = yield Task(grab=antigate_grab)

    # The submit response carries the service-side id used for polling.
    captcha_id = solver.captcha_backend\
        .parse_submit_captcha_response(response_to_dict(antigate_grab))
    data = solver.captcha_backend.get_check_solution_request_data(captcha_id)
    antigate_grab = solver.network_backend.make_grab_instance(**data)

    # NOTE(review): `delay` only guards range() against a zero step; the
    # polling Task below still uses `recognition_delay` directly, so with
    # recognition_delay=0 the task delay is 0 while the loop steps by 1.
    # Looks unintentional — confirm before changing.
    delay = recognition_delay or 1
    for _ in range(0, recognition_time, delay):
        antigate_grab = yield Task(grab=antigate_grab, delay=recognition_delay)
        try:
            solver.captcha_backend\
                .parse_check_solution_response(response_to_dict(antigate_grab))
        except SolutionNotReady:
            logger.debug('Solution is not ready')
        else:
            # Solution parsed successfully: end the generator; the caller
            # receives the last grab via the inline-task protocol.
            return
Exemple #2
0
    def process_links(self,
                      grab,
                      task_name,
                      xpath,
                      resolve_base=False,
                      limit=None,
                      **kwargs):
        """
        Spawn one task per unique URL matched by an xpath expression.

        :param grab: Grab instance whose document is searched
        :param task_name: name of task to generate
        :param xpath: xpath expression which calculates list of URLS
        :param resolve_base: forwarded to ``make_url_absolute`` (resolve
            relative URLs against the document's <base> element)
        :param limit: maximum number of tasks to spawn (None = unlimited)
        :param kwargs: extra keyword arguments forwarded to each Task

        Example::

            self.process_links(grab, 'topic', '//div[@class="topic"]/a/@href')
        """
        seen = set()
        count = 0
        for url in grab.xpath_list(xpath):
            url = grab.make_url_absolute(url, resolve_base=resolve_base)
            # Deduplicate so each unique URL yields exactly one task.
            if url not in seen:
                seen.add(url)
                g2 = grab.clone(url=url)
                self.add_task(Task(task_name, grab=g2, **kwargs))
                count += 1
                if limit is not None and count >= limit:
                    break
Exemple #3
0
 def process_object_image(self,
                          task_name,
                          collection,
                          obj,
                          image_field,
                          image_url,
                          base_dir,
                          ext='jpg',
                          skip_existing=True):
     """Record an already-downloaded image's path on the document, or
     schedule a download task for it.

     The on-disk location is derived from the image URL via hashed_path.
     When the file exists (and skip_existing is set) only the MongoDB
     document is updated; otherwise a fetch task is queued.
     """
     local_path = os.path.join(base_dir, hashed_path(image_url, ext=ext))
     if os.path.exists(local_path) and skip_existing:
         # Image already on disk: just store its location on the document.
         update_spec = {'$set': {'%s_path' % image_field: local_path,
                                 '%s_url' % image_field: image_url}}
         collection.update({'_id': obj['_id']}, update_spec)
     else:
         # Image missing (or re-download forced): enqueue a fetch task
         # carrying everything the handler needs to finish the update.
         fetch_task = Task(task_name,
                           url=image_url,
                           obj=obj,
                           disable_cache=True,
                           image_field=image_field,
                           collection=collection,
                           base_dir=base_dir,
                           ext=ext)
         self.add_task(fetch_task)
Exemple #4
0
    def follow_links(self, grab, xpath, task_name, task=None):
        """
        DEPRECATED, WILL BE REMOVED

        Spawn a task for each unique URL matched by the xpath expression.

        Args:
            :grab: Grab instance whose document is searched
            :xpath: xpath expression which calculates list of URLS
            :task_name: name of the task to generate
            :task: unused; kept for backward compatibility

        Example::

            self.follow_links(grab, '//div[@class="topic"]/a/@href', 'topic')
        """
        logger.error(
            'Method follow_links is deprecated. Use process_links method instead.'
        )

        # Set gives O(1) dedup lookups (the old list was O(n) per URL).
        seen = set()
        for url in grab.xpath_list(xpath):
            # Resolve relative links against the current document URL.
            url = urljoin(grab.config['url'], url)
            if url not in seen:
                seen.add(url)
                g2 = grab.clone()
                g2.setup(url=url)
                self.add_task(Task(task_name, grab=g2))
 def task_submit_captcha(self, grab, task):
     """Parse the captcha-submit response and schedule a solution check.

     Extracts the service-side captcha id, builds the check-solution
     request, and yields a 'check_solution' task after the delay stored
     in the task meta (default 5 seconds).
     """
     backend = self.solver.captcha_backend
     captcha_id = backend.parse_submit_captcha_response(
         response_to_dict(grab))
     request_data = backend.get_check_solution_request_data(captcha_id)
     check_grab = self.solver.network_backend.make_grab_instance(
         **request_data)
     # Honour a caller-provided polling delay, defaulting to 5 seconds.
     check_delay = task.meta.get('delay', 5)
     yield Task('check_solution', grab=check_grab, delay=check_delay,
                meta=task.meta)
Exemple #6
0
    def load_initial_urls(self):
        """
        Create initial tasks from `self.initial_urls`.

        Tasks are created with name "initial".
        """

        # Treat a missing/empty initial_urls attribute as "nothing to do".
        for initial_url in (self.initial_urls or ()):
            self.add_task(Task('initial', url=initial_url))
Exemple #7
0
    def start_task_generator(self):
        """
        Process `self.initial_urls` list and `self.task_generator`
        method.  Generate first portion of tasks.
        """

        logger_verbose.debug('Processing initial urls')
        # Queue one 'initial' task per configured start URL (if any).
        for initial_url in (self.initial_urls or ()):
            self.add_task(Task('initial', url=initial_url))

        # Instantiate the generator and pull the first batch of tasks
        # before the spider enters its main loop.
        self.task_generator_object = self.task_generator()
        self.task_generator_enabled = True
        self.process_task_generator()
Exemple #8
0
    def start_task_generators(self):
        """
        Process `self.initial_urls` list and `self.task_generator`
        method.
        """

        logger_verbose.debug('Processing initial urls')
        # Queue one 'initial' task per configured start URL (if any).
        if self.initial_urls:
            for initial_url in self.initial_urls: # pylint: disable=not-an-iterable
                self.add_task(Task('initial', url=initial_url))

        self._task_generator_list = []
        # Drive the task generator from a daemon thread so it cannot
        # block interpreter shutdown.
        worker = TaskGeneratorWrapperThread(self.task_generator(), self)
        worker.daemon = True
        worker.start()
        self._task_generator_list.append(worker)
Exemple #9
0
    def start_task_generators(self):
        """
        Process `self.initial_urls` list and `self.task_generator`
        method.
        """

        logger_verbose.debug('Processing initial urls')
        # Queue one 'initial' task per configured start URL (if any).
        for initial_url in (self.initial_urls or ()):
            self.add_task(Task('initial', url=initial_url))

        self._task_generator_list = []
        # Run the generator through the wrapper in a daemon thread so it
        # cannot block interpreter shutdown.
        worker = Thread(target=self.task_generator_thread_wrapper,
                        args=[self.task_generator()])
        worker.daemon = True
        worker.start()
        self._task_generator_list.append(worker)
Exemple #10
0
    def handler(self,
                collection,
                obj,
                set_field,
                base_dir,
                task_args=None,
                grab_args=None,
                callback=None):
        """For every image in ``obj[set_field]``: refresh its stored path
        when the file is already on disk, otherwise yield a download task.

        The on-disk location is derived from each image URL via
        hashed_path; the positional '$' update targets the matching
        array element in the MongoDB document.
        """
        from database import db

        for image in obj.get(set_field, []):
            path = hashed_path(image['url'], base_dir=base_dir)
            if not os.path.exists(path):
                # Image not downloaded yet: build a grab and yield a task.
                extra = deepcopy(task_args) if task_args else {}

                g = Grab()
                g.setup(url=image['url'])
                g.setup(referer=build_image_hosting_referer(image['url']))
                if grab_args:
                    g.setup(**grab_args)

                yield Task(callback=callback or image_set_handler,
                           grab=g,
                           collection=collection,
                           path=path,
                           obj=obj,
                           image=image,
                           set_field=set_field,
                           disable_cache=True,
                           backup=g.dump_config(),
                           **extra)
            elif path != image['path']:
                # File exists but the stored path is stale: fix it in place.
                db[collection].update(
                    {'_id': obj['_id'],
                     ('%s.url' % set_field): image['url']},
                    {'$set': {('%s.$.path' % set_field): path}})
Exemple #11
0
    def handler(self,
                url,
                collection,
                obj,
                path_field,
                base_dir,
                task_args=None,
                grab_args=None,
                callback=None):
        """Refresh the stored image path when the file is already on disk,
        otherwise yield a download task for it.

        The on-disk location is derived from the image URL via hashed_path.
        """
        from database import db
        path = hashed_path(url, base_dir=base_dir)
        if not os.path.exists(path):
            # Image not downloaded yet: build a grab and yield a task.
            extra = deepcopy(task_args) if task_args else {}

            g = Grab()
            g.setup(url=url)
            g.setup(referer=build_image_hosting_referer(url))
            if grab_args:
                g.setup(**grab_args)

            yield Task(callback=callback or image_handler,
                       grab=g,
                       collection=collection,
                       path=path,
                       obj=obj,
                       path_field=path_field,
                       disable_cache=True,
                       backup=g.dump_config(),
                       **extra)
        elif path != obj.get(path_field, None):
            # File exists but the stored path is stale: fix it on the doc.
            db[collection].update({'_id': obj['_id']},
                                  {'$set': {path_field: path}})
Exemple #12
0
 def process_initial_urls(self):
     """Queue an 'initial' task for every URL in ``self.initial_urls``."""
     # A missing/empty initial_urls attribute means nothing to queue.
     for initial_url in (self.initial_urls or ()):
         self.add_task(Task('initial', url=initial_url))
Exemple #13
0
 def task_download_captcha(self, grab, task):
     """Hand the downloaded captcha image to the solver backend and yield
     the submit request as a 'submit_captcha' task."""
     logger.debug('Got captcha image')
     submit_grab = self.solver.backend.get_submit_captcha_request(
         grab.response.body)
     yield Task('submit_captcha', grab=submit_grab, meta=task.meta)
Exemple #14
0
 def task_submit_captcha(self, grab, task):
     """Parse the captcha id from the submit response and schedule the
     first solution check (fixed 5-second delay)."""
     backend = self.solver.backend
     captcha_id = backend.parse_submit_captcha_response(grab.response)
     check_grab = backend.get_check_solution_request(captcha_id)
     yield Task('check_solution', grab=check_grab, delay=5, meta=task.meta)
 def task_download_captcha(self, grab, task):
     """Build the submit request for the captured captcha image and yield
     it as a 'submit_captcha' task."""
     logger.debug('Got captcha image')
     request_data = (self.solver.captcha_backend
                     .get_submit_captcha_request_data(grab.response.body))
     submit_grab = self.solver.network_backend.make_grab_instance(
         **request_data)
     yield Task('submit_captcha', grab=submit_grab, meta=task.meta)