Esempio n. 1
0
    def get_html(self, url: str, render_js: bool = True) -> Union[HTML, None]:
        """Fetch a URL through ``self.session`` and return its parsed HTML.

        :param url: the URL to request
        :type url: str
        :param render_js: when true, ``render()`` is invoked on the response's
            HTML object before it is returned, defaults to True
        :type render_js: bool, optional
        :return: the ``html`` attribute of the response, or ``None`` when the
            request raised ``requests.RequestException``, was interrupted, or
            the response is not ``ok``
        :rtype: Union[HTML, None]
        """
        try:
            # TODO: load config for timeout
            page: HTMLResponse = self.session.get(url, timeout=30)
        except requests.RequestException:
            # Request-level failures are reported and treated as "no page".
            failure_message = '[Worker] Cannot get this url: ' + url
            logger.warning(failure_message)
            return None
        except (KeyboardInterrupt, SystemExit, InterruptedError):
            # An interruption stops this worker instead of propagating.
            self.stop()
            return None

        # Guard clause: a non-OK response yields nothing.
        if not page.ok:
            return None

        if render_js:
            logger.debug('starting render js...')
            page.html.render(wait=1.5, timeout=10.0)
            logger.debug('end render js...')

        return page.html
Esempio n. 2
0
    def _get_html_no_js(self, url: str) -> Union[PyQuery, None]:
        """Fetch a URL through ``self.requests_session`` and wrap the body in PyQuery.

        :param url: the URL to request
        :type url: str
        :return: a ``PyQuery`` built from the response text, or ``None`` when
            the request raised ``requests.RequestException``, was interrupted,
            or the response is not ``ok``
        :rtype: Union[PyQuery, None]
        """
        try:
            # TODO: load config for timeout
            reply: Response = self.requests_session.get(
                url,
                timeout=DEFAULT_TIMEOUT_SECONDS,
            )
        except requests.RequestException:
            # Request-level failures are reported and treated as "no page".
            failure_message = '[Worker] Cannot get this url: ' + url
            logger.warning(failure_message)
            return None
        except (KeyboardInterrupt, SystemExit, InterruptedError):
            # An interruption stops this worker instead of propagating.
            self.stop()
            return None

        # Guard clause: log and bail out on a non-OK response.
        if not reply.ok:
            logger.debug(f'Request for {url} failed, status code: {reply.status_code}')
            return None

        return PyQuery(reply.text)
Esempio n. 3
0
def fetch_ips(q: Queue, validator_queue: Queue):
    """Pull providers from ``q`` forever and feed their proxies to ``validator_queue``.

    For each provider taken from ``q``, every URL from ``provider.urls()`` is
    fetched with a ``Worker``; when the fetch yields something truthy it is
    passed to ``provider.parse`` and each resulting proxy is put on
    ``validator_queue``.

    The loop ends on ``KeyboardInterrupt``, ``InterruptedError`` or
    ``SystemExit``. A ``pyppeteer.errors.PyppeteerError`` is only logged; any
    other ``Exception`` replaces the worker with a fresh one and is logged as
    a warning. In both cases the loop moves on to the next provider.

    :param q: queue that provider objects are read from
    :param validator_queue: queue that parsed proxies are written to
    """
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            source: BaseProvider = q.get()
            source_name = source.__class__.__name__
            logger.debug(
                f'Get a provider from the provider queue: {source_name}')

            for page_url in source.urls():
                page = worker.get_html(
                    page_url, render_js=source.should_render_js())

                # Nothing fetched for this URL: move on to the next one.
                if not page:
                    continue

                candidates = source.parse(page)
                for candidate in candidates:
                    validator_queue.put(candidate)
                    logger.debug(
                        f'Put new proxy ip into queue: {str(candidate)}')

                logger.info(
                    f' {source_name}: feed {len(candidates)} '
                    'potential proxies into the validator queue')
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as err:
            logger.debug(
                f'pyppeteer.errors.PyppeteerError detected: {err}\n'
                'Please make sure you have installed all the dependencies '
                'for chromium correctly')
        except Exception as err:
            worker = Worker()  # reset worker
            logger.warning(f'Unhandled exception is detected: {err}')