def get_html(self, url: str, render_js: bool = True) -> Union[HTML, None]:
    """Fetch a URL through ``self.session`` and return the response's HTML.

    :param url: the URL to request
    :param render_js: whether to call ``render`` on the response's HTML
        before returning it, defaults to True
    :return: ``response.html`` when the response is OK; None when the request
        raises ``requests.RequestException``, when it is interrupted
        (``self.stop()`` is called first in that case), or when the response
        status is not OK
    :rtype: Union[HTML, None]
    """
    try:
        # TODO: load config for timeout
        page: HTMLResponse = self.session.get(url, timeout=30)
    except requests.RequestException:
        logger.warning('[Worker] Cannot get this url: ' + url)
        return None
    except (KeyboardInterrupt, SystemExit, InterruptedError):
        # An interrupt during the request stops this worker instead of propagating.
        self.stop()
        return None

    # Guard clause: nothing to parse or render for a failed response.
    if not page.ok:
        return None

    if render_js:
        logger.debug('starting render js...')
        page.html.render(wait=1.5, timeout=10.0)
        logger.debug('end render js...')

    return page.html
def _get_html_no_js(self, url: str) -> Union[PyQuery, None]:
    """Fetch a URL through ``self.requests_session`` and wrap the body in PyQuery.

    :param url: the URL to request
    :return: ``PyQuery(response.text)`` when the response is OK; None when the
        request raises ``requests.RequestException``, when it is interrupted
        (``self.stop()`` is called first in that case), or when the response
        status is not OK
    :rtype: Union[PyQuery, None]
    """
    try:
        # TODO: load config for timeout
        resp: Response = self.requests_session.get(url, timeout=DEFAULT_TIMEOUT_SECONDS)
    except requests.RequestException:
        logger.warning('[Worker] Cannot get this url: ' + url)
        return None
    except (KeyboardInterrupt, SystemExit, InterruptedError):
        # An interrupt during the request stops this worker instead of propagating.
        self.stop()
        return None

    # Guard clause: report the failing status and bail out before parsing.
    if not resp.ok:
        logger.debug(f'Request for {url} failed, status code: {resp.status_code}')
        return None

    return PyQuery(resp.text)
def _stop_worker_quietly(worker) -> None:
    """Call ``worker.stop()`` and log, rather than propagate, an ordinary failure."""
    try:
        worker.stop()
    except Exception as e:
        # Best effort only: a failing teardown must not mask the error that
        # triggered it, nor prevent the loop from exiting or recovering.
        logger.debug('Failed to stop worker: {}'.format(e))


def fetch_ips(q: Queue, validator_queue: Queue):
    """Consume providers from ``q`` and feed their parsed proxies to ``validator_queue``.

    Runs until a KeyboardInterrupt, InterruptedError or SystemExit reaches the
    loop. For each provider taken from ``q``, every URL from ``provider.urls()``
    is fetched with ``Worker.get_html``; when a page is returned, the result of
    ``provider.parse(html)`` is put item by item onto ``validator_queue``.

    The current worker is stopped before it is discarded, both on shutdown and
    when an unhandled exception forces a fresh ``Worker`` to be created, so
    that abandoned workers do not accumulate.
    NOTE(review): assumes ``Worker.stop()`` releases whatever the worker holds
    (e.g. its sessions) -- confirm against the Worker class.

    :param q: queue that yields provider instances (blocking ``get``)
    :param validator_queue: queue that receives each parsed proxy
    """
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)

            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())
                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug('Put new proxy ip into queue: {}'.format(
                            p.__str__()))
                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            # Stop the worker before leaving so it is not left dangling.
            _stop_worker_quietly(worker)
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug(
                'pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                'Please make sure you have installed all the dependencies for chromium correctly'
            )
        except Exception as e:
            # Stop the old worker before replacing it; previously it was
            # simply dropped, leaking one worker per unhandled error.
            _stop_worker_quietly(worker)
            worker = Worker()  # reset worker
            logger.warning('Unhandled exception is detected: {}'.format(e))