def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()
    while True:
        try:
            # the provider queue holds provider classes/factories; call the item to get an instance
            provider: BaseProvider = q.get()()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)
            for url in provider.urls():
                try:
                    html = worker.get_html(url, render_js=provider.should_render_js())
                except Exception as e:
                    logger.error("worker.get_html failed: %s", e)
                    continue

                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            logger.info('worker_process exited.')
            break
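The loop above relies on only three provider methods: urls(), should_render_js() and parse(). A hypothetical provider satisfying that contract might look like the sketch below; the class name, URL and return values are illustrative assumptions, not one of the project's real providers.

class ExampleProvider:
    """Hypothetical provider, used only to illustrate the interface fetch_ips() calls.
    The real providers subclass BaseProvider."""

    def urls(self) -> list:
        # pages to scrape; a single made-up listing page here
        return ['https://example.com/free-proxy-list']

    def should_render_js(self) -> bool:
        # this fictional page is plain HTML, so no headless rendering is needed
        return False

    def parse(self, html) -> list:
        # the real providers extract ip:port pairs from `html` and return proxy objects;
        # returning an empty list keeps this sketch self-contained
        return []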
def forward(self, host=None, port=None):
    try:
        url = self.request.uri
        body = self.request.body
        if not body:
            body = None
        # callback-style fetch(); Tornado 6+ drops the callback argument in favour of Futures
        httpclient.AsyncHTTPClient().fetch(
            httpclient.HTTPRequest(
                url=url,
                method=self.request.method,
                body=body,
                headers=self.request.headers,
                follow_redirects=False,
                validate_cert=False,
                proxy_host=host,
                proxy_port=port),
            self.handle_response)
    except httpclient.HTTPError as e:
        logger.debug("tornado signalled HTTPError {}".format(e))
        self.set_status(500)
        self.finish()
    except:
        self.set_status(500)
        self.write("Internal server error:\n" +
                   ''.join(traceback.format_exception(*sys.exc_info())))
        self.finish()
def get_html(self, url: str, render_js: bool = True) -> Union[HTML, None]:
    """Get html from a specific URL

    :param url: the URL
    :param render_js: whether to render js, defaults to True
    :type render_js: bool, optional
    :return: the page HTML, or None if the request failed
    :rtype: Union[HTML, None]
    """
    try:
        # TODO: load config for timeout
        response: HTMLResponse = self.session.get(url, timeout=30)
    except requests.RequestException:
        logger.warning('[Worker] Cannot get this url: ' + url)
        return None
    except (KeyboardInterrupt, SystemExit, InterruptedError):
        self.stop()
        return None

    if response.ok:
        if render_js:
            logger.debug('starting render js...')
            response.html.render(wait=1.5, timeout=10.0)
            logger.debug('end render js...')
        return response.html
    else:
        return None
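For context, a minimal standalone sketch of the requests-html calls this worker wraps (session creation, fetch, optional JS rendering); the target URL and selector are placeholders, and whether the real Worker exposes its session this way is not shown in the snippet.

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com', timeout=30)
if r.ok:
    # render() executes the page's JavaScript in a headless Chromium before parsing
    r.html.render(wait=1.5, timeout=10.0)
    title = r.html.find('title', first=True)
    print(title.text if title else 'no <title> found')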
def feed_from_db():
    # TODO: better query (order by attempts)
    proxies = ProxyIP.select().where(
        ProxyIP.updated_at > datetime.now() - timedelta(days=14))

    for p in proxies:
        scheduler.validator_queue.put(p)

    logger.debug('Feed {} proxies from the database for a second time validation'.format(len(proxies)))
async def api_v1_proxies(request: Request):
    args = request.raw_args

    limit = 20
    page = 1
    is_anonymous = 2  # 0: no, 1: yes, 2: any

    if 'limit' in args:
        int_limit = _parse_str_to_int(args['limit'])
        limit = int_limit if int_limit else 20

    if 'page' in args:
        int_page = _parse_str_to_int(args['page'])
        # guard against None before comparing
        page = int_page if int_page and int_page > 0 else 1

    if 'anonymous' in args:
        str_anonymous = args['anonymous']
        if str_anonymous == 'true':
            is_anonymous = 1
        elif str_anonymous == 'false':
            is_anonymous = 0
        else:
            is_anonymous = 2

    proxy_initial_query = _get_valid_proxies_query()
    proxy_query = proxy_initial_query

    if is_anonymous != 2:
        if is_anonymous == 1:
            proxy_query = proxy_initial_query.where(ProxyIP.is_anonymous == True)
        elif is_anonymous == 0:
            proxy_query = proxy_initial_query.where(ProxyIP.is_anonymous == False)

    proxies = proxy_query.order_by(ProxyIP.updated_at.desc(), ProxyIP.latency) \
        .offset((page - 1) * limit).limit(limit)
    count = proxy_initial_query.count()

    logger.debug('Perform SQL query: {}'.format(proxy_query.sql()))

    proxy_list = []
    for p in proxies:
        proxy_list.append(model_to_dict(p))

    return json({
        'proxies': proxy_list,
        'count': count,
        'per_page': limit,
        'page': page,
        'total_page': math.ceil(count / limit),
    })
def handle_response(self, response: HTTPResponse):
    if response.body:
        self.write(response.body)
        self.finish()
    elif response.error:
        logger.debug('The forward proxy has an error: {}'.format(response.error))
        self.finish()
    else:
        self.finish()
def create_connection() -> SqliteDatabase:
    """
    create a database connection
    :rtype: SqliteDatabase
    """
    global _db
    if _db:
        return _db
    else:
        logger.debug('create new db connection')
        _db = SqliteDatabase(get_config('db_path', './scylla.db'))
        return _db
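A self-contained sketch of how a singleton connection like this is typically consumed with peewee: models bind to the shared SqliteDatabase via their Meta class. The model, fields and database path below are assumptions for illustration, not the project's real schema.

from peewee import SqliteDatabase, Model, CharField, IntegerField

_db = None

def create_connection() -> SqliteDatabase:
    # same lazy-singleton shape as above, minus the project's get_config() helper
    global _db
    if _db is None:
        _db = SqliteDatabase('./example.db')
    return _db

class ExampleProxy(Model):
    # hypothetical table, stand-in for the project's ProxyIP model
    ip = CharField()
    port = IntegerField()

    class Meta:
        database = create_connection()

if __name__ == '__main__':
    db = create_connection()
    db.connect(reuse_if_open=True)
    db.create_tables([ExampleProxy])
    ExampleProxy.create(ip='127.0.0.1', port=8080)
    print(ExampleProxy.select().count())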
def _get_html_js(self, url: str) -> Union[PyQuery, None]:
    page = self.browser.new_page()
    # note: Playwright interprets goto() timeouts in milliseconds
    response = page.goto(url=url, timeout=DEFAULT_TIMEOUT_SECONDS, wait_until='domcontentloaded')

    if not response:
        logger.debug(f'Request for {url} failed because response is None')
        return None

    if response.ok:
        doc = PyQuery(page.content())
        return doc
    else:
        logger.debug(f'Request for {url} failed, status code: {response.status}')
        return None
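The snippet assumes self.browser is an already-launched headless browser. A minimal sketch of how such a browser could be created with Playwright's sync API follows; whether the project launches it exactly this way is an assumption.

from playwright.sync_api import sync_playwright
from pyquery import PyQuery

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    # Playwright timeouts are in milliseconds
    response = page.goto('https://example.com', timeout=30_000, wait_until='domcontentloaded')
    if response and response.ok:
        doc = PyQuery(page.content())
        print(doc('title').text())
    browser.close()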
def _get_html_no_js(self, url: str) -> Union[PyQuery, None]:
    try:
        # TODO: load config for timeout
        response: Response = self.requests_session.get(url, timeout=DEFAULT_TIMEOUT_SECONDS)
    except requests.RequestException:
        logger.warning('[Worker] Cannot get this url: ' + url)
        return None
    except (KeyboardInterrupt, SystemExit, InterruptedError):
        self.stop()
        return None

    if response.ok:
        doc = PyQuery(response.text)
        return doc
    else:
        logger.debug(f'Request for {url} failed, status code: {response.status_code}')
        return None
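Both worker paths hand back a PyQuery document for the providers to parse. A hypothetical example of that parsing step is below; the table layout (ip and port in the first two cells) is an assumption for illustration.

from pyquery import PyQuery

def parse_ip_port(doc: PyQuery) -> list:
    # walk a hypothetical <table> of proxies and collect (ip, port) tuples
    pairs = []
    for tr in doc('table tbody tr').items():
        ip = tr('td').eq(0).text().strip()
        port = tr('td').eq(1).text().strip()
        if ip and port.isdigit():
            pairs.append((ip, int(port)))
    return pairs

doc = PyQuery('<html><body><table><tbody>'
              '<tr><td>1.2.3.4</td><td>8080</td></tr>'
              '</tbody></table></body></html>')
print(parse_ip_port(doc))  # [('1.2.3.4', 8080)]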
def cron_schedule(scheduler, run_once=False):
    """
    :param scheduler: the Scheduler instance
    :param run_once: flag for testing
    """

    def feed():
        scheduler.feed_providers()

    def feed_from_db():
        # TODO: better query (order by attempts)
        proxies = ProxyIP.select().where(
            ProxyIP.updated_at > datetime.now() - timedelta(days=14))

        for p in proxies:
            scheduler.validator_queue.put(p)

        logger.info('Feed {} proxies from the database for a second time validation'.format(len(proxies)))

    # feed providers at the very beginning
    scheduler.feed_providers()

    schedule.every(10).minutes.do(feed)
    schedule.every(FEED_FROM_DB_INTERVAL_MINUTES).minutes.do(feed_from_db)

    logger.debug('cron_thread started.')

    # After 1 minute, try feed_from_db() for the first time
    wait_time_for_feed_from_db = 1 if run_once else 60
    time.sleep(wait_time_for_feed_from_db)
    feed_from_db()

    while True:
        try:
            schedule.run_pending()

            if run_once:
                raise SystemExit
            else:
                time.sleep(60)
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            break

    logger.debug('cron_thread exited.')
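A stripped-down, runnable sketch of the scheduling pattern above using the schedule library on its own: register periodic jobs, then poll run_pending() in a loop. The job body and the run_once shortcut are placeholders.

import time
import schedule

def feed():
    print('feeding providers...')

schedule.every(10).minutes.do(feed)

def cron_loop(run_once: bool = False):
    while True:
        # run any jobs whose interval has elapsed, then sleep until the next poll
        schedule.run_pending()
        if run_once:
            break
        time.sleep(60)

cron_loop(run_once=True)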
def get_proxy_and_forward(self):
    https = False

    if self.request.uri.startswith('https'):
        https = True

    disable_forward_proxy = get_config('disable_forward_proxy', default=False)

    if disable_forward_proxy:
        self.forward()
        logger.debug('proxy get_proxy_and_forward option %s', disable_forward_proxy)
    else:
        proxy = get_proxy(https=https)
        self.forward(host=proxy.ip, port=proxy.port)
        logger.debug('proxy get_proxy_and_forward option %s %s %s',
                     disable_forward_proxy, proxy.ip, proxy.port)
def fetch_ips(q: Queue, validator_queue: Queue, run_once=False):
    logger.debug('worker_process started.')
    logger.info('fetching ips...')
    worker = Worker()

    while True:
        try:
            if run_once and q.empty():
                raise SystemExit  # handled below: stop the worker and exit the loop

            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.info('Get a provider from the provider queue: ' + provider_name)

            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.error('pyppeteer.errors.PyppeteerError detected: %s\n'
                         'Please make sure you have installed all the dependencies for chromium correctly', e)
            break

    logger.debug('worker_process exited.')
def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)

            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug('pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                         'Please make sure you have installed all the dependencies for chromium correctly')
        except Exception as e:
            worker = Worker()  # reset worker
            logger.warning('Unhandled exception is detected: {}'.format(e))
def validate_ips(validator_queue: Queue, validator_pool: ThreadPoolExecutor, run_once=False):
    logger.debug('validator_thread started.')

    while True:
        try:
            # wait 5 minutes for the next proxy ip in run-once mode
            proxy: ProxyIP = validator_queue.get(timeout=300 if run_once else None)
            validator_pool.submit(validate_proxy_ip, p=proxy)
        except (KeyboardInterrupt, SystemExit):
            break
        except queue.Empty:
            logger.debug('validator_thread has timed out.')
            break

    logger.debug('validator_thread exited.')
    validator_pool.shutdown(wait=True)
    logger.debug('validator_pool exited.')
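The same queue-plus-thread-pool shape in a self-contained, stdlib-only sketch, with a made-up check function standing in for validate_proxy_ip:

import queue
from concurrent.futures import ThreadPoolExecutor

def check(item: str) -> None:
    # stand-in for the real validation work
    print('validating', item)

def dispatch(q: queue.Queue, pool: ThreadPoolExecutor, run_once: bool = False) -> None:
    while True:
        try:
            # block forever in daemon mode, or time out quickly when run_once is set
            item = q.get(timeout=1 if run_once else None)
            pool.submit(check, item)
        except queue.Empty:
            break
    pool.shutdown(wait=True)

q = queue.Queue()
for addr in ('1.2.3.4:8080', '5.6.7.8:3128'):
    q.put(addr)
dispatch(q, ThreadPoolExecutor(max_workers=2), run_once=True)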
def handle_response(self, response: HTTPResponse):
    if response.body:
        logger.debug('The forward proxy has body')
        self.write(response.body)
        self.set_status(200)
        self.finish()
    elif response.error:
        logger.debug('The forward proxy has an error: {}'.format(response.error))
        self.write('The forward proxy has an error: {}'.format(response.error))
        self.set_status(500)
        self.finish()
    else:
        logger.debug('The forward proxy empty body finish')
        self.finish()
async def api_v1_proxies(request: Request):
    args = request.raw_args

    limit = 20
    page = 1
    is_anonymous = 2  # 0: no, 1: yes, 2: any

    if 'limit' in args:
        int_limit = _parse_str_to_int(args['limit'])
        limit = int_limit if int_limit else 20

    if 'page' in args:
        int_page = _parse_str_to_int(args['page'])
        # guard against None before comparing
        page = int_page if int_page and int_page > 0 else 1

    if 'anonymous' in args:
        str_anonymous = args['anonymous']
        if str_anonymous == 'true':
            is_anonymous = 1
        elif str_anonymous == 'false':
            is_anonymous = 0
        else:
            is_anonymous = 2

    str_https = None
    if 'https' in args:
        str_https = args['https']

    country_list = []
    if 'countries' in args:
        countries = args['countries']
        country_list = countries.split(',')

    proxy_initial_query = _get_valid_proxies_query()
    proxy_query = proxy_initial_query

    if is_anonymous != 2:
        if is_anonymous == 1:
            proxy_query = proxy_query.where(ProxyIP.is_anonymous == True)
        elif is_anonymous == 0:
            proxy_query = proxy_query.where(ProxyIP.is_anonymous == False)

    if str_https:
        if str_https == 'true':
            proxy_query = proxy_query.where(ProxyIP.is_https == True)
        elif str_https == 'false':
            proxy_query = proxy_query.where(ProxyIP.is_https == False)

    if country_list and len(country_list) > 0:
        proxy_query = proxy_query.where(ProxyIP.country << country_list)

    count = proxy_query.count()  # count before sorting

    proxies = proxy_query.order_by(ProxyIP.updated_at.desc(), ProxyIP.latency) \
        .offset((page - 1) * limit).limit(limit)

    logger.debug('Perform SQL query: {}'.format(proxy_query.sql()))

    proxy_list = []
    for p in proxies:
        d = model_to_dict(p)
        if d['created_at']:
            d['created_at'] = d['created_at'].timestamp()
        if d['updated_at']:
            d['updated_at'] = d['updated_at'].timestamp()
        proxy_list.append(d)

    return json({
        'proxies': proxy_list,
        'count': count,
        'per_page': limit,
        'page': page,
        'total_page': math.ceil(count / limit),
    })
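A hypothetical client call against this handler; the route path and port are assumptions inferred from the handler name, not shown in this snippet.

import requests

resp = requests.get(
    'http://localhost:8899/api/v1/proxies',  # assumed host, port and route
    params={'limit': 10, 'page': 1, 'anonymous': 'true', 'https': 'true', 'countries': 'US,DE'},
    timeout=10,
)
data = resp.json()
print(data['count'], 'valid proxies,', data['total_page'], 'pages')
for proxy in data['proxies']:
    print(proxy['ip'], proxy['port'])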
def feed_providers(self):
    logger.debug('feed {} providers...'.format(len(all_providers)))

    for provider in all_providers:
        self.worker_queue.put(provider)