def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()
    while True:
        try:
            # feed_providers() puts provider *instances* on the queue,
            # so no extra call is needed here
            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)
            for url in provider.urls():
                try:
                    html = worker.get_html(url, render_js=provider.should_render_js())
                except Exception as e:
                    logger.error('worker.get_html failed: %s', e)
                    continue
                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            logger.info('worker_process exited.')
            break
def cron_schedule(scheduler, only_once=False):
    """
    :param scheduler: the Scheduler instance
    :param only_once: flag for testing
    """

    def feed():
        scheduler.feed_providers()

    # feed providers at the very beginning
    scheduler.feed_providers()

    schedule.every(10).minutes.do(feed)

    logger.info('Start python scheduler')

    flag = True
    while flag:
        try:
            schedule.run_pending()
            if only_once:
                flag = False
            else:
                time.sleep(60)
        except (KeyboardInterrupt, InterruptedError):
            logger.info('Stopping python scheduler')
            break
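# For reference, the scheduling above relies on the third-party `schedule`
# library: jobs are registered with `schedule.every(...).do(...)` and only
# execute when `schedule.run_pending()` is polled. A minimal self-contained
# sketch of that pattern (the job body is illustrative, not from Scylla):
import time

import schedule


def job():
    print('tick')


schedule.every(10).minutes.do(job)  # register the job; nothing runs yet

while True:
    schedule.run_pending()  # run whichever registered jobs are due
    time.sleep(60)          # polling granularity, matching cron_schedule above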
def send(sender, recver, msg):
    # Pump bytes from one socket to the other until the sending side closes.
    while True:
        data = sender.recv(2048)
        if not data:
            break
        recver.sendall(data)
    logger.info('close conn {}'.format(msg))
    sender.close()
    recver.close()
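# A minimal local demonstration of how two `send` threads form a full-duplex
# relay. The socketpair() endpoints stand in for the client and upstream
# sockets; only `send` itself comes from the code above.
import socket
import threading

client_end, relay_client_side = socket.socketpair()
relay_upstream_side, upstream_end = socket.socketpair()

threading.Thread(target=send, args=(relay_client_side, relay_upstream_side,
                                    'client -> upstream'), daemon=True).start()
threading.Thread(target=send, args=(relay_upstream_side, relay_client_side,
                                    'upstream -> client'), daemon=True).start()

client_end.sendall(b'ping')
print(upstream_end.recv(2048))  # b'ping' relayed to the upstream end
upstream_end.sendall(b'pong')
print(client_end.recv(2048))    # b'pong' relayed back to the client end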
def feed_from_db():
    # TODO: better query (order by attempts)
    proxies = ProxyIP.select().where(
        ProxyIP.updated_at > datetime.now() - timedelta(days=14))

    for p in proxies:
        scheduler.validator_queue.put(p)

    logger.info('Feed {} proxies from the database for a second round of validation'
                .format(len(proxies)))
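# One possible shape for the TODO above, assuming the ProxyIP model exposes
# an `attempts` column as the comment implies: validate the least-tried
# proxies first. The limit is illustrative, not part of the original query.
proxies = (ProxyIP
           .select()
           .where(ProxyIP.updated_at > datetime.now() - timedelta(days=14))
           .order_by(ProxyIP.attempts.asc())
           .limit(500))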
def start_forward_proxy_server():
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    try:
        # socket.bind() returns None and raises OSError on failure,
        # so it cannot be used as a truthiness check
        server.bind(('0.0.0.0', 8081))
    except OSError:
        logger.error('cannot listen on {}:{}'.format('0.0.0.0', 8081))
        return
    server.listen(50)
    logger.info('start listener at {}:{}'.format('0.0.0.0', 8081))
    while True:
        conn, addr = server.accept()
        logger.info('get connection from {}'.format(addr))
        threading.Thread(target=proxy, args=(conn,)).start()
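# A quick smoke test for the listener above: open a TCP connection and send a
# plain proxy-style request. Host, port, and the request line are illustrative.
import socket

with socket.create_connection(('127.0.0.1', 8081), timeout=5) as conn:
    conn.sendall(b'GET http://example.com/ HTTP/1.1\r\nHost: example.com\r\n\r\n')
    print(conn.recv(2048))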
def cron_schedule(scheduler, only_once=False):
    """
    :param scheduler: the Scheduler instance
    :param only_once: flag for testing
    """

    def feed():
        scheduler.feed_providers()

    def feed_from_db():
        # TODO: better query (order by attempts)
        proxies = ProxyIP.select().where(
            ProxyIP.updated_at > datetime.now() - timedelta(days=14))

        for p in proxies:
            scheduler.validator_queue.put(p)

        logger.debug('Feed {} proxies from the database for a second round of validation'
                     .format(len(proxies)))

    # feed providers at the very beginning
    scheduler.feed_providers()

    schedule.every(10).minutes.do(feed)
    schedule.every(FEED_FROM_DB_INTERVAL_MINUTES).minutes.do(feed_from_db)

    logger.info('Start python scheduler')

    flag = True

    # run feed_from_db() for the first time after 1 minute
    # (or after 1 second in test mode)
    wait_time_for_feed_from_db = 1 if only_once else 60
    time.sleep(wait_time_for_feed_from_db)
    feed_from_db()

    while flag:
        try:
            schedule.run_pending()
            if only_once:
                flag = False
            else:
                time.sleep(60)
        except (KeyboardInterrupt, InterruptedError):
            logger.info('Stopping python scheduler')
            break
def start(self):
    """
    Start the scheduler with a worker process (fetching candidate proxies
    from different providers) and a validator thread (checking whether the
    fetched proxies are usable).
    """
    logger.info('Scheduler starts...')
    self.cron_thread = Thread(target=cron_schedule, args=(self,), daemon=True)
    self.worker_process = Process(target=fetch_ips,
                                  args=(self.worker_queue, self.validator_queue),
                                  daemon=True)
    self.validator_thread = Thread(target=validate_ips,
                                   args=(self.validator_queue, self.validator_pool),
                                   daemon=True)

    self.cron_thread.start()
    # daemonized, so the interpreter will not block waiting for it at exit
    self.worker_process.start()
    logger.info('worker_process started')
    self.validator_thread.start()
    logger.info('validator_thread started')
def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()
    while True:
        try:
            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + provider_name)
            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())
                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug('Put new proxy ip into queue: {}'.format(p))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug('pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                         'Please make sure you have installed all the dependencies for chromium correctly')
        except Exception as e:
            worker = Worker()  # reset worker
            logger.warning('Unhandled exception is detected: {}'.format(e))
def proxy(client):
    retry_time = 1
    while retry_time < 10:
        logger.info('start request, attempt: {}'.format(retry_time))
        server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        remote_proxy = get_proxy()
        logger.info('use proxy {}:{}'.format(remote_proxy.ip, remote_proxy.port))
        # connect_ex() returns 0 on success, an errno value otherwise
        if not server.connect_ex((remote_proxy.ip, remote_proxy.port)):
            logger.info('connect success')
            threading.Thread(target=send, args=(client, server, 'send to remote')).start()
            threading.Thread(target=send, args=(server, client, 'recv from remote')).start()
            break
        else:
            retry_time += 1
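# A hypothetical stand-in for get_proxy() when exercising proxy() locally;
# the real implementation picks a validated row from the ProxyIP table. The
# fields mirror only the attributes used above (ip, port).
from collections import namedtuple

FakeProxy = namedtuple('FakeProxy', ['ip', 'port'])


def get_proxy():
    return FakeProxy(ip='127.0.0.1', port=3128)  # point at any reachable proxy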
def fetch_ips(q: Queue, validator_queue: Queue, run_once=False):
    logger.debug('worker_process started.')
    logger.info('fetching ips...')
    worker = Worker()
    while True:
        try:
            if run_once and q.empty():
                raise SystemExit

            provider: BaseProvider = q.get()
            provider_name = provider.__class__.__name__
            logger.info('Get a provider from the provider queue: ' + provider_name)
            for url in provider.urls():
                html = worker.get_html(url, render_js=provider.should_render_js())
                if html:
                    proxies = provider.parse(html)
                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p))
                    logger.info(' {}: feed {} potential proxies into the validator queue'
                                .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.error('pyppeteer.errors.PyppeteerError detected: %s\n'
                         'Please make sure you have installed all the dependencies for chromium correctly', e)
            break
    logger.debug('worker_process exited.')
def main(args) -> int:
    parser = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--no-webserver', '-no-ws', action='store_true',
                        help='Prevent starting a web server for JSON API')
    parser.add_argument('--web-port', '-wp', type=int, default=8899,
                        help='The port number for the web server')
    parser.add_argument('--web-host', '-wh', type=str, default='0.0.0.0',
                        help='The hostname for the web server')
    parser.add_argument('--skip-scheduler', action='store_true',
                        help='Prevent the scheduler from crawling')
    parser.add_argument('--version', '-v', action='store_true',
                        help='Print the version of Scylla')
    parser.add_argument('--db-path', type=str, default='./scylla.db',
                        help='The sqlite database file location')

    parsed_args = parser.parse_args(args)
    parsed_args_dict = vars(parsed_args)

    batch_set_config(**parsed_args_dict)
    handle_special_flags(parsed_args_dict)

    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server

    create_db_tables()

    s = Scheduler()

    try:
        if not get_config('skip_scheduler'):
            s.start()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start the web server')
            start_web_server(host=parsed_args_dict['web_host'],
                             port=parsed_args_dict['web_port'])

        s.join()
    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        s.stop()
        return 0

    return 0
def start_forward_proxy_server():
    app = make_app()
    port = int(get_config('proxy_port', default='8081'))
    app.listen(port)
    logger.info('Start forward proxy server on port {}'.format(port))
    ioloop.IOLoop.current().start()
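# What make_app() plausibly returns: a tornado web.Application. This sketch
# uses a placeholder handler; the real app wires up the forward-proxy request
# handlers. The /ping route is an assumption for illustration only.
from tornado import ioloop, web


class PingHandler(web.RequestHandler):
    def get(self):
        self.write('pong')


def make_app():
    return web.Application([(r'/ping', PingHandler)])


if __name__ == '__main__':
    make_app().listen(8081)
    ioloop.IOLoop.current().start()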
def main(args) -> int:
    parser = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--no-webserver', '-no-ws', action='store_true',
                        help='Prevent starting a web server for JSON API')
    parser.add_argument('--web-port', '-wp', type=int, default=8899,
                        help='The port number for the web server')
    parser.add_argument('--web-host', '-wh', type=str, default='0.0.0.0',
                        help='The hostname for the web server')
    parser.add_argument('--skip-scheduler', action='store_true',
                        help='Prevent the scheduler from crawling')
    parser.add_argument('--scheduler-run-once', action='store_true',
                        help='Run all tasks in scheduler only once')
    parser.add_argument('--version', '-v', action='store_true',
                        help='Print the version of Scylla')
    parser.add_argument('--db-path', type=str, default='./scylla.db',
                        help='The sqlite database file location')
    parser.add_argument('--validation-pool', type=int, default=31,
                        help='The validation pool size (i.e. the limit of concurrent validation tasks for proxies)')
    parser.add_argument('--no-forward-proxy-server', action='store_true',
                        help='Disable the forward proxy server')
    parser.add_argument('--proxy-port', '-pp', type=int, default=8081,
                        help='The port number for the forward proxy')
    parser.add_argument('--chrome-path', '-cp', type=str, default=None,
                        help='Path of chrome/chromium to be used to render js')

    parsed_args = parser.parse_args(args)
    parsed_args_dict = vars(parsed_args)

    batch_set_config(**parsed_args_dict)
    handle_special_flags(parsed_args_dict)

    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server_non_blocking
    from scylla.proxy import start_forward_proxy_server_non_blocking

    create_db_tables()

    s = Scheduler()
    p_web, p_proxy = None, None

    try:
        # scheduler
        if not get_config('skip_scheduler'):
            run_once = bool(get_config('scheduler_run_once'))
            logger.info('Start scheduler, run_once=%s' % run_once)
            s.start(run_once)

        # forward proxy server
        if not get_config('no_forward_proxy_server'):
            logger.info('Start forward proxy server')
            p_proxy = start_forward_proxy_server_non_blocking()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start web server')
            p_web = start_web_server_non_blocking(workers=1)

        # exit
        if s.is_alive():
            s.join()
            logger.info('scheduler done.')
        if p_web:
            p_web.join()
        if p_proxy:
            p_proxy.join()
    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        s.stop()
        sys.exit(0)

    logger.info('scylla exiting...')
    return 0
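# Typical entry-point wiring for main(); the invocation in the comment is just
# an example of passing the flags defined above.
import sys

if __name__ == '__main__':
    # e.g. python -m scylla --web-port 8899 --proxy-port 8081 --scheduler-run-once
    sys.exit(main(sys.argv[1:]))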