def __init__(self):
    """Create the work queues, the empty runner slots and the validator pool.

    The worker process and the validator/cron threads are not started
    here; their attributes are only initialised to ``None``. The pool size
    comes from the ``validation_pool`` config entry (default ``'31'``),
    coerced to ``int``.
    """
    # Queues for the worker side and the validator side respectively.
    self.worker_queue = Queue()
    self.validator_queue = Queue()

    # Placeholders; nothing is running yet.
    self.worker_process = self.validator_thread = self.cron_thread = None

    pool_size = int(get_config('validation_pool', default='31'))
    self.validator_pool = ThreadPoolExecutor(max_workers=pool_size)
def create_connection() -> SqliteDatabase:
    """Return the module-wide database handle, creating it on first use.

    The handle is cached in the module global ``_db``. When that global is
    falsy, a new ``SqliteDatabase`` is built from the ``db_path`` config
    entry (falling back to ``'./scylla.db'``) and stored for later calls.

    :rtype: SqliteDatabase
    """
    global _db

    if not _db:
        logger.debug('create new db connection')
        _db = SqliteDatabase(get_config('db_path', './scylla.db'))

    return _db
def get_proxy_and_forward(self):
    """Forward the current request, either directly or through a proxy.

    When the ``disable_forward_proxy`` config entry is truthy, the request
    is forwarded with no upstream host/port. Otherwise a proxy is obtained
    from ``get_proxy`` (asking for an HTTPS one when the request URI starts
    with ``'https'``) and its ``ip``/``port`` are passed to ``forward``.
    """
    wants_https = self.request.uri.startswith('https')

    if get_config('disable_forward_proxy', default=False):
        self.forward()
        return

    upstream = get_proxy(https=wants_https)
    self.forward(host=upstream.ip, port=upstream.port)
def main(args) -> int:
    """Parse the command line, store it as config and run the services.

    :param args: list of command-line argument strings (without the
        program name), handed to ``argparse``.
    :return: always ``0``, including after a ``KeyboardInterrupt`` or
        ``SystemExit`` raised while the services are running.
    """
    cli = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    cli.add_argument(
        '--no-webserver', '-no-ws', action='store_true',
        help='Prevent starting a web server for JSON API')
    cli.add_argument(
        '--web-port', '-wp', type=int, default=8899,
        help='The port number for the web server')
    cli.add_argument(
        '--web-host', '-wh', type=str, default='0.0.0.0',
        help='The hostname for the web server')
    cli.add_argument(
        '--skip-scheduler', action='store_true',
        help='Prevent the scheduler from crawling')
    cli.add_argument(
        '--version', '-v', action='store_true',
        help='Print the version of Scylla')
    cli.add_argument(
        '--db-path', type=str, default='./scylla.db',
        help='The sqlite database file location')

    # vars() exposes the namespace's own dict, so one lookup serves both
    # the config store and the special-flag handler.
    options = vars(cli.parse_args(args))
    batch_set_config(**options)
    handle_special_flags(options)

    # These imports run only after the configuration has been stored.
    # NOTE(review): presumably because the modules read config at import
    # time -- confirm before hoisting them to module level.
    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server

    create_db_tables()
    scheduler = Scheduler()

    try:
        if not get_config('skip_scheduler'):
            scheduler.start()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start the web server')
            start_web_server(host=options['web_host'],
                             port=options['web_port'])

        scheduler.join()
    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        scheduler.stop()

    # Normal completion and interrupted runs both report success.
    return 0
def test_config_default():
    """``get_config`` should hand back ``default`` for the key ``'empty'``.

    The test relies on ``'empty'`` not having been set beforehand.
    """
    assert 'baz' == get_config('empty', default='baz')
def test_config():
    """A value stored with ``set_config`` is readable through ``get_config``."""
    set_config('foo', 'bar')
    assert 'bar' == get_config('foo')
def __init__(self):
    """Initialize the worker object.

    Opens an ``HTMLSession`` whose ``chrome_path`` comes from the
    ``chrome_path`` config entry (``None`` when it is not configured).
    """
    configured_chrome = get_config('chrome_path', default=None)
    self.session = HTMLSession(chrome_path=configured_chrome)
def start_forward_proxy_server():
    """Build the proxy app, bind it to the configured port and run the loop.

    The port is read from the ``proxy_port`` config entry (default
    ``'8081'``) and coerced to ``int``. The call then starts the current
    ``IOLoop`` and only returns once ``start()`` does.
    """
    application = make_app()
    listen_port = int(get_config('proxy_port', default='8081'))
    application.listen(listen_port)

    startup_message = 'Start forward proxy server on port {}'.format(listen_port)
    logger.info(startup_message)

    event_loop = ioloop.IOLoop.current()
    event_loop.start()
def main(args) -> int:
    """Parse the command line, store it as config and run the services.

    Depending on the flags, this starts the scheduler, the forward proxy
    server and the JSON web server, then waits for whichever of them were
    actually started.

    :param args: list of command-line argument strings (without the
        program name), handed to ``argparse``.
    :return: ``0`` on normal completion.
    :raises SystemExit: with code ``0`` after a ``KeyboardInterrupt`` or
        ``SystemExit`` is caught while the services are running (the
        scheduler is stopped first).
    """
    parser = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--no-webserver', '-no-ws', action='store_true',
                        help='Prevent starting a web server for JSON API')
    parser.add_argument('--web-port', '-wp', type=int, default=8899,
                        help='The port number for the web server')
    parser.add_argument('--web-host', '-wh', type=str, default='0.0.0.0',
                        help='The hostname for the web server')
    parser.add_argument('--skip-scheduler', action='store_true',
                        help='Prevent the scheduler from crawling')
    parser.add_argument('--scheduler-run-once', action='store_true',
                        help='Run all tasks in scheduler only once')
    parser.add_argument('--version', '-v', action='store_true',
                        help='Print the version of Scylla')
    parser.add_argument('--db-path', type=str, default='./scylla.db',
                        help='The sqlite database file location')
    parser.add_argument(
        '--validation-pool', type=int, default=31,
        help=
        'The validation pool size (i.e. the limit of concurrent validation tasks for proxies)'
    )
    parser.add_argument('--no-forward-proxy-server', action='store_true',
                        help='Disable the forward proxy server')
    parser.add_argument('--proxy-port', '-pp', type=int, default=8081,
                        help='The port number for the forward proxy')
    parser.add_argument('--chrome-path', '-cp', type=str, default=None,
                        help='path of chrome/chromium to be used to render js')

    parsed_args = parser.parse_args(args)
    parsed_args_dict = vars(parsed_args)

    batch_set_config(**parsed_args_dict)
    handle_special_flags(parsed_args_dict)

    # These imports run only after the configuration has been stored.
    # NOTE(review): presumably because the modules read config at import
    # time -- confirm before hoisting them to module level.
    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server_non_blocking
    from scylla.proxy import start_forward_proxy_server_non_blocking

    create_db_tables()

    s = Scheduler()
    # Handles returned by the non-blocking starters; None means the
    # corresponding server was disabled on the command line.
    p_proxy, p_web = None, None

    try:
        # scheduler
        if not get_config('skip_scheduler'):
            run_once = bool(get_config('scheduler_run_once'))
            logger.info('Start scheduler, run_once=%s' % run_once)
            s.start(run_once)

        # forward proxy server
        if not get_config('no_forward_proxy_server'):
            logger.info('Start forward proxy server')
            p_proxy = start_forward_proxy_server_non_blocking()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start web server')
            p_web = start_web_server_non_blocking(workers=1)

        # exit
        if s.is_alive():
            s.join()
            logger.info('scheduler done.')

        # Join each server independently: with only one of them enabled
        # the other handle is still None and must not be dereferenced.
        for server in (p_proxy, p_web):
            if server is not None:
                server.join()
    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        s.stop()
        sys.exit(0)

    logger.info('scylla exiting...')
    return 0
def start_web_server(workers=1):
    """Run the web ``app`` on the configured host and port.

    :param workers: forwarded unchanged to ``app.run`` as ``workers``.

    The host comes from the ``web_host`` config entry (default
    ``'0.0.0.0'``, coerced to ``str``) and the port from ``web_port``
    (default ``'8899'``, coerced to ``int``).
    """
    app.run(
        host=str(get_config('web_host', default='0.0.0.0')),
        port=int(get_config('web_port', default='8899')),
        workers=workers,
    )