Beispiel #1
0
 def __init__(self):
     """Create the work queues, empty worker handles and the validation pool.

     The pool size is read from the ``validation_pool`` setting and falls
     back to ``'31'`` when that setting is absent.
     """
     self.worker_queue, self.validator_queue = Queue(), Queue()

     # Handles start out unset; nothing is started from the constructor.
     self.worker_process = self.validator_thread = self.cron_thread = None

     # The setting may be stored as a string, hence the int() coercion.
     pool_size = int(get_config('validation_pool', default='31'))
     self.validator_pool = ThreadPoolExecutor(max_workers=pool_size)
Beispiel #2
0
def create_connection() -> SqliteDatabase:
    """
    Return the module-wide database handle, creating it on first use.

    The handle is cached in the module-level ``_db``; while that is falsy a
    new ``SqliteDatabase`` is built from the ``db_path`` setting
    (``./scylla.db`` when the setting is absent).

    :rtype: SqliteDatabase
    """
    global _db
    if not _db:
        logger.debug('create new db connection')
        _db = SqliteDatabase(get_config('db_path', './scylla.db'))
    return _db
Beispiel #3
0
    def get_proxy_and_forward(self):
        """Forward the current request, directly or via a fetched proxy.

        When the ``disable_forward_proxy`` setting is truthy the request is
        forwarded with no arguments. Otherwise a proxy is obtained from
        ``get_proxy`` (asking for an HTTPS one when the request URI starts
        with ``'https'``) and its ``ip``/``port`` are passed to ``forward``.
        """
        use_https = True if self.request.uri.startswith('https') else False

        if get_config('disable_forward_proxy', default=False):
            self.forward()
            return

        upstream = get_proxy(https=use_https)
        self.forward(host=upstream.ip, port=upstream.port)
Beispiel #4
0
def main(args) -> int:
    """Parse command-line arguments and run the scheduler and web server.

    The parsed options are stored through ``batch_set_config`` and handed to
    ``handle_special_flags``. After that the database tables are created,
    the scheduler is started unless ``skip_scheduler`` is set, and the web
    server is started unless ``no_webserver`` is set.

    :param args: argument strings handed to ``ArgumentParser.parse_args``
    :return: always ``0``, including after KeyboardInterrupt/SystemExit
    :rtype: int
    """
    # (flags, keyword arguments) for every supported command-line option.
    option_specs = (
        (('--no-webserver', '-no-ws'),
         dict(action='store_true',
              help='Prevent starting a web server for JSON API')),
        (('--web-port', '-wp'),
         dict(type=int, default=8899,
              help='The port number for the web server')),
        (('--web-host', '-wh'),
         dict(type=str, default='0.0.0.0',
              help='The hostname for the web server')),
        (('--skip-scheduler',),
         dict(action='store_true',
              help='Prevent the scheduler from crawling')),
        (('--version', '-v'),
         dict(action='store_true',
              help='Print the version of Scylla')),
        (('--db-path',),
         dict(type=str, default='./scylla.db',
              help='The sqlite database file location')),
    )

    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=CMD_DESCRIPTION)
    for flags, settings in option_specs:
        arg_parser.add_argument(*flags, **settings)

    options = vars(arg_parser.parse_args(args))

    batch_set_config(**options)
    handle_special_flags(options)

    # Imported only after the configuration has been stored -- presumably so
    # these modules see the final settings; confirm before hoisting them.
    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server

    create_db_tables()

    scheduler = Scheduler()

    try:
        if not get_config('skip_scheduler'):
            scheduler.start()

        if not get_config('no_webserver'):
            logger.info('Start the web server')
            start_web_server(host=options['web_host'],
                             port=options['web_port'])

        scheduler.join()
    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        scheduler.stop()

    return 0
Beispiel #5
0
def test_config_default():
    """Looking up 'empty' with a default hands back that default."""
    fallback = 'baz'
    assert get_config('empty', default=fallback) == fallback
Beispiel #6
0
def test_config():
    """A value stored with set_config is returned by get_config."""
    key, value = 'foo', 'bar'
    set_config(key, value)
    assert get_config(key) == value
Beispiel #7
0
    def __init__(self):
        """Create the HTML session used by this worker.

        The session receives the ``chrome_path`` setting, or ``None`` when
        that setting is absent.
        """
        browser_path = get_config('chrome_path', default=None)
        self.session = HTMLSession(chrome_path=browser_path)
Beispiel #8
0
def start_forward_proxy_server():
    """Build the proxy app, bind it to its port and run the IO loop.

    The port is taken from the ``proxy_port`` setting (``'8081'`` when it is
    absent). The call hands control to the current IOLoop's ``start()``.
    """
    proxy_app = make_app()
    listen_port = int(get_config('proxy_port', default='8081'))
    proxy_app.listen(listen_port)

    message = 'Start forward proxy server on port {}'.format(listen_port)
    logger.info(message)

    ioloop.IOLoop.current().start()
Beispiel #9
0
def main(args) -> int:
    """Parse command-line arguments and run the scheduler and both servers.

    The parsed options are stored through ``batch_set_config`` and handed to
    ``handle_special_flags``. After the database tables are created, each
    component is started unless its switch disables it:

    * the scheduler (``--skip-scheduler``), optionally in run-once mode
      (``--scheduler-run-once``);
    * the forward proxy server (``--no-forward-proxy-server``);
    * the web server (``--no-webserver``).

    The function then waits for the scheduler (if alive) and for every
    server handle that was actually started.

    :param args: argument strings handed to ``ArgumentParser.parse_args``
    :return: ``0`` on normal completion
    :rtype: int
    :raises SystemExit: with status 0 after a KeyboardInterrupt/SystemExit
        was caught and the scheduler was stopped
    """
    parser = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--no-webserver',
                        '-no-ws',
                        action='store_true',
                        help='Prevent starting a web server for JSON API')
    parser.add_argument('--web-port',
                        '-wp',
                        type=int,
                        default=8899,
                        help='The port number for the web server')
    parser.add_argument('--web-host',
                        '-wh',
                        type=str,
                        default='0.0.0.0',
                        help='The hostname for the web server')
    parser.add_argument('--skip-scheduler',
                        action='store_true',
                        help='Prevent the scheduler from crawling')
    parser.add_argument('--scheduler-run-once',
                        action='store_true',
                        help='Run all tasks in scheduler only once')
    parser.add_argument('--version',
                        '-v',
                        action='store_true',
                        help='Print the version of Scylla')
    parser.add_argument('--db-path',
                        type=str,
                        default='./scylla.db',
                        help='The sqlite database file location')
    parser.add_argument(
        '--validation-pool',
        type=int,
        default=31,
        help=
        'The validation pool size (i.e. the limit of concurrent validation tasks for proxies)'
    )
    parser.add_argument('--no-forward-proxy-server',
                        action='store_true',
                        help='Disable the forward proxy server')
    parser.add_argument('--proxy-port',
                        '-pp',
                        type=int,
                        default=8081,
                        help='The port number for the forward proxy')
    parser.add_argument('--chrome-path',
                        '-cp',
                        type=str,
                        default=None,
                        help='path of chrome/chromium to be used to render js')

    parsed_args = parser.parse_args(args)
    parsed_args_dict = vars(parsed_args)

    batch_set_config(**parsed_args_dict)
    handle_special_flags(parsed_args_dict)

    # Imported only after the configuration has been stored -- presumably so
    # these modules see the final settings; confirm before hoisting them.
    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server_non_blocking
    from scylla.proxy import start_forward_proxy_server_non_blocking

    create_db_tables()

    s = Scheduler()
    # Handles returned by the non-blocking starters; None means "not started".
    p_proxy, p_web = None, None

    try:
        # scheduler
        if not get_config('skip_scheduler'):
            run_once = bool(get_config('scheduler_run_once'))
            logger.info('Start scheduler, run_once=%s' % run_once)
            s.start(run_once)

        # forward proxy server
        if not get_config('no_forward_proxy_server'):
            logger.info('Start forward proxy server')
            p_proxy = start_forward_proxy_server_non_blocking()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start web server')
            p_web = start_web_server_non_blocking(workers=1)

        # exit
        if s.is_alive():
            s.join()
            logger.info('scheduler done.')

        # Join each handle on its own: either server may be disabled, and
        # calling .join() on a None handle would raise AttributeError.
        for server_handle in (p_proxy, p_web):
            if server_handle is not None:
                server_handle.join()

    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        s.stop()
        sys.exit(0)

    logger.info('scylla exiting...')
    return 0
Beispiel #10
0
def start_web_server(workers=1):
    """Run the web app on the configured host and port.

    The host comes from the ``web_host`` setting (``'0.0.0.0'`` when absent)
    and the port from ``web_port`` (``'8899'`` when absent); both are
    coerced to ``str`` and ``int`` respectively before use.

    :param workers: worker count passed straight through to ``app.run``
    """
    app.run(
        host=str(get_config('web_host', default='0.0.0.0')),
        port=int(get_config('web_port', default='8899')),
        workers=workers,
    )