Ejemplo n.º 1
0
def fetch_ips(q: Queue, validator_queue: Queue):
    """Consume provider classes from *q*, crawl each provider's URLs and
    feed every parsed proxy into *validator_queue* until interrupted.

    :param q: queue yielding provider classes (instantiated per round)
    :param validator_queue: queue receiving candidate proxies
    """
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            # The queue holds provider classes; calling the popped item
            # instantiates one for this round.
            provider: BaseProvider = q.get()()
            provider_name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' +
                         provider_name)

            for url in provider.urls():
                try:
                    html = worker.get_html(
                        url, render_js=provider.should_render_js())
                except Exception as e:
                    logger.error("worker.get_html failed: %s", e)
                    continue

                if not html:
                    continue

                proxies = provider.parse(html)
                for candidate in proxies:
                    validator_queue.put(candidate)

                logger.info(
                    ' {}: feed {} potential proxies into the validator queue'
                    .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            logger.info('worker_process exited.')
            break
Ejemplo n.º 2
0
def cron_schedule(scheduler, only_once=False):
    """Feed providers immediately, then keep re-feeding every 10 minutes.

    :param scheduler: the Scheduler instance
    :param only_once: flag for testing
    """
    def feed():
        # Periodic job body: push all providers into the worker queue.
        scheduler.feed_providers()

    # feed providers at the very beginning
    scheduler.feed_providers()

    schedule.every(10).minutes.do(feed)

    logger.info('Start python scheduler')

    keep_running = True
    while keep_running:
        try:
            schedule.run_pending()
            if only_once:
                keep_running = False
            else:
                time.sleep(60)
        except (KeyboardInterrupt, InterruptedError):
            logger.info('Stopping python scheduler')
            break
Ejemplo n.º 3
0
def send(sender, recver, msg):
    """Pump data from *sender* to *recver* until EOF, then close both sockets.

    :param sender: source socket (read side)
    :param recver: destination socket (write side)
    :param msg: label used in the closing log line
    """
    try:
        while 1:
            data = sender.recv(2048)
            if not data:
                break
            recver.sendall(data)
    finally:
        # Close both ends even when recv/sendall raises (peer reset, etc.)
        # so the sockets are never leaked by an abrupt disconnect.
        logger.info('close conn {}'.format(msg))
        sender.close()
        recver.close()
Ejemplo n.º 4
0
    def feed_from_db():
        """Re-feed proxies updated within the last 14 days into the
        validator queue for a second validation pass."""
        # TODO: better query (order by attempts)
        # Materialize the query once: iterating it and then calling len()
        # on the lazy select would otherwise execute two separate queries.
        proxies = list(ProxyIP.select().where(
            ProxyIP.updated_at > datetime.now() - timedelta(days=14)))
        for p in proxies:
            scheduler.validator_queue.put(p)

        logger.info(
            'Feed {} proxies from the database for a second time validation'.
            format(len(proxies)))
Ejemplo n.º 5
0
def start_forward_proxy_server():
    """Start a blocking TCP forward-proxy listener on 0.0.0.0:8081.

    Each accepted connection is handed to ``proxy`` on its own thread.
    """
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    try:
        # socket.bind() returns None and raises OSError on failure, so the
        # original ``if not server.bind(...)`` was always true and its else
        # branch was unreachable; handle the failure via the exception.
        server.bind(("0.0.0.0", 8081))
    except OSError as e:
        logger.error("cannot listen on {}:{} ({})".format('0.0.0.0', 8081, e))
        server.close()
        return
    server.listen(50)
    logger.info('start listener at {}:{}'.format('0.0.0.0', 8081))
    while True:
        conn, addr = server.accept()
        logger.info('get connect from {}'.format(addr))
        threading.Thread(target=proxy, args=(conn, )).start()
Ejemplo n.º 6
0
def cron_schedule(scheduler, only_once=False):
    """Run the periodic feeding jobs for the scheduler.

    Feeds providers immediately and every 10 minutes, and re-feeds recent
    proxies from the database every FEED_FROM_DB_INTERVAL_MINUTES minutes.

    :param scheduler: the Scheduler instance
    :param only_once: flag for testing
    """
    def feed():
        # Periodic job: push all providers into the worker queue.
        scheduler.feed_providers()

    def feed_from_db():
        """Re-feed proxies updated within 14 days for re-validation."""
        # TODO: better query (order by attempts)
        # Materialize the query once: iterating the lazy select and then
        # calling len() on it would execute two separate queries.
        proxies = list(ProxyIP.select().where(
            ProxyIP.updated_at > datetime.now() - timedelta(days=14)))
        for p in proxies:
            scheduler.validator_queue.put(p)

        logger.debug(
            'Feed {} proxies from the database for a second time validation'.
            format(len(proxies)))

    # feed providers at the very beginning
    scheduler.feed_providers()

    schedule.every(10).minutes.do(feed)
    schedule.every(FEED_FROM_DB_INTERVAL_MINUTES).minutes.do(feed_from_db)

    logger.info('Start python scheduler')

    flag = True

    # After 1 minute, try feed_from_db() for the first time
    wait_time_for_feed_from_db = 1 if only_once else 60
    time.sleep(wait_time_for_feed_from_db)
    feed_from_db()

    while flag:
        try:
            schedule.run_pending()

            if only_once:
                flag = False
            else:
                time.sleep(60)
        except (KeyboardInterrupt, InterruptedError):
            logger.info('Stopping python scheduler')
            break
Ejemplo n.º 7
0
    def start(self):
        """
        Start the scheduler with processes for worker (fetching candidate proxies from different providers),
        and validator threads for checking whether the fetched proxies are able to use.

        """
        logger.info('Scheduler starts...')

        # All children are daemonic so they die with the main process.
        # Set the flag once via the constructors (the original set it in
        # the Thread constructor AND re-assigned .daemon afterwards).
        self.cron_thread = Thread(target=cron_schedule,
                                  args=(self, ),
                                  daemon=True)
        self.worker_process = Process(target=fetch_ips,
                                      args=(self.worker_queue,
                                            self.validator_queue),
                                      daemon=True)
        self.validator_thread = Thread(target=validate_ips,
                                       args=(self.validator_queue,
                                             self.validator_pool),
                                       daemon=True)

        self.cron_thread.start()
        self.worker_process.start()
        logger.info('worker_process started')
        self.validator_thread.start()
        logger.info('validator_thread started')
Ejemplo n.º 8
0
def fetch_ips(q: Queue, validator_queue: Queue):
    """Consume provider instances from *q*, crawl their URLs and feed every
    parsed proxy into *validator_queue*; runs until interrupted.

    :param q: queue yielding provider instances
    :param validator_queue: queue receiving candidate proxies
    """
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            provider: BaseProvider = q.get()

            provider_name = provider.__class__.__name__

            logger.debug('Get a provider from the provider queue: ' +
                         provider_name)

            for url in provider.urls():

                html = worker.get_html(url,
                                       render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug('Put new proxy ip into queue: {}'.format(
                            p.__str__()))

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            # Shut the worker down so its browser/session resources are
            # released on exit (the original leaked them here).
            worker.stop()
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug(
                'pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                'Please make sure you have installed all the dependencies for chromium correctly'
            )
        except Exception as e:
            # Stop the broken worker before replacing it, otherwise its
            # resources leak on every reset.
            worker.stop()
            worker = Worker()  # reset worker
            logger.warning('Unhandled exception is detected: {}'.format(e))
Ejemplo n.º 9
0
def proxy(client):
    """Connect *client* to a remote proxy (up to 9 attempts) and pipe data
    in both directions on background threads via ``send``.

    :param client: the accepted client socket
    """
    retry_time = 1
    while retry_time < 10:
        logger.info('start request time: {}'.format(retry_time))
        server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        remote_proxy = get_proxy()
        logger.info('use proxy {}:{}'.format(remote_proxy.ip,
                                             remote_proxy.port))
        if not server.connect_ex((remote_proxy.ip, remote_proxy.port)):
            logger.info('connect success')
            threading.Thread(target=send,
                             args=(client, server, "send to remote")).start()
            threading.Thread(target=send,
                             args=(server, client,
                                   "recv from remote")).start()
            break
        else:
            # Close the failed socket before retrying; the original leaked
            # one socket per failed attempt.
            server.close()
            retry_time += 1
    else:
        # Every attempt failed: release the client connection as well.
        client.close()
Ejemplo n.º 10
0
def fetch_ips(q: Queue, validator_queue: Queue, run_once=False):
    """Consume provider instances from *q*, crawl their URLs and feed every
    parsed proxy into *validator_queue*.

    :param q: queue yielding provider instances
    :param validator_queue: queue receiving candidate proxies
    :param run_once: exit once the queue is drained (used in tests)
    """
    logger.debug('worker_process started.')
    logger.info('fetching ips...')
    worker = Worker()

    while True:
        try:
            if run_once and q.empty():
                # Unwind through the SystemExit handler below so the worker
                # is stopped (the original had an unreachable ``break`` here).
                raise SystemExit

            provider: BaseProvider = q.get()

            provider_name = provider.__class__.__name__

            logger.info('Get a provider from the provider queue: ' +
                        provider_name)

            for url in provider.urls():

                html = worker.get_html(url,
                                       render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.error(
                """pyppeteer.errors.PyppeteerError detected: %s\n
                         'Please make sure you have installed all the dependencies for chromium correctly""",
                e)
            # Stop the worker before bailing out so browser resources are
            # released on this exit path too.
            worker.stop()
            break

    logger.debug('worker_process exited.')
Ejemplo n.º 11
0
def main(args) -> int:
    """Parse CLI *args*, configure Scylla, then run the scheduler and
    (unless disabled) the JSON API web server.

    :param args: argv-style list of command-line arguments
    :return: process exit code (always 0)
    """
    parser = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--no-webserver',
                        '-no-ws',
                        action='store_true',
                        help='Prevent starting a web server for JSON API')
    parser.add_argument('--web-port',
                        '-wp',
                        type=int,
                        default=8899,
                        help='The port number for the web server')
    parser.add_argument('--web-host',
                        '-wh',
                        type=str,
                        default='0.0.0.0',
                        help='The hostname for the web server')
    parser.add_argument('--skip-scheduler',
                        action='store_true',
                        help='Prevent the scheduler from crawling')
    parser.add_argument('--version',
                        '-v',
                        action='store_true',
                        help='Print the version of Scylla')
    parser.add_argument('--db-path',
                        type=str,
                        default='./scylla.db',
                        help='The sqlite database file location')

    parsed_args = parser.parse_args(args)

    parsed_args_dict = vars(parsed_args)

    # Reuse the dict built above instead of calling vars() a second time.
    batch_set_config(**parsed_args_dict)

    handle_special_flags(parsed_args_dict)

    # Deferred imports: config must be set before these modules load.
    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server

    create_db_tables()

    s = Scheduler()

    try:
        if not get_config('skip_scheduler'):
            s.start()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start the web server')
            start_web_server(host=parsed_args_dict['web_host'],
                             port=parsed_args_dict['web_port'])

        s.join()
    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        s.stop()
        return 0

    return 0
Ejemplo n.º 12
0
def start_forward_proxy_server():
    """Build the app, bind the configured proxy port and serve (blocking)."""
    application = make_app()
    listen_port = int(get_config('proxy_port', default='8081'))
    application.listen(listen_port)
    logger.info('Start forward proxy server on port {}'.format(listen_port))
    ioloop.IOLoop.current().start()
Ejemplo n.º 13
0
def main(args) -> int:
    """Parse CLI *args*, configure Scylla, then run the scheduler, the
    forward proxy server and the JSON API web server as configured.

    :param args: argv-style list of command-line arguments
    :return: process exit code (0 on normal completion)
    """
    parser = argparse.ArgumentParser(
        description=CMD_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--no-webserver',
                        '-no-ws',
                        action='store_true',
                        help='Prevent starting a web server for JSON API')
    parser.add_argument('--web-port',
                        '-wp',
                        type=int,
                        default=8899,
                        help='The port number for the web server')
    parser.add_argument('--web-host',
                        '-wh',
                        type=str,
                        default='0.0.0.0',
                        help='The hostname for the web server')
    parser.add_argument('--skip-scheduler',
                        action='store_true',
                        help='Prevent the scheduler from crawling')
    parser.add_argument('--scheduler-run-once',
                        action='store_true',
                        help='Run all tasks in scheduler only once')
    parser.add_argument('--version',
                        '-v',
                        action='store_true',
                        help='Print the version of Scylla')
    parser.add_argument('--db-path',
                        type=str,
                        default='./scylla.db',
                        help='The sqlite database file location')
    parser.add_argument(
        '--validation-pool',
        type=int,
        default=31,
        help=
        'The validation pool size (i.e. the limit of concurrent validation tasks for proxies)'
    )
    parser.add_argument('--no-forward-proxy-server',
                        action='store_true',
                        help='Disable the forward proxy server')
    parser.add_argument('--proxy-port',
                        '-pp',
                        type=int,
                        default=8081,
                        help='The port number for the forward proxy')
    parser.add_argument('--chrome-path',
                        '-cp',
                        type=str,
                        default=None,
                        help='path of chrome/chromium to be used to render js')

    parsed_args = parser.parse_args(args)

    parsed_args_dict = vars(parsed_args)

    # Reuse the dict built above instead of calling vars() a second time.
    batch_set_config(**parsed_args_dict)

    handle_special_flags(parsed_args_dict)

    # Deferred imports: config must be set before these modules load.
    from scylla.database import create_db_tables
    from scylla.loggings import logger
    from scylla.scheduler import Scheduler
    from scylla.web import start_web_server_non_blocking
    from scylla.proxy import start_forward_proxy_server_non_blocking

    create_db_tables()

    s = Scheduler()
    p_web, p_proxy = None, None

    try:
        # scheduler
        if not get_config('skip_scheduler'):
            run_once = bool(get_config('scheduler_run_once'))
            logger.info('Start scheduler, run_once=%s' % run_once)
            s.start(run_once)

        # forward proxy server
        # (the original stored this handle in p_web and the web server's
        # handle in p_proxy — the names were swapped)
        if not get_config('no_forward_proxy_server'):
            logger.info('Start forward proxy server')
            p_proxy = start_forward_proxy_server_non_blocking()

        # web server
        if not get_config('no_webserver'):
            logger.info('Start web server')
            p_web = start_web_server_non_blocking(workers=1)

        # exit: join each child only if it was actually started. The
        # original joined both whenever either existed and crashed on None.
        if s.is_alive():
            s.join()
            logger.info('scheduler done.')
        if p_web:
            p_web.join()
        if p_proxy:
            p_proxy.join()

    except (KeyboardInterrupt, SystemExit):
        logger.info('catch KeyboardInterrupt, exiting...')
        s.stop()
        sys.exit(0)

    logger.info('scylla exiting...')
    return 0