Example 1
def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            # the queue holds provider classes; q.get()() instantiates one
            provider: BaseProvider = q.get()()

            provider_name = provider.__class__.__name__

            logger.debug('Get a provider from the provider queue: ' +
                         provider_name)

            for url in provider.urls():
                try:
                    html = worker.get_html(
                        url, render_js=provider.should_render_js())
                except Exception as e:
                    logger.error("worker.get_html failed: %s", e)
                    continue

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            logger.info('worker_process exited.')
            break
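The queue in this version holds provider classes, which q.get()() instantiates on the spot. The interface a provider must satisfy is implied by the calls above; here is a minimal sketch of a hypothetical subclass (the class name and URL are placeholders, not part of the project):

from typing import List


class ExampleProvider(BaseProvider):  # hypothetical provider for illustration

    def urls(self) -> List[str]:
        # pages to scrape for proxy addresses (placeholder URL)
        return ['https://example.com/proxy-list']

    def should_render_js(self) -> bool:
        # static pages do not need a headless browser
        return False

    def parse(self, html) -> list:
        # extract proxy entries from the fetched document; the real
        # logic depends entirely on the page layout
        return []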
Example 2
    def forward(self, host=None, port=None):
        try:
            url = self.request.uri

            body = self.request.body

            if not body:
                body = None

            httpclient.AsyncHTTPClient().fetch(
                httpclient.HTTPRequest(url=url,
                                       method=self.request.method,
                                       body=body,
                                       headers=self.request.headers,
                                       follow_redirects=False,
                                       validate_cert=False,
                                       proxy_host=host,
                                       proxy_port=port), self.handle_response)

        except httpclient.HTTPError as e:
            logger.debug("tornado signalled HTTPError {}".format(e))
            self.set_status(500)
            self.finish()
        except Exception:
            self.set_status(500)
            self.write("Internal server error:\n" +
                       ''.join(traceback.format_exception(*sys.exc_info())))
            self.finish()
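Note that Tornado honors the proxy_host/proxy_port options of HTTPRequest only in its curl-based client, which requires pycurl. If the default simple client is active, a one-time configuration such as the following is needed before the first fetch:

from tornado.httpclient import AsyncHTTPClient

# select the curl implementation once, before any client instance is created
AsyncHTTPClient.configure('tornado.curl_httpclient.CurlAsyncHTTPClient')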
Example 3
    def get_html(self, url: str, render_js: bool = True) -> Union[HTML, None]:
        """Get html from a specific URL

        :param url: the URL
        :param render_js: [whether to render js], defaults to True
        :param render_js: bool, optional
        :return: [the HTML string]
        :rtype: str
        """

        try:
            # TODO: load config for timeout
            response: HTMLResponse = self.session.get(url, timeout=30)
        except requests.RequestException:
            logger.warning('[Worker] Cannot get this url: ' + url)
            return None
        except (KeyboardInterrupt, SystemExit, InterruptedError):
            self.stop()
            return None

        if response.ok:
            if render_js:
                logger.debug('starting render js...')
                response.html.render(wait=1.5, timeout=10.0)
                logger.debug('end render js...')
            return response.html
        else:
            return None
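get_html builds on requests-html: session.get() returns an HTMLResponse whose .html attribute can render the page in headless Chromium. A minimal sketch of the session setup this method assumes (the stop() behavior is an assumption):

from requests_html import HTMLSession


class Worker:

    def __init__(self):
        # HTMLSession is a requests.Session subclass whose responses
        # expose .html with a Chromium-backed render() method
        self.session = HTMLSession()

    def stop(self):
        self.session.close()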
Example 4
    def feed_from_db():
        # TODO: better query (order by attempts)
        proxies = ProxyIP.select().where(
            ProxyIP.updated_at > datetime.now() - timedelta(days=14))
        for p in proxies:
            scheduler.validator_queue.put(p)

        logger.debug(
            'Feed {} proxies from the database for a second validation pass'
            .format(len(proxies)))
Example 5
async def api_v1_proxies(request: Request):
    args = request.raw_args

    limit = 20

    page = 1

    is_anonymous = 2  # 0: no, 1: yes, 2: any

    if 'limit' in args:
        int_limit = _parse_str_to_int(args['limit'])
        limit = int_limit if int_limit else 20

    if 'page' in args:
        int_page = _parse_str_to_int(args['page'])
        # guard against None before comparing; `None > 0` raises TypeError
        page = int_page if int_page and int_page > 0 else 1

    if 'anonymous' in args:
        str_anonymous = args['anonymous']
        if str_anonymous == 'true':
            is_anonymous = 1
        elif str_anonymous == 'false':
            is_anonymous = 0
        else:
            is_anonymous = 2

    proxy_initial_query = _get_valid_proxies_query()

    proxy_query = proxy_initial_query

    if is_anonymous != 2:
        # peewee overloads `==` to build SQL expressions, so the
        # comparisons with True/False below are intentional
        if is_anonymous == 1:
            proxy_query = proxy_initial_query.where(
                ProxyIP.is_anonymous == True)
        elif is_anonymous == 0:
            proxy_query = proxy_initial_query.where(
                ProxyIP.is_anonymous == False)

    proxies = proxy_query.order_by(ProxyIP.updated_at.desc(),
                                   ProxyIP.latency).offset(
                                       (page - 1) * limit).limit(limit)

    # count on the filtered query so pagination matches the returned rows
    count = proxy_query.count()

    logger.debug('Perform SQL query: {}'.format(proxy_query.sql()))

    proxy_list = []

    for p in proxies:
        proxy_list.append(model_to_dict(p))

    return json({
        'proxies': proxy_list,
        'count': count,
        'per_page': limit,
        'page': page,
        'total_page': math.ceil(count / limit),
    })
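The handler relies on a _parse_str_to_int helper that is not shown here. A plausible minimal version, assuming it returns None for unparsable input (consistent with the truthiness checks above):

from typing import Union


def _parse_str_to_int(s: str) -> Union[int, None]:
    # return None for values that are not valid integers, so callers
    # can fall back to their defaults
    try:
        return int(s)
    except (TypeError, ValueError):
        return None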
Example 6
    def handle_response(self, response: HTTPResponse):

        if response.body:
            self.write(response.body)
            self.finish()
        elif response.error:
            logger.debug('The forward proxy has an error: {}'.format(response.error))
            self.finish()
        else:
            self.finish()
Example 7
def create_connection() -> SqliteDatabase:
    """
    create a database connection
    :rtype: SqliteDatabase
    """
    global _db
    if _db:
        return _db
    else:
        logger.debug('create new db connection')
        _db = SqliteDatabase(get_config('db_path', './scylla.db'))
        return _db
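A short usage sketch, assuming the ProxyIP model is bound to this database (both calls are standard peewee API):

db = create_connection()
db.connect(reuse_if_open=True)
# create the table if it does not exist yet
db.create_tables([ProxyIP], safe=True)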
Example 8
    def _get_html_js(self, url: str) -> Union[PyQuery, None]:
        page = self.browser.new_page()
        try:
            # requests-style timeouts are in seconds, but Playwright expects
            # milliseconds, hence the conversion
            response = page.goto(url=url, timeout=DEFAULT_TIMEOUT_SECONDS * 1000,
                                 wait_until='domcontentloaded')

            if not response:
                logger.debug(f'Request for {url} failed because response is None')
                return None

            if response.ok:
                return PyQuery(page.content())

            logger.debug(f'Request for {url} failed, status code: {response.status}')
            return None
        finally:
            # always release the page so browser contexts are not leaked
            page.close()
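A minimal sketch of the state this Worker needs: a requests session for plain fetches and a Playwright-driven Chromium for _get_html_js. The attribute names mirror the snippets; everything else is an assumption:

import requests
from playwright.sync_api import sync_playwright


class Worker:

    def __init__(self):
        self.requests_session = requests.Session()
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(headless=True)

    def stop(self):
        self.requests_session.close()
        self.browser.close()
        self._playwright.stop()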
Example 9
    def _get_html_no_js(self, url: str) -> Union[PyQuery, None]:
        try:
            # TODO: load config for timeout
            response: Response = self.requests_session.get(url, timeout=DEFAULT_TIMEOUT_SECONDS)
        except requests.RequestException:
            logger.warning('[Worker] Cannot get this url: ' + url)
            return None
        except (KeyboardInterrupt, SystemExit, InterruptedError):
            self.stop()
            return None

        if response.ok:
            doc = PyQuery(response.text)
            return doc
        else:
            logger.debug(f'Request for {url} failed, status code: {response.status_code}')
            return None
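A sketch of a public entry point that dispatches between the two fetchers above depending on whether the provider needs JavaScript rendering (the method name mirrors Example 3, but this version is an assumption):

    def get_html(self, url: str, render_js: bool = True) -> Union[PyQuery, None]:
        # take the headless-browser path only when the page requires JS
        if render_js:
            return self._get_html_js(url)
        return self._get_html_no_js(url)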
Example 10
def cron_schedule(scheduler, run_once=False):
    """
    :param scheduler: the Scheduler instance
    :param run_once: flag for testing
    """
    def feed():
        scheduler.feed_providers()

    def feed_from_db():

        # TODO: better query (order by attempts)
        proxies = ProxyIP.select().where(
            ProxyIP.updated_at > datetime.now() - timedelta(days=14))
        for p in proxies:
            scheduler.validator_queue.put(p)

        logger.info(
            'Feed {} proxies from the database for a second validation pass'
            .format(len(proxies)))

    # feed providers at the very beginning
    scheduler.feed_providers()

    schedule.every(10).minutes.do(feed)
    schedule.every(FEED_FROM_DB_INTERVAL_MINUTES).minutes.do(feed_from_db)

    logger.debug('cron_thread started.')

    # After 1 minute, try feed_from_db() for the first time
    wait_time_for_feed_from_db = 1 if run_once else 60
    time.sleep(wait_time_for_feed_from_db)
    feed_from_db()

    while True:
        try:
            schedule.run_pending()

            if run_once:
                raise SystemExit
            else:
                time.sleep(60)
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            break

    logger.debug('cron_thread exited.')
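A usage sketch: because the function blocks in an endless loop, it is typically run on its own thread; a daemon thread dies together with the main process (the scheduler instance is assumed to be constructed elsewhere):

import threading

cron_thread = threading.Thread(
    target=cron_schedule, args=(scheduler,), daemon=True)
cron_thread.start()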
Example 11
    def get_proxy_and_forward(self):
        https = self.request.uri.startswith('https')

        disable_forward_proxy = get_config('disable_forward_proxy',
                                           default=False)

        if disable_forward_proxy:
            self.forward()
            logger.debug('proxy get_proxy_and_forward option %s',
                         disable_forward_proxy)
        else:
            proxy = get_proxy(https=https)
            self.forward(host=proxy.ip, port=proxy.port)
            logger.debug('proxy get_proxy_and_forward option %s %s %s',
                         disable_forward_proxy, proxy.ip, proxy.port)
Example 12
def fetch_ips(q: Queue, validator_queue: Queue, run_once=False):
    logger.debug('worker_process started.')
    logger.info('fetching ips...')
    worker = Worker()

    while True:
        try:
            if run_once and q.empty():
                raise SystemExit

            provider: BaseProvider = q.get()

            provider_name = provider.__class__.__name__

            logger.info('Get a provider from the provider queue: ' +
                        provider_name)

            for url in provider.urls():

                html = worker.get_html(url,
                                       render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)
                        # logger.debug('Put new proxy ip into queue: {}'.format(p.__str__()))

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.error(
                'pyppeteer.errors.PyppeteerError detected: %s\n'
                'Please make sure you have installed all the dependencies '
                'for chromium correctly', e)
            break

    logger.debug('worker_process exited.')
Example 13
def fetch_ips(q: Queue, validator_queue: Queue):
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            provider: BaseProvider = q.get()

            provider_name = provider.__class__.__name__

            logger.debug('Get a provider from the provider queue: ' +
                         provider_name)

            for url in provider.urls():

                html = worker.get_html(url,
                                       render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug(
                            'Put new proxy ip into queue: {}'.format(p))

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug(
                'pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                'Please make sure you have installed all the dependencies for chromium correctly'
            )
        except Exception as e:
            worker = Worker()  # reset worker
            logger.warning('Unhandled exception is detected: {}'.format(e))
Example 14
def validate_ips(validator_queue: Queue,
                 validator_pool: ThreadPoolExecutor,
                 run_once=False):
    logger.debug('validator_thread started.')

    while True:
        try:
            # in run-once mode, wait up to 5 minutes for the next proxy IP
            proxy: ProxyIP = validator_queue.get(
                timeout=300 if run_once else None)

            validator_pool.submit(validate_proxy_ip, p=proxy)
        except (KeyboardInterrupt, SystemExit):
            break
        except queue.Empty:
            logger.debug('validator_thread has timed out.')
            break

    logger.debug('validator_thread exited.')

    validator_pool.shutdown(wait=True)
    logger.debug('validator_pool exited.')
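A wiring sketch for this thread: a queue that the fetcher feeds and a bounded pool that runs validate_proxy_ip. The pool size here is an arbitrary assumption:

import queue
from concurrent.futures import ThreadPoolExecutor
from threading import Thread

validator_queue: queue.Queue = queue.Queue()
validator_pool = ThreadPoolExecutor(max_workers=10)

validator_thread = Thread(
    target=validate_ips, args=(validator_queue, validator_pool))
validator_thread.start()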
Example 15
    def handle_response(self, response: HTTPResponse):
        if response.body:
            logger.debug('The forward proxy has body')
            self.write(response.body)
            self.set_status(200)
            self.finish()
        elif response.error:
            logger.debug('The forward proxy has an error: {}'.format(
                response.error))

            self.write('The forward proxy has an error: {}'.format(
                response.error))
            self.set_status(500)
            self.finish()
        else:
            logger.debug('The forward proxy empty body finish')
            self.finish()
Example 16
async def api_v1_proxies(request: Request):
    args = request.raw_args

    limit = 20

    page = 1

    is_anonymous = 2  # 0: no, 1: yes, 2: any

    if 'limit' in args:
        int_limit = _parse_str_to_int(args['limit'])
        limit = int_limit if int_limit else 20

    if 'page' in args:
        int_page = _parse_str_to_int(args['page'])
        # guard against None before comparing; `None > 0` raises TypeError
        page = int_page if int_page and int_page > 0 else 1

    if 'anonymous' in args:
        str_anonymous = args['anonymous']
        if str_anonymous == 'true':
            is_anonymous = 1
        elif str_anonymous == 'false':
            is_anonymous = 0
        else:
            is_anonymous = 2

    str_https = None
    if 'https' in args:
        str_https = args['https']

    country_list = []
    if 'countries' in args:
        countries = args['countries']
        country_list = countries.split(',')

    proxy_initial_query = _get_valid_proxies_query()

    proxy_query = proxy_initial_query

    if is_anonymous != 2:
        if is_anonymous == 1:
            proxy_query = proxy_query.where(ProxyIP.is_anonymous == True)
        elif is_anonymous == 0:
            proxy_query = proxy_query.where(ProxyIP.is_anonymous == False)

    if str_https:
        if str_https == 'true':
            proxy_query = proxy_query.where(ProxyIP.is_https == True)
        elif str_https == 'false':
            proxy_query = proxy_query.where(ProxyIP.is_https == False)

    if country_list:
        # peewee's `<<` operator translates to SQL IN
        proxy_query = proxy_query.where(ProxyIP.country << country_list)

    count = proxy_query.count()  # count before sorting

    proxies = proxy_query.order_by(ProxyIP.updated_at.desc(), ProxyIP.latency).offset((page - 1) * limit).limit(limit)

    logger.debug('Perform SQL query: {}'.format(proxy_query.sql()))

    proxy_list = []

    for p in proxies:
        d = model_to_dict(p)
        if d['created_at']:
            d['created_at'] = d['created_at'].timestamp()
        if d['updated_at']:
            d['updated_at'] = d['updated_at'].timestamp()
        proxy_list.append(d)

    return json({
        'proxies': proxy_list,
        'count': count,
        'per_page': limit,
        'page': page,
        'total_page': math.ceil(count / limit),
    })
Example 17
    def feed_providers(self):
        logger.debug('feed {} providers...'.format(len(all_providers)))

        for provider in all_providers:
            self.worker_queue.put(provider)
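A sketch of the registry being iterated; the concrete provider classes are hypothetical. Note that classes rather than instances are queued, which matches the q.get()() instantiation in Example 1:

all_providers = [
    ExampleProviderA,  # hypothetical provider class
    ExampleProviderB,  # hypothetical provider class
]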