async def wrap(self, *args, **kwargs):
    """Call the wrapped handler and merge proxy-state counters into its dict.

    Proxies are bucketed by ``number_of_bad_checks``:
      == 0                                  -> good
      (0, DEAD_PROXY_THRESHOLD)             -> bad
      [DEAD_PROXY_THRESHOLD, DO_NOT_CHECK)  -> dead
      >= DO_NOT_CHECK_ON_N_BAD_CHECKS       -> not checked anymore
    """
    good = await db.count(
        Proxy.select().where(Proxy.number_of_bad_checks == 0)
    )
    bad = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks > 0,
            Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
        )
    )
    dead = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
            Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
        )
    )
    unchecked = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks >= settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
        )
    )

    result = {
        "bad_proxies_count": bad,
        "good_proxies_count": good,
        "dead_proxies_count": dead,
        "not_checked_proxies_count": unchecked,
    }
    # the wrapped handler's own keys win on collision
    result.update(await func(self, *args, **kwargs))
    return result
async def number_of_proxies_to_process(timestamp):
    """Snapshot how many proxies of each state are due for a check.

    A proxy is "due" when its ``next_check_time`` is before *timestamp*;
    the per-state counts are persisted as a ``NumberOfProxiesToProcess`` row.
    """
    async def _due_count(*conditions):
        # Count due proxies matching the extra state conditions.
        return await db.count(
            Proxy.select().where(
                *conditions,
                Proxy.next_check_time < timestamp,
            )
        )

    good_proxies_count = await _due_count(
        Proxy.number_of_bad_checks == 0,
    )
    bad_proxies_count = await _due_count(
        Proxy.number_of_bad_checks > 0,
        Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
    )
    dead_proxies_count = await _due_count(
        Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
        Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
    )

    await db.create(
        NumberOfProxiesToProcess,
        timestamp=timestamp,
        good_proxies=good_proxies_count,
        bad_proxies=bad_proxies_count,
        dead_proxies=dead_proxies_count,
    )
async def process_proxies(self):
    """Main scheduling loop: continuously enqueue due proxies for re-checking.

    Priority per iteration: good proxies first, then bad, then dead ones
    (those past DEAD_PROXY_THRESHOLD but below DO_NOT_CHECK_ON_N_BAD_CHECKS).
    ``self.good_proxies_are_processed`` signals other tasks whether the
    good-proxy backlog is drained.

    Runs forever; unexpected errors are logged and the loop sleeps before
    retrying (or re-raises in DEBUG mode).
    """
    while True:
        await asyncio.sleep(0.01)
        try:
            # check good proxies
            proxies = await db.execute(
                Proxy.select().where(
                    Proxy.number_of_bad_checks == 0,
                    Proxy.next_check_time < time.time(),
                ).order_by(Proxy.next_check_time).limit(
                    settings.NUMBER_OF_CONCURRENT_TASKS)
            )
            if proxies:
                # flip the flag BEFORE enqueueing so consumers that poll it
                # see "busy" while these proxies are still being queued
                self.good_proxies_are_processed = False
                await self.add_proxies_to_queue(proxies)
                continue
            self.good_proxies_are_processed = True

            # check bad proxies
            proxies = await db.execute(
                Proxy.select().where(
                    Proxy.number_of_bad_checks > 0,
                    Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
                    Proxy.next_check_time < time.time(),
                ).order_by(Proxy.next_check_time).limit(
                    settings.NUMBER_OF_CONCURRENT_TASKS)
            )
            await self.add_proxies_to_queue(proxies)
            if proxies:
                continue

            # check dead proxies (still below the "stop checking" threshold)
            proxies = await db.execute(
                Proxy.select().where(
                    Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
                    Proxy.number_of_bad_checks <
                    settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
                    Proxy.next_check_time < time.time(),
                ).order_by(Proxy.next_check_time).limit(
                    settings.NUMBER_OF_CONCURRENT_TASKS)
            )
            await self.add_proxies_to_queue(proxies)
        except KeyboardInterrupt:
            # bare raise preserves the original traceback
            raise
        except BaseException as ex:
            # BaseException on purpose: the loop must survive any checker
            # failure. NOTE(review): this also swallows
            # asyncio.CancelledError — confirm that is intended.
            self.logger.exception(ex)
            if settings.DEBUG:
                raise
            await asyncio.sleep(settings.SLEEP_AFTER_ERROR_PERIOD)
async def get_best_http_proxy(self, request):
    """Return the address of the fastest currently-working HTTP proxy.

    Picks the good proxy (zero bad checks) with the lowest response time
    among those using the "http" protocol, as a plain-text response.
    """
    query = Proxy.select().where(
        Proxy.number_of_bad_checks == 0,
        Proxy.raw_protocol == Proxy.PROTOCOLS.index("http"),
    ).order_by(Proxy.response_time)
    best_proxy = await db.get(query)
    return web.Response(text=best_proxy.address)
async def process_proxy(self, raw_protocol: int, auth_data: str, domain: str,
                        port: int, collector_id):
    """Check one proxy and record the outcome in the database.

    Builds a proxy URL from the parts, runs the checker under the
    concurrency semaphore, then either creates/updates the proxy record
    (on success) or increments its failure bookkeeping (on failure).

    :param raw_protocol: index into ``Proxy.PROTOCOLS``
    :param auth_data: "user:password" credentials or None/"" for none
    :param domain: proxy host
    :param port: proxy port
    :param collector_id: id of the collector that produced this proxy
        (used for logging only here)
    """
    # limit the number of proxies checked concurrently
    async with self.proxies_semaphore:
        self.logger.debug(
            "start processing proxy {}://{}@{}:{} with collector id {}".
            format(raw_protocol, auth_data, domain, port, collector_id))

        if auth_data is None:
            auth_data = ""

        # assemble e.g. "http://user:pass@host:port"
        proxy_url = "{}://".format(Proxy.PROTOCOLS[raw_protocol])
        if auth_data:
            proxy_url += auth_data + "@"
        proxy_url += domain + ":" + str(port)

        # time the check itself; both timestamps are passed on for stats
        start_checking_time = time.time()
        check_result, checker_additional_info = await proxy_utils.check_proxy(
            proxy_url)
        end_checking_time = time.time()

        if check_result:
            self.logger.debug("proxy {0} works".format(proxy_url))
            await self.create_or_update_proxy(
                raw_protocol,
                auth_data,
                domain,
                port,
                start_checking_time,
                end_checking_time,
                checker_additional_info,
            )
        else:
            self.logger.debug("proxy {0} doesn't work".format(proxy_url))
            try:
                proxy = await db.get(Proxy.select().where(
                    Proxy.raw_protocol == raw_protocol,
                    Proxy.auth_data == auth_data,
                    Proxy.domain == domain,
                    Proxy.port == port,
                ))
                proxy.last_check_time = int(time.time())
                # schedule the next attempt one checking period from now
                proxy.next_check_time = (proxy.last_check_time +
                                         proxy.checking_period)
                proxy.number_of_bad_checks += 1
                # a failed check restarts the uptime window
                proxy.uptime = int(time.time())

                # once past the dead threshold, restart bad_uptime too
                if proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD:
                    proxy.bad_uptime = int(time.time())

                # exactly at the cutoff: this proxy leaves the checking
                # rotation (see process_proxies' query bounds)
                if (proxy.number_of_bad_checks ==
                        settings.DO_NOT_CHECK_ON_N_BAD_CHECKS):
                    self.logger.debug(
                        "proxy {} isn't checked anymore".format(
                            proxy.to_url()))

                await db.update(proxy)
            except Proxy.DoesNotExist:
                # never seen this proxy succeed before: nothing to record
                pass
async def get_proxies_html(self, request):
    """Template context for the proxy list page: good proxies, fastest first.

    Response times are converted from microseconds-like integers to seconds
    and uptimes to ``timedelta`` objects; missing values stay ``None``.
    """
    now = time.time()

    def _age(timestamp):
        # seconds elapsed since *timestamp* as a timedelta, None-safe
        if timestamp is None:
            return None
        return datetime.timedelta(seconds=int(now - timestamp))

    def _row(proxy):
        return {
            "address": proxy.address,
            "response_time": (proxy.response_time / 1000
                              if proxy.response_time is not None else None),
            "uptime": _age(proxy.uptime),
            "bad_uptime": _age(proxy.bad_uptime),
            "last_check_time": proxy.last_check_time,
            "checking_period": proxy.checking_period,
            "number_of_bad_checks": proxy.number_of_bad_checks,
            "bad_proxy": proxy.bad_proxy,
            "white_ipv4": proxy.white_ipv4,
            "location": proxy.location,
        }

    proxies = await db.execute(
        Proxy.select().where(
            Proxy.number_of_bad_checks == 0
        ).order_by(Proxy.response_time)
    )
    return {"proxies": [_row(proxy) for proxy in proxies]}
async def create_proxy_count_item(timestamp):
    """Persist a good/bad/dead proxy-count snapshot as a ProxyCountItem row."""
    good_count = await db.count(
        Proxy.select().where(Proxy.number_of_bad_checks == 0)
    )
    bad_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks > 0,
            Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
        )
    )
    # NOTE(review): unlike other stats, "dead" here has no upper bound on
    # bad checks, so it also includes no-longer-checked proxies — confirm
    # that is intended for this counter.
    dead_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
        )
    )

    await db.create(
        ProxyCountItem,
        timestamp=timestamp,
        good_proxies_count=good_count,
        bad_proxies_count=bad_count,
        dead_proxies_count=dead_count,
    )
async def process_raw_proxy(self, proxy, collector_id):
    """Validate one raw proxy string and enqueue it for every protocol.

    Invalid strings are logged against the collector and dropped; proxies
    checked within PROXY_NOT_CHECKING_PERIOD are skipped. Otherwise a
    check task is queued for each entry in ``Proxy.PROTOCOLS``.
    """
    self.logger.debug("processing raw proxy \"{}\"".format(proxy))

    try:
        _, auth_data, domain, port = proxy_validator.retrieve(proxy)
    except proxy_validator.ValidationError as ex:
        self.collectors_logger.error(
            "Collector with id \"{}\" returned bad raw proxy \"{}\". "
            "Message: {}".format(collector_id, proxy, ex)
        )
        return

    # don't care about protocol
    try:
        known_proxy = await db.get(
            Proxy.select().where(
                Proxy.auth_data == auth_data,
                Proxy.domain == domain,
                Proxy.port == port,
            )
        )

        recently_checked = (
            known_proxy.last_check_time +
            settings.PROXY_NOT_CHECKING_PERIOD >= time.time()
        )
        if recently_checked:
            short_address = "{}@".format(auth_data) if auth_data else ""
            short_address += "{}:{}".format(domain, port)
            self.logger.debug(
                "skipping proxy \"{}\" from collector \"{}\"".format(
                    short_address, collector_id)
            )
            return
    except Proxy.DoesNotExist:
        pass

    for raw_protocol in range(len(Proxy.PROTOCOLS)):
        # back off while the good-proxy backlog is still being drained
        while not self.good_proxies_are_processed:
            # TODO: find a better way
            await asyncio.sleep(0.1)

        candidate = Proxy()
        candidate.raw_protocol = raw_protocol
        candidate.auth_data = auth_data
        candidate.domain = domain
        candidate.port = port

        await self.add_proxy_to_queue(candidate, collector_id)
def proxy_generator():
    """Yield host:port of each unused proxy, marking it used as it goes.

    Each record is saved immediately after being claimed, so consuming the
    generator partially still marks only the yielded proxies as used.
    """
    # "== False" is peewee query syntax, not a truthiness test
    unused_proxies = Proxy.select().where(Proxy.used == False)  # noqa: E712
    for record in unused_proxies:
        record.used = True
        record.save()
        yield record.host_port