def spider_opened(self, spider):
    """Register *spider* with the proxy pool when it opens.

    Collects the domains that require a proxy (from settings or spider
    attributes), merges in any domains that have IP-ban checks, updates
    ``self.domains`` / ``self.domain_parser`` / the per-domain IP books,
    and records the spider in ``self.spiders``.
    """
    s = spider.settings or {}
    # Ban checks: settings take precedence over the spider attribute.
    ban_checks = s.get(
        "IP_BAN_CHECKS",
        get_or_call_attr(spider, "ip_ban_checks", {}),
    ) or {}
    ban_checks = import_objects(ban_checks)
    if s.get("PROXY_REQUIRED"):
        doms = set(get_or_call_attr(spider, "allowed_domains", []))
    else:
        doms = s.get(
            "PROXY_REQUIRED_DOMAINS",
            get_or_call_attr(spider, "proxy_required_domains", []),
        )
    if ban_checks:
        # BUG FIX: doms may be a plain list here (settings branch); a bare
        # ``doms | set(...)`` would raise TypeError. Normalize to a set first.
        doms = set(doms) | set(ban_checks.keys())
    if not doms:
        return
    if is_iterable(doms):
        for dom in doms:
            if dom not in self.domains:
                self.domains.add(dom)
    if self.domain_parser is None:
        self.domain_parser = domainparser(self.domains)
    else:
        for dom in self.domains:
            self.domain_parser.add_domain(dom)
    # Initialise per-domain good/bad IP books for any new domains.
    for dom in self.domains:
        if dom not in self.good_ips:
            self.good_ips[dom] = {}
            self.bad_ips[dom] = {}
    if ban_checks:
        # Pass the mapping directly instead of **-unpacking: works for any
        # (hashable) domain keys, not just identifier-like strings.
        self.ban_checks.update(ban_checks)
    self.spiders.add(spider)
def __init__(self, settings, stats, crawler):
    """Initialise proxy-pool state from Scrapy *settings*.

    :param settings: Scrapy settings object (``.get`` access).
    :param stats: Scrapy stats collector; per-proxy counters are zeroed here.
    :param crawler: the owning crawler, kept for later access.
    """
    self.domains = set(settings.get("PROXY_REQUIRED_DOMAINS", []))
    self.prox = {}
    # Resolve dotted-path ban-check callables into objects.
    self.ban_checks = import_objects(settings.get("IP_BAN_CHECKS", {}))
    self.crawler = crawler
    self.good_ips = {}   # per-domain mapping of known-good proxy IPs
    self.bad_ips = {}    # per-domain mapping of banned/failed proxy IPs
    self.spiders = set()
    self.stats = stats
    # NOTE: the original assigned splash_url twice (defaults "" then None);
    # only the later assignment took effect, so keep the None default.
    self.splash_url = settings.get("SPLASH_URL", None)
    self.domain_parser = None
    # Zero out all per-proxy stat counters up front.
    for key in (
        "success",
        "fail",
        "retry_proxy_domain",
        "retry_proxy_domain_success",
        "retry_proxy_domain_fail",
        "proxy_domain_banned",
    ):
        self.stats.set_value("%s/%s" % (self.proxy_name, key), 0)
    super(ProxyPoolMiddlewareBase, self).__init__(settings, crawler)