def run(self, proxyips):
    result = {}
    proxy_set = self.classify(proxyips)
    for proxy_type in self.proxy_type:
        proxy_list = list(proxy_set.get(proxy_type, set()))
        logger.info('sniffer start, proxy_type: %s, proxy_ip: %s',
                    proxy_type, len(proxy_list))
        result[proxy_type] = self.validator.run_in_multiprocess(proxy_list)
        logger.info('sniffer finish, proxy_type: %s, avail_ip: %s',
                    proxy_type, len(result[proxy_type]))
    if SNIFFER['OUTPUT']:
        try:
            self.save2file(result)
        except Exception as e:
            logger.error("Write file failed, error: %s", e)
    if SNIFFER['BACKEND'] != '':
        try:
            # SNIFFER['BACKEND'] is expected to look like "host:port".
            host, port = SNIFFER['BACKEND'].split(':')
            self.redis = redis.StrictRedis(host=host, port=int(port))
            self.redis.ping()
        except Exception as e:
            logger.error("Backend redis error: %s", e)
            return
        self.reflesh_redis()
        self.save2redis(result)
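
# The redis helpers that run() calls are not part of this excerpt. A minimal
# sketch of one plausible shape, assuming `result` maps proxy_type to an
# {ip_port: speed} dict and redis-py >= 3.0 (zadd takes a mapping); the key
# layout is an assumption, not necessarily the project's actual schema.
def reflesh_redis(self):
    # Drop the old keys so only freshly validated proxies remain.
    for proxy_type in self.proxy_type:
        self.redis.delete(proxy_type)

def save2redis(self, result):
    # Score each proxy by response time so consumers can read the fastest first.
    for proxy_type, proxies in result.items():
        if proxies:
            self.redis.zadd(proxy_type, proxies)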

def validate_job(self, proxy_list):
    result = {}
    while proxy_list:
        ip_port = proxy_list.pop()
        is_valid, speed = self.validate(ip_port)
        if is_valid:
            result[ip_port] = speed
            logger.info("got a valid ip: %s, time: %s", ip_port, speed)
    return result
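
# run_in_multiprocess is referenced from Sniffer.run() but not shown here.
# A minimal sketch, assuming the validator instance is picklable and the
# caller only needs the merged {ip_port: speed} dict; the pool size and
# interleaved chunking are assumptions.
def run_in_multiprocess(self, proxy_list, processes=8):
    from multiprocessing import Pool
    # One interleaved slice per worker, so slow proxies spread evenly.
    chunks = [proxy_list[i::processes] for i in range(processes)]
    result = {}
    with Pool(processes=processes) as pool:
        for partial in pool.map(self.validate_job, chunks):
            result.update(partial)
    return result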

@classmethod
def run(cls):
    proxyip = []
    for source in [CNProxy, CNProxyForeign, IP66, IP66API, IP002,
                   XiCiDaiLi, CZ88, KuaiDaiLi, KuaiDaiLi2]:
        instance = source()
        proxyips = instance.crawl()
        proxyip.extend(proxyips)
        logger.info('%s crawl ip: %s', source.__name__, len(proxyips))
    return proxyip
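
# Each source in the list above is assumed to share the same contract:
# constructed with no arguments, crawl() returns a list of "ip:port" strings.
# A hypothetical source under that assumption -- Spider stands in for the
# project's real base class, and the URL is illustrative only.
class ExampleSource(Spider):
    def crawl(self):
        return self.get('http://example.com/free-proxy-list', encoding='utf-8')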

def validate(self, ip_port):
    proxies = {
        "http": "http://%s" % ip_port,
    }
    try:
        start = time.time()
        r = requests.get(self.target, proxies=proxies, timeout=self.timeout)
        if r.status_code == requests.codes.ok:
            speed = time.time() - start
            logger.info('validating %s, success, time: %ss', ip_port, speed)
            return True, speed
    except Exception as e:
        logger.warning("validating %s, fail: %s", ip_port, e)
    return False, 0
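
# Note: the proxies dict above only routes plain-HTTP traffic through the
# proxy; requests picks the proxy by the target URL's scheme, so an
# https:// target would connect directly. Covering both schemes looks like:
#
#     proxies = {
#         "http": "http://%s" % ip_port,
#         "https": "http://%s" % ip_port,
#     }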

def get(self, url, encoding=None, headers=None):
    logger.info('crawl: %s', url)
    try:
        # requests treats headers=None the same as omitting the argument;
        # the timeout keeps one dead source from hanging the whole crawl.
        r = requests.get(url, headers=headers, timeout=10)
        if encoding:
            r.encoding = encoding
        if r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.text, "html5lib")
            return self.parse(soup)
        raise Exception("HTTP Response Code: %s" % r.status_code)
    except Exception as e:
        logger.error('Crawl error: %s', e)
        return []
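
# get() hands the parsed page to a per-source parse() method, which is not
# shown in this excerpt. A minimal sketch for a typical table-based listing,
# assuming IP and port sit in the first two cells of each row (the selectors
# are illustrative, not any specific source's markup):
def parse(self, soup):
    result = []
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) >= 2:
            ip = cells[0].get_text(strip=True)
            port = cells[1].get_text(strip=True)
            if ip and port.isdigit():
                result.append('%s:%s' % (ip, port))
    return result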

def main():
    proxyips = Crawler.run()
    logger.info('Crawler finish, total ip: %s', len(proxyips))
    sniffer = Sniffer()
    sniffer.run(proxyips)
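
# Standard entry-point guard so the module can be run directly; the excerpt
# ends at main(), so this guard is assumed rather than shown in the original.
if __name__ == '__main__':
    main()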