class ProxyFactory:

    def __init__(self):
        self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)

    def get_proxy(self):
        res = self.db.get()
        proxies = {"http": "http://{proxy}".format(proxy=res)}
        return proxies

    def del_proxy(self, proxies):
        key = proxies['http'].split("//")[1]
        print(key)
        return self.db.delete(key)
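# Hedged usage sketch for ProxyFactory above: fetch a proxy dict for requests and
# drop the proxy from the pool if it turns out to be dead. The helper name
# fetch_with_proxy and the 10-second timeout are illustrative assumptions.
import requests

def fetch_with_proxy(url):
    factory = ProxyFactory()
    proxies = factory.get_proxy()          # e.g. {"http": "http://1.2.3.4:8080"}
    try:
        return requests.get(url, proxies=proxies, timeout=10)
    except requests.RequestException:
        # remove the broken proxy and give up for this call
        factory.del_proxy(proxies)
        return None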
class Getter:

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def limit(self, limit_num=500):
        """
        Check whether the number of proxies has reached the pool's configured limit.
        :param limit_num:
        :return:
        """
        if self.redis.count() >= limit_num:
            return True
        else:
            return False

    def run(self):
        print("Getter is running...")
        if not self.limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.put(proxy)
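# Minimal sketch of driving the Getter above on a fixed cycle, assuming its
# RedisClient and Crawler dependencies are importable. The 20-second interval is
# an illustrative value, not taken from the source.
import time

def run_getter_forever(cycle_seconds=20):
    getter = Getter()
    while True:
        getter.run()            # crawls only while the pool is below its limit
        time.sleep(cycle_seconds)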
class ProxyValidator:

    def __init__(self):
        self.proxy_queue = Queue()
        self.logger = logger
        self.html_request = HtmlRequest()
        self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)

    def start_valid(self, thread_num=10):
        thread_list = []
        for i in range(thread_num):
            thread_list.append(Thread(target=self.valid, name="check_proxy_thread-%d" % i))
        for thread in thread_list:
            thread.daemon = True
            thread.start()
        for thread in thread_list:
            thread.join()

    def valid(self):
        while not self.proxy_queue.empty():
            proxy = self.proxy_queue.get()
            if not self.check(proxy):
                self.logger.info("invalid proxy %s", proxy)
                self.db.delete(proxy)
            self.proxy_queue.task_done()

    def run(self):
        self.init_queue()
        while True:
            if not self.proxy_queue.empty():
                self.logger.info("start valid proxy...")
                self.start_valid()
            else:
                self.logger.info("valid complete! wait next valid")
                time.sleep(60 * 10)
                self.init_queue()

    def init_queue(self):
        for item in self.db.get_all():
            self.proxy_queue.put(item)

    def check(self, proxy):
        proxies = {"http": "http://{proxy}".format(proxy=proxy)}
        try:
            # discard proxies that take more than 20 seconds
            headers = {
                'Host': 'kyfw.12306.cn',
                'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
            }
            r = self.html_request.get(config.CHECK_TARGET, header=headers, proxies=proxies)
            # r = requests.get(url=config.CHECK_TARGET, headers=headers, proxies=proxies, timeout=10, verify=False)
            if r.status_code == 200:
                logger.info('%s is ok' % proxy)
                return True
        except Exception as e:
            logger.error(str(e))
        return False
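# Hedged sketch: running the ProxyValidator above as a long-lived process.
# run() loads all stored proxies into the queue, validates them with a pool of
# daemon threads, then sleeps ten minutes before the next pass.
if __name__ == '__main__':
    validator = ProxyValidator()
    validator.run()          # blocks forever; Ctrl+C to stop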
class ProxyPool:

    def __init__(self):
        self.logger = logger
        self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)
        self.html_request = HtmlRequest()
        self.html_parser = HtmlParser()

    def update(self):
        """
        Refresh the proxy pool.
        :return:
        """
        while True:
            if self.db.nums() < config.PROXY_MINNUM:
                self.logger.info("db exists ip:%d, less the minnum, start crawling proxy..." % self.db.nums())
                spawns = []
                # keep a handle to the greenlet so joinall() actually waits for the crawl
                spawns.append(gevent.spawn(self.crawl_gatherproxy))
                # for parser in config.parserList:
                #     spawns.append(gevent.spawn(self.crawl, parser))
                #     if len(spawns) >= config.MAX_DOWNLOAD_CONCURRENT:
                #         gevent.joinall(spawns)
                #         spawns = []
                gevent.joinall(spawns)
            else:
                self.logger.info("db exists ip:%d, enough to use, wait next update..." % self.db.nums())
                time.sleep(config.UPDATE_TIME)

    def crawl(self, parser):
        for url in parser['urls']:
            response = self.html_request.get(url)
            if response:
                proxy_list = self.html_parser.parse(response.text, parser)
                if proxy_list:
                    self.logger.info("get %d proxy from %s", len(proxy_list), url)
                    for proxy in proxy_list:
                        if self.valid(proxy):
                            # save proxy
                            self.logger.info("get a valid proxy: %s", proxy)
                            self.db.put(proxy)

    def crawl_gatherproxy(self):
        headers = {
            'Host': 'www.gatherproxy.com',
            'Proxy-Connection': 'keep-alive',
            'Origin': 'http://www.gatherproxy.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gatherproxy.com/proxylist/country/?c=China',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        url = 'http://www.gatherproxy.com/proxylist/country/?c=China'
        data = {"Country": "china", "PageIdx": 1, "Filter": '', "Uptime": 0}
        for page in range(1, 40):
            data['PageIdx'] = page
            response = self.html_request.post(url, data, headers)
            proxy_list = []
            root = etree.HTML(response.text)
            proxys = root.xpath(".//table[@id='tblproxy']/tr[position()>2]")
            for proxy in proxys:
                try:
                    ip_text = proxy.xpath(".//td[2]/script")[0].text
                    ip = ip_text.split("'")[1]
                    port_text = proxy.xpath(".//td[3]/script")[0].text
                    # the port is embedded as a hex string inside a script tag
                    port = str(int(port_text.split("'")[1], 16))
                except Exception as e:
                    self.logger.error("parse proxy error: %s", e)
                    continue
                proxy = ":".join([ip, port])
                proxy_list.append(proxy)
            if proxy_list:
                self.logger.info("get %d proxy from %s", len(proxy_list), url)
                for proxy in proxy_list:
                    if self.valid(proxy):
                        # save proxy
                        self.logger.info("get a valid proxy: %s", proxy)
                        self.db.changeTable("gatherproxy")
                        self.db.put(proxy)

    def valid(self, proxy):
        proxies = {"http": "http://{proxy}".format(proxy=proxy)}
        try:
            # discard proxies that take too long to respond
            r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False)
            if r.status_code == 200:
                # logger.info('%s is ok' % proxy)
                return True
        except Exception as e:
            # logger.error(str(e))
            return False
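# Hedged sketch: running ProxyPool.update() under gevent. monkey.patch_all() is
# applied first so the blocking requests/socket calls inside crawl_gatherproxy()
# cooperate with the spawned greenlets; whether the original project patches here
# or elsewhere is an assumption.
from gevent import monkey
monkey.patch_all()

if __name__ == '__main__':
    pool = ProxyPool()
    pool.update()            # loops forever, topping the pool up when it runs low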
# -*- coding: utf-8 -*-
__author__ = 'ada'

# Created by ada on 13/10/2017

from flask import Flask

from config import REDIS
from db.RedisClient import RedisClient

app = Flask(__name__)

client = RedisClient(host=REDIS['host'], port=REDIS['port'], db=REDIS['db'],
                     password=REDIS['password'], max_conns=REDIS['max_conns'])

# imported at the bottom so the routes in ProxyApi can use app and client
from .ProxyApi import *
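# Hypothetical sketch (not the repo's actual ProxyApi module) of the kind of route
# that the trailing "from .ProxyApi import *" could pull in: it reuses the
# module-level `app` and `client` defined above. The route path and the
# RedisClient `pop` helper are assumptions for illustration only.
from flask import jsonify

@app.route('/proxy')
def get_one_proxy():
    proxy = client.pop()                 # assumed RedisClient helper
    return jsonify({"proxy": proxy})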
def get_conn():
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
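# Hedged sketch of how get_conn() is typically used inside a Flask view: the
# RedisClient is created lazily and cached on Flask's context object `g`.
# The `app` object, the route path, and the count() call are illustrative
# assumptions about the surrounding application.
@app.route('/count')
def get_count():
    conn = get_conn()
    return str(conn.count())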
class SpiderSchedule(object):
    """
    Spider scheduler: each data source is crawled with its own strategy.
    """

    def __init__(self):
        self.log = logging.getLogger("proxy.spider")
        self.sched = BlockingScheduler()
        self.client = RedisClient(host=REDIS['host'], port=REDIS['port'], db=REDIS['db'],
                                  password=REDIS['password'], max_conns=REDIS['max_conns'])
        self._config_schedule()

    def _config_schedule(self):
        """
        Configure the scheduled jobs.
        :return:
        """
        for crawl in crawl_list:
            # skip disabled sources
            if not crawl["enable"]:
                continue
            self.log.info("adding job: {}".format(crawl["name"]))
            # trigger type: fixed interval or cron schedule
            if "interval" in crawl:
                d = crawl["interval"]
                self.sched.add_job(self._spider, "interval", [crawl["name"]], **d)
            elif "cron" in crawl:
                d = crawl["cron"]
                self.sched.add_job(self._spider, "cron", [crawl["name"]], **d)

    def _spider(self, name):
        """
        Crawl a single source.
        :param name:
        :return:
        """
        self.log.info("crawling source: {}".format(name))
        crawl_conf = get_crawl_by_name(name)
        for url in crawl_conf["urls"]:
            # throttle downloads
            time.sleep(crawl_conf.get("delay", None) or DOWNLOAD_DELAY)
            content = Downloader.download(url, timeout=config.DOWNLOAD_TIMEOUT, retries=config.DOWNLOAD_RETRIES)
            if content is None:
                self.log.error("download failed, url: " + url)
                continue
            # parse the page
            proxy_list = HtmlParser().parse(url, content, crawl_conf)
            # save the proxies
            self._save(proxy_list, crawl_conf)

    def _save(self, proxy_list, crawl_conf):
        self.client.lpushlist(QUEUE_NAME, proxy_list)

    def run(self):
        try:
            # make sure at least one job is registered before starting
            jobs = self.sched.get_jobs()
            if len(jobs) == 0:
                self.log.error("no jobs configured")
                return
            self.sched.start()
        except Exception:
            self.log.error("failed to run scheduled jobs")
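# Minimal sketch: start the spider scheduler above. __init__ registers the
# interval/cron jobs from crawl_list, and run() blocks inside
# BlockingScheduler.start() until the process is stopped.
if __name__ == '__main__':
    schedule = SpiderSchedule()
    schedule.run()           # blocks until the scheduler is shut down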
class Validator(Thread):
    """
    Proxy validator.
    """

    def __init__(self, thread_id):
        super(Validator, self).__init__()
        # thread ids start from 1
        self.thread_id = thread_id
        self.log = logging.getLogger('proxy.validator_{}'.format(thread_id))
        self.client = RedisClient(host=REDIS['host'], port=REDIS['port'], db=REDIS['db'],
                                  password=REDIS['password'], max_conns=REDIS['max_conns'])

    def _check_exists(self, proxy):
        return self.client.exist(proxy2str(proxy))

    def _save_to_pool(self, proxy):
        mylock.acquire()
        # save to the proxy pool
        if self.client.sadd(POOL_NAME, proxy2str(proxy)) > 0:
            # save the proxy's score
            self.client.hset(POOL_SCORE_NAME, proxy2str(proxy), PROXY_SCORE)
        mylock.release()

    def _save_to_bucket(self, proxy, ttl=BUCKET_TTL):
        return self.client.set(proxy2str(proxy), proxy2str(proxy, 2), ex=ttl)

    def _brpop_queue(self):
        datas = self.client.brpop(QUEUE_NAME)
        try:
            return json.loads(str(datas[1], encoding='utf-8'))
        except Exception:
            return None

    def _valid_proxy(self, proxy):
        ret = Downloader.valid_proxy(proxy)
        if ret == False and proxy.get("protocol", 0) == 1:
            # an https proxy that failed is retried over plain http
            proxy["protocol"] = 0
            ret = Downloader.valid_proxy(proxy)
        return ret

    def run(self):
        while True:
            proxy = self._brpop_queue()
            if proxy is None:
                # nothing in the queue to validate, sleep for 10s
                time.sleep(10)
                continue
            # skip proxies that already exist
            if self._check_exists(proxy):
                continue
            # test the proxy
            if self._valid_proxy(proxy):
                self.log.info("[passed]proxy:{}".format(proxy2str(proxy)))
                # save to the proxy pool
                self._save_to_pool(proxy)
                self._save_to_bucket(proxy, ttl=None)
            else:
                # record it in the checked bucket: proxies that passed are kept
                # indefinitely, failed ones get a TTL
                self._save_to_bucket(proxy)
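# Hedged sketch: launching several Validator threads to drain the queue filled by
# the spider scheduler. The thread count of 5 is an illustrative choice, not taken
# from the source; ids start from 1 as noted in __init__.
if __name__ == '__main__':
    validators = [Validator(thread_id) for thread_id in range(1, 6)]
    for v in validators:
        v.start()
    for v in validators:
        v.join()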