def random_proxy():
    """Pick one proxy at random from the local store and return it.

    :return: a randomly chosen proxy IP string
    """
    store = LocalDict()
    ip = store.random()
    Log.info(f"ip: {ip}")
    return ip
def get_proxy():
    """Return the top-scored proxy from the local store (via ``max()``).

    :return: the proxy IP string with the highest score
    """
    store = LocalDict()
    ip = store.max()
    Log.info(f"ip: {ip}")
    return ip
def get_proxies(self, callback):
    """Run the crawl method named *callback* on self and persist its proxies.

    :param callback: name of a crawl method defined on this crawler
    :return: result of ``self.save_proxies(proxies)``
    """
    proxies = []
    try:
        # getattr replaces the original eval("self.{}()".format(callback)):
        # same dynamic dispatch, without evaluating arbitrary code from a string.
        for proxy in getattr(self, callback)():
            Log.debug(f'getter:成功获取到代理: {proxy}')
            proxies.append(proxy)
    except Exception:
        # narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
        Log.error(f'getter:抓取代理异常,{traceback.format_exc()}')
    return self.save_proxies(proxies)
def decrease_proxy():
    """Drop the score of the proxy named in the request query string.

    Reads ``proxy`` from the request args and decreases its score by
    MAX_SCORE in the local store (effectively removing it).

    :return: the literal string "ok"
    """
    proxy = request.args.get("proxy")
    store = LocalDict()
    store.decrease(proxy, MAX_SCORE)
    Log.info(f"删除的ip为{proxy}")
    return "ok"
def clear(self):
    """Remove every proxy whose score has fallen to zero or below.

    Dead keys are collected first so ``self.proxys`` is never mutated while
    being iterated. Best-effort: any error is logged and swallowed.
    """
    with self.mutex:
        try:
            # collect first, then pop — avoids mutating the dict mid-iteration
            useless = [key for key, value in self.proxys.items() if value <= 0]
            for key in useless:
                Log.info(f"清理值为0的无效ip:{key}")
                self.proxys.pop(key)
        except Exception as e:
            # use the shared logger (was a stray print) for consistency
            Log.error(f"清理 ip 异常 {e}")
def run(self):
    """Tester main loop: schedule a check of every stored proxy against each test URL.

    :return: None
    """
    t = set()  # task handles returned by the factory; awaited at the end
    self._minus_count = 0  # proxies dropped during this round (updated by test_single_proxy)
    count = self.local.count()
    if count == 0:
        Log.info("Tester:无代理")
        return
    Log.info(f'Tester:开始运行, 当前容量:{count}')
    try:
        stop = max(0, count)  # defensive clamp; count is already >= 1 here
        test_proxies = self.local.batch(0, stop)
        for proxy in test_proxies:
            for url in TEST_URLS:
                # one factory task per (proxy, url) pair
                t.add(self.factory.add(self.test_single_proxy, url, proxy))
        # purge zero-score entries once all checks have been queued
        self.local.clear()
    except Exception as e:
        Log.error(f'Tester:发生错误 {e.args}')
    self.factory.wait(t)
    Log.info(f'Tester:执行结束, 测试前容量:{count}, 剩余:{count-self._minus_count}')
def test_single_proxy(self, url, proxy):
    """Probe *url* through *proxy* with a HEAD request and adjust the proxy's score.

    :param url: test URL to request
    :param proxy: proxy address, presumably "host:port" — TODO confirm format
    :return: None
    """
    proxies = {
        # NOTE(review): only an "http" proxy is configured; https URLs would
        # bypass the proxy entirely — confirm this is intended.
        "http": "http://" + proxy,
    }
    try:
        response = requests.head(url, headers=base_headers, proxies=proxies,
                                 timeout=15, allow_redirects=False, verify=False)
        status_code = response.status_code
        if status_code in VALID_STATUS_CODES:
            Log.debug(f'Tester:代理可用 {proxy}')
            pass
        else:
            if status_code in FORBIDEN_STATUS_CODES:
                # forbidden status: count the removal and force the score below zero
                self._minus()
                self.local.decrease(proxy, -MAX_SCORE)
            else:
                # other bad status: just step the score down
                self.local.decrease(proxy)
            Log.error(
                f'Tester:请求响应码不合法 {status_code} ,IP {proxy}, URL: {url}')
    except (ReadTimeout, HTTPError, ProxyError, ConnectionError):
        # network-level failure: treat the proxy as dead and remove it
        self._minus()
        self.local.decrease(proxy, -MAX_SCORE)
        Log.warning(f'Tester:无用ip,直接删掉, ip: {proxy}')
    except (TypeError, AttributeError) as e:
        self.local.decrease(proxy)
        Log.error(f'Tester:代理请求失败 {proxy} ERROR: {e}')
def schedule_getter():
    """Periodically run the Getter to fetch fresh proxies.

    On failure the Getter is recreated and the error logged; every round
    ends with a GC pass and a GETTER_CYCLE sleep.
    """
    getter = Getter()
    while True:
        try:
            getter.run()
        except Exception:
            # narrowed from a bare except: a bare except would also swallow
            # KeyboardInterrupt/SystemExit and make the scheduler unkillable
            getter = Getter()
            Log.error(f'getter: 抓取代理异常, {traceback.format_exc()}')
        finally:
            gc.collect()
            time.sleep(GETTER_CYCLE)
def schedule_tester():
    """Periodically run the Tester to re-score stored proxies.

    On failure the Tester is recreated and the error logged; every round
    ends with a GC pass and a TESTER_CYCLE sleep.
    """
    tester = Tester()
    while True:
        try:
            tester.run()
        except Exception:
            # narrowed from a bare except: a bare except would also swallow
            # KeyboardInterrupt/SystemExit and make the scheduler unkillable
            tester = Tester()
            Log.error(f'Tester: 测试代理异常, {traceback.format_exc()}')
        finally:
            gc.collect()
            time.sleep(TESTER_CYCLE)
def shutdown(self):
    """Block until every worker reports "fired", then declare the factory closed.

    :return: True once all workers have left
    """
    for w in self.workers:
        # poll each worker's work-flag until it marks itself fired
        while True:
            if w.wf == "fired":
                break
            Log.debug(f"正在赶打工人{w.wid}号离开")
            time.sleep(CHECK_FACTORY_STATUS)
        Log.debug(f"打工人{w.wid}号离开")
    Log.info("正常关闭工厂")
    return True
def run(self):
    """Getter main loop: fan out every registered crawl function to the factory.

    Skips the round entirely when the pool is already at capacity.
    :return: None
    """
    t = set()  # task handles returned by the factory
    count = self.local.count()
    if self.is_over_threshold():
        Log.info("Getter:此时容量已达上限,不获取ip")
        return
    Log.info(f'Getter:开始执行, 当前容量:{count}')
    for callback_label in range(self.crawler.__CrawlFuncCount__):
        try:
            callback = self.crawler.__CrawlFunc__[callback_label]
            # fetch proxies via the crawler's registered crawl method
            t.add(self.factory.add(self.crawler.get_proxies, callback))
            sys.stdout.flush()
        except Exception:
            # narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
            traceback.print_exc()
    self.factory.wait(t)
    Log.info(f'Getter:执行结束, 获取前容量:{count}, 当前:{self.local.count()}')
def run(self):
    """Worker main loop: pull tasks from the factory until the factory stops or the worker is fired."""
    self.wf = "doing"  # work flag: "doing" while the loop runs, "fired" once it exits
    while self.factory.factory_status and self.status:
        if not self.__running.is_set():
            # paused by the factory ("咋瓦鲁多" / za warudo — time stop)
            Log.info("咋瓦鲁多")
        # fetch the next task
        t = self.factory.get_task()
        if not t:
            self._sleep(self.wid)
            continue
        try:
            self.sleep_count = 0  # got work: reset the idle counter
            t.run()
            # print(f"打工人{self.wid}号 ,剩余工作 {self.factory._active_working}")
        except Exception as e:
            Log.error(f"打工人工作异常: {e}")
        finally:
            # always decrement the active count, even when the task raised
            self.factory.minus_active(t.name)
        self.__running.wait()  # returns immediately when set; blocks while the flag is cleared (paused)
    Log.debug(f"打工人{self.wid}你被炒鱿鱼了")
    self.wf = "fired"
def get_page(url, options=None, proxy=None):
    """Fetch *url* and return the response body on HTTP 200, else None.

    :param url: target URL to scrape
    :param options: extra headers merged over base_headers (default: none)
    :param proxy: requests-style proxies mapping, or None for a direct request
    :return: response text when the status is 200, otherwise None
    """
    # options defaults to None instead of {} — a mutable default argument is
    # shared across calls and is a classic Python pitfall.
    headers = dict(base_headers, **(options or {}))
    Log.debug(f'getter:正在抓取 {url}')
    try:
        response = requests.get(url, headers=headers, verify=False,
                                proxies=proxy, timeout=10)
        Log.debug(f'getter:抓取成功 {url} {response.status_code}')
        if response.status_code == 200:
            return response.text
    except (ConnectionError, requests.exceptions.ReadTimeout):
        Log.error(f'getter:抓取失败 {url}')
    return None
def _sleep(self, wid):
    """Record one more idle round for worker *wid* and nap for WORKER_SLEEP seconds."""
    self.sleep_count = self.sleep_count + 1
    Log.debug(f"打工人{wid}号,沉睡第{self.sleep_count}次")
    time.sleep(WORKER_SLEEP)