from bs4 import BeautifulSoup

# IPItem, Cache, and FieldParser are project-level helpers (imports omitted).
def parse(self, response, **kwargs):
    res = response.body
    soup = BeautifulSoup(res, "lxml")
    tbody = soup.find("tbody")
    tr_list = tbody.find_all("tr")
    # if not tr_list:
    #     self.crawler.engine.close_spider(self)
    for tr in tr_list:
        td_list = tr.find_all("td")
        ip_info = IPItem()
        ip_info["host"] = td_list[0].text.strip()
        ip_info["port"] = td_list[1].text.strip()
        # Skip proxies that are already in the cache.
        if Cache.is_exist("{}:{}".format(ip_info["host"], ip_info["port"])):
            continue
        ip_info["proxy_type"] = 0
        ip_info["anonymity_type"] = 1
        ip_info["region"] = FieldParser.get_region(td_list[2].text.strip())
        try:
            yield ip_info
        except Exception as exc:
            self.logger.error("[Program exception] {}".format(exc))
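# A minimal sketch of the IPItem that both parse() methods fill in. The field
# names come straight from the surrounding code; the project's actual item
# definition may carry more fields, so treat this as an assumption.
import scrapy

class IPItem(scrapy.Item):
    host = scrapy.Field()            # proxy IP address
    port = scrapy.Field()            # proxy port
    proxy_type = scrapy.Field()      # set to 0 in both spiders
    anonymity_type = scrapy.Field()  # 1, or FieldParser.get_anonymity_type(...)
    region = scrapy.Field()          # normalized via FieldParser.get_region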
def parse(self, response, **kwargs):
    res = response.body
    soup = BeautifulSoup(res, "lxml")
    table = soup.find_all("table")[2]
    tr_list = table.find_all("tr")[1:]  # skip the header row
    # No rows past page 2400 means the listing is exhausted; stop the spider.
    if not tr_list and self.page > 2400:
        self.crawler.engine.close_spider(self)
    for tr in tr_list:
        ip_info = IPItem()
        ip_info["host"] = tr.contents[0].text
        ip_info["port"] = tr.contents[1].text
        # Skip proxies that are already in the cache.
        if Cache.is_exist("{}:{}".format(ip_info["host"], ip_info["port"])):
            continue
        ip_info["proxy_type"] = 0
        ip_info["anonymity_type"] = FieldParser.get_anonymity_type(
            tr.contents[3].text)
        ip_info["region"] = FieldParser.get_region(tr.contents[2].text)
        try:
            yield ip_info
        except Exception as exc:
            self.logger.error("[Program exception] {}".format(exc))
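# Both parse() methods gate new items on Cache.is_exist("host:port"). A
# minimal sketch of such a helper, assuming a Redis set as the backing store;
# the key name and storage layout are assumptions, not the project's code.
import redis

class Cache:
    _client = redis.Redis(host="localhost", port=6379, db=0)
    _key = "proxy_pool:seen"  # hypothetical set holding "host:port" strings

    @classmethod
    def is_exist(cls, proxy):
        # True if this proxy was already recorded in the set.
        return bool(cls._client.sismember(cls._key, proxy))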