Example 1
    def parse(self, response, **kwargs):
        res = response.body

        # Parse the raw HTML with BeautifulSoup (requires `from bs4 import BeautifulSoup`)
        soup = BeautifulSoup(res, "lxml")
        tbody = soup.find("tbody")

        # soup.find returns None when no <tbody> exists, so guard before iterating
        tr_list = tbody.find_all("tr") if tbody else []

        # Optionally stop the spider when a page yields no rows:
        # if not tr_list:
        #     self.crawler.engine.close_spider(self)

        for tr in tr_list:
            td_list = tr.find_all("td")

            # The first two cells hold the proxy's IP address and port
            ip_info = IPItem()
            ip_info["host"] = td_list[0].text.strip()
            ip_info["port"] = td_list[1].text.strip()

            # Skip proxies already recorded (Cache is a project-specific helper)
            if Cache.is_exist("{}:{}".format(ip_info["host"],
                                             ip_info["port"])):
                continue

            ip_info["proxy_type"] = 0
            ip_info["anonymity_type"] = 1
            ip_info["region"] = FieldParser.get_region(td_list[2].text.strip())

            try:
                yield ip_info
            except Exception as exc:
                self.logger.error("[Program error] {}".format(exc))
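
IPItem, Cache, and FieldParser are project-specific helpers that the snippet assumes are importable. As a minimal sketch (the real definitions may differ), IPItem is presumably a standard Scrapy Item declaring the fields filled in above:

import scrapy

class IPItem(scrapy.Item):
    # Hypothetical reconstruction of the item the spider populates;
    # the actual project may declare additional fields or metadata.
    host = scrapy.Field()
    port = scrapy.Field()
    proxy_type = scrapy.Field()
    anonymity_type = scrapy.Field()
    region = scrapy.Field()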
Example 2
    def parse(self, response, **kwargs):
        res = response.body

        soup = BeautifulSoup(res, "lxml")
        # The proxy list is the third <table> on the page; drop the header row
        table = soup.find_all("table")[2]
        tr_list = table.find_all("tr")[1:]

        # Stop the spider once no rows come back and the page counter has
        # passed its limit (self.page is a project-specific counter)
        if not tr_list and self.page > 2400:
            self.crawler.engine.close_spider(self)

        for tr in tr_list:

            # Cells are addressed positionally: 0 = IP, 1 = port, 2 = region, 3 = anonymity
            ip_info = IPItem()
            ip_info["host"] = tr.contents[0].text
            ip_info["port"] = tr.contents[1].text

            if Cache.is_exist("{}:{}".format(ip_info["host"],
                                             ip_info["port"])):
                continue

            ip_info["proxy_type"] = 0
            ip_info["anonymity_type"] = FieldParser.get_anonymity_type(
                tr.contents[3].text)
            ip_info["region"] = FieldParser.get_region(tr.contents[2].text)

            try:
                yield ip_info
            except Exception as exc:
                self.logger.error("[Program error] {}".format(exc))
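
Both examples stop the crawl by reaching into the engine with self.crawler.engine.close_spider(self). A more conventional alternative inside a Scrapy callback is to raise CloseSpider, which asks the framework to shut the spider down cleanly; a minimal sketch of how Example 2's stop condition could be expressed that way:

from scrapy.exceptions import CloseSpider

# Inside parse(), replacing the direct engine call:
if not tr_list and self.page > 2400:
    raise CloseSpider(reason="pagination exhausted")

Scrapy then closes the spider and records the given reason in the crawl stats.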