Example 1
    async def get_proxy(self):
        result = list()
        resp = await self._request(Request('http://www.goubanjia.com/', headers=self.header))
        if resp is None or resp.status != 200:
            log.info(f'{self.__class__.__name__} request failed {resp}')
            return

        proxy_list = resp.xpath_selector.xpath('//td[@class="ip"]')
        xpath_str = """.//*[not(contains(@style, 'display: none'))
                            and not(contains(@style, 'display:none'))
                            and not(contains(@class, 'port'))]/text()"""
        for each_proxy in proxy_list:
            try:
                # the ':' separator sits as bare text under the <td>, the rest is inside
                # div/span/p elements; extract the ip here, the port is handled below
                ip_addr = ''.join(each_proxy.xpath(xpath_str))

                # The port shown in the HTML is a random number; the real port is
                # encoded in the letters appended to the class attribute, e.g.:
                # <span class="port CFACE">9054</span>
                # where CFACE decodes to 3128.
                port = 0
                for ch in each_proxy.xpath(".//span[contains(@class, 'port')]"
                                           "/attribute::class")[0].replace("port ", ""):
                    # each class letter maps to a digit: A=0, B=1, ...
                    port = port * 10 + (ord(ch) - ord('A'))
                port //= 8  # dividing the concatenated digits by 8 gives the real port

                result.append('{}:{}'.format(ip_addr, port))
            except Exception:
                # skip rows that do not match the expected markup
                pass
        return result
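
A quick standalone check of the port decoding above; `decode_port` is a hypothetical helper (not part of the original class) and the class string "port CFACE" is taken from the comment:

def decode_port(cls_attr: str) -> int:
    # Map each letter after "port " to a digit (A=0, B=1, ...),
    # concatenate the digits, then divide by 8 to recover the real port.
    value = 0
    for ch in cls_attr.replace("port ", ""):
        value = value * 10 + (ord(ch) - ord('A'))
    return value // 8

print(decode_port("port CFACE"))  # -> 3128, matching the comment above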
Example 2
    async def get_proxy(self) -> list:
        result = list()

        for i in range(1, 2):
            url = 'http://ip.jiangxianli.com/?country=中国&?page={}'.format(i)
            resp = await self._request(Request(url, headers=self.header))
            if resp is None or resp.status != 200:
                log.info(f'{self.__class__.__name__} request failed {resp}')
                continue

            for index, tr in enumerate(resp.xpath_selector.xpath("//table//tr")):
                if index == 0:
                    continue
                result.append(":".join(tr.xpath("./td/text()")[0:2]).strip())
        return result
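
For reference, the row handling above can be tried in isolation with lxml; the HTML fragment below is a made-up stand-in for the jiangxianli table (header row plus one data row), not captured output:

from lxml import etree

html = ("<table>"
        "<tr><th>IP</th><th>PORT</th><th>ANONYMITY</th></tr>"
        "<tr><td>1.2.3.4</td><td>8080</td><td>elite</td></tr>"
        "</table>")
rows = etree.HTML(html).xpath("//table//tr")

# Row 0 is the header, hence the `if index == 0: continue` above.
for tr in rows[1:]:
    print(":".join(tr.xpath("./td/text()")[0:2]).strip())  # -> 1.2.3.4:8080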
Example 3
    async def get_proxy(self) -> list:
        result = list()

        base_url = 'http://www.89ip.cn/index_{}.html'
        for page in range(1, 2):
            url = base_url.format(page)
            resp = await self._request(Request(url, headers=self.header))
            if resp is None or resp.status != 200:
                log.info(f'{self.__class__.__name__} request failed {resp}')
                continue
            proxies = re.findall(
                r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>',
                resp.text)
            for proxy in proxies:
                result.append(':'.join(proxy))
        return result
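
The regex above can also be exercised on its own; the fragment below is a hypothetical stand-in for one 89ip.cn table row, not live output:

import re

sample_html = """
<tr>
    <td> 1.2.3.4 </td>
    <td> 8080 </td>
</tr>
"""
pattern = (r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>'
           r'[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>')
print(re.findall(pattern, sample_html))  # -> [('1.2.3.4', '8080')]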
Example 4
    async def get_proxy(self) -> list:
        result = list()

        urls = ['http://www.ip3366.net/free/?stype=1',
                "http://www.ip3366.net/free/?stype=2"]
        for url in urls:
            resp = await self._request(Request(url, headers=self.header))
            if resp is None or resp.status != 200:
                log.info(f'{self.__class__.__name__} request failed {resp}')
                continue

            proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', resp.text)
            for proxy in proxies:
                result.append(":".join(proxy))

        return result
Example 5
    async def get_proxy(self) -> list:
        result = list()

        base_url = 'http://www.qydaili.com/free/?action=china&page='
        for page in range(1, 2):
            url = base_url + str(page)
            resp = await self._request(Request(url, headers=self.header))
            if resp is None or resp.status != 200:
                log.info(f'{self.__class__.__name__} request failed {resp}')
                continue

            proxies = re.findall(
                r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td.*?>(\d+)</td>',
                resp.text)
            for proxy in proxies:
                result.append(':'.join(proxy))
        return result
Example 6
    async def get_proxy(self) -> list:
        result = list()
        url_list = [
            'https://www.kuaidaili.com/free/inha/',
            'https://www.kuaidaili.com/free/intr/'
        ]
        for url in url_list:
            resp = await self._request(Request(url, headers=self.header))
            if resp is None or resp.status != 200:
                log.info(f'{self.__class__.__name__} request failed {resp}')
                continue

            proxy_list = resp.xpath_selector.xpath('.//table//tr')
            time.sleep(1)  # must pause here, otherwise the second request returns no data
            for tr in proxy_list[1:]:
                result.append(':'.join(tr.xpath('./td/text()')[0:2]))

        return result
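
One caveat about the `time.sleep(1)` above: inside a coroutine it blocks the whole event loop, not just this task. If only a pause between the two kuaidaili requests is needed, `await asyncio.sleep(1)` is the non-blocking equivalent; a minimal sketch under that assumption follows (the `fetch` callable is hypothetical):

import asyncio

async def fetch_spaced(urls, fetch, delay=1.0):
    # `fetch` is a hypothetical coroutine: URL in, list of "ip:port" strings out.
    # asyncio.sleep yields to the event loop while waiting, unlike time.sleep,
    # which stalls every other running task.
    result = []
    for i, url in enumerate(urls):
        if i > 0:
            await asyncio.sleep(delay)
        result.extend(await fetch(url))
    return result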