async def get_proxy(self) -> list:
    result = list()
    resp = await self._request(Request('http://www.goubanjia.com/', headers=self.header))
    if resp is None or resp.status != 200:
        log.info(f'{self.__class__.__name__} request failed {resp}')
        return result
    proxy_list = resp.xpath_selector.xpath('//td[@class="ip"]')
    # Keep only visible text nodes: the page hides decoy fragments with
    # display:none and stores a fake port in the 'port' span.
    xpath_str = """.//*[not(contains(@style, 'display: none'))
                    and not(contains(@style, 'display:none'))
                    and not(contains(@class, 'port'))
                    ]/text()
                """
    for each_proxy in proxy_list:
        try:
            # The ':' separator sits bare under the td while the other fragments
            # live in div/span/p elements, so join the visible text nodes to
            # rebuild the IP, then look up the port.
            ip_addr = ''.join(each_proxy.xpath(xpath_str))
            # The port shown in the HTML is a random number; the real port is
            # encoded in the letters appended to the class attribute, e.g.
            #     <span class="port CFACE">9054</span>
            # where "CFACE" decodes to 3128.
            port = 0
            for ch in each_proxy.xpath(".//span[contains(@class, 'port')]"
                                       "/attribute::class")[0].replace("port ", ""):
                port = port * 10 + (ord(ch) - ord('A'))
            port //= 8
            result.append('{}:{}'.format(ip_addr, port))
        except Exception:
            pass
    return result
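# A minimal standalone sketch of the port-decoding rule used above, pulled out
# so it can be tested in isolation; `decode_port` is a hypothetical helper
# name, not part of the original class.
def decode_port(class_attr: str) -> int:
    """Decode goubanjia's obfuscated port, e.g. 'port CFACE' -> 3128."""
    port = 0
    for ch in class_attr.replace("port ", ""):
        port = port * 10 + (ord(ch) - ord('A'))
    return port // 8


assert decode_port("port CFACE") == 3128  # C=2 F=5 A=0 C=2 E=4 -> 25024 // 8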
async def get_proxy(self) -> list:
    result = list()
    for i in range(1, 2):
        # country=中国 ("China") filters the list down to mainland proxies.
        url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
        resp = await self._request(Request(url, headers=self.header))
        if resp is None or resp.status != 200:
            log.info(f'{self.__class__.__name__} request failed {resp}')
            continue
        for index, tr in enumerate(resp.xpath_selector.xpath("//table//tr")):
            if index == 0:
                continue  # skip the table header row
            # The first two cells hold the IP and port.
            result.append(":".join(tr.xpath("./td/text()")[0:2]).strip())
    return result
async def get_proxy(self) -> list:
    result = list()
    base_url = 'http://www.89ip.cn/index_{}.html'
    for page in range(1, 2):
        url = base_url.format(page)
        resp = await self._request(Request(url, headers=self.header))
        if resp is None or resp.status != 200:
            log.info(f'{self.__class__.__name__} request failed {resp}')
            continue
        # Capture (ip, port) pairs from two adjacent table cells.
        proxies = re.findall(
            r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>'
            r'[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>',
            resp.text)
        for proxy in proxies:
            result.append(':'.join(proxy))
    return result
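# A minimal sketch of how the 89ip regex behaves on a sample table row;
# the HTML fragment below is illustrative, not taken from the live site.
# re.findall returns one (ip, port) tuple per matched pair of cells.
import re

_SAMPLE = '<tr><td>\n 1.2.3.4 </td>\n<td>\n 8080 </td></tr>'
_PATTERN = (r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>'
            r'[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>')
assert re.findall(_PATTERN, _SAMPLE) == [('1.2.3.4', '8080')]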
async def get_proxy(self) -> list:
    result = list()
    urls = ['http://www.ip3366.net/free/?stype=1',
            'http://www.ip3366.net/free/?stype=2']
    for url in urls:
        resp = await self._request(Request(url, headers=self.header))
        if resp is None or resp.status != 200:
            log.info(f'{self.__class__.__name__} request failed {resp}')
            continue
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
            resp.text)
        for proxy in proxies:
            result.append(":".join(proxy))
    return result
async def get_proxy(self) -> list:
    result = list()
    base_url = 'http://www.qydaili.com/free/?action=china&page='
    for page in range(1, 2):
        url = base_url + str(page)
        resp = await self._request(Request(url, headers=self.header))
        if resp is None or resp.status != 200:
            log.info(f'{self.__class__.__name__} request failed {resp}')
            continue
        proxies = re.findall(
            r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td.*?>(\d+)</td>',
            resp.text)
        for proxy in proxies:
            result.append(':'.join(proxy))
    return result
async def get_proxy(self) -> list:
    result = list()
    url_list = [
        'https://www.kuaidaili.com/free/inha/',
        'https://www.kuaidaili.com/free/intr/'
    ]
    for url in url_list:
        resp = await self._request(Request(url, headers=self.header))
        if resp is None or resp.status != 200:
            log.info(f'{self.__class__.__name__} request failed {resp}')
            continue
        proxy_list = resp.xpath_selector.xpath('.//table//tr')
        # Must pause between pages or the second request returns no data;
        # asyncio.sleep is used instead of time.sleep so the event loop
        # is not blocked while waiting.
        await asyncio.sleep(1)
        for tr in proxy_list[1:]:
            # The first two cells hold the IP and port; row 0 is the header.
            result.append(':'.join(tr.xpath('./td/text()')[0:2]))
    return result
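# Hedged usage sketch, assuming each get_proxy above belongs to a crawler
# class with this interface; collect() and the class names in the example
# call are hypothetical, not part of the original code.
import asyncio

async def collect(crawlers) -> list:
    # Run every crawler concurrently and flatten the ip:port batches,
    # skipping crawlers whose request failed and returned an empty list.
    batches = await asyncio.gather(*(c.get_proxy() for c in crawlers))
    return [proxy for batch in batches if batch for proxy in batch]

# e.g. proxies = asyncio.run(collect([GoubanjiaCrawler(), KuaidailiCrawler()]))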