def parse(self, response):
    """Parse one xroxy.com proxy-list page.

    Yields a Request for the next page while pages remain, then one
    ProxyIPItem per listed proxy row.
    """
    # The bold cell holds the total proxy count; the site shows 10 per page.
    # Floor division keeps the page count an int (plain `/` would produce a
    # float under Python 3 and shift the pagination cut-off).
    total = int(response.xpath(
        '//*[@id="content"]/table[2]/tr/td[1]/table/tr[2]/td/small/b/text()'
    ).extract()[0]) // 10
    if response.url.find('pnum=') == -1:
        cur_page = 0  # first page has no pnum query parameter
    else:
        cur_page = int(search(r'pnum=(\d+)', response.url).group(1))
    if total - cur_page > 1:
        yield Request(url="http://www.xroxy.com/proxylist.php?pnum=%d" %
                      (cur_page + 1),
                      headers={'Referer': response.url})
    # Data rows alternate between class row0 and row1.
    ip_list = response.xpath(
        '//*[@id="content"]/table[1]/tr[@class="row0"] | //*[@id="content"]/table[1]/tr[@class="row1"]'
    )
    for ip in ip_list:
        item = ProxyIPItem()
        item['ip'] = ip.xpath('td[2]/a/text()').extract()[0].strip()
        item['port'] = ip.xpath('td[3]/a/text()').extract()[0].strip()
        # The site labels anonymity level; all of these are HTTP proxies.
        proxy_type = ip.xpath('td[4]/a/text()').extract()[0].strip().lower()
        if proxy_type in ('anonymous', 'transparent', 'high anonymity',
                          'distorting'):
            proxy_type = 'http'
        item['type'] = proxy_type
        yield item
def parse(self, response):
    """Yield one proxy item per data row of the #proxylisttable body."""
    for row in response.xpath('//*[@id="proxylisttable"]/tbody/tr'):
        ip_text = row.xpath('td[1]/text()').extract()[0]
        port_text = row.xpath('td[2]/text()').extract()[0]
        type_text = row.xpath('td[5]/text()').extract()[0].lower()
        item = ProxyIPItem()
        item['ip'] = ip_text
        item['port'] = port_text
        item['type'] = type_text
        yield item
def parse(self, response):
    """Scrape plain "ip:port" pairs out of the raw page text.

    NOTE(review): under Python 3 `response.body` is bytes, so matching it
    against a str pattern raises TypeError; the decoded `response.text`
    is matched instead.
    """
    for endpoint in re.findall(r"\d+\.\d+\.\d+\.\d+:\d+", response.text):
        host, _, port = endpoint.partition(':')
        item = ProxyIPItem()
        item['ip'] = host
        item['port'] = port
        item['type'] = 'http'
        yield item
def parse(self, response):
    """Yield an http proxy item for every table row with at least two cells.

    Rows without two <td> text nodes (headers, separators) are skipped.
    The bare `except:` of the original is narrowed to IndexError — the
    only exception the cell lookups can raise here — so real bugs are no
    longer silently swallowed.
    """
    for tr in response.xpath('//tr'):
        cells = tr.xpath('td/text()')
        try:
            ip = cells[0].extract()
            port = cells[1].extract()
        except IndexError:  # header or malformed row: not enough cells
            continue
        item = ProxyIPItem()
        item['ip'] = ip
        item['port'] = port
        item['type'] = 'http'
        yield item
def parse(self, response):
    """Yield http proxies from the #ip_list table, skipping its header row."""
    # Slicing off the first row is a no-op on an empty selector list, so no
    # explicit length check is needed.
    for row in response.xpath('//table[@id="ip_list"]/tr')[1:]:
        item = ProxyIPItem()
        item['ip'] = row.xpath('td[2]/text()').extract()[0]
        item['port'] = row.xpath('td[3]/text()').extract()[0]
        item['type'] = 'http'
        yield item
def parse(self, response):
    """Parse a JSON array of objects with "IP" and "PORT" keys into items.

    Remains best-effort like the original, but the bare `except: pass` is
    narrowed: an unparseable body aborts the page, and a single malformed
    entry is skipped instead of silently killing the whole list.
    """
    try:
        proxies = json.loads(response.text)
    except ValueError:  # body is not valid JSON
        return
    for proxy in proxies:
        try:
            ip = proxy['IP']
            port = proxy['PORT']
        except (KeyError, TypeError):  # entry missing keys / wrong shape
            continue
        item = ProxyIPItem()
        item['ip'] = ip
        item['port'] = port
        item['type'] = 'http'
        yield item
def parseDetail(self, response):
    """Yield an http proxy item for each row whose first anchor reads "ip:port".

    Rows without a usable td/a cell are reported via print and skipped.
    """
    for row in response.xpath('//tr'):
        try:
            endpoint = row.xpath('td/a/text()')[0].extract()
            parts = endpoint.split(':')
            host, port = parts[0], parts[1]
        except Exception as exc:
            print(exc)
            continue
        item = ProxyIPItem()
        item['ip'] = host
        item['port'] = port
        item['type'] = 'http'
        yield item
def parse(self, response):
    """Yield proxies from the #boxright list, typed by the page URL.

    URLs containing "socks4"/"socks5" mark the whole page as that proxy
    type; anything else is treated as http.
    """
    # The proxy type depends only on the URL, so compute it once up front.
    if 'socks4' in response.url:
        proto = 'socks4'
    elif 'socks5' in response.url:
        proto = 'socks5'
    else:
        proto = 'http'
    # Slice off the first <li> (no-op when the list is empty).
    for entry in response.xpath('//div[@id="boxright"]/div/ul/li')[1:]:
        item = ProxyIPItem()
        item['ip'] = entry.xpath('div[@class="ip"]/text()').extract()[0]
        item['port'] = entry.xpath('div[@class="port"]/text()').extract()[0]
        item['type'] = proto
        yield item
def parse(self, response):
    """Yield http proxies from the free-list table; on non-"proxylist" URLs
    also queue every pagination link but the first.

    The unconditional `pages.pop(0)` of the original raised IndexError when
    the pager was missing; it is now guarded.
    """
    for row in response.xpath('//div[@id="index_free_list"]/table/tbody/tr'):
        item = ProxyIPItem(type="http")
        item["ip"] = row.xpath('td[1]/text()').extract()[0].strip()
        item["port"] = row.xpath('td[2]/text()').extract()[0].strip()
        yield item
    if response.request.url.find('proxylist') < 0:
        pages = response.xpath('//div[@id="listnav"]/ul/li/a')
        if pages:  # guard: pop(0) on an empty selector list raises IndexError
            pages.pop(0)  # skip the first pagination link
        for page in pages:
            path = page.xpath('@href').extract()[0]
            yield Request(url=self.start_urls[0] + path,
                          headers={
                              'Referer': response.request.url,
                              'User-Agent':
                              response.request.headers.get('User-Agent')
                          })
def parse(self, response):
    """Parse an ip.qiaodm.com page: queue category pages from the index,
    then decode the obfuscated proxy rows.

    The port is recovered from a letter-coded CSS class; floor division
    (`//`) restores the Python 2 integer-division intent — plain `/`
    would yield a float port under Python 3. Dead commented-out code from
    earlier scraping strategies has been removed.
    """
    if response.request.url == 'http://ip.qiaodm.com/free/index.html':
        hot_urls = response.xpath(
            '//div[@class="freeb"]/a[contains(@href,"free")]/@href'
        ).extract()
        for url in hot_urls:
            yield Request(url=url, headers={'Referer': self.referer})
        country_urls = response.xpath('//a[@class="item"]/@href').extract()
        for url in country_urls:
            yield Request(url=url, headers={'Referer': self.referer})
    ip_list = response.xpath(
        '//*[@id="main_container"]/div[1]/table/tbody/tr')
    if len(ip_list) > 2:  # drop the two header rows
        ip_list.pop(1)
        ip_list.pop(0)
    for line in ip_list:
        item = ProxyIPItem()
        columns = line.xpath('td')
        # The IP is split across child nodes; hidden (display:none) decoys
        # are excluded and document.write() wrappers stripped.
        ip_spans = columns[0].xpath(
            'node()/script/text() | node()[not(contains(@style, "none"))]/text()'
        ).extract()
        item['ip'] = ''.join([
            a.replace('document.write(\'', '').replace('\');', '')
            for a in ip_spans
        ])
        # Second class token encodes the port: each letter maps to its index
        # in "ABCDEFGHIZ", and the resulting number is divided by 8.
        port = columns[1].xpath('@class').extract()[0].split(' ')[1]
        port = int(''.join([str("ABCDEFGHIZ".index(c)) for c in port])) // 8
        item['port'] = port
        item['type'] = 'http'
        yield item
def parse(self, response):
    """Yield http proxies from the listing table and follow the last pager
    link, stopping pagination once a row's final column reads "超时"."""
    rows = response.xpath('/html/body/center/table[2]/tr/td[1]/table/tr')
    if len(rows) > 1:
        rows.pop(0)  # header row
    follow_next = True
    for row in rows:
        cells = row.xpath('td/text()').extract()
        item = ProxyIPItem()
        item['ip'] = cells[0].strip()
        item['port'] = cells[1].strip()
        item['type'] = 'http'
        if cells[-1].strip() == u'超时':
            follow_next = False
        yield item
    if follow_next:
        next_path = response.xpath(
            '/html/body/center/table[2]/tr/td[1]/p/a[last()]/@href'
        ).extract()[0]
        yield Request(url="%s%s" % (self.referer, next_path),
                      headers={'Referer': response.url})
def parse(self, response):
    """Parse a proxylists.net "cn_N_ext" page: yield proxies and queue the
    next page while pagination links remain.

    The IP is obfuscated inside a td/script as a %-encoded quoted string
    and recovered with unquote(). Fixes: raw regex string, and the local
    no longer shadows the builtin `type`.
    """
    ip_list = response.xpath('body/font/b/table/tr[1]/td[2]/table/tr')
    if len(ip_list) > 3:
        ip_list.pop(1)  # header rows
        ip_list.pop(0)
        pager = ip_list.pop()  # last row holds the pagination links
        cur_page = int(search(r'cn_(\d+)_ext', response.url).group(1))
        total = len(pager.xpath('td/b/a'))
        if total - cur_page > 1:
            yield Request(url="http://www.proxylists.net/cn_%d_ext.html" %
                          (cur_page + 1),
                          headers={'Referer': response.url})
    for row in ip_list:
        item = ProxyIPItem()
        # IP hidden as ...%22<encoded ip>%22... inside the cell's script.
        item['ip'] = unquote(
            search('%22(.*)%22',
                   row.xpath('td/script/text()').extract()[0]).group(1))
        item['port'] = row.xpath('td[2]/text()').extract()[0]
        proxy_type = row.xpath('td[3]/text()').extract()[0].lower()
        # Anonymity labels all describe plain HTTP proxies.
        if proxy_type in ('anonymous', 'transparent', 'high anonymity',
                          'distorting'):
            proxy_type = 'http'
        item['type'] = proxy_type
        yield item