def parse_page(self, response):
    # nntime.com obfuscates ports with inline JavaScript; grab the variable
    # definitions from the second <script> block so they can be evaluated.
    _port_expr_values = response.xpath(
        '//script[2]/text()').extract_first().encode()[1:]
    trs = response.xpath('//table[@id="proxylist"]/tr')
    if response.url == 'http://nntime.com/proxy-ip-01.htm':
        trs = trs[1:]
    if trs:
        for tr in trs:
            item = ProxyItem()
            _ip = tr.xpath('td[2]/text()').extract_first().encode()
            # The port is built from '+'-joined JS variables; split out the
            # variable names and evaluate each one with PyV8.
            _port_expr = tr.xpath('td[2]//script/text()').extract_first(
            ).encode().split('":"')[1][1:-1].split('+')
            with PyV8.JSContext() as env:
                env.eval(_port_expr_values)
                _port = ''
                for expr in _port_expr:
                    _port += str(env.eval(expr))
            _type = tr.xpath('td[3]/text()').extract_first()
            _id = hash(_ip + _port + _type + self.name)
            item['_id'] = _id
            item['source'] = self.name
            item['ip'] = _ip
            item['port'] = _port
            item['p_type'] = _type
            yield item
def parse_page(self, response):
    # list.proxylistplus.com: proxy rows live in the second table of the body.
    trs = response.xpath('/html/body/div[1]/table[2]/tr')
    trs = trs[2:]
    if trs:
        for tr in trs:
            item = ProxyItem()
            try:
                _ip = tr.xpath('td[2]/text()').extract_first().encode()
                _ip = re.search(r'(\d+\.\d+\.\d+\.\d+)', _ip).group(1)
                _port = tr.xpath('td[3]/text()').extract_first().encode()
                _type = tr.xpath('td[4]/text()').extract_first()
                _id = hash(_ip + _port + _type + self.name)
                item['_id'] = _id
                item['source'] = self.name
                item['ip'] = _ip
                item['port'] = _port
                item['p_type'] = _type
                yield item
            except Exception:
                pass
    # Follow pagination: page names look like Fresh-HTTP-Proxy-List-<n>.
    pages = response.xpath(
        '/html/body/div[1]/table[3]/tr/td/a/@href').extract()
    if pages:
        this_page = response.url.split('//')[-1].split('/')[-1]
        if not this_page or this_page == 'list.proxylistplus.com':
            this_page = 'Fresh-HTTP-Proxy-List-1'
        next_page = u'Fresh-HTTP-Proxy-List-%d' % (int(this_page[-1]) + 1)
        if next_page in pages:
            yield scrapy.http.Request(
                'http://list.proxylistplus.com/' + next_page.encode(),
                callback=self.parse_page)
def parse_page(self, response):
    # mrhinkydink.com: the fifth table on the page holds the proxy rows.
    trs = response.xpath('//table')[4].xpath('tr')
    trs = trs[3:-4]
    if trs:
        for tr in trs:
            item = ProxyItem()
            _ip = tr.xpath('td[1]/text()').extract_first().encode()
            _port = tr.xpath('td[2]/text()').extract_first().encode()
            _type = tr.xpath('td[3]/text()').extract_first()
            _id = hash(_ip + _port + _type + self.name)
            item['_id'] = _id
            item['source'] = self.name
            item['ip'] = _ip
            item['port'] = _port
            item['p_type'] = _type
            yield item
    # Pagination: proxies.htm is page 1, then proxies2.htm, proxies3.htm, ...
    urls = response.xpath('//table')[4].xpath(
        'tr[last()]/td/a[@href]/@href').extract()
    this_page = response.url
    if this_page == 'http://www.mrhinkydink.com/proxies.htm':
        this_page = 'http://www.mrhinkydink.com/proxies1.htm'
    this_page = this_page[-5]
    next_page = int(this_page) + 1
    next_page = u'proxies%d.htm' % next_page
    yield scrapy.http.Request('http://www.mrhinkydink.com/' + next_page.encode(),
                              callback=self.parse_page)
def parse_page(self, response):
    #from scrapy.shell import inspect_response
    #inspect_response(response, self)
    trs = response.xpath('//table[2]/tr[4]/td[1]/table/tr')
    # Variable definitions for the JS-obfuscated ports sit in the first
    # <script> of the body; they are evaluated once per row below.
    _port_expr_values = response.xpath(
        '/html/body/script[1]/text()').extract_first()
    trs = trs[3:]
    if trs:
        for tr in trs:
            item = ProxyItem()
            try:
                _ip = tr.xpath(
                    'td[1]/font[2]/text()').extract_first().encode()
                # The inline script builds the port from '+'-joined fragments;
                # take the expression after the quoted prefix and split it.
                _port_expr = tr.xpath('td[1]/font/script/text()'
                                      ).extract_first().encode().split(
                                          '"+')[1][:-1]
                _port_expr = _port_expr.split('+')
                with PyV8.JSContext() as env:
                    env.eval(_port_expr_values)
                    _port = ''
                    for expr in _port_expr:
                        _port += str(env.eval(expr))
                _type = tr.xpath('td[3]/a/font/text()').extract_first()
                _id = hash(_ip + _port + _type + self.name)
                item['_id'] = _id
                item['source'] = self.name
                item['ip'] = _ip
                item['port'] = _port
                item['p_type'] = _type
                yield item
            except Exception:
                pass
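# A hedged refactoring sketch: the two PyV8-based parsers above repeat the same
# port-decoding dance. Assuming PyV8 is imported as in those spiders, a shared
# helper along these lines could replace both inline blocks; `_decode_port` is a
# hypothetical name, not something the project defines.
def _decode_port(port_expr_values, port_expr_parts):
    """Evaluate the page's JS variable definitions, then join the evaluated
    fragments of the obfuscated port expression into a port string."""
    with PyV8.JSContext() as env:
        env.eval(port_expr_values)           # load the variable definitions
        return ''.join(str(env.eval(expr))   # each fragment yields part of the port
                       for expr in port_expr_parts)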
def parse_page(self, response): trs = response.xpath('//table[@id="sort"]/tr') trs = trs[1:] if trs: for tr in trs: item = ProxyItem() try: _ip = tr.xpath( 'td[last()]/text()').extract_first().encode() _port = tr.xpath( 'td[last()-1]/text()').extract_first().encode() _type = tr.xpath('td[last()-3]/text()').extract_first() _id = hash(_ip + _port + _type + self.name) item['_id'] = _id item['source'] = self.name item['ip'] = _ip item['port'] = _port item['p_type'] = _type yield item except: pass try: next_page = response.xpath( '//input[@type="submit"][@name="page"][@class="this_page"]/following::input[@name="page"][@value][1]/@value' ).extract_first().encode() if next_page: yield scrapy.http.FormRequest( 'http://www.idcloak.com/proxylist/free-proxy-ip-list.html#sort', formdata={ 'port[]': 'all', 'protocol-http': 'true', 'protocol-https': 'true', 'protocol-sock4': 'true', 'protocol-sock5': 'true', 'anonymity-low': 'true', 'anonymity-medium': 'true', 'anonymity-high': 'true', 'connection-low': 'true', 'connection-medium': 'true', 'connection-high': 'true', 'speed-low': 'true', 'speed-medium': 'true', 'speed-high': 'true', 'order': 'desc', 'by': 'updated', 'page': next_page }, callback=self.parse_page) except: pass
def parse_page(self, response):
    trs = response.xpath(
        '/html/body/section/section[4]/section/div/table/tbody/tr')
    for tr in trs:
        item = ProxyItem()
        # Indexing selects a single Selector, which exposes extract()
        # rather than extract_first().
        _ip = tr.xpath('td/text()')[1].extract().encode()
        _port = tr.xpath('td/text()')[2].extract().encode()
        # The type is not extracted here; fall back to 'unknown' as the
        # plain-text spiders do.
        _type = 'unknown'
        _id = hash(_ip + _port + self.name)
        item['_id'] = _id
        item['source'] = self.name
        item['ip'] = _ip
        item['port'] = _port
        item['p_type'] = _type
        yield item
def parse_page(self, response): trs = response.xpath('//table[@id="proxylisttable"]/tbody/tr') if trs: for tr in trs: item = ProxyItem() _ip = tr.xpath('td[1]/text()').extract_first().encode() _port = tr.xpath('td[2]/text()').extract_first().encode() _type = tr.xpath('td[5]/text()').extract_first() _id = hash(_ip + _port + _type + self.name) item['_id'] = _id item['source'] = self.name item['ip'] = _ip item['port'] = _port item['p_type'] = _type yield item
def parse_page(self, response): trs = response.xpath( '//table[@class="table table-striped table-hover"]/tbody/tr') if trs: for tr in trs: item = ProxyItem() _ip, _port = tr.xpath( 'td[1]/a/text()').extract_first().encode().split(':') _type = tr.xpath('td[3]/span/text()').extract_first() _id = hash(_ip + _port + _type + self.name) item['_id'] = _id item['source'] = self.name item['ip'] = _ip item['port'] = _port item['p_type'] = _type yield item
def parse_page(self, response): trs = response.xpath( '//table[@class="table table-hover table-condensed"]/tbody/tr') if trs: for tr in trs: item = ProxyItem() _ip = tr.xpath('td[1]/a/text()').extract_first().encode() _port = tr.xpath('td[2]/text()').extract_first().encode() _type = tr.xpath('td[8]/abbr/text()').extract_first().encode() _id = hash(_ip + _port + _type + self.name) item['_id'] = _id item['source'] = self.name item['ip'] = _ip item['port'] = _port.strip() item['p_type'] = _type yield item
def parse_page(self, response):
    # Plain-text list: one "ip:port" entry per line inside a <pre> block.
    ips = response.xpath('//pre[@style="float:left; "]')[0].xpath(
        'text()').extract_first().split('\r\n')
    if ips:
        for ip in ips:
            if not ip.strip():
                # Skip blank lines left over from the split.
                continue
            item = ProxyItem()
            _ip = ip.encode().split(':')[0]
            _port = ip.encode().split(':')[1]
            _type = 'unknown'
            _id = hash(_ip + _port + _type + self.name)
            item['_id'] = _id
            item['source'] = self.name
            item['ip'] = _ip
            item['port'] = _port
            item['p_type'] = _type
            yield item
def parse_page(self, response):
    # The proxy list is embedded as a JSON array inside an inline script;
    # strip the leading assignment and trailing statements before parsing.
    ips = json.loads(
        response.xpath('//script[@type="text/javascript"][3]/text()').
        extract_first().encode().split(';')[0][13:])
    if ips:
        for ip in ips:
            item = ProxyItem()
            # Each entry keys the IP under 'i', the port under 'p' and the
            # type under 'a'.
            _ip = ip[u'i'].encode()
            _port = ip[u'p'].encode()
            _type = ip[u'a'].encode()
            _id = hash(_ip + _port + _type + self.name)
            item['_id'] = _id
            item['source'] = self.name
            item['ip'] = _ip
            item['port'] = _port
            item['p_type'] = _type
            yield item
def parse_page(self, response): trs = response.xpath( '//table[@class="table table-hover panel-default panel ctable"]/tbody/tr' ) trs = trs[1:] if trs: for tr in trs: item = ProxyItem() _ip = tr.xpath('td[1]/text()').extract_first().encode() _port = tr.xpath('td[2]/text()').extract_first().encode() _type = tr.xpath('td[3]/text()').extract_first() _id = hash(_ip + _port + _type + self.name) item['_id'] = _id item['source'] = self.name item['ip'] = _ip item['port'] = _port item['p_type'] = _type yield item
def parse(self, response):
    #scrapy selector
    trs = response.xpath('//table/tr')
    trs = trs[1:]
    if trs:
        for tr in trs:
            item = ProxyItem()
            try:
                _ip = tr.xpath('td[2]/text()').extract_first().encode()
                _port = tr.xpath('td[3]/text()').extract_first().encode()
                _type = tr.xpath('td[4]/text()').extract_first()
                _id = hash(_ip + _port + _type + self.name)
                item['_id'] = _id
                item['source'] = self.name
                item['ip'] = _ip
                item['port'] = _port
                item['p_type'] = _type
                yield item
            except Exception:
                pass
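# A hedged refactoring sketch: every parser above fills ProxyItem the same way,
# hashing ip/port/type together with the spider name for `_id`. A shared mixin
# method such as the hypothetical `_build_proxy_item` below could remove that
# duplication; it assumes `self.name` is set on the spider, as in the methods above.
def _build_proxy_item(self, _ip, _port, _type):
    item = ProxyItem()
    item['_id'] = hash(_ip + _port + _type + self.name)  # dedup key per source
    item['source'] = self.name
    item['ip'] = _ip
    item['port'] = _port
    item['p_type'] = _type
    return item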