Esempio n. 1
0
 def parse_page(self, response):
     """Yield a ProxyItem for every row of the ``proxylist`` table.

     A row's port is not read as plain text: it is rebuilt by evaluating the
     JavaScript expressions found in the row's inline script, after the
     text of the page's second ``<script>`` has been evaluated in the same
     JS context.
     """
     # Text of the second <script> minus its first character; evaluated
     # in each JS context before the per-row expressions.
     js_definitions = response.xpath(
         '//script[2]/text()').extract_first().encode()[1:]
     rows = response.xpath('//table[@id="proxylist"]/tr')
     if response.url == 'http://nntime.com/proxy-ip-01.htm':
         # On this URL the first row is dropped.
         rows = rows[1:]
     for row in rows:
         item = ProxyItem()
         address = row.xpath('td[2]/text()').extract_first().encode()
         row_script = row.xpath(
             'td[2]//script/text()').extract_first().encode()
         # Keep what follows the first '":"', trim one character from each
         # end, then split into '+'-separated expressions.
         port_exprs = row_script.split('":"')[1][1:-1].split('+')
         # NOTE(review): this runs JavaScript taken from the fetched page.
         with PyV8.JSContext() as js_env:
             js_env.eval(js_definitions)
             port = ''.join([str(js_env.eval(e)) for e in port_exprs])
         kind = row.xpath('td[3]/text()').extract_first()
         item['_id'] = hash(address + port + kind + self.name)
         item['source'] = self.name
         item['ip'] = address
         item['port'] = port
         item['p_type'] = kind
         yield item
Esempio n. 2
0
 def parse_page(self, response):
     """Yield proxies from the listing table, then request the next page.

     Rows come from the second table under the first ``<div>`` of the body,
     skipping its first two rows. Cells 2, 3 and 4 hold the IP, port and
     proxy type. Rows that cannot be parsed are skipped.

     If the pagination links contain ``Fresh-HTTP-Proxy-List-<N+1>``, where
     N is the number of the current page, a request for that page is
     yielded with this method as its callback.
     """
     for tr in response.xpath('/html/body/div[1]/table[2]/tr')[2:]:
         try:
             _ip = tr.xpath('td[2]/text()').extract_first().encode()
             _ip = re.search(r'(\d+\.\d+\.\d+\.\d+)', _ip).group(1)
             _port = tr.xpath('td[3]/text()').extract_first().encode()
             _type = tr.xpath('td[4]/text()').extract_first()
             _id = hash(_ip + _port + _type + self.name)
         except (AttributeError, TypeError, ValueError):
             # A cell is missing, holds no dotted-quad address, or cannot
             # be encoded: skip this row and keep going.
             continue
         # The yield stays outside the try block so that closing the
         # generator (GeneratorExit) is never swallowed.
         item = ProxyItem()
         item['_id'] = _id
         item['source'] = self.name
         item['ip'] = _ip
         item['port'] = _port
         item['p_type'] = _type
         yield item

     pages = response.xpath(
         '/html/body/div[1]/table[3]/tr/td/a/@href').extract()
     if not pages:
         return
     this_page = response.url.split('//')[-1].split('/')[-1]
     if not this_page or this_page == 'list.proxylistplus.com':
         # The bare site URL is the first page of the listing.
         this_page = 'Fresh-HTTP-Proxy-List-1'
     # Read the whole trailing number, not just the last digit, so that
     # page 10 and later advance correctly.
     number = re.search(r'(\d+)$', this_page)
     if number is None:
         return
     next_page = u'Fresh-HTTP-Proxy-List-%d' % (int(number.group(1)) + 1)
     if next_page in pages:
         yield scrapy.http.Request(
             'http://list.proxylistplus.com/' + next_page,
             callback=self.parse_page)
Esempio n. 3
0
 def parse_page(self, response):
     """Yield proxies from the fifth table, then request the next page.

     Rows are taken from the fifth ``<table>`` of the document, leaving out
     its first three and last four rows. Cells 1, 2 and 3 hold the IP, port
     and proxy type.

     A request for ``proxies<N+1>.htm`` is then yielded, where N is the
     number of the current page, with this method as its callback.
     """
     # Function-scope import: this block cannot rely on a file-level one.
     import re

     for tr in response.xpath('//table')[4].xpath('tr')[3:-4]:
         item = ProxyItem()
         _ip = tr.xpath('td[1]/text()').extract_first().encode()
         _port = tr.xpath('td[2]/text()').extract_first().encode()
         _type = tr.xpath('td[3]/text()').extract_first()
         item['_id'] = hash(_ip + _port + _type + self.name)
         item['source'] = self.name
         item['ip'] = _ip
         item['port'] = _port
         item['p_type'] = _type
         yield item

     if response.url == 'http://www.mrhinkydink.com/proxies.htm':
         # The unnumbered landing page counts as page 1.
         page_number = 1
     else:
         # Read the whole number before '.htm' rather than one character,
         # so that page 10 and later advance correctly.
         match = re.search(r'(\d+)\.htm$', response.url)
         if match is None:
             return
         page_number = int(match.group(1))
     yield scrapy.http.Request(
         'http://www.mrhinkydink.com/proxies%d.htm' % (page_number + 1),
         callback=self.parse_page)
Esempio n. 4
0
    def parse_page(self, response):
        """Yield a ProxyItem for each proxy row of the nested listing table.

        The first three rows of the table are skipped. A row's port is
        rebuilt by evaluating the JavaScript expressions in that row's
        inline script, after the text of the body's first ``<script>`` has
        been evaluated in the same JS context. Rows that cannot be parsed
        are skipped.
        """
        trs = response.xpath('//table[2]/tr[4]/td[1]/table/tr')[3:]
        _port_expr_values = response.xpath(
            '/html/body/script[1]/text()').extract_first()
        for tr in trs:
            try:
                _ip = tr.xpath(
                    'td[1]/font[2]/text()').extract_first().encode()
                # Read the script of the *current* row; reading a fixed row
                # would give every item the same port.
                _port_expr = tr.xpath(
                    'td[1]/font/script/text()').extract_first().encode()
                # Keep what follows the first '"+', drop the final
                # character, then split into '+'-separated expressions.
                _port_expr = _port_expr.split('"+')[1][:-1].split('+')
                # NOTE(review): this runs JavaScript taken from the fetched
                # page.
                with PyV8.JSContext() as env:
                    env.eval(_port_expr_values)
                    _port = ''.join(
                        [str(env.eval(expr)) for expr in _port_expr])
                _type = tr.xpath('td[3]/a/font/text()').extract_first()
                _id = hash(_ip + _port + _type + self.name)
            except Exception:
                # Best effort: a missing cell or a JS evaluation error only
                # discards this row. Catching Exception (not a bare except)
                # lets GeneratorExit and KeyboardInterrupt propagate.
                continue
            # The yield stays outside the try block so that closing the
            # generator is never swallowed.
            item = ProxyItem()
            item['_id'] = _id
            item['source'] = self.name
            item['ip'] = _ip
            item['port'] = _port
            item['p_type'] = _type
            yield item
Esempio n. 5
0
 def parse_page(self, response):
     """Yield proxies from the ``sort`` table, then request the next page.

     The first row of the table is skipped. Counting from the end of each
     row, the last cell holds the IP, the one before it the port, and the
     fourth from the end the proxy type. Rows that cannot be parsed are
     skipped.

     The next page is the ``value`` of the first ``page`` input that follows
     the input marked ``this_page``. When one is found, the filter form is
     re-submitted with that page value and this method as the callback.
     """
     for tr in response.xpath('//table[@id="sort"]/tr')[1:]:
         try:
             _ip = tr.xpath(
                 'td[last()]/text()').extract_first().encode()
             _port = tr.xpath(
                 'td[last()-1]/text()').extract_first().encode()
             _type = tr.xpath('td[last()-3]/text()').extract_first()
             _id = hash(_ip + _port + _type + self.name)
         except (AttributeError, TypeError, ValueError):
             # A cell is missing or cannot be encoded: skip this row.
             continue
         # The yield stays outside the try block so that closing the
         # generator (GeneratorExit) is never swallowed.
         item = ProxyItem()
         item['_id'] = _id
         item['source'] = self.name
         item['ip'] = _ip
         item['port'] = _port
         item['p_type'] = _type
         yield item

     try:
         next_page = response.xpath(
             '//input[@type="submit"][@name="page"][@class="this_page"]/following::input[@name="page"][@value][1]/@value'
         ).extract_first().encode()
     except (AttributeError, ValueError):
         # No page input follows the current one (extract_first() returned
         # None), or its value cannot be encoded: nothing more to request.
         return
     if next_page:
         yield scrapy.http.FormRequest(
             'http://www.idcloak.com/proxylist/free-proxy-ip-list.html#sort',
             formdata={
                 'port[]': 'all',
                 'protocol-http': 'true',
                 'protocol-https': 'true',
                 'protocol-sock4': 'true',
                 'protocol-sock5': 'true',
                 'anonymity-low': 'true',
                 'anonymity-medium': 'true',
                 'anonymity-high': 'true',
                 'connection-low': 'true',
                 'connection-medium': 'true',
                 'connection-high': 'true',
                 'speed-low': 'true',
                 'speed-medium': 'true',
                 'speed-high': 'true',
                 'order': 'desc',
                 'by': 'updated',
                 'page': next_page
             },
             callback=self.parse_page)
Esempio n. 6
0
 def parse_page(self, response):
     """Yield a ProxyItem for each row of the proxy table.

     Within a row, the second and third text nodes found directly under its
     ``<td>`` cells are used as the IP and the port. Rows with fewer than
     three such text nodes are skipped. The page exposes no proxy type
     here, so ``p_type`` is set to ``'unknown'``.
     """
     trs = response.xpath(
         '/html/body/section/section[4]/section/div/table/tbody/tr')
     for tr in trs:
         # Extract the whole list once. Indexing the selector list yields a
         # single Selector, which has no extract_first() method.
         cells = tr.xpath('td/text()').extract()
         if len(cells) < 3:
             continue
         _ip = cells[1].encode()
         _port = cells[2].encode()
         # Previously an undefined name was stored as the proxy type.
         _type = 'unknown'
         item = ProxyItem()
         item['_id'] = hash(_ip + _port + self.name)
         item['source'] = self.name
         item['ip'] = _ip
         item['port'] = _port
         item['p_type'] = _type
         yield item
Esempio n. 7
0
 def parse_page(self, response):
     """Yield one ProxyItem per body row of the ``proxylisttable`` table.

     The IP is read from the 1st cell, the port from the 2nd and the proxy
     type from the 5th.
     """
     for row in response.xpath('//table[@id="proxylisttable"]/tbody/tr'):
         item = ProxyItem()
         address = row.xpath('td[1]/text()').extract_first().encode()
         port = row.xpath('td[2]/text()').extract_first().encode()
         kind = row.xpath('td[5]/text()').extract_first()
         # The id is the hash of IP, port, type and spider name joined.
         item['_id'] = hash(address + port + kind + self.name)
         item['source'] = self.name
         item['ip'] = address
         item['port'] = port
         item['p_type'] = kind
         yield item
Esempio n. 8
0
 def parse_page(self, response):
     """Yield one ProxyItem per body row of the striped proxy table.

     The link text of the 1st cell is split on ``:`` into IP and port; the
     proxy type is the text of the ``<span>`` in the 3rd cell.
     """
     rows = response.xpath(
         '//table[@class="table table-striped table-hover"]/tbody/tr')
     for row in rows:
         item = ProxyItem()
         endpoint = row.xpath('td[1]/a/text()').extract_first().encode()
         # Unpacking requires the text to split into exactly two parts.
         address, port = endpoint.split(':')
         kind = row.xpath('td[3]/span/text()').extract_first()
         item['_id'] = hash(address + port + kind + self.name)
         item['source'] = self.name
         item['ip'] = address
         item['port'] = port
         item['p_type'] = kind
         yield item
Esempio n. 9
0
 def parse_page(self, response):
     """Yield one ProxyItem per body row of the condensed proxy table.

     The IP is the link text of the 1st cell, the port the text of the 2nd
     cell, and the proxy type the ``<abbr>`` text of the 8th cell.
     """
     rows = response.xpath(
         '//table[@class="table table-hover table-condensed"]/tbody/tr')
     for row in rows:
         item = ProxyItem()
         address = row.xpath('td[1]/a/text()').extract_first().encode()
         raw_port = row.xpath('td[2]/text()').extract_first().encode()
         kind = row.xpath('td[8]/abbr/text()').extract_first().encode()
         # The id is hashed from the port as extracted; only the stored
         # 'port' field is whitespace-stripped.
         item['_id'] = hash(address + raw_port + kind + self.name)
         item['source'] = self.name
         item['ip'] = address
         item['port'] = raw_port.strip()
         item['p_type'] = kind
         yield item
Esempio n. 10
0
 def parse_page(self, response):
     """Yield a ProxyItem for each ``ip:port`` line of the first ``<pre>``.

     The text of the first ``<pre style="float:left; ">`` element is read
     line by line. Each line is split on ``:``, with the first field used as
     the IP and the second as the port. Lines without a ``:`` (for example
     the empty string left after a trailing line break) are skipped. The
     proxy type is always ``'unknown'``.
     """
     listing = response.xpath('//pre[@style="float:left; "]')[0].xpath(
         'text()').extract_first()
     # splitlines() copes with any line-ending style, not only '\r\n'.
     for line in listing.splitlines():
         fields = line.encode().split(':')
         if len(fields) < 2:
             # Blank or malformed line: no port to read.
             continue
         _ip, _port = fields[0], fields[1]
         _type = 'unknown'
         item = ProxyItem()
         item['_id'] = hash(_ip + _port + _type + self.name)
         item['source'] = self.name
         item['ip'] = _ip
         item['port'] = _port
         item['p_type'] = _type
         yield item
Esempio n. 11
0
 def parse_page(self, response):
     """Yield ProxyItems decoded from JSON embedded in an inline script.

     The text of the third ``text/javascript`` script is cut at its first
     ``;`` and stripped of its first 13 characters. The remainder is parsed
     as JSON. Each parsed entry supplies the IP, port and proxy type under
     the keys ``i``, ``p`` and ``a``.
     """
     script_text = response.xpath(
         '//script[@type="text/javascript"][3]/text()').extract_first()
     payload = script_text.encode().split(';')[0][13:]
     entries = json.loads(payload)
     if not entries:
         return
     for entry in entries:
         item = ProxyItem()
         address = entry[u'i'].encode()
         port = entry[u'p'].encode()
         kind = entry[u'a'].encode()
         item['_id'] = hash(address + port + kind + self.name)
         item['source'] = self.name
         item['ip'] = address
         item['port'] = port
         item['p_type'] = kind
         yield item
Esempio n. 12
0
 def parse_page(self, response):
     """Yield one ProxyItem per body row of the ``ctable`` proxy table.

     The first body row is skipped. Cells 1, 2 and 3 of each remaining row
     hold the IP, port and proxy type.
     """
     rows = response.xpath(
         '//table[@class="table table-hover panel-default panel ctable"]/tbody/tr'
     )
     for row in rows[1:]:
         item = ProxyItem()
         address = row.xpath('td[1]/text()').extract_first().encode()
         port = row.xpath('td[2]/text()').extract_first().encode()
         kind = row.xpath('td[3]/text()').extract_first()
         item['_id'] = hash(address + port + kind + self.name)
         item['source'] = self.name
         item['ip'] = address
         item['port'] = port
         item['p_type'] = kind
         yield item
Esempio n. 13
0
 def parse(self, response):
     """Yield a ProxyItem for each table row after the first.

     Rows are selected with the Scrapy selector ``//table/tr``. Cells 2, 3
     and 4 hold the IP, port and proxy type. Rows that cannot be parsed are
     skipped.
     """
     for tr in response.xpath('//table/tr')[1:]:
         try:
             _ip = tr.xpath('td[2]/text()').extract_first().encode()
             _port = tr.xpath('td[3]/text()').extract_first().encode()
             _type = tr.xpath('td[4]/text()').extract_first()
             _id = hash(_ip + _port + _type + self.name)
         except (AttributeError, TypeError, ValueError):
             # A cell is missing or cannot be encoded: skip this row.
             continue
         # The yield stays outside the try block so that closing the
         # generator (GeneratorExit) is never swallowed.
         item = ProxyItem()
         item['_id'] = _id
         item['source'] = self.name
         item['ip'] = _ip
         item['port'] = _port
         item['p_type'] = _type
         yield item