Esempio n. 1
0
    def parse(resp_body):
        """Decode the obfuscated proxy table contained in ``resp_body``.

        The script in each row's first cell carries the address in two pieces:
        a quoted literal that is stored reversed, and a base64 string written
        as two-character hex escapes.  The port is the number found in the
        script's ``var pp = (N -`` expression plus a page-wide number read
        from a hidden ``<div>``'s ``data-*`` attribute.

        Rows whose script lacks the reversed IP literal or the port number are
        skipped, and an empty list is returned when the page-wide number is
        absent, rather than failing with ``TypeError`` on a ``None`` match.

        :param resp_body: HTML text of the proxy listing page.
        :return: list of ``ProxyItem`` built with ``ip`` (str), ``port`` (int)
            and ``position`` (text of the fourth cell's ``div``).
        """
        selector = parsel.Selector(text=resp_body)
        all_tr_elems = selector.css('table.table-hover tbody tr')
        # This page-wide number is added to every row's raw port.
        data_num = selector.css('html').re_first(
            r'<div style="display:none" data-[a-zA-Z]*="(\d+)"'
        )
        result = []
        if data_num is None:
            # Without the offset no real port can be computed for any row.
            return result
        # Loop-invariant, so convert it once instead of once per row.
        port_offset = int(data_num)

        for tr in all_tr_elems:
            position = tr.css('td:nth-child(4) div::text').extract_first()
            # The script tag under the first td holds the obfuscated values.
            script_elem = tr.css('td:nth-child(1) script::text')

            # First IP piece (stored reversed) and the raw port number.
            ip_first_part = script_elem.re_first(r'\'([\d\.]*)\'\.split')
            raw_port = script_elem.re_first(r'var pp = \((\d+) -')
            if ip_first_part is None or raw_port is None:
                # re_first() gives None when the pattern is not found; skip
                # the row so one odd row does not abort the whole page.
                continue
            ip_first_part = ip_first_part[::-1]

            # Second IP piece: hex escapes spell out a base64 string.
            hex_list = script_elem.re(r'\\x([A-Za-z0-9]{2})')
            b64_string = bytearray.fromhex(''.join(hex_list)).decode()
            ip_second_part = base64.b64decode(b64_string).decode()

            ip = ip_first_part + ip_second_part

            # The real port is the in-page number plus the page-wide offset.
            port = int(raw_port) + port_offset

            item = ProxyItem(
                ip=ip, port=port, position=position
            )
            result.append(item)
        return result
Esempio n. 2
0
 def parse(resp_body):
     """Build a ``ProxyItem`` for every row of ``table#ip_list`` but the first.

     The IP is the text of the second cell, the port the text of the third
     cell and the position the text of the link inside the fourth cell, each
     exactly as ``extract_first()`` returns it.

     :param resp_body: HTML text of the proxy listing page.
     :return: list of ``ProxyItem`` in document order.
     """
     # The slice drops the first <tr> (presumably the header row).
     rows = parsel.Selector(text=resp_body).css('table#ip_list tr')[1:]
     return [
         ProxyItem(
             ip=row.css('td:nth-child(2)::text').extract_first(),
             port=row.css('td:nth-child(3)::text').extract_first(),
             position=row.css('td:nth-child(4) a::text').extract_first(),
         )
         for row in rows
     ]
Esempio n. 3
0
 def parse(resp_body):
     """Build a ``ProxyItem`` for every body row of ``table.table-bordered``.

     The IP, port and position are the texts of the first, second and fifth
     cells respectively, exactly as ``extract_first()`` returns them.

     :param resp_body: HTML text of the proxy listing page.
     :return: list of ``ProxyItem`` in document order.
     """
     page = parsel.Selector(text=resp_body)
     proxies = []
     for row in page.css("table.table-bordered tbody tr"):
         proxies.append(
             ProxyItem(
                 ip=row.css('td:nth-child(1)::text').extract_first(),
                 port=row.css('td:nth-child(2)::text').extract_first(),
                 position=row.css('td:nth-child(5)::text').extract_first(),
             )
         )
     return proxies
Esempio n. 4
0
 def parse(resp_body):
     """Build a ``ProxyItem`` for every element matching ``.l2``.

     Within each matched element the IP, port and position are the ``li``
     texts found under the first, second and fifth ``span`` children, exactly
     as ``extract_first()`` returns them.

     :param resp_body: HTML text of the proxy listing page.
     :return: list of ``ProxyItem`` in document order.
     """
     entries = parsel.Selector(text=resp_body).css('.l2')
     return [
         ProxyItem(
             ip=entry.css('span:nth-child(1) li::text').extract_first(),
             port=entry.css('span:nth-child(2) li::text').extract_first(),
             position=entry.css('span:nth-child(5) li::text').extract_first(),
         )
         for entry in entries
     ]
Esempio n. 5
0
    def parse(resp_body):
        """Build ``ProxyItem`` objects from ``#datatable-row-highlight``.

        For each body row the IP is the link text in the first cell, the port
        the text of the second cell and the position the text of the fifth
        cell.  Rows for which no port text is found are left out.

        :param resp_body: HTML text of the proxy listing page.
        :return: list of ``ProxyItem`` in document order.
        """
        rows = parsel.Selector(text=resp_body).css(
            '#datatable-row-highlight tbody tr'
        )
        proxies = []
        for row in rows:
            ip = row.css('td:nth-child(1) a::text').extract_first()
            port = row.css('td:nth-child(2)::text').extract_first()
            # Only rows that actually carry a port become items.
            if port is not None:
                position = row.css('td:nth-child(5)::text').extract_first()
                proxies.append(ProxyItem(ip=ip, port=port, position=position))
        return proxies
Esempio n. 6
0
    def parse(resp_body):
        """Build ``ProxyItem`` objects from a table whose IPs are encoded.

        Every ``table tr`` after the first is inspected.  The first cell's
        script passes a quoted string to ``rot13(``; that string is decoded
        with rot13 and then base64 to obtain the IP.  Rows without such a
        string are left out.  The port and position are the texts of the
        second and fourth cells.

        :param resp_body: HTML text of the proxy listing page.
        :return: list of ``ProxyItem`` in document order.
        """
        # The slice drops the first <tr> (presumably the header row).
        rows = parsel.Selector(text=resp_body).css('table tr')[1:]
        proxies = []
        for row in rows:
            encoded = row.css('td:nth-child(1) script::text').re_first(
                r'rot13\(\"(.*?)\"'
            )
            if encoded is not None:
                # Undo the two layers: rot13 first, then base64.
                b64_text = codecs.decode(encoded, 'rot13')
                proxies.append(
                    ProxyItem(
                        ip=base64.b64decode(b64_text).decode(),
                        port=row.css("td:nth-child(2)::text").extract_first(),
                        position=row.css("td:nth-child(4)::text").extract_first(),
                    )
                )
        return proxies
Esempio n. 7
0
	def parse(self):
		"""Fetch the proxy list page and upsert each table row into the database.

		Requests ``self.url`` with ``self.headers``, stores the current UTC
		time in ``self.request_time``, then walks every ``<tr>`` of the first
		``<tbody>``.  A ``ProxyItem`` matching the row's ip and port is updated
		in place if one exists; otherwise a new one is added.  Everything is
		committed in a single transaction at the end.

		Cell layout read from each row: 0 = ip, 1 = port, 2 = country,
		4 = anonymity, 6 = https flag (``'yes'``), 7 = last-checked text
		(converted by ``self.last_checked``).

		Rows with fewer than eight cells are skipped.  If anything fails the
		transaction is rolled back and the error re-raised; the session is
		always closed so its connection is released.
		"""
		# get response and timestamp
		response = requests.get(self.url, headers=self.headers)
		self.request_time = datetime.utcnow()

		# get proxy table
		soup = bs.BeautifulSoup(response.text, 'lxml')
		proxy_table = soup.find('tbody')

		# extract data to db
		db = sessionmaker(bind=self.engine)()
		try:
			for row in proxy_table.find_all('tr'):
				# extract row data
				data = [td.text for td in row.find_all('td')]
				print(data)
				if len(data) < 8:
					# Index 7 is read below; a short row would raise
					# IndexError and discard the whole uncommitted batch.
					continue

				ip = data[0]
				port = int(data[1])
				country = data[2]
				anonimity = data[4]
				https_support = (data[6] == 'yes')
				last_checked = self.last_checked(data[7])

				# check if the proxy exists
				item = db.query(ProxyItem).filter_by(ip=ip).filter_by(port=port).first()
				if item:
					item.country = country
					item.anonimity = anonimity
					item.https_support = https_support
					item.last_checked = last_checked
				else:
					item = ProxyItem(
						ip=ip,
						port=port,
						country=country,
						anonimity=anonimity,
						https_support=https_support,
						last_checked=last_checked,
					)
					db.add(item)
			db.commit()
		except Exception:
			# Leave no half-applied transaction behind, then propagate.
			db.rollback()
			raise
		finally:
			# The original never closed the session, leaking its connection.
			db.close()