def parse(resp_body):
    """Extract proxies from a page that hides the IP and port inside scripts.

    Every row of ``table.table-hover`` is expected to carry an inline
    ``<script>`` in its first cell. From that script text this function reads:

    * the first part of the IP, a dotted string that is stored reversed;
    * the second part of the IP, a run of backslash-x hex escapes whose
      decoded text is base64;
    * a raw port number, to which a page-wide number (taken from a hidden
      ``<div>``'s ``data-*`` attribute) is added to get the real port.

    Rows whose script lacks any of these three pieces are skipped instead of
    raising or yielding a truncated IP.

    Args:
        resp_body: HTML text of the response.

    Returns:
        A list of ``ProxyItem`` objects built with ``ip`` (str), ``port``
        (int) and ``position`` (text of the ``<div>`` in the fourth cell, or
        ``None`` when it is absent).
    """
    selector = parsel.Selector(text=resp_body)
    all_tr_elems = selector.css('table.table-hover tbody tr')
    # This number is used to compute the port.
    data_num = selector.css('html').re_first(
        r'<div style="display:none" data-[a-zA-Z]*="(\d+)"'
    )

    result = []
    for tr in all_tr_elems:
        # Parse the script tag under the first td.
        script_elem = tr.css('td:nth-child(1) script::text')

        reversed_ip_head = script_elem.re_first(r'\'([\d\.]*)\'\.split')
        hex_list = script_elem.re(r'\\x([A-Za-z0-9]{2})')
        raw_port = script_elem.re_first(r'var pp = \((\d+) -')
        if reversed_ip_head is None or not hex_list or raw_port is None:
            # No usable script in this row: nothing to decode.
            continue

        # First part of the IP is stored in reversed form.
        ip_first_part = reversed_ip_head[::-1]
        # Second part: hex escapes -> base64 text -> plain text.
        b64_string = bytearray.fromhex(''.join(hex_list)).decode()
        ip_second_part = base64.b64decode(b64_string).decode()

        item = ProxyItem(
            ip=ip_first_part + ip_second_part,
            # The port shown in the page plus data_num is the actual port.
            # int(data_num) is deliberately left to raise when the page-wide
            # number is missing, because no port can be computed without it.
            port=int(raw_port) + int(data_num),
            position=tr.css('td:nth-child(4) div::text').extract_first(),
        )
        result.append(item)
    return result
def parse(resp_body):
    """Build one ``ProxyItem`` per row of ``table#ip_list``.

    The first ``<tr>`` of the table is dropped (presumably a header row --
    confirm against the page). For every remaining row the text of the second
    cell becomes ``ip``, the text of the third cell becomes ``port`` and the
    text of the ``<a>`` inside the fourth cell becomes ``position``. Values
    are passed on exactly as extracted, so any of them may be ``None``.

    Args:
        resp_body: HTML text of the response.

    Returns:
        A list of ``ProxyItem`` objects in document order.
    """
    document = parsel.Selector(text=resp_body)
    data_rows = document.css('table#ip_list tr')[1:]
    return [
        ProxyItem(
            ip=row.css('td:nth-child(2)::text').extract_first(),
            port=row.css('td:nth-child(3)::text').extract_first(),
            position=row.css('td:nth-child(4) a::text').extract_first(),
        )
        for row in data_rows
    ]
def parse(resp_body):
    """Build one ``ProxyItem`` per body row of ``table.table-bordered``.

    Cell 1 supplies ``ip``, cell 2 supplies ``port`` and cell 5 supplies
    ``position``. Each value is the first text node of the cell, or ``None``
    when the cell has none.

    Args:
        resp_body: HTML text of the response.

    Returns:
        A list of ``ProxyItem`` objects in document order.
    """

    def cell_text(row, index):
        # First text node of the index-th <td> of the row (1-based).
        return row.css('td:nth-child(%d)::text' % index).extract_first()

    table_rows = parsel.Selector(text=resp_body).css(
        "table.table-bordered tbody tr"
    )
    proxies = []
    for row in table_rows:
        proxies.append(
            ProxyItem(
                ip=cell_text(row, 1),
                port=cell_text(row, 2),
                position=cell_text(row, 5),
            )
        )
    return proxies
def parse(resp_body):
    """Build one ``ProxyItem`` per element carrying the ``l2`` class.

    Inside each such element the ``<li>`` text of the first ``<span>`` is
    used as ``ip``, that of the second ``<span>`` as ``port`` and that of the
    fifth ``<span>`` as ``position``. A missing value is passed on as
    ``None``.

    Args:
        resp_body: HTML text of the response.

    Returns:
        A list of ``ProxyItem`` objects in document order.
    """
    entries = parsel.Selector(text=resp_body).css('.l2')
    return [
        ProxyItem(
            ip=entry.css('span:nth-child(1) li::text').extract_first(),
            port=entry.css('span:nth-child(2) li::text').extract_first(),
            position=entry.css('span:nth-child(5) li::text').extract_first(),
        )
        for entry in entries
    ]
def parse(resp_body):
    """Build ``ProxyItem`` objects from ``#datatable-row-highlight`` rows.

    For each body row, ``ip`` is the text of the ``<a>`` in the first cell,
    ``port`` is the text of the second cell and ``position`` is the text of
    the fifth cell. Rows whose second cell yields no text are left out.

    Args:
        resp_body: HTML text of the response.

    Returns:
        A list of ``ProxyItem`` objects in document order.
    """
    table_rows = parsel.Selector(text=resp_body).css(
        '#datatable-row-highlight tbody tr'
    )
    proxies = []
    for row in table_rows:
        port_text = row.css('td:nth-child(2)::text').extract_first()
        if port_text is not None:
            proxies.append(
                ProxyItem(
                    ip=row.css('td:nth-child(1) a::text').extract_first(),
                    port=port_text,
                    position=row.css('td:nth-child(5)::text').extract_first(),
                )
            )
    return proxies
def parse(resp_body):
    """Build ``ProxyItem`` objects from a table whose IPs are rot13+base64.

    The first ``<tr>`` matched by ``table tr`` is dropped (presumably a
    header row -- confirm against the page). In every other row, the script
    in the first cell is searched for the string argument of a ``rot13(...)``
    call; rows without one are left out. That string is rot13-decoded and
    then base64-decoded to obtain ``ip``. ``port`` is the text of the second
    cell and ``position`` the text of the fourth cell.

    Args:
        resp_body: HTML text of the response.

    Returns:
        A list of ``ProxyItem`` objects in document order.
    """
    data_rows = parsel.Selector(text=resp_body).css('table tr')[1:]
    proxies = []
    for row in data_rows:
        encoded_ip = row.css('td:nth-child(1) script::text').re_first(
            r'rot13\(\"(.*?)\"'
        )
        if encoded_ip is not None:
            # Undo the obfuscation: rot13 first, then base64.
            b64_text = codecs.decode(encoded_ip, 'rot13')
            proxies.append(
                ProxyItem(
                    ip=base64.b64decode(b64_text).decode(),
                    port=row.css("td:nth-child(2)::text").extract_first(),
                    position=row.css("td:nth-child(4)::text").extract_first(),
                )
            )
    return proxies
def parse(self):
    """Fetch the proxy list page and upsert its table rows into the database.

    Requests ``self.url`` with ``self.headers``, stores the (naive UTC) time
    of the fetch in ``self.request_time`` and walks the ``<tr>`` rows of the
    first ``<tbody>`` in the response. For each row, a ``ProxyItem`` with the
    same ip and port is looked up: if found it is updated, otherwise a new
    one is added. Everything is committed once after the last row.

    Cell indexes used per row: 0 = ip, 1 = port, 2 = country, 4 = anonymity,
    6 = https support (``'yes'`` means supported), 7 = last-checked text that
    is passed to ``self.last_checked``. Rows with fewer than 8 cells are
    skipped.

    The database session is always closed, including when an error is
    raised while processing rows.
    """
    # Get response and timestamp.
    # NOTE(review): no timeout is passed, so this call can block for as long
    # as the server keeps the connection open -- consider adding one.
    response = requests.get(self.url, headers=self.headers)
    self.request_time = datetime.utcnow()

    # Get proxy table.
    soup = bs.BeautifulSoup(response.text, 'lxml')
    proxy_table = soup.find('tbody')

    # Extract data to db.
    db = sessionmaker(bind=self.engine)()
    try:
        for row in proxy_table.find_all('tr'):
            # Extract row data.
            data = [td.text for td in row.find_all('td')]
            print(data)
            if len(data) < 8:
                # Not a full data row; indexing below would fail.
                continue

            ip = data[0]
            port = int(data[1])

            # Check if the proxy exists.
            item = db.query(ProxyItem).filter_by(ip=ip).filter_by(port=port).first()
            fields = {
                'country': data[2],
                'anonimity': data[4],
                'https_support': data[6] == 'yes',
                'last_checked': self.last_checked(data[7]),
            }
            if item:
                for attr_name, value in fields.items():
                    setattr(item, attr_name, value)
            else:
                db.add(ProxyItem(ip=ip, port=port, **fields))
        db.commit()
    finally:
        # Release the session's connection; uncommitted work is discarded.
        db.close()