def get(url, headers):
    # opener and the html helpers (to_html / get_charset) are module-level objects
    # defined elsewhere in the project
    resp = opener.open(Request(url, headers=headers))
    html_text = html.to_html(resp.read(), html.get_charset(resp.getheader('Content-Type')))
    # Pull the account name out of <p class="pt5"><span>...</span>
    root = etree.HTML(html_text)
    account = root.xpath('//p[@class="pt5"]/span/text()')[0]
    print(account)
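A hypothetical call sketch for the function above: the URL is a placeholder, and the headers are assumed to carry whatever session cookies the target page needs before the account element is actually present; get_headers() is the header helper used elsewhere in this section.

# Hypothetical usage: placeholder URL; the headers would normally include the
# login cookie so the page really contains the account <span>.
if __name__ == '__main__':
    get('https://example.com/member/profile', get_headers())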
def request(url, data: dict = None, headers=None):
    if data:
        # URL-encode the POST payload (key=value&key2=value2) and send it as bytes
        form_params = urlencode(data)
        req = Request(url, form_params.encode(), headers or {})
    else:
        if headers:
            req = Request(url, headers=headers)
        else:
            req = Request(url)
    resp = opener.open(req)
    html_txt = html.to_html(resp.read(), html.get_charset(resp.getheader('Content-Type')))
    print(html_txt)
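A hypothetical call sketch covering both branches of the function above; the URL and form fields are placeholders, and get_headers() is the same assumed header helper used in the other snippets.

# Hypothetical usage: placeholder URL and form fields.
# GET branch - no form data, optional headers
request('https://example.com/login', None, headers=get_headers())
# POST branch - the dict is url-encoded and sent as the request body
request('https://example.com/login', {'username': 'demo', 'password': 'secret'})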
def get_kuai(self):
    if not self.q_kuai.empty():
        url = self.q_kuai.get()
        if DEBUG:
            print('Crawling:', url)
        try:
            response = requests.get(url=url, headers=get_header())
            time.sleep(self.delay)
            if response.ok:
                resp_bytes = response.content
                html_text = to_html(resp_bytes)
                self.parse_kuai(html_text)
        except Exception:
            # The request failed, so put the URL back into the queue for a retry
            self.q_kuai.put(url)
        # Recurse to process the next URL (very deep queues may hit the recursion limit)
        self.get_kuai()
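get_kuai is a method, so it needs a class around it. Below is a minimal scaffolding sketch, assuming q_kuai is a queue.Queue of page URLs, delay is the per-request pause in seconds, and parse_kuai extracts whatever the spider is after; the class name and the stub body are illustrative, not the author's actual implementation.

# Assumed scaffolding, not the author's class: shows where q_kuai, delay and
# parse_kuai come from when get_kuai runs as a method.
import queue


class KuaiSpider:
    def __init__(self, seed_urls, delay=1):
        self.q_kuai = queue.Queue()      # URLs waiting to be crawled
        for u in seed_urls:
            self.q_kuai.put(u)
        self.delay = delay               # pause between requests, in seconds

    def parse_kuai(self, html_text):
        # Real extraction of the page content would go here.
        print('fetched', len(html_text), 'characters')

    # get_kuai (above) would sit here as another method of this class.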
from urllib.request import HTTPHandler, Request, build_opener


def request(url):
    # 1. Create an opener object - it works like a browser
    opener = build_opener(HTTPHandler())
    # 2. Build the Request object
    req = Request(url, headers=get_headers())
    # 3. Send the request
    resp = opener.open(req)
    print(type(resp))  # http.client.HTTPResponse
    if resp.code == 200:
        # print(resp.getheader('Content-Type'))
        charset = html.get_charset(resp.getheader('Content-Type'))
        html_text = html.to_html(resp.read(), charset)
        print(html_text)
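The html helper module (get_charset / to_html) used throughout these snippets is not shown here. Below is a minimal sketch of what it might do, assuming get_charset pulls the charset parameter out of a Content-Type header and to_html decodes the raw response bytes with it, falling back to UTF-8.

# Minimal sketch of the project's html helpers (assumed, not the actual module).
def get_charset(content_type, default='utf-8'):
    # e.g. 'text/html; charset=GBK' -> 'GBK'
    if content_type:
        for part in content_type.split(';'):
            part = part.strip()
            if part.lower().startswith('charset='):
                return part.split('=', 1)[1]
    return default


def to_html(raw_bytes, charset='utf-8'):
    # Use replacement characters rather than raising on badly encoded bytes
    return raw_bytes.decode(charset or 'utf-8', errors='replace')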