def get(url):
    """Download *url* with a randomized User-Agent and feed the HTML to parse().

    Raises Exception when the server answers with anything other than HTTP 200.
    """
    response = requests.get(url, headers={'User-Agent': header.get_ua()})
    if response.status_code != 200:
        raise Exception('请求失败!')
    parse(response.text)
def get(url):
    """Fetch *url* (TLS verification disabled) and pass the body to parse().

    Raises Exception on any non-200 status code.
    """
    # NOTE(review): verify=False skips certificate validation — confirm this is
    # intentional for this host and not a leftover workaround.
    response = requests.get(url, verify=False,
                            headers={'User-Agent': header.get_ua()})
    if response.status_code != 200:
        raise Exception('请求失败')
    parse(response.text)
import json import time import re from urllib.parse import quote import requests from selenium.webdriver import Chrome from selenium.webdriver.support import ui, expected_conditions from selenium.webdriver.common.by import By from utils.header import get_ua headers = {'User-Agent': get_ua()} def start(cityName): url = f'https://zhaopin.baidu.com/?city={quote(cityName)}' chrome.get(url) query = chrome.find_element_by_css_selector('input[name="query"]') query.send_keys('Python') chrome.execute_script('var q=document.documentElement.scrollLeft=1000') chrome.find_element_by_css_selector('.search-btn').click() time.sleep(2) # 需要验证登录 try: chrome.find_element_by_class_name('tang-pass-footerBarULogin').click() time.sleep(0.5) input_uesrname = chrome.find_element( By.XPATH, '//input[@id="TANGRAM__PSP_3__userName"]') input_uesrname.send_keys('18795681793')
""" 爬取美女网 -requests -bs4 -csv储存 - 扩展 携程 asyncio """ import json from bs4 import BeautifulSoup, Tag from utils.header import get_ua import requests import time headers = { 'User-Agent': get_ua() } def get(url): resp = requests.get(url, headers=headers) if resp.status_code == 200: resp.encoding = 'utf-8' parse(resp.text) def parse(html): soup = BeautifulSoup(html, 'lxml') content_boxs = soup.select('.content-box') item = {} for content_box in content_boxs:
def download_code():
    """Fetch the CAPTCHA image through the shared session and save it as code.png.

    Using the module-level `session` keeps the cookies consistent with the
    login flow, so the server associates this CAPTCHA with our session.
    """
    response = session.get('https://so.gushiwen.org/RandCode.ashx',
                           headers={'User-Agent': get_ua()})
    with open('code.png', 'wb') as image_file:
        image_file.write(response.content)
def get(url):
    """GET *url* with a random UA; on HTTP 200 hand the HTML to parse().

    Raises Exception for any other status code.
    """
    response: Response = requests.get(url, headers={'user-agent': header.get_ua()})
    if response.status_code != 200:
        raise Exception('失败')
    parse(response.text)
import os
import re

import requests

from utils.header import get_ua

base_url = 'http://sc.chinaz.com/tupian/'
url = 'http://sc.chinaz.com/tupian/'

# Reuse the cached page when present so repeated runs don't re-hit the site.
if os.path.exists('mn.html'):
    with open('mn.html', encoding='utf-8') as f:
        html = f.read()
else:
    # BUG FIX: the header key was 'User_Agent' (underscore) — not a valid HTTP
    # header name, so no User-Agent was actually sent with the request.
    resp = requests.get(url, headers={'User-Agent': get_ua()})
    resp.encoding = 'utf-8'
    assert resp.status_code == 200
    html = resp.text
    with open('mn.html', 'w', encoding=resp.encoding) as f:
        f.write(html)

# BUG FIX: the original pattern r'<img src2="(.*?) alt="(.*?)">' was missing
# the closing quote after the src2 group, so it could never match the markup
# (the sibling script uses the correct r'<img src2="(.*?)" alt="(.*?)">').
# Renamed from `compile`, which shadowed the builtin.
img_pattern = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
images = img_pattern.findall(html)
# print(images)

# Link to the next result page; "<b>2221</b>" is the pager's total-count
# marker that anchors the match — TODO confirm it is stable across pages.
next_url = re.findall(r'<b>2221</b></a><a href="(.*?)" class="nextpage', html, re.S)
print(base_url + next_url[0])
import os
import re

import requests
from requests import Response

from utils.header import get_ua

base_url = 'http://sc.chinaz.com/tupian/'
url = 'http://sc.chinaz.com/tupian/shuaigetupian.html'

# Reuse the cached page when present so repeated runs don't re-hit the site.
if os.path.exists('mn.html'):
    with open('mn.html', 'r', encoding='utf-8') as f:
        html = f.read()
else:
    resp: Response = requests.get(url, headers={'User-Agent': get_ua()})
    print(resp.encoding)  # ISO-8859-1: requests' default when no charset header is sent
    resp.encoding = 'utf-8'  # re-decode the body as utf-8 before reading .text
    assert resp.status_code == 200
    html = resp.text
    # BUG FIX: write with an explicit utf-8 encoding. The original relied on
    # the platform default, which raises UnicodeEncodeError for non-latin text
    # on systems where the locale encoding is e.g. GBK or cp1252.
    with open('mn.html', 'w', encoding='utf-8') as f:
        f.write(html)

# print(html)
# Two markups appear in the wild: lazy-loaded images use src2="..." first,
# eagerly loaded ones put alt before src. Try the first, fall back to the
# second. Renamed from `compile`/`compile2`, which shadowed the builtin.
src2_pattern = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
src_pattern = re.compile(r'<img alt="(.*?)" src="(.*?)" >')
imgs = src2_pattern.findall(html)
if not imgs:
    imgs = src_pattern.findall(html)
print(len(imgs), imgs, sep="\n")
def process_request(self, request, spider):
    """Downloader-middleware hook: attach a random User-Agent to every request.

    Returns None so the request continues through the middleware chain
    unchanged apart from the spoofed UA header.
    """
    # BUG FIX: the original built a dict {'User-Agent': get_ua()} and assigned
    # the WHOLE dict as the header value, so the wire header became a dict
    # repr instead of the UA string. Assign just the UA value.
    request.headers['User-Agent'] = get_ua()
    return None