import requests
from random_headers import get_headers


def downloadHTML(url):
    headers = get_headers()
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'  # fixed typo: was r.encodeing
        return r.text
    except requests.RequestException:  # narrowed from a bare except
        print("Crawl failed!")
import requests
from random_headers import get_headers


def getHTMLText(url):
    headers = get_headers()
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # Guess the encoding from the response body instead of forcing one
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:  # narrowed from a bare except
        print('Crawl failed!')
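# The two helpers above differ only in encoding handling: downloadHTML forces
# UTF-8, while getHTMLText defers to r.apparent_encoding, which detects the
# charset from the body and copes better with pages served as GBK/GB2312.
# A minimal usage sketch, assuming both functions live in common_requests.py;
# the URL below is purely illustrative.
if __name__ == '__main__':
    html = getHTMLText('https://example.com')
    if html:  # None is returned implicitly when the request fails
        print(html[:200])  # preview the first 200 characters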
import logging

import requests

import random_headers


# Assumption: My2Session subclasses requests.Session, implied by the
# super().__init__() call and the use of self.headers below.
class My2Session(requests.Session):
    def __init__(self, username, password):
        """
        Returns an instance of My2Session after logging into the My2 website
        using the username and password provided.

        The default number of retries is 3. Use set_max_retries(retries) to
        modify.

        :param username:
        :param password:
        """
        # Logging
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        handler = logging.FileHandler('logs.log')
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        self.username = username
        self.password = password
        self.login_url = 'https://my2.ism.lt/Account/Login'
        self.max_tries = 3
        self.retry_interval = 600
        super().__init__()
        self.headers.update(random_headers.get_headers())
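    # The docstring references set_max_retries(retries), which is not shown in
    # this excerpt. A minimal sketch of what it might look like; the body and
    # the validation are assumptions, not confirmed by the source.
    def set_max_retries(self, retries):
        """Override the default number of login retries (hypothetical body)."""
        if retries < 1:
            raise ValueError('retries must be at least 1')
        self.max_tries = retries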
from bs4 import BeautifulSoup
from common_requests import getHTML
from random_headers import get_headers


def parserHTML(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup


if __name__ == '__main__':
    headers = get_headers()  # random headers, a counter-anti-crawler measure
    url = input('Enter the URL of the HTML page to crawl: ')
    html = getHTML(url, headers).text[:1000]
    sp = parserHTML(html)
    print(sp.prettify())
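# Beyond prettify(), the parsed soup is normally queried for specific tags.
# A short self-contained sketch using the standard BeautifulSoup API; the
# markup below is illustrative, not taken from a real page.
from bs4 import BeautifulSoup

demo = '<html><body><a href="/a">first</a><a href="/b">second</a></body></html>'
soup = BeautifulSoup(demo, 'html.parser')
for link in soup.find_all('a'):                # collect every <a> tag
    print(link.get('href'), link.get_text())   # read attribute and text safely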
import os

import requests
from random_headers import get_headers


def saveFiles(url, headers=None):
    root = "D://work_study//code//Crawler_exercise//files//"
    path = root + url.split('/')[-1]  # derive the file name from the URL
    try:
        if not os.path.exists(root):  # check whether the save directory exists
            os.mkdir(root)  # create it if it does not
        if not os.path.exists(path):  # download only if the file is new
            r = requests.get(url, headers=headers)
            with open(path, 'wb') as f:  # the with block closes f automatically
                f.write(r.content)
            print("File saved successfully!")
        else:
            print("File already exists!")
    except (OSError, requests.RequestException):  # narrowed from a bare except
        print("An exception occurred; save failed!")


if __name__ == "__main__":
    headers = get_headers()
    url = input("Enter the URL of the file to save: ")
    saveFiles(url, headers)
    # http://books.linjianming.com/test/20191230042141203.jpg
    # http://books.linjianming.com/test/人生如戏 停更5个月后的再次更新.pdf
    # http://books.linjianming.com/test/demo.html
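# saveFiles buffers the whole response in memory via r.content. For large
# files, requests' documented stream=True / iter_content API avoids that;
# a sketch under that assumption (the 8192-byte chunk size is an arbitrary
# choice, not from the source).
import requests


def save_file_streamed(url, path, headers=None, chunk_size=8192):
    """Download url to path in chunks instead of holding it all in memory."""
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)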