import requests

from random_headers import get_headers


def downloadHTML(url):
    """Fetch a page and decode the body as UTF-8."""
    headers = get_headers()
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'  # force UTF-8 decoding
        return r.text
    except requests.RequestException:
        print("Crawl failed!")


def getHTMLText(url):
    """Fetch a page, letting requests guess the charset from the body."""
    headers = get_headers()
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('Crawl failed!')
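

# A minimal usage sketch (the URL is illustrative, not from the original):
# getHTMLText is the safer default, since apparent_encoding respects the
# charset the page itself declares, while downloadHTML always assumes UTF-8.
if __name__ == '__main__':
    text = getHTMLText('https://example.com')
    if text:
        print(text[:200])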
import logging

import requests

import random_headers


# Note: only __init__ survives in this excerpt. The docstring names the class
# My2Session, and the super().__init__() call plus self.headers usage imply a
# requests.Session subclass, which is assumed here.
class My2Session(requests.Session):

    def __init__(self, username, password):
        """
        Returns an instance of My2Session after logging into the My2 website
        using the username and password provided. The default number of
        retries is 3; use set_max_retries(retries) to modify it.
        :param username: My2 account username
        :param password: My2 account password
        """
        # Log INFO and above to a file
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        handler = logging.FileHandler('logs.log')
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        self.username = username
        self.password = password
        self.login_url = 'https://my2.ism.lt/Account/Login'
        self.max_tries = 3
        self.retry_interval = 600  # seconds between retries

        super().__init__()
        self.headers.update(random_headers.get_headers())
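

# A minimal usage sketch (credentials are placeholders; the actual login
# request is not shown in this excerpt). Because My2Session subclasses
# requests.Session, the instance can be used for further requests directly.
if __name__ == '__main__':
    session = My2Session('your_username', 'your_password')
    session.logger.info('Session created, max_tries=%d', session.max_tries)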
from bs4 import BeautifulSoup
from common_requests import getHTML
from random_headers import get_headers


def parserHTML(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup


if __name__ == '__main__':
    headers = get_headers()  # randomized headers as an anti-anti-crawler measure
    url = input('Enter the URL of the HTML page to crawl: ')
    html = getHTML(url, headers).text[:1000]  # preview only the first 1000 characters
    sp = parserHTML(html)
    print(sp.prettify())
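    # A hypothetical follow-on (not in the original): the parsed soup supports
    # tag queries, e.g. listing every link target found in the fragment.
    for a in sp.find_all('a'):
        print(a.get('href'))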
import os

import requests

from random_headers import get_headers


def saveFiles(url, headers=None):
    root = "D:/work_study/code/Crawler_exercise/files/"
    path = root + url.split('/')[-1]  # derive the file name from the URL
    try:
        if not os.path.exists(root):  # create the save directory if needed
            os.mkdir(root)
        if not os.path.exists(path):  # only download if the file is not already there
            r = requests.get(url, headers=headers)
            r.raise_for_status()  # avoid saving an error page as the file
            with open(path, 'wb') as f:
                f.write(r.content)
            print("File saved successfully!")
        else:
            print("File already exists!")
    except (OSError, requests.RequestException):
        print("An exception occurred; save failed!")


if __name__ == "__main__":
    headers = get_headers()
    url = input("Enter the URL of the file to save: ")
    saveFiles(url, headers)


# http://books.linjianming.com/test/20191230042141203.jpg
# http://books.linjianming.com/test/人生如戏  停更5个月后的再次更新.pdf
# http://books.linjianming.com/test/demo.html
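

# A hedged alternative sketch (not part of the original): saveFiles reads the
# whole response into memory via r.content, which is fine for small files but
# wasteful for large ones. Streaming with iter_content keeps memory flat.
def saveFileStreamed(url, path, headers=None):
    r = requests.get(url, headers=headers, stream=True)
    r.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)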