# -*- coding: utf-8 -*-
import os
import configparser
import random

from utils.logger import LoggerUtil

logger = LoggerUtil().get_log

# Project root, config directory and config file path (normalised to forward slashes)
project_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)).replace('\\', '/')
config_dir = os.path.join(project_path, "config").replace('\\', '/')
config_path = os.path.join(config_dir, "config.ini").replace('\\', '/')

logger.info("project path : {project_path}".format(project_path=project_path))
logger.info("config dir : {config_dir}".format(config_dir=config_dir))
logger.info("config path : {config_path}".format(config_path=config_path))


def get_user_agent():
    """Read all (key, value) entries from the [User-Agent] section of config.ini."""
    config = configparser.ConfigParser()
    config.read(config_path, encoding="utf-8")
    user_agents = config.items("User-Agent")
    return user_agents


def random_user_agent():
    """Pick one User-Agent string at random from the configured list."""
    values = get_user_agent()
    return random.choice(values)[1]
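A minimal usage sketch for the helpers above, assuming config/config.ini has a [User-Agent] section whose values are User-Agent strings; the sample keys shown in the comment are illustrative only, not taken from the project.

# Hypothetical config/config.ini layout assumed by get_user_agent():
#
#   [User-Agent]
#   chrome  = Mozilla/5.0 (Windows NT 10.0; Win64; x64) ... Chrome/79.0.3945.130 Safari/537.36
#   firefox = Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0
#
# random_user_agent() picks one (key, value) pair at random and returns the value.
if __name__ == "__main__":
    print(random_user_agent())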
# -*- coding: utf-8 -*-
import sys
import urllib.request

from lxml import etree

from utils.logger import LoggerUtil


class CrawlXz():

    def __init__(self, url):
        self.logger = LoggerUtil().get_log
        # Request URL
        self.url = url
        # Proxy handler configuration (None = use the system proxy settings)
        # self.proxy_list = {
        #     "http" : "127.0.0.1:8080",
        #     "https": "127.0.0.1:8080"
        # }
        self.proxy_list = None

    def do_request(self):
        try:
            # Build a proxy handler
            httpproxy_handler = urllib.request.ProxyHandler(self.proxy_list)
            # Build a dedicated opener object
            opener = urllib.request.build_opener(httpproxy_handler,
                                                 urllib.request.HTTPSHandler)
            # Install it globally so urlopen also goes through this opener
            urllib.request.install_opener(opener)
            # HTTP request headers
            headers = {
                'Connection': 'close',
                'Upgrade-Insecure-Requests': '1',
                'Sec-Fetch-User': '******',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-Mode': 'navigate',
                # Ask for an uncompressed body so .decode("utf8") below works
                'Accept-Encoding': 'identity',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
            }
            req = urllib.request.Request(self.url, headers=headers)
            # timeout specifies the request timeout in seconds
            return urllib.request.urlopen(req, timeout=60).read().decode("utf8")
        except Exception as e:
            self.logger.error(sys._getframe().f_code.co_name + " error : " + str(e))

    def parse(self):
        res = self.do_request()
        # Build the HTML tree
        html = etree.HTML(res)
        # Parse the tree with XPath
        items = html.xpath('//*[@class="topic-title"]')
        urls = {}
        # Collect topics keyed by absolute URL, valued by the cleaned title text
        for item in items:
            urls['https://xz.aliyun.com' + item.attrib['href']] = \
                item.text.replace('\u200b', '').strip()
        return urls
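A minimal usage sketch for CrawlXz, assuming the XianZhi forum front page as the target URL; that URL is inferred from the link prefix used in parse(), not stated elsewhere in this file.

if __name__ == "__main__":
    # Crawl the topic list and print title -> URL pairs.
    crawler = CrawlXz("https://xz.aliyun.com/")
    for url, title in crawler.parse().items():
        print(title, url)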
# -*- coding: UTF-8 -*-
"""
@Author :haby0
@Desc   :
"""
import pymysql

from utils.config import ConfigParser
from utils.logger import LoggerUtil
from utils.mail import MailUtil

logger = LoggerUtil().get_log


class MySqlHandle(object):

    def __init__(self):
        # Read the MySQL connection settings from config.ini
        self.host = ConfigParser.get_config('MySQL', 'host')
        self.port = ConfigParser.get_config('MySQL', 'port')
        self.username = ConfigParser.get_config('MySQL', 'username')
        self.password = ConfigParser.get_config('MySQL', 'password')
        self.dbname = ConfigParser.get_config('MySQL', 'dbname')
        try:
            # Connect to the database and keep the connection on the instance
            self.connect = pymysql.Connect(host=self.host,
                                           port=int(self.port),
                                           user=self.username,
                                           passwd=self.password,
                                           db=self.dbname,
                                           charset='utf8')
        except Exception as e:
            logger.error('database conn error : {e}'.format(e=e))
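A minimal sketch of how the pymysql connection kept in self.connect could be used; the crawl_xz table and its columns are hypothetical names for illustration, not part of this project.

if __name__ == "__main__":
    handle = MySqlHandle()
    # Run a simple query through the connection created in __init__.
    with handle.connect.cursor() as cursor:
        cursor.execute("SELECT url, title FROM crawl_xz")  # hypothetical table
        for url, title in cursor.fetchall():
            print(title, url)
    handle.connect.close()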