Beispiel #1
0
 def __init__(self, url):
     """Remember the target URL and set up the logger and proxy settings.

     Args:
         url: The URL this instance will request.
     """
     # URL to request.
     self.url = url
     # Proxy configuration; no explicit proxies are set by default.
     # Example of an explicit mapping:
     #     {"http": "127.0.0.1:8080", "https": "127.0.0.1:8080"}
     self.proxy_list = None
     self.logger = LoggerUtil().get_log
Beispiel #2
0
# -*- coding: utf-8 -*-

import os
import configparser
import random
from utils.logger import LoggerUtil

# Logger shared by everything in this module.
logger = LoggerUtil().get_log

# The project root is the parent of the directory holding this file.
# Every path is normalised to forward slashes so it looks the same on
# Windows and POSIX systems.
project_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir))
project_path = project_path.replace('\\', '/')
logger.info("project path : {project_path}".format(project_path=project_path))

# Directory that holds the configuration files.
config_dir = os.path.join(project_path, "config")
config_dir = config_dir.replace('\\', '/')
logger.info("config dir : {config_dir}".format(config_dir=config_dir))

# The main configuration file read by the helpers in this module.
config_path = os.path.join(config_dir, "config.ini")
config_path = config_path.replace('\\', '/')
logger.info("config path : {config_path}".format(config_path=config_path))


def get_user_agent():
    """Load every entry of the "User-Agent" section of the config file.

    Returns:
        A list of ``(name, value)`` pairs, one per option in the section.
    """
    parser = configparser.ConfigParser()
    parser.read(config_path, encoding="utf-8")
    return parser.items("User-Agent")


def random_user_agent():
    """Return the value of one randomly chosen "User-Agent" config entry."""
    # Each entry is a (name, value) pair; only the value is returned.
    entry = random.choice(get_user_agent())
    return entry[1]

Beispiel #3
0
class CrawlXz:
    """Download a page and collect the links of its "topic-title" elements."""

    def __init__(self, url):
        """Remember the target URL and set up the logger and proxy settings.

        Args:
            url: The URL that ``do_request`` will fetch.
        """
        self.logger = LoggerUtil().get_log
        # URL to request.
        self.url = url
        # Proxy mapping passed to urllib's ProxyHandler, e.g.
        #     {"http": "127.0.0.1:8080", "https": "127.0.0.1:8080"}
        self.proxy_list = None

    def do_request(self, timeout=60):
        """Fetch ``self.url`` and return the response body decoded as UTF-8.

        Args:
            timeout: Timeout in seconds passed to ``urlopen``.

        Returns:
            The decoded body as ``str``, or ``None`` if building the request,
            performing it, or decoding the body raised (the error is logged).
        """
        try:
            # Build an opener that goes through the configured proxies.
            proxy_handler = urllib.request.ProxyHandler(self.proxy_list)
            opener = urllib.request.build_opener(proxy_handler,
                                                 urllib.request.HTTPSHandler)
            # Install it globally so that urlopen() uses this opener too.
            urllib.request.install_opener(opener)

            # NOTE: no Accept-Encoding header is sent on purpose. The body is
            # decoded directly below without any decompression step, so the
            # server must not be asked for gzip/deflate content.
            headers = {
                'Connection':
                'close',
                'Upgrade-Insecure-Requests':
                '1',
                'Sec-Fetch-User':
                '******',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Sec-Fetch-Site':
                'same-origin',
                'Sec-Fetch-Mode':
                'navigate',
                'Accept-Language':
                'zh-CN,zh;q=0.9',
                'User-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
            }
            req = urllib.request.Request(self.url, headers=headers)

            # The context manager closes the connection even when reading or
            # decoding fails.
            with urllib.request.urlopen(req, timeout=timeout) as response:
                return response.read().decode("utf8")

        except Exception as e:
            # Best-effort fetch: log the failure and report it as None.
            self.logger.error("do_request error : " + str(e))
            return None

    def parse(self):
        """Fetch the page and map each topic URL to its title.

        Returns:
            A dict whose keys are ``'https://xz.aliyun.com'`` + the element's
            ``href`` and whose values are the element text with zero-width
            spaces removed and surrounding whitespace stripped. The dict is
            empty when the page could not be fetched or yields no tree.
        """
        res = self.do_request()
        if not res:
            # The request failed (already logged) or returned an empty body.
            return {}

        # Build the HTML tree.
        html = etree.HTML(res)
        if html is None:
            # Guard against input that produces no tree at all.
            return {}

        urls = {}
        # Select the topic links with XPath and collect URL -> title.
        for item in html.xpath('//*[@class="topic-title"]'):
            href = item.get('href')
            if not href:
                # Without a link target there is nothing to record.
                continue
            # item.text is None when the element has no leading text.
            title = (item.text or '').replace('\u200b', '').strip()
            urls['https://xz.aliyun.com' + href] = title
        return urls
Beispiel #4
0
# -*- coding: UTF-8 -*-
"""
@Author :haby0
@Desc   :
"""
import pymysql
from utils.config import ConfigParser
from utils.logger import LoggerUtil
from utils.mail import MailUtil

# Module-level logger obtained from the project's LoggerUtil helper.
logger = LoggerUtil().get_log


class MySqlHandle(object):
    def __init__(self):
        self.host = ConfigParser.get_config('MySQL', 'host')
        self.port = ConfigParser.get_config('MySQL', 'port')
        self.username = ConfigParser.get_config('MySQL', 'username')
        self.password = ConfigParser.get_config('MySQL', 'password')
        self.dbname = ConfigParser.get_config('MySQL', 'dbname')
        try:
            # 连接数据库
            connect = pymysql.Connect(host=self.host,
                                      port=int(self.port),
                                      user=self.username,
                                      passwd=self.password,
                                      db=self.dbname,
                                      charset='utf8')
        except Exception as e:
            logger.error('database conn error : {e}'.format(e=e))