Example #1
0
class ProxyRefreshSchedule(ProxyManager):
    """
    Scheduled proxy refresh.

    Validates the proxies waiting in the raw queue and promotes the working
    ones to the useful queue.
    """
    def __init__(self):
        ProxyManager.__init__(self)
        # Replace the logger installed by ProxyManager with a task-specific one.
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and put the usable ones into
        useful_proxy_queue.

        The raw queue is drained with ``pop()`` until it returns a falsy value.
        :return: None
        """
        # Snapshot the *useful* queue so proxies that are already stored are
        # neither re-validated nor re-inserted.  The snapshot used to be read
        # while the raw table was selected, which made the membership test
        # compare against the wrong table.
        # NOTE(review): assumes getAll() yields items comparable with what
        # pop() returns (same str/bytes type) - confirm against DbClient.
        self.db.changeTable(self.useful_proxy_queue)
        exist_proxy = self.db.getAll() or ()

        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy = self.db.pop()
        self.log.info('%s start validProxy_a' % time.ctime())
        while raw_proxy:
            # Cheap membership test first; the network validation only runs
            # for proxies that are not stored yet.
            if (raw_proxy not in exist_proxy) and validUsefulProxy(raw_proxy):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(raw_proxy)
                self.log.info('validProxy_a: %s validation pass' % raw_proxy)
            else:
                self.log.debug('validProxy_a: %s validation fail' % raw_proxy)
            # put() may have switched tables; always pop from the raw queue.
            self.db.changeTable(self.raw_proxy_queue)
            raw_proxy = self.db.pop()
        self.log.info('%s validProxy_a complete' % time.ctime())
Example #2
0
    def __init__(self, dbtype='sqlit'):
        """
        Open the SQLite database described by a configuration section.

        :param dbtype: name of the database configuration section to load
        """
        self.log = LogHandler("db")
        settings = DBConfig().get_db_config(dbtype)
        # The database directory is resolved relative to the parent directory
        # of CURRENT_PATH.
        base_dir = os.path.dirname(os.path.abspath(CURRENT_PATH))
        db_dir = os.path.join(base_dir, settings.get('path'))
        db_file = os.path.join(db_dir, settings.get("dbname"))
        print(db_file)
        self.conn = sqlite3.connect(db_file)
        self.c = self.conn.cursor()
Example #3
0
    def __init__(self, dbtype):
        """
        Load the configuration for ``dbtype`` and connect when it is MySQL.

        :param dbtype: database type / configuration section name
        """
        self.log = LogHandler("db")
        settings = DBConfig().get_db_config(dbtype)
        # Only the "mysql" type opens a connection; any other value leaves
        # the instance without a ``connection`` attribute.
        if dbtype != "mysql":
            return
        self.connection = pymysql.connect(**settings)
Example #4
0
    def __init__(self, urltype):
        """
        Pick the listing URL for ``urltype`` and set up the helper objects.

        :param urltype: 0 - domestic high-anonymity proxy IPs;
                        1 - domestic transparent proxy IPs;
                        2 - domestic HTTPS proxy IPs;
                        3 - foreign high-anonymity proxy IPs
        """
        urls_by_type = {
            0: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=1',
            1: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=2',
            2: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=3',
            3: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=4',
        }
        # Any other value leaves ``self.url`` unset.
        if urltype in (0, 1, 2, 3):
            self.url = urls_by_type[urltype]
        self.ua = UserAgent()
        storage = SqliteClient()
        self.sqlite = storage
        storage.create_table_sqlite()
        self.log = LogHandler("db")
class ProxyManager(object):
    """
    Facade over the proxy store: fetches raw proxies and exposes the useful
    queue (get / delete / getAll / status).
    """
    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        Run every configured getter of GeteFreeProxy and store the distinct,
        non-blank proxies it yields in the raw queue.
        """
        for getter_name in self.config.proxy_getter_functions:
            fetched = set()
            fetch = getattr(GeteFreeProxy, getter_name.strip())
            for candidate in fetch():
                cleaned = candidate.strip()
                if not cleaned:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=getter_name, proxy=candidate))
                fetched.add(cleaned)

            # Persist this getter's batch before moving on to the next getter.
            self.db.changeTable(self.raw_proxy_queue)
            for cleaned in fetched:
                self.db.put(cleaned)

    def get(self):
        """Return one proxy from the useful queue."""
        self.db.changeTable(self.useful_proxy_queue)
        picked = self.db.get()
        return picked

    def delete(self, proxy):
        """Remove ``proxy`` from the useful queue."""
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return everything stored in the useful queue."""
        self.db.changeTable(self.useful_proxy_queue)
        everything = self.db.getAll()
        return everything

    def get_status(self):
        """
        Return the store's status value for both queues.

        :return: dict with the keys 'raw_proxy' and 'useful_proxy'
        """
        report = {}
        queues = (
            ('raw_proxy', self.raw_proxy_queue),
            ('useful_proxy', self.useful_proxy_queue),
        )
        for key, queue in queues:
            self.db.changeTable(queue)
            report[key] = self.db.get_status()
        return report
class ProxyValidSchedule(ProxyManager):
    """
    Continuously re-validates the proxies in the useful queue, keeping a
    per-proxy score and deleting proxies whose score drops below -5.
    """
    # Seconds to wait before polling again when the useful queue is empty,
    # so an empty store is not hammered in a tight loop.
    EMPTY_QUEUE_PAUSE = 5

    def __init__(self):
        ProxyManager.__init__(self)
        # Replace the logger installed by ProxyManager with a task-specific one.
        self.log = LogHandler('valid_schedule')

    def __validProxy(self):
        """
        Validate the stored proxies forever.

        Each pass walks the whole useful queue: a successful check increments
        the proxy's counter, a failed one decrements it, and a proxy whose
        counter is below -5 is deleted.
        :return: never returns
        """
        import time  # local import: only needed for the empty-queue pause

        while True:
            self.db.changeTable(self.useful_proxy_queue)
            checked = 0
            for each_proxy in self.db.getAll():
                checked += 1
                if isinstance(each_proxy, bytes):
                    each_proxy = each_proxy.decode('utf-8')

                if validUsefulProxy(each_proxy):
                    # success: counter + 1
                    self.db.inckey(each_proxy, 1)
                    self.log.debug(
                        'validProxy_b: {} validation pass'.format(each_proxy))
                else:
                    # failure: counter - 1
                    self.db.inckey(each_proxy, -1)
                    self.log.info(
                        'validProxy_b: {} validation fail'.format(each_proxy))
                value = self.db.getvalue(each_proxy)
                if value and int(value) < -5:
                    # counter below -5: drop the proxy
                    self.db.delete(each_proxy)

            if checked:
                # Heartbeat once per completed pass.  This statement used to
                # sit after the infinite loop and could never execute.
                self.log.info('validProxy_a running normal')
            else:
                time.sleep(self.EMPTY_QUEUE_PAUSE)

    def main(self):
        """Entry point: start the endless validation loop."""
        self.__validProxy()
Example #7
0
 def __init__(self):
     """
     Build the default request headers, the HTTP session and the logger.
     """
     default_headers = {
         "Accept": "*/*",
         "Accept-Encoding": "gzip,deflate,sdch",
         "Accept-Language": "zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4",
         "Connection": "keep-alive",
         "Content-Type": "application/x-www-form-urlencoded",
         "Host": "music.163.com",
         "Referer": "http://music.163.com",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
     }
     self.header = default_headers
     self.session = requests.session()
     self.log = LogHandler('NeteaseApi')
Example #8
0
   Date         :   2019/12/1
-------------------------------------------------
"""

import platform

from flask import Flask, request

from common.Response import Response
from config.Getter import config
from service.MusicooService import MusicooService
from util.LogHandler import LogHandler

# Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)

# Module-level logger named 'Musicoo'.
log = LogHandler('Musicoo')


@app.route('/', methods=['GET'])
def index():
    """Handle ``GET /`` and return the literal string 'index'."""
    return 'index'


@app.route('/netease/song/<song_id>/url', methods=['GET'])
def song_url(song_id):
    """
    获取音乐链接
        /netease/song/1379444316/url
    :param song_id:
    :return:
    """
Example #9
0
--------------------------------------------
"""
__author__ = 'wanglin'

import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# 引入外部文件
from jinja2html import create_html
from monitorSpider import get_info
from util.LogHandler import LogHandler
from util.DBManager import get_table_count

# Module-level logger for the mail-sending script.
log = LogHandler('mailsend')
# Sender and recipient addresses (the values shown here are redacted).
_sender_address = '*****@*****.**'
_reciver_address = '*****@*****.**'
# Subject line; the date is formatted once, when this module is imported.
_subject = u'平台报告-【%s】' % datetime.datetime.now().strftime('%Y-%m-%d')
# NOTE(review): the SMTP password is hard-coded in source; presumably a
# placeholder - confirm, and prefer loading it from configuration.
_passwd = 'passwd'
# Host name of the SMTP server.
_smtpadd = 'smtp.ruifucredit.com'


def sendMail(sender, reciver, subject, content, passwd, smtpadd):
    log.info('Start to initialize the mail message.')
    username = sender
    password = passwd
    msg = MIMEMultipart('related')
    msg['Subject'] = subject
    # html格式
    html = content
Example #10
0
class NetEase(object):
    """
    Small client for the NetEase Cloud Music "weapi" endpoints.

    Every public method builds a parameter dict, encrypts it through
    ``_get_form_data`` and sends it with ``request``.
    """
    def __init__(self):
        """
        Build the default request headers, the HTTP session and the logger.
        """
        self.header = {
            "Accept":
            "*/*",
            "Accept-Encoding":
            "gzip,deflate,sdch",
            "Accept-Language":
            "zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4",
            "Connection":
            "keep-alive",
            "Content-Type":
            "application/x-www-form-urlencoded",
            "Host":
            "music.163.com",
            "Referer":
            "http://music.163.com",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        self.session = requests.session()
        self.log = LogHandler('NeteaseApi')

    def _raw_request(self, method, url, data=None):
        """
        Perform the actual HTTP call.

        :param method: "POST" or "GET"
        :param url: absolute url
        :param data: query parameters (GET) or form data (POST)
        :return: the response object returned by the session
        :raises ValueError: if ``method`` is neither "GET" nor "POST"
        """
        if method == "GET":
            return self.session.get(url,
                                    params=data,
                                    headers=self.header,
                                    timeout=DEFAULT_TIMEOUT)
        if method == "POST":
            return self.session.post(url,
                                     data=data,
                                     headers=self.header,
                                     timeout=DEFAULT_TIMEOUT)
        # Previously this fell through to ``return response`` with the name
        # unbound, producing an UnboundLocalError.
        raise ValueError("unsupported HTTP method: {!r}".format(method))

    def _get_form_data(self, encrypt_data):
        """
        Encrypt the request payload into the form fields the API expects.

        The payload is passed through ``netease.aes`` twice (first with
        ``netease.NONCE``, then with a freshly created 16-character key) and
        that key is passed through ``netease.rsa``.
        :param encrypt_data: payload to encrypt
        :return: {"params": ..., "encSecKey": ...}
        """
        key = netease.create_key(16)
        return {
            "params": netease.aes(netease.aes(encrypt_data, netease.NONCE),
                                  key),
            "encSecKey": netease.rsa(key, netease.PUBKEY, netease.MODULUS)
        }

    def request(self, method, path, data=None, default=None):
        """
        Unified request method.

        :param method: "POST" or "GET"
        :param path: path appended to BASE_URL
        :param data: unencrypted payload; it is copied, never modified
        :param default: value returned when the request or the JSON decoding
                        fails; a fresh ``{"code": -1}`` when omitted
        :return: the decoded JSON response, or ``default`` on failure
        """
        url = "{}{}".format(BASE_URL, path)
        # Fresh objects on every call: the old ``data={}`` / ``default={...}``
        # defaults were shared between calls, and ``data`` was mutated.
        if default is None:
            default = {"code": -1}
        payload = dict(data) if data else {}
        payload.update({"csrf_token": ""})

        params = self._get_form_data(json.dumps(payload).encode('utf-8'))
        self.log.debug(
            '[Netease api] url: {};\trequest  data: {};\tparams: {}'.format(
                url, payload, params))
        try:
            raw = self._raw_request(method, url, params)
        except requests.exceptions.RequestException as e:
            self.log.error('[Netease api] request error: {}'.format(e))
            return default
        except Exception as e:
            # Keep the historical "never raise from the network step"
            # contract, but log instead of swallowing silently.
            self.log.error('[Netease api] request error: {}'.format(e))
            return default

        try:
            parsed = raw.json()
        except ValueError:
            # Not JSON: report a short prefix of the body and fall back to
            # ``default`` rather than handing the raw response to the caller.
            self.log.error(
                "[Netease api] request error; Path: {}, response: {}".format(
                    path, raw.text[:200]))
            return default

        self.log.debug('[Netease api] url: {};\tresponse data: {}'.format(
            url, parsed))
        return parsed

    def songs_url(self, song_id):
        """
        Get the playable (external) url of a song.
            {ids: "[514235010]", level: "standard", encodeType: "aac", csrf_token: ""}
        :param song_id: song id
        :return: decoded JSON response containing the url
        """
        path = "/weapi/song/enhance/player/url/v1?csrf_token="
        params = {
            'ids': '[' + str(song_id) + ']',
            'level': 'standard',
            'encodeType': 'aac',
            'csrf_token': ''
        }
        return self.request(POST, path, params)

    def songs_lyric(self, song_id):
        """
        Get the lyrics of a song.
            {id: "186453", lv: -1, tv: -1, csrf_token: ""}
        :param song_id: song id
        :return: decoded JSON response
        """
        path = "/weapi/song/lyric?csrf_token="
        params = {'id': str(song_id), 'lv': -1, 'tv': -1, 'csrf_token': ''}
        return self.request(POST, path, params)

    def songs_search(self, keyword, offset=0, limit=30):
        """
        Search songs by keyword (the usual search entry point).
            {hlpretag: "<span class="s-fc7">", hlposttag: "</span>", s: "<keyword>", type: "1", offset: "0", ...}
        :param keyword: search text
        :param offset: index of the first result
        :param limit: maximum number of results
        :return: decoded JSON response
        """
        path = '/weapi/cloudsearch/get/web?csrf_token='
        params = {
            'csrf_token': '',
            'hlposttag': '</span>',
            'hlpretag': '<span class="s-fc7">',
            'limit': str(limit),
            'offset': str(offset),
            's': str(keyword),
            'total': 'true',
            'type': '1'
        }
        return self.request(POST, path, params)

    def songs_search_(self, song):
        """
        Search-box suggestion endpoint (rarely used).
            {s: "<keyword>", limit: "8", csrf_token: ""}
        :param song: search text
        :return: decoded JSON response
        """
        path = "/weapi/search/suggest/web?csrf_token="
        params = {'s': str(song), 'limit': 8, 'csrf_token': ''}
        return self.request(POST, path, params)

    def songs_detail(self, song_id):
        """
        Get the details of a song.
            {id: "186453", c: "[{"id":"186453"}]", csrf_token: ""}
        :param song_id: required, the song id
        :return: decoded JSON response
        """
        path = "/weapi/v3/song/detail?csrf_token="
        params = {
            'id': str(song_id),
            # NOTE(review): this string uses single quotes, unlike the
            # double-quoted sample above - confirm the endpoint accepts it.
            'c': "[{'id': " + str(song_id) + "}]",
            'csrf_token': ''
        }
        return self.request(POST, path, params)
Example #11
0
 def __init__(self):
     """Run the ProxyManager initialiser, then install the 'refresh_schedule' logger."""
     ProxyManager.__init__(self)
     # Assigned after the base initialiser has run.
     self.log = LogHandler('refresh_schedule')
 def __init__(self):
     """Set the queue names and create the store client, config and logger."""
     # Names of the two tables the manager switches between.
     self.raw_proxy_queue = 'raw_proxy'
     self.useful_proxy_queue = 'useful_proxy'
     self.db = DbClient()
     self.config = GetConfig()
     self.log = LogHandler('proxy_manager')
Example #13
0
class DBConfig(object):
    """Reads and writes the settings stored in ``config.ini`` under ROOT_PATH."""

    # Placeholder written to the log instead of a real password.
    _MASK = "******"

    def __init__(self, ):
        self.config = ConfigParser()
        self.name = "config.ini"
        self.sql_path = os.path.join(ROOT_PATH, self.name)
        self.log = LogHandler("db")

    def _save(self):
        """Write the in-memory configuration back to the ini file."""
        with open(self.sql_path, "w", encoding="utf8") as f:
            self.config.write(f)

    def add_db_config(self, dbtype, host, port, user, password, database,
                      charset):
        """
        Add or modify a database configuration section in the ini file.

        :param dbtype: database type (used as the section name)
        :param host: host
        :param port: port (int or str)
        :param user: user name
        :param password: password
        :param database: database name
        :param charset: character set
        :return: True once the file has been written
        """
        self.config.read(self.sql_path, encoding="utf-8")

        # New section when missing; otherwise the existing one is updated.
        if dbtype not in self.config:
            self.config.add_section(dbtype)

        settings = (
            ("host", host),
            ("port", port),
            ("user", user),
            ("password", password),
            ("database", database),
            ("charset", charset),
        )
        for option, value in settings:
            # ConfigParser only accepts string values (an int port would
            # raise TypeError), so convert explicitly.
            self.config.set(dbtype, option, str(value))

        self._save()
        # The password is masked: credentials must not end up in log files.
        self.log.info(
            "Amend the success , Modifying the data %s" %
            [dbtype, host, port, user, self._MASK, database, charset])
        return True

    def get_db_config(self, dbtyep):
        """
        Return the configuration of one database section.

        :param dbtyep: database type (section name)
        :return: dict of the section's options with ``port`` converted to
                 int, or None when the section does not exist
        """
        self.config.read(self.sql_path, encoding="utf-8")
        if dbtyep not in self.config:
            self.log.error("Parameter error %s" % dbtyep)
            return None

        option = dict(self.config.items(dbtyep))
        if "port" in option:
            option["port"] = int(option["port"])

        # Log a copy with the credentials masked.
        printable = dict(option)
        for secret_key in ("password", "passwd"):
            if secret_key in printable:
                printable[secret_key] = self._MASK
        self.log.info("success %s" % printable)
        return option

    def update_config(self, section, option, value):
        """
        Change one existing option of an existing section.

        :param section: section name
        :param option: key to change
        :param value: new value
        :return: True when written, None when the section or option is unknown
        """
        self.config.read(self.sql_path, encoding="utf-8")
        if section not in self.config.sections():
            self.log.error("Parameter error %s" % section)
            return None
        # has_option applies the parser's key normalisation, so the lookup
        # agrees with how option names are stored.
        if not self.config.has_option(section, option):
            self.log.error("Parameter error %s" % option)
            return None

        self.config.set(section, option, value)
        self.log.info("Need to be modified")
        self._save()
        self.log.info("Amend the success")
        return True

    def add_config(self, section, option, value):
        """
        Set an option, creating the section first when it does not exist.

        :param section: section name
        :param option: key to set
        :param value: value to store
        :return: True once the file has been written
        """
        self.config.read(self.sql_path, encoding="utf-8")
        if section not in self.config.sections():
            self.config.add_section(section)
        self.config.set(section, option, value)
        self._save()
        self.log.info("Amend the success")
        # The documented contract is "True on success"; this used to return
        # None implicitly.
        return True
 def __init__(self):
     """Run the ProxyManager initialiser, then install the 'valid_schedule' logger."""
     ProxyManager.__init__(self)
     # Assigned after the base initialiser has run.
     self.log = LogHandler('valid_schedule')
Example #15
0
--------------------------------------------
"""
__author__ = 'wanglin'

import os
import json
import cx_Oracle
import MySQLdb
from DBUtils.PooledDB import PooledDB
from util.ConfigHandler import ConfigHandler
from util.Singleton import Singleton
from util.LogHandler import LogHandler

# Set NLS_LANG for this process at import time (presumably so the Oracle
# client exchanges text as UTF-8 Simplified Chinese - confirm).
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'

# Module-level logger named 'DBManager'.
log = LogHandler('DBManager')

class DBManager(object):
    """
    Owns a PooledDB connection pool for either Oracle (cx_Oracle) or MySQL
    (MySQLdb), configured from ConfigHandler.
    """
    # Python 2 metaclass hook; under Python 3 this is only a class attribute.
    __metaclass__ = Singleton

    def __init__(self, option='oracle'):
        """
        Create the connection pool.

        :param option: 'oracle' selects cx_Oracle; any other value selects
                       MySQLdb. Also used as the key passed to
                       ConfigHandler().get().
        """
        self.option = option
        self.config = ConfigHandler().get(self.option)
        cfg = self.config

        # Pool sizing shared by both back ends.
        pool_kwargs = {'mincached': 2, 'maxcached': 2, 'maxshared': 5,
                       'maxconnections': 10}
        if option != 'oracle':
            driver = MySQLdb
            pool_kwargs.update({
                'host': cfg['host'],
                'port': int(cfg['port']),
                'user': cfg['user'],
                'passwd': cfg['passwd'],
                'db': cfg['db'],
                'charset': cfg['charset'],
            })
        else:
            driver = cx_Oracle
            dsn = '{host}:{port}/{db}'.format(host=cfg['host'],
                                              port=cfg['port'],
                                              db=cfg['db'])
            pool_kwargs.update({'user': cfg['user'],
                                'password': cfg['passwd'],
                                'dsn': dsn})
        self._pool = PooledDB(driver, **pool_kwargs)
Example #16
0
class SqliteClient(object):
    """Small helper around one SQLite connection that stores proxy rows in ``ipdaili``."""

    def __init__(self, dbtype='sqlit'):
        """
        Open the SQLite database described by a configuration section.

        :param dbtype: name of the database configuration section to load
        :raises ValueError: when no configuration exists for ``dbtype``
        """
        self.log = LogHandler("db")
        # Define the handles first so __del__ stays safe if setup fails.
        self.conn = None
        self.c = None

        db_config = DBConfig().get_db_config(dbtype)
        if db_config is None:
            # Used to surface as an opaque AttributeError on ``None.get``.
            raise ValueError(
                "no database configuration found for %r" % (dbtype,))

        root_path = os.path.join(
            os.path.dirname(os.path.abspath(CURRENT_PATH)),
            db_config.get('path'))
        db_path = os.path.join(root_path, db_config.get("dbname"))
        # Report through the logger instead of a stray debug print.
        self.log.info("sqlite database path: %s" % db_path)
        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()

    def create_table_sqlite(self):
        """
        Create the ``ipdaili`` table when it does not exist yet.

        :return: True on success, False when the statement failed
        """
        sql = "create table if not exists ipdaili(ip_addr TEXT, ip_port TEXT, type TEXT,ip_proxy TEXT, Downloadtime TEXT)"
        try:
            self.c.execute(sql)
            self.conn.commit()
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info("create success")
        return True

    def insert_table_sqlite(self, ip_addr, ip_port, type,ip_proxy):
        """
        Insert one proxy row, stamped with the current local time.

        :param ip_addr: ip address
        :param ip_port: port
        :param type: proxy type
        :param ip_proxy: full proxy url
        :return: True on success, False when the insert failed
        """
        downloadtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            self.c.execute("INSERT INTO ipdaili (ip_addr,ip_port,type,ip_proxy,Downloadtime) VALUES (?,?,?,?,?)",
                           (ip_addr, ip_port, type, ip_proxy, downloadtime))
            self.conn.commit()
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info("insert success")
        return True

    def search_table_sqlite(self, sql="select * from ipdaili", params=()):
        """
        Run a query and return all of its rows.

        :param sql: statement to execute; use ``?`` placeholders together
                    with ``params`` rather than formatting values into it
        :param params: values bound to the placeholders in ``sql``
        :return: list of row tuples, or False when the query failed
        """
        try:
            # Fetch inside the guarded section so a failing fetch is also
            # reported as False instead of escaping as an exception.
            rows = self.c.execute(sql, params).fetchall()
            self.conn.commit()
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info("search success")
        return rows

    def __del__(self):
        """
        Close the connection if one was opened.
        :return:
        """
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
Example #17
0
class MysqlCline(object):
    """Small helper around one pymysql connection that stores proxy rows in ``ipdaili``."""

    def __init__(self, dbtype):
        """
        Load the configuration for ``dbtype`` and connect when it is MySQL.

        :param dbtype: database type / configuration section name
        """
        self.log = LogHandler("db")
        # Always defined, so the other methods and __del__ can cope with an
        # instance that never connected.
        self.connection = None

        dbconfig = DBConfig().get_db_config(dbtype)
        if dbtype == "mysql":
            self.connection = pymysql.connect(**dbconfig)

    def create_table_mysql(self):
        """
        Create the ``ipdaili`` table when it does not exist yet.

        :return: True on success, False when the statement failed
        """
        sql = """CREATE TABLE IF NOT EXISTS ipdaili (
          ip_addr varchar(30) DEFAULT NULL,
          ip_port varchar(11) DEFAULT NULL,
          type varchar(10) DEFAULT NULL,
          Downloadtime varchar(30) DEFAULT NULL
            )"""
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql)
            self.connection.commit()
        except Exception as e:
            # A ``finally: return True`` used to override this result, so
            # failures were reported as success.
            self.log.error(e)
            return False
        finally:
            if cursor is not None:
                cursor.close()
        self.log.info("create success")
        return True

    def insert_table_mysql(self, ip_addr, ip_port, type):
        """
        Insert one proxy row, stamped with the current local time.

        :param ip_addr: ip address
        :param ip_port: port
        :param type: proxy type
        :return: True on success, False when the insert failed
        """
        downloadtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Placeholders let the driver escape the values; the statement used
        # to be assembled by string concatenation (SQL injection risk).
        sql = "INSERT INTO ipdaili VALUES (%s, %s, %s, %s)"
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql, (ip_addr, ip_port, type, downloadtime))
            self.connection.commit()
        except Exception as e:
            self.log.error(e)
            return False
        finally:
            if cursor is not None:
                cursor.close()
        # Logged only on the success path (it used to be logged on failure
        # as well).
        self.log.info("insert success")
        return True

    def search_table_mysql(self, sql="select * from ipdaili"):
        """
        Run a query and return all of its rows.

        :param sql: query to execute
        :return: the fetched rows, or False when the query failed
        """
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql)
            res = cursor.fetchall()
        except Exception as e:
            # ``finally: return res`` used to raise UnboundLocalError here
            # instead of returning False.
            self.log.error(e)
            return False
        finally:
            if cursor is not None:
                cursor.close()
        self.log.info("search success")
        return res

    def __del__(self):
        """
        Close the database connection if one was opened.
        :return:
        """
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()
    File Name:      monitorSpider
    Description:    
    Author:         wanglin
    Date:           2017/12/28
--------------------------------------------
    Change Activity:2017/12/28;
--------------------------------------------
"""
__author__ = 'wanglin'

import json
import requests
import datetime
from util.LogHandler import LogHandler

# Module-level logger named 'monitorSpider'.
log = LogHandler('monitorSpider')


# Summary record with every field preset to the '-' placeholder.
# NOTE(review): the keys presumably mirror fields of the NameNodeInfo JMX
# bean queried in get_info() - confirm.
name = {'Total': '-', 'Used': '-', 'Free': '-', 'PercentUsed': '-', 'TotalBlocks': '-', 'TotalFiles': '-',
        'SoftwareVersion': '-'}

# Per-node record with every field preset to the '-' placeholder.
node = {'name': '-', 'lastContact': '-', 'xferaddr': '-', 'adminState': '-', 'capacity': '-', 'usedSpace': '-',
        'blockPoolUsedPercent': '-', 'version': '-', }


def get_info():
    url = 'http://192.168.88.1:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo'
    try:
        r = requests.get(url=url)
    except Exception as ex:
        print(ex)
Example #19
0
def test_log_handler():
    """Smoke-test LogHandler: log once, then rename the logger twice and log after each rename."""
    handler = LogHandler("Tlog")
    handler.info("test log")
    for new_name in ("test1", "test2"):
        handler.resetName(new_name)
        handler.info('this is a log from ' + new_name)
Example #20
0
"""
-------------------------------------------
    File Name:      jinja2html
    Description:    
    Author:         wanglin
    Date:           2017/12/28
--------------------------------------------
    Change Activity:2017/12/28;
--------------------------------------------
"""
__author__ = 'wanglin'

import os
from jinja2 import Environment, FileSystemLoader
from util.LogHandler import LogHandler

# Module-level logger named 'jinja2html'.
log = LogHandler('jinja2html')
# Directory containing this file; templates are loaded from its 'templates'
# sub-directory.
PATH = os.path.dirname(os.path.abspath(__file__))
# Shared Jinja2 environment.
# NOTE(review): autoescape is disabled, so values rendered into templates are
# not HTML-escaped - confirm that all rendered data is trusted.
TEMPLATE_ENVIRONMENT = Environment(autoescape=False,
                                   loader=FileSystemLoader(
                                       os.path.join(PATH, 'templates')),
                                   trim_blocks=False)


def create_html(nameinfo, datainfo, tableinfo):
    """
    Render the 'base.html' template into an HTML string.

    :param nameinfo: value exposed to the template as ``info``
    :param datainfo: value exposed to the template as ``datainfo``
    :param tableinfo: value exposed to the template as ``tableinfo``
    :return: the rendered HTML
    """
    context = dict(info=nameinfo, datainfo=datainfo, tableinfo=tableinfo)
    log.info('Jinja1 context info: {}'.format(context))
    template = TEMPLATE_ENVIRONMENT.get_template('base.html')
    rendered = template.render(context)
    log.info('Successful rendering report page. ')
    return rendered
Example #21
0
class IpSpider(object):
    """
    Scrapes the free proxy listings of www.pcdaili.com, checks each proxy and
    stores the working ones through SqliteClient.
    """
    # Seconds to wait for a single HTTP request before giving up; without a
    # timeout a dead proxy or server can block the spider forever.
    REQUEST_TIMEOUT = 10
    # Maximum number of table rows kept per scraped page.
    MAX_ROWS_PER_PAGE = 13

    def __init__(self, urltype):
        """
        Pick the listing URL for ``urltype`` and set up the helper objects.

        :param urltype: 0 - domestic high-anonymity proxy IPs;
                        1 - domestic transparent proxy IPs;
                        2 - domestic HTTPS proxy IPs;
                        3 - foreign high-anonymity proxy IPs
        """
        url_list = {
            0: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=1',
            1: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=2',
            2: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=3',
            3: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=4',
        }
        # An unsupported type leaves ``url`` as None (the attribute used to
        # be missing entirely); run_spider reports that case clearly.
        self.url = url_list.get(urltype) if urltype in [0, 1, 2, 3] else None
        self.ua = UserAgent()
        self.sqlite = SqliteClient()
        self.sqlite.create_table_sqlite()
        self.log = LogHandler("db")

    def run_spider(self, page):
        """
        Scrape the first ``page`` listing pages.

        Pages that cannot be downloaded or parsed are logged and skipped.
        :param page: number of pages to fetch
        :return: list with one entry per scraped page, each a list of row
                 tuples as produced by ``group_list``
        :raises ValueError: when the spider was built with an unsupported
                            ``urltype``
        """
        if self.url is None:
            raise ValueError("IpSpider was created with an unsupported urltype")

        iplist = []
        for x in range(1, page + 1):
            headers = {'Host': 'www.pcdaili.com', "user-agent": self.ua.chrome}
            sp_url = self.url + "&page=%d" % x
            try:
                r = requests.get(sp_url,
                                 headers=headers,
                                 timeout=self.REQUEST_TIMEOUT)
                html = etree.HTML(r.text)
            except Exception as e:
                # The parsing used to live in ``finally`` and ran even after
                # a failed request, hitting an unbound ``r`` or silently
                # re-using the previous page's response.
                self.log.error(e)
            else:
                if html is None:
                    self.log.error("empty document returned by %s" % sp_url)
                else:
                    res = html.xpath(
                        '/html/body/div/div/div[2]/table/tbody/tr/td/text()')
                    iplist.append(self.group_list(res, 7))
                    self.log.info("spider html ok")
            finally:
                # Keep the one-second pause between pages.
                time.sleep(1)
        return iplist

    def group_list(self, grouped, length):
        """
        Split a flat list into consecutive tuples of ``length`` items.

        The last tuple may be shorter; at most MAX_ROWS_PER_PAGE tuples are
        returned.
        :param grouped: flat list
        :param length: group size
        :return: [(), ()]
        """
        d = [
            tuple(grouped[i:i + length])
            for i in range(0, len(grouped), length)
        ]

        return d[:self.MAX_ROWS_PER_PAGE]

    def ip_insert_sql(self, ip_list):
        """
        Validate the scraped proxies and store the working ones.

        :param ip_list: result of ``run_spider`` (list of pages, each a list
                        of row tuples)
        :return: True
        """
        for page_rows in ip_list:
            for x in page_rows:
                if len(x) < 4:
                    # Incomplete trailing group: address, port and type are
                    # not all present (this used to raise IndexError).
                    self.log.error("skip incomplete row: %s" % (x,))
                    continue
                ip_addr = x[0]
                ip_port = x[1]
                type = x[3]
                ip_proxy = type + "://" + ip_addr + ":" + ip_port
                if self.validate_ip(type=type, ip_proxy=ip_proxy):
                    self.sqlite.insert_table_sqlite(
                        ip_addr=ip_addr,
                        ip_port=ip_port,
                        type=type,
                        ip_proxy=ip_proxy)

        return True

    def validate_ip(self, type, ip_proxy):
        """
        Check whether https://weibo.com/ can be requested with the proxy.

        :param type: proxy type, used as the key of the ``proxies`` mapping
        :param ip_proxy: proxy url
        :return: True when the request completed, False when it raised
        """
        test_url = "https://weibo.com/"
        # NOTE(review): requests selects a proxy by the URL scheme; confirm
        # that ``type`` is a lowercase scheme matching ``test_url``,
        # otherwise the proxy may not be exercised at all.
        proxies = {type: ip_proxy}

        try:
            requests.get(test_url,
                         proxies=proxies,
                         timeout=self.REQUEST_TIMEOUT)
        except Exception as e:
            self.log.error(e)
            return False
        else:
            self.log.info(ip_proxy + " is ok !test url is " + test_url)
            return True
Example #22
0
 def __init__(self, ):
     """Create the parser, the path of config.ini under ROOT_PATH and the 'db' logger."""
     self.config = ConfigParser()
     self.name = "config.ini"
     # Built from self.name, so it must be assigned after it.
     self.sql_path = os.path.join(ROOT_PATH, self.name)
     self.log = LogHandler("db")
# -*- coding: utf-8 -*-
import requests
from lxml import etree
from util.LogHandler import LogHandler
from util.WebRequest import WebRequest

# Module-level logger named after this module.
logger = LogHandler(__name__)


def robustCrawl(func):
    """
    Decorator that makes a crawl function best-effort.

    Any ``Exception`` raised by ``func`` is logged and swallowed; the wrapped
    call then returns None.
    :param func: function to protect
    :return: wrapper that keeps the name and docstring of ``func``
    """
    import functools

    # functools.wraps keeps __name__ / __doc__ of the decorated function, so
    # name-based lookups and log output stay meaningful.
    @functools.wraps(func)
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            logger.info(u"sorry,主区出错。原因:")
            logger.info(e)

    return decorate


def verifyProxyFormat(proxy):
    """
    Check that ``proxy`` is exactly of the form ``a.b.c.d:port``.

    Each address part must be 1-3 digits and the port 1-5 digits; numeric
    ranges are not checked.
    :param proxy: candidate proxy string
    :return: True when the whole string matches, otherwise False
    """
    import re
    # Anchored at both ends: re.match pins the start and \Z the end.  The
    # former re.findall accepted any text that merely contained a proxy-like
    # substring, e.g. "junk1.2.3.4:80junk".
    verify_regex = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}\Z'
    return re.match(verify_regex, proxy) is not None


def getHtmlTree(url, **kwargs):
    header = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',