class ProxyValidSchedule(ProxyManager):
    """Continuously re-validate the proxies stored in the useful-proxy table.

    Every proxy has a counter in the database: a successful check adds 1,
    a failed check subtracts 1, and a proxy whose counter is below ``-5``
    is deleted.
    """

    def __init__(self):
        ProxyManager.__init__(self)
        # Use a logger dedicated to this schedule.
        self.log = LogHandler('valid_schedule')

    def __validProxy(self):
        """Validate every useful proxy, over and over.

        This method loops forever and never returns.
        """
        while True:
            self.db.changeTable(self.useful_proxy_queue)
            for each_proxy in self.db.getAll():
                if isinstance(each_proxy, bytes):
                    each_proxy = each_proxy.decode('utf-8')

                if validUsefulProxy(each_proxy):
                    # Success: increment the proxy's counter.
                    self.db.inckey(each_proxy, 1)
                    self.log.debug(
                        'validProxy_b: {} validation pass'.format(each_proxy))
                else:
                    # Failure: decrement the proxy's counter.
                    self.db.inckey(each_proxy, -1)
                    self.log.info(
                        'validProxy_b: {} validation fail'.format(each_proxy))

                value = self.db.getvalue(each_proxy)
                if value and int(value) < -5:
                    # Too many failures: drop the proxy.
                    self.db.delete(each_proxy)

            # Heartbeat, emitted once per completed pass. It has to live
            # inside the ``while True`` loop: anything placed after the loop
            # would never run.
            self.log.info('validProxy_a running normal')

    def main(self):
        """Entry point: start the endless validation loop."""
        self.__validProxy()
class ProxyManager(object):
    """Thin facade over the proxy database.

    Two tables are used, named by ``raw_proxy_queue`` and
    ``useful_proxy_queue``. Every operation first selects its table with
    ``self.db.changeTable`` and then delegates to the database client.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Run every configured getter and store its proxies in the raw table.

        Getter names come from ``self.config.proxy_getter_functions`` and
        are looked up as attributes of ``GeteFreeProxy``. Blank results are
        skipped and duplicates within one getter are collapsed.
        """
        for getter_name in self.config.proxy_getter_functions:
            fetch = getattr(GeteFreeProxy, getter_name.strip())
            fetched = set()
            for candidate in fetch():
                cleaned = candidate.strip()
                if not cleaned:
                    continue
                # The log shows the value exactly as the getter yielded it.
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=getter_name, proxy=candidate))
                fetched.add(cleaned)

            self.db.changeTable(self.raw_proxy_queue)
            for entry in fetched:
                self.db.put(entry)

    def get(self):
        """Return whatever ``db.get()`` yields from the useful table."""
        self.db.changeTable(self.useful_proxy_queue)
        picked = self.db.get()
        return picked

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table."""
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return whatever ``db.getAll()`` yields from the useful table."""
        self.db.changeTable(self.useful_proxy_queue)
        everything = self.db.getAll()
        return everything

    def get_status(self):
        """Return ``db.get_status()`` for both tables.

        :return: dict with the keys ``'raw_proxy'`` and ``'useful_proxy'``
        """
        report = {}
        # Raw table first, useful table second, so the database is left
        # pointing at the useful table.
        for label, table in (('raw_proxy', self.raw_proxy_queue),
                             ('useful_proxy', self.useful_proxy_queue)):
            self.db.changeTable(table)
            report[label] = self.db.get_status()
        return report
class ProxyRefreshSchedule(ProxyManager):
    """Scheduled proxy refresh."""

    def __init__(self):
        ProxyManager.__init__(self)
        # Use a logger dedicated to this schedule.
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """Validate the proxies in the raw table and keep the working ones.

        Proxies are popped from the raw table one at a time until it is
        empty. A proxy that is not yet in the useful table and passes
        ``validUsefulProxy`` is put into the useful table.

        :return: None
        """
        self.log.info('%s start validProxy_a' % time.ctime())

        # Snapshot the proxies that are already useful. This must be read
        # from the useful table: reading it while the raw table is selected
        # would compare each popped proxy against the raw queue instead.
        self.db.changeTable(self.useful_proxy_queue)
        exist_proxy = self.db.getAll() or ()

        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy = self.db.pop()
        while raw_proxy:
            # Cheap membership test first, so known proxies are not
            # re-validated over the network.
            if (raw_proxy not in exist_proxy) and validUsefulProxy(raw_proxy):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(raw_proxy)
                self.log.info('validProxy_a: %s validation pass' % raw_proxy)
            else:
                self.log.debug('validProxy_a: %s validation fail' % raw_proxy)

            # Switch back to the raw table before taking the next candidate.
            self.db.changeTable(self.raw_proxy_queue)
            raw_proxy = self.db.pop()

        self.log.info('%s validProxy_a complete' % time.ctime())
def test_log_handler():
    """Smoke-test LogHandler: log once, then rename twice and log each time."""
    handler = LogHandler("Tlog")
    handler.info("test log")
    for new_name in ("test1", "test2"):
        handler.resetName(new_name)
        handler.info('this is a log from %s' % new_name)
class DBConfig(object):
    """Read and write sections of the ``config.ini`` file under ``ROOT_PATH``."""

    # Placeholder written to the log instead of a real password.
    _MASK = "******"

    def __init__(self):
        self.config = ConfigParser()
        self.name = "config.ini"
        self.sql_path = os.path.join(ROOT_PATH, self.name)
        self.log = LogHandler("db")

    def _load(self):
        """Merge the file at ``self.sql_path`` into the in-memory parser."""
        self.config.read(self.sql_path, encoding="utf-8")

    def _save(self):
        """Write the in-memory configuration back to ``self.sql_path``."""
        with open(self.sql_path, "w", encoding="utf8") as f:
            self.config.write(f)

    def add_db_config(self, dbtype, host, port, user, password, database,
                      charset):
        """Add or overwrite the database settings stored in section ``dbtype``.

        :param dbtype: database type, used as the section name
        :param host: host
        :param port: port (an int is accepted and stored as text)
        :param user: user name
        :param password: password
        :param database: database name
        :param charset: character set
        :return: True once the file has been written
        """
        self._load()
        if dbtype not in self.config:
            self.config.add_section(dbtype)

        settings = (
            ("host", host),
            ("port", port),
            ("user", user),
            ("password", password),
            ("database", database),
            ("charset", charset),
        )
        for key, val in settings:
            # ConfigParser only accepts string values; coerce so that e.g.
            # an integer port does not raise TypeError.
            self.config.set(dbtype, key, str(val))
        self._save()

        # The password is masked so that it never reaches the log.
        self.log.info(
            "Amend the success , Modifying the data %s" %
            [dbtype, host, port, user, self._MASK, database, charset])
        return True

    def get_db_config(self, dbtyep):
        """Return the options stored in section ``dbtyep``.

        :param dbtyep: database type (section name)
        :return: dict of the section's options, with ``port`` converted to
            int when present; None when the section does not exist
        """
        self._load()
        if dbtyep not in self.config:
            self.log.error("Parameter error %s" % dbtyep)
            return None

        option = dict(self.config.items(dbtyep))
        if "port" in option:
            option["port"] = int(option["port"])

        # Log a copy with the password masked; the caller gets the real one.
        loggable = {
            k: (self._MASK if k == "password" else v)
            for k, v in option.items()
        }
        self.log.info("success %s" % loggable)
        return option

    def update_config(self, section, option, value):
        """Change an option that already exists in an existing section.

        :param section: section name
        :param option: key to modify
        :param value: new value (stored as text)
        :return: True when the file was updated; None when the section or
            the option does not exist
        """
        self._load()
        if not self.config.has_section(section):
            self.log.error("Parameter error %s" % section)
            return None
        if not self.config.has_option(section, option):
            self.log.error("Parameter error %s" % option)
            return None

        self.config.set(section, option, str(value))
        self.log.info("Need to be modified")
        self._save()
        self.log.info("Amend the success")
        return True

    def add_config(self, section, option, value):
        """Set ``option`` in ``section``, creating the section if needed.

        :param section: section name
        :param option: key to set
        :param value: value to store (stored as text)
        :return: True once the file has been written
        """
        self._load()
        if section not in self.config.sections():
            self.config.add_section(section)
        self.config.set(section, option, str(value))
        self._save()
        self.log.info("Amend the success")
        return True
_smtpadd = 'smtp.ruifucredit.com'


def sendMail(sender, reciver, subject, content, passwd, smtpadd):
    """Send ``content`` as an HTML e-mail through the SMTP server ``smtpadd``.

    :param sender: sender address; also used as the SMTP login name
    :param reciver: recipient address, or several separated by commas
    :param subject: subject line
    :param content: HTML body
    :param passwd: SMTP password for ``sender``
    :param smtpadd: SMTP server address passed to ``SMTP.connect``
    :raises smtplib.SMTPException: if connecting, logging in or sending fails
    """
    log.info('Start to initialize the mail message.')

    msg = MIMEMultipart('related')
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = reciver
    # The body is attached as an HTML part.
    msg.attach(MIMEText(content, 'html', 'utf-8'))

    # Tolerate "a@x.com, b@x.com": strip blanks and drop empty entries.
    recipients = [addr.strip() for addr in reciver.split(',') if addr.strip()]

    smtp = smtplib.SMTP()
    try:
        smtp.connect(smtpadd)
        smtp.login(sender, passwd)
        smtp.sendmail(sender, recipients, msg.as_string())
        smtp.quit()
    finally:
        # Release the socket even when one of the steps above raised.
        # close() is a no-op after a successful quit().
        smtp.close()


if __name__ == "__main__":
    result = get_info()
    tableinfo = get_table_count()
    html = create_html(result['nameinfo'], result['datainfo'], tableinfo)
    sendMail(_sender_address, _reciver_address, _subject, html, _passwd,
             _smtpadd)
    log.info('Send mail successfully.')
class IpSpider(object):
    """Scrape proxy listings from www.pcdaili.com and store the working ones."""

    # Seconds to wait on any single HTTP request, so that an unresponsive
    # site or proxy cannot block the spider indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, urltype):
        """
        :param urltype: listing to scrape: 0 - domestic high-anonymity
            proxies; 1 - domestic transparent proxies; 2 - domestic HTTPS
            proxies; 3 - foreign high-anonymity proxies
        """
        url_list = {
            0: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=1',
            1: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=2',
            2: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=3',
            3: 'http://www.pcdaili.com/index.php?m=daili&a=free&type=4',
        }
        # None for an unknown urltype; run_spider rejects that explicitly
        # instead of failing on a missing attribute.
        self.url = url_list.get(urltype)
        self.ua = UserAgent()
        self.sqlite = SqliteClient()
        self.sqlite.create_table_sqlite()
        self.log = LogHandler("db")

    def run_spider(self, page):
        """Fetch listing pages 1..``page`` and extract their table cells.

        A page whose request fails is logged and skipped.

        :param page: number of pages to fetch
        :return: list with one entry per fetched page; each entry is the
            list of tuples produced by ``group_list`` for that page
        :raises ValueError: if the spider was created with an unknown urltype
        """
        if self.url is None:
            raise ValueError("IpSpider was created with an unknown urltype")

        iplist = []
        for page_no in range(1, page + 1):
            headers = {'Host': 'www.pcdaili.com', "user-agent": self.ua.chrome}
            sp_url = self.url + "&page=%d" % page_no
            try:
                response = requests.get(sp_url, headers=headers,
                                        timeout=self.REQUEST_TIMEOUT)
            except Exception as e:
                # Best effort: there is no response to parse for this page.
                self.log.error(e)
            else:
                html = etree.HTML(response.text)
                cells = html.xpath(
                    '/html/body/div/div/div[2]/table/tbody/tr/td/text()')
                iplist.append(self.group_list(cells, 7))
            # Pause between requests.
            time.sleep(1)
        self.log.info("spider html ok")
        return iplist

    def group_list(self, grouped, length):
        """Split ``grouped`` into consecutive tuples of ``length`` items.

        The last tuple may be shorter. Only the first 13 tuples are
        returned (presumably the number of rows per listing page - confirm).

        :param grouped: flat list of values
        :param length: group size
        :return: list of tuples
        """
        groups = [
            tuple(grouped[i:i + length])
            for i in range(0, len(grouped), length)
        ]
        return groups[:13]

    def ip_insert_sql(self, ip_list):
        """Validate every scraped proxy and insert the working ones.

        :param ip_list: result of ``run_spider``: a list of pages, each a
            list of row tuples whose fields 0, 1 and 3 are the address,
            the port and the proxy type
        :return: True
        """
        for page_rows in ip_list:
            for row in page_rows:
                if len(row) < 4:
                    # Incomplete row (e.g. a short trailing group).
                    continue
                ip_addr, ip_port, proxy_type = row[0], row[1], row[3]
                ip_proxy = proxy_type + "://" + ip_addr + ":" + ip_port
                if self.validate_ip(type=proxy_type, ip_proxy=ip_proxy):
                    self.sqlite.insert_table_sqlite(
                        ip_addr=ip_addr,
                        ip_port=ip_port,
                        type=proxy_type,
                        ip_proxy=ip_proxy)
        return True

    def validate_ip(self, type, ip_proxy):
        """Check that https://weibo.com/ can be requested with the proxy.

        :param type: proxy type, used as the key of the ``proxies`` mapping
        :param ip_proxy: proxy URL
        :return: True if the request completed without raising, else False
        """
        test_url = "https://weibo.com/"
        # NOTE(review): requests picks a proxy by the scheme of the target
        # URL, so with an https test URL an entry keyed "http" (or an
        # upper-case key) may never be used - confirm the values of `type`.
        proxies = {type: ip_proxy}
        try:
            requests.get(test_url, proxies=proxies,
                         timeout=self.REQUEST_TIMEOUT)
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info(ip_proxy + " is ok !test url is " + test_url)
        return True
class SqliteClient(object):
    """Small wrapper around a SQLite database holding the ``ipdaili`` table."""

    def __init__(self, dbtype='sqlit'):
        """
        :param dbtype: name of the configuration section describing the
            database; its ``path`` and ``dbname`` options locate the file
        :raises ValueError: if the configuration section does not exist
        """
        # Set first so __del__ is safe even if the steps below fail.
        self.conn = None
        self.c = None
        self.log = LogHandler("db")

        db_config = DBConfig().get_db_config(dbtype)
        if db_config is None:
            raise ValueError(
                "no database configuration section %r" % (dbtype,))

        db_dir = os.path.join(
            os.path.dirname(os.path.abspath(CURRENT_PATH)),
            db_config.get('path'))
        db_path = os.path.join(db_dir, db_config.get("dbname"))
        self.log.info("sqlite database path: %s" % db_path)

        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()

    def create_table_sqlite(self):
        """Create the ``ipdaili`` table if it does not exist yet.

        :return: True on success, False if the statement failed
        """
        sql = ("create table if not exists ipdaili("
               "ip_addr TEXT, ip_port TEXT, type TEXT,"
               "ip_proxy TEXT, Downloadtime TEXT)")
        try:
            self.c.execute(sql)
            self.conn.commit()
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info("create success")
        return True

    def insert_table_sqlite(self, ip_addr, ip_port, type, ip_proxy):
        """Insert one proxy row, stamped with the current local time.

        :param ip_addr: IP address
        :param ip_port: port
        :param type: proxy type
        :param ip_proxy: full proxy URL
        :return: True on success, False if the statement failed
        """
        downloadtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            self.c.execute(
                "INSERT INTO ipdaili "
                "(ip_addr,ip_port,type,ip_proxy,Downloadtime) "
                "VALUES (?,?,?,?,?)",
                (ip_addr, ip_port, type, ip_proxy, downloadtime))
            self.conn.commit()
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info("insert success")
        return True

    def search_table_sqlite(self, sql="select * from ipdaili"):
        """Execute ``sql`` and return all rows it produces.

        :param sql: SQL statement to execute
        :return: list of row tuples, or False if the statement failed
        """
        try:
            rows = self.c.execute(sql).fetchall()
            # Kept for callers that pass a data-modifying statement.
            self.conn.commit()
        except Exception as e:
            self.log.error(e)
            return False
        self.log.info("search success")
        return rows

    def __del__(self):
        """Close the connection, if one was ever opened."""
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
class MysqlCline(object):
    """Small wrapper around a MySQL connection holding the ``ipdaili`` table."""

    def __init__(self, dbtype):
        """Open the connection described by configuration section ``dbtype``.

        A connection is only created when ``dbtype`` is ``"mysql"``;
        otherwise ``self.connection`` stays None.

        :param dbtype: database type (configuration section name)
        """
        # Set first so __del__ is safe when no connection gets created.
        self.connection = None
        self.log = LogHandler("db")
        dbconfig = DBConfig().get_db_config(dbtype)
        if dbtype == "mysql":
            self.connection = pymysql.connect(**dbconfig)

    def create_table_mysql(self):
        """Create the ``ipdaili`` table if it does not exist yet.

        :return: True on success, False if the statement failed
        """
        sql = """CREATE TABLE IF NOT EXISTS ipdaili (
            ip_addr varchar(30) DEFAULT NULL,
            ip_port varchar(11) DEFAULT NULL,
            type varchar(10) DEFAULT NULL,
            Downloadtime varchar(30) DEFAULT NULL
        )"""
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql)
            self.connection.commit()
        except Exception as e:
            self.log.error(e)
            return False
        finally:
            # Cleanup only: a return here would mask the False above.
            if cursor is not None:
                cursor.close()
        self.log.info("create success")
        return True

    def insert_table_mysql(self, ip_addr, ip_port, type):
        """Insert one proxy row, stamped with the current local time.

        :param ip_addr: IP address
        :param ip_port: port
        :param type: proxy type
        :return: True on success, False if the statement failed
        """
        downloadtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Values are passed as query parameters so the driver escapes them;
        # they must never be concatenated into the SQL text.
        sql = "INSERT INTO ipdaili VALUES (%s, %s, %s, %s)"
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql, (ip_addr, ip_port, type, downloadtime))
            self.connection.commit()
        except Exception as e:
            self.log.error(e)
            return False
        finally:
            if cursor is not None:
                cursor.close()
        self.log.info("insert success")
        return True

    def search_table_mysql(self, sql="select * from ipdaili"):
        """Execute ``sql`` and return all rows it produces.

        :param sql: SQL query to execute
        :return: the rows from ``cursor.fetchall()``, or False if the
            query failed
        """
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql)
            res = cursor.fetchall()
        except Exception as e:
            self.log.error(e)
            return False
        finally:
            if cursor is not None:
                cursor.close()
        self.log.info("search success")
        return res

    def __del__(self):
        """Close the database connection, if one was ever opened."""
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()