class DoFetchProxy(ProxyManager):
    """One-shot runner that pulls fresh proxies via ProxyManager.fetch()."""

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('fetch_proxy')

    def main(self):
        # Bracket the fetch with markers so a full run is easy to spot in the log.
        self.log.info("***** start fetch proxy *****")
        self.fetch()
        self.log.info("***** finish fetch proxy *****")
class Check(ProxyManager, Thread):
    """Worker thread that drains a queue of serialized proxies, keeping the
    usable ones in storage and deleting the rest."""

    def __init__(self, queue, thread_name):
        ProxyManager.__init__(self)
        Thread.__init__(self, name=thread_name)
        self.queue = queue
        self.log = LogHandler('init_proxy_check')

    def run(self):
        self.log.info('Init Proxy Check - {} : start'.format(self.name))
        while True:
            try:
                raw_key = self.queue.get(block=False)
            except Empty:
                # Queue drained — this worker is finished.
                self.log.info('Init Proxy Check - {} : end'.format(self.name))
                break
            candidate = Proxy.newProxyFromJson(raw_key)
            candidate, ok = check_proxy_useful(candidate)
            if ok:
                self.log.info(
                    'Init Proxy Check - {}: {} validation pass'.format(
                        self.name, candidate.proxy))
                self.client.put(candidate)
            else:
                self.log.info(
                    'Init Proxy Check - {}: {} validation fail'.format(
                        self.name, candidate.proxy))
                self.client.delete(candidate.proxy)
            self.queue.task_done()
class ProxyValidater:
    """Re-validate every proxy in useful_proxy_queue and evict dead ones."""

    def __init__(self):
        self._pm = ProxyManager()
        self.queue = Queue()
        self.proxy_list = None
        self.proxy_dict = dict()
        self.log = LogHandler('proxy_validater')

    def _valid_proxy(self, threads=50):
        """Spin up checker threads over the shared queue and wait for all.

        :param threads: number of ValidateProxy worker threads
        :return: None
        """
        workers = [ValidateProxy(self.queue, self.proxy_dict)
                   for _ in range(threads)]
        for worker in workers:
            worker.daemon = True
            worker.start()
        for worker in workers:
            worker.join()

    def put_queue(self):
        # Snapshot the current useful-proxy table into the work queue and
        # reset each proxy's consecutive-failure counter to zero.
        self._pm.db.change_table(self._pm.useful_proxy_queue)
        self.proxy_list = self._pm.db.get_all()
        for item in self.proxy_list:
            self.queue.put(item)
            self.proxy_dict[item] = 0

    def main(self):
        self.put_queue()
        while True:
            if self.queue.empty():
                # Nothing pending: sleep, then reload the table.
                self.log.info('Valid Complete! sleep 600 sec.')
                time.sleep(600)
                self.put_queue()
            else:
                self.log.info("Start valid useful proxy")
                self._valid_proxy()
class ValidateProxy(Thread):
    """Checker thread: pops proxies off the shared queue and re-validates them.

    `item_dict` maps proxy -> consecutive failure count and is shared with the
    coordinating ProxyValidater.
    """

    def __init__(self, queue, item_dict):
        self._pm = ProxyManager()
        super().__init__()
        # file=False: many threads writing one log file causes problems.
        self.log = LogHandler('validate_proxy', file=False)
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        self._pm.db.change_table(self._pm.useful_proxy_queue)
        while self.queue.qsize():
            proxy = self.queue.get()
            if valid_useful_proxy(proxy):
                # Passed: forget its failure count.
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
                del self.item_dict[proxy]
            else:
                # Failed: bump counter; requeue until FAIL_COUNT misses,
                # then drop from both the counter dict and the database.
                self.item_dict[proxy] += 1
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if self.item_dict[proxy] < FAIL_COUNT:
                    self.queue.put(proxy)
                else:
                    self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                    del self.item_dict[proxy]
                    self._pm.db.delete(proxy)
            self.queue.task_done()
def run_schedule():
    """Run both proxy jobs once immediately, then keep them on fixed intervals."""
    # Prime the pool before the blocking scheduler takes over.
    start_init_proxy()
    start_proxy_check()

    scheduler = BlockingScheduler(logger=LogHandler('schedule_log'))
    scheduler.add_job(start_init_proxy, 'interval', minutes=GETTER_CYCLE,
                      id="start_init_proxy", name="抓取代理初始化验证")
    scheduler.add_job(start_proxy_check, 'interval', minutes=TESTER_CYCLE,
                      id="start_proxy_check", name="代理可用性定时复核")
    scheduler.start()  # blocks until interrupted
def __init__(self):
    # Initialize the ProxyManager base (sets up storage access) plus a
    # dedicated logger for the fetch task.
    ProxyManager.__init__(self)
    self.log = LogHandler('fetch_proxy')
class ProxyManager(object):
    """Facade over the proxy DB: crawls new proxies and exposes CRUD helpers."""

    def __init__(self):
        self.client = db.DBclient()
        self.log = LogHandler('proxy_manager')

    def fetch(self):
        """Run every registered getter and store each new, well-formed proxy."""
        seen = set()
        self.log.info(u'代理抓取: start')
        getters = GetFunctions()
        for getter_name in getters.proxy_get_functions:
            self.log.info('Get Proxy - {}: start'.format(getter_name))
            try:
                for candidate in getattr(GetFreeProxy, getter_name.strip())():
                    candidate = candidate.strip()
                    if not candidate or not verifyProxyFormat(candidate):
                        self.log.error('Get Proxy - {}: {} error'.format(
                            getter_name, candidate))
                    elif candidate in seen:
                        self.log.info('Get Proxy - {}: {} is exist'.format(
                            getter_name, candidate))
                    else:
                        self.log.info('Get Proxy - {}: {} success'.format(
                            getter_name, candidate))
                        self.client.put(Proxy(candidate, source=getter_name))
                        seen.add(candidate)
            except Exception as e:
                # One broken getter must not stop the remaining ones.
                self.log.error('Get Proxy - {}: error'.format(getter_name))
                self.log.error(str(e))

    def get(self):
        """Return one randomly chosen proxy, or None when the pool is empty."""
        pool = self.client.getAll()
        return Proxy.newProxyFromJson(random.choice(pool)) if pool else None

    def getAll(self):
        """Return every stored proxy as a Proxy object."""
        return [Proxy.newProxyFromJson(item) for item in self.client.getAll()]

    def getCount(self):
        """Return proxy counts as reported by the storage client."""
        return self.client.getCount()

    def delete(self, proxy_key):
        """Remove the proxy identified by `proxy_key` from storage."""
        self.client.delete(proxy_key)
import random #当前文件的路径 pwd = os.getcwd() project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..") sys.path.append(project_path) print(project_path) from proxy.proxy_valid import ValidIp from api.rest_api import RestApi from util.util_function import CheckDir, DownloadFile, WriteInfo from util.log_handler import LogHandler from util.config import GetConfig # log = LogHandler('read_csv') log = LogHandler('new_0') api = RestApi() configs = GetConfig() # proxies = ValidIp(True,'http://www.jiayuan.com') proxies = ValidIp(True, 'http://www.jiayuan.com') print(proxies) url_address = 'http://www.jiayuan.com/' #当前文件的路径
import random #当前文件的路径 pwd = os.getcwd() project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..") sys.path.append(project_path) print(project_path) from proxy.proxy import ValidIp from api.rest_api import RestApi from util.util_function import CheckDir, DownloadFile, WriteInfo from util.log_handler import LogHandler from util.config import GetConfig # log = LogHandler('read_csv') log = LogHandler('search_user_photos') api = RestApi() configs = GetConfig() # proxies = ValidIp("local",'http://www.jiayuan.com') proxies = ValidIp("local", 'http://www.jiayuan.com') print(proxies) url_address = 'http://www.jiayuan.com/' #当前文件的路径
class ProxyRefresher:
    """Periodic proxy refresh: crawl new raw proxies and promote usable ones."""

    def __init__(self):
        self._pm = ProxyManager()
        self.log = LogHandler('proxy_refresher')

    def fetch_all_proxy(self):
        """
        fetch proxy into Db by ProxyGetter/get_free_proxy.py
        :return: None
        """
        for getter_name in config.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=getter_name))
                for candidate in getattr(GetFreeProxy, getter_name.strip())(None):
                    # Store directly — the hash-backed table de-duplicates itself.
                    candidate = candidate.strip()
                    if candidate and verify_proxy_format(candidate):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=getter_name, proxy=candidate))
                        self._pm.db.change_table(self._pm.raw_proxy_queue)
                        self._pm.db.put(candidate)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=getter_name, proxy=candidate))
            except Exception as e:
                self.log.error("{func}: fetch proxy fail, {e}".format(
                    func=getter_name, e=e))
                continue

    def validate_raw_proxy(self):
        """Validate each proxy popped from raw_proxy_queue; working ones that
        are not already stored get pushed into useful_proxy_queue.

        :return: None
        """
        self._pm.db.change_table(self._pm.raw_proxy_queue)
        raw_proxy = self._pm.db.pop()
        self.log.info('ProxyRefresher: %s start validProxy' % time.ctime())
        # Known-good proxies, refreshed per iteration, to skip duplicates.
        remaining_proxies = self._pm.get_all()
        while raw_proxy:
            is_new = raw_proxy not in remaining_proxies
            if is_new and valid_useful_proxy(raw_proxy):
                self._pm.db.change_table(self._pm.useful_proxy_queue)
                self._pm.db.put(raw_proxy)
                self.log.info('ProxyRefresher: %s validation pass' % raw_proxy)
            else:
                self.log.info('ProxyRefresher: %s validation fail' % raw_proxy)
            self._pm.db.change_table(self._pm.raw_proxy_queue)
            raw_proxy = self._pm.db.pop()
            remaining_proxies = self._pm.get_all()
        self.log.info('ProxyRefresher: %s validProxy complete' % time.ctime())
def __init__(self):
    # ProxyManager gives DB access; the queue feeds worker threads and
    # proxy_dict maps proxy -> consecutive failure count.
    self._pm = ProxyManager()
    self.queue = Queue()
    self.proxy_list = None
    self.proxy_dict = dict()
    self.log = LogHandler('proxy_validater')
def testLogHandler():
    """
    test function LogHandler in Util/LogHandler
    :return: None
    """
    handler = LogHandler('test')
    handler.error('this is a log from test')

    # resetName should redirect subsequent records to the new logger name.
    handler.resetName(name='test1')
    handler.warning('this is a log from test1')

    handler.resetName(name='test2')
    handler.info('this is a log from test2')
def __init__(self, queue, item_dict):
    self._pm = ProxyManager()
    super().__init__()
    # file=False: multiple threads writing one log file causes problems.
    self.log = LogHandler('validate_proxy', file=False)
    self.queue = queue
    self.item_dict = item_dict
def __init__(self):
    # Storage client for proxies plus a dedicated logger.
    self.client = db.DBclient()
    self.log = LogHandler('proxy_manager')
# -*- coding: utf-8 -*- # !/usr/bin/env python import requests import time, os, sys from lxml import etree from contextlib import closing from util.log_handler import LogHandler from util.web_request import WebRequest sys.path.append('..') from util.log_handler import LogHandler log = LogHandler('photo') # #当前文件的路径 # pwd = os.getcwd() # #当前文件的父路径 # father_path=os.path.abspath(os.path.dirname(pwd)+os.path.sep+".") # #当前文件的前两级目录 # grader_father=os.path.abspath(os.path.dirname(pwd)+os.path.sep+"..") # noinspection PyPep8Naming def robustCrawl(func): def decorate(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e:
def __init__(self):
    # ProxyManager gives DB access; dedicated logger for refresh runs.
    self._pm = ProxyManager()
    self.log = LogHandler('proxy_refresher')
#当前文件的路径 pwd = os.getcwd() project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..") sys.path.append(project_path) from util.config import GetConfig configs = GetConfig() from proxy.proxy_valid import ValidIp from api.rest_api import RestApi from util.util_function import CheckDir, DownloadFile, WriteInfo from util.log_handler import LogHandler log = LogHandler('read_csv') api = RestApi() # proxies = ValidIp('1','http://www.jiayuan.com') proxies = ValidIp(True, 'http://www.jiayuan.com') #当前文件的路径 # csv_path = project_path+'\logs\csv\\' csv_path = project_path + '/logs/csv/' #输出文件夹 out_dir = './download'
from util.log_handler import LogHandler
from util.config import GetConfig

configs = GetConfig()
# Websocket endpoint built from the configured host/port.
host = 'ws://' + str(configs.host_ip) + ':' + str(configs.host_port) + "/cable"

# Python 2/3 compatibility: `thread` was renamed `_thread` in Python 3.
try:
    import thread
except ImportError:
    import _thread as thread
import time

logger = LogHandler('web_socket')
logger.info('this is a log from web_socket')


def on_message(ws, message):
    # Websocket message callback: prints the message type, and logs the full
    # payload for every non-ping message.
    # NOTE(review): assumes `json` is imported elsewhere in this file.
    data = json.loads(message)
    print(data['type'])
    if data['type'] == 'ping':
        print(data['type'])
    else:
        logger.info(data)
# -*- coding: utf-8 -*- ''' ----------------------------------- FileName: check_proxy Description: 验证代理格式 Author: 瓦都尅 Date: 2019/10/30 ----------------------------------- ''' import re from proxy.get_free_proxyip import GetFreeProxy from util.log_handler import LogHandler log = LogHandler('check_proxy', file=False) def verifyProxyFormat(proxy): """ 检查代理格式 """ verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" _proxy = re.findall(verify_regex, proxy) return True if len(_proxy) == 1 and _proxy[0] == proxy else False class CheckProxy(object): @staticmethod def checkAllGetProxyFunc(): """ 检查get_free_proxyip所有代理获取函数运行情况
# from tomorrow import threads import random #当前文件的路径 pwd = os.getcwd() project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..") sys.path.append(project_path) print(project_path) from api.rest_api import RestApi from util.util_function import CheckDir, DownloadFile, WriteInfo from util.log_handler import LogHandler from util.config import GetConfig # log = LogHandler('read_csv') log = LogHandler('test_uid') api = RestApi() def get_uid(data): try: r = api.get_uid(data) return (json.loads(r)["data"]) except Exception as e: log.error("api request fail: %s", format(e)) while True: data = {'need': 20000, 'remark': "im test"}
# print(project_path) from login import GetUserCookie from proxy.proxy_valid import ValidIp from api.rest_api import RestApi from util.util_function import CheckDir, DownloadFile, WriteInfo from util.log_handler import LogHandler from util.config import GetConfig # change: 定义当前爬虫名字 app = "uid5" # change: 每次请求 uid 数量 req_nums = 200 log = LogHandler(app) # 初始化 api = RestApi() configs = GetConfig() url_address = 'http://www.jiayuan.com/' #当前文件的路径 csv_path = project_path + '\logs\csv\\' #输出文件夹 out_dir = './download.new' # cookie = GetUserCookie()
# coding =utf-8 import json, random, sys import requests sys.path.append('..') from util.config import GetConfig from util.log_handler import LogHandler configs = GetConfig() log = LogHandler('proxy') #本地ip proxy_local_host = configs.proxy_local #在线ip https://github.com/jhao104/proxy_pool proxy_online_host = configs.proxy_online # # 1.只调用一个方法,本地和网络均可用 # # #使用本地代理获取ip def GetLocalIp(): r = requests.get(proxy_local_host) ip_ports = json.loads(r.text) num = random.randint(0,10)
def __init__(self, queue, thread_name):
    # Initialize the ProxyManager base (storage access) and the Thread base
    # with a readable name; the queue supplies proxies to check.
    ProxyManager.__init__(self)
    Thread.__init__(self, name=thread_name)
    self.queue = queue
    self.log = LogHandler('init_proxy_check')
# coding =utf-8 import requests import json, sys import random from proxy.proxy import NewProxyIp sys.path.append('..') from util.log_handler import LogHandler log = LogHandler('proxy') loger = LogHandler('proxy_ok') # #2. 获取到代理后判断能否访问网站 # #获取ip,调用NewProxyIp()默认为在线获取,NewProxyIp("1")为本地代理获取 def GenNewIp(local): proxy = NewProxyIp(local) return proxy #验证IP地址是否能进入网站 #ValidIp('1','http://www.jiayuan.com' ) def ValidIp(local=True, valid_host='http://httpbin.org/ip'): #调用获取ip方法 proxy = GenNewIp(local) # print(proxy) retry_count = 20