class _ThreadFetcher(Thread):
    """Thread that pulls proxies from one ProxyFetcher source into a shared dict.

    The source callable is resolved by name from ``ProxyFetcher`` when the
    thread object is created; ``run`` then records every proxy it yields in
    ``proxy_dict``.
    """

    def __init__(self, fetch_source, proxy_dict):
        """
        :param fetch_source: attribute name looked up on ``ProxyFetcher``
        :param proxy_dict: dict keyed by stripped proxy; updated in place by ``run``
        """
        Thread.__init__(self)
        self.fetch_source = fetch_source
        self.proxy_dict = proxy_dict
        self.fetcher = getattr(ProxyFetcher, fetch_source, None)
        self.log = LogHandler("fetcher")
        self.conf = ConfigHandler()
        self.proxy_handler = ProxyHandler()

    def run(self):
        """Consume the source and merge its proxies into ``self.proxy_dict``.

        A proxy already present gets this source added through ``add_source``;
        a new one is stored as ``Proxy(proxy, source=...)``. Any exception
        raised while iterating is logged and ends the run without propagating.
        """
        self.log.info(
            "ProxyFetch - {func}: start".format(func=self.fetch_source))
        try:
            for raw in self.fetcher():
                self.log.info('ProxyFetch - %s: %s ok'
                              % (self.fetch_source, raw.ljust(23)))
                address = raw.strip()
                if address not in self.proxy_dict:
                    # first time this proxy is seen: create its record
                    self.proxy_dict[address] = Proxy(
                        address, source=self.fetch_source)
                    continue
                self.proxy_dict[address].add_source(self.fetch_source)
        except Exception as err:
            self.log.error(
                "ProxyFetch - {func}: error".format(func=self.fetch_source))
            self.log.error(str(err))
class Fetcher(object):
    """Gather proxies from every configured ProxyFetcher source.

    Each accepted proxy is returned together with the name of the source that
    produced it and an area string looked up in the optional ``qqwry.dat``
    IP database.
    """

    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()
        self.proxy_handler = ProxyHandler()
        self.loadIp()

    def loadIp(self):
        """Load ``qqwry.dat`` from the working directory into ``self.ip``.

        ``self.ip`` becomes a loaded ``QQwry`` instance when the file exists
        and ``False`` otherwise.
        """
        if os.path.isfile("qqwry.dat"):
            self.ip = QQwry()
            self.ip.load_file('qqwry.dat')
        else:
            self.ip = False

    def fetch(self):
        """
        Run every fetcher named in ``self.conf.fetchers`` and collect results.

        A name that is missing on ``ProxyFetcher`` or is not callable is
        logged and skipped. An exception raised while consuming one fetcher is
        logged and the remaining fetchers still run.

        :return: set of ``(proxy, fetch_name, area)`` tuples
        """
        proxy_set = set()
        # proxy_set holds tuples, so duplicates must be tracked by the bare
        # proxy value in a separate set for the "exist" check to ever match.
        seen = set()
        self.log.info("ProxyFetch : start")
        for fetch_name in self.conf.fetchers:
            self.log.info("ProxyFetch - {func}: start".format(func=fetch_name))
            fetcher = getattr(ProxyFetcher, fetch_name, None)
            if not fetcher:
                self.log.error(
                    "ProxyFetch - {func}: class method not exists!".format(
                        func=fetch_name))
                continue
            if not callable(fetcher):
                self.log.error(
                    "ProxyFetch - {func}: must be class method".format(
                        func=fetch_name))
                continue
            try:
                for proxy in fetcher():
                    if proxy in seen:
                        self.log.info('ProxyFetch - %s: %s exist'
                                      % (fetch_name, proxy.ljust(23)))
                        continue
                    self.log.info('ProxyFetch - %s: %s success'
                                  % (fetch_name, proxy.ljust(23)))
                    if not proxy.strip():
                        continue
                    if self.ip:
                        located = self.ip.lookup(proxy.split(':')[0])
                        # a falsy lookup result would make join() raise and
                        # abort this whole source (assumed possible for
                        # unknown addresses -- TODO confirm against QQwry)
                        area = " ".join(located) if located else ''
                    else:
                        # database not loaded: try again so later proxies can
                        # use it if the file has appeared in the meantime
                        self.loadIp()
                        area = ''
                    seen.add(proxy)
                    proxy_set.add((proxy, fetch_name, area))
            except Exception as e:
                self.log.error(
                    "ProxyFetch - {func}: error".format(func=fetch_name))
                self.log.error(str(e))
        self.log.info("ProxyFetch - all complete!")
        return proxy_set
class Fetcher(object):
    """Gather proxies from every configured ProxyFetcher source, one copy per verify tag."""

    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()
        self.proxy_handler = ProxyHandler()

    def fetch(self):
        """
        Run every fetcher named in ``self.conf.fetchers`` and collect results.

        A name that is missing on ``ProxyFetcher`` or is not callable is
        logged and skipped. Proxies are de-duplicated by their ``url``; each
        new proxy is deep-copied once per key of ``VERIFY_URL`` and the copy's
        ``tag`` is set to that key. An exception raised while consuming one
        fetcher is logged and the remaining fetchers still run.

        :return: set of tagged proxy copies
        """
        proxy_set = set()
        url_set = set()
        self.log.info("ProxyFetch : start")
        for fetch_name in self.conf.fetchers:
            self.log.info("ProxyFetch - {func}: start".format(func=fetch_name))
            fetcher = getattr(ProxyFetcher, fetch_name, None)
            if not fetcher:
                self.log.error(
                    "ProxyFetch - {func}: class method not exists!".format(
                        func=fetch_name))
                continue
            if not callable(fetcher):
                self.log.error(
                    "ProxyFetch - {func}: must be class method".format(
                        func=fetch_name))
                continue
            try:
                for proxy in fetcher():
                    if proxy.url in url_set:
                        self.log.info(
                            f'ProxyFetch - {fetch_name}: {proxy.url} exist')
                        continue
                    # remember the url, otherwise the check above never fires
                    url_set.add(proxy.url)
                    # log the fetched proxy itself; the per-tag copy is not
                    # created until the loop below
                    self.log.info(
                        f'ProxyFetch - {fetch_name}: {proxy.url} success')
                    for tag in VERIFY_URL:
                        tagged = deepcopy(proxy)
                        tagged.tag = tag
                        proxy_set.add(tagged)
            except Exception as e:
                self.log.error(
                    "ProxyFetch - {func}: error".format(func=fetch_name))
                self.log.error(str(e))
        self.log.info("ProxyFetch - all complete!")
        return proxy_set
class Fetcher(object):
    """Gather the distinct proxies produced by the configured ProxyFetcher sources."""

    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()
        self.proxy_handler = ProxyHandler()

    def fetch(self):
        """
        Run every fetcher named in ``self.conf.fetchers``.

        Names that are missing on ``ProxyFetcher`` or not callable are logged
        and skipped; an exception inside one fetcher is logged and the next
        fetcher still runs.

        :return: set of the proxies yielded, excluding values that are blank
                 after ``strip()``
        """
        collected = set()
        self.log.info("ProxyFetch : start")
        for source in self.conf.fetchers:
            self.log.info("ProxyFetch - {func}: start".format(func=source))
            producer = getattr(ProxyFetcher, source, None)
            if not producer:
                self.log.error(
                    "ProxyFetch - {func}: class method not exists!".format(
                        func=source))
                continue
            if not callable(producer):
                self.log.error(
                    "ProxyFetch - {func}: must be class method".format(
                        func=source))
                continue
            try:
                for candidate in producer():
                    if candidate in collected:
                        self.log.info('ProxyFetch - %s: %s exist'
                                      % (source, candidate.ljust(23)))
                        continue
                    self.log.info('ProxyFetch - %s: %s success'
                                  % (source, candidate.ljust(23)))
                    # blank entries are logged above but never stored
                    if candidate.strip():
                        collected.add(candidate)
            except Exception as err:
                self.log.error(
                    "ProxyFetch - {func}: error".format(func=source))
                self.log.error(str(err))
        self.log.info("ProxyFetch - all complete!")
        return collected
class Fetcher(object):
    """Run the configured ProxyFetcher sources one after another and yield usable proxies."""

    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()
        self.proxy_handler = ProxyHandler()

    def run(self):
        """
        Generator: fetch from every source, then yield the pre-validated proxies.

        Proxies are keyed by their stripped text. A proxy seen again gets the
        additional source recorded through ``add_source``. Names that are
        missing on ``ProxyFetcher`` or not callable are logged and skipped; an
        exception inside one fetcher is logged and the next one still runs.

        :return: yields each collected ``Proxy`` for which
                 ``DoValidator.preValidator(proxy.proxy)`` is truthy
        """
        pool = {}
        self.log.info("ProxyFetch : start")
        for source in self.conf.fetchers:
            self.log.info("ProxyFetch - {func}: start".format(func=source))
            producer = getattr(ProxyFetcher, source, None)
            if not producer:
                self.log.error(
                    "ProxyFetch - {func}: class method not exists!".format(
                        func=source))
                continue
            if not callable(producer):
                self.log.error(
                    "ProxyFetch - {func}: must be class method".format(
                        func=source))
                continue
            try:
                for raw in producer():
                    self.log.info('ProxyFetch - %s: %s ok'
                                  % (source, raw.ljust(23)))
                    address = raw.strip()
                    if address not in pool:
                        pool[address] = Proxy(address, source=source)
                        continue
                    pool[address].add_source(source)
            except Exception as err:
                self.log.error(
                    "ProxyFetch - {func}: error".format(func=source))
                self.log.error(str(err))
        self.log.info("ProxyFetch - all complete!")
        for record in pool.values():
            if not DoValidator.preValidator(record.proxy):
                continue
            yield record
class Fetcher(object):
    """Run the configured ProxyFetcher sources on worker threads and yield usable proxies."""

    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()

    def run(self):
        """
        Generator: fetch from every source concurrently, then yield the results.

        One ``_ThreadFetcher`` is created per name in ``self.conf.fetchers``
        that resolves to a callable attribute of ``ProxyFetcher``; other names
        are logged and skipped. All threads share one dict, are started as
        daemon threads, and are joined before anything is yielded.

        :return: yields each collected proxy for which
                 ``DoValidator.preValidator(proxy.proxy)`` is truthy
        """
        proxy_dict = dict()
        thread_list = list()
        self.log.info("ProxyFetch : start")

        for fetch_source in self.conf.fetchers:
            self.log.info(
                "ProxyFetch - {func}: start".format(func=fetch_source))
            fetcher = getattr(ProxyFetcher, fetch_source, None)
            if not fetcher:
                self.log.error(
                    "ProxyFetch - {func}: class method not exists!".format(
                        func=fetch_source))
                continue
            if not callable(fetcher):
                self.log.error(
                    "ProxyFetch - {func}: must be class method".format(
                        func=fetch_source))
                continue
            thread_list.append(_ThreadFetcher(fetch_source, proxy_dict))

        for thread in thread_list:
            # the daemon attribute replaces the deprecated setDaemon() call
            thread.daemon = True
            thread.start()

        for thread in thread_list:
            thread.join()

        self.log.info("ProxyFetch - all complete!")
        for proxy in proxy_dict.values():
            if DoValidator.preValidator(proxy.proxy):
                yield proxy
class WebRequest(object):
    """Small wrapper around ``requests.get`` with retries and a random User-Agent.

    The outcome of the last ``get`` call is kept in ``self.response`` and is
    exposed through the ``tree`` and ``text`` properties.
    """

    name = "web_request"

    def __init__(self, *args, **kwargs):
        self.log = LogHandler(self.name, file=False)
        self.response = Response()

    @property
    def user_agent(self):
        """
        return an User-Agent at random
        :return: one entry of the built-in User-Agent list
        """
        ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        ]
        return random.choice(ua_list)

    @property
    def header(self):
        """
        basic header
        :return: new dict of request headers with a randomly chosen User-Agent
        """
        return {
            'User-Agent': self.user_agent,
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }

    def get(self, url, proxies=None, retry_time=3, retry_interval=3,
            timeout=5, *args, **kwargs):
        """
        get method
        :param url: target url
        :param proxies: passed to ``requests.get`` as ``proxies``
        :param retry_time: number of attempts before giving up
        :param retry_interval: seconds to sleep between attempts
        :param timeout: network timeout
        :param args: extra positional arguments forwarded to ``requests.get``
        :param kwargs: extra keyword arguments forwarded to ``requests.get``
        :return: this instance; after the final failed attempt
                 ``self.response`` is a blank ``Response`` with status 500
        """
        while True:
            try:
                self.response = requests.get(url,
                                             proxies=proxies,
                                             headers=self.header,
                                             timeout=timeout,
                                             *args,
                                             **kwargs)
                return self
            except Exception as e:
                self.log.error("requests: %s error: %s" % (url, str(e)))
                retry_time -= 1
                if retry_time <= 0:
                    # Install a fresh failure response rather than stamping
                    # 500 on whatever is currently held: on a reused instance
                    # that could be an earlier successful response whose body
                    # would otherwise still be served by `text`.
                    failed = Response()
                    failed.status_code = 500
                    self.response = failed
                    return self
                self.log.info("retry %s second after" % retry_interval)
                time.sleep(retry_interval)

    @property
    def tree(self):
        """
        :return: ``etree.HTML`` of the response content when the status code
                 is 200, otherwise ``None``
        """
        if self.response.status_code == 200:
            return etree.HTML(self.response.content)
        return None

    @property
    def text(self):
        """
        :return: ``text`` of the last response
        """
        return self.response.text
class WebRequest(object):
    """Small wrapper around ``requests.get`` with retries and a random User-Agent.

    The outcome of the last ``get`` call is kept in ``self.response`` and is
    exposed through the ``tree``, ``text`` and ``json`` properties.
    """

    name = "web_request"

    def __init__(self, *args, **kwargs):
        self.log = LogHandler(self.name, file=False)
        self.response = Response()

    @property
    def user_agent(self):
        """
        return an User-Agent at random
        :return: one entry of the built-in User-Agent list
        """
        ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        ]
        return random.choice(ua_list)

    @property
    def header(self):
        """
        basic header
        :return: new dict of request headers with a randomly chosen User-Agent
        """
        return {
            'User-Agent': self.user_agent,
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    def get(self, url, header=None, retry_time=3, retry_interval=5,
            timeout=5, *args, **kwargs):
        """
        get method
        :param url: target url
        :param header: optional dict merged over the basic headers
        :param retry_time: number of attempts before giving up
        :param retry_interval: seconds to sleep between attempts
        :param timeout: network timeout
        :param args: extra positional arguments forwarded to ``requests.get``
        :param kwargs: extra keyword arguments forwarded to ``requests.get``
        :return: this instance; after the final failed attempt
                 ``self.response`` is a blank ``Response``
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        while True:
            try:
                self.response = requests.get(url,
                                             headers=headers,
                                             timeout=timeout,
                                             *args,
                                             **kwargs)
                return self
            except Exception as e:
                self.log.error("requests: %s error: %s" % (url, str(e)))
                retry_time -= 1
                if retry_time <= 0:
                    # The original built a Response here and discarded it, so
                    # a reused instance kept exposing the previous request's
                    # response. Reset to a blank Response, the same state a
                    # freshly constructed instance has after a failed get.
                    self.response = Response()
                    return self
                self.log.info("retry %s second after" % retry_interval)
                time.sleep(retry_interval)

    @property
    def tree(self):
        """
        :return: ``etree.HTML`` built from the content of the last response
        """
        return etree.HTML(self.response.content)

    @property
    def text(self):
        """
        :return: ``text`` of the last response
        """
        return self.response.text

    @property
    def json(self):
        """
        :return: ``self.response.json()``; if that raises, the error is logged
                 and an empty dict is returned
        """
        try:
            return self.response.json()
        except Exception as e:
            self.log.error(str(e))
            return {}
def testLogHandler():
    """Send one info record and one error record through a ``LogHandler`` named 'test'."""
    handler = LogHandler('test')
    handler.info('this is info')
    handler.error('this is error')
class Checker(Thread):
    """
    Check whether proxies are usable, using multiple threads.

    Each instance drains the shared queue: every item is rebuilt with
    ``Proxy.createFromJson``, run through ``proxyCheck`` and then stored,
    updated or deleted through the proxy handler depending on ``check_type``.
    """

    def __init__(self, check_type, queue, thread_name):
        """
        :param check_type: ``"raw"`` selects the raw-proxy branch; any other
                           value selects the update/delete branch
        :param queue: queue of proxy JSON items; read without blocking
        :param thread_name: name given to the thread
        """
        Thread.__init__(self, name=thread_name)
        self.type = check_type
        self.log = LogHandler("checker")
        self.proxy_handler = ProxyHandler()
        self.queue = queue
        self.conf = ConfigHandler()

    def run(self):
        """Process queue items until the queue reports ``Empty``."""
        self.log.info("ProxyCheck - {} : start".format(self.name))
        while True:
            try:
                payload = self.queue.get(block=False)
            except Empty:
                break

            checked = proxyCheck(Proxy.createFromJson(payload))
            raw_mode = self.type == "raw"
            alive = bool(checked.last_status)

            if raw_mode and alive:
                # raw proxy that passed: store it unless it is already known
                if self.proxy_handler.exists(checked):
                    self.log.info('ProxyCheck - {} : {} exists'.format(
                        self.name, checked.proxy.ljust(23)))
                else:
                    self.log.info('ProxyCheck - {} : {} success'.format(
                        self.name, checked.proxy.ljust(23)))
                    self.proxy_handler.put(checked)
            elif raw_mode:
                self.log.info('ProxyCheck - {} : {} fail'.format(
                    self.name, checked.proxy.ljust(23)))
            elif alive:
                self.log.info('ProxyCheck - {} : {} pass'.format(
                    self.name, checked.proxy.ljust(23)))
                self.proxy_handler.update(checked)
            elif checked.fail_count > self.conf.maxFailCount:
                # failed more often than the configured limit: drop it
                self.log.info(
                    'ProxyCheck - {} : {} fail, count {} delete'.format(
                        self.name, checked.proxy.ljust(23),
                        checked.fail_count))
                self.proxy_handler.delete(checked)
            else:
                self.log.info(
                    'ProxyCheck - {} : {} fail, count {} keep'.format(
                        self.name, checked.proxy.ljust(23),
                        checked.fail_count))
                self.proxy_handler.update(checked)
            self.queue.task_done()
        self.log.info("ProxyCheck - {} : complete".format(self.name))
class _ThreadChecker(Thread):
    """
    Multi-threaded proxy check.

    Each instance drains the shared queue, validates every proxy with
    ``DoValidator.validator`` and then stores or deletes it through the proxy
    handler depending on ``work_type``.
    """

    def __init__(self, work_type, target_queue, thread_name):
        """
        :param work_type: ``"raw"`` selects the raw-proxy handling; any other
                          value selects the in-use handling
        :param target_queue: queue of proxies; read without blocking
        :param thread_name: name given to the thread
        """
        Thread.__init__(self, name=thread_name)
        self.work_type = work_type
        self.log = LogHandler("checker")
        self.proxy_handler = ProxyHandler()
        self.target_queue = target_queue
        self.conf = ConfigHandler()

    def run(self):
        """Process queue items until the queue reports ``Empty``."""
        self.log.info("{}ProxyCheck - {}: start".format(
            self.work_type.title(), self.name))
        while True:
            try:
                item = self.target_queue.get(block=False)
            except Empty:
                break
            validated = DoValidator.validator(item, self.work_type)
            if self.work_type == "raw":
                handle = self.__ifRaw
            else:
                handle = self.__ifUse
            handle(validated)
            self.target_queue.task_done()
        self.log.info("{}ProxyCheck - {}: complete".format(
            self.work_type.title(), self.name))

    def __ifRaw(self, proxy):
        """Store a raw proxy that passed validation unless it already exists."""
        if not proxy.last_status:
            self.log.info('RawProxyCheck - {}: {} fail'.format(
                self.name, proxy.proxy.ljust(23)))
            return
        if self.proxy_handler.exists(proxy):
            self.log.info('RawProxyCheck - {}: {} exist'.format(
                self.name, proxy.proxy.ljust(23)))
            return
        self.log.info('RawProxyCheck - {}: {} pass'.format(
            self.name, proxy.proxy.ljust(23)))
        self.proxy_handler.put(proxy)

    def __ifUse(self, proxy):
        """Re-store a stored proxy, or delete it once it exceeds ``maxFailCount``."""
        if proxy.last_status:
            self.log.info('UseProxyCheck - {}: {} pass'.format(
                self.name, proxy.proxy.ljust(23)))
            self.proxy_handler.put(proxy)
            return
        if proxy.fail_count > self.conf.maxFailCount:
            self.log.info(
                'UseProxyCheck - {}: {} fail, count {} delete'.format(
                    self.name, proxy.proxy.ljust(23), proxy.fail_count))
            self.proxy_handler.delete(proxy)
            return
        self.log.info(
            'UseProxyCheck - {}: {} fail, count {} keep'.format(
                self.name, proxy.proxy.ljust(23), proxy.fail_count))
        self.proxy_handler.put(proxy)