def main():
    num_worker_threads = UPTO
    pool = Pool(num_worker_threads)
    for n in xrange(1, UPTO):
        pool.apply_async(process, args=(n,))
    pool.join()
    print cnt
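The snippet above leaves `process`, `cnt`, `UPTO`, and the `Pool` import to the surrounding module. A minimal sketch of those assumed pieces, taking the Pool to be `gevent.pool.Pool` as in the other snippets in this collection (the task body and the bound are illustrative):

from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool

UPTO = 100   # assumed upper bound for the job range
cnt = 0      # assumed shared counter that the tasks update

def process(n):
    # placeholder task body; the real process() is not shown in the snippet
    global cnt
    cnt += n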
class DbProxiesCheck(object):
    def __init__(self):
        # object for database operations
        self.mongo_pool = MongoPool()
        # queue of proxy IPs waiting to be checked
        self.queue = Queue()
        # coroutine pool
        self.coroutine_pool = Pool()

    # async callback: resubmits the check task so it keeps running
    def __check_callback(self, temp):
        self.coroutine_pool.apply_async(self.__check_one, callback=self.__check_callback)

    def run(self):
        # core logic for checking the proxy IPs
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            self.queue.put(proxy)
        # start several async tasks
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # the async callback keeps resubmitting this method in a loop
            self.coroutine_pool.apply_async(self.__check_one, callback=self.__check_callback)
        # have the current thread wait until all queued tasks are done
        self.queue.join()

    def __check_one(self):
        # check the availability of one proxy IP
        # take one proxy from the queue
        proxy = self.queue.get()
        checked_proxy = check_proxy(proxy)
        if checked_proxy.speed == -1:
            checked_proxy.score -= 1
            if checked_proxy.score == 0:
                self.mongo_pool.delete(checked_proxy)
            else:
                self.mongo_pool.update(checked_proxy)
        else:
            checked_proxy.score = MAX_SCORE
            self.mongo_pool.update(checked_proxy)
        # call the queue's task_done method (one task finished)
        self.queue.task_done()

    @classmethod
    def start(cls):
        '''
        Class method: check the availability of the proxy IPs in the database
        at the interval (in hours) given in the config file.
        '''
        test = DbProxiesCheck()
        test.run()
        schedule.every(TEST_RUN_INTERVAL).hours.do(test.run)
        while 1:
            schedule.run_pending()
            time.sleep(60)
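The checker above (and the similar tester classes later in this collection) relies on a pattern where each task's completion callback re-submits the same task, so a fixed number of workers keeps draining the queue until `queue.join()` unblocks. A self-contained sketch of just that pattern, with a trivial stand-in task and illustrative counts:

from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
from gevent.queue import JoinableQueue

queue = JoinableQueue()
pool = Pool()

def work_one():
    item = queue.get()
    print('processed', item)
    queue.task_done()

def resubmit(_result):
    # completion callback: submit the worker again so it keeps draining the queue
    pool.apply_async(work_one, callback=resubmit)

for item in range(20):
    queue.put(item)
for _ in range(4):                  # illustrative worker count
    pool.apply_async(work_one, callback=resubmit)
queue.join()                        # returns once every queued item is task_done()'d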
class RunSpider(object):
    def __init__(self):
        # in __init__, set up the database connection and get the collection to operate on
        self.mongo_pool = MongoPool()
        # create the coroutine pool object in __init__
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        # build the list of spider objects from the config file
        # iterate over the spider entries in the config and get each spider's full class name
        for full_class_name in PROXIES_SPIDERS:
            # e.g. core.proxy_spider.proxy_spiders.XiciSpider
            # split into module name and class name
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module by its name
            module = importlib.import_module(module_name)
            # get the class from the module by its name
            cls = getattr(module, class_name)
            # 3. create the spider object
            spider = cls()
            print(spider, "666")
            yield spider

    def run(self):
        # get the list of spider objects from the config file
        spiders = self.get_spider_from_settings()
        # iterate over the spiders and call each spider's get_proxies method to collect IPs
        for spider in spiders:
            # execute this method asynchronously
            # self._execute_one_spider_task(spider)
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider,))
        # call the coroutine pool's join method so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()

    # the code that handles one proxy spider is extracted into its own method
    def _execute_one_spider_task(self, spider):
        try:
            for proxy in spider.get_proxies():
                # print(proxy)
                # validate the proxy IP (proxy check module)
                proxy = check_proxy(proxy)
                # if usable, write it to the database (database module); speed != -1 means usable
                if proxy.speed != -1:
                    # write to the database (database module)
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        # 1. define a start class method
        # 2. create an instance of this class and call run
        rs = RunSpider()
        rs.run()
        # 3. use the schedule module to run this object's run method at a fixed interval
        # the interval (in hours) is added to the config file
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(2)
def _load_dns_servers(self): print('[*] Validate DNS servers ...') self.dns_servers = [] # create a process pool for checking DNS servers, the number is your processors(cores) * 2, just change it! processors = cpu_count() * 2 pool = Pool(processors) # read dns ips and check one by one for server in open('dict/dns_servers.txt').readlines(): server = server.strip() if server: pool.apply_async(self._test_server, (server, )) pool.join() # waiting for process finish self.dns_count = len(self.dns_servers) sys.stdout.write('\n') dns_info = '[+] Found {} available DNS Servers in total'.format( self.dns_count) print(dns_info) if self.dns_count == 0: print('[ERROR] No DNS Servers available.') sys.exit(-1)
class RunSpider(object): def __init__(self): # 创建mongopool对象 self.mongo_pool = MongoPool() # 创建协程池 self.coroutine_pool = Pool() def get_spider_from_settings(self): ''' 根据配置文件获取爬虫对象列表, :return: ''' # 遍历文件爬虫的全类名 for full_class_name in PROXIES_SPIDERS: # 获取模块名和类名 module_name, class_name = full_class_name.rsplit('.', maxsplit=1) # print(full_class_name.rsplit('.', maxsplit=1)) # 根据模块名导入模块 module = importlib.import_module(module_name) # 根据类名,从模块中获取类 cls = getattr(module, class_name) spider = cls() # print(spider) yield spider def run(self): # 根据配置文件获取爬虫对象列表, spiders = self.get_spider_from_settings() for spider in spiders: # 异步调用执行的方法 self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider,)) # 调用协程的join,让当前线程等待 协程的任务完成 self.coroutine_pool.join() def _execute_one_spider_task(self, spider): # 用于处理爬虫的方法 try: # 遍历爬虫对象的方法 for proxy in spider.get_proxies(): # print(proxy) # 检测代理可用性 proxy = check_proxy(proxy) # 如果speed不为-1 就说明可用 if proxy.speed != -1: self.mongo_pool.insert_one(proxy) except Exception as ex: logger.exception(ex) @classmethod def start(cls): rs = RunSpider() rs.run() # 每间隔多长时间进行一次执行 # settings里面配置 schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run) while True: # 检测时间 每隔一秒钟检查一次是否到了时间 schedule.run_pending() time.sleep(1)
def load_testing(worker_directory_path, tender_file_path, worker, config, count, initial_number, tender_id_base, concurency, run_auction=False, start_time=None, time_offset=120, wait_for_result=False): positions = 4 auction_id_template = \ tender_id_base * (32 - positions) + '{{0:0{}d}}'.format(positions) pool = Pool(concurency) for i in xrange(initial_number, count): auction_id = auction_id_template.format(i) pool.apply_async( planning, (worker_directory_path, tender_file_path, worker, auction_id, config, start_time, time_offset, wait_for_result)) if run_auction: pool.apply_async(run, (tender_file_path, worker, auction_id, config, start_time, time_offset, wait_for_result)) pool.wait_available() pool.join()
class RunSpider:
    def __init__(self):
        self.collection = MongoClient()
        self.pool = Pool()

    def get_spiders(self):
        for i in PROXY_SPIDERS:
            # split on the last dot so dotted module paths keep working
            module_name, class_name = i.rsplit('.', maxsplit=1)
            module = importlib.import_module(module_name)
            spider = getattr(module, class_name)
            yield spider()

    def run(self):
        spiders = self.get_spiders()
        for spider in spiders:
            self.pool.apply_async(self.execute_spider, args=(spider, ))
        self.pool.join()

    def execute_spider(self, spider):
        for proxy in spider.get_proxies():
            self.collection.add(proxy)

    @classmethod
    def start(cls):
        rs = cls()
        rs.run()
        # pass the bound method itself; calling it here would run it immediately
        # and hand schedule its return value instead of a callable
        schedule.every(1).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(1800)
def crawl(self,pages,depth=2): self.g = nx.DiGraph() for i in range(depth): newpages = set() pagehtmls = [] pool = Pool(50) for page in pages: pool.apply_async(self.download,args=(page,pagehtmls)) pool.join() for page,html in pagehtmls: if not html: continue soup = BeautifulSoup(html) self.add_to_index(page,soup) links = soup('a') for link in links: if 'href' in dict(link.attrs): url = urljoin(page,link['href']) if url.find("'") != -1: continue url = url.split('#')[0] if url[0:4]=='http' and not self.is_indexed(url): newpages.add(url) linkText = self.get_text_only(link) self.add_linkref(page,url,linkText) pages = newpages self.calculaterpagerank(20)
class RunSpider(object): def __init__(self): self.mongo_pool = MongoPool() self.coroutine_pool = Pool() def get_spider_from_settings(self): for full_class_name in PROXIES_SPIDERS: module_name, class_name = full_class_name.rsplit('.', maxsplit=1) module = importlib.import_module(module_name) cls = getattr(module, class_name) spider = cls() yield spider def run(self): spiders = self.get_spider_from_settings() for spider in spiders: # self.__execute_one_spider_task(spider) self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider, )) self.coroutine_pool.join() def __execute_one_spider_task(self, spider): try: for proxy in spider.get_proxies(): proxy = check_proxy(proxy) if proxy.speed != -1: self.mongo_pool.insert_one(proxy) except Exception as ex: logger.exception(ex)
def export(srcdirname, destination_dir, format='mp3'): """Exports separated music into wav or mp3 Parameters ---------- srcdirname : str Source Directory containing separated files destination_dir: str Destination directory format : {'wav','mp3'} File formats (codecs) """ destination_dir = os.path.join(destination_dir, srcdirname) os.makedirs(destination_dir, exist_ok=True) audio_adapter = get_default_audio_adapter() pool = Pool() for track in ('vocals.wav', 'other.wav', 'bass.wav', 'drums.wav', 'piano.wav'): filepath = os.path.join(directories['tmpsplit'], srcdirname, track) if os.path.exists(filepath): if format == 'wav': shutil.copy2(filepath, destination_dir) else: data = list(audio_adapter.load(filepath)) instrument = track.split('.')[0] pool.apply_async( audio_adapter.save, (os.path.join(destination_dir, f'{instrument}.mp3'), *data, 'mp3', '128k')) #pool.close() pool.join()
class ProxyTester: def __init__(self): self.collection = MongoClient() self.queue = Queue() self.pool = Pool() def run(self): proxies = self.collection.find_all() for proxy in proxies: self.queue.put(proxy) for i in range(10): self.pool.apply_async(self.async_code, callback=self.async_callback) self.queue.join() def async_callback(self, temp): self.pool.apply_async(self.async_code, callback=self.async_callback) def async_code(self): proxy = self.queue.get() proxy = self.check_proxy(proxy) if proxy.protocol == -1: proxy.score -= 1 if proxy.score == 0: self.collection.delete_one(proxy) else: self.collection.update(proxy) else: proxy.score = MAX_SCORE self.collection.update(proxy) self.queue.task_done() def check_proxy(self, proxy): proxies = { 'http':'http://{}:{}'.format(proxy.ip, proxy.port), 'https':'https://{}:{}'.format(proxy.ip, proxy.port) } http_url = 'http://httpbin.org/get' https_url = 'https://httpbin.org/get' try: http_response = requests.get(http_url, proxies=proxies) https_response = requests.get(https_url, proxies=proxies) if http_response.ok and https_response.ok: proxy.protocol = 2 elif http_response.ok: proxy.protocol = 0 elif https_response.ok: proxy.protocol = 1 except Exception: proxy.protocol = -1 return proxy @classmethod def start(cls): proxy_tester = cls() proxy_tester.run() schedule.every(3).minutes.do(proxy_tester.run) while True: schedule.run_pending() time.sleep(30)
def run(self):
    self.f.write("proxy_list = [\n")
    p = Pool(20)
    for i in range(600, 650):
        p.apply_async(self.func, (i,))
    p.join()
    self.f.write("]")
    self.f.close()
class ProxyTester(object):
    def __init__(self):
        # create the mongoPool object for database operations
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)

    def run(self):
        # run method holding the core logic for checking the proxy IPs
        # 2.1 fetch all proxy IPs from the database
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            # self.__check_one_proxy(proxy)
            # put the proxy IP on the queue
            self.queue.put(proxy)
        # 3.5 start several async tasks to check the proxies; the count comes from the config file
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # 3.4 the async callback keeps resubmitting this method in a loop
            self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)
        # have the current thread wait until all queued tasks are done
        self.queue.join()

    def __check_one_proxy(self):
        # 3.3 the code that checks one proxy's availability, extracted into its own method
        # take a proxy IP from the queue and check it
        proxy = self.queue.get()
        # 2.3 check the proxy's availability
        proxy = check_proxy(proxy)
        # if the proxy is unusable, decrease its score by 1
        if proxy.speed == -1:
            proxy.score -= 1
            # if the score drops to 45 or below, delete the proxy from the database
            if proxy.score <= 45:
                self.mongo_pool.delete_one(proxy)
            else:
                # update the proxy IP
                self.mongo_pool.update_one(proxy)
        else:
            # 2.5 if the proxy is usable, restore its score and update it in the database
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        # call the queue's task_done method
        self.queue.task_done()

    @classmethod
    def start(cls):
        proxy_tester = cls()
        proxy_tester.run()
        schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
class ProxyTester(object): def __init__(self): self.queue = Queue() self.pool = Pool() self.proxy_pool = Mysql() def _test_proxy(self): proxy = self.queue.get() try: proxy = check_proxy(proxy) if proxy.speed == -1: proxy.score -= 1 if proxy.score == 0: self.proxy_pool.delete(proxy) logger.info('删除代理:{}'.format(proxy)) else: self.proxy_pool.update_score(proxy) else: proxy.score = settings.MAX_SCORE self.proxy_pool.update_score(proxy) except Exception as ex: logger.exception(ex) self.queue.task_done() def _test_proxy_finish(self, temp): self.pool.apply_async(self._test_proxy, callback=self._test_proxy_finish) def run(self): # 1. 获取所有代理IP proxies = self.proxy_pool.find_all() # 2. 如果代理池为空, 直接返回 if proxies is None: print("代理池为空") return # 获取所有的代理, 放到队列中 for proxy in proxies: self.queue.put(proxy) # 开启多个异步任务执行检查IP的任务 for i in range(settings.TESTER_ANSYC_COUNT): self.pool.apply_async(self._test_proxy, callback=self._test_proxy_finish) # 让主线程等待异步任务完成 self.queue.join() @staticmethod def start(): tester = ProxyTester() tester.run() schedule.every(settings.TESTER_INTERVAL).hours.do(tester.run) while True: schedule.run_pending() time.sleep(1)
class QiushiSpider(object):
    def __init__(self):
        self.urlQueue = Queue()
        self.base_url = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
        }
        self.pool = Pool(5)

    # produce the list of URLs
    def get_url_list(self):
        for page in range(1, 14):
            self.urlQueue.put(self.base_url.format(page))

    def exec_task(self):
        # 1. take a url from urlQueue
        url = self.urlQueue.get()
        # 2. send the request and read the html from the response
        response = requests.get(url, headers=self.headers)
        html = response.text
        # 3. extract the data from the html
        eroot = etree.HTML(html)
        texts = eroot.xpath('//div["recommend-article"]/ul/li/div/a/text()')
        for item in texts:
            # 4. save the data
            print(item)
        self.urlQueue.task_done()

    def exec_task_finished(self, ret):
        # print("task-finished callback")
        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)

    def run(self):
        self.get_url_list()
        for i in range(5):
            # have the pool run the task:
            # 1. the first argument is the function holding the actual task code
            # 2. callback is invoked once the task has finished
            self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
        # block the main thread until the queue is drained
        self.urlQueue.join()
class RunSpider(object):
    def __init__(self):
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        '''
        Create spider objects from the concrete spider list in the config file.
        '''
        for full_class_name in PROXIES_SPIDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module dynamically
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        '''
        Iterate over the spider objects and execute their get_proxies methods.
        '''
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            self.coroutine_pool.apply_async(self.__run_one_spider, args=(spider, ))
        # have the current thread wait until the spiders have finished
        self.coroutine_pool.join()

    def __run_one_spider(self, spider):
        try:
            check_ip_count = 0
            for proxy in spider.get_proxies():
                time.sleep(0.1)
                checked_proxy = check_proxy(proxy)
                check_ip_count += 1
                # check the speed on the checked object returned by check_proxy
                if checked_proxy.speed != -1:
                    self.mongo_pool.insert(checked_proxy)
            logger.info('Spider {} finished crawling and checking {} IPs'.format(spider, check_ip_count))
        except Exception as er:
            logger.exception(er)
            logger.exception("Spider {} raised an error".format(spider))

    @classmethod
    def start(cls):
        '''
        Class method: run the spiders at the interval (in hours) given in the config file.
        '''
        rs = RunSpider()
        rs.run()
        schedule.every(SPIDERS_RUN_INTERVAL).hours.do(rs.run)
        while 1:
            schedule.run_pending()
            time.sleep(60)
def download_all_ts(self, ts_url):
    pool = Pool(size=5)
    for url in ts_url:
        filepath = os.path.join(self.save_path, url[-20:])
        pool.apply_async(urlretrieve, kwds={
            "url": url,
            "filename": filepath
        })
    pool.join()
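`download_all_ts` above leaves its imports implicit. A standalone sketch under the assumption that `urlretrieve` comes from `urllib.request` and that gevent's monkey patching is applied so the blocking downloads actually overlap (all names apart from the apply_async call are illustrative):

from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
from urllib.request import urlretrieve
import os

def download_all(ts_urls, save_path):
    pool = Pool(size=5)
    for url in ts_urls:
        filepath = os.path.join(save_path, url[-20:])
        # keyword arguments for the task are passed through kwds=
        pool.apply_async(urlretrieve, kwds={"url": url, "filename": filepath})
    pool.join()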
def main(): for i in range(2): gpool.apply(delay_func) print(time.time()) gpool.join() print(time.time()) print('----end---?') print(len(gpool)) gpool.apply_async(add, args=[4, 2]) gpool.join()
def run(self, ): url_list = ['www.baidu.com', 'www.110088.com'] for url in url_list: self.request_total(url) total = self.get_total() print("ip总数问为:{}, 总页数为:{}".format(self.total, total)) pool_g = Pool(GEVENT_COUNT) for i in range(1, total + 1): pool_g.apply_async(self.parse, [url, i]) pool_g.join()
class RunSpider(object): def __init__(self): # 创建MongoPool对象 self.mongo_pool = MongoPool() # 3.1 在init方法中创建协程池对象 self.coroutine_pool = Pool() def get_spider_from_settings(self): """根据胚子文件信息, 获取爬虫对象列表""" # 首先遍历配置文件中的爬虫信息, 获取每个爬虫全类名 for full_class_name in PROXIES_SPIDERS: # core.proxy_spider.proxy_spiders.Ip66Spider # 获取模块名和类名,然后根据模块名动态创建类对象 module_name, class_name = full_class_name.rsplit('.', maxsplit=1) # 根据模块名导入模块 module = importlib.import_module(module_name) # 根据模块获取爬虫对象 cls = getattr(module, class_name) # 创建爬虫对象 spider = cls() yield spider def run(self): # 2.1 根据配置文件信息,获取爬虫对象列表 spiders = self.get_spider_from_settings() for spider in spiders: # 2.2 遍历爬虫对象列表, 获取爬虫对象, 遍历爬虫对象的get_proxies方法, 获取代理IP # self._execute_one_spider_task(spiders) # 抽取出的方法使用线程池调度 self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider, )) # 3.4 调用协程的 join方法, 让当前线程等待协程任务的完成 self.coroutine_pool.join() def _execute_one_spider_task(self, spider): try: # 遍历爬虫对象的get_proxies方法, 获取代理IP for proxy in spider.get_proxies(): # 2.3 检测代理IP(代理IP检测模块) proxy = check_proxy(proxy) # 如果速度不为-1, 说明可用 if proxy.speed != -1: # 写入数据库 self.mongo_pool.insert_one(proxy) except Exception as e: logger.exception(e) @classmethod def start(cls): cls().run() schedule.every(SPIDER_TIME_DELAY).hours.do(cls().run) while True: schedule.run_pending() time.sleep(1)
class ProxyTexter(object):
    def __init__(self):
        '''Create the object for database operations.'''
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        # keeps resubmitting the check task in a loop
        self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)

    def run(self):
        '''Core checking logic.'''
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            # check
            # self.__check_one_proxy(proxy)
            # put the proxy on the queue
            self.queue.put(proxy)
        # asynchronous execution
        for i in range(TEXT_PROXIES_AXYNC_COUT):
            # async callback
            self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)
        # have the current thread wait until all queued tasks are done
        self.queue.join()

    def __check_one_proxy(self):
        '''Handle a single proxy.'''
        # take a proxy from the queue
        proxy = self.queue.get()
        proxy = check_proxy(proxy)
        if proxy.speed == -1:
            proxy.score -= 1
            if proxy.score == 0:
                self.mongo_pool.delete_one(proxy)
            else:
                self.mongo_pool.update_one(proxy)
        else:
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        # call the queue's task_done method
        self.queue.task_done()

    @classmethod
    def start(cls):
        '''Run the check automatically at the configured interval.'''
        proxy_tester = cls()
        proxy_tester.run()
        # schedule.every(TEXT_PROXIES_INTERVAL).hour.do(proxy_tester.run)  # check every N hours
        schedule.every(TEXT_PROXIES_INTERVAL).minutes.do(proxy_tester.run)  # check every N minutes
        while True:
            schedule.run_pending()
            time.sleep(1)
class ProxyTester(object): def __init__(self): self.mongo_pool = MongoPool() self.queue = Queue() self.coroutine_pool = Pool() def _check_callback(self, temp): self.coroutine_pool.apply_async(self._check_one_proxy, callback=self._check_callback) def run(self): # 检测所有ip可用性 # 获取数据库的ip proxies = self.mongo_pool.find_all() # 遍历代理ip列表 for proxy in proxies: # 把代理ip添加到队列中 self.queue.put(proxy) # 开启异步检测 for i in range(TEST_PROXIES_ASYNC_COUNT): # 通过异步回调 使用循环不停的执行 self.coroutine_pool.apply_async(self._check_one_proxy, callback=self._check_callback) # 让当前线程,等待队列完成 self.queue.join() def _check_one_proxy(self): # print(proxy) # 检测ip可用性 # 从队列里面获取队列 proxy = self.queue.get() proxy = check_proxy(proxy) # 如果可用 代理分数减掉1 if proxy.speed == -1: proxy.score -= 1 # 判断分数是否为零 if proxy.score == 0: self.mongo_pool.delete_one(proxy) else: # 更新代理ip self.mongo_pool.update_one(proxy) else: # 如果代理可用, 就恢复代理的分数, 更新到数据库中 proxy.score = MAX_SCORE self.mongo_pool.update_one(proxy) self.queue.task_done() @classmethod def start(cls): # 创建对象 proxy_tester = cls() proxy_tester.run() # 每隔一定的时间执行一次 schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run) while True: schedule.run_pending() time.sleep(1)
class RunSpider(object):
    spider_list = [
        'kuaiSpider',
        'jiangxianSpider',
        'xilaSpider',
        'xiaohuanSpider',
        'zhimaSpider',
        'nimaSpider',
        'qiyunSpider',
        'spider89',
    ]
    module_name = 'core.proxy_spider.proxy_spiders'

    def __init__(self, module_name='', spider_list=[]):
        if module_name:
            self.module_name = module_name
        if spider_list:
            self.spider_list = spider_list
        self.mongo_pool = MongoPool()
        # create the coroutine pool
        self.coroutine_pool = Pool()

    def get_spider_cls(self, spider_list, module_name):
        module = importlib.import_module(module_name)
        for spider_name in spider_list:
            spider_cls = getattr(module, spider_name)
            yield spider_cls

    def run_spider(self):
        for spider_cls in self.get_spider_cls(self.spider_list, self.module_name):
            # instantiate the spider class before handing it to the task
            spider = spider_cls()
            # self.__execute_one_spider_task(spider)
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider, ))
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                if proxy.delay != -1:
                    self.mongo_pool.insert_one(proxy)
                    print("New proxy inserted: {}".format(dict(proxy)))
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        rs = RunSpider()
        rs.run_spider()
        schedule.every(RUN_SPIDER_INTERVAL).hours.do(rs.run_spider)
        while True:
            schedule.run_pending()
            time.sleep(30)
def solve_year(year):
    print '%d started' % year
    pages = int(get_pages('http://loan.ppdai.com/blacklist/%d_m0' % year))
    pool = Pool(size=20)
    for i in xrange(pages):
        pool.apply_async(crawl_page, args=(year, i))
    # for j in xrange(1, pages/20):
    #     gevent.joinall([gevent.spawn(crawl_page, year, i) for i in xrange(20*(j-1), 20*j)],)
    # gevent.joinall([gevent.spawn(crawl_page, year, i) for i in xrange(pages-pages%20, pages+1)],)
    pool.join()
    CONNECTION.close()
    print year, 'done'
def load_dns_server():
    pool = Pool(20)
    for dns_server in open('dict/dns_server_list.txt').readlines():
        dns_server = dns_server.strip()
        if dns_server:
            pool.apply_async(detect_dns_server, (dns_server, ))
    pool.join()
    if len(dns_servers) == 0:
        print 'No DNS servers available.'
        sys.exit(1)
    print 'Available DNS servers in total: %d' % (len(dns_servers))
    return dns_servers
class ProxyTester(object): def __init__(self): self.mongo_pool = MongoPool() self.queue = Queue() self.coroutine_pool = Pool() def __check_callback(self, temp): self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback) def run(self): # 提供一个 run 方法,用于处理检测代理IP核心逻辑 # 2.1 从数据库中获取所以代理IP proxies = self.mongo_pool.find_all() # 2.2 遍历代理IP列表 for proxy in proxies: # self.__check_one_proxy(proxy) # 把代理ip添加到队列中 self.queue.put(proxy) for i in range(TEST_PROXIES_ASYNC_COUNT): self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback) self.queue.join() def __check_one_proxy(self): ''' 检查一个代理IP的可用性 ''' proxy = self.queue.get() # 2.3 检测代理可用性 print(proxy) proxy = check_proxy(proxy) if proxy.speed == -1: proxy.score -= 1 if proxy.score == 0: self.mongo_pool.delete_one(proxy) else: # 否则更新该代理ip self.mongo_pool.update_one(proxy) else: # 2.5 如果代理可用,就恢复该代理的分数,更新到数据库中 proxy.score = MAX_SCORE self.mongo_pool.update_one(proxy) self.queue.task_done() @classmethod def start(cls): # 4.2.1 创建本类对象 proxy_tester = cls() proxy_tester.run() schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run) while True: schedule.run_pending() time.sleep(1)
class RunSpider(object): def __init__(self): self.mongo_pool = MongoPool() #创建协程池对象 self.coroutine_pool = Pool() def get_spider_from_settings(self): """根据配置文件信息,获取爬虫对象列表""" #遍历配置文件中爬虫信息,获取每个爬虫全类名 for full_class_name in PROXIES_SPIDERS: #获取模块名和类名 module_name, class_name = full_class_name.rsplit('.', maxsplit=1) #根据模块名,导入模块 module = importlib.import_module(module_name) # #根据类名,从模块中,获取类 cls = getattr(module, class_name) #创建爬虫对象 spider = cls() # print(spider) yield spider def run(self): # 根据配置文件信息,获取爬虫对象列表 spiders = self.get_spider_from_settings() for spider in spiders: #把处理一个代理爬虫的代码抽到一个方法用于处理一个爬虫任务 # self.__execute_one_spider_task(spider) self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider, )) #调用协程的join方法,让当前线程等待协程任务的未完成 self.coroutine_pool.join() def __execute_one_spider_task(self, spider): try: for proxy in spider.get_proxies(): # 检测代理IP(调用检测模块) proxy = check_proxy(proxy) # 如果可用,写入数据库(调用数据库模块,speed不为-1就说明可用) if proxy.speed != -1: self.mongo_pool.insert_one(proxy) # print(proxy) except Exception as ex: logger.exception(ex) @classmethod def start(cls): rs = RunSpider() rs.run() #每间隔多少个小时运行爬虫 schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run) while True: schedule.run_pending() time.sleep(1)
class Worker(ConsumerMixin): def __init__(self, connection): self.connection = connection self.handlers = collections.defaultdict(list) self._loaded_handlers = False self._pool = Pool(5) #pool size def add_handler(self, queue, handler): self.handlers[queue].append(handler) def run(self): if not self._loaded_handlers: from django_kombu.settings import kombu_settings, perform_import for q in kombu_settings.QUEUES: for handler_cls in perform_import(q[2], 'QUEUE'): self.add_handler(q[0], handler_cls()) self._loaded_handlers = True super(Worker, self).run() def get_consumers(self, Consumer, channel): callbacks = [ partial(self.dispatch_message, q.name) for q in task_queues ] return [ Consumer(queues=q, callbacks=[cb]) for q, cb in zip(task_queues, callbacks) ] def dispatch_message(self, queue, *args): for handler in self.handlers[queue]: if handler.match(*args): try: if kombu_settings.GEVENT: self._pool.spawn(lambda : handler.handle(*args)) else: self._pool.apply_async(lambda : handler.handle(*args)) #handler.handle(*args) except: logger.error(traceback.format_exc()) else: logger.info('SUCCESS: %(routing_key)s %(body)s' % dict( body = args[0], routing_key = args[1].delivery_info['routing_key'] )) def on_connection_error(self, exc, interval): logger.error('Broker connection error: %r. Trying again in %s seconds.', exc, interval) def on_decode_error(self, message, exc): logger.error("Can't decode message body: %r (type:%r encoding:%r raw:%r')", exc, message.content_type, message.content_encoding, safe_repr(message.body) )
class RunSpider(object):
    """Start the spiders."""
    def __init__(self):
        '''Create the database object.'''
        self.mongo_pool = MongoPool()
        # create the coroutine pool
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        '''Build the spider list from the config.'''
        for full_class_name in PROXIES_SPDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)  # split once from the right
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # self.__execute_one_spider_task(spider)
            # execute asynchronously
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider,))
        # call join so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        '''The code that handles one proxy spider, extracted into its own method.'''
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # print(proxy)
                # write to the database
                self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        '''Run automatically at the configured interval.'''
        rs = RunSpider()
        rs.run()
        # schedule.every(RUN_SPDERS_INTERVAL).hour.do(rs.run)  # hours
        schedule.every(RUN_SPDERS_INTERVAL).minutes.do(rs.run)  # minutes
        while True:
            schedule.run_pending()
            time.sleep(1)
class GeventExecutor(AbstractExecutor): def __init__(self, task_cls, max_threads, multiple_instances=False): super(GeventExecutor, self).__init__(task_cls) self._max_threads = max_threads self._multiple_instances = multiple_instances if multiple_instances: self._tasks_pool = Queue() for _ in xrange(max_threads): self._tasks_pool.put(task_cls()) else: self._task = task_cls() self._thread_pool = Pool(size=max_threads) def setup_tasks(self): if self._multiple_instances: for task in self._tasks_pool.queue: task.setup() else: self._task.setup() def join(self, timeout=sys.maxint): super(GeventExecutor, self).join() self._thread_pool.join() def available(self): is_it = not self._thread_pool.full() #if not is_it: # gevent.sleep(0) gevent.sleep(0) return is_it def wait_available(self): gevent.sleep(0) self._thread_pool.wait_available() def _run_task(self, run_id): self._thread_pool.apply_async(self._run_on_thread_pool, (run_id,)) #gevent.sleep(0) def _run_on_thread_pool(self, run_id): try: if self._multiple_instances: try: task = self._tasks_pool.get() result = run_task_func_wrapper(task.run, run_id) finally: self._tasks_pool.put(task) else: result = run_task_func_wrapper(self._task.run, run_id) self.on_async_run_finished(result) except: log.debug("DEUUU MEEERDA", exc_info=True)
def _load_dns_servers(self): self.dns_servers = [] pool = Pool(30) for server in open('dict/dns_servers.txt').xreadlines(): server = server.strip() if server: pool.apply_async(self._test_server, (server,)) pool.join() self.dns_count = len(self.dns_servers) sys.stdout.write('\n') if self.dns_count == 0: sys.exit(-1)
def start_pool(size): t1 = datetime.now() pool = Pool(size) while (datetime.now() - t1).seconds <= SECONDS: print 'pool.free_count():', pool.free_count() if pool.free_count() == 0: pool.wait_available() print '<free 1>' pool.apply_async(test_get) print 'Joining............................................' pool.join() t2 = datetime.now() print COUNT, TIMEOUT_CNT print COUNT / (t2-t1).seconds
def load_dns_servers(): dns_servers = [] pool = Pool(5) for server in open('subDomainsBrute/dict/dns_servers.txt').readlines(): server = server.strip() if server and not server.startswith('#'): pool.apply_async(test_server, (server, dns_servers)) pool.join() server_count = len(dns_servers) if server_count == 0: print_msg('[ERROR] No valid DNS Server !', line_feed=True) sys.exit(-1) return dns_servers
def download(self, project_list, directory): if not os.path.exists(directory): os.mkdir(directory) pool = Pool(20) for i in project_list: l = i.split("*") title = l[0] branch_name = l[1] print('downloading ' + title + ' to ' + directory + '/') pool.apply_async(self.git_clone, (title, directory, branch_name)) print('downloading please don\'t stop it') pool.join()
def explore(self, url): """Travel will never stop""" self.visited_urls = set() self.base = urlparse(url) # Limit Pool size to 100 to prevent HTTP timeouts pool = Pool(100) def visit(target, source): if not self.is_invalid(target): for url, source in self.visit(target, source): pool.apply_async(visit, args=[url, source]) pool.apply_async(visit, args=[url, None]) pool.join()
def register(): f = open('fail.txt', 'w') f.close() f = open('success.txt', 'w') f.close() p = Pool(1000) f = open('邮箱.txt', 'r') # emails = f.read().strip().split('\n') emails = list( map(lambda x: x.split('----')[0], f.read().strip().split('\n'))) for i in emails: p.apply_async(reg, args=(i, )) p.join() print('over!')
class WorkerPool(object): def __init__(self): self.pool_size = options.pool_size self.job_pool = Pool(size=self.pool_size) self.result = Queue() self.target_queue = Queue() def add_job(self, job_func, *args, **kwargs): job = self.job_pool.apply_async( job_func, args=args, kwds=kwargs, callback=self._call_func) self.job_pool.add(job) def run(self, timeout=None): self.job_pool.join(timeout=timeout, raise_error=False) def _call_func(self, job_ret): if job_ret: self.result.put(job_ret) def shutdown(self): self.job_pool.kill()
def load_dns_servers(): print_msg('[+] Validate DNS servers', line_feed=True) dns_servers = [] pool = Pool(10) for server in open('dict/dns_servers.txt').readlines(): server = server.strip() if server: pool.apply_async(test_server, (server, dns_servers)) pool.join() dns_count = len(dns_servers) print_msg('\n[+] %s available DNS Servers found in total' % dns_count, line_feed=True) if dns_count == 0: print_msg('[ERROR] No DNS Servers available!', line_feed=True) sys.exit(-1) return dns_servers
def _load_dns_servers(self): print '[+] Validate DNS servers ...' self.dns_servers = [] pool = Pool(30) for server in open('dict/dns_servers.txt').xreadlines(): server = server.strip() if server: pool.apply_async(self._test_server, (server,)) pool.join() self.dns_count = len(self.dns_servers) sys.stdout.write('\n') print '[+] Found %s available DNS Servers in total' % self.dns_count if self.dns_count == 0: print '[ERROR] No DNS Servers available.' sys.exit(-1)
class WorkerPool(object): JOB_UNSTART = 0 # poc not run JOB_RUNNING = 1 JOB_FINISHED = 2 # poc run ok JOB_ERROR = -1 # error encountered when run poc JOB_ABORT = -2 # running poc is abort, viz unfinished def __init__(self, concurrency=10): self.concurrency = concurrency self.jobPool = Pool(size=concurrency) self.errNum = 0 # failed job(run time error but not aborted) self.successNum = 0 self.totalNum = 0 self.results = {} def work(self, iterJobFuncArgs, jobFunc, timeout=None): for jobFuncArgs in iterJobFuncArgs: self.results[hash(str(jobFuncArgs))] = { 'state': self.JOB_UNSTART, 'args': jobFuncArgs, } self.totalNum += 1 self.jobPool.add( self.jobPool.apply_async( self._doJob, args=(jobFunc, jobFuncArgs,), kwds=None, callback=self._cbJobFinished ) ) self.jobPool.join(timeout=timeout, raise_error=False) return self.results def _cbJobFinished(self, jobResult): if jobResult['state'] == self.JOB_ERROR: self.errNum += 1 elif jobResult['state'] == self.JOB_FINISHED: self.successNum += 1 def _doJob(self, jobFunc, jobFuncArgs): self.results[hash(str(jobFuncArgs))]['state'] = self.JOB_RUNNING try: self.results[hash(str(jobFuncArgs))]['jobRet'] = \ jobFunc(*jobFuncArgs) if isinstance(jobFuncArgs, list) \ else jobFunc(jobFuncArgs) self.results[hash(str(jobFuncArgs))]['state'] = self.JOB_FINISHED except Exception as err: self.results[hash(str(jobFuncArgs))]['exception'] = str(err) self.results[hash(str(jobFuncArgs))]['state'] = self.JOB_ERROR return self.results[hash(str(jobFuncArgs))] def handleAbort(self): for jobId in self.results.keys(): if self.results[jobId]['state'] in (self.JOB_RUNNING, self.JOB_UNSTART): self.results[jobId]['state'] = self.JOB_ABORT
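A hypothetical driver for the `WorkerPool` above; the job function and its argument lists are made up for illustration (note `_doJob` only unpacks arguments when they are passed as a list):

def probe(host, port):
    # stand-in job; a real "poc" would do the actual check here
    return '{}:{} ok'.format(host, port)

pool = WorkerPool(concurrency=5)
results = pool.work([['10.0.0.1', 80], ['10.0.0.2', 443]], probe, timeout=30)
for job in results.values():
    print(job['state'], job.get('jobRet'))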
class mainclass(OSISStoreMongo): TTL = 3600 * 24 * 5 # 5 days def __init__(self, *args, **kwargs): super(mainclass, self).__init__(*args, **kwargs) self.pool = Pool(1000) def set_helper(self, session, value, isList=False): if isList: for eco in value: self.set_helper(session, eco) return True db, _ = self._getMongoDB(session) objectindb = db.find_one({"guid": value["guid"]}) if objectindb: objectindb.update(value) value = objectindb noreraise = value.pop('noreraise', False) self.setPreSave(value, session) new = False if objectindb: if noreraise: return value['guid'], new, False db.update({'guid': value['guid']}, {'$inc': {'occurrences': value['occurrences']}, '$set': {'lasttime': value['lasttime'], 'errormessage': value['errormessage'], 'errormessagePub': value['errormessagePub'], 'state': value['state']} }) else: new = True db.save(value) return value['guid'], new, True def set(self, key, value, waitIndex=False, session=None): if isinstance(value, list): self.pool.wait_available() self.pool.apply_async(self.set_helper, (session, value, True)) return None, None, True return self.set_helper(session, value)
class GeventExecutor(PoolOfPoolsMixin, Executor): """ Implementation of Gevent executor fully compatible with :py:class:`concurrent.futures.Executor`. """ # noinspection PyUnusedLocal def __init__(self, *args, **kwargs): super(GeventExecutor, self).__init__() self._max_workers = 100 self.worker_pool = Pool(self._max_workers) def submit(self, fn, *args, **kwargs): future = self.worker_pool.apply_async(fn, args, kwargs) return GreenletFuture(future)
class MultiProcess(object): def __init__(self,task_func,func_name,*args): self.task_func=task_func self.func_name=func_name self.hosts = args self.pool = Pool(3) self.result = [] def execute(self): for h in self.hosts[0]: p = self.pool.apply_async(self.task_func,args=(h,self.func_name)) self.result.append(p) #self.pool.join() return self.get_result() def get_result(self): return self.result #def addcallback(self,func): # return func(self.execute()) def addcallback(self,handler,key): return handler.handle_result(self.execute(),key)
def index(self, activity_ids=None, limit=None, after=None, before=None): def strava2dict(a): return { "id": a.id, "name": a.name, "type": a.type, "summary_polyline": a.map.summary_polyline, "beginTimestamp": a.start_date_local, "total_distance": float(a.distance), "elapsed_time": int(a.elapsed_time.total_seconds()), "average_speed": float(a.average_speed) } dtypes = { "id": "uint32", "type": "category", "total_distance": "float32", "elapsed_time": "uint32", "average_speed": "float16" } if self.indexing(): return [{ "error": "Indexing activities for user {}...<br>Please try again in a few seconds.<br>" .format(self.strava_id) }] ind = cache.get(self.index_key()) if ind: dt_last_indexed, packed = ind activity_index = pd.read_msgpack(packed).astype({"type": str}) elapsed = (datetime.utcnow() - dt_last_indexed).total_seconds() # update the index if we need to if (elapsed > CACHE_INDEX_UPDATE_TIMEOUT) and (not OFFLINE): latest = activity_index.index[0] app.logger.info("updating activity index for {}" .format(self.strava_id)) already_got = set(activity_index.id) try: activities_list = [strava2dict( a) for a in self.client().get_activities(after=latest) if a.id not in already_got] except Exception as e: return [{"error": str(e)}] if activities_list: df = pd.DataFrame(activities_list).set_index( "beginTimestamp") activity_index = ( df.append(activity_index) .drop_duplicates() .sort_index(ascending=False) .astype(dtypes) ) dt_last_indexed = datetime.utcnow() cache.set(self.index_key(), (dt_last_indexed, activity_index.to_msgpack(compress='blosc')), CACHE_INDEX_TIMEOUT) if activity_ids: df = activity_index[activity_index["id"].isin(activity_ids)] else: if limit: df = activity_index.head(limit) else: df = activity_index if after: df = df[:after] if before: df = df[before:] df = df.reset_index() df.beginTimestamp = df.beginTimestamp.astype(str) return df.to_dict("records") # If we got here then the index hasn't been created yet Q = Queue() P = Pool() def async_job(user, limit=None, after=None, before=None): user.indexing(True) activities_list = [] count = 1 try: for a in self.client().get_activities(): d = strava2dict(a) if d.get("summary_polyline"): activities_list.append(d) if (limit or (after and (d["beginTimestamp"] >= after)) or (before and (d["beginTimestamp"] <= before))): d2 = dict(d) d2["beginTimestamp"] = str(d2["beginTimestamp"]) Q.put(d2) app.logger.info("put {} on queue".format(d2["id"])) if limit: limit -= 1 if not limit: Q.put({"stop_rendering": "1"}) else: Q.put({"msg": "indexing...{} activities".format(count)}) count += 1 gevent.sleep(0) except Exception as e: Q.put({"error": str(e)}) else: Q.put({"msg": "done indexing {} activities.".format(count)}) activity_index = (pd.DataFrame(activities_list) .set_index("beginTimestamp") .sort_index(ascending=False) .astype(dtypes)) app.logger.debug("done with indexing for {}".format(self)) dt_last_indexed = datetime.utcnow() packed = activity_index.to_msgpack(compress='blosc') cache.set(self.index_key(), (dt_last_indexed, packed), CACHE_INDEX_TIMEOUT) app.logger.info("cached {}, size={}".format(self.index_key(), len(packed))) finally: user.indexing(False) Q.put(StopIteration) P.apply_async(async_job, [self, limit, after, before]) return Q
def _loop_tasks(logger, redis_server, redis_list_name, pickle_dir, pickle_ext, pickle_corrupt_time, broker_routing_key, broker_exchange, broker_mq_url, swift_auth, swift_user, swift_key): log_normal(logger, {'action': 'uploader-started'}, LOG_INFO) try: if not os.path.exists(pickle_dir): os.mkdir(pickle_dir) except OSError: pass r = redis.Redis(redis_server) def push_redis(): ts_now = time.time() for path in os.listdir(pickle_dir): path = os.path.join(pickle_dir, path) if path.endswith(pickle_ext): ts_file = os.path.getmtime(path) if ts_now - ts_file > pickle_corrupt_time: os.remove(path) else: r.lpush(redis_list_name, path) def start_empty_archieve(): if os.path.exists(PID_FILE): return with open(PID_FILE, 'w') as f: f.write(str(os.getpid())) time.sleep(0.2) with open(PID_FILE, 'r') as f: pid = int(f.read().strip()) if pid == os.getpid(): log_normal(logger, { 'action': 'empty-archieve-starting', 'info': { 'pid': pid } }, LOG_INFO) r.ltrim(redis_list_name, 0, -1) push_redis() log_normal(logger, {'action': 'empty-archieve-done'}, LOG_INFO) os.remove(PID_FILE) if r.llen(redis_list_name) == 0: log_normal(logger, {'action': 'no-task-in-redis-queue'}, LOG_INFO) start_empty_archieve() # Main loop. p = Pool(POOL_SIZE) while process_isalive(os.getppid()): res = r.brpop(redis_list_name, timeout=1) if not res: continue _, pickle_path = res log_normal(logger, { 'action': 'got-redis-task', 'info': { 'pickle_path': pickle_path } }, LOG_INFO) p.wait_available() p.apply_async(do_task, (logger, pickle_path, swift_auth, swift_user, swift_key)) p.join() # Delete pid file if os.path.exists(PID_FILE): try: os.remove(PID_FILE) except OSError: pass log_normal(logger, {'action': 'exit-uploader-process'}, LOG_INFO)
def deploy_tarball_to_s3(tarball_obj, bucket_name, prefix='', region='us-west-2', concurrency=50, no_compress=False, strip_components=0): """ Upload the contents of `tarball_obj`, a File-like object representing a valid .tar.gz file, to the S3 bucket `bucket_name` """ # Connect to S3 and get a reference to the bucket name we will push files to conn = connect_to_region(region) if conn is None: logging.error("Invalid AWS region %s" % region) return try: bucket = conn.get_bucket(bucket_name, validate=True) except boto.exception.S3ResponseError: logging.error("S3 bucket %s does not exist in region %s" % (bucket_name, region)) return # Open the tarball try: with tarfile.open(name=None, mode="r:*", fileobj=tarball_obj) as tarball: files_uploaded = 0 # Parallelize the uploads so they don't take ages pool = Pool(concurrency) # Iterate over the tarball's contents. try: for member in tarball: # Ignore directories, links, devices, fifos, etc. if not member.isfile(): continue # Mimic the behaviour of tar -x --strip-components= stripped_name = member.name.split('/')[strip_components:] if not bool(stripped_name): continue path = os.path.join(prefix, '/'.join(stripped_name)) # Read file data from the tarball fd = tarball.extractfile(member) # Send a job to the pool. pool.wait_available() pool.apply_async(__deploy_asset_to_s3, (fd.read(), path, member.size, bucket, not no_compress)) files_uploaded += 1 # Wait for all transfers to finish pool.join() except KeyboardInterrupt: # Ctrl-C pressed print("Cancelling upload...") pool.join() finally: print("Uploaded %i files" % (files_uploaded)) except tarfile.ReadError: print("Unable to read asset tarfile", file=sys.stderr) return
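A hypothetical invocation of `deploy_tarball_to_s3` above; the file path, bucket name, and prefix are placeholders:

with open('assets.tar.gz', 'rb') as tarball_obj:
    deploy_tarball_to_s3(tarball_obj,
                         'example-static-assets',
                         prefix='releases/v1',
                         concurrency=20,
                         strip_components=1)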
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
import requests

urls = [
    'http://python.org/',
    'http://www.pocketplaylab.com/',
    'http://github.com/'
]

def download(i, url):
    print('No.{}: Downloading: {}'.format(i, url))
    requests.get(url)
    print('No.{}: Done: {}'.format(i, url))

pool = Pool(size=3)
for i, url in enumerate(urls, 1):
    pool.apply_async(download, args=[i, url])
pool.join()
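A small variation on the example above that keeps the greenlets returned by `apply_async`, assuming the same `urls` list and pool, so each task's return value (here the HTTP status code) can be read back after `join()`:

def fetch_status(url):
    return requests.get(url).status_code

jobs = [pool.apply_async(fetch_status, args=[url]) for url in urls]
pool.join()
print([job.get() for job in jobs])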
for key in bucket: #ca.log("Found "+key.name) # if key.name[-1] == "/": # if not key.name[0:-1] in files: # pool.apply_async(worker, args=(region_endpoints['publicURL'], # ca.conf['swiftcontainer'], # ca.creds['token'], # key.name, # ca.conf['s3bucket'], # ca.conf['s3accesskey'].encode('ascii'), # ca.conf['s3secretkey'].encode('ascii'))) if not key.name in files: pool.apply_async(worker, args=(region_endpoints['publicURL'], ca.conf['container'], ca.creds['token'], key.name, ca.conf['s3bucket'], ca.conf['s3accesskey'].encode('ascii'), ca.conf['s3secretkey'].encode('ascii'))) pool.join() if ca.conf.get('emailreport') and copied_files: ca.log("Sending email.") ca.email("Copied "+str(len(copied_files))+" files to "+ca.conf['container'],''' Copied the follow files from S3 bucket %s to Swift container %s: %s ''' % (ca.conf['s3bucket'],ca.conf['container'],"\n".join(copied_files))) ca.log("Done.",'',100)
def multi_execute_command(request): if request.method == 'GET': task_id = time.time().__str__().split(".")[0] group_name = request.GET.getlist('group') host = request.GET.getlist('host') bind_groups = request.user.userprofile.bind_groups.select_related() hosts=[] for obj in bind_groups: for g in group_name: if obj.host_group.name == g: hosts += obj.get_host_ref() command_name = request.GET.get('command') new_hosts=[] if hosts: for h in hosts: new_hosts.append(h.ip_address) new_hosts=list(set(new_hosts+host)) res_list=[] db=get_mongo_conn() if db is None: return HttpResponse("mongo cannot connect") db.adsame.insert({"task_id":task_id}); audit_log = AuditLogTest.objects.create(user=request.user.username,group=",".join(group_name), command_type="cmd",command=command_name, execute_time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), task_status="executing") audit_log.task_id = task_id audit_log.save() db=get_mongo_conn() if db is None: return HttpResponse("mongo cannot connect") db.adsame.insert({"task_id":task_id}) pool = Pool(POOL_SIZE) for h in new_hosts: db.adsame.insert({"task_id":task_id,"host":h,"status":"padding"}) p = pool.apply_async(execute_task,args=(task_id,h,command_name,db)) res_list.append(p) pool.join() result={} r=None for res in res_list: ret={} try: return_result,error,return_code,ip = res.get() if return_code == 0: return_code = 'success' else: return_code = 'failed' ret['result'],ret['error'],ret['status'] = return_result,error,return_code result[ip]=ret except: pass end_time = datetime.datetime.now() audit_log=AuditLogTest.objects.get(task_id=task_id) audit_log.task_status="success" audit_log.finish_time=end_time.strftime('%Y-%m-%d %H:%M:%S') audit_log.result=json.dumps(result) audit_log.save() result['task_id']=task_id return HttpResponse(json.dumps(result),content_type="application/json")
class Worker: def __init__(self, seeds, done_que, run_que): self.showpercounts = 10 self.timeout = 5 self.starttime = time.time() self.oldtime = 0 self.quit = 0 self.https_enable = 0 self.run_que = run_que self.done_que = done_que self.tasks = [] self.done = 1 self.errdone = set() self.err = Error() self.loadstate() self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google', 'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' )) self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv')) self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 60 self.poolmaxfree = 20 self.freecount = 0 self.down_pool = Pool(size=self.poolsize) self.totalnettime = 0 self.cbcputime = 0 self.totaldownsize = 0 self.curspeed = 0 self.debugnosave = 1 self.tt = 1 self.done_sites_fname='done_sites.bin' try: self.bfdone = BloomFilter.open(self.done_sites_fname) except: self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M if self.run_que.qsize() == 0: for seed in seeds: self.run_que.put( seed.split("http://")[1] ) if self.https_enable == 0: self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I) else: self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I) def cb_httpget(self, data = None): if not data: return seed, err, headers, content = data st = time.time() if err: self.handle_error(err,seed) return if self.https_enable == 0: seed = seed[7:] self.bfdone.add(seed) self.done += 1 data={'seed':seed,'headers':headers,'content':content} dat = cPickle.dumps(data) self.done_que.put(dat) et = time.time() self.cbcputime += (et-st) #self.tt=(et-st) if self.done % self.showpercounts == 0: self.out(seed) pass def out(self, seed): spendtime = time.time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else "" now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 ) print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \ (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed ) def work(self): while self.quit == 0: st = time.time() curdone = self.done self.freecount = self.down_pool.free_count() if self.freecount > self.poolmaxfree: self.tasks = [] minlen = min(self.freecount+1,self.run_que.qsize()) #if minlen <=0:break for i in range( minlen): stt = time.time() url = self.run_que.get() ett = time.time() if url in self.bfdone:# 5%-10% continue url = "http://"+url self.tasks.append(url) for url in self.tasks: self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget) time.sleep(0.1) et = time.time() self.curspeed = (self.done - curdone) / (et-st) #self.tt = (et-st) self.down_pool.join() print "All OVER" def handle_error(self,e,url): if e.find('DNSError') > 0 : self.err.dns += 1 self.err.rdns.append(url) elif e.find('reset') > 0 :#Connection reset self.err.reset += 1 self.err.rreset.append(url) elif e.find('Max retries') > 0 or e.find('Connection aborted'): # self.err.conntimeout += 1 self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 self.err.rrefuse.append(url) else: self.err.others +=1 self.err.rothers.append(url) print "Error", url, e # requests is better through test def httpget_requests(self, url): st = time.time() con = "" e = "" res_headers = "" headers = { 
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding':'gzip,deflate', 'Connection':'close', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } res = None try: # todo: query the ip of the website before get through dns req = requests req.max_redirects = 1 res = req.get(url, timeout = (3,2), headers = headers ) if self.https_enable == 0 and res.url.lower().startswith('http:'): if 'content-type' not in res.headers.keys() or 'html' not in res.headers['content-type']: return None con = res.content res.close() except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() return url,e,None,None et = time.time() self.totalnettime += (et-st) self.tt = (et-st) return url, e, res.headers, con def savestate(self): self.quit = 1 now = time.time() self.oldtime += (now - self.starttime) #should hold on the singal for procdata done with open('state.txt','wb') as f: f.write(str(self.oldtime) + '\n') # tasks run_queue done f.write(str(len(self.tasks)) + '\n') for t in self.tasks: f.write(t + '\n') l = self.run_que.qsize() f.write(str(l)+ '\n') while l > 0: f.write( self.run_que.pop() + '\n') l-=1 f.write(str((self.done)) + '\n') with open('err_records.pack','wb') as f: cPickle.dump(self.err,f,2) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully." f.close() exit(0) def loadstate(self): try: with open('state.txt') as f: self.oldtime = float(f.readline()) tasks = int(f.readline()) for i in xrange(tasks): self.run_que.add(f.readline().rstrip('\n')) runnings = int(f.readline()) for i in xrange(runnings): self.run_que.add(f.readline().rstrip('\n')) self.done = int(f.readline()) with open('err_records.pack','rb') as f: self.err = cPickle.load(f) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfuly." except Exception as e: print e
            return True
        return False
    except:
        return False

api = WebAPI(key)
# get the first page of results
res = api.search(filter)
# keep track of how many results we have left
total_results = res['total']
page = 1
list = []
outfile = open('netwave.html', 'w')
length = 0
try:
    while page * 100 <= total_results:
        # Check the matches to see if they fit what we are looking for
        for host in res['matches']:
            ip = ''.join(str(host['ip']))
            port = ''.join(str(host['port']))
            pool.apply_async(checkCam, (ip, port),)
            # pool.join()
        page += 1
        res = api.search(filter, page)
except:
    print 'fail'
class worker: def __init__(self,seeds): self.showpercounts = 50 self.timeout = 10 self.starttime = time.time() self.quit = 0 #self.run_queue = Queue() self.run_queue = daemon.run_que self.done_queue = daemon.done_que self.tasks = [] self.done = 0 self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 300 self.freecount = 0 #self.maxfreecnt = 4 self.down_pool = Pool(size=self.poolsize) #self.mutex = gevent.coros.RLock() self.totalnettime = 0 self.cbcputime = 0 self.totaldownsize = 0 self.curspeed = 0 self.test = 0 self.errcnt = 0 self.bfdone = daemon.bfdone self.size = 0 if self.run_queue.qsize() == 0: for seed in seeds: self.run_queue.put( seed.split("http://")[-1] ) self.urlpatern = re.compile('href=[\"\']http://([^/?#\"\']+)') def cb_httpget(self, data): st = time.time() seed, err, headers, content = data #sself.test += 1 if err or len(content) == 0: self.errcnt += 1 return data={'url':seed,'headers':headers,'content':content} dat = cPickle.dumps(data) self.size = len(content) self.done_queue.put(dat) self.done += 1 #seed.split('http://')[-1] self.bfdone.add(seed) et = time.time() self.cbcputime += (et-st) if self.done % self.showpercounts == 0: t = self.cbcputime/self.done self.out(seed ,(et-st)) def out(self, cururl, cbtime=0 ): spendtime = time.time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else "" now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 ) print "%s D:%-4d R:%-7d SpeedT:%.2f/s SpeedC:%.2f/s Test:%0.2f CB:%0.4f Active:%d Err:%d %s" % (now, (self.done), self.run_queue.qsize(), \ self.done/spendtime,self.curspeed, self.test, cbtime ,self.poolsize-self.freecount, self.errcnt, cururl ) def work(self): while self.quit == 0: curstime = time.time() self.freecount = self.down_pool.free_count() self.tasks = [] if self.freecount == 0: gevent.sleep(0.1) continue st = time.time() xlen = self.freecount lasturl = "" while xlen > 0: xlen -= 1 url = self.run_queue.get() if url == lasturl: continue else: lasturl = url url = "http://"+url if url in self.bfdone: xlen += 1 continue #print xlen, url, self.down_pool.free_count() self.tasks.append(url) self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget) et = time.time() curetime = time.time() #self.curspeed = (self.done - curdone) / (curetime-curstime) self.down_pool.join() print "All OVER" # requests is better than pycurl ? def httpget_requests(self, url): st = time.time() con = "" e = None #'Connection':'close', headers = { 'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding':'gzip,deflate', 'Connection':'close', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } try: # query the ip of the website req = requests #r = requests req.max_redirects = 1 #with gevent.Timeout(5, False) as timeout: res = req.get(url, timeout = self.timeout) if res.url.startswith('https'): raise con = res.content headers = res.headers res.close() except KeyboardInterrupt: raise except Exception as e: et = time.time() return url,e,None,None et = time.time() self.totalnettime += (et-st) self.curspeed = self.totalnettime/(self.done+1) return url, e, headers, con
class Gsh(object): def __init__(self, hosts, command, fork_limit=1, timeout=None, hooks=None): self.hosts = set(hosts) self.command = command self.fork_limit = self._build_fork_limit(fork_limit, len(self.hosts)) self.timeout = timeout # Treat 0 second timeouts as no timeout. if not timeout: self.timeout = None if hooks is None: hooks = [] self.hooks = hooks self._pool = Pool(max(self.fork_limit, 1)) self._greenlets = [] self._remotes = [] self._pre_job_hooks = None self._post_job_hooks = None @staticmethod def _build_fork_limit(fork_limit, num_hosts): if isinstance(fork_limit, int) or fork_limit.isdigit(): return int(fork_limit) if fork_limit.endswith("%"): return int(float(num_hosts) * (float(fork_limit[:-1]) / 100.0)) # If we can't parse your forklimit go serial for safety. return 1 def run_async(self): # Don't start executing until the pre_job hooks have completed. self._pre_job_hooks = gevent.spawn(self._run_pre_job_hooks) self._pre_job_hooks.join() for host in self.hosts: remote_command = RemotePopen(host, self.command, hooks=self.hooks, timeout=self.timeout) self._remotes.append(remote_command) self._greenlets.append(self._pool.apply_async(remote_command.run)) self._post_job_hooks = gevent.spawn(self._run_post_job_hooks) def _run_pre_job_hooks(self): for hook in self.hooks: hook.pre_job(self.command, self.hosts, time.time()) def _run_post_job_hooks(self): # Wait for all greenlets to finish before running these hooks. gevent.joinall(self._greenlets) for hook in self.hooks: hook.post_job(time.time()) def wait(self, timeout=None): rc = 0 gevent.joinall(self._greenlets + [self._post_job_hooks], timeout=timeout, raise_error=True) for remote in self._remotes: if remote.rc: return remote.rc return rc
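A hypothetical driver for the `Gsh` class above; the host names and the command are placeholders:

hosts = ['web01.example.com', 'web02.example.com']
gsh = Gsh(hosts, 'uptime', fork_limit='50%', timeout=30)
gsh.run_async()
rc = gsh.wait()
print('exit status:', rc)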
def uncompress_and_copy(src_bucket, src_key, dst_bucket, dst_keyprefix='', concurrency=50, strip_components=0, extract_dates=False): """Upload the contents of a tarball to the S3 bucket.""" client = boto3.client('s3') tarfile_key = client.get_object(Bucket=src_bucket, Key=src_key) tarball_obj = tarfile_key['Body'] # Open the tarball try: with tarfile.open(name=None, mode="r|*", fileobj=tarball_obj) as tarball: files_uploaded = 0 # Parallelize the uploads so they don't take ages pool = Pool(concurrency) # Iterate over the tarball's contents. try: for member in tarball: # Ignore directories, links, devices, fifos, etc. if not member.isfile(): continue # mimic the behavior of tar -x --strip-components= stripped_name = member.name.split('/')[strip_components:] if not bool(stripped_name): continue # add the date from the filename, if requested if extract_dates: m = re.search(r"\-(\d{4})(\d{2})(\d{2}).tar", src_key) if m: date_key = '-'.join([m.group(1), m.group(2), m.group(3)]) keyprefix = os.path.join(dst_keyprefix, date_key) else: log.warn("Extract_dates requested, but no date found") keyprefix = dst_keyprefix else: keyprefix = dst_keyprefix path = os.path.join(keyprefix, '/'.join(stripped_name)) # Read file data from the tarball fd = tarball.extractfile(member) # Send a job to the pool. pool.wait_available() pool.apply_async(__deploy_asset_to_s3, (fd.read(), member.size, dst_bucket, path)) files_uploaded += 1 # Wait for all transfers to finish pool.join() except KeyboardInterrupt: # Ctrl-C pressed print("Cancelling upload...") pool.join() finally: log.info("Uploaded %i files" % (files_uploaded)) except tarfile.ReadError: print("Unable to read asset tarfile", file=sys.stderr) return return {'source': os.path.join(src_bucket, src_key), 'destination': os.path.join(dst_bucket, keyprefix), 'files_sent': files_uploaded, 'bytes_sent': 0}