Code Example #1
File: gvnt.py  Project: ThePenguin1140/jabbapylib
def main():
    num_worker_threads = UPTO
    pool = Pool(num_worker_threads)
    for n in xrange(1, UPTO):
        pool.apply_async(process, args=(n,))
    pool.join()
    print cnt
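
The snippet above depends on names defined elsewhere in gvnt.py (UPTO, process, cnt) and uses Python 2 print syntax. A minimal, self-contained Python 3 sketch of the same gevent Pool.apply_async pattern follows; the values and the process body are illustrative stand-ins, not the original code.

# Hedged sketch: assumes gevent is installed; UPTO, process and cnt are placeholders.
import gevent.monkey
gevent.monkey.patch_all()

from gevent.pool import Pool

UPTO = 100   # assumed upper bound
cnt = 0      # shared counter, as in the snippet above

def process(n):
    # placeholder for the original per-number work: just count processed items
    global cnt
    cnt += 1

def main():
    pool = Pool(UPTO)                     # one greenlet slot per task
    for n in range(1, UPTO):
        pool.apply_async(process, args=(n,))
    pool.join()                           # wait for every greenlet to finish
    print(cnt)

if __name__ == '__main__':
    main()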
Code Example #2
File: proxy_test.py  Project: R2h1/ProxyPool
class DbProxiesCheck(object):
    def __init__(self):
        # create the database access object
        self.mongo_pool = MongoPool()
        # queue of proxy IPs waiting to be checked
        self.queue = Queue()
        # coroutine pool
        self.coroutine_pool = Pool()

    # asynchronous callback: once one check finishes, submit the next one
    def __check_callback(self, temp):
        self.coroutine_pool.apply_async(self.__check_one,
                                        callback=self.__check_callback)

    def run(self):
        # core logic for checking the proxy IPs in the database
        proxies = self.mongo_pool.find_all()

        for proxy in proxies:
            self.queue.put(proxy)

        # start several asynchronous check tasks
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # the callback re-submits this method, so each worker loops forever
            self.coroutine_pool.apply_async(self.__check_one,
                                            callback=self.__check_callback)
        # the current thread waits for all queued tasks to finish
        self.queue.join()

    def __check_one(self):
        # check the availability of one proxy IP
        # take one proxy from the queue
        proxy = self.queue.get()

        checked_proxy = check_proxy(proxy)

        if checked_proxy.speed == -1:
            checked_proxy.score -= 1
            if checked_proxy.score == 0:
                self.mongo_pool.delete(checked_proxy)
            else:
                self.mongo_pool.update(checked_proxy)
        else:
            checked_proxy.score = MAX_SCORE
            self.mongo_pool.update(checked_proxy)
        # call the queue's task_done method (one task finished)
        self.queue.task_done()

    @classmethod
    def start(cls):
        '''
        Class method: re-check the availability of the proxies in the database at the
        interval (in hours) configured in the settings file.
        '''
        test = DbProxiesCheck()
        test.run()
        schedule.every(TEST_RUN_INTERVAL).hours.do(test.run)

        while 1:
            schedule.run_pending()
            time.sleep(60)
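
Several proxy testers in this collection (examples #12, #14, #15, #22, #23 and #27 below) use the same gevent idiom as the class above: items are pushed onto a queue, a fixed number of workers are submitted with apply_async, and each worker's callback re-submits it, so the workers keep draining the queue until queue.join() returns. A stripped-down sketch of that wiring follows; the names handle_one, reschedule and WORKER_COUNT are illustrative, not from the original project.

# Hedged sketch of the queue + self-rescheduling callback pattern (assumes gevent).
import gevent.monkey
gevent.monkey.patch_all()

from gevent.pool import Pool
from gevent.queue import JoinableQueue

WORKER_COUNT = 4              # plays the role of TEST_PROXIES_ASYNC_COUNT above
queue = JoinableQueue()
pool = Pool()

def handle_one():
    item = queue.get()        # block until an item is available
    # ... check the item here (e.g. call check_proxy and update the database) ...
    queue.task_done()         # mark one queued task as finished

def reschedule(_result):
    # callback: as soon as one worker finishes, submit the next one
    pool.apply_async(handle_one, callback=reschedule)

def run(items):
    for item in items:
        queue.put(item)
    for _ in range(WORKER_COUNT):
        pool.apply_async(handle_one, callback=reschedule)
    queue.join()              # returns once every queued item has been processed

run(['item-%d' % i for i in range(10)])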
Code Example #3
File: run_spiders.py  Project: qicaiyun/python-test
class RunSpider(object):
    def __init__(self):
        # in __init__, create the database connection and get the collection to operate on
        self.mongo_pool = MongoPool()
        # create the coroutine pool object in __init__
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        # build the list of spider objects from the configuration file
        # iterate over the spider entries in the config, each given as a full class name
        for full_class_name in PROXIES_SPIDERS:
            # e.g. core.proxy_spider.proxy_spiders.XiciSpider
            # split into module name and class name
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module by name
            module = importlib.import_module(module_name)
            # get the class from the module by name
            cls = getattr(module, class_name)
            # create the spider object
            spider = cls()
            print(spider, "666")
            yield spider


    def run(self):
        # get the spider objects from the configuration file
        spiders = self.get_spider_from_settings()
        # iterate over the spiders and call each spider's get_proxies method to collect IPs

        for spider in spiders:
            # run this method asynchronously
            # self._execute_one_spider_task(spider)
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider,))
        # call the pool's join so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()
    # the code for handling one proxy spider is extracted into this method
    def _execute_one_spider_task(self, spider):
        try:
            for proxy in spider.get_proxies():
                # print(proxy)
                # validate the proxy IP (proxy-check module)
                proxy = check_proxy(proxy)
                # if speed is not -1 the proxy is usable, so write it to the database
                if proxy.speed != -1:
                    # write to the database (database module)
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)
    @classmethod
    def start(cls):
        # 1. define a start class method
        # 2. create an instance of this class and call run
        rs = RunSpider()
        rs.run()
        # 3. use the schedule module to re-run the spiders at a fixed interval
        # the interval (in hours) is configured in the settings file
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(2)
Code Example #4
    def _load_dns_servers(self):
        print('[*] Validate DNS servers ...')
        self.dns_servers = []

        # create a process pool for checking DNS servers, the number is your processors(cores) * 2, just change it!
        processors = cpu_count() * 2
        pool = Pool(processors)

        # read dns ips and check one by one
        for server in open('dict/dns_servers.txt').readlines():
            server = server.strip()
            if server:
                pool.apply_async(self._test_server, (server, ))

        pool.join()  # waiting for process finish
        self.dns_count = len(self.dns_servers)

        sys.stdout.write('\n')
        dns_info = '[+] Found {} available DNS Servers in total'.format(
            self.dns_count)
        print(dns_info)

        if self.dns_count == 0:
            print('[ERROR] No DNS Servers available.')
            sys.exit(-1)
Code Example #5
class RunSpider(object):
    def __init__(self):
        # create the MongoPool object
        self.mongo_pool = MongoPool()
        # create the coroutine pool
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        '''
        Build the list of spider objects from the configuration file.
        :return:
        '''
        # iterate over the spiders' full class names in the config
        for full_class_name in PROXIES_SPIDERS:
            # split into module name and class name
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # print(full_class_name.rsplit('.', maxsplit=1))
            # import the module by name
            module = importlib.import_module(module_name)
            # get the class from the module by name
            cls = getattr(module, class_name)
            spider = cls()
            # print(spider)
            yield spider

    def run(self):
        # get the spider objects from the configuration file
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # execute the task asynchronously
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider,))
        # call the pool's join so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        # method for handling one spider
        try:
            # iterate over the proxies returned by the spider
            for proxy in spider.get_proxies():
                # print(proxy)
                # check whether the proxy is usable
                proxy = check_proxy(proxy)
                # if speed is not -1 the proxy is usable
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        rs = RunSpider()
        rs.run()

        # re-run at a fixed interval
        # the interval is configured in settings
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            # check once a second whether a scheduled run is due
            schedule.run_pending()
            time.sleep(1)
Code Example #6
def load_testing(worker_directory_path,
                 tender_file_path,
                 worker,
                 config,
                 count,
                 initial_number,
                 tender_id_base,
                 concurency,
                 run_auction=False,
                 start_time=None,
                 time_offset=120,
                 wait_for_result=False):
    positions = 4

    auction_id_template = \
        tender_id_base * (32 - positions) + '{{0:0{}d}}'.format(positions)

    pool = Pool(concurency)
    for i in xrange(initial_number, count):
        auction_id = auction_id_template.format(i)
        pool.apply_async(
            planning,
            (worker_directory_path, tender_file_path, worker, auction_id,
             config, start_time, time_offset, wait_for_result))
        if run_auction:
            pool.apply_async(run,
                             (tender_file_path, worker, auction_id, config,
                              start_time, time_offset, wait_for_result))
        pool.wait_available()
    pool.join()
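
pool.wait_available() in the loop above is gevent's backpressure hook: it blocks the submitting greenlet until the pool has a free slot, so a bounded pool is never flooded with pending jobs (examples #33, #46 and #47 rely on the same call). A minimal illustration follows; do_work is an assumed stand-in for the planning/run jobs.

# Hedged sketch of bounded submission with wait_available (assumes gevent).
from gevent.pool import Pool

def do_work(i):
    # stand-in for the real job submitted above
    pass

pool = Pool(10)               # at most 10 greenlets run at once
for i in range(1000):
    pool.wait_available()     # block until the pool has a free slot
    pool.apply_async(do_work, (i,))
pool.join()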
Code Example #7
class RunSpider:
    def __init__(self):
        self.collection = MongoClient()
        self.pool = Pool()

    def get_spiders(self):
        for i in PROXY_SPIDERS:
            module_name, class_name = i.split('.')
            module = importlib.import_module(module_name)
            spider = getattr(module, class_name)
            yield spider()

    def run(self):
        spiders = self.get_spiders()
        for spider in spiders:
            self.pool.apply_async(self.execute_spider, args=(spider, ))
        self.pool.join()

    def execute_spider(self, spider):
        for proxy in spider.get_proxies():
            self.collection.add(proxy)

    @classmethod
    def start(cls):
        rs = cls()
        rs.run()
        schedule.every(1).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(1800)
Code Example #8
 def crawl(self,pages,depth=2):
    self.g = nx.DiGraph()
    for i in range(depth):
       newpages = set()
       pagehtmls = []
       pool = Pool(50)
       for page in pages:
          pool.apply_async(self.download,args=(page,pagehtmls))
       pool.join()
       for page,html in pagehtmls:
          if not html:
             continue
          soup = BeautifulSoup(html)
          self.add_to_index(page,soup)
          links = soup('a')
          for link in links:
             if 'href' in dict(link.attrs):
                url = urljoin(page,link['href'])
                if url.find("'") != -1:
                   continue
                url = url.split('#')[0]
                if url[0:4]=='http' and not self.is_indexed(url):
                   newpages.add(url)
                linkText = self.get_text_only(link)
                self.add_linkref(page,url,linkText)
          pages = newpages
    self.calculaterpagerank(20)
Code Example #9
File: gvnt.py  Project: the7day/jabbapylib
def main():
    num_worker_threads = UPTO
    pool = Pool(num_worker_threads)
    for n in xrange(1, UPTO):
        pool.apply_async(process, args=(n, ))
    pool.join()
    print cnt
Code Example #10
class RunSpider(object):
    def __init__(self):
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        for full_class_name in PROXIES_SPIDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # self.__execute_one_spider_task(spider)
            self.coroutine_pool.apply_async(self.__execute_one_spider_task,
                                            args=(spider, ))

        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)
Code Example #11
def export(srcdirname, destination_dir, format='mp3'):
    """Exports separated music into wav or mp3

    Parameters
    ----------
    srcdirname : str
        Source Directory containing separated files
    destination_dir: str
        Destination directory
    format : {'wav','mp3'}
        File formats (codecs)
    """
    destination_dir = os.path.join(destination_dir, srcdirname)
    os.makedirs(destination_dir, exist_ok=True)
    audio_adapter = get_default_audio_adapter()
    pool = Pool()
    for track in ('vocals.wav', 'other.wav', 'bass.wav', 'drums.wav',
                  'piano.wav'):
        filepath = os.path.join(directories['tmpsplit'], srcdirname, track)
        if os.path.exists(filepath):
            if format == 'wav':
                shutil.copy2(filepath, destination_dir)
            else:
                data = list(audio_adapter.load(filepath))
                instrument = track.split('.')[0]
                pool.apply_async(
                    audio_adapter.save,
                    (os.path.join(destination_dir,
                                  f'{instrument}.mp3'), *data, 'mp3', '128k'))

    #pool.close()
    pool.join()
Code Example #12
class ProxyTester:
    def __init__(self):
        self.collection = MongoClient()
        self.queue = Queue()
        self.pool = Pool()

    def run(self):
        proxies = self.collection.find_all()
        for proxy in proxies:
            self.queue.put(proxy)
        for i in range(10):
            self.pool.apply_async(self.async_code, callback=self.async_callback)
        self.queue.join()

    def async_callback(self, temp):
        self.pool.apply_async(self.async_code, callback=self.async_callback)

    def async_code(self):
        proxy = self.queue.get()
        proxy = self.check_proxy(proxy)
        if proxy.protocol == -1:
            proxy.score -= 1
            if proxy.score == 0:
                self.collection.delete_one(proxy)
            else:
                self.collection.update(proxy)
        else:
            proxy.score = MAX_SCORE
            self.collection.update(proxy)
        self.queue.task_done()

    def check_proxy(self, proxy):
        proxies = {
            'http':'http://{}:{}'.format(proxy.ip, proxy.port),
            'https':'https://{}:{}'.format(proxy.ip, proxy.port)
        }
        http_url = 'http://httpbin.org/get'
        https_url = 'https://httpbin.org/get'
        try:
            http_response = requests.get(http_url, proxies=proxies)
            https_response = requests.get(https_url, proxies=proxies)
            if http_response.ok and https_response.ok:
                proxy.protocol = 2
            elif http_response.ok:
                proxy.protocol = 0
            elif https_response.ok:
                proxy.protocol = 1
        except Exception:
            proxy.protocol = -1
        return proxy

    @classmethod
    def start(cls):
        proxy_tester = cls()
        proxy_tester.run()
        schedule.every(3).minutes.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(30)
Code Example #13
File: proxy.py  Project: TinSyner/github_spider
 def run(self):
     self.f.write("proxy_list = [\n")
     p = Pool(20)
     for i in range(600, 650):
         p.apply_async(self.func, (i,))
     p.join()
     self.f.write("]")
     self.f.close()
Code Example #14
class ProxyTester(object):

    def __init__(self):
        # create the MongoPool object for database access
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self,temp):
        self.coroutine_pool.apply_async(self.__check_one_proxy,callback=self.__check_callback)

    def run(self):
        # run() holds the core logic for checking proxy IPs
        # 2.1 get all proxy IPs from the database
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            # self.__check_one_proxy(proxy)
            # put each proxy IP on the queue
            self.queue.put(proxy)
        # 3.5 start several asynchronous check tasks; the count comes from the configuration file
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # 3.4 the asynchronous callback keeps re-submitting this method in a loop
            self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)

        # let the current thread wait until the queued tasks are done
        self.queue.join()

    def __check_one_proxy(self):
        # 3.3 the code for checking one proxy's availability is extracted into this method
        # take a proxy IP from the queue and check it
        proxy = self.queue.get()

        # 2.3 check the proxy's availability
        proxy = check_proxy(proxy)
        # if the proxy is unusable, decrement its score
        if proxy.speed == -1:
            proxy.score -= 1
            # if the score drops to the deletion threshold, remove the proxy from the database
            if proxy.score <= 45:
                self.mongo_pool.delete_one(proxy)
            else:
                # otherwise just update the proxy record
                self.mongo_pool.update_one(proxy)
        else:
            # 2.5 if the proxy is usable, restore its score and update the database
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        # call the queue's task_done method
        self.queue.task_done()

    @classmethod
    def start(cls):
        proxy_tester = cls()
        proxy_tester.run()
        schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #15
class ProxyTester(object):
    def __init__(self):
        self.queue = Queue()
        self.pool = Pool()
        self.proxy_pool = Mysql()

    def _test_proxy(self):
        proxy = self.queue.get()
        try:
            proxy = check_proxy(proxy)
            if proxy.speed == -1:
                proxy.score -= 1
                if proxy.score == 0:
                    self.proxy_pool.delete(proxy)
                    logger.info('Deleted proxy: {}'.format(proxy))
                else:
                    self.proxy_pool.update_score(proxy)
            else:
                proxy.score = settings.MAX_SCORE
                self.proxy_pool.update_score(proxy)

        except Exception as ex:
            logger.exception(ex)

        self.queue.task_done()

    def _test_proxy_finish(self, temp):
        self.pool.apply_async(self._test_proxy,
                              callback=self._test_proxy_finish)

    def run(self):
        # 1. get all proxy IPs
        proxies = self.proxy_pool.find_all()
        # 2. if the proxy pool is empty, return immediately
        if proxies is None:
            print("The proxy pool is empty")
            return

        # put every proxy on the queue
        for proxy in proxies:
            self.queue.put(proxy)

        # start several asynchronous tasks that check the IPs
        for i in range(settings.TESTER_ANSYC_COUNT):
            self.pool.apply_async(self._test_proxy,
                                  callback=self._test_proxy_finish)

        # let the main thread wait until the asynchronous tasks are done
        self.queue.join()

    @staticmethod
    def start():
        tester = ProxyTester()
        tester.run()
        schedule.every(settings.TESTER_INTERVAL).hours.do(tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #16
class QiushiSpider(object):
    def __init__(self):
        self.urlQueue = Queue()
        self.base_url = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
        }

        self.pool = Pool(5)

        pass

    # produce the list of URLs to crawl
    def get_url_list(self):
        for page in range(1, 14):
            self.urlQueue.put(self.base_url.format(page))

    def exec_task(self):

        # 1. take a URL from urlQueue
        url = self.urlQueue.get()

        # 2. send the request and extract the HTML from the response
        response = requests.get(url, headers=self.headers)

        html = response.text

        # 3. extract data from the HTML
        eroot = etree.HTML(html)
        texts = eroot.xpath('//div["recommend-article"]/ul/li/div/a/text()')
        for item in texts:
            # 4. save the data
            print(item)

        self.urlQueue.task_done()
        pass

    def exec_task_finished(self, ret):
        # print("task-finished callback")
        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
        pass

    def run(self):

        self.get_url_list()

        for i in range(5):
            # have the pool execute the task
            # 1. the first argument is the function containing the task code
            # 2. callback is called once the task has finished
            self.pool.apply_async(self.exec_task,
                                  callback=self.exec_task_finished)

        # block the main thread until the queue is drained
        self.urlQueue.join()
        pass
Code Example #17
class RunSpider(object):
    def __init__(self):

        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        '''
        Create spider objects from the spider list in the configuration file.
        '''
        for full_class_name in PROXIES_SPIDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module dynamically
            module = importlib.import_module(module_name)

            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        '''
        Iterate over the spider objects and execute their get_proxies methods.
        '''
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            self.coroutine_pool.apply_async(self.__run_one_spider,
                                            args=(spider, ))
        # the current thread waits until the spiders have finished
        self.coroutine_pool.join()

    def __run_one_spider(self, spider):
        try:
            check_ip_count = 0
            for proxy in spider.get_proxies():
                time.sleep(0.1)
                checked_proxy = check_proxy(proxy)
                check_ip_count += 1
                if proxy.speed != -1:
                    self.mongo_pool.insert(checked_proxy)
            logger.info('Spider {} crawled and checked {} IPs'.format(spider, check_ip_count))

        except Exception as er:
            logger.exception(er)
            logger.exception("Spider {} raised an error".format(spider))

    @classmethod
    def start(cls):
        '''
        Class method: re-run the spiders at the interval (in hours) given in the configuration file.
        '''
        rs = RunSpider()
        rs.run()
        schedule.every(SPIDERS_RUN_INTERVAL).hours.do(rs.run)

        while 1:
            schedule.run_pending()
            time.sleep(60)
Code Example #18
 def download_all_ts(self, ts_url):
     pool = Pool(size=5)
     for url in ts_url:
         filepath = os.path.join(self.save_path, url[-20:])
         pool.apply_async(urlretrieve,
                          kwds={
                              "url": url,
                              "filename": filepath
                          })
     pool.join()
Code Example #19
File: gpool.py  Project: echoocking/DontForget
def main():
    for i in range(2):
        gpool.apply(delay_func)
    print(time.time())
    gpool.join()
    print(time.time())
    print('----end---?')
    print(len(gpool))
    gpool.apply_async(add, args=[4, 2])
    gpool.join()
Code Example #20
 def run(self, ):
     url_list = ['www.baidu.com', 'www.110088.com']
     for url in url_list:
         self.request_total(url)
         total = self.get_total()
         print("Total IPs: {}, total pages: {}".format(self.total, total))
         pool_g = Pool(GEVENT_COUNT)
         for i in range(1, total + 1):
             pool_g.apply_async(self.parse, [url, i])
         pool_g.join()
Code Example #21
File: run_spider.py  Project: hzt045858/IPProxyPool
class RunSpider(object):
    def __init__(self):
        # create the MongoPool object
        self.mongo_pool = MongoPool()
        # 3.1 create the coroutine pool object in __init__
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Build the list of spider objects from the configuration file."""
        # iterate over the spider entries in the config and get each spider's full class name
        for full_class_name in PROXIES_SPIDERS:
            # e.g. core.proxy_spider.proxy_spiders.Ip66Spider
            # split into module name and class name, then create the class object dynamically
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module by name
            module = importlib.import_module(module_name)
            # get the spider class from the module
            cls = getattr(module, class_name)
            # create the spider object
            spider = cls()
            yield spider

    def run(self):

        # 2.1 get the spider objects from the configuration file
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # 2.2 for each spider, call its get_proxies method to collect proxy IPs
            # self._execute_one_spider_task(spiders)
            # the extracted method is scheduled on the coroutine pool
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider, ))
        # 3.4 call the pool's join so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        try:
            # iterate over the proxies returned by the spider's get_proxies method
            for proxy in spider.get_proxies():
                # 2.3 check the proxy IP (proxy-check module)
                proxy = check_proxy(proxy)
                # if speed is not -1 the proxy is usable
                if proxy.speed != -1:
                    # write it to the database
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            logger.exception(e)

    @classmethod
    def start(cls):
        cls().run()

        schedule.every(SPIDER_TIME_DELAY).hours.do(cls().run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #22
File: proxy_text.py  Project: zhou5284/IPProxyPool
class ProxyTexter(object):

    def __init__(self):
        '''Create the database access object.'''
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_proxy = Pool()

    def __check_callback(self, temp):
        # the callback re-submits the check, so the worker loops forever
        self.coroutine_proxy.apply_async(self.__check_noe_proxy, callback=self.__check_callback)

    def run(self):
        '''Core logic for checking the proxies.'''
        proxies = self.mongo_pool.find_all()

        for proxy in proxies:
            # check
            # self.__check_noe_proxy(proxy)
            # put the proxy on the queue
            self.queue.put(proxy)
        # asynchronous tasks
        for i in range(TEXT_PROXIES_AXYNC_COUT):
            # asynchronous callback
            self.coroutine_proxy.apply_async(self.__check_noe_proxy, callback=self.__check_callback)
        # let the current thread wait until the queued tasks are done
        self.queue.join()

    def __check_noe_proxy(self):
        '''Check a single proxy.'''
        # take a proxy from the queue
        proxy = self.queue.get()
        proxy = check_proxy(proxy)
        if proxy.speed == -1:
            proxy.score -= 1
            if proxy.score == 0:
                self.mongo_pool.delete_one(proxy)
            else:
                self.mongo_pool.update_one(proxy)
        else:
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        # call the queue's task_done method
        self.queue.task_done()

    @classmethod
    def start(cls):
        '''Run automatically at the configured interval.'''
        proxy_tester = cls()
        proxy_tester.run()
        # schedule.every(TEXT_PROXIES_INTERVAL).hours.do(proxy_tester.run)  # re-check every N hours
        schedule.every(TEXT_PROXIES_INTERVAL).minutes.do(proxy_tester.run)  # re-check every N minutes
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #23
class ProxyTester(object):
    def __init__(self):
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def _check_callback(self, temp):
        self.coroutine_pool.apply_async(self._check_one_proxy, callback=self._check_callback)

    def run(self):
        # check the availability of all IPs
        # get the IPs from the database
        proxies = self.mongo_pool.find_all()
        # iterate over the proxy list
        for proxy in proxies:
            # put each proxy IP on the queue
            self.queue.put(proxy)
        # start the asynchronous checks
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # the asynchronous callback keeps re-submitting this method in a loop
            self.coroutine_pool.apply_async(self._check_one_proxy, callback=self._check_callback)
        # let the current thread wait until the queue is drained
        self.queue.join()

    def _check_one_proxy(self):
        # print(proxy)
        # check the availability of one IP
        # take a proxy from the queue
        proxy = self.queue.get()
        proxy = check_proxy(proxy)
        # if the proxy is unusable, decrement its score
        if proxy.speed == -1:
            proxy.score -= 1
            # check whether the score has dropped to zero
            if proxy.score == 0:
                self.mongo_pool.delete_one(proxy)
            else:
                # update the proxy IP
                self.mongo_pool.update_one(proxy)
        else:
            # if the proxy is usable, restore its score and update the database
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        self.queue.task_done()

    @classmethod
    def start(cls):
        # create the tester object
        proxy_tester = cls()
        proxy_tester.run()
        # re-run at a fixed interval
        schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #24
class RunSpider(object):

    spider_list = [
        'kuaiSpider',
        'jiangxianSpider',
        'xilaSpider',
        'xiaohuanSpider',
        'zhimaSpider',
        'nimaSpider',
        'qiyunSpider',
        'spider89',
    ]
    module_name = 'core.proxy_spider.proxy_spiders'

    def __init__(self, module_name='', spider_list=[]):
        if module_name:
            self.module_name = module_name
        if spider_list:
            self.spider_list = spider_list
        self.mongo_pool = MongoPool()
        # create the coroutine pool
        self.coroutine_pool = Pool()

    def get_spider_cls(self, spider_list, module_name):
        module = importlib.import_module(module_name)
        for spider_name in spider_list:
            spider_cls = getattr(module, spider_name)
            yield spider_cls

    def run_spider(self):
        for spider in self.get_spider_cls(self.spider_list, self.module_name):
            #self.__execute_one_spider_task(spider)
            self.coroutine_pool.apply_async(self.__execute_one_spider_task,
                                            args=(spider, ))
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                if proxy.delay != -1:
                    self.mongo_pool.insert_one(proxy)
                    print("New proxy inserted: {}".format(dict(proxy)))
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        rs = RunSpider()
        rs.run_spider()
        schedule.every(RUN_SPIDER_INTERVAL).hours.do(rs.run_spider)
        while True:
            schedule.run_pending()
            time.sleep(30)
Code Example #25
File: m.py  Project: veryflying/ppdai
def solve_year(year):
    print u'%d started' % year
    pages = int(get_pages('http://loan.ppdai.com/blacklist/%d_m0'%year))
    pool = Pool(size=20)
    for i in xrange(pages):
        pool.apply_async(crawl_page, args=(year, i))
    # for j in xrange(1,pages/20):
    #     gevent.joinall([gevent.spawn(crawl_page, year, i) for i in xrange(20*(j-1), 20*j)],)
    # gevent.joinall([gevent.spawn(crawl_page, year, i) for i in xrange(pages-pages%20, pages+1)],)
    pool.join()
    CONNECTION.close()
    print year, u'done'
Code Example #26
File: dns_server.py  Project: idlefire/SubDoaminTrack
def load_dns_server():
    pool = Pool(20)
    for dns_server in open('dict/dns_server_list.txt').readlines():
        dns_server = dns_server.strip()
        if dns_server:
            pool.apply_async(detect_dns_server, (dns_server, ))
    pool.join()
    if len(dns_servers) == 0:
        print 'No DNS servers available.'
        sys.exit(1)
    print 'Available DNS servers in total: %d' % (len(dns_servers))
    return dns_servers
Code Example #27
class ProxyTester(object):
    def __init__(self):
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        self.coroutine_pool.apply_async(self.__check_one_proxy,
                                        callback=self.__check_callback)

    def run(self):
        # run() holds the core logic for checking proxy IPs
        # 2.1 get all proxy IPs from the database
        proxies = self.mongo_pool.find_all()
        # 2.2 iterate over the proxy IP list
        for proxy in proxies:
            # self.__check_one_proxy(proxy)
            # put each proxy IP on the queue
            self.queue.put(proxy)
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            self.coroutine_pool.apply_async(self.__check_one_proxy,
                                            callback=self.__check_callback)
        self.queue.join()

    def __check_one_proxy(self):
        ''' Check the availability of one proxy IP. '''
        proxy = self.queue.get()
        # 2.3 check the proxy's availability
        print(proxy)
        proxy = check_proxy(proxy)
        if proxy.speed == -1:
            proxy.score -= 1
            if proxy.score == 0:
                self.mongo_pool.delete_one(proxy)
            else:
                # otherwise just update the proxy record
                self.mongo_pool.update_one(proxy)
        else:
            # 2.5 if the proxy is usable, restore its score and update the database
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        self.queue.task_done()

    @classmethod
    def start(cls):
        # 4.2.1 create an instance of this class
        proxy_tester = cls()
        proxy_tester.run()

        schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #28
class RunSpider(object):
    def __init__(self):
        self.mongo_pool = MongoPool()
        # create the coroutine pool object
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Build the list of spider objects from the configuration file."""
        # iterate over the spider entries in the config and get each spider's full class name
        for full_class_name in PROXIES_SPIDERS:
            # split into module name and class name
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module by name
            module = importlib.import_module(module_name)
            # get the class from the module by name
            cls = getattr(module, class_name)
            # create the spider object
            spider = cls()
            # print(spider)
            yield spider

    def run(self):
        # get the spider objects from the configuration file
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # the code for handling one proxy spider is extracted into its own method
            # self.__execute_one_spider_task(spider)
            self.coroutine_pool.apply_async(self.__execute_one_spider_task,
                                            args=(spider, ))
        # call the pool's join so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        try:
            for proxy in spider.get_proxies():
                # check the proxy IP (proxy-check module)
                proxy = check_proxy(proxy)
                # if usable (speed is not -1), write it to the database
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
                    # print(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        rs = RunSpider()
        rs.run()
        # re-run the spiders every N hours
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #29
File: worker.py  Project: largetalk/django-kombu
class Worker(ConsumerMixin):
    def __init__(self, connection):
        self.connection = connection
        self.handlers = collections.defaultdict(list)
        self._loaded_handlers = False
        self._pool = Pool(5) #pool size

    def add_handler(self, queue, handler):
        self.handlers[queue].append(handler)

    def run(self):
        if not self._loaded_handlers:
            from django_kombu.settings import kombu_settings, perform_import
            for q in kombu_settings.QUEUES:
                for handler_cls in perform_import(q[2], 'QUEUE'):
                    self.add_handler(q[0], handler_cls())
            self._loaded_handlers = True

        super(Worker, self).run()

    def get_consumers(self, Consumer, channel):
        callbacks = [ partial(self.dispatch_message, q.name) for q in task_queues ]

        return [
            Consumer(queues=q, callbacks=[cb]) for q, cb in zip(task_queues, callbacks)
        ]

    def dispatch_message(self, queue, *args):
        for handler in self.handlers[queue]:
            if handler.match(*args):
                try:
                    if kombu_settings.GEVENT:
                        self._pool.spawn(lambda : handler.handle(*args))
                    else:
                        self._pool.apply_async(lambda : handler.handle(*args))
                    #handler.handle(*args)
                except:
                    logger.error(traceback.format_exc())
                else:
                    logger.info('SUCCESS: %(routing_key)s %(body)s' % dict(
                        body        = args[0],
                        routing_key = args[1].delivery_info['routing_key']
                    ))

    def on_connection_error(self, exc, interval):
        logger.error('Broker connection error: %r. Trying again in %s seconds.', exc, interval)

    def on_decode_error(self, message, exc):
        logger.error("Can't decode message body: %r (type:%r encoding:%r raw:%r')",
              exc, message.content_type, message.content_encoding,
              safe_repr(message.body)
        )
Code Example #30
class RunSpider(object):
    """Start the spiders."""

    def __init__(self):
        '''Create the database object.'''
        self.mongo_pool = MongoPool()
        # create the coroutine pool
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        '''Build the spider list from the configuration.'''
        for full_class_name in PROXIES_SPDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)  # split once from the right
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spdier = cls()

            yield spdier

    def run(self):

        spdiers = self.get_spider_from_settings()

        for spider in spdiers:
            # self.__execute_one_spider_task(spider)
            # execute asynchronously
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider,))
        # call join so the current thread waits for the coroutine tasks to finish
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        '''The code for handling one proxy spider, extracted into its own method.'''
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # print(proxy)
                # write it to the database
                self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        '''Run automatically at the configured interval.'''
        rs = RunSpider()
        rs.run()
        # schedule.every(RUN_SPDERS_INTERVAL).hours.do(rs.run)  # hours
        schedule.every(RUN_SPDERS_INTERVAL).minutes.do(rs.run)  # minutes

        while True:
            schedule.run_pending()
            time.sleep(1)
Code Example #31
File: gevent_core.py  Project: abranches/pumba
class GeventExecutor(AbstractExecutor):
    def __init__(self, task_cls, max_threads, multiple_instances=False):
        super(GeventExecutor, self).__init__(task_cls)
        self._max_threads = max_threads
        self._multiple_instances = multiple_instances
        if multiple_instances:
            self._tasks_pool = Queue()
            for _ in xrange(max_threads):
                self._tasks_pool.put(task_cls())
        else:
            self._task = task_cls()
        self._thread_pool = Pool(size=max_threads)

    def setup_tasks(self):
        if self._multiple_instances:
            for task in self._tasks_pool.queue:
                task.setup()
        else:
            self._task.setup()

    def join(self, timeout=sys.maxint):
        super(GeventExecutor, self).join()
        self._thread_pool.join()

    def available(self):
        is_it = not self._thread_pool.full()
        #if not is_it:
        #    gevent.sleep(0)
        gevent.sleep(0)
        return is_it

    def wait_available(self):
        gevent.sleep(0)
        self._thread_pool.wait_available()

    def _run_task(self, run_id):
        self._thread_pool.apply_async(self._run_on_thread_pool, (run_id,))
        #gevent.sleep(0)

    def _run_on_thread_pool(self, run_id):
        try:
            if self._multiple_instances:
                try:
                    task = self._tasks_pool.get()
                    result = run_task_func_wrapper(task.run, run_id)
                finally:
                    self._tasks_pool.put(task)
            else:
                result = run_task_func_wrapper(self._task.run, run_id)
            self.on_async_run_finished(result)
        except:
            log.debug("DEUUU MEEERDA", exc_info=True)
Code Example #32
    def _load_dns_servers(self):
        self.dns_servers = []
        pool = Pool(30)
        for server in open('dict/dns_servers.txt').xreadlines():
            server = server.strip()
            if server:
                pool.apply_async(self._test_server, (server,))
        pool.join()

        self.dns_count = len(self.dns_servers)
        sys.stdout.write('\n')
        if self.dns_count == 0:
            sys.exit(-1)
Code Example #33
def start_pool(size):
    t1 = datetime.now()
    pool = Pool(size)
    while (datetime.now() - t1).seconds <= SECONDS:
        print 'pool.free_count():', pool.free_count()
        if pool.free_count() == 0:
            pool.wait_available()
            print '<free 1>'
        pool.apply_async(test_get)
    print 'Joining............................................'
    pool.join()
    t2 = datetime.now()
    print COUNT, TIMEOUT_CNT
    print COUNT / (t2-t1).seconds
Code Example #34
def load_dns_servers():
    dns_servers = []
    pool = Pool(5)
    for server in open('subDomainsBrute/dict/dns_servers.txt').readlines():
        server = server.strip()
        if server and not server.startswith('#'):
            pool.apply_async(test_server, (server, dns_servers))
    pool.join()

    server_count = len(dns_servers)
    if server_count == 0:
        print_msg('[ERROR] No valid DNS Server !', line_feed=True)
        sys.exit(-1)
    return dns_servers
Code Example #35
File: main.py  Project: TinSyner/github_spider
    def download(self, project_list, directory):
        if not os.path.exists(directory):
            os.mkdir(directory)

        pool = Pool(20)
        for i in project_list:
            l = i.split("*")
            title = l[0]
            branch_name = l[1]
            print('downloading ' + title + ' to ' + directory + '/')
            pool.apply_async(self.git_clone, (title, directory, branch_name))

        print('downloading please don\'t stop it')
        pool.join()
Code Example #36
File: __init__.py  Project: kyleconroy/salesman
    def explore(self, url):
        """Travel will never stop"""
        self.visited_urls = set()
        self.base = urlparse(url)

        # Limit Pool size to 100 to prevent HTTP timeouts
        pool = Pool(100)

        def visit(target, source):
            if not self.is_invalid(target):
                for url, source in self.visit(target, source):
                    pool.apply_async(visit, args=[url, source])

        pool.apply_async(visit, args=[url, None])
        pool.join()
Code Example #37
def register():
    f = open('fail.txt', 'w')
    f.close()
    f = open('success.txt', 'w')
    f.close()
    p = Pool(1000)
    f = open('邮箱.txt', 'r')
    # emails = f.read().strip().split('\n')
    emails = list(
        map(lambda x: x.split('----')[0],
            f.read().strip().split('\n')))
    for i in emails:
        p.apply_async(reg, args=(i, ))
    p.join()
    print('over!')
Code Example #38
File: coroutine.py  Project: 0x554simon/3102
class WorkerPool(object):

    def __init__(self):
        self.pool_size = options.pool_size
        self.job_pool = Pool(size=self.pool_size)
        self.result = Queue()
        self.target_queue = Queue()

    def add_job(self, job_func, *args, **kwargs):
        job = self.job_pool.apply_async(
            job_func,
            args=args,
            kwds=kwargs,
            callback=self._call_func)
        self.job_pool.add(job)

    def run(self, timeout=None):
        self.job_pool.join(timeout=timeout, raise_error=False)

    def _call_func(self, job_ret):
        if job_ret:
            self.result.put(job_ret)

    def shutdown(self):
        self.job_pool.kill()
Code Example #39
File: common.py  Project: coco413/subDomainsBrute
def load_dns_servers():
    print_msg('[+] Validate DNS servers', line_feed=True)
    dns_servers = []
    pool = Pool(10)
    for server in open('dict/dns_servers.txt').readlines():
        server = server.strip()
        if server:
            pool.apply_async(test_server, (server, dns_servers))
    pool.join()

    dns_count = len(dns_servers)
    print_msg('\n[+] %s available DNS Servers found in total' % dns_count, line_feed=True)
    if dns_count == 0:
        print_msg('[ERROR] No DNS Servers available!', line_feed=True)
        sys.exit(-1)
    return dns_servers
Code Example #40
    def _load_dns_servers(self):
        print '[+] Validate DNS servers ...'
        self.dns_servers = []
        pool = Pool(30)
        for server in open('dict/dns_servers.txt').xreadlines():
            server = server.strip()
            if server:
                pool.apply_async(self._test_server, (server,))
        pool.join()

        self.dns_count = len(self.dns_servers)
        sys.stdout.write('\n')
        print '[+] Found %s available DNS Servers in total' % self.dns_count
        if self.dns_count == 0:
            print '[ERROR] No DNS Servers available.'
            sys.exit(-1)
Code Example #41
File: beecoroutine.py  Project: 2625668714/Beehive
class WorkerPool(object):
    JOB_UNSTART = 0  # poc not run
    JOB_RUNNING = 1
    JOB_FINISHED = 2  # poc run ok
    JOB_ERROR = -1  # error encountered when run poc
    JOB_ABORT = -2  # running poc is abort, viz unfinished

    def __init__(self, concurrency=10):
        self.concurrency = concurrency
        self.jobPool = Pool(size=concurrency)
        self.errNum = 0  # failed job(run time error but not aborted)
        self.successNum = 0
        self.totalNum = 0
        self.results = {}

    def work(self, iterJobFuncArgs, jobFunc, timeout=None):
        for jobFuncArgs in iterJobFuncArgs:
            self.results[hash(str(jobFuncArgs))] = {
                'state': self.JOB_UNSTART,
                'args': jobFuncArgs,
            }
            self.totalNum += 1
            self.jobPool.add(
                self.jobPool.apply_async(
                    self._doJob,
                    args=(jobFunc, jobFuncArgs,),
                    kwds=None,
                    callback=self._cbJobFinished
                )
            )
        self.jobPool.join(timeout=timeout, raise_error=False)
        return self.results

    def _cbJobFinished(self, jobResult):
        if jobResult['state'] == self.JOB_ERROR:
            self.errNum += 1
        elif jobResult['state'] == self.JOB_FINISHED:
            self.successNum += 1

    def _doJob(self, jobFunc, jobFuncArgs):
        self.results[hash(str(jobFuncArgs))]['state'] = self.JOB_RUNNING
        try:
            self.results[hash(str(jobFuncArgs))]['jobRet'] = \
                jobFunc(*jobFuncArgs) if isinstance(jobFuncArgs, list) \
                                      else jobFunc(jobFuncArgs)
            self.results[hash(str(jobFuncArgs))]['state'] = self.JOB_FINISHED
        except Exception as err:
            self.results[hash(str(jobFuncArgs))]['exception'] = str(err)
            self.results[hash(str(jobFuncArgs))]['state'] = self.JOB_ERROR
        return self.results[hash(str(jobFuncArgs))]

    def handleAbort(self):
        for jobId in self.results.keys():
            if self.results[jobId]['state'] in (self.JOB_RUNNING,
                                                self.JOB_UNSTART):
                self.results[jobId]['state'] = self.JOB_ABORT
Code Example #42
class mainclass(OSISStoreMongo):
    TTL = 3600 * 24 * 5  # 5 days
    def __init__(self, *args, **kwargs):
        super(mainclass, self).__init__(*args, **kwargs)
        self.pool = Pool(1000)

    def set_helper(self, session, value, isList=False):
        if isList:
            for eco in value:
                self.set_helper(session, eco)
            return True
        db, _ = self._getMongoDB(session)
        objectindb = db.find_one({"guid": value["guid"]})
        if objectindb:
            objectindb.update(value)
            value = objectindb
        noreraise = value.pop('noreraise', False)
        self.setPreSave(value, session)
        new = False
        if objectindb:
            if noreraise:
                return value['guid'], new, False
            db.update({'guid': value['guid']},
                      {'$inc': {'occurrences': value['occurrences']},
                       '$set': {'lasttime': value['lasttime'],
                                'errormessage': value['errormessage'],
                                'errormessagePub': value['errormessagePub'],
                                'state': value['state']}
                       })

        else:
            new = True
            db.save(value)
        return value['guid'], new, True

    def set(self, key, value, waitIndex=False, session=None):
        if isinstance(value, list):
            self.pool.wait_available()
            self.pool.apply_async(self.set_helper, (session, value, True))
            return None, None, True
        return self.set_helper(session, value)
Code Example #43
File: _gevent.py  Project: 9seconds/streams
class GeventExecutor(PoolOfPoolsMixin, Executor):
    """
    Implementation of Gevent executor fully compatible with
    :py:class:`concurrent.futures.Executor`.
    """

    # noinspection PyUnusedLocal
    def __init__(self, *args, **kwargs):
        super(GeventExecutor, self).__init__()
        self._max_workers = 100
        self.worker_pool = Pool(self._max_workers)

    def submit(self, fn, *args, **kwargs):
        future = self.worker_pool.apply_async(fn, args, kwargs)
        return GreenletFuture(future)
Code Example #44
File: backend.py  Project: kurtsun/webadmin
class MultiProcess(object):
	def __init__(self,task_func,func_name,*args):
		self.task_func=task_func
		self.func_name=func_name
		self.hosts = args
		self.pool = Pool(3)
		self.result = []
	
	def execute(self):
		for h in self.hosts[0]:
			p = self.pool.apply_async(self.task_func,args=(h,self.func_name))
			self.result.append(p)	
		
		#self.pool.join()
		return self.get_result()
	
	def get_result(self):
		return self.result
	
	#def addcallback(self,func):
	#	return func(self.execute())

	def addcallback(self,handler,key):
		return handler.handle_result(self.execute(),key)
Code Example #45
File: models.py  Project: ebrensi/running_data
    def index(self, activity_ids=None, limit=None,  after=None, before=None):

        def strava2dict(a):
            return {
                "id": a.id,
                "name": a.name,
                "type": a.type,
                "summary_polyline": a.map.summary_polyline,
                "beginTimestamp": a.start_date_local,
                "total_distance": float(a.distance),
                "elapsed_time": int(a.elapsed_time.total_seconds()),
                "average_speed": float(a.average_speed)
            }
        dtypes = {
            "id": "uint32",
            "type": "category",
            "total_distance": "float32",
            "elapsed_time": "uint32",
            "average_speed": "float16"
        }

        if self.indexing():
            return [{
                    "error": "Indexing activities for user {}...<br>Please try again in a few seconds.<br>"
                    .format(self.strava_id)
                    }]

        ind = cache.get(self.index_key())
        if ind:
            dt_last_indexed, packed = ind
            activity_index = pd.read_msgpack(packed).astype({"type": str})
            elapsed = (datetime.utcnow() -
                       dt_last_indexed).total_seconds()

            # update the index if we need to
            if (elapsed > CACHE_INDEX_UPDATE_TIMEOUT) and (not OFFLINE):
                latest = activity_index.index[0]
                app.logger.info("updating activity index for {}"
                                .format(self.strava_id))

                already_got = set(activity_index.id)

                try:
                    activities_list = [strava2dict(
                        a) for a in self.client().get_activities(after=latest)
                        if a.id not in already_got]
                except Exception as e:
                    return [{"error": str(e)}]

                if activities_list:
                    df = pd.DataFrame(activities_list).set_index(
                        "beginTimestamp")

                    activity_index = (
                        df.append(activity_index)
                        .drop_duplicates()
                        .sort_index(ascending=False)
                        .astype(dtypes)
                    )

                dt_last_indexed = datetime.utcnow()
                cache.set(self.index_key(),
                          (dt_last_indexed,
                           activity_index.to_msgpack(compress='blosc')),
                          CACHE_INDEX_TIMEOUT)

            if activity_ids:
                df = activity_index[activity_index["id"].isin(activity_ids)]
            else:
                if limit:
                    df = activity_index.head(limit)
                else:
                    df = activity_index
                    if after:
                        df = df[:after]
                    if before:
                        df = df[before:]
            df = df.reset_index()
            df.beginTimestamp = df.beginTimestamp.astype(str)
            return df.to_dict("records")

        # If we got here then the index hasn't been created yet
        Q = Queue()
        P = Pool()

        def async_job(user, limit=None, after=None, before=None):

            user.indexing(True)

            activities_list = []
            count = 1
            try:
                for a in self.client().get_activities():
                    d = strava2dict(a)
                    if d.get("summary_polyline"):
                        activities_list.append(d)
                        if (limit or
                            (after and (d["beginTimestamp"] >= after)) or
                                (before and (d["beginTimestamp"] <= before))):
                            d2 = dict(d)
                            d2["beginTimestamp"] = str(d2["beginTimestamp"])
                            Q.put(d2)
                            app.logger.info("put {} on queue".format(d2["id"]))

                            if limit:
                                limit -= 1
                                if not limit:
                                    Q.put({"stop_rendering": "1"})
                        else:
                            Q.put({"msg": "indexing...{} activities".format(count)})

                        count += 1
                        gevent.sleep(0)
            except Exception as e:
                Q.put({"error": str(e)})
            else:
                Q.put({"msg": "done indexing {} activities.".format(count)})

                activity_index = (pd.DataFrame(activities_list)
                                  .set_index("beginTimestamp")
                                  .sort_index(ascending=False)
                                  .astype(dtypes))

                app.logger.debug("done with indexing for {}".format(self))
                dt_last_indexed = datetime.utcnow()
                packed = activity_index.to_msgpack(compress='blosc')
                cache.set(self.index_key(),
                          (dt_last_indexed, packed),
                          CACHE_INDEX_TIMEOUT)

                app.logger.info("cached {}, size={}".format(self.index_key(),
                                                            len(packed)))
            finally:
                user.indexing(False)
                Q.put(StopIteration)

        P.apply_async(async_job, [self, limit, after, before])
        return Q
Code Example #46
File: uploader.py  Project: dlf412/thunderCopyright
def _loop_tasks(logger, redis_server, redis_list_name,
               pickle_dir, pickle_ext, pickle_corrupt_time,
               broker_routing_key, broker_exchange, broker_mq_url,
               swift_auth, swift_user, swift_key):

    log_normal(logger, {'action': 'uploader-started'}, LOG_INFO)
    
    try:
        if not os.path.exists(pickle_dir):
            os.mkdir(pickle_dir)
    except OSError:
        pass

    r = redis.Redis(redis_server)
    def push_redis():
        ts_now = time.time()
        for path in os.listdir(pickle_dir):
            path = os.path.join(pickle_dir, path)
            if path.endswith(pickle_ext):
                ts_file = os.path.getmtime(path)
                
                if ts_now - ts_file > pickle_corrupt_time:
                    os.remove(path)
                else:
                    r.lpush(redis_list_name, path)

    def start_empty_archieve():
        if os.path.exists(PID_FILE):
            return

        with open(PID_FILE, 'w') as f:
            f.write(str(os.getpid()))
        time.sleep(0.2)
        with open(PID_FILE, 'r') as f:
            pid = int(f.read().strip())
        if pid == os.getpid():
            log_normal(logger, {
                'action': 'empty-archieve-starting',
                'info': {
                    'pid': pid
                }
            }, LOG_INFO)
            r.ltrim(redis_list_name, 0, -1)
            push_redis()
            log_normal(logger, {'action': 'empty-archieve-done'}, LOG_INFO)
            os.remove(PID_FILE)

    if r.llen(redis_list_name) == 0:
        log_normal(logger, {'action': 'no-task-in-redis-queue'}, LOG_INFO)
        start_empty_archieve()

    # Main loop.
    p = Pool(POOL_SIZE)
    while process_isalive(os.getppid()):
        res = r.brpop(redis_list_name, timeout=1)
        if not res:
            continue
        _, pickle_path = res
        log_normal(logger, {
            'action': 'got-redis-task',
            'info': {
                'pickle_path': pickle_path
            }
        }, LOG_INFO)
        p.wait_available()
        p.apply_async(do_task, (logger, pickle_path, swift_auth, swift_user, swift_key))

    p.join()
    # Delete pid file
    if os.path.exists(PID_FILE):
        try:
            os.remove(PID_FILE)
        except OSError:
            pass

    log_normal(logger, {'action': 'exit-uploader-process'}, LOG_INFO)
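
A key detail above is the pool.wait_available() call before each apply_async: it blocks until a worker slot frees up, so the Redis list is drained no faster than uploads complete. A stripped-down sketch of that backpressure pattern with a stand-in task (no Redis; names are illustrative):

import gevent
from gevent.pool import Pool

def do_task(item):
    gevent.sleep(0.1)             # stand-in for the real upload work
    print("done: {}".format(item))

pool = Pool(4)                    # at most 4 concurrent tasks

for item in ["job-%d" % i for i in range(20)]:
    pool.wait_available()         # block until the pool has a free slot
    pool.apply_async(do_task, (item,))

pool.join()                       # wait for the tail of in-flight tasks
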
Code Example #47
File: untar-to-s3.py  Project: Kixeye/untar-to-s3
def deploy_tarball_to_s3(tarball_obj, bucket_name, prefix='', region='us-west-2', concurrency=50, no_compress=False, strip_components=0):
    """
    Upload the contents of `tarball_obj`, a File-like object representing a valid .tar.gz file, to the S3 bucket `bucket_name`
    """
    # Connect to S3 and get a reference to the bucket name we will push files to
    conn = connect_to_region(region)
    if conn is None:
        logging.error("Invalid AWS region %s" % region)
        return

    try:
        bucket = conn.get_bucket(bucket_name, validate=True)
    except boto.exception.S3ResponseError:
        logging.error("S3 bucket %s does not exist in region %s" % (bucket_name, region))
        return

    # Open the tarball
    try:
        with tarfile.open(name=None, mode="r:*", fileobj=tarball_obj) as tarball:

            files_uploaded = 0

            # Parallelize the uploads so they don't take ages
            pool = Pool(concurrency)

            # Iterate over the tarball's contents.
            try:
                for member in tarball:

                    # Ignore directories, links, devices, fifos, etc.
                    if not member.isfile():
                        continue

                    # Mimic the behaviour of tar -x --strip-components=
                    stripped_name = member.name.split('/')[strip_components:]
                    if not bool(stripped_name):
                        continue

                    path = os.path.join(prefix, '/'.join(stripped_name))

                    # Read file data from the tarball
                    fd = tarball.extractfile(member)

                    # Send a job to the pool.
                    pool.wait_available()
                    pool.apply_async(__deploy_asset_to_s3, (fd.read(), path, member.size, bucket, not no_compress))

                    files_uploaded += 1

                # Wait for all transfers to finish
                pool.join()

            except KeyboardInterrupt:
                # Ctrl-C pressed
                print("Cancelling upload...")
                pool.join()

            finally:
                print("Uploaded %i files" % (files_uploaded))

    except tarfile.ReadError:
        print("Unable to read asset tarfile", file=sys.stderr)
        return
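
The member-name handling above reproduces `tar -x --strip-components=N`: the first N path components are dropped and members that vanish entirely are skipped. A small sketch of just that key-building logic:

import os

def s3_key_for(member_name, prefix="", strip_components=0):
    # Drop the first N path components, as tar --strip-components would.
    parts = member_name.split('/')[strip_components:]
    if not parts:
        return None               # nothing left after stripping: skip member
    return os.path.join(prefix, '/'.join(parts))

print(s3_key_for("build/static/app.js", prefix="assets", strip_components=1))
# assets/static/app.js
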
Code Example #48
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

import requests

urls = [
    'http://python.org/',
    'http://www.pocketplaylab.com/',
    'http://github.com/'
]


def download(i, url):
    print('No.{}: Downloading: {}'.format(i, url))
    requests.get(url)
    print('No.{}: Done: {}'.format(i, url))

pool = Pool(size=3)
for i, url in enumerate(urls, 1):
    pool.apply_async(download, args=[i, url])
pool.join()
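
apply_async returns the spawned greenlet, so the same fan-out can also collect return values with .get() once the pool is joined. A small variant of the snippet above (reusing its pool, urls and requests import) that gathers status codes:

def fetch_status(url):
    return url, requests.get(url).status_code

jobs = [pool.apply_async(fetch_status, args=[url]) for url in urls]
pool.join()
for job in jobs:
    print(job.get())              # e.g. ('http://python.org/', 200)
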
Code Example #49
File: s3swiftsync.py  Project: hpcloud/CloudAgents
	for key in bucket:
		#ca.log("Found "+key.name)
# 		if key.name[-1] == "/":
# 			if not key.name[0:-1] in files:
# 				pool.apply_async(worker, args=(region_endpoints['publicURL'],
# 									ca.conf['swiftcontainer'],
# 									ca.creds['token'],
# 									key.name,
# 									ca.conf['s3bucket'],
# 									ca.conf['s3accesskey'].encode('ascii'),
# 									ca.conf['s3secretkey'].encode('ascii')))
		if key.name not in files:
			pool.apply_async(worker, args=(region_endpoints['publicURL'],
												ca.conf['container'],
												ca.creds['token'],
												key.name,
												ca.conf['s3bucket'],
												ca.conf['s3accesskey'].encode('ascii'),
												ca.conf['s3secretkey'].encode('ascii')))

	pool.join()
	
	if ca.conf.get('emailreport') and copied_files:
		ca.log("Sending email.")
		ca.email("Copied "+str(len(copied_files))+" files to "+ca.conf['container'],'''
	Copied the following files from S3 bucket %s to Swift container %s:
	
%s
	''' % (ca.conf['s3bucket'],ca.conf['container'],"\n".join(copied_files)))

	ca.log("Done.",'',100)
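
The worker callable handed to apply_async above is defined earlier in the agent script and is not shown here; whatever it does, its signature must accept the seven positional arguments built in the call. A purely hypothetical stub illustrating that expected shape:

def worker(swift_url, container, token, key_name,
           s3bucket, s3accesskey, s3secretkey):
    # Hypothetical stand-in: the real worker copies the S3 object named
    # key_name into the given Swift container, authenticating with token.
    print("would copy {} from s3://{} into swift container {}".format(
        key_name, s3bucket, container))
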
Code Example #50
File: views.py  Project: kurtsun/webadmin
def multi_execute_command(request):
	if request.method == 'GET':
		task_id = str(int(time.time()))
		group_name = request.GET.getlist('group')
		host = request.GET.getlist('host')
		bind_groups = request.user.userprofile.bind_groups.select_related()
		hosts=[]
		for obj in bind_groups:
			for g in group_name:
				if obj.host_group.name == g:
					hosts += obj.get_host_ref()
		command_name = request.GET.get('command')
		
		
		new_hosts=[]
		if hosts:
			for h in hosts:
				new_hosts.append(h.ip_address)

		new_hosts=list(set(new_hosts+host))	
		res_list=[]
		db=get_mongo_conn()
		if db is None:
			return HttpResponse("mongo cannot connect")
		db.adsame.insert({"task_id":task_id});
		audit_log = AuditLogTest.objects.create(user=request.user.username,group=",".join(group_name),
												command_type="cmd",command=command_name,
												execute_time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),
												task_status="executing")
		audit_log.task_id = task_id
		audit_log.save()
		pool = Pool(POOL_SIZE)
		for h in new_hosts:
			db.adsame.insert({"task_id":task_id,"host":h,"status":"padding"})
			p = pool.apply_async(execute_task,args=(task_id,h,command_name,db))
			res_list.append(p)
		
		pool.join()
		result={}
		r=None
		for res in res_list:
			ret={}
			try:
				return_result,error,return_code,ip = res.get()
				if return_code == 0:
					return_code = 'success'
				else:
					return_code = 'failed'
				ret['result'],ret['error'],ret['status'] = return_result,error,return_code
				result[ip]=ret
			except:
				pass
		end_time = datetime.datetime.now()
		audit_log=AuditLogTest.objects.get(task_id=task_id)
		audit_log.task_status="success"
		audit_log.finish_time=end_time.strftime('%Y-%m-%d %H:%M:%S')
		audit_log.result=json.dumps(result)
		audit_log.save()
		result['task_id']=task_id
		return HttpResponse(json.dumps(result),content_type="application/json")
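
execute_task is imported from elsewhere in the project; the loop above expects each res.get() to yield a (return_result, error, return_code, ip) tuple. A hypothetical stub with that return contract (what it actually runs on the host is out of scope here):

def execute_task(task_id, host, command_name, db):
    # Hypothetical stand-in: the real version runs command_name on `host`
    # (e.g. over SSH) and records progress in the mongo collection.
    db.adsame.insert({"task_id": task_id, "host": host, "status": "running"})
    output, error, return_code = "ok", "", 0
    return output, error, return_code, host
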
Code Example #51
class Worker:

	def __init__(self, seeds, done_que, run_que):

		self.showpercounts = 10
		self.timeout = 5
		self.starttime = time.time()
		self.oldtime = 0

		self.quit = 0
		self.https_enable = 0


		self.run_que = run_que
		self.done_que = done_que
		self.tasks = []
		self.done = 1

		self.errdone = set()
		self.err = Error()

		self.loadstate()

		self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
	'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' ))
		self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))

		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 60
		self.poolmaxfree = 20
		self.freecount = 0
		self.down_pool = Pool(size=self.poolsize)

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0

		self.debugnosave = 1
		self.tt = 1

		self.done_sites_fname='done_sites.bin'
		try:
			self.bfdone = BloomFilter.open(self.done_sites_fname)
		except:
			self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M 

		if self.run_que.qsize() == 0:
			for seed in seeds:
				self.run_que.put( seed.split("http://")[1] )

		if self.https_enable == 0:
			self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
		else:
			self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\']+)',re.I)


	def cb_httpget(self, data = None):

		if not data:
			return
		seed, err, headers, content = data
		st = time.time()

		if err:
			self.handle_error(err,seed)
			return

		if self.https_enable == 0:
			seed = seed[7:]

		self.bfdone.add(seed)
		self.done += 1

		data={'seed':seed,'headers':headers,'content':content}

		dat = cPickle.dumps(data)
		self.done_que.put(dat)

		et = time.time()
		self.cbcputime += (et-st)
		#self.tt=(et-st)

		if self.done % self.showpercounts == 0:
			self.out(seed)
			pass

	def out(self, seed):

		spendtime = time.time() - self.starttime
		spendtime = 1 if spendtime == 0 else spendtime
		nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else ""
		now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 )
		print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \
			(self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed )
	
	
	def work(self):

		while self.quit == 0:

			st = time.time()
			curdone = self.done

			self.freecount = self.down_pool.free_count()
			

			if self.freecount > self.poolmaxfree:
				self.tasks = []
				minlen = min(self.freecount+1,self.run_que.qsize())
				#if minlen <=0:break
				
				for i in range( minlen):
					stt = time.time()
					url = self.run_que.get()
					ett = time.time()
					if url in self.bfdone:# 5%-10%
							continue

					url = "http://"+url
					self.tasks.append(url)

				for url in self.tasks:
					self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget)

			
			time.sleep(0.1)
			et = time.time()	
			self.curspeed = (self.done - curdone) / (et-st)
			#self.tt = (et-st)

	
		self.down_pool.join()
		print "All OVER"

	def handle_error(self,e,url):

		if e.find('DNSError') > 0 :
			self.err.dns += 1
			self.err.rdns.append(url)
		elif e.find('reset') > 0 :#Connection reset
			self.err.reset += 1
			self.err.rreset.append(url)
		elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:  # connection timeouts
			self.err.conntimeout += 1
			self.err.rconntimeout.append(url)
		elif e.find('refused') > 0: #Connection refused
			self.err.refuse += 1
			self.err.rrefuse.append(url)

		else:
			self.err.others +=1
			self.err.rothers.append(url)
			print "Error", url, e

	# requests is better through test
	def httpget_requests(self, url):

		st = time.time()
		con = ""
		e = ""
		res_headers = ""
		headers = {
					'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
					'Accept-Encoding':'gzip,deflate',
					'Connection':'close',
					'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
				}


		res = None
		try:
			# todo: query the ip of the website before get through dns
			req = requests
			req.max_redirects = 1
			res = req.get(url, timeout = (3,2), headers = headers )
			if self.https_enable == 0 and res.url.lower().startswith('http:'):
				if 'content-type' not in res.headers.keys() or 'html' not in res.headers['content-type']:
					return None
				con = res.content
				
			res.close()

		except KeyboardInterrupt:
				raise
		except Exception as e:
			e = str(e)
			if res:
				res.close()

			return url,e,None,None

		et = time.time()
		self.totalnettime += (et-st)
		self.tt = (et-st)
		return url, e, res.headers, con

	def savestate(self):

		self.quit = 1
		now = time.time()
		self.oldtime += (now - self.starttime)

		#should hold on the singal for procdata done


		with open('state.txt','wb') as f:
			f.write(str(self.oldtime) + '\n')
			# tasks run_queue done
			f.write(str(len(self.tasks)) + '\n')
			for t in self.tasks:
				f.write(t + '\n')
			l = self.run_que.qsize()
			f.write(str(l)+ '\n')
			while l > 0:
				f.write( self.run_que.get() + '\n')
				l-=1
			f.write(str((self.done)) + '\n')
 
		with open('err_records.pack','wb') as f:
			cPickle.dump(self.err,f,2)

		print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully."
		f.close()
		exit(0)

	def loadstate(self):

		try:
			with open('state.txt') as f:
				self.oldtime = float(f.readline())
				tasks = int(f.readline())
				for i in xrange(tasks):
					self.run_que.put(f.readline().rstrip('\n'))

				runnings = int(f.readline())
				for i in xrange(runnings):
					self.run_que.put(f.readline().rstrip('\n'))

				self.done = int(f.readline())

			with open('err_records.pack','rb') as f:
				self.err = cPickle.load(f)

			print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfully."
		except Exception as e:
				print e
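
The work() loop above throttles itself on down_pool.free_count(): new URLs are scheduled only when enough greenlets are idle, and completed fetches come back through the cb_httpget callback. A minimal sketch of that throttling-plus-callback loop, detached from the crawler's state:

import gevent
from gevent.pool import Pool
from gevent.queue import Queue

pool = Pool(60)
run_que = Queue()
for i in range(200):
    run_que.put("http://site-%d.example/" % i)

def fetch(url):
    gevent.sleep(0.01)            # stand-in for the real HTTP GET
    return url

def on_done(url):
    print("fetched " + url)

while not run_que.empty():
    free = pool.free_count()
    if free > 20:                 # refill only when enough workers are idle
        for _ in range(min(free, run_que.qsize())):
            pool.apply_async(fetch, (run_que.get(),), callback=on_done)
    gevent.sleep(0.1)

pool.join()
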
Code Example #52
File: shodan-turk.py  Project: 735tesla/Eagleeye
			return True
		return False
	except:
		return False


api = WebAPI(key)

#get the first page of results
res = api.search(filter)

#keep track of how many results we have left
total_results = (res['total'])
page = 1
list = []
outfile = open('netwave.html','w')
length = 0
try:
	while(page * 100 <= total_results):
	# Check the matches to see if they fit what we are looking for
		for host in res['matches']:
			ip = ''.join(str(host['ip']))
			port = ''.join(str(host['port']))
			pool.apply_async(checkCam, (ip,port),)
			#pool.join()
		page +=1
		res = api.search(filter,page)
except Exception:
	print 'fail'

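The pool here is created earlier in the script (not shown), and each match is probed asynchronously while the loop pages through results 100 at a time. A compressed sketch of that paging pattern, with an illustrative probe and fake pages standing in for checkCam and api.search:

from gevent.pool import Pool

pool = Pool(50)

def check_host(ip, port):
    # Illustrative probe; the real checkCam issues an HTTP request to the camera.
    print("checking %s:%s" % (ip, port))

def fetch_page(page):
    # Illustrative stand-in for api.search(filter, page).
    return {"total": 250,
            "matches": [{"ip": "203.0.113.%d" % i, "port": 8080} for i in range(3)]}

page = 1
res = fetch_page(page)
while page * 100 <= res["total"]:
    for host in res["matches"]:
        pool.apply_async(check_host, (str(host["ip"]), str(host["port"])))
    page += 1
    res = fetch_page(page)

pool.join()                        # not shown in the excerpt, but needed before exit
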
Code Example #53
class worker:

	def __init__(self,seeds):

		self.showpercounts = 50
		self.timeout = 10
		self.starttime = time.time()
		self.quit = 0

		#self.run_queue = Queue()
		self.run_queue = daemon.run_que
		self.done_queue = daemon.done_que
		self.tasks = []
		self.done = 0
		
		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 300
		self.freecount = 0
		#self.maxfreecnt = 4
		self.down_pool = Pool(size=self.poolsize)

		#self.mutex = gevent.coros.RLock()

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0
		self.test = 0
		self.errcnt  = 0
		self.bfdone = daemon.bfdone
		self.size = 0
		
		if self.run_queue.qsize() == 0:
			for seed in seeds:
				self.run_queue.put( seed.split("http://")[-1] )

		self.urlpatern = re.compile('href=[\"\']http://([^/?#\"\']+)')



	def cb_httpget(self, data):

		st = time.time()
		seed, err, headers, content = data

		#sself.test += 1
		if err or len(content) == 0:
			self.errcnt += 1
			return
			
		data={'url':seed,'headers':headers,'content':content}
		dat = cPickle.dumps(data)
		
		self.size = len(content)

		self.done_queue.put(dat)
		self.done += 1
		#seed.split('http://')[-1]
		self.bfdone.add(seed)

		et = time.time()
		
		self.cbcputime += (et-st)

		if self.done % self.showpercounts == 0:
			t = self.cbcputime/self.done
			self.out(seed ,(et-st))

		

	def out(self, cururl, cbtime=0 ):
		spendtime = time.time() - self.starttime
		spendtime = 1 if spendtime == 0 else spendtime
		nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else ""
		now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 )

		print "%s D:%-4d R:%-7d SpeedT:%.2f/s SpeedC:%.2f/s Test:%0.2f CB:%0.4f Active:%d Err:%d %s" % (now, (self.done), self.run_queue.qsize(), \
			self.done/spendtime,self.curspeed, self.test, cbtime ,self.poolsize-self.freecount, self.errcnt, cururl )
	
	

	def work(self):

		while self.quit == 0:
			curstime = time.time()

			self.freecount = self.down_pool.free_count()

			self.tasks = []
			if self.freecount == 0:
				gevent.sleep(0.1)
				continue

			st = time.time()
			xlen = self.freecount

			lasturl = ""
			while xlen > 0:
				xlen -= 1

				url = self.run_queue.get()
				if url == lasturl:
					continue
				else:
					lasturl = url
				url = "http://"+url
				if url in self.bfdone:
					xlen += 1
					continue
				#print xlen, url, self.down_pool.free_count()

				self.tasks.append(url)
				self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget)
			
			et = time.time()

			curetime = time.time()
			#self.curspeed = (self.done - curdone) / (curetime-curstime)
	
		self.down_pool.join()
		print "All OVER"

	
	# requests is better than pycurl ?
	def httpget_requests(self, url):

		st = time.time()
		con = ""
		e = None
		#'Connection':'close',
		headers = {
					'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
					'Accept-Encoding':'gzip,deflate',
					'Connection':'close',
					'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
				}

		try:
			# query the ip of the website
			req = requests
			#r = requests
			req.max_redirects = 1
			#with gevent.Timeout(5, False) as timeout:
			res = req.get(url, timeout = self.timeout, headers = headers)
			if res.url.startswith('https'):
				raise Exception('https url, skipping')
			con = res.content
			headers = res.headers
			res.close()


		except KeyboardInterrupt:
				raise
		except Exception as e:

			et = time.time()
			return url,e,None,None

		et = time.time()
		self.totalnettime += (et-st)
		self.curspeed = self.totalnettime/(self.done+1)
		return url, e, headers, con
Code Example #54
File: __init__.py  Project: zorkian/gsh
class Gsh(object):
    def __init__(self, hosts, command, fork_limit=1, timeout=None, hooks=None):
        self.hosts = set(hosts)
        self.command = command
        self.fork_limit = self._build_fork_limit(fork_limit, len(self.hosts))
        self.timeout = timeout

        # Treat 0 second timeouts as no timeout.
        if not timeout:
            self.timeout = None

        if hooks is None:
            hooks = []
        self.hooks = hooks

        self._pool = Pool(max(self.fork_limit, 1))
        self._greenlets = []
        self._remotes = []

        self._pre_job_hooks = None
        self._post_job_hooks = None

    @staticmethod
    def _build_fork_limit(fork_limit, num_hosts):
        if isinstance(fork_limit, int) or fork_limit.isdigit():
            return int(fork_limit)
        if fork_limit.endswith("%"):
            return int(float(num_hosts) * (float(fork_limit[:-1]) / 100.0))
        # If we can't parse your forklimit go serial for safety.
        return 1

    def run_async(self):

        # Don't start executing until the pre_job hooks have completed.
        self._pre_job_hooks = gevent.spawn(self._run_pre_job_hooks)
        self._pre_job_hooks.join()

        for host in self.hosts:
            remote_command = RemotePopen(host, self.command, hooks=self.hooks, timeout=self.timeout)
            self._remotes.append(remote_command)
            self._greenlets.append(self._pool.apply_async(remote_command.run))

        self._post_job_hooks = gevent.spawn(self._run_post_job_hooks)

    def _run_pre_job_hooks(self):
        for hook in self.hooks:
            hook.pre_job(self.command, self.hosts, time.time())

    def _run_post_job_hooks(self):
        # Wait for all greenlets to finish before running these hooks.
        gevent.joinall(self._greenlets)
        for hook in self.hooks:
            hook.post_job(time.time())

    def wait(self, timeout=None):
        rc = 0
        gevent.joinall(self._greenlets + [self._post_job_hooks], timeout=timeout, raise_error=True)
        for remote in self._remotes:
            if remote.rc:
                return remote.rc
        return rc
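
_build_fork_limit accepts either an absolute worker count or a percentage of the host count, and falls back to serial execution (1) when it cannot parse the value. A quick worked example of the three branches:

Gsh._build_fork_limit(4, 10)       # absolute limit          -> 4
Gsh._build_fork_limit("30%", 10)   # 30% of 10 hosts         -> 3
Gsh._build_fork_limit("lots", 10)  # unparseable, go serial  -> 1
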
Code Example #55
def uncompress_and_copy(src_bucket, src_key, dst_bucket, dst_keyprefix='',
                        concurrency=50, strip_components=0,
                        extract_dates=False):
    """Upload the contents of a tarball to the S3 bucket."""

    client = boto3.client('s3')
    tarfile_key = client.get_object(Bucket=src_bucket, Key=src_key)
    tarball_obj = tarfile_key['Body']

    # Open the tarball
    try:
        with tarfile.open(name=None, mode="r|*", fileobj=tarball_obj) as tarball:

            files_uploaded = 0

            # Parallelize the uploads so they don't take ages
            pool = Pool(concurrency)

            # Iterate over the tarball's contents.
            try:
                for member in tarball:

                    # Ignore directories, links, devices, fifos, etc.
                    if not member.isfile():
                        continue

                    # mimic the behavior of tar -x --strip-components=
                    stripped_name = member.name.split('/')[strip_components:]
                    if not bool(stripped_name):
                        continue

                    # add the date from the filename, if requested
                    if extract_dates:
                        m = re.search(r"\-(\d{4})(\d{2})(\d{2}).tar", src_key)
                        if m:
                            date_key = '-'.join([m.group(1), m.group(2), m.group(3)])
                            keyprefix = os.path.join(dst_keyprefix, date_key)
                        else:
                            log.warn("Extract_dates requested, but no date found")
                            keyprefix = dst_keyprefix
                    else:
                        keyprefix = dst_keyprefix

                    path = os.path.join(keyprefix, '/'.join(stripped_name))

                    # Read file data from the tarball
                    fd = tarball.extractfile(member)

                    # Send a job to the pool.
                    pool.wait_available()
                    pool.apply_async(__deploy_asset_to_s3, (fd.read(), member.size,
                                                            dst_bucket,
                                                            path))

                    files_uploaded += 1

                # Wait for all transfers to finish
                pool.join()

            except KeyboardInterrupt:
                # Ctrl-C pressed
                print("Cancelling upload...")
                pool.join()

            finally:
                log.info("Uploaded %i files" % (files_uploaded))

    except tarfile.ReadError:
        print("Unable to read asset tarfile", file=sys.stderr)
        return

    return {'source': os.path.join(src_bucket, src_key),
            'destination': os.path.join(dst_bucket, keyprefix),
            'files_sent': files_uploaded,
            'bytes_sent': 0}
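
__deploy_asset_to_s3 lives elsewhere in this module; judging from the call above it takes the raw file bytes, their size, the destination bucket and the object key. A hypothetical boto3-based stand-in with that argument order (compression and content-type handling omitted):

import boto3

def __deploy_asset_to_s3(data, size, bucket, key):
    # Hypothetical: upload the bytes read from the tarball to s3://bucket/key.
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=data)
    log.info("uploaded %s (%d bytes)", key, size)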