class RedisSource(Source):
    """Item source backed by a Redis list.

    Items are popped from the head of the list stored under ``redis_key`` and
    can be appended back to its tail with :meth:`push_back`.
    """

    def __init__(self, redis_host, redis_port, redis_key, **kwargs):
        """Create the Redis connection used by this source.

        :param redis_host: host handed to the Redis client.
        :param redis_port: port handed to the Redis client.
        :param redis_key: name of the Redis list that holds the items.
        :param kwargs: accepted for call compatibility; not used here.
        :raises SystemExit: when neither ``custom_redis`` nor ``redis`` can be
            imported.
        """
        # Prefer the project's custom client, fall back to redis-py.
        try:
            from custom_redis.client import Redis
        except ImportError:
            try:
                from redis import Redis
            except ImportError:
                warnings.warn(
                    "RedisSource depends on redis, try: pip install redis. ")
                # ``exit`` is only injected by the ``site`` module; raising
                # SystemExit directly has the same effect and always works.
                raise SystemExit(1)

        self.redis_key = redis_key
        self.redis_conn = Redis(redis_host, redis_port)

    async def __anext__(self):
        """Pop and return the next item from the head of the Redis list.

        Async iterators must implement this coroutine; its result is the next
        iteration value. The Redis call itself is made synchronously, and this
        method never raises ``StopAsyncIteration`` on its own.

        :return: whatever ``lpop`` returns for ``redis_key`` (with redis-py
            that is ``None`` for an empty list; presumably the same for
            ``custom_redis`` -- TODO confirm).
        """
        return self.redis_conn.lpop(self.redis_key)

    async def push_back(self, data):
        """Append ``data`` to the tail of the Redis list.

        :param data: value passed to ``rpush``.
        """
        self.redis_conn.rpush(self.redis_key, data)

    @staticmethod
    def enrich_parser(sub_parser):
        """Register the command line options of this source.

        :param sub_parser: argparse (sub)parser to add the options to.
        """
        sub_parser.add_argument("-rh", "--redis-host", default="0.0.0.0")
        # type=int keeps a port given on the command line an int, the same
        # type as the default.
        sub_parser.add_argument(
            "-rp", "--redis-port", type=int, default=6379)
        sub_parser.add_argument(
            "-rk", "--redis-key", default="download_meta")
        sub_parser.add_argument(
            "--idle", action="store_true", help="Idle... ")
# NOTE(review): a class with this same name is defined again later in this
# file; at import time the later definition replaces this one.
class MultiDownloadProcess(Logger, MultiThreadClosing):
    """Pop download tasks from a Redis list and run them on worker threads.

    A fixed pool of ``DownloaderEngine`` objects is kept in ``de_queue``; each
    task borrows one engine for the duration of its downloads. Subclasses must
    implement :meth:`decode` and :meth:`callback`.
    """

    name = "multidownload_process"

    def __init__(self, settings):
        """Set up logging, the engine queue and the Redis connection.

        :param settings: settings reference; passed to ``Logger.__init__`` and
            later to every ``DownloaderEngine``.
        """
        self.settings_file = settings
        Logger.__init__(self, settings)
        self.set_logger()
        MultiThreadClosing.__init__(self)
        self.de_queue = Queue()
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(
            self.settings.get("REDIS_HOST"), self.settings.get("REDIS_PORT"))
        self.small = False

    @classmethod
    def parse_args(cls):
        """Build an instance from the command line (``-s/--settings``).

        :return: a new instance of ``cls``.
        """
        parser = ArgumentParser()
        parser.add_argument(
            "-s", "--settings", dest="settings", default="settings.py")
        return cls(**vars(parser.parse_args()))

    def is_small(self):
        """Switch to small-file mode (uses ``download_small_file``)."""
        self.small = True

    def callback(self, item, flag):
        """Hook called after the downloads of ``item`` have completed.

        :param item: the raw value popped from Redis.
        :param flag: truthy when at least one download returned a truthy
            result.
        """
        raise NotImplementedError()

    def decode(self, item):
        """Turn a value popped from Redis into download descriptions.

        :param item: the raw value popped from Redis.
        :return: a sized sequence of ``(url, filename, directory)`` tuples.
        """
        raise NotImplementedError()

    def processing(self, de, url_paths, item):
        """Thread target: download every entry of ``url_paths`` with ``de``.

        The engine is always handed back to ``de_queue``, even when a download
        raises, so a failing task cannot drain the engine pool. ``callback`` is
        only invoked when all downloads ran without raising.

        :param de: engine borrowed from ``de_queue``.
        :param url_paths: sequence of ``(url, filename, path)`` tuples.
        :param item: the raw value popped from Redis, forwarded to
            ``callback``.
        """
        downloader = "download_small_file" if self.small else "start"
        flag = False
        try:
            t1 = time.time()
            try:
                length = len(url_paths)
                for index, (url, filename, path) in enumerate(url_paths):
                    result = getattr(de, downloader)(
                        url=url, filename=filename, path=path)
                    self.logger.info(
                        "download process %s/%s completed"
                        % (index + 1, length))
                    flag = flag or result
                t2 = time.time()
                self.logger.info(
                    "download finished, success:%s, seconds:%.4f"
                    % (flag, t2 - t1))
            finally:
                # Return the engine before the callback runs, and also when a
                # download failed.
                self.de_queue.put(de)
            self.callback(item, flag)
            self.logger.info(
                "callback finished, seconds:%.4f" % (time.time() - t2))
        except Exception:
            self.logger.error(traceback.format_exc())
        finally:
            try:
                self.threads.remove(current_thread())
            except ValueError:
                # Best effort: the thread is already gone from the list.
                pass
            self.logger.info(
                "the count of thread which alives is %s. "
                % len(self.threads))

    def start(self):
        """Fill the engine pool, then dispatch tasks until ``alive`` is false.

        After the dispatch loop ends, block until every worker thread listed
        in ``self.threads`` has finished.
        """
        self.logger.debug("start process %s" % self.name)
        concurrent_download_count = self.settings.get(
            "CONCURRENT_DOWNLOAD_COUNT", 10)
        for _ in range(concurrent_download_count):
            engine = DownloaderEngine(self.settings_file, signal_open=False)
            engine.set_logger(self.logger)
            self.de_queue.put(engine)
        self.logger.debug("setup %s des" % concurrent_download_count)

        while self.alive:
            queue_key = self.settings.get("QUEUE_KEY")
            try:
                item = self.redis_conn.lpop(queue_key)
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())
                item = None
            if not item:
                self.logger.debug("got no message...")
                time.sleep(1)
                continue

            # The queue length is only logged; a Redis failure here must not
            # discard the item that was already popped.
            try:
                self.logger.debug(
                    "%s tasks to be continue..."
                    % self.redis_conn.llen(queue_key))
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())

            try:
                url_paths = self.decode(item)
            except Exception:
                self.logger.error(traceback.format_exc())
                url_paths = []

            # Wait for a free engine, then hand the task to a new thread.
            while url_paths:
                try:
                    engine = self.de_queue.get_nowait()
                except Empty:
                    time.sleep(1)
                    continue
                th = Thread(
                    target=self.processing, args=(engine, url_paths, item))
                self.set_force_interrupt(th)
                self.logger.debug("start a new thread. ")
                th.start()
                break

        # On Python 3 ``filter`` returns an always-truthy iterator, so test
        # liveness with ``any``. Iterate over a snapshot because workers
        # remove themselves from ``self.threads`` concurrently.
        while any(th.is_alive() for th in list(self.threads)):
            time.sleep(1)
# NOTE(review): this definition shadows an earlier class of the same name in
# this file.
class MultiDownloadProcess(Logger, MultiThreadClosing):
    """Consume download tasks from Redis and execute them concurrently.

    ``de_queue`` holds a fixed number of ``DownloaderEngine`` objects; a worker
    thread takes one engine per task and gives it back when done. Concrete
    subclasses provide :meth:`decode` and :meth:`callback`.
    """

    name = "multidownload_process"

    def __init__(self, settings):
        """Initialise logging, the engine queue and the Redis client.

        :param settings: settings reference; given to ``Logger.__init__`` and
            later to each ``DownloaderEngine``.
        """
        self.settings_file = settings
        Logger.__init__(self, settings)
        self.set_logger()
        MultiThreadClosing.__init__(self)
        self.de_queue = Queue()
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(
            self.settings.get("REDIS_HOST"), self.settings.get("REDIS_PORT"))
        self.small = False

    @classmethod
    def parse_args(cls):
        """Create an instance from command line arguments (``-s/--settings``).

        :return: a new instance of ``cls``.
        """
        parser = ArgumentParser()
        parser.add_argument(
            "-s", "--settings", dest="settings", default="settings.py")
        return cls(**vars(parser.parse_args()))

    def is_small(self):
        """Enable small-file mode (downloads use ``download_small_file``)."""
        self.small = True

    def callback(self, item, flag):
        """Hook run once the downloads belonging to ``item`` are done.

        :param item: the raw value popped from Redis.
        :param flag: truthy if any single download returned a truthy result.
        """
        raise NotImplementedError()

    def decode(self, item):
        """Convert a value popped from Redis into download descriptions.

        :param item: the raw value popped from Redis.
        :return: a sized sequence of ``(url, filename, directory)`` tuples.
        """
        raise NotImplementedError()

    def processing(self, de, url_paths, item):
        """Worker body: run all downloads of one task on engine ``de``.

        The engine goes back into ``de_queue`` no matter how the downloads
        end, so an exception cannot shrink the pool. ``callback`` runs only if
        no download raised.

        :param de: engine taken from ``de_queue``.
        :param url_paths: sequence of ``(url, filename, path)`` tuples.
        :param item: the raw value popped from Redis, passed on to
            ``callback``.
        """
        if self.small:
            downloader = "download_small_file"
        else:
            downloader = "start"
        flag = False
        try:
            t1 = time.time()
            try:
                length = len(url_paths)
                for index, (url, filename, path) in enumerate(url_paths):
                    result = getattr(de, downloader)(
                        url=url, filename=filename, path=path)
                    self.logger.info(
                        "download process %s/%s completed"
                        % (index + 1, length))
                    flag = flag or result
                t2 = time.time()
                self.logger.info(
                    "download finished, success:%s, seconds:%.4f"
                    % (flag, t2 - t1))
            finally:
                # Give the engine back before the callback, and on failure.
                self.de_queue.put(de)
            self.callback(item, flag)
            self.logger.info(
                "callback finished, seconds:%.4f" % (time.time() - t2))
        except Exception:
            self.logger.error(traceback.format_exc())
        finally:
            try:
                self.threads.remove(current_thread())
            except ValueError:
                # Best effort: this thread is not (or no longer) registered.
                pass
            self.logger.info(
                "the count of thread which alives is %s. "
                % len(self.threads))

    def start(self):
        """Create the engine pool and dispatch tasks while ``alive`` is true.

        When the dispatch loop stops, wait for all worker threads still
        listed in ``self.threads`` to finish.
        """
        self.logger.debug("start process %s" % self.name)
        concurrent_download_count = self.settings.get(
            "CONCURRENT_DOWNLOAD_COUNT", 10)
        for _ in range(concurrent_download_count):
            engine = DownloaderEngine(self.settings_file, signal_open=False)
            engine.set_logger(self.logger)
            self.de_queue.put(engine)
        self.logger.debug("setup %s des" % concurrent_download_count)

        while self.alive:
            queue_key = self.settings.get("QUEUE_KEY")
            try:
                item = self.redis_conn.lpop(queue_key)
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())
                item = None
            if not item:
                self.logger.debug("got no message...")
                time.sleep(1)
                continue

            # Logging the backlog is optional; do not lose the popped item if
            # Redis fails at this point.
            try:
                self.logger.debug(
                    "%s tasks to be continue..."
                    % self.redis_conn.llen(queue_key))
            except Exception:
                self.logger.error("redis error %s" % traceback.format_exc())

            try:
                url_paths = self.decode(item)
            except Exception:
                self.logger.error(traceback.format_exc())
                url_paths = []

            # Poll for a free engine and start one thread for the task.
            while url_paths:
                try:
                    engine = self.de_queue.get_nowait()
                except Empty:
                    time.sleep(1)
                    continue
                th = Thread(
                    target=self.processing, args=(engine, url_paths, item))
                self.set_force_interrupt(th)
                self.logger.debug("start a new thread. ")
                th.start()
                break

        # ``filter`` yields an iterator on Python 3, which is always truthy;
        # ``any`` performs the intended liveness test. A snapshot is iterated
        # because workers remove themselves from ``self.threads``.
        while any(th.is_alive() for th in list(self.threads)):
            time.sleep(1)