class Communicator:
    def __init__(self, is_ps, ps_host):
        # There might be other ways to get the IP address
        server_addr = socket.gethostbyname(ps_host.split(":")[0])
        server_port = int(ps_host.split(":")[1])
        self.__is_ps = is_ps
        # the parameter server and the workers share one manager address
        self.manager = SyncManager(address=(server_addr, server_port), authkey=b'abc')
        self.__start()
        self.task = self.manager.Queue()
        self.result = self.manager.Queue()
        self.end_flag = self.manager.Queue()
        self.data_sync = self.manager.Queue()
        self.idle_gpuq = Manager().Queue()
        self.data_count = 0

    def __start(self):
        if self.__is_ps:
            self.manager.start()
        else:
            while True:
                try:
                    self.manager.connect()
                    break
                except Exception:
                    print("waiting to connect ...")
class DbndKubernetesExecutor(KubernetesExecutor):
    def __init__(self, kube_dbnd=None):
        # type: (DbndKubernetesExecutor, DbndKubernetesClient) -> None
        super(DbndKubernetesExecutor, self).__init__()

        from multiprocessing.managers import SyncManager

        self._manager = SyncManager()

        self.kube_dbnd = kube_dbnd
        _update_airflow_kube_config(
            airflow_kube_config=self.kube_config, engine_config=kube_dbnd.engine_config
        )

    def start(self):
        logger.info("Starting Kubernetes executor...")
        self._manager.start(mgr_init)

        dbnd_run = try_get_databand_run()
        if dbnd_run:
            self.worker_uuid = str(dbnd_run.run_uid)
        else:
            self.worker_uuid = (
                KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid()
            )
        self.log.debug("Start with worker_uuid: %s", self.worker_uuid)

        # always need to reset the resource version since we don't know
        # when we last started; note for the behavior below
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        # KubeResourceVersion.reset_resource_version()
        self.task_queue = self._manager.Queue()
        self.result_queue = self._manager.Queue()

        self.kube_client = self.kube_dbnd.kube_client
        self.kube_scheduler = DbndKubernetesScheduler(
            self.kube_config,
            self.task_queue,
            self.result_queue,
            self.kube_client,
            self.worker_uuid,
            kube_dbnd=self.kube_dbnd,
        )

        if self.kube_dbnd.engine_config.debug:
            self.log.setLevel(logging.DEBUG)
            self.kube_scheduler.log.setLevel(logging.DEBUG)

        self._inject_secrets()
        self.clear_not_launched_queued_tasks()
        self._flush_result_queue()

    # override - by default UpdateQuery fails with
    # sqlalchemy.exc.CompileError: Unconsumed column names: state
    # due to the model override
    # + we don't want to change task statuses - maybe they are managed by other executors
    @provide_session
    def clear_not_launched_queued_tasks(self, *args, **kwargs):
        # we don't clear kubernetes tasks from the previous run
        pass
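Several of the executors in this collection start their SyncManager with an `mgr_init` initializer that is never defined in the snippets shown here. A minimal sketch of what such an initializer plausibly looks like (an assumption, mirroring the `_mgr_init` defined in the DataSender example below): ignore SIGINT inside the manager process, so a Ctrl-C delivered to the parent cannot kill the queue server while messages are still being drained.

import signal

def mgr_init():
    # hypothetical helper: runs once inside the freshly started manager process
    signal.signal(signal.SIGINT, signal.SIG_IGN)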
def start_sync_manager():
    global gamemode_summaries
    global champion_summaries
    global requested_matches
    global request_queue
    global response_queue

    manager = SyncManager()
    manager.start()

    gamemode_summaries = manager.dict()
    champion_summaries = manager.dict()
    requested_matches = manager.list()
    request_queue = manager.Queue()
    response_queue = manager.Queue()
def prepare_experiment(env, args):
    # Manager to share PER between a learner and explorers
    SyncManager.register('PrioritizedReplayBuffer', PrioritizedReplayBuffer)
    manager = SyncManager()
    manager.start()

    kwargs = get_default_rb_dict(args.replay_buffer_size, env)
    kwargs["check_for_update"] = True
    global_rb = manager.PrioritizedReplayBuffer(**kwargs)

    # queues to share network parameters between a learner and explorers
    n_queue = 1 if args.n_env > 1 else args.n_explorer
    n_queue += 1  # for evaluation
    queues = [manager.Queue() for _ in range(n_queue)]

    # Event object to share training status. If the event is set, all explorers stop sampling transitions
    is_training_done = Event()

    # Lock
    lock = manager.Lock()

    # Shared memory object to count the number of applied gradients
    trained_steps = Value('i', 0)

    return global_rb, queues, is_training_done, lock, trained_steps
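The `SyncManager.register` call above is what lets the manager serve the custom replay buffer through a proxy. A minimal, self-contained sketch of the same register-then-proxy pattern; the `Counter` class here is hypothetical, for illustration only:

from multiprocessing.managers import SyncManager

class Counter:
    def __init__(self):
        self.value = 0
    def increment(self):
        self.value += 1
    def get(self):
        return self.value

if __name__ == '__main__':
    # register() must be called before the manager process is started
    SyncManager.register('Counter', Counter)
    manager = SyncManager()
    manager.start()
    counter = manager.Counter()  # a proxy; the real object lives in the manager process
    counter.increment()
    print(counter.get())  # -> 1
    manager.shutdown()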
def __init__(self, conf_path='config/pn_conf.yaml'):
    self.conf_path = os.path.abspath(conf_path)
    self.conf = getConf(self.conf_path, root_key='itchat')
    self.thread_id = None
    self.gid = None  # record the UserName of our group
    if self.conf['use_custom_manager']:
        # create proxy manager
        mgr = SyncManager(
            (get_lan_ip(), self.conf['custom_manager_port']),
            self.conf['custom_manager_authkey'].encode('utf8'))
        #-# sleep(0.5)  # wait for the manager to start
        mgr.connect()
    else:
        mgr = multiprocessing.Manager()
    self.q_send = mgr.Queue()
    self.event_exit = mgr.Event()
    # https://bugs.python.org/issue7503
    multiprocessing.current_process().authkey = \
        self.conf['custom_manager_authkey'].encode('utf8')
    self.proc_wx = multiprocessing.Process(target=self.run,
                                           args=(self.event_exit, self.q_send))
    self.proc_wx.start()
def function_handler(payload):
    job = SimpleNamespace(**payload)

    manager = SyncManager()
    manager.start()
    job_queue = manager.Queue()
    job_runners = []

    processes = min(job.worker_processes, len(job.call_ids))
    logger.info("Starting {} processes".format(processes))

    for runner_id in range(processes):
        p = mp.Process(target=process_runner, args=(runner_id, job_queue))
        job_runners.append(p)
        p.start()

    for call_id in job.call_ids:
        data_byte_range = job.data_byte_ranges.pop(0)
        logger.info('Going to execute job {}-{}'.format(job.job_key, call_id))
        job_queue.put((job, call_id, data_byte_range))

    for i in range(processes):
        job_queue.put(ShutdownSentinel())

    for runner in job_runners:
        runner.join()

    manager.shutdown()
def preprocess_cycles(client: InfluxDBClient, executor: Executor, manager: SyncManager, dry_run=False):
    logger.info("Preprocessing charge cycles")
    queue = manager.Queue()
    series = client.list_series("samples")
    futures = []
    # TODO merge results of different detectors
    for attr, where, detector in [
        ('charger_acvoltage', 'charger_acvoltage>0 OR veh_speed > 0',
         ChargeCycleACVoltageDetection(time_epoch=client.time_epoch)),
        ('ischarging', 'ischarging>0 OR veh_speed > 0',
         ChargeCycleIsChargingDetection(time_epoch=client.time_epoch)),
        ('ac_hvpower', 'ac_hvpower>0 OR veh_speed > 0',
         ChargeCycleACHVPowerDetection(time_epoch=client.time_epoch)),
        ('hvbatt_soc', 'hvbatt_soc<200',
         ChargeCycleDerivDetection(time_epoch=client.time_epoch))
    ]:
        fields = ["time", "participant", "hvbatt_soc", "veh_speed"]
        if attr not in fields:
            fields.append(attr)
        futures += [executor.submit(preprocess_cycle, nr, client, queue, sname,
                                    join_selectors([sselector, where]), fields,
                                    detector, dry_run)
                    for nr, (sname, sselector) in enumerate(series)]

    logger.debug("Tasks started, waiting for results...")
    async_progress(futures, queue)
    data = [f.result() for f in futures]
    logger.debug("Tasks done")
    data.sort(key=lambda a: a[0:1])
    logger.info(__("Detected charge cycles:\n{}",
                   tabulate(data, headers=["attr", "#", "cycles", "cycles_disc"])))
def __init__(self, conf_path='config/pn_conf.yaml'):
    # input param
    self.conf_path = conf_path
    self.conf = getConf(self.conf_path, root_key='audio')
    if self.conf['target'] == 'pi':
        self.t2s = Text2SpeechBaidu(self.conf_path)  # sync
    else:
        self.t2s = Text2SpeechXunFei(self.conf_path)  # sync
    self.executor_t2s = concurrent.futures.ProcessPoolExecutor(2)  # async
    if self.conf['use_custom_manager']:
        # create proxy manager
        mgr = SyncManager(
            (get_lan_ip(), self.conf['custom_manager_port']),
            self.conf['custom_manager_authkey'].encode('utf8'))
        sleep(0.5)  # wait for the manager to start
        mgr.connect()
    else:
        mgr = multiprocessing.Manager()
    self.q_audio = mgr.Queue()
    #-# debug('audio data queue created. %s', self.q_audio)
    self.event_exit = mgr.Event()
    # https://bugs.python.org/issue7503
    multiprocessing.current_process().authkey = \
        self.conf['custom_manager_authkey'].encode('utf8')
    self.proc_play = multiprocessing.Process(target=self.playAudioFromQ,
                                             args=(self.q_audio, self.event_exit))
    self.proc_play.start()
    #-# debug('play background proc start. %s', self.proc_play)
    # warm up the process pool workers; workers created early seem to use less memory
    self.executor_t2s.map(noop_func, (None, None))
def function_handler(payload):
    job = SimpleNamespace(**payload)
    setup_lithops_logger(job.log_level)

    processes = min(job.worker_processes, len(job.call_ids))
    logger.info('Tasks received: {} - Concurrent processes: {}'.format(
        len(job.call_ids), processes))

    env = job.extra_env
    env['LITHOPS_WORKER'] = 'True'
    env['PYTHONUNBUFFERED'] = 'True'
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)

    job.func = get_function_and_modules(job, internal_storage)
    job_data = get_function_data(job, internal_storage)

    if processes == 1:
        job_queue = queue.Queue()
        for call_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, call_id, data))
        job_queue.put(ShutdownSentinel())
        process_runner(job_queue)
    else:
        manager = SyncManager()
        manager.start()
        job_queue = manager.Queue()
        job_runners = []

        for call_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, call_id, data))

        for i in range(processes):
            job_queue.put(ShutdownSentinel())

        for runner_id in range(processes):
            p = mp.Process(target=process_runner, args=(job_queue,))
            job_runners.append(p)
            p.start()
            logger.info('Worker process {} started'.format(runner_id))

        for runner in job_runners:
            runner.join()

        manager.shutdown()

    # Delete modules path from syspath
    module_path = os.path.join(MODULES_DIR, job.job_key)
    if module_path in sys.path:
        sys.path.remove(module_path)

    # Unset job-specific env vars
    for key in job.extra_env:
        os.environ.pop(key, None)
    os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)
def preprocess_trips(client: InfluxDBClient, executor: Executor, manager: SyncManager, dry_run=False):
    logger.info("Preprocessing trips")
    queue = manager.Queue()
    series = client.list_series("samples")
    futures = [executor.submit(preprocess_trip, nr, client, queue, sname, sselector, dry_run)
               for nr, (sname, sselector) in enumerate(series)]

    logger.debug("Tasks started, waiting for results...")
    async_progress(futures, queue)
    data = [f.result() for f in futures]
    logger.debug("Tasks done")
    data.sort(key=lambda a: a[0])
    logger.info(__("Detected trips:\n{}",
                   tabulate(data, headers=["#", "cycles", "cycles_disc"])))
def test_run_keyboard_interrupt(sync_manager: SyncManager) -> None:
    """Function: run should stop on keyboard interrupt."""
    queue_process_id = sync_manager.Queue()
    replier = Replier(sync_manager.dict(), queue_process_id)
    loop = asyncio.new_event_loop()
    with ProcessPoolExecutor() as executor:
        future = cast(
            "Future[Any]",
            loop.run_in_executor(executor, run, replier, None, keyboard_interrupt))
        queue_process_id.get()
        assert not future.get_loop().is_running()
        assert not future.done()
def __init__(self):
    mgr = SyncManager()
    # start the manager process with an initializer that ignores SIGINT,
    # so a Ctrl-C in the parent does not kill the shared-state server
    mgr.start(signal.signal, (signal.SIGINT, signal.SIG_IGN))
    self.ns_default = mgr.Namespace()
    self.ns_default.error = None
    self.ns_stats = mgr.Namespace()
    self.input_queue = mgr.Queue(maxsize=100)
    self.error_occurred = mgr.Event()
    self.error_processed = mgr.Event()
    self.batch_done = mgr.Event()
    self.mgr = mgr
    self.stats_lock = mgr.Lock()
    self.main_lock = mgr.Lock()
def get_server_queue():
    #FIXME: some OSX users were getting "Can't assign requested address" errors
    # if we use socket.gethostname() for the address. Changing it to
    # 'localhost' seems to fix the issue, but I don't know why. We had to
    # use socket.gethostname() in order to get our benchmark tests to run
    # using qsub on a linux cluster, so with this 'fix', testflo benchmark tests
    # will likely not work on a cluster of OSX machines.
    if sys.platform == 'darwin':
        addr = 'localhost'
    else:
        addr = socket.gethostname()

    manager = SyncManager(address=(addr, 0), authkey=_testflo_authkey)
    manager.start()

    return manager, manager.Queue()
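Queue proxies returned by a SyncManager are picklable, so the usual way to consume the pair returned here (an assumption about the calling code, which is not shown) is to hand the queue proxy straight to worker processes:

import multiprocessing

def worker(q):
    # the proxy reconnects to the manager server from inside the child process
    q.put("hello from %s" % multiprocessing.current_process().name)

if __name__ == '__main__':
    manager, q = get_server_queue()
    p = multiprocessing.Process(target=worker, args=(q,))
    p.start()
    p.join()
    print(q.get())
    manager.shutdown()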
def function_handler(payload):
    job = SimpleNamespace(**payload)
    processes = min(job.worker_processes, len(job.call_ids))

    logger.info('Tasks received: {} - Concurrent workers: {}'.format(
        len(job.call_ids), processes))

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)
    job.func = get_function_and_modules(job, internal_storage)
    job_data = get_function_data(job, internal_storage)

    if processes == 1:
        job_queue = queue.Queue()
        for task_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, task_id, data))
        job_queue.put(ShutdownSentinel())
        process_runner(job_queue, internal_storage)
    else:
        manager = SyncManager()
        manager.start()
        job_queue = manager.Queue()
        job_runners = []

        for runner_id in range(processes):
            p = mp.Process(target=process_runner, args=(job_queue, internal_storage))
            job_runners.append(p)
            p.start()
            logger.info('Worker process {} started'.format(runner_id))

        for call_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, call_id, data))

        for i in range(processes):
            job_queue.put(ShutdownSentinel())

        for runner in job_runners:
            runner.join()

        manager.shutdown()

    # Delete modules path from syspath
    module_path = os.path.join(MODULES_DIR, job.job_key)
    if module_path in sys.path:
        sys.path.remove(module_path)
class ClassWithManager(object):
    def __init__(self):
        self._manager = SyncManager()
        self._manager.start(mgr_init)
        self.task_queue = self._manager.Queue()

    def run_and_sleep(self, sleep_time):
        try:
            time.sleep(sleep_time)
        finally:
            self.end()

    def end(self):
        logging.error(
            "Executor shutting down, task_queue approximate size=%d",
            self.task_queue.qsize(),
        )
def run(self):
    logging.basicConfig(format=LOG_FORMAT, filename=LOG_FILE, filemode='w',
                        level=logging.INFO)

    SyncManager.register('SessionManager', SessionManager)
    SyncManager.register('DBManager', DBManager)
    manager = SyncManager()
    manager.start()

    db_manager = manager.DBManager()
    odometer_value = manager.Value(c_int, 0)
    vin = manager.Queue(1)
    session_manager = manager.SessionManager(odometer_value)

    can_manager = CanManager(session_manager, odometer_value, vin)
    nfc_manager = NFCManager(session_manager, db_manager)
    upload_manager = UploadManager(session_manager, db_manager)

    signal.pause()
def run_generator_in_process(gen, *args, **kwargs):
    import multiprocessing
    from multiprocessing.managers import SyncManager

    manager = SyncManager()
    manager.start()
    q = manager.Queue(maxsize=1024)
    process = multiprocessing.Process(target=generate_to_queue,
                                      args=(gen, args, kwargs, q))
    process.start()
    while True:
        info, i = q.get()
        if info == StopIteration:
            break
        elif info == "exception":
            raise i
        else:
            assert info == "item"
            print("qsize", q.qsize())
            yield i
    manager.shutdown()
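A hedged usage sketch: `generate_to_queue` (not shown in this snippet) is assumed to put ("item", value) tuples on the queue, ("exception", exc) on error, and (StopIteration, None) when the generator is exhausted.

def slow_squares(n):
    # a trivial generator; under the spawn start method it must be importable
    for i in range(n):
        yield i * i

for value in run_generator_in_process(slow_squares, 5):
    print(value)  # 0, 1, 4, 9, 16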
class DataSender:
    def __init__(self):
        self.driveup = driveUp.DriveUp('client.json')
        self.running = True
        self._manager = SyncManager()

    def start(self):
        self.driveup.authenticate()
        self._manager.start(self._mgr_init)
        self._que = self._manager.Queue()
        self._process = Process(target=self.up, args=(self._que,))
        self._process.start()

    def _mgr_init(self):
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        print("initialized manager")

    def up(self, que):
        def stop(val, val2):
            print("process SIGINT stopping")
            self.running = False

        signal.signal(signal.SIGINT, stop)
        print('datauploader started')
        while self.running or not que.empty():
            item = que.get(True)
            print("handling item={0}".format(item))
            self.driveup.upload(item)
            que.task_done()
            time.sleep(2)
        print("datauploader process terminating...")

    def send(self, data):
        self._que.put(data)

    def stop(self):
        print("shutting down sender")
        self.running = False
        self._que.join()
        self._process.terminate()
def apply(func, args):
    from queue import Empty
    from multiprocessing import Process
    from multiprocessing.managers import SyncManager
    import tempfile
    import os

    # a Unix-socket manager address cannot live on AFS, so fall back to the
    # default address when the working directory is on AFS
    if os.getcwd()[:4] != '/afs':
        try:
            m = SyncManager(address=tempfile.mktemp(prefix='dqu_subprocess-',
                                                    dir=os.getcwd()))
            m.start()
        except EOFError:
            m = SyncManager()
            m.start()
    else:
        m = SyncManager()
        m.start()

    q = m.Queue()
    p = Process(target=_local_apply_core, args=(func, args, q))
    p.start()
    p.join()
    print('Manager socket is', m.address)
    try:
        rv = q.get(False)
    except Empty:
        raise RuntimeError('daughter died while trying to execute %s%s'
                           % (func.__name__, args))
    if isinstance(rv, BaseException):
        if isinstance(rv, SystemExit):
            print('SystemExit raised by daughter; ignoring')
            return None
        else:
            raise rv
    m.shutdown()
    return rv
def import_data():
    logger.info("Start log file import")
    parsers = [V1Parser, V2Parser, V3Parser]
    logger.info(
        __("Using parser version {version}", version=arguments["--version"]))
    csv_parser = parsers[int(arguments["--version"]) - 1]

    if arguments["FILE"] is not None:
        file_path = arguments["FILE"]
        if not os.path.isabs(file_path):
            file_path = os.path.join(os.getcwd(), file_path)
        directory_path = os.path.dirname(file_path)
        directory = Directory(os.path.basename(directory_path), directory_path)
        file = os.path.basename(file_path)
        logger.debug(
            __("directory: {dir}, file: {file}", dir=directory, file=file))
        _execute_import(csv_parser(), directory, file=file)
    else:
        directories = FileSystemAccess(logger).get_directories(
            config["webike.imei_regex"])
        manager = SyncManager()
        manager.start()
        queue = manager.Queue()
        with ProcessPoolExecutor(max_workers=14) as executor:
            futures = [
                executor.submit(_execute_import, csv_parser(), directory, queue)
                for directory in directories
            ]
            async_progress(futures, queue, delay=10)
    logger.info("Import complete")
def replier(sync_manager: SyncManager) -> Generator[Replier, None, None]:
    yield Replier(sync_manager.dict(), sync_manager.Queue())
def run(self, shuffle=False):
    """
    Runs all experiments. Blocks until all experiments are finished.
    """
    # Set up multiprocessing logging
    manager = SyncManager()
    manager.start(lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
    result_log_queue = manager.Queue()
    self.cancel_experiments = manager.Value('b', 0)
    self.interrupt_condition = manager.Condition()
    listener = multiprocessing.Process(
        target=result_log_listener,
        args=(result_log_queue, setup_result_logger, self.result_log_name,))
    listener.start()

    # Set up callback throttling
    result_lock = Lock()
    callback_lock = Lock()
    self.callback_disabled = False

    def call_callback(experiment_id=None, pause=0):
        with callback_lock:
            with result_lock:
                if pause:
                    output_status(prefix='C')
                self.last_callback = datetime.now()
                self.save_results()
                sys.stdout.write('Results saved. ')
                sys.stdout.flush()
            if not self.callback_disabled:
                sys.stdout.write('Digesting results ... ')
                sys.stdout.flush()
                try:
                    self.update_callback(experiment_id)
                except Exception as ex:  # pylint: disable=W
                    sys.stdout.write('errored with {}\n\n\n'.format(ex.__class__))
                    self.callback_disabled = True
                else:
                    sys.stdout.write('done\n')
            else:
                sys.stdout.write('Digestion disabled, due to previous exception.\n')
            self.last_callback = datetime.now()

    # set up the process pool
    self.jobs_total = len(self.experiments)
    start_time = datetime.now()
    print("Using up to %i CPUs %s" % (
        self.cpu_limit,
        'with numpy multi-threading disabled'
        if os.environ.get('OMP_NUM_THREADS', None) == '1' else ''))
    with multiprocessing.Pool(self.cpu_limit) as pool:
        # print status function
        def output_status(prefix='F'):
            progress = self.jobs_finished / self.jobs_total
            elapsed_time = datetime.now() - start_time
            errors = "" if self.jobs_errored == 0 else " %i ERRORED, " % self.jobs_errored
            sys.stdout.write(
                ("%s %s: %i jobs, %i finished, %i queued," + errors + " %.0f%%, ~remaining: %s\n") % (
                    prefix,
                    datetime.now().strftime('%c'),
                    self.jobs_total,
                    self.jobs_finished,
                    self.jobs_total - self.jobs_finished,
                    progress * 100,
                    timedelta(seconds=(elapsed_time * (1 - progress) / progress).total_seconds() // 15 * 15)
                    if progress > 0 else '???',
                )
            )

        # define callbacks; they are run within the main process, but in separate threads
        def update_status(result=None):
            from pandas import DataFrame
            with result_lock:
                self.jobs_finished += 1
                output_status()
                if not result:
                    return
                row = {}
                experiment = self.experiments[result.experiment_id]
                row.update({
                    'experiment_id': result.experiment_id,
                    'experiment_hash': experiment.hash,
                    'experiment': experiment.__class__.__name__,
                })
                row.update(experiment.parameters._asdict())
                row.update(result._asdict())
                self.results = self.results.append(DataFrame([row]), sort=True)

            # If there is already a callback waiting, we will replace it and therefore cancel it
            if self.next_callback and self.next_callback.is_alive():
                sleep(0)  # let other threads run first
                if callback_lock.acquire(blocking=False):
                    self.next_callback.cancel()
                    callback_lock.release()
                else:
                    # the callback is currently waiting for the result_lock
                    return

            # Schedule the callback either immediately (0) or after the pause expired
            pause = max(0.0, self.update_callback_min_pause
                        - (datetime.now() - self.last_callback).total_seconds())
            self.next_callback = Timer(
                pause,
                call_callback,
                args=[result.experiment_id, pause],
            )
            self.next_callback.start()

        def update_status_error(exception):
            if isinstance(exception, ExperimentCanceledException):
                return
            print('Experiment exception: ', exception, file=sys.stderr)
            traceback.print_exception(type(exception), exception,
                                      exception.__traceback__, file=sys.stderr)
            self.jobs_errored += 1
            self.exceptions.append(exception)
            update_status()

        # randomize order
        experiments = list(self.experiments.values())
        if shuffle:
            random.seed(0xdeadbeef)
            random.shuffle(experiments)

        # filter loaded experiments
        if not self.results.empty:
            known_hashes = [ex.hash for ex in experiments]
            len_before = len(experiments)
            loaded_experiment_hashes = self.results.loc[:, ['experiment_hash']].values[:, 0]
            experiments = [ex for ex in experiments if ex.hash not in loaded_experiment_hashes]
            if loaded_experiment_hashes.size:
                print('Continuing from %s' % self.results_file)
            self.jobs_finished = len_before - len(experiments)

            # check for experiments with results that we don't know
            unknown_experiments = self.results.loc[~self.results['experiment_hash'].isin(known_hashes)]
            if not unknown_experiments.empty:
                print('@' * 80)
                print('Results file %s contains %i results that are not in the study\'s' %
                      (self.results_file, len(unknown_experiments)))
                print('experiment definition. Did you delete experiments from your study?')
                print('@' * 80)

        # experiment execution
        for experiment in experiments:
            # Assign the experiment to a GPU (if used); might be replaced by a more sophisticated load balancer
            if self.gpu_limit > 0:
                gpu_num = self.gpu_counter % self.gpu_limit
                experiment.assign_to_gpu(gpu_num)
                self.gpu_counter += 1

            # Add the experiment to the execution queue
            pool.apply_async(
                experiment.execute,
                (result_log_queue, self.result_log_name,
                 self.cancel_experiments, self.interrupt_condition),
                callback=update_status,
                error_callback=update_status_error,
            )

        def signal_handler(_sig, _frame):
            self.num_int += 1
            if self.num_int > 1:
                print("Killing all processes.")
                sys.exit(1)
            print("\rPerforming graceful shutdown... "
                  "(Press CTRL-C again to force. This might result in data loss.)")
            with self.interrupt_condition:
                self.cancel_experiments.value = 1
                self.interrupt_condition.notify_all()

        signal.signal(signal.SIGINT, signal_handler)

        # show status, then block until we're ready
        output_status()
        pool.close()
        pool.join()

    if self.next_callback and self.next_callback.is_alive():
        with callback_lock:
            self.next_callback.cancel()
    call_callback()

    # quit logger
    result_log_queue.put(None)  # trigger the listener to quit
    listener.join()

    # check if we got any exceptions as results
    if self.exceptions:
        raise FailedExperimentsException(self.exceptions)
class DbndKubernetesScheduler(AirflowKubernetesScheduler):
    """
    Very serious override of AirflowKubernetesScheduler
    1. better visibility on errors, so we process failures with much more info
    2. tracking of all events around "airflow run" -> pod crashes, pod submission errors
       a. in case of a crash (OOM, evicted pod) -> error propagation to databand and retry
    """

    def __init__(
        self, kube_config, task_queue, result_queue, kube_client, worker_uuid, kube_dbnd
    ):
        super(DbndKubernetesScheduler, self).__init__(
            kube_config, task_queue, result_queue, kube_client, worker_uuid
        )
        self.kube_dbnd = kube_dbnd

        # PATCH watcher communication manager
        # we want to wait for stop, instead of "exit" in place, so we can get all not-yet-received messages
        from multiprocessing.managers import SyncManager

        # TODO: why can't we use the original SyncManager?
        # Scheduler <-> (via _manager) KubeWatcher
        # if _manager dies in place, we will not get any "info" from KubeWatcher until shutdown
        self._manager = SyncManager()
        self._manager.start(mgr_init)

        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()

        # pod to airflow key (dag_id, task_id, execution_date)
        self.submitted_pods = {}  # type: Dict[str, SubmittedPodState]

        # sending data to the databand tracker
        self.metrics_logger = KubernetesMetricsLogger()

        # disappeared pods mechanism
        self.last_disappeared_pods = {}
        self.current_iteration = 1
        # add `k8s-scheduler:` prefix to all log messages
        self._log = PrefixLoggerAdapter("k8s-scheduler", self.log)

    def _make_kube_watcher(self):
        # prevent storing the kubernetes resource version in the db, because the kubernetes db model
        # only stores a single value of the resource version, while we need to store a separate value
        # for every kubernetes executor (because even in a basic flow we can have two Kubernetes
        # executors running at once: the one that launched the driver and the one inside the driver).
        #
        # the resource version is the position inside the event stream of the kubernetes cluster and
        # is used by the watcher to poll Kubernetes for events. It's probably fine not to store this,
        # because by default Kubernetes will return "the events currently in cache"
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        return None

    def _make_kube_watcher_dbnd(self):
        watcher = DbndKubernetesJobWatcher(**get_job_watcher_kwargs(self))
        watcher.start()
        return watcher

    @staticmethod
    def _create_pod_id(dag_id, task_id):
        task_run = try_get_databand_run().get_task_run(task_id)
        return task_run.job_id__dns1123

    def _health_check_kube_watcher(self):
        if self.kube_watcher.is_alive():
            pass
        else:
            self.log.error(
                "Error while health checking kube watcher process. "
                "Process died for unknown reasons"
            )
            self.kube_watcher = self._make_kube_watcher_dbnd()

    def run_next(self, next_job):
        """
        The run_next command will check the task_queue for any un-run jobs.
        It will then create a unique job-id, launch that job in the cluster,
        and store relevant info in the current_jobs map so we can track the job's status
        """
        key, command, kube_executor_config = next_job
        dag_id, task_id, execution_date, try_number = key
        self.log.debug(
            "Kube POD to submit: image=%s with %s",
            self.kube_config.kube_image,
            str(next_job),
        )

        dr = try_get_databand_run()
        task_run = dr.get_task_run_by_af_id(task_id)
        pod_command = [str(c) for c in command]
        task_engine = task_run.task_engine  # type: KubernetesEngineConfig
        pod = task_engine.build_pod(
            task_run=task_run,
            cmds=pod_command,
            labels={
                "airflow-worker": self.worker_uuid,
                "dag_id": make_safe_label_value(dag_id),
                "task_id": make_safe_label_value(task_run.task_af_id),
                "execution_date": self._datetime_to_label_safe_datestring(
                    execution_date
                ),
                "try_number": str(try_number),
            },
            try_number=try_number,
            include_system_secrets=True,
        )

        pod_ctrl = self.kube_dbnd.get_pod_ctrl_for_pod(pod)
        self.submitted_pods[pod.name] = SubmittedPodState(
            pod_name=pod.name,
            task_run=task_run,
            scheduler_key=key,
            submitted_at=utcnow(),
        )

        pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
        self.metrics_logger.log_pod_submitted(task_run.task, pod_name=pod.name)

    # in airflow>1.10.10 the delete_pod method takes an additional "namespace" arg.
    # we do not use it in our overridden method, but we still need to adjust the
    # method signature to avoid errors when we run the code on airflow>1.10.10.
    def delete_pod(self, pod_id, *args):
        # we are going to delete the pod only once.
        # the moment it's removed from submitted_pods, we will neither handle its events nor delete it again
        submitted_pod = self.submitted_pods.pop(pod_id, None)
        if not submitted_pod:
            return

        try:
            self.metrics_logger.log_pod_finished(submitted_pod.task_run.task)
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            self.log.exception(
                "%s failed to save pod finish info: pod_name=%s.!",
                submitted_pod.task_run,
                pod_id,
            )

        try:
            result = self.kube_dbnd.delete_pod(pod_id, self.namespace)
            return result
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            self.log.exception(
                "%s: Exception raised when trying to delete pod: pod_name=%s.",
                submitted_pod.task_run,
                pod_id,
            )

    def terminate(self):
        # we kill the watcher and the communication channel first;
        # prevents a watcher bug of being stuck on termination during event processing
        try:
            self.kube_watcher.safe_terminate()
            super(DbndKubernetesScheduler, self).terminate()
        finally:
            self._terminate_all_running_pods()

    def _terminate_all_running_pods(self):
        """
        Clean up all running pods on terminate.
        """
        # now we need to clean up after the run
        pods_to_delete = sorted(list(self.submitted_pods.values()))
        if not pods_to_delete:
            return

        self.log.info(
            "Terminating run, deleting all %d submitted pods that are still running/not finalized",
            len(pods_to_delete),
        )
        for submitted_pod in pods_to_delete:
            try:
                self.delete_pod(submitted_pod.pod_name)
            except Exception:
                self.log.exception("Failed to terminate pod %s", submitted_pod.pod_name)

        # Wait for pods to be deleted and execute their own state management
        self.log.info(
            "Setting all running/not finalized pods to cancelled in 10 seconds..."
        )
        time.sleep(10)
        try:
            for submitted_pod in pods_to_delete:
                task_run = submitted_pod.task_run
                if task_run.task_run_state in TaskRunState.final_states():
                    self.log.info(
                        "%s with pod %s was %s, skipping",
                        task_run,
                        submitted_pod.pod_name,
                        task_run.task_run_state,
                    )
                    continue
                task_run.set_task_run_state(TaskRunState.CANCELLED)
        except Exception:
            self.log.exception("Could not set pods to cancelled!")

    def process_watcher_task(self, task):
        """Process the task event sent by the watcher."""
        pod_id, state, labels, resource_version = task
        pod_name = pod_id
        self.log.debug(
            "Attempting to process pod; pod_name: %s; state: %s; labels: %s",
            pod_id,
            state,
            labels,
        )

        submitted_pod = self.submitted_pods.get(pod_name)
        if submitted_pod is None:
            # this is a deleted pod - on delete the watcher will send an event
            # 1. deleted by the scheduler - we skip here
            # 2. external delete -> we continue to process the event
            return

        # DBND-AIRFLOW we have it precached, we don't need to go to the DB
        # key = self._labels_to_key(labels=labels)
        # if not key:
        #     self.log.info(
        #         "Can't find a key for event from %s - %s from labels %s, skipping",
        #         pod_name,
        #         state,
        #         labels,
        #     )
        #     return

        # we are not looking for the key
        task_run = submitted_pod.task_run
        key = submitted_pod.scheduler_key

        if submitted_pod.processed:
            # we already processed this kind of event, as in this process we have a failed status already
            self.log.info(
                "%s: Skipping pod '%s' event - already processed", state, pod_name
            )
            return

        if state == State.RUNNING:
            # we should get here only once -> when the pod starts to run
            self._process_pod_running(submitted_pod)
            # we will not send the event to the executor (otherwise it will delete the running pod)
            return

        try:
            if state is None:
                # simple case, pod succeeded - will be processed by the airflow main scheduler (Job).
                # the task can be failed or passed: airflow exits with 0 even if the task failed the regular way.
                self._process_pod_success(submitted_pod)
                self.result_queue.put((key, state, pod_name, resource_version))
            elif state == State.FAILED:
                # Pod crash: it was deleted, killed, evicted... we need to give it extra treatment
                self._process_pod_failed(submitted_pod)
                self.result_queue.put((key, state, pod_id, resource_version))
            else:
                self.log.debug("finishing job %s - %s (%s)", key, state, pod_id)
                self.result_queue.put((key, state, pod_id, resource_version))
        finally:
            submitted_pod.processed = True

    def _process_pod_running(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.node_name:
            self.log.info(
                "%s: Zombie bug: Seeing pod event again. "
                "Probably something is happening with the pod and its node: %s",
                submitted_pod.task_run,
                submitted_pod.pod_name,
            )
            return

        pod_data = self.get_pod_status(pod_name)
        if not pod_data or not pod_data.spec.node_name:
            self.log.error("Failed to find pod data for %s", pod_name)
            node_name = "failed_to_find"
        else:
            node_name = pod_data.spec.node_name

        self.metrics_logger.log_pod_running(task_run.task, node_name=node_name)
        submitted_pod.node_name = node_name
        task_run.set_task_run_state(TaskRunState.RUNNING, track=False)

    def _process_pod_success(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.processed:
            self.log.info(
                "Skipping pod 'success' event from %s: already processed", pod_name
            )
            return
        ti = get_airflow_task_instance(task_run=task_run)

        # we print a success message to the screen
        # we will not send it to the databand tracking store
        if ti.state == State.SUCCESS:
            dbnd_state = TaskRunState.SUCCESS
        elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
            dbnd_state = TaskRunState.UP_FOR_RETRY
        elif ti.state in {State.FAILED, State.SHUTDOWN}:
            dbnd_state = TaskRunState.FAILED
        else:
            # we got a corruption here:
            error_msg = (
                "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job."
                % (pod_name, ti.state)
            )
            error_help = "Please check pod logs/eviction retry"
            task_run_error = TaskRunError.build_from_message(
                task_run, error_msg, help_msg=error_help
            )
            self._handle_crashed_task_instance(
                failure_reason=PodFailureReason.err_pod_evicted,
                task_run_error=task_run_error,
                task_run=task_run,
            )
            return

        task_run.set_task_run_state(dbnd_state, track=False)
        self.log.info(
            "%s has been completed at pod '%s' with state %s try_number=%s!"
            % (task_run, pod_name, ti.state, ti._try_number)
        )

    def _process_pod_failed(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        task_id = task_run.task_af_id
        ti_state = get_airflow_task_instance_state(task_run=task_run)

        self.log.info(
            "%s: pod %s has crashed, airflow state: %s", task_run, pod_name, ti_state
        )

        pod_data = self.get_pod_status(pod_name)
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(pod_name, self.namespace)

        pod_logs = []
        if pod_data:
            pod_status_log = _get_status_log_safe(pod_data)
            pod_phase = pod_data.status.phase
            if pod_phase != "Pending":
                pod_logs = pod_ctrl.get_pod_logs()
        else:
            pod_status_log = "POD NOT FOUND"

        error_msg = "Pod %s at %s has failed (task state=%s)!" % (
            pod_name,
            self.namespace,
            ti_state,
        )
        failure_reason, failure_message = self._find_pod_failure_reason(
            task_run=task_run, pod_data=pod_data, pod_name=pod_name
        )
        if failure_reason:
            error_msg += " Found reason for failure: %s - %s." % (
                failure_reason,
                failure_message,
            )
        error_help_msg = "Please see the full pod log for more details."
        if pod_logs:
            error_help_msg += "\nPod logs:\n%s\n" % "\n".join(
                ["out: %s" % l for l in pod_logs[-20:]]
            )

        from dbnd._core.task_run.task_run_error import TaskRunError

        task_run_error = TaskRunError.build_from_message(
            task_run=task_run, msg=error_msg, help_msg=error_help_msg
        )

        if ti_state == State.FAILED:
            # The pod has failed; however, Airflow managed to update the state,
            # which means all code (including dbnd) was executed.
            # let's just notify the error, so we can show it in the summary;
            # we will not send it to the databand tracking store
            task_run.set_task_run_state(
                TaskRunState.FAILED, track=False, error=task_run_error
            )
            self.log.info(
                "%s",
                task_run.task.ctrl.banner(
                    "Task %s(%s) - pod %s has failed, airflow state=Failed!"
                    % (task_run.task.task_name, task_id, pod_name),
                    color="red",
                    task_run=task_run,
                ),
            )
            return True

        # we got State.FAILED from the watcher, but the airflow instance in the DB is in a different state.
        # that means the task has failed in the middle
        # (all kinds of errors and exit codes)
        task_run_log = error_msg
        task_run_log += pod_status_log
        if pod_logs:
            # let's upload its logs - we don't know what happened
            task_run_log += "\nPod logs:\n\n%s\n\n" % "\n".join(pod_logs)
        task_run.tracker.save_task_run_log(task_run_log)

        self._handle_crashed_task_instance(
            task_run=task_run,
            task_run_error=task_run_error,
            failure_reason=failure_reason,
        )

    def _find_pod_failure_reason(self, task_run, pod_name, pod_data):
        if not pod_data:
            return (
                PodFailureReason.err_pod_deleted,
                "Pod %s has probably been deleted (can not be found)" % pod_name,
            )

        pod_phase = pod_data.status.phase
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(name=pod_name)

        if pod_phase == "Pending":
            self.log.info(
                "Got pod %s in Pending state which is failing: looking for the reason...",
                pod_name,
            )
            try:
                pod_ctrl.check_deploy_errors(pod_data)
            except KubernetesImageNotFoundError as ex:
                return PodFailureReason.err_image_pull, str(ex)
            except Exception:
                pass
            return None, None

        if pod_data.metadata.deletion_timestamp:
            return (
                PodFailureReason.err_pod_deleted,
                "Pod %s has been deleted at %s"
                % (pod_name, pod_data.metadata.deletion_timestamp),
            )

        pod_exit_code = _try_get_pod_exit_code(pod_data)
        if pod_exit_code:
            self.log.info("Found pod exit code %d for pod %s", pod_exit_code, pod_name)
            pod_exit_code = str(pod_exit_code)
            return pod_exit_code, "Pod exit code %s" % pod_exit_code
        return None, None

    @provide_session
    def _handle_crashed_task_instance(
        self, task_run, task_run_error, failure_reason, session=None
    ):
        task_instance = get_airflow_task_instance(task_run, session=session)
        task_instance.task = task_run.task.ctrl.airflow_op

        retry_config = self.kube_dbnd.engine_config.pod_retry_config
        retry_count = retry_config.get_retry_count(failure_reason)
        if retry_count is not None:
            # update retry for the latest values (we don't have ...)
            task_run.task.task_retries = retry_count
            task_instance.task.retries = retry_count
            task_instance.max_tries = retry_count

        self.log.info(
            "Retry %s task: max_retries=%s, task.retries=%s, current:%s state:%s",
            task_run,
            task_instance.max_tries,
            task_instance.task.retries,
            task_instance._try_number,
            task_instance.state,
        )
        # retry condition: self.task.retries and self.try_number <= self.max_tries
        increase_try_number = False

        if task_instance.state == State.QUEUED:
            # Special case - no airflow code has been run in the pod at all.
            # usually try_number is incremented the moment the state moves to Running,
            # and while in the Running state it keeps the same value.
            # Must increment the try number here,
            task_instance._try_number += 1
            session.merge(task_instance)
            session.commit()

        task_instance.handle_failure(str(task_run_error.exception), session=session)

        if task_instance.state == State.UP_FOR_RETRY:
            task_run.set_task_run_state(
                TaskRunState.UP_FOR_RETRY, track=True, error=task_run_error
            )
        else:
            task_run.set_task_run_state(
                TaskRunState.FAILED, track=True, error=task_run_error
            )

    def get_pod_status(self, pod_name):
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(name=pod_name)
        return pod_ctrl.get_pod_status_v1()
FooManager.register('bar', bar)

if __name__ == '__main__':
    mgr = FooManager()
    mgr.start()
    res = mgr.bar()
    print(res._getvalue())
    print(str(res)[0])
    print(type(res))
    sys.exit(0)

    qm = SyncManager()
    qm.start()
    mgr = FooManager()
    mgr.start()
    results = []
    for _i in range(1):
        q = qm.Queue()
        res = mgr.foo(3, q)
        print(res)
        print(type(res))
        results.append(q.get())
        print('=' * 80)
        sleep(3)
    sleep(5)
    print('Done.')
class DbndKubernetesScheduler(AirflowKubernetesScheduler):
    def __init__(self, kube_config, task_queue, result_queue, kube_client,
                 worker_uuid, kube_dbnd):
        super(DbndKubernetesScheduler, self).__init__(kube_config, task_queue,
                                                      result_queue, kube_client,
                                                      worker_uuid)
        self.kube_dbnd = kube_dbnd

        # PATCH manage watcher
        from multiprocessing.managers import SyncManager

        self._manager = SyncManager()
        self._manager.start(mgr_init)

        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()

        # will be used for low-level pod interactions
        self.failed_pods_to_ignore = []
        self.running_pods = {}
        self.pod_to_task = {}
        self.metrics_logger = KubernetesMetricsLogger()

    def _make_kube_watcher(self):
        # prevent storing the kubernetes resource version in the db, because the kubernetes db model
        # only stores a single value of the resource version, while we need to store a separate value
        # for every kubernetes executor (because even in a basic flow we can have two Kubernetes
        # executors running at once: the one that launched the driver and the one inside the driver).
        #
        # the resource version is the position inside the event stream of the kubernetes cluster and
        # is used by the watcher to poll Kubernetes for events. It's probably fine not to store this,
        # because by default Kubernetes will return "the events currently in cache"
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        return None

    def _make_kube_watcher_dbnd(self):
        watcher = DbndKubernetesJobWatcher(
            namespace=self.namespace,
            watcher_queue=self.watcher_queue,
            resource_version=self.current_resource_version,
            worker_uuid=self.worker_uuid,
            kube_config=self.kube_config,
            kube_dbnd=self.kube_dbnd,
        )
        watcher.start()
        return watcher

    @staticmethod
    def _create_pod_id(dag_id, task_id):
        task_run = try_get_databand_run().get_task_run(task_id)
        return task_run.job_id__dns1123

    def _health_check_kube_watcher(self):
        if self.kube_watcher.is_alive():
            pass
        else:
            self.log.error("Error while health checking kube watcher process. "
                           "Process died for unknown reasons")
            self.kube_watcher = self._make_kube_watcher_dbnd()

    def run_next(self, next_job):
        """
        The run_next command will check the task_queue for any un-run jobs.
        It will then create a unique job-id, launch that job in the cluster,
        and store relevant info in the current_jobs map so we can track the job's status
        """
        key, command, kube_executor_config = next_job
        dag_id, task_id, execution_date, try_number = key
        self.log.debug(
            "Kube POD to submit: image=%s with %s",
            self.kube_config.kube_image,
            str(next_job),
        )

        dr = try_get_databand_run()
        task_run = dr.get_task_run_by_af_id(task_id)
        pod_command = [str(c) for c in command]
        task_engine = task_run.task_engine  # type: KubernetesEngineConfig
        pod = task_engine.build_pod(
            task_run=task_run,
            cmds=pod_command,
            labels={
                "airflow-worker": self.worker_uuid,
                "dag_id": self._make_safe_label_value(dag_id),
                "task_id": self._make_safe_label_value(task_run.task_af_id),
                "execution_date": self._datetime_to_label_safe_datestring(execution_date),
                "try_number": str(try_number),
            },
            try_number=try_number,
            include_system_secrets=True,
        )

        pod_ctrl = self.kube_dbnd.get_pod_ctrl_for_pod(pod)
        self.running_pods[pod.name] = self.namespace
        self.pod_to_task[pod.name] = task_run.task

        pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
        self.metrics_logger.log_pod_started(task_run.task)

    def delete_pod(self, pod_id):
        if pod_id in self.failed_pods_to_ignore:
            logger.warning(
                "Received request to delete pod %s that is ignored! Ignoring...",
                pod_id)
            return
        try:
            found_pod = self.running_pods.pop(pod_id, None)
            if found_pod:
                result = self.kube_dbnd.delete_pod(pod_id, self.namespace)

                if pod_id in self.pod_to_task:
                    self.metrics_logger.log_pod_deleted(self.pod_to_task[pod_id])
                    self.pod_to_task.pop(pod_id)  # Keep the cache clean

                return result
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            logger.warning(
                "Exception raised when trying to delete pod %s! Adding to ignored list...",
                pod_id,
            )
            self.failed_pods_to_ignore.append(pod_id)

    def terminate(self):
        pods_to_delete = sorted(self.running_pods.keys())
        if pods_to_delete:
            logger.info("Deleting %d submitted pods: %s",
                        len(pods_to_delete), pods_to_delete)
            for pod_name in pods_to_delete:
                try:
                    self.delete_pod(pod_name)
                except Exception:
                    logger.exception("Failed to terminate pod %s", pod_name)
        super(DbndKubernetesScheduler, self).terminate()
def manager_queue(sync_manager: SyncManager) -> Generator["queue.Queue[Any]", None, None]:
    yield sync_manager.Queue()
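The `sync_manager` fixture used by the pytest snippets above is never defined in this collection. A plausible definition (an assumption) that starts one manager per test and shuts it down afterwards:

import pytest
from multiprocessing.managers import SyncManager
from typing import Generator

@pytest.fixture
def sync_manager() -> Generator[SyncManager, None, None]:
    # one manager process per test; queues/dicts created from it are shared proxies
    manager = SyncManager()
    manager.start()
    yield manager
    manager.shutdown()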
}
global_rb = manager.PrioritizedReplayBuffer(memory_size,
                                            env_dict=env_dict,
                                            alpha=PER_a,
                                            default_dtype=np.float16,
                                            check_for_update=True)

n_explorer = multiprocessing.cpu_count() - 1
epsilons = [
    pow(0.4, 1 + (i / (n_explorer - 1)) * 7) for i in range(n_explorer)
]  # Ape-X paper

n_queue = n_explorer
n_queue += 1  # for evaluation
# n_queue += 1  # for prefetch
queues = [manager.Queue() for _ in range(n_queue)]

# Event object to share training status. If the event is set, all explorers stop sampling transitions
is_training_done = Event()

transitions = Value('i', 0)

# Lock
lock = manager.Lock()

# Shared memory object to count the number of applied gradients
trained_steps = Value('i', 0)

tasks = []
local_buffer_size = 200  # the paper uses 100
episode_max_steps = step_limit
def execute_tools(config, path, progress=None):
    """
    Executes the suite of TidyPy tools upon the project and returns the
    issues that are found.

    :param config: the TidyPy configuration to use
    :type config: dict
    :param path: the path to the project to analyze
    :type path: str
    :param progress:
        the progress reporter object that will receive callbacks during the
        execution of the tool suite. If not specified, no progress
        notifications will occur.
    :type progress: tidypy.Progress
    :rtype: tidypy.Collector
    """

    progress = progress or QuietProgress()
    progress.on_start()

    manager = SyncManager()
    manager.start()

    num_tools = 0
    tools = manager.Queue()
    for name, cls in get_tools().items():
        if config[name]['use'] and cls.can_be_used():
            num_tools += 1
            tools.put({
                'name': name,
                'config': config[name],
            })

    collector = Collector(config)
    if not num_tools:
        progress.on_finish()
        return collector

    notifications = manager.Queue()
    environment = manager.dict({
        'finder': Finder(path, config),
    })

    workers = []
    for _ in range(config['workers']):
        worker = Worker(
            args=(
                tools,
                notifications,
                environment,
            ),
        )
        worker.start()
        workers.append(worker)

    while num_tools:
        try:
            notification = notifications.get(True, 0.25)
        except Empty:
            pass
        else:
            if notification['type'] == 'start':
                progress.on_tool_start(notification['tool'])
            elif notification['type'] == 'complete':
                collector.add_issues(notification['issues'])
                progress.on_tool_finish(notification['tool'])
                num_tools -= 1

    progress.on_finish()
    return collector
""" can NOT work """ from multiprocessing.managers import SyncManager manager = SyncManager(address=('localhost', 50000), authkey='abracadabra') manager.start() queue = manager.Queue() manager.register("get_queue", callable=lambda: queue) manager.join()
def run_server():
    from connectn.tournament import run_tournament_process
    from multiprocessing.managers import SyncManager

    cu.configure_logging(cu.SERVER_PROCESS_LOG)
    logger = logging.getLogger(__name__)
    cu.start_stunnel(True)

    manager = SyncManager()
    manager.start(_process_init)

    sq = mp.Queue()
    rq = manager.Queue()
    shutdown = manager.Event()
    rg = mp.Process(
        target=_process_init,
        args=(run_tournament_process, sq, rq, shutdown, cu.PLAY_ALL),
        name="RunGames",
    )
    rg.start()
    logger.info("Started run_games process")

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as ls:
        try:
            ls.settimeout(5.0)
            ls.bind(("localhost", cu.LISTEN_PORT))
            ls.listen(5)
            logger.info("Started server listening socket")
        except Exception:
            logger.exception("Failure when binding to the listening port.")
        else:
            updated_agent_archives = []
            running = True
            while running:
                try:
                    (cs, addr) = ls.accept()
                    logger.info("Accepted connection.")
                    handle_client(cs, updated_agent_archives)
                except socket.timeout:
                    if len(updated_agent_archives):
                        logger.info(
                            f"Server sending {len(updated_agent_archives)} new agents for game-play."
                        )
                        logger.info(f"{updated_agent_archives}")
                        sq.put(updated_agent_archives)
                        updated_agent_archives = []
                except cu.InactiveSocket:
                    logger.exception("Connection failed")
                except KeyboardInterrupt:
                    inp = input("Shutdown? y/[n] ").lower()
                    while inp not in ("", "y", "n"):
                        inp = input("Shutdown? y/[n] ").lower()
                    if inp == "y":
                        logger.info("KeyboardInterrupt: Shutting down")
                        running = False
                    else:
                        inp = input("Play all games? y/[n] ").lower()
                        while inp not in ("", "y", "n"):
                            inp = input("Play all games? y/[n] ").lower()
                        if inp == "y":
                            sq.put("PLAY_ALL")
                except Exception:
                    logger.exception("Unexpected error, will try to keep running.")
                store_results_local(rq)
        finally:
            # If the port is orphaned use: fuser -kn tcp <port>
            ls.shutdown(socket.SHUT_RDWR)
            logger.info("Closed server socket")
            logger.info("Telling run_games process to shut down.")
            shutdown.set()
            rg.join()
            logger.info("Finished server shutdown.")