Example 1
class Communicator:
    def __init__(self, is_ps, ps_host):
        # There might be other ways to get the IP address
        server_addr = socket.gethostbyname(ps_host.split(":")[0])
        server_port = int(ps_host.split(":")[1])

        self.__is_ps = is_ps
        self.manager = SyncManager(address=(server_addr, server_port), authkey=b'abc')
        self.__start()

        self.task = self.manager.Queue()
        self.result = self.manager.Queue()
        self.end_flag = self.manager.Queue()
        self.data_sync = self.manager.Queue()
        self.idle_gpuq = Manager().Queue()
        self.data_count = 0

        return

    def __start(self):
        if self.__is_ps:
            self.manager.start()
        else:
            while True:
                try:
                    self.manager.connect()
                    break
                except Exception:
                    print("waiting for connection ...")
        return
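
Example 1 splits the work between a parameter server that hosts the shared queues and workers that attach to it. Below is a minimal sketch of that start/connect split, assuming a hypothetical host and port; both sides must use the same address and authkey.

# Minimal sketch of the server/worker split used above (hypothetical host/port).
from multiprocessing.managers import SyncManager

def make_manager(is_server, host="127.0.0.1", port=50000):
    manager = SyncManager(address=(host, port), authkey=b"abc")
    if is_server:
        manager.start()    # spawn the manager server process
    else:
        manager.connect()  # attach to the already running server
    return manager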
Example 2
class DbndKubernetesExecutor(KubernetesExecutor):
    def __init__(self, kube_dbnd=None):
        # type: (DbndKubernetesExecutor, DbndKubernetesClient) -> None
        super(DbndKubernetesExecutor, self).__init__()

        from multiprocessing.managers import SyncManager

        self._manager = SyncManager()

        self.kube_dbnd = kube_dbnd
        _update_airflow_kube_config(airflow_kube_config=self.kube_config,
                                    engine_config=kube_dbnd.engine_config)

    def start(self):
        logger.info("Starting Kubernetes executor..")
        self._manager.start(mgr_init)

        dbnd_run = try_get_databand_run()
        if dbnd_run:
            self.worker_uuid = str(dbnd_run.run_uid)
        else:
            self.worker_uuid = (
                KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid())
        self.log.debug("Start with worker_uuid: %s", self.worker_uuid)

        # always need to reset the resource version since we don't know
        # when we last started; note the behavior described at
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        # KubeResourceVersion.reset_resource_version()
        self.task_queue = self._manager.Queue()
        self.result_queue = self._manager.Queue()

        self.kube_client = self.kube_dbnd.kube_client
        self.kube_scheduler = DbndKubernetesScheduler(
            self.kube_config,
            self.task_queue,
            self.result_queue,
            self.kube_client,
            self.worker_uuid,
            kube_dbnd=self.kube_dbnd,
        )

        if self.kube_dbnd.engine_config.debug:
            self.log.setLevel(logging.DEBUG)
            self.kube_scheduler.log.setLevel(logging.DEBUG)

        self._inject_secrets()
        self.clear_not_launched_queued_tasks()
        self._flush_result_queue()

    # override - by default the UpdateQuery does not work and fails with
    # sqlalchemy.exc.CompileError: Unconsumed column names: state
    # due to the model override
    # + we don't want to change task statuses - maybe they are managed by other executors
    @provide_session
    def clear_not_launched_queued_tasks(self, *args, **kwargs):
        # we don't clear kubernetes tasks from previous run
        pass
Example 3
def start_sync_manager():
    global gamemode_summaries
    global champion_summaries
    global requested_matches
    global request_queue
    global response_queue
    manager = SyncManager()
    manager.start()

    gamemode_summaries = manager.dict([])
    champion_summaries = manager.dict([])
    requested_matches = manager.list([])
    request_queue = manager.Queue()
    response_queue = manager.Queue()
Example 4
def prepare_experiment(env, args):
    # Manager to share PER between a learner and explorers
    SyncManager.register('PrioritizedReplayBuffer', PrioritizedReplayBuffer)
    manager = SyncManager()
    manager.start()

    kwargs = get_default_rb_dict(args.replay_buffer_size, env)
    kwargs["check_for_update"] = True
    global_rb = manager.PrioritizedReplayBuffer(**kwargs)

    # queues to share network parameters between a learner and explorers
    n_queue = 1 if args.n_env > 1 else args.n_explorer
    n_queue += 1  # for evaluation
    queues = [manager.Queue() for _ in range(n_queue)]

    # Event object to share training status. If the event is set, all explorers stop sampling transitions
    is_training_done = Event()

    # Lock
    lock = manager.Lock()

    # Shared memory objects to count number of samples and applied gradients
    trained_steps = Value('i', 0)

    return global_rb, queues, is_training_done, lock, trained_steps
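
Example 4 registers a custom class on SyncManager before starting it, so manager.PrioritizedReplayBuffer(...) returns a proxy whose methods run in the manager process. A minimal sketch of the same registration pattern, using a stand-in Counter class instead of the replay buffer:

# Minimal sketch: register a stand-in class the way PrioritizedReplayBuffer
# is registered above; calls on the returned proxy execute in the manager
# server process.
from multiprocessing.managers import SyncManager

class Counter:
    def __init__(self):
        self._n = 0

    def increment(self):
        self._n += 1

    def value(self):
        return self._n

SyncManager.register('Counter', Counter)

if __name__ == '__main__':
    manager = SyncManager()
    manager.start()
    counter = manager.Counter()  # proxy to a Counter living in the manager process
    counter.increment()
    print(counter.value())       # -> 1
    manager.shutdown()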
Example 5
    def __init__(self, conf_path='config/pn_conf.yaml'):
        self.conf_path = os.path.abspath(conf_path)
        self.conf = getConf(self.conf_path, root_key='itchat')

        self.thread_id = None

        self.gid = None  # record the UserName of our group
        if self.conf['use_custom_manager']:
            # create proxy manager
            mgr = SyncManager(
                (get_lan_ip(), self.conf['custom_manager_port']),
                self.conf['custom_manager_authkey'].encode('utf8'))
            #-#            sleep(0.5)  # wait for manager to start
            mgr.connect()
        else:
            mgr = multiprocessing.Manager()
        self.q_send = mgr.Queue()
        self.event_exit = mgr.Event()
        multiprocessing.current_process(
        ).authkey = self.conf['custom_manager_authkey'].encode(
            'utf8')  # https://bugs.python.org/issue7503
        self.proc_wx = multiprocessing.Process(target=self.run,
                                               args=(self.event_exit,
                                                     self.q_send))
        self.proc_wx.start()
Example 6
def function_handler(payload):
    job = SimpleNamespace(**payload)

    manager = SyncManager()
    manager.start()
    job_queue = manager.Queue()
    job_runners = []

    processes = min(job.worker_processes, len(job.call_ids))
    logger.info("Starting {} processes".format(processes))

    for runner_id in range(processes):
        p = mp.Process(target=process_runner, args=(runner_id, job_queue))
        job_runners.append(p)
        p.start()

    for call_id in job.call_ids:
        data_byte_range = job.data_byte_ranges.pop(0)
        logger.info('Going to execute job {}-{}'.format(job.job_key, call_id))
        job_queue.put((job, call_id, data_byte_range))

    for i in range(processes):
        job_queue.put(ShutdownSentinel())

    for runner in job_runners:
        runner.join()

    manager.shutdown()
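
Examples 6, 9 and 14 stop their worker processes by enqueueing one ShutdownSentinel per runner. A minimal self-contained sketch of that pattern follows; the ShutdownSentinel class itself is assumed by those examples, and any picklable marker object would do.

# Minimal sketch of the sentinel-based shutdown used in these examples.
import multiprocessing as mp
from multiprocessing.managers import SyncManager

class ShutdownSentinel:
    pass

def process_runner(job_queue):
    while True:
        item = job_queue.get()
        if isinstance(item, ShutdownSentinel):
            break
        print('processing', item)

if __name__ == '__main__':
    manager = SyncManager()
    manager.start()
    job_queue = manager.Queue()
    runners = [mp.Process(target=process_runner, args=(job_queue,)) for _ in range(2)]
    for p in runners:
        p.start()
    for item in range(5):
        job_queue.put(item)
    for _ in runners:                    # one sentinel per worker
        job_queue.put(ShutdownSentinel())
    for p in runners:
        p.join()
    manager.shutdown()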
Example 7
def preprocess_cycles(client: InfluxDBClient, executor: Executor, manager: SyncManager, dry_run=False):
    logger.info("Preprocessing charge cycles")
    queue = manager.Queue()
    series = client.list_series("samples")
    futures = []
    # TODO merge results of different detectors
    for attr, where, detector in [
        ('charger_acvoltage', 'charger_acvoltage>0 OR veh_speed > 0',
         ChargeCycleACVoltageDetection(time_epoch=client.time_epoch)),
        ('ischarging', 'ischarging>0 OR veh_speed > 0',
         ChargeCycleIsChargingDetection(time_epoch=client.time_epoch)),
        ('ac_hvpower', 'ac_hvpower>0 OR veh_speed > 0',
         ChargeCycleACHVPowerDetection(time_epoch=client.time_epoch)),
        ('hvbatt_soc', 'hvbatt_soc<200', ChargeCycleDerivDetection(time_epoch=client.time_epoch))
    ]:
        fields = ["time", "participant", "hvbatt_soc", "veh_speed"]
        if attr not in fields:
            fields.append(attr)
        futures += [executor.submit(preprocess_cycle,
                                    nr, client, queue, sname, join_selectors([sselector, where]),
                                    fields, detector, dry_run)
                    for nr, (sname, sselector) in enumerate(series)]

    logger.debug("Tasks started, waiting for results...")
    async_progress(futures, queue)
    data = [f.result() for f in futures]
    logger.debug("Tasks done")
    data.sort(key=lambda a: a[0:1])
    logger.info(__("Detected charge cycles:\n{}", tabulate(data, headers=["attr", "#", "cycles", "cycles_disc"])))
Example 8
 def __init__(self, conf_path='config/pn_conf.yaml'):
     # input param
     self.conf_path = conf_path
     self.conf = getConf(self.conf_path, root_key='audio')
     if self.conf['target'] == 'pi':
         self.t2s = Text2SpeechBaidu(self.conf_path)  # sync
     else:
         self.t2s = Text2SpeechXunFei(self.conf_path)  # sync
     self.executor_t2s = concurrent.futures.ProcessPoolExecutor(2)  # async
     if self.conf['use_custom_manager']:
         # create proxy manager
         mgr = SyncManager(
             (get_lan_ip(), self.conf['custom_manager_port']),
             self.conf['custom_manager_authkey'].encode('utf8'))
         sleep(0.5)  # wait for manager to start
         mgr.connect()
     else:
         mgr = multiprocessing.Manager()
     self.q_audio = mgr.Queue()
     #-#        debug('audio data queue created. %s', self.q_audio)
     self.event_exit = mgr.Event()
     multiprocessing.current_process(
     ).authkey = self.conf['custom_manager_authkey'].encode(
         'utf8')  # https://bugs.python.org/issue7503
     self.proc_play = multiprocessing.Process(target=self.playAudioFromQ,
                                              args=(self.q_audio,
                                                    self.event_exit))
     self.proc_play.start()
     #-#        debug('play background proc start. %s', self.proc_play)
     # trigger creation of the process-pool worker processes; creating them early seems to use less memory
     self.executor_t2s.map(noop_func, (None, None))
Example 9
def function_handler(payload):
    job = SimpleNamespace(**payload)
    setup_lithops_logger(job.log_level)

    processes = min(job.worker_processes, len(job.call_ids))
    logger.info('Tasks received: {} - Concurrent processes: {}'.format(
        len(job.call_ids), processes))

    env = job.extra_env
    env['LITHOPS_WORKER'] = 'True'
    env['PYTHONUNBUFFERED'] = 'True'
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)
    job.func = get_function_and_modules(job, internal_storage)
    job_data = get_function_data(job, internal_storage)

    if processes == 1:
        job_queue = queue.Queue()
        for call_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, call_id, data))
        job_queue.put(ShutdownSentinel())
        process_runner(job_queue)
    else:
        manager = SyncManager()
        manager.start()
        job_queue = manager.Queue()
        job_runners = []

        for call_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, call_id, data))

        for i in range(processes):
            job_queue.put(ShutdownSentinel())

        for runner_id in range(processes):
            p = mp.Process(target=process_runner, args=(job_queue, ))
            job_runners.append(p)
            p.start()
            logger.info('Worker process {} started'.format(runner_id))

        for runner in job_runners:
            runner.join()

        manager.shutdown()

    # Delete modules path from syspath
    module_path = os.path.join(MODULES_DIR, job.job_key)
    if module_path in sys.path:
        sys.path.remove(module_path)

    # Unset specific job env vars
    for key in job.extra_env:
        os.environ.pop(key, None)
    os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)
Example 10
def preprocess_trips(client: InfluxDBClient, executor: Executor, manager: SyncManager, dry_run=False):
    logger.info("Preprocessing trips")
    queue = manager.Queue()
    series = client.list_series("samples")
    futures = [executor.submit(preprocess_trip, nr, client, queue, sname, sselector, dry_run)
               for nr, (sname, sselector) in enumerate(series)]
    logger.debug("Tasks started, waiting for results...")
    async_progress(futures, queue)
    data = [f.result() for f in futures]
    logger.debug("Tasks done")
    data.sort(key=lambda a: a[0])
    logger.info(__("Detected trips:\n{}", tabulate(data, headers=["#", "cycles", "cycles_disc"])))
Example 11
 def test_run_keyboard_interrupt(sync_manager: SyncManager) -> None:
     """Function: run should stop by keyboard interupt."""
     queue_process_id = sync_manager.Queue()
     replier = Replier(sync_manager.dict(), queue_process_id)
     loop = asyncio.new_event_loop()
     with ProcessPoolExecutor() as executor:
         future = cast(
             "Future[Any]",
             loop.run_in_executor(executor, run, replier, None,
                                  keyboard_interrupt))
         queue_process_id.get()
     assert not future.get_loop().is_running()
     assert not future.done()
Example 12
 def __init__(self):
     mgr = SyncManager()
     mgr.start(signal.signal, (signal.SIGINT, signal.SIG_IGN))
     self.ns_default = mgr.Namespace()
     self.ns_default.error = None
     self.ns_stats = mgr.Namespace()
     self.input_queue = mgr.Queue(maxsize=100)
     self.error_occurred = mgr.Event()
     self.error_processed = mgr.Event()
     self.batch_done = mgr.Event()
     self.mgr = mgr
     self.stats_lock = mgr.Lock()
     self.main_lock = mgr.Lock()
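
Examples 12, 15 and 22 pass an initializer to start(). It runs inside the manager's server process before it begins serving, and is commonly used to ignore SIGINT there so that Ctrl-C in the parent does not take the shared state down with it. A minimal sketch:

# Minimal sketch: ignore SIGINT inside the manager server process.
import signal
from multiprocessing.managers import SyncManager

def ignore_sigint():
    signal.signal(signal.SIGINT, signal.SIG_IGN)

if __name__ == '__main__':
    mgr = SyncManager()
    mgr.start(ignore_sigint)
    q = mgr.Queue()
    # ... hand q to worker processes; Ctrl-C here leaves the manager alive ...
    mgr.shutdown()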
Example 13
def get_server_queue():
    #FIXME: some OSX users were getting "Can't assign requested address" errors
    # if we use socket.gethostname() for the address. Changing it to
    # 'localhost' seems to fix the issue, but I don't know why. We had to
    # use socket.gethostname() in order to get our benchmark tests to run
    # using qsub on a linux cluster, so with this 'fix', testflo benchmark tests
    # will likely not work on a cluster of OSX machines.
    if sys.platform == 'darwin':
        addr = 'localhost'
    else:
        addr = socket.gethostname()

    manager = SyncManager(address=(addr, 0), authkey=_testflo_authkey)
    manager.start()
    return manager, manager.Queue()
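
Example 13 binds the manager to port 0 so the operating system picks a free port; after start() the address actually chosen can be read back from manager.address. A minimal sketch with a hypothetical authkey:

# Minimal sketch: let the OS choose the port, then read it back.
from multiprocessing.managers import SyncManager

if __name__ == '__main__':
    manager = SyncManager(address=('localhost', 0), authkey=b'testkey')
    manager.start()
    host, port = manager.address  # the real port picked at bind time
    print('manager listening on %s:%d' % (host, port))
    manager.shutdown()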
Example 14
def function_handler(payload):
    job = SimpleNamespace(**payload)
    processes = min(job.worker_processes, len(job.call_ids))

    logger.info('Tasks received: {} - Concurrent workers: {}'.format(
        len(job.call_ids), processes))

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)
    job.func = get_function_and_modules(job, internal_storage)
    job_data = get_function_data(job, internal_storage)

    if processes == 1:
        job_queue = queue.Queue()
        for task_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, task_id, data))
        job_queue.put(ShutdownSentinel())
        process_runner(job_queue, internal_storage)
    else:
        manager = SyncManager()
        manager.start()
        job_queue = manager.Queue()
        job_runners = []

        for runner_id in range(processes):
            p = mp.Process(target=process_runner,
                           args=(job_queue, internal_storage))
            job_runners.append(p)
            p.start()
            logger.info('Worker process {} started'.format(runner_id))

        for call_id in job.call_ids:
            data = job_data.pop(0)
            job_queue.put((job, call_id, data))

        for i in range(processes):
            job_queue.put(ShutdownSentinel())

        for runner in job_runners:
            runner.join()

        manager.shutdown()

    # Delete modules path from syspath
    module_path = os.path.join(MODULES_DIR, job.job_key)
    if module_path in sys.path:
        sys.path.remove(module_path)
Example 15
class ClassWithManager(object):
    def __init__(self):
        self._manager = SyncManager()
        self._manager.start(mgr_init)
        self.task_queue = self._manager.Queue()

    def run_and_sleep(self, sleep_time):
        try:
            time.sleep(sleep_time)
        finally:
            self.end()

    def end(self):
        logging.error(
            "Executor shutting down, task_queue approximate size=%d",
            self.task_queue.qsize(),
        )
Example 16
    def run(self):
        logging.basicConfig(format=LOG_FORMAT,
                            filename=LOG_FILE,
                            filemode='w',
                            level=logging.INFO)

        SyncManager.register('SessionManager', SessionManager)
        SyncManager.register('DBManager', DBManager)
        manager = SyncManager()
        manager.start()

        db_manager = manager.DBManager()
        odometer_value = manager.Value(c_int, 0)
        vin = manager.Queue(1)
        session_manager = manager.SessionManager(odometer_value)

        can_manager = CanManager(session_manager, odometer_value, vin)
        nfc_manager = NFCManager(session_manager, db_manager)
        upload_manager = UploadManager(session_manager, db_manager)

        signal.pause()
Example 17
def run_generator_in_process(gen, *args, **kwargs):
    import multiprocessing
    from multiprocessing.managers import SyncManager
    manager = SyncManager()
    manager.start()
    q = manager.Queue(maxsize=1024)

    process = multiprocessing.Process(target=generate_to_queue,
                                      args=(gen, args, kwargs, q))
    process.start()
    while True:
        info, i = q.get()
        if info == StopIteration:
            break
        elif info == "exception":
            raise i
        else:
            assert info == "item"
            print("qsize", q.qsize())
            yield i
    manager.shutdown()
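
Example 17 relies on a generate_to_queue helper that is not shown. The sketch below is a hypothetical version that matches the ("item" / "exception" / StopIteration) protocol expected by the consumer loop above.

# Hypothetical sketch of the generate_to_queue helper assumed above.
def generate_to_queue(gen, args, kwargs, q):
    try:
        for item in gen(*args, **kwargs):
            q.put(("item", item))
    except Exception as exc:
        q.put(("exception", exc))  # re-raised in the parent process
    else:
        q.put((StopIteration, None))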
Example 18
class DataSender:
    def __init__(self):
        self.driveup = driveUp.DriveUp('client.json')
        self.running = True
        self._manager = SyncManager()

    def start(self):
        self.driveup.authenticate()
        self._manager.start(self._mgr_init)
        self._que = self._manager.Queue()
        self._process = Process(target=self.up, args=(self._que, ))
        self._process.start()

    def _mgr_init(self):
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        print("initialized manager")

    def up(self, que):
        def stop(val, val2):
            print "process SIGINT stopping"
            self.running = False

        signal.signal(signal.SIGINT, stop)
        print('datauploader started')
        while self.running or not que.empty():
            item = que.get(True)
            print("handling item={0}".format(item))
            self.driveup.upload(item)
            que.task_done()
            time.sleep(2)
        print("datauploader process terminating...")

    def send(self, data):
        self._que.put(data)

    def stop(self):
        print("shutting down sender")
        self.running = False
        self._que.join()
        self._process.terminate()
Example 19
def apply(func, args):
    import random
    from queue import Empty
    from multiprocessing import Process
    from multiprocessing.managers import SyncManager
    import tempfile
    import os
    if os.getcwd()[:4] != '/afs':
        try:
            m = SyncManager(address=tempfile.mktemp(prefix='dqu_subprocess-',
                                                    dir=os.getcwd()))
            m.start()
        except EOFError:
            m = SyncManager()
            m.start()
    else:
        m = SyncManager()
        m.start()
    q = m.Queue()
    p = Process(target=_local_apply_core, args=(func, args, q))
    p.start()
    p.join()
    print('Manager socket is', m.address)
    try:
        rv = q.get(False)
    except Empty:
        raise RuntimeError('daughter died while trying to execute %s%s' %
                           (func.__name__, args))
    if isinstance(rv, BaseException):
        if isinstance(rv, SystemExit):
            print('SystemExit raised by daughter; ignoring')
            return None
        else:
            raise rv
    m.shutdown()
    return rv
Example 20
def import_data():
    logger.info("Start log file import")

    parsers = [V1Parser, V2Parser, V3Parser]
    logger.info(
        __("Using parser version {version}", version=arguments["--version"]))
    csv_parser = parsers[int(arguments["--version"]) - 1]

    if arguments["FILE"] is not None:
        file_path = arguments["FILE"]
        if not os.path.isabs(file_path):
            file_path = os.path.join(os.getcwd(), file_path)

        directory_path = os.path.dirname(file_path)
        directory = Directory(os.path.basename(directory_path), directory_path)
        file = os.path.basename(file_path)

        logger.debug(
            __("directory: {dir}, file:{file}", dir=directory, file=file))

        _execute_import(csv_parser(), directory, file=file)
    else:

        directories = FileSystemAccess(logger).get_directories(
            config["webike.imei_regex"])
        manager = SyncManager()
        manager.start()
        queue = manager.Queue()
        with ProcessPoolExecutor(max_workers=14) as executor:
            futures = [
                executor.submit(_execute_import, csv_parser(), directory,
                                queue) for directory in directories
            ]

            async_progress(futures, queue, delay=10)
    logger.info("Import complete")
Example 21
def replier(sync_manager: SyncManager) -> Generator[Replier, None, None]:
    yield Replier(sync_manager.dict(), sync_manager.Queue())
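
Examples 11, 21 and 26 take a sync_manager argument that is presumably provided by a pytest fixture. A hypothetical sketch of such a fixture, starting one SyncManager per test and shutting it down afterwards:

# Hypothetical sketch of a sync_manager pytest fixture these tests could use.
import pytest
from multiprocessing.managers import SyncManager

@pytest.fixture
def sync_manager():
    manager = SyncManager()
    manager.start()
    yield manager
    manager.shutdown()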
Example 22
    def run(self, shuffle=False):
        """
        Runs all experiments. Blocks until all experiment are finished.
        """

        # Setup multiprocessing logging
        manager = SyncManager()
        manager.start(lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
        result_log_queue = manager.Queue()
        self.cancel_experiments = manager.Value('b', 0)
        self.interrupt_condition = manager.Condition()
        listener = multiprocessing.Process(target=result_log_listener,
                                           args=(result_log_queue, setup_result_logger, self.result_log_name,))
        listener.start()

        # Setup callback throttling
        result_lock = Lock()
        callback_lock = Lock()
        self.callback_disabled = False

        def call_callback(experiment_id=None, pause=0):
            with callback_lock:
                with result_lock:
                    if pause:
                        output_status(prefix='C')
                    self.last_callback = datetime.now()
                    self.save_results()
                    sys.stdout.write('Results saved. ')
                    sys.stdout.flush()
                    if not self.callback_disabled:
                        sys.stdout.write('Digesting results ... ')
                        sys.stdout.flush()

                        try:
                            self.update_callback(experiment_id)
                        except Exception as ex:  # pylint: disable=W
                            sys.stdout.write('errored with {}\n\n\n'.format(ex.__class__))
                            self.callback_disabled = True
                        else:
                            sys.stdout.write('done\n')
                    else:
                        sys.stdout.write('Digestion disabled, due to previous exception.\n')

        self.last_callback = datetime.now()

        # setup process pool
        self.jobs_total = len(self.experiments)
        start_time = datetime.now()
        print("Using up to %i CPUs %s" %
              (self.cpu_limit,
               'with numpy multi-threading disabled' if os.environ.get('OMP_NUM_THREADS', None) == '1' else ''))
        with multiprocessing.Pool(self.cpu_limit) as pool:

            # print status function
            def output_status(prefix='F'):
                progress = self.jobs_finished / self.jobs_total
                elapsed_time = datetime.now() - start_time
                errors = "" if self.jobs_errored == 0 else " %i ERRORED, " % self.jobs_errored
                sys.stdout.write(
                    ("%s %s: %i jobs, %i finished, %i queued," + errors + " %.0f%%, ~remaining: %s\n") %
                    (
                        prefix,
                        datetime.now().strftime('%c'),
                        self.jobs_total,
                        self.jobs_finished,
                        self.jobs_total - self.jobs_finished,
                        progress * 100,
                        timedelta(seconds=(elapsed_time * (1 - progress) / progress).total_seconds() // 15 * 15)
                        if progress > 0 else '???',
                    )
                )

            # define callbacks, they are run within the main process, but in separate threads
            def update_status(result=None):
                from pandas import DataFrame
                with result_lock:
                    self.jobs_finished += 1
                    output_status()
                    if not result:
                        return

                    row = {}
                    experiment = self.experiments[result.experiment_id]
                    row.update({
                        'experiment_id': result.experiment_id,
                        'experiment_hash': experiment.hash,
                        'experiment': experiment.__class__.__name__,
                    })
                    row.update(experiment.parameters._asdict())
                    row.update(result._asdict())
                    self.results = self.results.append(DataFrame([row]), sort=True)

                # If there is already a callback waiting, we will replace it and therefore cancel it
                if self.next_callback and self.next_callback.is_alive():
                    sleep(0)  # let other threads run first
                    if callback_lock.acquire(blocking=False):
                        self.next_callback.cancel()
                        callback_lock.release()
                    else:
                        # the callback is currently waiting for the result_lock
                        return

                # Schedule callback either immediately (0) or after the pause expired
                pause = max(0.0, self.update_callback_min_pause - (datetime.now() - self.last_callback).total_seconds())
                self.next_callback = Timer(
                    pause,
                    call_callback,
                    args=[result.experiment_id, pause],
                )
                self.next_callback.start()

            def update_status_error(exception):
                if isinstance(exception, ExperimentCanceledException):
                    return
                print('Experiment exception: ', exception, file=sys.stderr)
                traceback.print_exception(type(exception), exception, exception.__traceback__, file=sys.stderr)
                self.jobs_errored += 1
                self.exceptions.append(exception)
                update_status()

            # randomize order
            experiments = list(self.experiments.values())
            if shuffle:
                random.seed(0xdeadbeef)
                random.shuffle(experiments)

            # filter loaded experiments
            if not self.results.empty:
                known_hashes = [ex.hash for ex in experiments]
                len_before = len(experiments)
                loaded_experiment_hashes = self.results.loc[:, ['experiment_hash']].values[:, 0]
                experiments = [ex for ex in experiments if ex.hash not in loaded_experiment_hashes]
                if loaded_experiment_hashes.size:
                    print('Continuing from %s' % self.results_file)
                    self.jobs_finished = len_before - len(experiments)

                # check for experiments with results that we don't know
                unknown_experiments = self.results.loc[~self.results['experiment_hash'].isin(known_hashes)]
                if not unknown_experiments.empty:
                    print('@' * 80)
                    print('Results file %s contains %i results that are not in the study\'s' %
                          (self.results_file, len(unknown_experiments)))
                    print('experiment definition. Did you delete experiments from your study?')
                    print('@' * 80)

            # experiment execution
            for experiment in experiments:
                # Assign experiment to GPU (if used) : might be replaced by more sophisticated load balancer
                if self.gpu_limit > 0:
                    gpu_num = self.gpu_counter % self.gpu_limit
                    experiment.assign_to_gpu(gpu_num)
                    self.gpu_counter += 1
                # Add experiment to execution queue
                pool.apply_async(
                    experiment.execute,
                    (result_log_queue, self.result_log_name, self.cancel_experiments, self.interrupt_condition),
                    callback=update_status,
                    error_callback=update_status_error,
                )

            def signal_handler(_sig, _frame):
                self.num_int += 1
                if self.num_int > 1:
                    print("Killing all processes.")
                    sys.exit(1)
                print(
                    "\rPerforming graceful shutdown... (Press CTRL-C again to force. This might result in data loss.)")
                with self.interrupt_condition:
                    self.cancel_experiments.value = 1
                    self.interrupt_condition.notify_all()

            signal.signal(signal.SIGINT, signal_handler)

            # show status, then block until we're ready
            output_status()
            pool.close()

            pool.join()

            if self.next_callback and self.next_callback.is_alive():
                with callback_lock:
                    self.next_callback.cancel()

            call_callback()

            # quit logger
            result_log_queue.put(None)  # trigger listener to quit
            listener.join()

            # check if we got any exceptions as results
            if self.exceptions:
                raise FailedExperimentsException(self.exceptions)
Example 23
class DbndKubernetesScheduler(AirflowKubernetesScheduler):
    """
    Very serious override of AirflowKubernetesScheduler
    1. better visibility on errors, so we process failures with much more info
    2. tracking of all events around "airflow run" -> pod crashes, pod submission errors
        a. in case of a crash (OOM, evicted pod) -> error propagation to databand and retry
    """

    def __init__(
        self, kube_config, task_queue, result_queue, kube_client, worker_uuid, kube_dbnd
    ):
        super(DbndKubernetesScheduler, self).__init__(
            kube_config, task_queue, result_queue, kube_client, worker_uuid
        )
        self.kube_dbnd = kube_dbnd

        # PATCH watcher communication manager
        # we want to wait for stop, instead of exiting in place, so we can get all not-yet-received messages
        from multiprocessing.managers import SyncManager

        # TODO: why can't we use original SyncManager?
        # Scheduler <-> (via _manager) KubeWatcher
        # if _manager dies inplace, we will not get any "info" from KubeWatcher until shutdown
        self._manager = SyncManager()
        self._manager.start(mgr_init)

        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()

        # pod to airflow key (dag_id, task_id, execution_date)
        self.submitted_pods = {}  # type: Dict[str,SubmittedPodState]

        # sending data to databand tracker
        self.metrics_logger = KubernetesMetricsLogger()

        # disappeared pods mechanism
        self.last_disappeared_pods = {}
        self.current_iteration = 1
        # add `k8s-scheduler:` prefix to all log messages
        self._log = PrefixLoggerAdapter("k8s-scheduler", self.log)

    def _make_kube_watcher(self):
        # prevent storing the kubernetes resource version in the db, because the kubernetes db model only stores a single value
        # of the resource version, while we need to store a separate value for every kubernetes executor (because even in a basic flow
        # we can have two Kubernetes executors running at once, the one that launched the driver and the one inside the driver).
        #
        # the resource version is the position inside the event stream of the kubernetes cluster and is used by the watcher to poll
        # Kubernetes for events. It's probably fine not to store this because by default Kubernetes will return the events currently in cache:
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        return None

    def _make_kube_watcher_dbnd(self):
        watcher = DbndKubernetesJobWatcher(**get_job_watcher_kwargs(self))
        watcher.start()
        return watcher

    @staticmethod
    def _create_pod_id(dag_id, task_id):
        task_run = try_get_databand_run().get_task_run(task_id)
        return task_run.job_id__dns1123

    def _health_check_kube_watcher(self):
        if self.kube_watcher.is_alive():
            pass
        else:
            self.log.error(
                "Error while health checking kube watcher process. "
                "Process died for unknown reasons"
            )
            self.kube_watcher = self._make_kube_watcher_dbnd()

    def run_next(self, next_job):
        """

        The run_next command will check the task_queue for any un-run jobs.
        It will then create a unique job-id, launch that job in the cluster,
        and store relevant info in the current_jobs map so we can track the job's
        status
        """
        key, command, kube_executor_config = next_job
        dag_id, task_id, execution_date, try_number = key
        self.log.debug(
            "Kube POD to submit: image=%s with %s",
            self.kube_config.kube_image,
            str(next_job),
        )

        dr = try_get_databand_run()
        task_run = dr.get_task_run_by_af_id(task_id)
        pod_command = [str(c) for c in command]
        task_engine = task_run.task_engine  # type: KubernetesEngineConfig
        pod = task_engine.build_pod(
            task_run=task_run,
            cmds=pod_command,
            labels={
                "airflow-worker": self.worker_uuid,
                "dag_id": make_safe_label_value(dag_id),
                "task_id": make_safe_label_value(task_run.task_af_id),
                "execution_date": self._datetime_to_label_safe_datestring(
                    execution_date
                ),
                "try_number": str(try_number),
            },
            try_number=try_number,
            include_system_secrets=True,
        )

        pod_ctrl = self.kube_dbnd.get_pod_ctrl_for_pod(pod)
        self.submitted_pods[pod.name] = SubmittedPodState(
            pod_name=pod.name,
            task_run=task_run,
            scheduler_key=key,
            submitted_at=utcnow(),
        )

        pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
        self.metrics_logger.log_pod_submitted(task_run.task, pod_name=pod.name)

    # in airflow>1.10.10 delete_pod method takes additional "namespace" arg
    # we do not use it in our overridden method but still we need to adjust
    # method signature to avoid errors when we run code on airflow>1.10.10.
    def delete_pod(self, pod_id, *args):
        # we are going to delete pod only once.
        # the moment it's removed from submitted_pods, we will not handle its events, nor delete it
        submitted_pod = self.submitted_pods.pop(pod_id, None)
        if not submitted_pod:
            return

        try:
            self.metrics_logger.log_pod_finished(submitted_pod.task_run.task)
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            self.log.exception(
                "%s failed to save pod finish info: pod_name=%s.!",
                submitted_pod.task_run,
                pod_id,
            )

        try:
            result = self.kube_dbnd.delete_pod(pod_id, self.namespace)
            return result
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            self.log.exception(
                "%s: Exception raised when trying to delete pod: pod_name=%s.",
                submitted_pod.task_run,
                pod_id,
            )

    def terminate(self):
        # we kill watcher and communication channel first

        # prevent the watcher bug of getting stuck on termination during event processing
        try:
            self.kube_watcher.safe_terminate()
            super(DbndKubernetesScheduler, self).terminate()
        finally:
            self._terminate_all_running_pods()

    def _terminate_all_running_pods(self):
        """
        Clean up of all running pods on terminate:
        """
        # now we need to clean after the run
        pods_to_delete = sorted(list(self.submitted_pods.values()))
        if not pods_to_delete:
            return

        self.log.info(
            "Terminating run, deleting all %d submitted pods that are still running/not finalized",
            len(pods_to_delete),
        )
        for submitted_pod in pods_to_delete:
            try:
                self.delete_pod(submitted_pod.pod_name)
            except Exception:
                self.log.exception("Failed to terminate pod %s", submitted_pod.pod_name)

        # Wait for pods to be deleted and execute their own state management
        self.log.info(
            "Setting all running/not finalized pods to cancelled in 10 seconds..."
        )
        time.sleep(10)
        try:
            for submitted_pod in pods_to_delete:
                task_run = submitted_pod.task_run
                if task_run.task_run_state in TaskRunState.final_states():
                    self.log.info(
                        "%s with pod %s was %s, skipping",
                        task_run,
                        submitted_pod.pod_name,
                        task_run.task_run_state,
                    )
                    continue
                task_run.set_task_run_state(TaskRunState.CANCELLED)
        except Exception:
            self.log.exception("Could not set pods to cancelled!")

    def process_watcher_task(self, task):
        """Process the task event sent by watcher."""
        pod_id, state, labels, resource_version = task
        pod_name = pod_id
        self.log.debug(
            "Attempting to process pod; pod_name: %s; state: %s; labels: %s",
            pod_id,
            state,
            labels,
        )

        submitted_pod = self.submitted_pods.get(pod_name)
        if submitted_pod is None:
            # this is a deleted pod - on delete the watcher will send an event
            # 1. delete by scheduler - we skip here
            # 2. external delete -> we continue to process the event
            return

        # DBND-AIRFLOW we have it precached, we don't need to go to DB
        # key = self._labels_to_key(labels=labels)
        # if not key:
        #     self.log.info(
        #         "Can't find a key for event from %s - %s from labels %s, skipping",
        #         pod_name,
        #         state,
        #         labels,
        #     )
        #     return

        self.log.debug(
            "Attempting to process pod; pod_name: %s; state: %s; labels: %s",
            pod_id,
            state,
            labels,
        )

        # we are not looking for key
        task_run = submitted_pod.task_run
        key = submitted_pod.scheduler_key
        if submitted_pod.processed:
            # we already processed this kind of event - the pod already has a failed status in this process
            self.log.info(
                "%s: skipping pod '%s' event - already processed",
                state,
                pod_name,
            )
            return

        if state == State.RUNNING:
            # we should get here only once -> when pod starts to run

            self._process_pod_running(submitted_pod)
            # we will not send event to executor (otherwise it will delete the running pod)
            return

        try:
            if state is None:
                # simple case, the pod succeeded - it will be processed by the airflow main scheduler (Job).
                # The task can have failed or passed; Airflow exits with 0 even if the task failed the regular way.
                self._process_pod_success(submitted_pod)
                self.result_queue.put((key, state, pod_name, resource_version))
            elif state == State.FAILED:
                # Pod crash, it was deleted, killed, evicted.. we need to give it extra treatment
                self._process_pod_failed(submitted_pod)
                self.result_queue.put((key, state, pod_id, resource_version))
            else:
                self.log.debug("finishing job %s - %s (%s)", key, state, pod_id)
                self.result_queue.put((key, state, pod_id, resource_version))
        finally:
            submitted_pod.processed = True

    def _process_pod_running(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.node_name:
            self.log.info(
                "%s: Zombie bug: Seeing pod event again. "
                "Probably something happening with pod and it's node: %s",
                submitted_pod.task_run,
                submitted_pod.pod_name,
            )
            return

        pod_data = self.get_pod_status(pod_name)
        if not pod_data or not pod_data.spec.node_name:
            self.log.error("%s: Failed to find pod data for %s", pod_name)
            node_name = "failed_to_find"
        else:
            node_name = pod_data.spec.node_name
            self.metrics_logger.log_pod_running(task_run.task, node_name=node_name)

        submitted_pod.node_name = node_name
        task_run.set_task_run_state(TaskRunState.RUNNING, track=False)

    def _process_pod_success(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.processed:
            self.log.info(
                "%s Skipping pod 'success' event from %s: already processed", pod_name
            )
            return
        ti = get_airflow_task_instance(task_run=task_run)

        # we print success message to the screen
        # we will not send it to databand tracking store

        if ti.state == State.SUCCESS:
            dbnd_state = TaskRunState.SUCCESS
        elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
            dbnd_state = TaskRunState.UP_FOR_RETRY
        elif ti.state in {State.FAILED, State.SHUTDOWN}:
            dbnd_state = TaskRunState.FAILED
        else:
            # we got a corruption here:
            error_msg = (
                "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job."
                % (pod_name, ti.state)
            )
            error_help = "Please check pod logs/eviction retry"
            task_run_error = TaskRunError.build_from_message(
                task_run, error_msg, help_msg=error_help
            )
            self._handle_crashed_task_instance(
                failure_reason=PodFailureReason.err_pod_evicted,
                task_run_error=task_run_error,
                task_run=task_run,
            )
            return

        task_run.set_task_run_state(dbnd_state, track=False)
        self.log.info(
            "%s has been completed at pod '%s' with state %s try_number=%s!"
            % (task_run, pod_name, ti.state, ti._try_number)
        )

    def _process_pod_failed(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        task_id = task_run.task_af_id
        ti_state = get_airflow_task_instance_state(task_run=task_run)

        self.log.info(
            "%s: pod %s has crashed, airflow state: %s", task_run, pod_name, ti_state
        )

        pod_data = self.get_pod_status(pod_name)
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(pod_name, self.namespace)

        pod_logs = []
        if pod_data:
            pod_status_log = _get_status_log_safe(pod_data)
            pod_phase = pod_data.status.phase
            if pod_phase != "Pending":
                pod_logs = pod_ctrl.get_pod_logs()
        else:
            pod_status_log = "POD NOT FOUND"

        error_msg = "Pod %s at %s has failed (task state=%s)!" % (
            pod_name,
            self.namespace,
            ti_state,
        )
        failure_reason, failure_message = self._find_pod_failure_reason(
            task_run=task_run, pod_data=pod_data, pod_name=pod_name
        )
        if failure_reason:
            error_msg += "Found reason for failure: %s - %s." % (
                failure_reason,
                failure_message,
            )
        error_help_msg = "Please see full pod log for more details."
        if pod_logs:
            error_help_msg += "\nPod logs:\n%s\n" % "\n".join(
                ["out: %s" % l for l in pod_logs[-20:]]
            )

        from dbnd._core.task_run.task_run_error import TaskRunError

        task_run_error = TaskRunError.build_from_message(
            task_run=task_run, msg=error_msg, help_msg=error_help_msg,
        )

        if ti_state == State.FAILED:
            # The pod has failed, however, Airflow managed to update the state;
            # that means all code (including dbnd) was executed.
            # Let's just note the error, so we can show it in the summary;
            # we will not send it to the databand tracking store.
            task_run.set_task_run_state(
                TaskRunState.FAILED, track=False, error=task_run_error
            )
            self.log.info(
                "%s",
                task_run.task.ctrl.banner(
                    "Task %s(%s) - pod %s has failed, airlfow state=Failed!"
                    % (task_run.task.task_name, task_id, pod_name),
                    color="red",
                    task_run=task_run,
                ),
            )
            return True
        # we got State.FAILED from the watcher, but the airflow task instance in the DB is in a different state;
        # that means the task has failed in the middle
        # (all kinds of errors and exit codes)
        task_run_log = error_msg
        task_run_log += pod_status_log
        if pod_logs:
            # let's upload its logs - we don't know what happened
            task_run_log += "\nPod logs:\n\n%s\n\n" % "\n".join(pod_logs)
        task_run.tracker.save_task_run_log(task_run_log)

        self._handle_crashed_task_instance(
            task_run=task_run,
            task_run_error=task_run_error,
            failure_reason=failure_reason,
        )

    def _find_pod_failure_reason(
        self, task_run, pod_name, pod_data,
    ):
        if not pod_data:
            return (
                PodFailureReason.err_pod_deleted,
                "Pod %s probably has been deleted (can not be found)" % pod_name,
            )

        pod_phase = pod_data.status.phase
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(name=pod_name)

        if pod_phase == "Pending":
            self.log.info(
                "Got pod %s at Pending state which is failing: looking for the reason..",
                pod_name,
            )
            try:
                pod_ctrl.check_deploy_errors(pod_data)
            except KubernetesImageNotFoundError as ex:
                return PodFailureReason.err_image_pull, str(ex)
            except Exception as ex:
                pass
            return None, None

        if pod_data.metadata.deletion_timestamp:
            return (
                PodFailureReason.err_pod_deleted,
                "Pod %s has been deleted at %s"
                % (pod_name, pod_data.metadata.deletion_timestamp),
            )

        pod_exit_code = _try_get_pod_exit_code(pod_data)
        if pod_exit_code:
            self.log.info("Found pod exit code %d for pod %s", pod_exit_code, pod_name)
            pod_exit_code = str(pod_exit_code)
            return pod_exit_code, "Pod exit code %s" % pod_exit_code
        return None, None

    @provide_session
    def _handle_crashed_task_instance(
        self, task_run, task_run_error, failure_reason, session=None
    ):

        task_instance = get_airflow_task_instance(task_run, session=session)
        task_instance.task = task_run.task.ctrl.airflow_op

        retry_config = self.kube_dbnd.engine_config.pod_retry_config
        retry_count = retry_config.get_retry_count(failure_reason)
        if retry_count is not None:
            # update retry for the latest values (we don't have
            task_run.task.task_retries = retry_count
            task_instance.task.retries = retry_count
            task_instance.max_tries = retry_count

        self.log.info(
            "Retry %s  task: max_retries=%s, task.retries=%s, current:%s state:%s",
            task_run,
            task_instance.max_tries,
            task_instance.task.retries,
            task_instance._try_number,
            task_instance.state,
        )
        # retry condition: self.task.retries and self.try_number <= self.max_tries
        increase_try_number = False

        if task_instance.state == State.QUEUED:
            # Special case - no airflow code has been run in the pod at all.
            # Usually the try number is increased the moment the state moves to Running,
            # and while in the Running state it keeps the same value.
            # We must increment the try number ourselves.
            task_instance._try_number += 1
            session.merge(task_instance)
            session.commit()

        task_instance.handle_failure(str(task_run_error.exception), session=session)

        if task_instance.state == State.UP_FOR_RETRY:
            task_run.set_task_run_state(
                TaskRunState.UP_FOR_RETRY, track=True, error=task_run_error
            )
        else:
            task_run.set_task_run_state(
                TaskRunState.FAILED, track=True, error=task_run_error
            )

    def get_pod_status(self, pod_name):
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(name=pod_name)
        return pod_ctrl.get_pod_status_v1()
Example 24
FooManager.register('bar', bar)

if __name__ == '__main__':
    
    
    mgr = FooManager()
    mgr.start()
    
    res = mgr.bar()
    print(res._getvalue())
    print(str(res)[0])
    print(type(res))
    
    sys.exit(0)
    
    qm = SyncManager()
    qm.start()
    
    mgr = FooManager()
    mgr.start()
    results = []
    for _i in range(1):
        q = qm.Queue()
        res = mgr.foo(3, q)
        print(res)
        print(type(res))
        results.append(q.get())
        print('=' * 80)
        sleep(3)
    sleep(5)
    print('Done.')
Example 25
class DbndKubernetesScheduler(AirflowKubernetesScheduler):
    def __init__(self, kube_config, task_queue, result_queue, kube_client,
                 worker_uuid, kube_dbnd):
        super(DbndKubernetesScheduler,
              self).__init__(kube_config, task_queue, result_queue,
                             kube_client, worker_uuid)
        self.kube_dbnd = kube_dbnd

        # PATCH manage watcher
        from multiprocessing.managers import SyncManager

        self._manager = SyncManager()
        self._manager.start(mgr_init)

        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()
        # will be used to low level pod interactions
        self.failed_pods_to_ignore = []
        self.running_pods = {}
        self.pod_to_task = {}
        self.metrics_logger = KubernetesMetricsLogger()

    def _make_kube_watcher(self):
        # prevent storing the kubernetes resource version in the db, because the kubernetes db model only stores a single value
        # of the resource version, while we need to store a separate value for every kubernetes executor (because even in a basic flow
        # we can have two Kubernetes executors running at once, the one that launched the driver and the one inside the driver).
        #
        # the resource version is the position inside the event stream of the kubernetes cluster and is used by the watcher to poll
        # Kubernetes for events. It's probably fine not to store this because by default Kubernetes will return the events currently in cache:
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        return None

    def _make_kube_watcher_dbnd(self):
        watcher = DbndKubernetesJobWatcher(
            namespace=self.namespace,
            watcher_queue=self.watcher_queue,
            resource_version=self.current_resource_version,
            worker_uuid=self.worker_uuid,
            kube_config=self.kube_config,
            kube_dbnd=self.kube_dbnd,
        )
        watcher.start()
        return watcher

    @staticmethod
    def _create_pod_id(dag_id, task_id):
        task_run = try_get_databand_run().get_task_run(task_id)
        return task_run.job_id__dns1123

    def _health_check_kube_watcher(self):
        if self.kube_watcher.is_alive():
            pass
        else:
            self.log.error("Error while health checking kube watcher process. "
                           "Process died for unknown reasons")
            self.kube_watcher = self._make_kube_watcher_dbnd()

    def run_next(self, next_job):
        """

        The run_next command will check the task_queue for any un-run jobs.
        It will then create a unique job-id, launch that job in the cluster,
        and store relevant info in the current_jobs map so we can track the job's
        status
        """
        key, command, kube_executor_config = next_job
        dag_id, task_id, execution_date, try_number = key
        self.log.debug(
            "Kube POD to submit: image=%s with %s",
            self.kube_config.kube_image,
            str(next_job),
        )

        dr = try_get_databand_run()
        task_run = dr.get_task_run_by_af_id(task_id)
        pod_command = [str(c) for c in command]
        task_engine = task_run.task_engine  # type: KubernetesEngineConfig
        pod = task_engine.build_pod(
            task_run=task_run,
            cmds=pod_command,
            labels={
                "airflow-worker":
                self.worker_uuid,
                "dag_id":
                self._make_safe_label_value(dag_id),
                "task_id":
                self._make_safe_label_value(task_run.task_af_id),
                "execution_date":
                self._datetime_to_label_safe_datestring(execution_date),
                "try_number":
                str(try_number),
            },
            try_number=try_number,
            include_system_secrets=True,
        )

        pod_ctrl = self.kube_dbnd.get_pod_ctrl_for_pod(pod)
        self.running_pods[pod.name] = self.namespace
        self.pod_to_task[pod.name] = task_run.task

        pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
        self.metrics_logger.log_pod_started(task_run.task)

    def delete_pod(self, pod_id):
        if pod_id in self.failed_pods_to_ignore:
            logger.warning(
                "Received request to delete pod %s that is ignored! Ignoring...",
                pod_id)
            return
        try:
            found_pod = self.running_pods.pop(pod_id, None)
            if found_pod:
                result = self.kube_dbnd.delete_pod(pod_id, self.namespace)

                if pod_id in self.pod_to_task:
                    self.metrics_logger.log_pod_deleted(
                        self.pod_to_task[pod_id])
                    self.pod_to_task.pop(pod_id)  # Keep the cache clean

                return result
        except Exception as e:
            # Catch all exceptions to prevent any delete loops, best effort
            logger.warning(
                "Exception raised when trying to delete pod %s! Adding to ignored list...",
                pod_id,
            )
            self.failed_pods_to_ignore.append(pod_id)

    def terminate(self):
        pods_to_delete = sorted(self.running_pods.keys())
        if pods_to_delete:
            logger.info("Deleting %d submitted pods: %s", len(pods_to_delete),
                        pods_to_delete)
            for pod_name in pods_to_delete:
                try:
                    self.delete_pod(pod_name)
                except Exception:
                    logger.exception("Failed to terminate pod %s", pod_name)
        super(DbndKubernetesScheduler, self).terminate()
Example 26
def manager_queue(sync_manager: SyncManager) -> Generator["queue.Queue[Any]", None, None]:
    yield sync_manager.Queue()
Example 27
    }
    global_rb = manager.PrioritizedReplayBuffer(memory_size,
                                                env_dict=env_dict,
                                                alpha=PER_a,
                                                default_dtype=np.float16,
                                                check_for_update=True)

    n_explorer = multiprocessing.cpu_count() - 1
    epsilons = [
        pow(0.4, 1 + (i / (n_explorer - 1)) * 7) for i in range(n_explorer)
    ]  # apex paper

    n_queue = n_explorer
    n_queue += 1  # for evaluation
    # n_queue += 1  # for prefetch
    queues = [manager.Queue() for _ in range(n_queue)]

    # Event object to share training status. If the event is set, all explorers stop sampling transitions
    is_training_done = Event()

    transitions = Value('i', 0)

    # Lock
    lock = manager.Lock()

    # Shared memory objects to count number of samples and applied gradients
    trained_steps = Value('i', 0)

    tasks = []
    local_buffer_size = 200  # the paper uses 100
    episode_max_steps = step_limit
Example 28
def execute_tools(config, path, progress=None):
    """
    Executes the suite of TidyPy tools upon the project and returns the
    issues that are found.

    :param config: the TidyPy configuration to use
    :type config: dict
    :param path: the path to the project to analyze
    :type path: str
    :param progress:
        the progress reporter object that will receive callbacks during the
        execution of the tool suite. If not specified, no progress
        notifications will occur.
    :type progress: tidypy.Progress
    :rtype: tidypy.Collector
    """

    progress = progress or QuietProgress()
    progress.on_start()

    manager = SyncManager()
    manager.start()

    num_tools = 0
    tools = manager.Queue()
    for name, cls in get_tools().items():
        if config[name]['use'] and cls.can_be_used():
            num_tools += 1
            tools.put({
                'name': name,
                'config': config[name],
            })

    collector = Collector(config)
    if not num_tools:
        progress.on_finish()
        return collector

    notifications = manager.Queue()
    environment = manager.dict({
        'finder': Finder(path, config),
    })

    workers = []
    for _ in range(config['workers']):
        worker = Worker(
            args=(
                tools,
                notifications,
                environment,
            ),
        )
        worker.start()
        workers.append(worker)

    while num_tools:
        try:
            notification = notifications.get(True, 0.25)
        except Empty:
            pass
        else:
            if notification['type'] == 'start':
                progress.on_tool_start(notification['tool'])
            elif notification['type'] == 'complete':
                collector.add_issues(notification['issues'])
                progress.on_tool_finish(notification['tool'])
                num_tools -= 1

    progress.on_finish()

    return collector
Example 29
"""
can NOT work
"""

from multiprocessing.managers import SyncManager

manager = SyncManager(address=('localhost', 50000), authkey='abracadabra')
manager.start()
queue = manager.Queue()
manager.register("get_queue", callable=lambda: queue)
manager.join()
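
Example 29 is labelled as not working: among other things, it registers get_queue only after the manager instance has already been started, so the server never exposes it, and it then joins a manager that was started with start() rather than served. A minimal working counterpart under the standard pattern (register the typeid on a manager class first, then serve):

# Minimal working counterpart: register before starting, then serve.
import queue
from multiprocessing.managers import SyncManager

shared_queue = queue.Queue()

class QueueManager(SyncManager):
    pass

QueueManager.register('get_queue', callable=lambda: shared_queue)

if __name__ == '__main__':
    manager = QueueManager(address=('localhost', 50000), authkey=b'abracadabra')
    server = manager.get_server()
    # clients construct the same QueueManager, call connect(), then get_queue()
    server.serve_forever()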

Example 30
def run_server():
    from connectn.tournament import run_tournament_process
    from multiprocessing.managers import SyncManager

    cu.configure_logging(cu.SERVER_PROCESS_LOG)
    logger = logging.getLogger(__name__)

    cu.start_stunnel(True)

    manager = SyncManager()
    manager.start(_process_init)
    sq = mp.Queue()
    rq = manager.Queue()
    shutdown = manager.Event()
    rg = mp.Process(
        target=_process_init,
        args=(run_tournament_process, sq, rq, shutdown, cu.PLAY_ALL),
        name="RunGames",
    )
    rg.start()

    logger.info("Started run_games process")
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as ls:
        try:
            ls.settimeout(5.0)
            ls.bind(("localhost", cu.LISTEN_PORT))
            ls.listen(5)
            logger.info("Started server listening socket")
        except Exception:
            logger.exception("Failure when binding to the listening port.")
        else:
            updated_agent_archives = []
            running = True
            while running:
                try:
                    (cs, addr) = ls.accept()
                    logger.info("Accepted connection.")
                    handle_client(cs, updated_agent_archives)
                except socket.timeout:
                    if len(updated_agent_archives):
                        logger.info(
                            f"Server sending {len(updated_agent_archives)} new agents for game-play."
                        )
                        logger.info(f"{updated_agent_archives}")
                        sq.put(updated_agent_archives)
                        updated_agent_archives = []
                except cu.InactiveSocket:
                    logger.exception("Connection failed")
                except KeyboardInterrupt:
                    inp = input("Shutdown? y/[n] ").lower()
                    while inp not in ("", "y", "n"):
                        inp = input("Shutdown? y/[n] ").lower()
                    if inp == "y":
                        logger.info("KeyboardInterrupt: Shutting down")
                        running = False
                    else:
                        inp = input("Play all games? y/[n] ").lower()
                        while inp not in ("", "y", "n"):
                            inp = input("Play all games? y/[n] ").lower()
                        if inp == "y":
                            sq.put("PLAY_ALL")
                except Exception:
                    logger.exception("Unexpected error, will try to keep running.")

                store_results_local(rq)
        finally:
            """
            If the port is orphaned use:
            fuser -kn tcp <port>
            """
            ls.shutdown(socket.SHUT_RDWR)
            logger.info("Closed server socket")

    logger.info("Telling run_games process to shutdown.")
    shutdown.set()
    rg.join()
    logger.info("Finished server shutdown.")