Beispiel #1
0
def main():
    """Entry point of the runner script.

    The run module named by ``args.run`` is looked up first as a full module
    path and then inside ``sample_factory.runner.runs``. Its ``RUN_DESCRIPTION``
    receives the experiment suffix and is passed to the backend selected by
    ``args.runner``.

    Note: a runner name other than 'processes', 'slurm' or 'ngc' launches
    nothing and the function still reports success.

    :return: ExperimentStatus.FAILURE if the run module cannot be imported,
        ExperimentStatus.SUCCESS otherwise
    """
    args = parse_args()

    # candidate #1: the name exactly as given; candidate #2: a module of the built-in runs package
    candidate_modules = (f'{args.run}', f'sample_factory.runner.runs.{args.run}')

    run_module = None
    for module_name in candidate_modules:
        try:
            run_module = importlib.import_module(module_name)
        except ImportError:
            continue
        break

    if run_module is None:
        log.error('Could not import the run module')
        return ExperimentStatus.FAILURE

    description = run_module.RUN_DESCRIPTION
    description.experiment_suffix = args.experiment_suffix

    # each backend is imported only when it is actually selected
    runner_name = args.runner
    if runner_name == 'processes':
        from sample_factory.runner.run_processes import run as launch
    elif runner_name == 'slurm':
        from sample_factory.runner.run_slurm import run_slurm as launch
    elif runner_name == 'ngc':
        from sample_factory.runner.run_ngc import run_ngc as launch
    else:
        launch = None

    if launch is not None:
        launch(description, args)

    return ExperimentStatus.SUCCESS
Beispiel #2
0
def register_additional_doom_env(doom_spec):
    """Append ``doom_spec`` to ``DOOM_ENVS`` unless its name is already taken.

    A RuntimeError from ``doom_env_by_name`` is treated as "no env with this
    name yet". When the lookup succeeds, an error is logged and the spec is
    not registered.
    """
    try:
        existing_spec = doom_env_by_name(doom_spec.name)
    except RuntimeError:
        # lookup failed, so the name is still free
        pass
    else:
        log.error('Doom env spec %s already exists', existing_spec.name)
        return

    log.debug('Registering Doom environment %s...', doom_spec.name)
    DOOM_ENVS.append(doom_spec)
Beispiel #3
0
 def wrapper(*args, **kwargs):
     """Call ``func`` up to ``num_attempts`` times, retrying on ``exception_class``.

     Between failed attempts the error is logged and the wrapper sleeps for
     ``sleep_time``. The exception raised by the final attempt is re-raised.
     """
     final_attempt = num_attempts - 1
     for attempt in range(num_attempts):
         try:
             return func(*args, **kwargs)
         except exception_class as exc:
             if attempt == final_attempt:
                 # out of retries: let the caller see the original exception
                 raise

             log.error('Failed with error %r, trying again', exc)
             sleep(sleep_time)
Beispiel #4
0
    def _write_dict_summaries(dictionary, writer, name, env_steps):
        """Write every leaf of a (possibly nested) dictionary as a scalar summary.

        Leaves are visited via ``iterate_recursively``. Booleans are written as
        0/1, numbers as-is, and tuples/lists as one scalar per element with the
        element index appended to the tag. Any other type is reported through
        the error log and skipped.
        """
        for _, key, value in iterate_recursively(dictionary):
            # booleans are reported as integers
            scalar = int(value) if isinstance(value, bool) else value
            tag = f'zz_pbt/{name}_{key}'

            if isinstance(scalar, (tuple, list)):
                for idx, item in enumerate(scalar):
                    writer.add_scalar(f'{tag}_{idx}', item, env_steps)
            elif isinstance(scalar, (int, float)):
                writer.add_scalar(tag, scalar, env_steps)
            else:
                log.error('Unsupported type in pbt summaries %r', type(scalar))
Beispiel #5
0
    def _set_env_attr(self, env, player_id, attr_chain, value):
        """Allows us to set an arbitrary attribute of the environment, e.g. attr_chain can be unwrapped.foo.bar

        :param env: object against which the attribute chain is resolved
        :param player_id: must be equal to ``self.player_id`` (checked with an assert)
        :param attr_chain: dot-separated attribute path; all but the last element are
            resolved with ``getattr``, the last one is assigned
        :param value: value to assign

        If an intermediate attribute does not exist, the error is logged and nothing is set.
        """
        assert player_id == self.player_id

        # str.split always yields at least one element, so attr_to_set is always bound
        *parent_attrs, attr_to_set = attr_chain.split('.')

        target = env
        try:
            for attr_name in parent_attrs:
                target = getattr(target, attr_name)
        except AttributeError:
            log.error('Env does not have an attribute %s', attr_chain)
            # stop here: falling through would assign the value on a partially
            # resolved object, i.e. silently modify the wrong target
            return

        setattr(target, attr_to_set, value)
Beispiel #6
0
    def _selected_weapon_rewards(self, selected_weapon, selected_weapon_ammo, deltas):
        """Compute the shaping reward for keeping a weapon with ammo selected.

        :param selected_weapon: identifier of the currently selected weapon; used to build the
            ``SELECTED<id>`` lookup key and the ``weapon<id>`` bookkeeping key
        :param selected_weapon_ammo: ammo for the selected weapon, reward is only given when > 0
        :param deltas: list that receives a ``(weapon_key, reward)`` tuple when the reward applies
        :return: the reward (0.0 when the conditions are not met or the scheme has no matching entry)
        """
        # we must keep the weapon ready for a certain number of frames to get rewards
        unholstered = len(self.selected_weapon) > 4 and all(sw == selected_weapon for sw in self.selected_weapon)
        reward = 0.0

        if selected_weapon_ammo > 0 and unholstered:
            try:
                # the key is built from the function argument; the previous code referenced an
                # undefined name here, which raised NameError instead of looking up the reward
                reward = self.reward_shaping_scheme['selected_weapon'][f'SELECTED{selected_weapon}']
            except KeyError:
                # missing entry: dump the scheme and the weapon for debugging, reward stays 0.0
                log.error('%r', self.reward_shaping_scheme)
                log.error('%r', selected_weapon)
            weapon_key = f'weapon{selected_weapon}'
            deltas.append((weapon_key, reward))
            self.reward_structure[weapon_key] = self.reward_structure.get(weapon_key, 0.0) + reward

        return reward
Beispiel #7
0
def get_gpus_without_triggering_pytorch_cuda_initialization(envvars=None):
    """Query available GPUs by running ``sample_factory.utils.get_available_gpus`` in a child interpreter.

    :param envvars: environment passed to the child process, ``os.environ`` when None
    :return: decoded stdout of the child process (returned even when the exit code is non-zero,
        in which case an error is logged as well)
    """
    child_env = os.environ if envvars is None else envvars

    import subprocess
    query_cmd = [sys.executable, '-m', 'sample_factory.utils.get_available_gpus']
    completed = subprocess.run(query_cmd, capture_output=True, env=child_env)
    stdout_text = completed.stdout.decode()
    stderr_text = completed.stderr.decode()

    from sample_factory.utils.utils import log
    if completed.returncode:
        log.error(
            'Querying available GPUs... return code %d, error: %s, stdout: %s',
            completed.returncode, stderr_text, stdout_text,
        )

    log.debug('Queried available GPUs: %s', stdout_text)
    return stdout_text
Beispiel #8
0
    def reset(self):
        """Begin a new episode and return its first observation.

        When ``record_to`` is set and the game is not multiplayer, the episode is
        recorded to a demo file inside that directory. Otherwise a new episode is
        only requested from the second reset on. If the game provides no screen
        buffer, a black screen is used instead.

        :return: the screen image with its axes reordered as (1, 2, 0)
        """
        self._ensure_initialized()

        record_demo = self.record_to is not None and not self.is_multiplayer
        if record_demo:
            # does not work in multiplayer (uses different mechanism)
            if not os.path.exists(self.record_to):
                os.makedirs(self.record_to)

            demo_path = self.demo_path(self._num_episodes)
            log.warning('Recording episode demo to %s', demo_path)
            self.game.new_episode(demo_path)
        elif self._num_episodes > 0:
            # no demo recording (default)
            self.game.new_episode()

        self.state = self.game.get_state()
        try:
            img = self.state.screen_buffer
        except AttributeError:
            # sometimes Doom does not return screen buffer at all??? Rare bug
            img = None

        if img is None:
            log.error(
                'Game returned None screen buffer! This is not supposed to happen!'
            )
            img = self._black_screen()

        # Swap current and previous histogram, then clear the new current one
        if self.current_histogram is not None and self.previous_histogram is not None:
            self.current_histogram, self.previous_histogram = self.previous_histogram, self.current_histogram
            self.current_histogram.fill(0)

        self._actions_flattened = None
        # keep the info of the episode that just ended before clearing it
        self._last_episode_info = copy.deepcopy(self._prev_info)
        self._prev_info = None

        self._num_episodes += 1

        return np.transpose(img, (1, 2, 0))
Beispiel #9
0
        def wrapper(*args, **kwargs):
            """Call ``func`` up to ``num_attempts`` times, re-creating the wrapped object on failure.

            After each ``exception_class`` failure the object passed as the first
            positional argument is marked uninitialized and closed, and additionally
            reset when ``should_reset`` is set. This also happens after the final
            attempt, just before its exception is re-raised.
            """
            final_attempt = num_attempts - 1
            for attempt in range(num_attempts):
                try:
                    return func(*args, **kwargs)
                except exception_class as exc:
                    # the first positional argument is the `self` of the wrapped method
                    wrapped_self = args[0]
                    wrapped_self.initialized = False
                    wrapped_self.close()

                    # This is done to reset if it is in the step function
                    if should_reset:
                        wrapped_self.reset()

                    if attempt == final_attempt:
                        raise

                    log.error('Failed with error %r, trying again', exc)
                    sleep(sleep_time)
Beispiel #10
0
def init_wandb(cfg):
    """
    Must call initialization of Wandb before summary writer is initialized, otherwise
    sync_tensorboard does not work.

    Does nothing when ``cfg.with_wandb`` is false. Otherwise a unique run id is generated
    (unless ``cfg`` already contains ``wandb_unique_id``), ``wandb.init`` is attempted through
    the ``retry`` decorator, and the configuration is pushed to ``wandb.config``.

    :param cfg: experiment configuration; ``cfg.wandb_unique_id`` is set on it when missing
    """

    if not cfg.with_wandb:
        log.debug('Weights and Biases integration disabled')
        return

    if 'wandb_unique_id' not in cfg:
        # if we're going to restart the experiment, this will be saved to a json file
        cfg.wandb_unique_id = f'{cfg.experiment}_{datetime.now().strftime("%Y%m%d_%H%M%S_%f")}'

    wandb_unique_id = cfg.wandb_unique_id
    # the env name is the fallback group when none is configured
    wandb_group = cfg.env if cfg.wandb_group is None else cfg.wandb_group

    # log the effective group (the value actually passed to wandb.init), not the raw config entry
    log.debug(
        'Weights and Biases integration enabled. Project: %s, user: %s, group: %s, unique_id: %s',
        cfg.wandb_project, cfg.wandb_user, wandb_group, wandb_unique_id,
    )

    import wandb

    # this can fail occasionally, so we try a couple more times
    @retry(3, exceptions=(Exception,))
    def init_wandb_func():
        wandb.init(
            project=cfg.wandb_project, entity=cfg.wandb_user, sync_tensorboard=True,
            id=wandb_unique_id,
            name=wandb_unique_id,
            group=wandb_group, job_type=cfg.wandb_job_type, tags=cfg.wandb_tags,
            resume=True,
            settings=wandb.Settings(start_method='fork'),
        )

    log.debug('Initializing WandB...')
    try:
        init_wandb_func()
    except Exception as exc:
        # best effort: the failure is reported but does not abort the caller here
        log.error('Could not initialize WandB! %s', exc)

    # NOTE(review): this runs even when initialization failed above - confirm that is intended
    wandb.config.update(cfg, allow_val_change=True)
Beispiel #11
0
def read_seeds_file(filename, has_keys):
    """Read integer seeds from a text file with one entry per line.

    :param filename: path of the seeds file
    :param has_keys: when true, every line must consist of exactly two
        space-separated fields (seed and cache key) and only the seed is used;
        otherwise the whole line is parsed as the seed
    :return: list of seeds in file order; lines that cannot be parsed are
        logged and skipped
    """
    seeds = []

    with open(filename, 'r') as seed_file:
        for line in seed_file:
            try:
                if has_keys:
                    seed_text, _cache_key = line.split(' ')
                else:
                    seed_text = line

                seeds.append(int(seed_text))
            except Exception:
                log.error(
                    'Could not read seed value from the file! File potentially corrupted'
                )
                log.exception('Exception when reading seeds file')

    return seeds
Beispiel #12
0
    def run(self):
        """Start all worker processes and supervise sampling until termination.

        First waits until every worker reported ``finished_reset``, then sets ``start_event``
        and keeps consuming reports from ``report_queue``. Sampling terminates when a worker
        reports a crash, the desired number of frames is reached, the timeout expires, the user
        interrupts, a worker has not reported for 600 seconds or a worker process is not alive.
        """
        for p in self.processes:
            time.sleep(0.3)
            p.start()

        # builtin bool instead of np.bool: that alias was deprecated in NumPy 1.20 and removed in 1.24
        finished_reset = np.zeros([self.cfg.num_workers], dtype=bool)
        while not all(finished_reset):
            try:
                msg = self.report_queue.get(timeout=0.1)
                if 'finished_reset' in msg:
                    finished_reset[msg['proc_idx']] = True
                    log.debug('Process %d finished reset! Status %r',
                              msg['proc_idx'], finished_reset)
            except Empty:
                pass

        log.debug('All workers finished reset!')
        time.sleep(2)
        self.start_event.set()

        start = time.time()
        env_frames = 0
        # time of the most recent report of every worker, used to detect frozen processes
        last_process_report = [time.time() for _ in self.processes]

        while not self.terminate.value:
            try:
                try:
                    msgs = self.report_queue.get_many(
                        timeout=self.report_every_sec * 1.5)
                    for msg in msgs:
                        last_process_report[msg['proc_idx']] = time.time()

                        if 'crash' in msg:
                            self.terminate.value = True
                            log.error(
                                'Terminating due to process %d crashing...',
                                msg['proc_idx'])
                            break

                        env_frames += msg['env_frames']

                    if env_frames >= self.cfg.sample_env_frames:
                        log.warning('Desired number of frames reached')
                        self.terminate.value = True

                    if time.time() - start > self.cfg.timeout_seconds:
                        log.warning('Terminated by timer')
                        self.terminate.value = True
                except Empty:
                    # no reports within the timeout, fall through to the health checks below
                    pass
            except KeyboardInterrupt:
                self.terminate.value = True
                log.error('KeyboardInterrupt in main loop! Terminating...')
                break

            if time.time() - self.last_report > self.report_every_sec:
                self.report(env_frames)

            for proc_idx, last_report_time in enumerate(last_process_report):
                delay = time.time() - last_report_time
                if delay > 600:
                    # killing the whole script is the best way to know that some of the processes froze
                    log.error(
                        'Process %d had not responded in %.1f s!!! Terminating...',
                        proc_idx, delay)
                    self.terminate.value = True

            for p in self.processes:
                if not p.is_alive():
                    self.terminate.value = True
                    log.error('Process %r died! terminating...', p)

        total_time = time.time() - start
        log.info('Collected %d frames in %.1f s, avg FPS: %.1f', env_frames,
                 total_time, env_frames / total_time)
        log.debug('Done sampling...')
Beispiel #13
0
    def sample(self, proc_idx):
        """Body of a sampler worker process: create envs, reset them and step them with random actions.

        After all envs of this worker are reset, ``finished_reset`` is reported through
        ``report_queue`` and the worker waits for ``start_event``. It then steps every env in turn
        until ``terminate`` is set or ``cfg.sample_env_frames_per_worker`` frames are collected,
        periodically reporting the number of new frames. Any exception raised after env creation
        is logged and reported as a ``crash`` message; the envs are closed afterwards in any case.

        :param proc_idx: index of this worker, used for GPU/CPU assignment, env ids and reports
        """
        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        if self.cfg.sampler_worker_gpus:
            set_gpus_for_process(
                proc_idx,
                num_gpus_per_process=1,
                process_type='sampler_proc',
                gpu_mask=self.cfg.sampler_worker_gpus,
            )

        timing = Timing()

        from threadpoolctl import threadpool_limits
        with threadpool_limits(limits=1, user_api=None):
            if self.cfg.set_workers_cpu_affinity:
                set_process_cpu_affinity(proc_idx, self.cfg.num_workers)

            # remember the affinity now so that it can be compared after sampling (skipped on MacOS)
            initial_cpu_affinity = psutil.Process().cpu_affinity(
            ) if platform != 'darwin' else None
            psutil.Process().nice(10)

            with timing.timeit('env_init'):
                envs = []
                # per-env key used for timing and the final episode length report
                env_key = ['env' for _ in range(self.cfg.num_envs_per_worker)]

                for env_idx in range(self.cfg.num_envs_per_worker):
                    global_env_id = proc_idx * self.cfg.num_envs_per_worker + env_idx
                    env_config = AttrDict(worker_index=proc_idx,
                                          vector_index=env_idx,
                                          env_id=global_env_id)

                    env = make_env_func(cfg=self.cfg, env_config=env_config)
                    log.debug(
                        'CPU affinity after create_env: %r',
                        psutil.Process().cpu_affinity()
                        if platform != 'darwin' else 'MacOS - None')
                    env.seed(global_env_id)
                    envs.append(env)

                    # this is to track the performance for individual DMLab levels
                    if hasattr(env.unwrapped, 'level_name'):
                        env_key[env_idx] = env.unwrapped.level_name

                # frames in the current episode and the lengths of the last (up to 20) episodes, per env
                episode_length = [0 for _ in envs]
                episode_lengths = [deque([], maxlen=20) for _ in envs]

            # sample a lot of random actions once, otherwise it is pretty slow in Python
            # NOTE(review): `env` here is the last env created above, so its action space and
            # num_agents are used for all envs - presumably they are identical; confirm
            total_random_actions = 500
            actions = [[
                env.action_space.sample() for _ in range(env.num_agents)
            ] for _ in range(total_random_actions)]
            action_i = 0

            try:
                with timing.timeit('first_reset'):
                    for env_idx, env in enumerate(envs):
                        env.reset()
                        log.info('Process %d finished resetting %d/%d envs',
                                 proc_idx, env_idx + 1, len(envs))

                    self.report_queue.put(
                        dict(proc_idx=proc_idx, finished_reset=True))

                # block until the main process signals that all workers are ready
                self.start_event.wait()

                with timing.timeit('work'):
                    last_report = last_report_frames = total_env_frames = 0
                    while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                        for env_idx, env in enumerate(envs):
                            with timing.add_time(f'{env_key[env_idx]}.step'):
                                obs, rewards, dones, infos = env.step(
                                    actions[action_i])
                                # cycle through the pre-sampled actions
                                action_i = (action_i +
                                            1) % total_random_actions

                            # an info without 'num_frames' counts as a single frame
                            num_frames = sum(
                                [info.get('num_frames', 1) for info in infos])
                            total_env_frames += num_frames
                            episode_length[env_idx] += num_frames

                            if all(dones):
                                episode_lengths[env_idx].append(
                                    episode_length[env_idx] / env.num_agents)
                                episode_length[env_idx] = 0

                        with timing.add_time('report'):
                            now = time.time()
                            if now - last_report > self.report_every_sec:
                                last_report = now
                                # only the frames collected since the previous report are sent
                                frames_since_last_report = total_env_frames - last_report_frames
                                last_report_frames = total_env_frames
                                self.report_queue.put(
                                    dict(proc_idx=proc_idx,
                                         env_frames=frames_since_last_report))

                                if proc_idx == 0:
                                    log.debug('Memory usage: %.4f Mb',
                                              memory_consumption_mb())

                # Extra check to make sure cpu affinity is preserved throughout the execution.
                # I observed weird effect when some environments tried to alter affinity of the current process, leading
                # to decreased performance.
                # This can be caused by some interactions between deep learning libs, OpenCV, MKL, OpenMP, etc.
                # At least user should know about it if this is happening.
                cpu_affinity = psutil.Process().cpu_affinity(
                ) if platform != 'darwin' else None
                assert initial_cpu_affinity == cpu_affinity, \
                    f'Worker CPU affinity was changed from {initial_cpu_affinity} to {cpu_affinity}!' \
                    f'This can significantly affect performance!'

            # NOTE(review): bare except also catches SystemExit and the AssertionError above;
            # every failure is turned into a 'crash' report for the main process
            except:
                log.exception('Unknown exception')
                log.error('Unknown exception in worker %d, terminating...',
                          proc_idx)
                self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

            # the delay grows with proc_idx - presumably to keep the final log lines of
            # different workers from interleaving; confirm
            time.sleep(proc_idx * 0.01 + 0.01)
            log.info('Process %d finished sampling. Timing: %s', proc_idx,
                     timing)

            for env_idx, env in enumerate(envs):
                if len(episode_lengths[env_idx]) > 0:
                    log.warning('Level %s avg episode len %d',
                                env_key[env_idx],
                                np.mean(episode_lengths[env_idx]))

            for env in envs:
                env.close()
Beispiel #14
0
    def init_subset(self, indices, actor_queues):
        """
        Create the actor (rollout) workers with the given indices and block until each of them
        has reported that its first reset() is finished.

        A worker is killed and replaced by a fresh one when it has not initialized a single env
        within cfg.reset_timeout_seconds, when it reports a critical error, or when its process
        is no longer alive.

        :param indices: indices of actor workers to initialize
        :param actor_queues: task queues, indexed by worker index
        :return: the initialized workers (a view of the values of the internal dict)
        """

        # fail worker if not a single env was reset in that time
        reset_timelimit_seconds = self.cfg.reset_timeout_seconds

        workers = dict()
        last_env_initialized = dict()
        for idx in indices:
            worker = self.create_actor_worker(idx, actor_queues[idx])
            worker.init()
            worker.request_reset()
            workers[idx] = worker
            last_env_initialized[idx] = time.time()

        total_num_envs = self.cfg.num_workers * self.cfg.num_envs_per_worker
        envs_initialized = [0 for _ in range(self.cfg.num_workers)]
        workers_finished = set()

        while len(workers_finished) < len(workers):
            failed_worker = -1

            try:
                report = self.report_queue.get(timeout=1.0)
            except Empty:
                # nothing reported within a second, still run the health checks below
                pass
            else:
                if 'initialized_env' in report:
                    reporting_worker, _, _ = report['initialized_env']
                    last_env_initialized[reporting_worker] = time.time()
                    envs_initialized[reporting_worker] += 1

                    log.debug(
                        'Progress for %d workers: %d/%d envs initialized...',
                        len(indices),
                        sum(envs_initialized),
                        total_num_envs,
                    )
                elif 'finished_reset' in report:
                    workers_finished.add(report['finished_reset'])
                elif 'critical_error' in report:
                    failed_worker = report['critical_error']

            for worker_idx, w in workers.items():
                if worker_idx in workers_finished:
                    continue

                time_passed = time.time() - last_env_initialized[worker_idx]
                timed_out = time_passed > reset_timelimit_seconds

                needs_restart = timed_out or failed_worker == worker_idx or not w.process.is_alive()
                if not needs_restart:
                    continue

                envs_initialized[worker_idx] = 0

                log.error('Worker %d is stuck or failed (%.3f). Reset!',
                          w.worker_idx, time_passed)
                log.debug('Status: %r', w.process.is_alive())
                w.process.kill()

                # replace the killed worker with a fresh one on the same queue
                replacement = self.create_actor_worker(
                    worker_idx, actor_queues[worker_idx])
                replacement.init()
                replacement.request_reset()

                last_env_initialized[worker_idx] = time.time()
                workers[worker_idx] = replacement

        return workers.values()
Beispiel #15
0
    def sample(self, proc_idx):
        """Body of a worker process that repeatedly resets a single env to populate the level cache.

        The env is created and reset once, ``finished_reset`` is reported and the worker waits for
        ``start_event``. In the work loop every iteration performs one random step followed by a
        reset, and counts the reset as one generated level. Workers that reached their desired
        number of levels, or whose env does not use the level cache, sleep to leave CPU time to
        the others. Exceptions in the work loop are logged and reported as a ``crash`` message.

        :param proc_idx: index of this worker, used for env ids and reports
        """
        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        timing = Timing()

        psutil.Process().nice(10)

        num_envs = len(DMLAB30_LEVELS_THAT_USE_LEVEL_CACHE)
        assert self.cfg.num_workers % num_envs == 0, f'should have an integer number of workers per env, e.g. {1 * num_envs}, {2 * num_envs}, etc...'
        assert self.cfg.num_envs_per_worker == 1, 'use populate_cache with 1 env per worker'

        with timing.timeit('env_init'):
            env_key = 'env'
            env_desired_num_levels = 0
            # bound up front: the work loop increments this counter even when the env has no
            # `level_name`, which previously ended in a NameError reported as a worker crash
            env_num_levels_generated = 0

            global_env_id = proc_idx * self.cfg.num_envs_per_worker
            env_config = AttrDict(worker_index=proc_idx, vector_index=0, env_id=global_env_id)
            env = create_env(self.cfg.env, cfg=self.cfg, env_config=env_config)
            env.seed(global_env_id)

            # this is to track the performance for individual DMLab levels
            if hasattr(env.unwrapped, 'level_name'):
                env_key = env.unwrapped.level_name
                env_level = env.unwrapped.level

                # desired number of levels for this worker: estimated episodes for the whole
                # training run, split between the workers that share this env
                approx_num_episodes_per_1b_frames = DMLAB30_APPROX_NUM_EPISODES_PER_BILLION_FRAMES[env_key]
                num_billions = DESIRED_TRAINING_LENGTH / int(1e9)
                num_workers_for_env = self.cfg.num_workers // num_envs
                env_desired_num_levels = int((approx_num_episodes_per_1b_frames * num_billions) / num_workers_for_env)

                # levels already present in the cache count as generated, again split per worker
                env_num_levels_generated = len(dmlab_level_cache.DMLAB_GLOBAL_LEVEL_CACHE[0].all_seeds[env_level]) // num_workers_for_env

                log.warning('Worker %d (env %s) generated %d/%d levels!', proc_idx, env_key, env_num_levels_generated, env_desired_num_levels)
                time.sleep(4)

            env.reset()
            env_uses_level_cache = env.unwrapped.env_uses_level_cache

            self.report_queue.put(dict(proc_idx=proc_idx, finished_reset=True))

        # block until the main process signals that all workers are ready
        self.start_event.wait()

        try:
            with timing.timeit('work'):
                last_report = last_report_frames = total_env_frames = 0
                while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                    action = env.action_space.sample()
                    with timing.add_time(f'{env_key}.step'):
                        env.step(action)

                    total_env_frames += 1

                    # every iteration resets the env, each reset is counted as one generated level
                    with timing.add_time(f'{env_key}.reset'):
                        env.reset()
                        env_num_levels_generated += 1
                        log.debug('Env %s done %d/%d resets', env_key, env_num_levels_generated, env_desired_num_levels)

                    if env_num_levels_generated >= env_desired_num_levels:
                        log.debug('%s finished %d/%d resets, sleeping...', env_key, env_num_levels_generated, env_desired_num_levels)
                        time.sleep(30)  # free up CPU time for other envs

                    # if env does not use level cache, there is no need to run it
                    # let other workers proceed
                    if not env_uses_level_cache:
                        log.debug('Env %s does not require cache, sleeping...', env_key)
                        time.sleep(200)

                    with timing.add_time('report'):
                        now = time.time()
                        if now - last_report > self.report_every_sec:
                            last_report = now
                            # only the frames collected since the previous report are sent
                            frames_since_last_report = total_env_frames - last_report_frames
                            last_report_frames = total_env_frames
                            self.report_queue.put(dict(proc_idx=proc_idx, env_frames=frames_since_last_report))

                            # less than 3 * 1024 Mb free: complain and pause instead of generating more levels
                            if get_free_disk_space_mb(self.cfg) < 3 * 1024:
                                log.error('Not enough disk space! %d', get_free_disk_space_mb(self.cfg))
                                time.sleep(200)
        # NOTE(review): bare except at the process boundary, every failure becomes a 'crash' report
        except:
            log.exception('Unknown exception')
            log.error('Unknown exception in worker %d, terminating...', proc_idx)
            self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

        time.sleep(proc_idx * 0.1 + 0.1)
        log.info('Process %d finished sampling. Timing: %s', proc_idx, timing)

        env.close()
Beispiel #16
0
def run(run_description, args):
    """Run all experiments of a run description as local subprocesses.

    At most ``args.max_parallel`` processes run at the same time. When
    ``args.experiments_per_gpu`` is positive, each process is additionally pinned to the GPU with
    the most free slots via ``CUDA_VISIBLE_DEVICES`` and no GPU gets more than that many processes.
    Failed processes (non-zero return code) are collected and periodically logged.

    :param run_description: object providing ``experiments``, ``run_name`` and
        ``generate_experiments(train_dir)``
    :param args: parsed arguments; reads ``max_parallel``, ``num_gpus``, ``experiments_per_gpu``,
        ``train_dir`` and ``pause_between``
    :return: always 0, also when some processes failed
    """
    experiments = run_description.experiments
    max_parallel = args.max_parallel

    log.info('Starting processes with base cmds: %r',
             [e.cmd for e in experiments])
    log.info('Max parallel processes is %d', max_parallel)
    log.info(
        'Monitor log files using\n\n\ttail -f train_dir/%s/**/**/sf_log.txt\n\n',
        run_description.run_name)

    # currently running processes and, per GPU, the commands of the processes assigned to it
    processes = []
    processes_per_gpu = {g: [] for g in range(args.num_gpus)}

    # from here on `experiments` is the iterator of concrete experiments to launch
    experiments = run_description.generate_experiments(args.train_dir)
    next_experiment = next(experiments, None)

    def find_least_busy_gpu():
        # returns (gpu_id, free_slots) of the GPU with the most free slots,
        # or (None, 0) when no GPU has a free slot
        least_busy_gpu = None
        gpu_available_processes = 0

        for gpu_id in range(args.num_gpus):
            available_processes = args.experiments_per_gpu - len(
                processes_per_gpu[gpu_id])
            if available_processes > gpu_available_processes:
                gpu_available_processes = available_processes
                least_busy_gpu = gpu_id

        return least_busy_gpu, gpu_available_processes

    def can_squeeze_another_process():
        # true when both the global limit and (if enabled) the per-GPU limit allow one more process
        if len(processes) >= max_parallel:
            return False

        if args.experiments_per_gpu > 0:
            least_busy_gpu, gpu_available_processes = find_least_busy_gpu()
            if gpu_available_processes <= 0:
                return False

        return True

    failed_processes = []
    last_log_time = 0
    log_interval = 3  # seconds

    while len(processes) > 0 or next_experiment is not None:
        # launch as many pending experiments as the limits allow
        while can_squeeze_another_process() and next_experiment is not None:
            cmd, name, root_dir, exp_env_vars = next_experiment

            # NOTE(review): plain split on spaces, arguments containing spaces are not supported
            cmd_tokens = cmd.split(' ')

            # workaround to make sure we're running the correct python executable from our virtual env
            if cmd_tokens[0].startswith('python'):
                cmd_tokens[0] = sys.executable
                log.debug('Using Python executable %s', cmd_tokens[0])

            ensure_dir_exists(join(args.train_dir, root_dir))

            envvars = os.environ.copy()

            best_gpu = None
            if args.experiments_per_gpu > 0:
                best_gpu, best_gpu_available_processes = find_least_busy_gpu()
                log.info(
                    'The least busy gpu is %d where we can run %d more processes',
                    best_gpu,
                    best_gpu_available_processes,
                )
                envvars['CUDA_VISIBLE_DEVICES'] = f'{best_gpu}'

            log.info('Starting process %r', cmd_tokens)

            # experiment-specific variables are applied last, so they override the inherited ones
            if exp_env_vars is not None:
                for key, value in exp_env_vars.items():
                    log.info('Adding env variable %r %r', key, value)
                    envvars[str(key)] = str(value)

            process = subprocess.Popen(cmd_tokens,
                                       stdout=None,
                                       stderr=None,
                                       env=envvars)
            # bookkeeping attributes attached to the Popen object for later cleanup and logging
            process.gpu_id = best_gpu
            process.proc_cmd = cmd

            processes.append(process)

            if process.gpu_id is not None:
                processes_per_gpu[process.gpu_id].append(process.proc_cmd)

            log.info('Started process %s on GPU %r', process.proc_cmd,
                     process.gpu_id)
            log.info('Waiting for %d seconds before starting next process',
                     args.pause_between)
            time.sleep(args.pause_between)

            next_experiment = next(experiments, None)

        # collect finished processes, freeing their GPU slots
        remaining_processes = []
        for process in processes:
            if process.poll() is None:
                # still running
                remaining_processes.append(process)
                continue
            else:
                if process.gpu_id is not None:
                    processes_per_gpu[process.gpu_id].remove(process.proc_cmd)
                log.info('Process %r finished with code %r', process.proc_cmd,
                         process.returncode)
                if process.returncode != 0:
                    failed_processes.append(
                        (process.proc_cmd, process.pid, process.returncode))
                    log.error('WARNING: RETURN CODE IS %r', process.returncode)

        processes = remaining_processes

        # remind about failed processes at most once per log_interval
        if time.time() - last_log_time > log_interval:
            if failed_processes:
                log.error(
                    'Failed processes: %s', ', '.join([
                        f'PID: {p[1]} code: {p[2]}' for p in failed_processes
                    ]))
            last_log_time = time.time()

        time.sleep(0.1)

    log.info('Done!')

    return 0
Beispiel #17
0
def run_slurm(run_description, args):
    """Submit every experiment of a run description as a separate Slurm job.

    For each experiment an sbatch script (template + experiment command) is written into
    ``args.slurm_workdir``. The scripts are then submitted one by one with ``sbatch``; with
    ``args.slurm_print_only`` nothing is submitted and the running index is used as the job id.
    Finally a ``scancel.sh`` file with the command cancelling all submitted jobs is written.

    :param run_description: object providing ``experiments`` and ``generate_experiments(train_dir)``
    :param args: parsed arguments (``slurm_*`` options, ``pause_between``, ``train_dir``)
    :return: always 0
    """
    workdir = args.slurm_workdir
    pause_between = args.pause_between

    log.info('Starting processes with base cmds: %r',
             [e.cmd for e in run_description.experiments])

    if not os.path.exists(workdir):
        log.info('Creating %s...', workdir)
        os.makedirs(workdir)

    # a user-supplied template file takes precedence over the built-in default
    template_path = args.slurm_sbatch_template
    if template_path is None:
        sbatch_template = SBATCH_TEMPLATE_DEFAULT
    else:
        with open(template_path, 'r') as template_file:
            sbatch_template = template_file.read()

    log.info('Sbatch template: %s', sbatch_template)

    # step 1: write all sbatch scripts before anything is submitted
    sbatch_files = []
    for cmd, name, *_ in run_description.generate_experiments(args.train_dir):
        script_path = join(workdir, f'sbatch_{name}.sh')

        script_text = ''.join([sbatch_template, '\n', cmd, '\n\necho "Done!!!"'])
        with open(script_path, 'w') as sbatch_f:
            sbatch_f.write(script_text)

        sbatch_files.append(script_path)

    partition = '' if args.slurm_partition is None else f'-p {args.slurm_partition} '

    # step 2: submit the scripts and collect the job ids
    job_ids = []
    for idx, sbatch_file in enumerate(sbatch_files, start=1):
        sbatch_fname = os.path.basename(sbatch_file)
        num_cpus = args.slurm_cpus_per_gpu * args.slurm_gpus_per_job
        cmd = f'sbatch {partition}--gres=gpu:{args.slurm_gpus_per_job} -c {num_cpus} --parsable --output {workdir}/{sbatch_fname}-slurm-%j.out {sbatch_file}'
        log.info('Executing %s...', cmd)

        if args.slurm_print_only:
            # dry run: the running index stands in for the job id
            output = idx
        else:
            process = Popen(cmd.split(), stdout=PIPE)
            output, err = process.communicate()
            exit_code = process.wait()
            log.info('Output: %s, err: %s, exit code: %r', output, err,
                     exit_code)

            if exit_code != 0:
                log.error('sbatch process failed!')
                time.sleep(5)

        # the output of sbatch is expected to be just the numeric job id
        job_ids.append(str(int(output)))

        time.sleep(pause_between)

    tail_cmd = f'tail -f {workdir}/*.out'
    log.info('Monitor log files using\n\n\t %s \n\n', tail_cmd)

    scancel_cmd = f'scancel {" ".join(job_ids)}'

    log.info('Jobs queued: %r', job_ids)

    log.info('Use this command to cancel your jobs: \n\t %s \n', scancel_cmd)

    with open(join(workdir, 'scancel.sh'), 'w') as fobj:
        fobj.write(scancel_cmd)

    log.info('Done!')
    return 0