Example no. 1
    def reset(self, report_queue):
        """
        Do the very first reset for all environments in a vector. Populate shared memory with initial obs.
        Note that this is called only once, at the very beginning of training. After this the envs should auto-reset.

        :param report_queue: we use report queue to monitor reset progress (see appo.py). This can be a lengthy
        process.
        :return: first requests for policy workers (to generate actions for the very first env step)
        """

        for env_i, e in enumerate(self.envs):
            observations = e.reset()

            if self.cfg.decorrelate_envs_on_one_worker:
                decorrelate_steps = self.cfg.rollout * env_i + 1
                log.info('Decorrelating experience for %d frames...', decorrelate_steps)
                for decorrelate_step in range(decorrelate_steps):
                    actions = [e.action_space.sample() for _ in range(self.num_agents)]
                    observations, rew, dones, info = e.step(actions)

            for agent_i, obs in enumerate(observations):
                actor_state = self.actor_states[env_i][agent_i]
                actor_state.set_trajectory_data(dict(obs=obs), self.traj_buffer_idx, self.rollout_step)
                # rnn state is already initialized at zero

            # log.debug(
            #     'Reset progress w:%d-%d finished %d/%d, still initializing envs...',
            #     self.worker_idx, self.split_idx, env_i + 1, len(self.envs),
            # )
            report_queue.put(dict(initialized_env=(self.worker_idx, self.split_idx, env_i)))

        policy_request = self._format_policy_request()
        return policy_request
Example no. 2
def enjoy(params,
          env_id,
          max_num_episodes=1000000,
          max_num_frames=1e9,
          fps=20):
    def make_env_func():
        e = create_env(env_id, mode='test', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentPPO(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    return run_policy_loop(agent,
                           env,
                           max_num_episodes,
                           fps,
                           max_num_frames=max_num_frames,
                           deterministic=False)
Example no. 3
    def __init__(self, env, ph_obs, params=None):
        """
        :param env: environment
        :param ph_obs: placeholder for observations
        """
        with tf.variable_scope('rnd'):
            self.params = params
            self.ph_obs = ph_obs

            reg = None  # don't use regularization

            obs_space = main_observation_space(env)

            # target network: randomly initialized and frozen (gradients are stopped)
            target_enc_params = get_enc_params(params, 'rnd_target')
            target_encoder = make_encoder(ph_obs, obs_space, reg, target_enc_params, name='target_encoder')
            self.tgt_features = tf.stop_gradient(target_encoder.encoded_input)

            # predictor network: trained to match the target features
            predictor_enc_params = get_enc_params(params, 'rnd_predictor')
            predictor_encoder = make_encoder(ph_obs, obs_space, reg, predictor_enc_params, name='predictor_encoder')
            self.predicted_features = predictor_encoder.encoded_input

            self.feature_vector_size = self.predicted_features.get_shape().as_list()[-1]
            log.info('Feature vector size in RND module: %d', self.feature_vector_size)

            self.objectives = self._objectives()

            self._add_summaries()
            self.summaries = merge_summaries(collections=['rnd'])

            self.step = tf.Variable(0, trainable=False, dtype=tf.int64, name='rnd_step')

            opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='rnd_opt')
            self.train_rnd = opt.minimize(self.objectives.loss, global_step=self.step)
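The `_objectives()` call above is not shown in this snippet. In a standard RND setup the predictor encoder is trained to regress the frozen target features, and the per-sample prediction error doubles as the intrinsic (exploration) reward. A minimal sketch of such an objective, assuming a hypothetical `Objectives` container (not the actual implementation):

from collections import namedtuple

import tensorflow as tf

# hypothetical container; the real _objectives() may return something different
Objectives = namedtuple('Objectives', ['loss', 'intrinsic_reward'])


def rnd_objectives(predicted_features, tgt_features):
    # per-sample squared error between predictor and (gradient-stopped) target features
    feature_error = tf.reduce_mean(tf.square(predicted_features - tgt_features), axis=-1)
    # scalar training loss for the predictor; feature_error itself serves as the exploration bonus
    return Objectives(loss=tf.reduce_mean(feature_error), intrinsic_reward=feature_error)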
Example no. 4
    def _save(self):
        checkpoint = self._get_checkpoint_dict()
        assert checkpoint is not None

        checkpoint_dir = self.checkpoint_dir(self.cfg, self.policy_id)
        tmp_filepath = join(checkpoint_dir, '.temp_checkpoint')
        checkpoint_name = f'checkpoint_{self.train_step:09d}_{self.env_steps}.pth'
        filepath = join(checkpoint_dir, checkpoint_name)
        log.info('Saving %s...', tmp_filepath)
        torch.save(checkpoint, tmp_filepath)
        log.info('Renaming %s to %s', tmp_filepath, filepath)
        os.rename(tmp_filepath, filepath)

        while len(self.get_checkpoints(checkpoint_dir)) > self.cfg.keep_checkpoints:
            oldest_checkpoint = self.get_checkpoints(checkpoint_dir)[0]
            if os.path.isfile(oldest_checkpoint):
                log.debug('Removing %s', oldest_checkpoint)
                os.remove(oldest_checkpoint)

        if self.cfg.save_milestones_sec > 0:
            # milestones enabled
            if time.time() - self.last_milestone_time >= self.cfg.save_milestones_sec:
                milestones_dir = ensure_dir_exists(
                    join(checkpoint_dir, 'milestones'))
                milestone_path = join(milestones_dir,
                                      f'{checkpoint_name}.milestone')
                log.debug('Saving a milestone %s', milestone_path)
                shutil.copy(filepath, milestone_path)
                self.last_milestone_time = time.time()
Example no. 5
def test_multi_env_performance(test, env_type, num_envs, num_workers):
    t = Timing()
    with t.timeit('init'):
        multi_env = MultiEnv(num_envs,
                             num_workers,
                             test.make_env,
                             stats_episodes=100)
        total_num_frames, frames = 20000, 0

    with t.timeit('first_reset'):
        multi_env.reset()

    next_print = print_step = 10000
    with t.timeit('experience'):
        while frames < total_num_frames:
            _, _, done, info = multi_env.step([0] * num_envs)
            frames += num_env_steps(info)
            if frames > next_print:
                log.info('Collected %d frames of experience...', frames)
                next_print += print_step

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames in parallel, %.1f FPS',
              t.experience, total_num_frames, fps)
    log.debug('Timing: %s', t)

    multi_env.close()
Example no. 6
def calc_distance_to_memory(agent, sparse_map, obs):
    distance_net = agent.curiosity.distance

    num_landmarks = sparse_map.num_landmarks()
    curr_obs = [obs] * num_landmarks
    map_obs = [
        sparse_map.get_observation(node) for node in sparse_map.graph.nodes
    ]

    distances = distance_net.distances_from_obs(
        agent.session,
        obs_first=map_obs,
        obs_second=curr_obs,
    )

    min_d, min_d_idx = min_with_idx(distances)
    global last_distances
    last_distances.append(min_d)

    # log.info('Avg.distance: %.3f', np.mean(last_distances))
    log.info('Curr.distance: %.3f', min_d)

    import cv2
    closest_node = list(sparse_map.graph.nodes)[min_d_idx]
    closest_obs = sparse_map.get_observation(closest_node)
    cv2.imshow(
        'closest_obs',
        cv2.resize(cv2.cvtColor(closest_obs, cv2.COLOR_RGB2BGR), (420, 420)))
    cv2.waitKey(1)
Example no. 7
def set_gpus_for_process(process_idx,
                         num_gpus_per_process,
                         process_type,
                         gpu_mask=None):
    available_gpus = get_available_gpus()
    if gpu_mask is not None:
        assert len(available_gpus) >= len(gpu_mask)
        available_gpus = [available_gpus[g] for g in gpu_mask]
    num_gpus = len(available_gpus)
    gpus_to_use = []

    if num_gpus == 0:
        os.environ[CUDA_ENVVAR] = ''
        log.debug('Not using GPUs for %s process %d', process_type,
                  process_idx)
    else:
        first_gpu_idx = process_idx * num_gpus_per_process
        for i in range(num_gpus_per_process):
            index_mod_num_gpus = (first_gpu_idx + i) % num_gpus
            gpus_to_use.append(available_gpus[index_mod_num_gpus])

        os.environ[CUDA_ENVVAR] = ','.join([str(g) for g in gpus_to_use])
        log.info(
            'Set environment var %s to %r for %s process %d',
            CUDA_ENVVAR,
            os.environ[CUDA_ENVVAR],
            process_type,
            process_idx,
        )
        log.debug('Visible device count: %r', torch.cuda.device_count())

    return gpus_to_use
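To illustrate the round-robin assignment above, here is a small self-contained sketch (plain Python, hypothetical helper name, no CUDA required) that reproduces the same index arithmetic for a machine with 4 visible GPUs:

def round_robin_gpus(process_idx, num_gpus_per_process, available_gpus):
    # same modular arithmetic as set_gpus_for_process, without touching environment variables
    first_gpu_idx = process_idx * num_gpus_per_process
    return [available_gpus[(first_gpu_idx + i) % len(available_gpus)]
            for i in range(num_gpus_per_process)]


# with 4 GPUs and one GPU per process, processes 0..5 land on GPUs 0, 1, 2, 3, 0, 1
print([round_robin_gpus(p, 1, [0, 1, 2, 3]) for p in range(6)])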
Example no. 8
    def _advance_rollouts(self, data, timing):
        split_idx = data['split_idx']

        runner = self.env_runners[split_idx]
        policy_request, complete_rollouts, episodic_stats = runner.advance_rollouts(data, timing)

        with timing.add_time('complete_rollouts'):
            if complete_rollouts:
                self._enqueue_complete_rollouts(split_idx, complete_rollouts)

                if self.num_complete_rollouts == 0 and not self.cfg.benchmark:
                    # we just finished our first complete rollouts, a perfect time to wait for experience decorrelation
                    # this guarantees that there won't be any "old" trajectories when we awaken
                    delay = (float(self.worker_idx) / self.cfg.num_workers) * self.cfg.decorrelate_experience_max_seconds
                    log.info(
                        'Worker %d, sleep for %.3f sec to decorrelate experience collection',
                        self.worker_idx, delay,
                    )
                    time.sleep(delay)
                    log.info('Worker %d awakens!', self.worker_idx)

                self.num_complete_rollouts += len(complete_rollouts['rollouts'])

        with timing.add_time('enqueue_policy_requests'):
            if policy_request is not None:
                self._enqueue_policy_request(split_idx, policy_request)

        if episodic_stats:
            self._report_stats(episodic_stats)
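The sleep above staggers workers evenly across `decorrelate_experience_max_seconds`, so they do not all start pushing trajectories at the same moment. A quick self-contained sketch of the resulting schedule (the worker count and time budget below are made-up values):

def decorrelation_delay(worker_idx, num_workers, decorrelate_experience_max_seconds):
    # same formula as in _advance_rollouts above
    return (float(worker_idx) / num_workers) * decorrelate_experience_max_seconds


# e.g. 8 workers and a 10-second budget: delays of 0.0, 1.25, 2.5, ..., 8.75 seconds
print([decorrelation_delay(w, 8, 10.0) for w in range(8)])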
Example no. 9
    def _handle_reset(self):
        for split_idx, env_runner in enumerate(self.env_runners):
            policy_inputs = env_runner.reset(self.report_queue)
            self._enqueue_policy_request(split_idx, policy_inputs)

        log.info('Finished reset for worker %d', self.worker_idx)
        self.report_queue.put(dict(finished_reset=self.worker_idx))
Example no. 10
def run_policy_loop(agent, env, max_num_episodes, fps=7, deterministic=False):
    """Execute the policy and render onto the screen, using the standard agent interface."""
    agent.initialize()

    episode_rewards = []
    for _ in range(max_num_episodes):
        obs, done = env.reset(), False
        episode_reward = 0

        while not done:
            start = time.time()
            env.render()
            if fps < 1000:
                time.sleep(1.0 / fps)
            action = agent.best_action(obs, deterministic=deterministic)
            obs, rew, done, _ = env.step(action)
            episode_reward += rew

            log.info('Actual fps: %.1f', 1.0 / (time.time() - start))

        env.render()
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f', episode_reward, len(last_episodes), avg_reward,
        )

    agent.finalize()
    env.close()
    return 0
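Note that the fixed `time.sleep(1.0 / fps)` above does not account for the time spent in `render()` and `best_action()`, which is why the logged actual fps comes out below the requested value. A tiny self-contained sketch of that effect (the 15 ms is an arbitrary stand-in for per-frame work):

import time

fps = 20
start = time.time()
time.sleep(1.0 / fps)   # the fixed per-frame sleep from run_policy_loop
time.sleep(0.015)       # stand-in for render() + best_action() taking ~15 ms
actual_fps = 1.0 / (time.time() - start)
print('requested %d fps, actual ~%.1f fps' % (fps, actual_fps))  # roughly 15 fps, not 20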
Example no. 11
    def __init__(self, cfg):
        self.cfg = cfg

        if self.cfg.seed is not None:
            log.info('Setting fixed seed %d', self.cfg.seed)
            torch.manual_seed(self.cfg.seed)
            np.random.seed(self.cfg.seed)

        self.device = torch.device('cuda')

        self.train_step = self.env_steps = 0

        self.total_train_seconds = 0
        self.last_training_step = time.time()

        self.best_avg_reward = math.nan

        self.summary_rate_decay = LinearDecay([(0, 100), (1000000, 2000),
                                               (10000000, 10000)])
        self.last_summary_written = -1e9
        self.save_rate_decay = LinearDecay([(0, self.cfg.initial_save_rate),
                                            (1000000, 5000)],
                                           staircase=100)

        summary_dir = summaries_dir(experiment_dir(cfg=self.cfg))
        self.writer = SummaryWriter(summary_dir, flush_secs=10)
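`LinearDecay` above is configured with (env step, value) breakpoints, and `.at(step)` is queried later (see `_maybe_save` further down). Assuming it performs plain piecewise-linear interpolation between breakpoints (an assumption; the real class also supports a `staircase` option not modeled here), an equivalent sketch with NumPy:

import numpy as np


def linear_decay_at(step, schedule):
    # piecewise-linear interpolation over (step, value) breakpoints, clamped at both ends
    steps, values = zip(*schedule)
    return float(np.interp(step, steps, values))


summary_rate_schedule = [(0, 100), (1000000, 2000), (10000000, 10000)]
# summaries are written every ~100 steps early on, every ~10000 steps after 10M env steps
for s in (0, 500000, 1000000, 5000000, 20000000):
    print(s, linear_decay_at(s, summary_rate_schedule))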
Example no. 12
def run(args, config):
    local_mode = False
    if args.dbg:
        local_mode = True

    ray.init(local_mode=local_mode)

    cls = get_agent_class(args._run)
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_steps = int(1e9)

    render_frameskip = args.render_action_repeat
    if render_frameskip == -1:
        # default - read from config
        # fallback to default if env config does not have it
        render_frameskip = cfg_param('skip_frames',
                                     config.get('env_config', None))

    log.info('Using render frameskip %d! \n\n\n', render_frameskip)

    rollout_loop(
        agent,
        args.env,
        num_steps,
        num_episodes=args.num_episodes,
        no_render=args.no_render,
        fps=args.fps,
        frameskip=render_frameskip,
    )
Example no. 13
    def _init(self):
        """
        Initialize env runners, which actually do all the work. We also do some utility work here, e.g.
        setting process affinity (a performance optimization).
        """

        log.info('Initializing envs for env runner %d...', self.worker_idx)

        if self.cfg.force_envs_single_thread:
            from threadpoolctl import threadpool_limits
            threadpool_limits(limits=1, user_api=None)

        if self.cfg.set_workers_cpu_affinity:
            set_process_cpu_affinity(self.worker_idx, self.cfg.num_workers)
        psutil.Process().nice(min(self.cfg.default_niceness + 10, 20))

        self.env_runners = []
        for split_idx in range(self.num_splits):
            env_runner = VectorEnvRunner(
                self.cfg,
                self.vector_size // self.num_splits,
                self.worker_idx,
                split_idx,
                self.num_agents,
                self.shared_buffers,
                self.reward_shaping,
            )
            env_runner.init()
            self.env_runners.append(env_runner)
Example no. 14
    def cat(self, dict_of_tensor_arrays, macro_batch_size, use_pinned_memory, timing):
        """
        Here 'macro_batch' is the overall size of experience per iteration.
        Macro-batch = mini-batch * num_batches_per_iteration
        """

        tensor_batch = self.batch_pool.get()

        if tensor_batch is not None:
            old_batch_size = tensor_batch_size(tensor_batch)
            if old_batch_size != macro_batch_size:
                # this can happen due to PBT changing batch size during the experiment
                log.warning('Tensor macro-batch size changed from %d to %d!', old_batch_size, macro_batch_size)
                log.warning('Discarding the cached tensor batch!')
                del tensor_batch
                tensor_batch = None

        if tensor_batch is None:
            tensor_batch = copy_dict_structure(dict_of_tensor_arrays)
            log.info('Allocating new CPU tensor batch (could not get from the pool)')

            for d1, cache_d, key, tensor_arr, _ in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
                cache_d[key] = torch.cat(tensor_arr, dim=0)
                if use_pinned_memory:
                    cache_d[key] = cache_d[key].pin_memory()
        else:
            with timing.add_time('batcher_mem'):
                for d1, cache_d, key, tensor_arr, cache_t in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
                    offset = 0
                    for t in tensor_arr:
                        first_dim = t.shape[0]
                        cache_t[offset:offset + first_dim].copy_(t)
                        offset += first_dim

        return tensor_batch
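The cached-batch branch above avoids re-allocating (and re-pinning) a large CPU tensor every iteration: the concatenation result is kept, and fresh rollout chunks are copied into the existing pinned storage. A tiny self-contained sketch of that reuse pattern in plain PyTorch (shapes and names are illustrative, not the actual batcher API):

import torch

chunks = [torch.randn(256, 64) for _ in range(4)]   # e.g. four rollout slices

# first iteration: allocate the macro-batch once, optionally in page-locked (pinned) memory
macro_batch = torch.cat(chunks, dim=0)
if torch.cuda.is_available():
    macro_batch = macro_batch.pin_memory()          # speeds up subsequent host-to-device copies

# later iterations: reuse the buffer by copying into slices instead of calling torch.cat again
offset = 0
for t in chunks:
    macro_batch[offset:offset + t.shape[0]].copy_(t)
    offset += t.shape[0]

if torch.cuda.is_available():
    gpu_batch = macro_batch.to('cuda', non_blocking=True)  # non_blocking works with pinned memory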
Example no. 15
    def _init(self, init_info):
        log.info('Initializing env for player %d, init_info: %r...',
                 self.player_id, init_info)
        env = init_multiplayer_env(self.make_env_func, self.player_id,
                                   self.env_config, init_info)
        env.reset()
        return env
Example no. 16
def main():
    experiments_dir = '/home/alex/all/projects/sample-factory/train_dir'

    all_experiment_dirs_list = [join(experiments_dir, v['dir']) for k, v in EXPERIMENTS.items()]

    for experiment_dir in all_experiment_dirs_list:
        log.debug('Experiment dir: %s', experiment_dir)

    log.debug('Total: %d', len(all_experiment_dirs_list))

    for env, details in EXPERIMENTS.items():
        env_dir = details['dir']
        env_dir = join(experiments_dir, env_dir)
        event_files = Path(env_dir).rglob('*.tfevents.*')
        event_files = list(event_files)
        log.info('Event files: %r', event_files)

        env_dirs = set()
        for event_file in event_files:
            env_dirs.add(os.path.dirname(event_file))

        EXPERIMENTS[env]['dirs'] = sorted(list(env_dirs))
        log.info('Env dirs for env %s: %r', env, env_dirs)

    EXPERIMENT_GROUPS = (('dmlab30',),)

    for group_i, exp_group in enumerate(EXPERIMENT_GROUPS):
        fig, ax = plt.subplots(1, 1)
        ax = [ax]

        count = 0
        for env in exp_group:
            experiments = EXPERIMENTS[env]['dirs']
            aggregate(env, experiments, count, ax[count])
            count += 1

        # handles, labels = ax[-1].get_legend_handles_labels()
        # lgd = fig.legend(handles, labels, bbox_to_anchor=(0.1, 0.88, 0.8, 0.2), loc='lower left', ncol=4, mode="expand", prop={'size': 6})
        # lgd.set_in_layout(True)

        # zhehui
        # plt.show()
        # plot_name = f'{env}_{key.replace("/", " ")}'
        # plt.tight_layout()
        # plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=1, wspace=0)
        # plt.subplots_adjust(wspace=0.12, hspace=0.15)

        plt.tight_layout(rect=(0, 0, 1.0, 0.9))

        plt.margins(0, 0)
        plot_name = f'dmlab30'
        plt.savefig(
            os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'),
            format='pdf',
            bbox_inches='tight',
            pad_inches=0,
        )
        # plt.savefig(os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'), format='pdf', bbox_extra_artists=(lgd,))

    return 0
Example no. 17
def parse_args(default_env, default_experiment_name, params_cls):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # common args
    parser.add_argument('--experiment', type=str, default=None)
    parser.add_argument('--env', type=str, default=default_env)

    # params object args
    params_cls.add_cli_args(parser)

    args = parser.parse_args()

    experiment = args.experiment
    if experiment is None:
        experiment = get_experiment_name(args.env, default_experiment_name)

    params = params_cls(experiment)
    params.set_command_line(sys.argv)
    params.update(args)

    log.info('Config:')
    for arg in vars(args):
        log.info('%s %r', arg, getattr(args, arg))

    return args, params
Example no. 18
    def _train_critic(self, buffer, env_steps):
        # train critic
        summary = None
        critic_step = self.critic_step.eval(session=self.session)

        prev_loss = 1e10
        for epoch in range(self.params.ppo_epochs):
            losses = []
            buffer.shuffle()

            for i in range(0, len(buffer), self.params.batch_size):
                with_summaries = self._should_write_summaries(critic_step) and summary is None
                summaries = [self.critic_summaries] if with_summaries else []

                start, end = i, i + self.params.batch_size
                feed_dict = self.input_dict(buffer, start, end)

                result = self.session.run(
                    [self.objectives.critic_loss, self.train_critic] + summaries,
                    feed_dict=feed_dict)

                critic_step += 1
                losses.append(result[0])

                if with_summaries:
                    summary = result[-1]
                    self.summary_writer.add_summary(summary, global_step=env_steps)

            # check loss improvement at the end of each epoch, early stop if necessary
            avg_loss = np.mean(losses)
            if avg_loss >= prev_loss:
                log.info('Early stopping after %d epochs because critic did not improve', epoch)
                log.info('Was %.4f now %.4f, ratio %.3f', prev_loss, avg_loss, avg_loss / prev_loss)
                break
            prev_loss = avg_loss
Example no. 19
def train_distance(params, env_id):
    def make_env_func():
        e = create_env(env_id)
        return e

    agent = AgentTMAX(make_env_func, params)
    agent.initialize()

    multi_env = None
    try:
        multi_env = MultiEnv(
            params.num_envs,
            params.num_workers,
            make_env_func=agent.make_env_func,
            stats_episodes=params.stats_episodes,
        )

        train_loop(agent, multi_env)
    except (Exception, KeyboardInterrupt, SystemExit):
        log.exception('Interrupt...')
    finally:
        log.info('Closing env...')
        if multi_env is not None:
            multi_env.close()

        agent.finalize()

    return 0
Example no. 20
    def reset(self):
        if self._episode_recording_dir is not None and self._record_id > 0:
            # save actions to text file
            with open(join(self._episode_recording_dir, 'actions.json'),
                      'w') as actions_file:
                json.dump(self._recorded_actions, actions_file)

            # rename previous episode dir
            reward = self._recorded_episode_reward + self._recorded_episode_shaping_reward
            new_dir_name = self._episode_recording_dir + f'_r{reward:.2f}'
            os.rename(self._episode_recording_dir, new_dir_name)
            log.info(
                'Finished recording %s (rew %.3f, shaping %.3f)',
                new_dir_name,
                reward,
                self._recorded_episode_shaping_reward,
            )

        dir_name = f'ep_{self._record_id:03d}_p{self._player_id}'
        self._episode_recording_dir = join(self._record_to, dir_name)
        ensure_dir_exists(self._episode_recording_dir)

        self._record_id += 1
        self._frame_id = 0
        self._recorded_episode_reward = 0
        self._recorded_episode_shaping_reward = 0

        self._recorded_actions = []

        return self.env.reset()
Example no. 21
    def test_running_mean_std(self):
        running_mean_std = RunningMeanStd(max_past_samples=100000)

        true_mu, true_sigma, batch_size = -1, 3, 256

        x = np.random.normal(true_mu, true_sigma, batch_size)

        running_mean_std.update(x)

        # after 1 batch we should have almost the exact same
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        self.assertAlmostEqual(running_mean_std.mean, batch_mean, places=5)
        self.assertAlmostEqual(running_mean_std.var, batch_var, places=5)
        self.assertAlmostEqual(running_mean_std.count, batch_size, places=3)

        # after many batches we should have an accurate estimate
        for _ in range(1000):
            x = np.random.normal(true_mu, true_sigma, batch_size)
            running_mean_std.update(x)

        log.info('estimated mean %.2f variance %.2f', running_mean_std.mean,
                 running_mean_std.var)
        self.assertAlmostEqual(running_mean_std.mean, true_mu, places=0)
        self.assertAlmostEqual(running_mean_std.var, true_sigma**2, places=0)
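The test above assumes `RunningMeanStd` keeps `mean`, `var` and `count` and folds in whole batches at a time. A common way to implement such an update is the parallel-variance (Chan et al.) formula sketched below; the `max_past_samples` cap from the constructor is not modeled, so this is an approximation of the class under test rather than its actual code:

import numpy as np


def update_mean_var_count(mean, var, count, batch):
    # combine the running statistics with one batch using the parallel-variance formula
    batch_mean, batch_var, batch_count = np.mean(batch, axis=0), np.var(batch, axis=0), batch.shape[0]
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    m2 = var * count + batch_var * batch_count + np.square(delta) * count * batch_count / tot_count
    return new_mean, m2 / tot_count, tot_count


mean, var, count = 0.0, 1.0, 1e-4  # tiny initial count avoids division by zero before the first batch
for _ in range(1000):
    mean, var, count = update_mean_var_count(mean, var, count, np.random.normal(-1, 3, 256))
print(mean, var)  # should approach -1 and 9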
Example no. 22
def safe_get(q, timeout=1e6, msg='Queue timeout'):
    """Using queue.get() with timeout is necessary, otherwise KeyboardInterrupt is not handled."""
    while True:
        try:
            return q.get(timeout=timeout)
        except Empty:
            log.info('Queue timed out (%s), timeout %.3f', msg, timeout)
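A short usage sketch: a bare `q.get()` with no timeout can block in a way that delays `KeyboardInterrupt` handling on some platforms, whereas the loop above wakes up every `timeout` seconds, logs, and keeps waiting, so Ctrl+C is handled promptly. Illustrative only:

from multiprocessing import Queue

q = Queue()
q.put(dict(task='rollout'))

# returns immediately here; on an empty queue it would wake up every 5 seconds,
# log the timeout message, and continue waiting while still reacting to Ctrl+C
msg = safe_get(q, timeout=5.0, msg='waiting for new tasks')
print(msg)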
Example no. 23
    def _load_state(self, checkpoint_dict, load_progress=True):
        if load_progress:
            self.train_step = checkpoint_dict['train_step']
            self.env_steps = checkpoint_dict['env_steps']
        self.actor_critic.load_state_dict(checkpoint_dict['model'])
        self.optimizer.load_state_dict(checkpoint_dict['optimizer'])
        log.info('Loaded experiment state at training iteration %d, env step %d', self.train_step, self.env_steps)
Example no. 24
    def _ensure_initialized(self):
        if self.initialized:
            return

        num_attempts = 25
        attempt = 0
        for attempt in range(num_attempts):
            self.workers = [
                MultiAgentEnvWorker(i, self.make_env_func, self.env_config)
                for i in range(self.num_agents)
            ]

            try:
                port_to_use = udp_port_num(self.env_config)
                port = find_available_port(port_to_use, increment=1000)
                log.debug('Using port %d', port)
                init_info = dict(port=port)

                for i, worker in enumerate(self.workers):
                    worker.task_queue.put((init_info, TaskType.INIT))
                    if self.safe_init:
                        time.sleep(1.0)  # just in case
                    else:
                        time.sleep(0.01)

                for i, worker in enumerate(self.workers):
                    worker.result_queue.get(timeout=5)
                    worker.result_queue.task_done()
                    worker.task_queue.join()
            except Exception as exc:
                for worker in self.workers:
                    if isinstance(worker.process, threading.Thread):
                        log.info(
                            'We cannot really kill a thread, so let the whole process die'
                        )
                        raise RuntimeError(
                            'Critical error: worker stuck on initialization. Abort!'
                        )
                    else:
                        log.info('Killing process %r', worker.process.pid)
                        kill(worker.process.pid)
                del self.workers
                log.warning('Could not initialize env, try again! Error: %r',
                            exc)
                time.sleep(1)
            else:
                break

        else:
            # the retry loop was exhausted, i.e. no attempt succeeded
            log.error('Could not initialize env even after %d attempts. Fail!',
                      num_attempts)
            raise RuntimeError(
                'Critical error: worker stuck on initialization, num attempts exceeded. Abort!'
            )

        log.debug('%d agent workers initialized for env %d!',
                  len(self.workers), self.env_config.worker_index)
        log.debug('Took %d attempts!\n', attempt + 1)
        self.initialized = True
Example no. 25
    def _load_state(self, checkpoint_dict):
        self.train_step = checkpoint_dict['train_step']
        self.env_steps = checkpoint_dict['env_steps']
        self.best_avg_reward = checkpoint_dict['best_avg_reward']
        self.total_train_seconds = checkpoint_dict['total_train_seconds']
        log.info(
            'Loaded experiment state at training iteration %d, env step %d',
            self.train_step, self.env_steps)
Example no. 26
    def has_enough_data(self):
        len_data, min_data = len(self.buffer), self.params.distance_target_buffer_size // 3
        if len_data < min_data:
            log.info('Need to gather more data to train distance net, %d/%d',
                     len_data, min_data)
            return False
        return True
Example no. 27
    def has_enough_data(self):
        len_data, min_data = len(self.buffer), self.params.locomotion_experience_replay_buffer // 3
        if len_data < min_data:
            log.info('Need to gather more data to train locomotion net, %d/%d',
                     len_data, min_data)
            return False
        return True
Example no. 28
    def _maybe_save(self, step, env_steps):
        self.params.ensure_serialized()
        save_every = self.save_rate_decay.at(step)
        if (step + 1) % save_every == 0:
            log.info('Training step #%d, env steps: %d, saving...', step, env_steps)
            saver_path = model_dir(self.params.experiment_dir()) + '/' + self.__class__.__name__
            self.session.run(self.update_env_steps, feed_dict={self.total_env_steps_placeholder: env_steps})
            self.saver.save(self.session, saver_path, global_step=step)
Example no. 29
    def close(self):
        log.info('Stopping multi env wrapper...')

        for worker in self.workers:
            worker.task_queue.put((None, MsgType.TERMINATE))
            time.sleep(0.1)
        for worker in self.workers:
            worker.process.join()
Example no. 30
    def _init(self, envs):
        log.info('Initializing envs %s...', list_to_string(self.env_indices))
        for i in self.env_indices:
            env = self.make_env_func()
            env.seed(i)
            env.reset()
            envs.append(env)
            time.sleep(0.01)