Code example #1
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for goal-based MuJoCo robotics tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
Code example #2
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = (seed + 1000 * rank) if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)

    if reward_scale != 1.0:
        from ddpg_curiosity_mc_her.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)

    return env
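
The RewardScaler imported above from retro_wrappers is not shown in this listing; conceptually it just multiplies every environment reward by a constant so that value targets stay in a numerically comfortable range. Below is a minimal sketch of such a wrapper, assuming the standard gym.RewardWrapper interface; the class name is illustrative, not the repository's.

import gym

class SimpleRewardScaler(gym.RewardWrapper):
    """Illustrative reward-scaling wrapper; the repository's RewardScaler
    may differ in its details."""

    def __init__(self, env, scale):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        # Multiply every reward coming out of the environment by a constant.
        return reward * self.scale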
Code example #3
def run(args):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # If we are supposed to divide GPU usage among a specific set of devices,
    # set this process's device to the correct one.
    gpu_nums = args['split_gpu_usage_among_device_nums']
    if gpu_nums is not None:
        gpu_num_to_use = gpu_nums[rank % len(gpu_nums)]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_num_to_use)

    # Seed everything to make things reproducible.
    rank_seed = args['seed'] + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, rank_seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(rank_seed)

    input_dims = configure_dims(args)

    # Configure the replay buffer.
    memory = configure_memory(args)

    with U.single_threaded_session() as sess:
        # Set up DDPG agents

        agents = create_agents(sess=sess, memory=memory, input_dims=input_dims, params=args)

        saver = tf.train.Saver()
        if args['restore_from_ckpt'] is not None:
            logger.info("Restoring agents from {}".format(args['restore_from_ckpt']))
            saver.restore(sess, args['restore_from_ckpt'])

        sess.graph.finalize()
        logger.log_graph_to_tensorboard(sess.graph)

        # Set up rollout workers
        train_policy_fn = get_policy_fn(
            name=args['train_policy_fn'], agents=agents
        )
        eval_policy_fn = get_policy_fn(
            name=args['eval_policy_fn'], agents=agents
        )

        train_rollout_worker = configure_rollout_worker(
            role='train', policy_fn=train_policy_fn, agents=agents, dims=input_dims,
            seed=rank_seed, logger=logger, params=args
        )
        eval_rollout_worker = configure_rollout_worker(
            role='eval', policy_fn=eval_policy_fn, agents=agents, dims=input_dims,
            seed=rank_seed, logger=logger, params=args
        )

        # Begin main training loop
        if rank == 0:
            start_time = time.time()

        if args['do_demo_only'] is False:
            training.train(
                memory=memory, agents=agents, saver=saver, sess=sess,
                train_rollout_worker=train_rollout_worker, eval_rollout_worker=eval_rollout_worker,
                param_noise_adaption_interval=50, **args
            )
        else:
            demo.demo(agents=agents, eval_rollout_worker=eval_rollout_worker,
                      demo_video_recording_name=args["demo_video_recording_name"])

        train_rollout_worker.close()
        eval_rollout_worker.close()

        if rank == 0:
            logger.info('total runtime: {}s'.format(time.time() - start_time))
Code example #4
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)))
    return wrap_deepmind(env, **wrapper_kwargs)
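
This _thunk closes over env_id, seed, mpi_rank, rank, and wrapper_kwargs from an enclosing factory; deferring construction this way lets a vectorized environment (e.g. a baselines-style SubprocVecEnv) build each env lazily in its own worker process. A hedged sketch of what such a factory usually looks like follows; the function name and the num_env parameter are placeholders, and the Monitor wrapping is omitted for brevity.

def make_atari_thunks(env_id, num_env, seed, mpi_rank=0, wrapper_kwargs=None):
    # Return one deferred constructor per sub-environment; each thunk is only
    # called once the vectorized wrapper spawns its worker.
    wrapper_kwargs = wrapper_kwargs or {}

    def make_thunk(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None)
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    return [make_thunk(rank) for rank in range(num_env)]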
Code example #5
def prepare_params(kwargs):

    env_id = kwargs['env_id']

    def make_env():
        return gym.make(env_id)

    kwargs['make_env'] = make_env
    tmp_env = cached_make_env(kwargs['make_env'])
    kwargs['T'] = kwargs['episode_time_horizon']
    del kwargs['episode_time_horizon']
    if kwargs['T'] == 'auto':
        assert hasattr(tmp_env, '_max_episode_steps')
        kwargs['T'] = tmp_env._max_episode_steps
    else:
        kwargs['T'] = int(kwargs['T'])
    tmp_env.reset()

    if kwargs['use_her'] is False:
        # If HER is disabled, disable other HER related params.
        kwargs['replay_strategy'] = 'none'
        kwargs['replay_k'] = 0

    if 'BoxPush' not in kwargs['env_id'] and 'FetchStack' not in kwargs['env_id']:
        kwargs['heatmaps'] = False

    for gamma_key in ['exploit_gamma', 'explore_gamma']:
        kwargs[gamma_key] = (1. - 1. / kwargs['T'] if kwargs[gamma_key] == 'auto'
                             else float(kwargs[gamma_key]))

    if (kwargs['map_dynamics_loss'] and 'BoxPush' in kwargs['env_id']
            and 'explore' in kwargs['agent_roles']):
        kwargs['dynamics_loss_mapper'] = DynamicsLossMapper(
            working_dir=os.path.join(logger.get_dir(), 'dynamics_loss'),
            sample_env=cached_make_env(kwargs['make_env']))
    else:
        kwargs['dynamics_loss_mapper'] = None

    for network in ['exploit', 'explore']:
        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = tmp_env.action_space.shape[-1]
        for current_noise_type in kwargs[network + '_noise_type'].split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))
        kwargs[network + '_action_noise'] = action_noise
        kwargs[network + '_param_noise'] = param_noise
        del kwargs[network + '_noise_type']

    kwargs['train_rollout_params'] = {
        'compute_Q': False,
        'render': kwargs['render_training']
    }

    kwargs['eval_rollout_params'] = {
        'compute_Q': True,
        'render': kwargs['render_eval']
    }

    if kwargs['mix_extrinsic_intrinsic_objectives_for_explore'] == 'none':
        kwargs['mix_extrinsic_intrinsic_objectives_for_explore'] = None
    else:
        weights_string = kwargs['mix_extrinsic_intrinsic_objectives_for_explore']
        kwargs['mix_extrinsic_intrinsic_objectives_for_explore'] = [
            float(w) for w in weights_string.split(',')
        ]
        assert len(kwargs['mix_extrinsic_intrinsic_objectives_for_explore']) == 2

    if kwargs['restore_from_ckpt'] == 'none':
        kwargs['restore_from_ckpt'] = None

    if kwargs['stop_at_score'] == 'none':
        kwargs['stop_at_score'] = None
    else:
        kwargs['stop_at_score'] = float(kwargs['stop_at_score'])

    if kwargs['sub_goal_divisions'] == 'none':
        kwargs['sub_goal_divisions'] = None
    else:
        sub_goal_string = kwargs['sub_goal_divisions']
        sub_goal_divisions = ast.literal_eval(sub_goal_string)

        assert type(sub_goal_divisions) == list
        for list_elem in sub_goal_divisions:
            assert type(list_elem) == list
            for index in list_elem:
                assert type(index) == int

        kwargs['sub_goal_divisions'] = sub_goal_divisions

    if kwargs['split_gpu_usage_among_device_nums'] == 'none':
        kwargs['split_gpu_usage_among_device_nums'] = None
    else:
        gpu_string = kwargs['split_gpu_usage_among_device_nums']
        gpu_nums = ast.literal_eval(gpu_string)
        assert len(gpu_nums) >= 1
        for gpu_num in gpu_nums:
            assert type(gpu_num) == int
        kwargs['split_gpu_usage_among_device_nums'] = gpu_nums

    original_COMM_WORLD_rank = MPI.COMM_WORLD.Get_rank()
    kwargs['explore_comm'] = MPI.COMM_WORLD.Split(
        color=original_COMM_WORLD_rank % kwargs['num_model_groups'],
        key=original_COMM_WORLD_rank)

    if kwargs['save_checkpoints_at'] == 'none':
        kwargs['save_checkpoints_at'] = None
    else:
        save_checkpoints_list = ast.literal_eval(kwargs['save_checkpoints_at'])
        assert type(save_checkpoints_list) == list
        for i in range(len(save_checkpoints_list)):
            save_checkpoints_list[i] = float(save_checkpoints_list[i])
        kwargs['save_checkpoints_at'] = save_checkpoints_list

    if kwargs["demo_video_recording_name"] == 'none':
        kwargs["demo_video_recording_name"] = None
    else:
        assert type(kwargs["demo_video_recording_name"]) == str

    return kwargs
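
The noise-type parsing loop above accepts a comma-separated spec such as 'adaptive-param_0.2,normal_0.1' for each network. The self-contained sketch below re-implements just that parsing with plain dicts in place of the baselines-style noise objects, purely to show what each spec string resolves to; parse_noise_type is an illustrative name, not a repository function.

import numpy as np

def parse_noise_type(noise_type, nb_actions):
    action_noise, param_noise = None, None
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            pass
        elif 'adaptive-param' in current:
            _, stddev = current.split('_')
            param_noise = {'kind': 'adaptive-param', 'stddev': float(stddev)}
        elif 'normal' in current:
            _, stddev = current.split('_')
            action_noise = {'kind': 'normal',
                            'sigma': float(stddev) * np.ones(nb_actions)}
        elif 'ou' in current:
            _, stddev = current.split('_')
            action_noise = {'kind': 'ou',
                            'sigma': float(stddev) * np.ones(nb_actions)}
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current))
    return action_noise, param_noise

# 'adaptive-param_0.2,normal_0.1' yields parameter-space noise with stddev 0.2
# plus Gaussian action noise with sigma 0.1 in every action dimension.
print(parse_noise_type('adaptive-param_0.2,normal_0.1', nb_actions=4))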
Code example #6
    def generate_rollouts(self, render_override=False, reset_on_success_overrride=False, heatmap_prefix=None,
                          demo_video_recording_name=None):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """

        if demo_video_recording_name is not None:
            if not self.recording:
                # Define the codec and create VideoWriter object
                fourcc = cv2.VideoWriter_fourcc(*'XVID')
                # fourcc = cv2.VideoWriter_fourcc(*'MJPG')
                # fourcc = cv2.VideoWriter_fourcc(*'MP4V')
                self.out = cv2.VideoWriter('{}.avi'.format(demo_video_recording_name), fourcc, 24.0, (2560, 1440))
                self.recording = False


        if heatmap_prefix != self.current_heatmap_prefix:
            write_dir = os.path.join(logger.get_dir(), 'heatmaps')
            self.envs[0].unwrapped.set_location_record_name(write_dir=write_dir, prefix=heatmap_prefix)
            self.current_heatmap_prefix = heatmap_prefix

        self.reset_all(force_env_resets=False)

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        o[:] = self.initial_o

        if self.use_her:
            ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
            ag[:] = self.initial_ag

        # generate episodes
        obs, actions, rewards = [], [], []
        if self.use_her:
            achieved_goals, goals, successes = [], [], []
        else:
            dones = []

        info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []
        for t in range(self.T):

            policy_output = self.policy_fn(
                observation=o,
                goal=self.g if self.use_her else None,
                compute_Q=self.compute_Q
            )

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            rewards_new = np.empty((self.rollout_batch_size, 1))
            if self.use_her:
                ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
                success = np.zeros(self.rollout_batch_size)
            else:
                dones_new = np.empty((self.rollout_batch_size, 1))

            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    curr_o_new, curr_reward_new, curr_done_new, info = self.envs[i].step(u[i] * self.max_action)
                    # if shift_rewards_by_50:
                    #     curr_reward_new = curr_reward_new * 50 + 50
                        # curr_reward_new = curr_reward_new / 1000.
                    # else:
                    #     curr_reward_new = curr_reward_new * 50 + 50

                    rewards_new[i] = curr_reward_new
                    self.total_reward_this_episode[i] += curr_reward_new
                    if self.use_her:
                        if 'is_success' in info:
                            success[i] = info['is_success']
                            if reset_on_success_overrride and success[i]:
                                # raise Exception('SUCCESS affected rollout behavior')
                                # print("YOU SHOULD ONLY EVER REACH THIS IF CONDUCTING A DEMO")
                                self.reward_per_episode_history.append(self.total_reward_this_episode[i])
                                self.reset_rollout(i)

                        o_new[i] = curr_o_new['observation']
                        ag_new[i] = curr_o_new['achieved_goal']
                    else:
                        o_new[i] = curr_o_new
                        dones_new[i] = curr_done_new

                        if curr_done_new:
                            self.reward_per_episode_history.append(self.total_reward_this_episode[i])
                            self.reset_rollout(i)

                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]

                    if self.render or render_override:
                        if i == 0:
                            self.envs[0].render()
                            if demo_video_recording_name is not None:
                                frame = self.envs[i].render('rgb_array')[...,::-1]
                                cv2.imshow("recording {}".format(i), frame)
                                key = cv2.waitKey(1) & 0xFF
                                if key == ord('r'):
                                    if not self.recording:
                                        print("\n\n-------RECORDING---------\n\n")
                                        self.recording = True
                                if self.recording:
                                    print("rec {}".format(t))
                                    self.out.write(frame)
                                if key == ord('q'):
                                    self.out.release()
                                    cv2.destroyAllWindows()
                                    print('done')
                                    exit()

                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning('NaN caught during rollout generation. Trying again...')
                self.reset_all(force_env_resets=True)
                return self.generate_rollouts()

            obs.append(o.copy())
            actions.append(u.copy())
            rewards.append(rewards_new.copy())
            o[...] = o_new
            if self.use_her:
                achieved_goals.append(ag.copy())
                successes.append(success.copy())
                goals.append(self.g.copy())
                ag[...] = ag_new
            else:
                dones.append(dones_new.copy())

        obs.append(o.copy())
        self.initial_o[:] = o

        if self.use_her:
            achieved_goals.append(ag.copy())

        episode = {'o': obs, 'u': actions}
        episode['r'] = rewards

        if self.use_her:
            episode['g'] = goals
            episode['ag'] = achieved_goals
        else:
            episode['t'] = dones

        # print("goals shape: {}".format(np.shape( episode['g'])))
        # print("obs shape: {}".format(np.shape(episode['o'])))
        # print("ag shape: {}".format(np.shape(episode['ag'])))

        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        if self.use_her:
            successful = np.array(successes)[-1, :]
            assert successful.shape == (self.rollout_batch_size,)
            success_rate = np.mean(successful)
            self.success_history.append(success_rate)
            self.reward_per_episode_history.append(self.total_reward_this_episode[i])
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        # print("goals shape: {}".format(np.shape( episode['g'])))
        # print("obs shape: {}".format(np.shape(episode['o'])))
        # print("ag shape: {}".format(np.shape(episode['ag'])))

        return convert_episode_to_batch_major(episode)
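
Each value in the episode dict is collected time-major: a list of per-step arrays that stacks to shape (T, rollout_batch_size, dim), or T+1 steps for 'o' and 'ag'. The convert_episode_to_batch_major helper is not part of this listing; the minimal sketch below shows what such a conversion typically does, assuming it simply swaps the first two axes.

import numpy as np

def to_batch_major(episode):
    # Stack each time-major list into an array and swap the leading axes to
    # get (rollout_batch_size, T, dim). Illustrative only; the repository's
    # convert_episode_to_batch_major may differ.
    return {key: np.asarray(value).swapaxes(0, 1)
            for key, value in episode.items()}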
Code example #7
def train(memory, agents, saver, sess, train_rollout_worker,
          eval_rollout_worker, n_epochs, n_cycles, n_batches, batch_size,
          rollout_batches_per_cycle, n_test_rollouts, heatmaps,
          dynamics_loss_mapper, do_evaluation, save_at_score, stop_at_score,
          save_checkpoints_at, **kwargs):

    rank = MPI.COMM_WORLD.Get_rank()

    logger.info("Training...")

    batch = 0

    should_quit_early = False

    for epoch in range(1, n_epochs + 1):
        epoch_start_time = datetime.now()

        if dynamics_loss_mapper is not None:
            dynamics_loss_mapper.set_record_write(
                prefix='epoch{}_rank{}'.format(epoch, rank))

        # train
        train_rollout_worker.clear_history()
        for cycle_index in range(n_cycles):
            for _ in range(rollout_batches_per_cycle):

                episode = train_rollout_worker.generate_rollouts(
                    render_override=False,
                    heatmap_prefix='epoch{}_rank{}'.format(epoch, rank)
                    if heatmaps else None)

                memory.store_episode(episode)
                for agent in agents.values():
                    agent.update_normalizers(episode)

            param_noise_distances = {}

            # Adapt param noise.
            if memory.nb_entries >= batch_size:
                for role, agent in agents.items():
                    param_noise_distances[role] = agent.adapt_param_noise()

            for train_step in range(n_batches):
                critic_losses = {}
                actor_losses = {}
                for role, agent in agents.items():
                    critic_losses[role], actor_losses[role] = agent.train()
                for agent in agents.values():
                    agent.update_target_net()

                batch += 1

        if heatmaps:
            train_rollout_worker.flush_env_location_records()
            MPI.COMM_WORLD.Barrier()
            logger.info("Creating heatmap...")
            if rank == 0:
                heatmap_save_path = generate_3d_fetch_stack_heatmap_from_npy_records(
                    working_dir=os.path.join(logger.get_dir(), 'heatmaps'),
                    file_prefix='epoch{}'.format(epoch),
                    delete_records=True)
                logger.info("Heatmap saved to {}".format(heatmap_save_path))

        # test
        if do_evaluation:
            eval_rollout_worker.clear_history()
            for _ in range(n_test_rollouts):
                eval_rollout_worker.generate_rollouts()

            current_score = mpi_average(eval_rollout_worker.current_score())

            if current_score >= save_at_score and rank == 0:
                save_path = os.path.join(logger.get_dir(), 'saved_model',
                                         'model.ckpt')
                logger.info("Saving models to {}".format(save_path))
                saver.save(sess, save_path)

            if save_checkpoints_at is not None:
                for score in save_checkpoints_at.copy():
                    if current_score >= score and rank == 0:
                        logger.info("Reached checkpoint for {}".format(score))
                        save_path = os.path.join(
                            logger.get_dir(), 'saved_model',
                            'model_score_{}.ckpt'.format(
                                str(score).replace(".", "p")))
                        logger.info("Saving models to {}".format(save_path))
                        saver.save(sess, save_path)
                        save_checkpoints_at.remove(score)

            if stop_at_score is not None and current_score >= stop_at_score:
                logger.info("Stopping score of {} reached. Quitting...".format(
                    stop_at_score))
                should_quit_early = True

        # record logs
        logger.record_tabular('epoch', epoch)
        timesteps = (MPI.COMM_WORLD.Get_size() * epoch * n_cycles *
                     rollout_batches_per_cycle *
                     train_rollout_worker.rollout_batch_size *
                     train_rollout_worker.T)
        logger.record_tabular('timesteps', timesteps)
        if do_evaluation:
            for key, val in eval_rollout_worker.logs('test'):
                logger.record_tabular(key, mpi_average(val))
        for key, val in train_rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for role, agent in agents.items():
            for key, val in agent.get_stats().items():
                logger.record_tabular("{}_agent_{}".format(role, key),
                                      mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

        epoch_end_time = datetime.now()
        if rank == 0:
            logger.info("(epoch took {} seconds)".format(
                (epoch_end_time - epoch_start_time).total_seconds()))
            logger.info("(completed at {})".format(epoch_end_time))

        if should_quit_early:
            break

    if rank == 0:
        save_path = os.path.join(logger.get_dir(), 'saved_model', 'model.ckpt')
        logger.info("Saving models to {}".format(save_path))
        saver.save(sess, save_path)
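
mpi_average is a repository helper that does not appear in this listing; a common pattern is to reduce a scalar (or the mean of a sequence) across all MPI ranks and divide by the world size. The sketch below assumes exactly that; the name mpi_average_sketch is illustrative.

import numpy as np
from mpi4py import MPI

def mpi_average_sketch(value, comm=MPI.COMM_WORLD):
    # Reduce a scalar (or the mean of a non-empty sequence) across all ranks,
    # then divide by the number of ranks to get the global average.
    if not np.isscalar(value):
        value = np.mean(value) if len(value) > 0 else 0.0
    return comm.allreduce(value, op=MPI.SUM) / comm.Get_size()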