Code Example #1
File: base.py Project: wwxFromTju/rl-generalization
import os


def make_env(env_id, process_idx=0, outdir=None):
    import sunblaze_envs

    env = sunblaze_envs.make(env_id)
    if outdir:
        # Record the sampled environment parameters for this worker in a JSON file.
        env = sunblaze_envs.MonitorParameters(
            env,
            output_filename=os.path.join(
                outdir, 'env-parameters-{}.json'.format(process_idx)))

    return env
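
A minimal usage sketch follows; the environment ID and output directory are illustrative assumptions, not values taken from the project.

# Illustrative usage of make_env; the environment ID and output directory
# below are assumptions, not values from the project.
env = make_env('SunblazeCartPole-v0', process_idx=0, outdir='./logs')
obs = env.reset()
env.close()
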
Code Example #2
File: cmd_util.py Project: yfletberliac/MERL
def make_env(env_id,
             env_type,
             mpi_rank=0,
             subrank=0,
             seed=None,
             reward_scale=1.0,
             gamestate=None,
             flatten_dict_observations=True,
             wrapper_kwargs=None,
             logger_dir=None):
    wrapper_kwargs = wrapper_kwargs or {}
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE,
            state=gamestate)
    elif env_type == 'sunblaze':
        import sys
        sys.path.append('../rl-generalization')
        import sunblaze_envs
        env = sunblaze_envs.make(env_id)
    else:
        env = gym.make(env_id)

    if flatten_dict_observations and isinstance(env.observation_space,
                                                gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = FlattenObservation(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir
                  and os.path.join(logger_dir,
                                   str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
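
For the 'sunblaze' branch specifically, a hedged usage sketch might look like this; the environment ID, seed, and log directory are illustrative assumptions, and the helper wrappers used above (Monitor, make_atari, retro_wrappers, FlattenObservation) are provided by the surrounding project.

# Illustrative call of the 'sunblaze' branch; the ID, seed, and logger_dir
# are assumptions, not values from the project.
env = make_env('SunblazeCartPole-v0', env_type='sunblaze',
               seed=0, logger_dir='/tmp/merl-logs')
obs = env.reset()
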
Code Example #3
import numpy as np
import gym
from gym.spaces import Box

import sunblaze_envs


class AdaptiveEnvWrapper(gym.ObservationWrapper):
    """Prepend the environment's parameter values to each observation."""

    def __init__(self, env, num_env_params):
        # The original snippet begins mid-constructor; the class header and
        # these first lines are reconstructed from the usage in __main__ below.
        # Deriving from gym.ObservationWrapper is an assumption based on the
        # observation() hook defined further down.
        super().__init__(env)
        obs_space = env.observation_space
        obs_shape = obs_space.shape
        assert len(obs_shape) == 1
        assert isinstance(obs_space, Box)

        self.num_env_params = num_env_params
        new_obs_low = np.concatenate(
            [np.full((num_env_params, ), -np.inf), obs_space.low])
        new_obs_high = np.concatenate(
            [np.full((num_env_params, ), np.inf), obs_space.high])
        self.observation_space = Box(low=new_obs_low, high=new_obs_high)
        self.env_params = None

    def observation(self, observation):
        extra_obs = [v for (k, v) in self.env_params.items()]
        return np.concatenate([extra_obs, observation])

    def set_env_params(self, new_env_params):
        assert len(new_env_params) == self.num_env_params
        self.env_params = new_env_params
        self.env.env.set_env_params(new_env_params)
        # set new vars in env


if __name__ == "__main__":
    env = sunblaze_envs.make("SunblazeAdaptedHalfCheetah-v0")
    print(env.observation_space.shape)
    env = AdaptiveEnvWrapper(env, 3)
    print(env)
    print(env.observation_space.shape)
    print(env.observation_space.low)
    print(env.observation_space.high)
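
A short continuation sketch shows how set_env_params feeds into the augmented observations; the parameter names and values are illustrative assumptions, and the underlying 'Adapted' environment is assumed to expose set_env_params.

# Illustrative continuation; the parameter dict below is an assumption.
env = AdaptiveEnvWrapper(sunblaze_envs.make("SunblazeAdaptedHalfCheetah-v0"), 3)
env.set_env_params({'density': 1.0, 'friction': 0.5, 'power': 0.9})
obs = env.reset()
# The first 3 entries of obs are the parameter values, followed by the
# original HalfCheetah observation.
print(obs[:3])
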
Code Example #4
def main(args, logdir):
    """
    Model Based Reinforcement Learning
    1) Generate random trajectories
    2) Train the model on the generated data
    3) For each repetition:
        a) Generate new data using the MPC controller
        b) Retrain the model using the new data and the old data
        c) (Optional) Compute the mean prediction error (MPE)

    A condensed sketch of this loop follows the full listing.
    """

    # SETUP
    train_envs = []
    test_envs = []
    if args.no_sunblaze:
        train_env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
        # Add the plain-gym environments to the lists so the training and
        # test loops below iterate over them as well.
        train_envs.append(train_env)
        test_envs.append(test_env)

        if 'PyBullet' in args.env_name and args.render:
            # PyBullet environments need render() called before reset()
            # to open the GUI window.
            train_env.render()
            train_env.reset()

    elif args.test_type == 'interpolation':
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))

    elif args.test_type == 'extrapolation':
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))

        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name +
                               'RandomExtreme-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))

    else:
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        test_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name +
                                            '-v0'))

    test_cnt = 0
    for train_env in train_envs:

        assert isinstance(train_env.observation_space, gym.spaces.Box)

        start_time = time.time()
        logger = Logger(logdir)

        is_discrete = isinstance(train_env.action_space, gym.spaces.Discrete)

        ob_dim = train_env.observation_space.shape[0]
        ac_dim = (train_env.action_space.n
                  if is_discrete else train_env.action_space.shape[0])

        reward_function = get_reward_function(train_env)

        train_env.reset()
        ensemble = Ensemble(ob_dim,
                            ac_dim,
                            is_discrete,
                            args.pnn,
                            args.ensemble_size,
                            args.lr,
                            args.hidden_size,
                            device=nn_utils.DEVICE)

        # TRAIN
        # Instantiate policies
        mpc_policy = MPCPolicy(args, train_env, ensemble, reward_function,
                               nn_utils.DEVICE)
        random_policy = RandomPolicy(train_env)

        # Instantiate Data generator
        data_generator = DataGenerator(args,
                                       train_env,
                                       nn_utils.DEVICE,
                                       mpc_policy,
                                       random_policy,
                                       max_size=args.max_memory_size)

        if args.weights_paths is not None:
            # If weights are given, visualize and quit
            ensemble.load_weights(args.weights_paths)

            current_episodes, rewards, lengths = data_generator.generate_closed_loop_data(
                args.render)
            if args.mpe:
                MPE(train_env,
                    current_episodes,
                    ensemble,
                    args.mpc_horizon,
                    label='Ensemble %s' % (args.weights_paths))
            print('avg reward episode %f' % (np.mean(rewards)))
            print('avg len %f' % (np.mean([len(ep)
                                           for ep in current_episodes])))
            return

        # Otherwise train model on random trajectories
        current_episodes, train_rewards, train_lengths = (
            data_generator.generate_random_data())

        # Train initial model using random trajectories
        train_loss, test_loss = ensemble.train_net(
            args.epochs_rand,
            args.batch_size,
            data_generator,
            samples_per_model=args.samples_per_model)

        if args.mpe:
            print('Computing MPE')
            for (i, model) in enumerate(ensemble.models):
                MPE(train_env,
                    current_episodes,
                    model,
                    args.mpc_horizon,
                    label='random data, model %d' % (i))
            if len(ensemble.models) > 1:
                MPE(train_env,
                    current_episodes,
                    ensemble,
                    args.mpc_horizon,
                    label='random data, ensemble')

        _, eval_rewards, eval_lengths = data_generator.generate_evaluation_data(
            render=args.render)

        # TODO: keep test data only for test data
        for itr in range(args.repetitions):
            print('\nMPC Repetition %d / %d \n' % (itr + 1, args.repetitions))
            epsilon = mpc_policy.update_epsilon(itr)
            perform_logging(itr, logger, eval_rewards, train_rewards,
                            test_loss, train_loss, eval_lengths, train_lengths,
                            start_time, epsilon)
            current_episodes, train_rewards, train_lengths = (
                data_generator.generate_closed_loop_data())

            train_loss, test_loss = ensemble.train_net(
                args.epochs_rl,
                args.batch_size,
                data_generator,
                samples_per_model=args.samples_per_model)

            if args.mpe:
                print('Computing MPE')
                for (i, model) in enumerate(ensemble.models):
                    MPE(train_env,
                        current_episodes,
                        model,
                        args.mpc_horizon,
                        label='rep %d, model %d' % (itr, i))
                if len(ensemble.models) > 1:
                    MPE(train_env,
                        current_episodes,
                        ensemble,
                        args.mpc_horizon,
                        label='rep %d, ensemble' % (itr))

            _, eval_rewards, eval_lengths = data_generator.generate_evaluation_data(
                render=args.render)

            if args.save_model:
                for (i, model) in enumerate(ensemble.models):
                    save_file = '%s/models/rep_%d_model_%d_%.4f.pt' % (
                        str(logdir), itr, i, test_loss[i][-1])
                    torch.save(model.state_dict(), save_file)

        # SUNBLAZE TEST
        for test_env in test_envs:
            test_name = test_env.unwrapped.spec.id
            train_name = train_env.unwrapped.spec.id
            if test_cnt < 3:
                print('\nTESTING: ' + train_name + ' on ' + test_name,
                      flush=True)
                success_function = get_success_function(test_env)
                num_success = 0
                rewards = []
                for ep_num in range(args.test_episodes):
                    success, ep_reward = run_test_episode(
                        test_env, mpc_policy, success_function, args.render)
                    rewards.append(ep_reward)
                    num_success += int(success)
                    print(
                        'Test episode: %2d / %2d \t Success: %d \t Reward: %d'
                        % (ep_num + 1, args.test_episodes, int(success),
                           ep_reward),
                        flush=True)

                score = num_success / args.test_episodes * 100
                logger.log_scalar(score, test_name + '-' + train_name, 0)
                with open(train_name + '_' + test_name + '_score.txt',
                          'w+') as f:
                    f.write('Score for ' + train_name + ' tested on ' +
                            test_name + ': ' + str(score))

                print('\nScore for ' + train_name + ' tested on ' + test_name + ': ',
                      score,
                      flush=True)
                test_cnt += 1
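
The docstring at the top of this example outlines the three-phase model-based RL loop that main() implements. The condensed sketch below restates just that control flow, reusing the names from the listing above; logging, evaluation, checkpointing, and the Sunblaze test phase are omitted, so it is illustrative only.

# Condensed, illustrative restatement of the training loop in main() above.
# 1) random trajectories -> 2) initial model fit -> 3) MPC data + retraining.
episodes, _, _ = data_generator.generate_random_data()
ensemble.train_net(args.epochs_rand, args.batch_size, data_generator,
                   samples_per_model=args.samples_per_model)
for itr in range(args.repetitions):
    episodes, _, _ = data_generator.generate_closed_loop_data()
    ensemble.train_net(args.epochs_rl, args.batch_size, data_generator,
                       samples_per_model=args.samples_per_model)
    # 3c) optional: MPE(train_env, episodes, ensemble, args.mpc_horizon)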