import os


def make_env(env_id, process_idx=0, outdir=None):
    import sunblaze_envs

    env = sunblaze_envs.make(env_id)
    if outdir:
        # Record the environment parameters sampled for this worker.
        env = sunblaze_envs.MonitorParameters(
            env,
            output_filename=os.path.join(
                outdir, 'env-parameters-{}.json'.format(process_idx)))
    return env
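
# Usage sketch (illustrative, not part of the original file): build a
# parameter-monitored sunblaze env and take one random step. The env id and
# output directory below are assumptions; each worker writes its sampled
# parameters to env-parameters-<process_idx>.json.
if __name__ == '__main__':
    outdir = '/tmp/sunblaze-logs'
    os.makedirs(outdir, exist_ok=True)
    env = make_env('SunblazeHalfCheetah-v0', process_idx=0, outdir=outdir)
    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())
    env.close()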
import os

import gym

# Assumed baselines import paths; adjust to the installed baselines version.
# In particular, FlattenObservation with a dict_keys argument matches an
# older gym/baselines wrapper (FlattenDictWrapper in some releases).
from baselines.bench import Monitor
from baselines.common import retro_wrappers
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.wrappers import FlattenObservation


def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None,
             reward_scale=1.0, gamestate=None, flatten_dict_observations=True,
             wrapper_kwargs=None, logger_dir=None):
    wrapper_kwargs = wrapper_kwargs or {}
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE,
            state=gamestate)
    elif env_type == 'sunblaze':
        import sys
        sys.path.append('../rl-generalization')
        import sunblaze_envs
        env = sunblaze_envs.make(env_id)
    else:
        env = gym.make(env_id)

    if flatten_dict_observations and isinstance(env.observation_space,
                                                gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = FlattenObservation(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(
        env,
        logger_dir and os.path.join(logger_dir,
                                    str(mpi_rank) + '.' + str(subrank)),
        allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)
    return env
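
# Illustrative sketch (not part of the original file): in the baselines-style
# pipeline this make_env is normally wrapped in per-rank thunks so a
# vectorized env can build one copy per worker. The DummyVecEnv import path
# and the example env id are assumptions.
def _make_thunk(env_id, env_type, subrank, seed, logger_dir=None):
    return lambda: make_env(env_id, env_type, mpi_rank=0, subrank=subrank,
                            seed=seed, logger_dir=logger_dir)

# Example usage (assuming baselines is installed):
#   from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
#   venv = DummyVecEnv([_make_thunk('SunblazeCartPole-v0', 'sunblaze', i, seed=0)
#                       for i in range(4)])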
import gym
import numpy as np
from gym.spaces import Box

import sunblaze_envs


class AdaptiveEnvWrapper(gym.ObservationWrapper):
    """Prepends the environment parameters to every observation."""

    # The class header and __init__ signature were missing from this file;
    # they are reconstructed from the usage in __main__ below.
    def __init__(self, env, num_env_params):
        super().__init__(env)
        obs_space = env.observation_space
        obs_shape = obs_space.shape
        assert len(obs_shape) == 1
        assert isinstance(obs_space, Box)
        self.num_env_params = num_env_params
        # Extend the observation space with one unbounded slot per parameter.
        new_obs_low = np.concatenate(
            [np.full((num_env_params,), -np.inf), obs_space.low])
        new_obs_high = np.concatenate(
            [np.full((num_env_params,), np.inf), obs_space.high])
        self.observation_space = Box(low=new_obs_low, high=new_obs_high)
        self.env_params = None

    def observation(self, observation):
        # set_env_params() must be called before the first reset()/step(),
        # otherwise self.env_params is still None here.
        extra_obs = [v for (k, v) in self.env_params.items()]
        return np.concatenate([extra_obs, observation])

    def set_env_params(self, new_env_params):
        assert len(new_env_params) == self.num_env_params
        self.env_params = new_env_params
        self.env.env.set_env_params(new_env_params)  # set new vars in env


if __name__ == "__main__":
    env = sunblaze_envs.make("SunblazeAdaptedHalfCheetah-v0")
    print(env.observation_space.shape)
    env = AdaptiveEnvWrapper(env, 3)
    print(env)
    print(env.observation_space.shape)
    print(env.observation_space.low)
    print(env.observation_space.high)
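    # Illustrative continuation of the demo (the parameter names below are
    # assumptions; the real sunblaze env defines its own). Parameters must be
    # set before reset(), since observation() reads self.env_params:
    #
    #   env.set_env_params({'density': 1.0, 'friction': 0.8, 'power': 1.0})
    #   obs = env.reset()
    #   # obs[:3] holds the env params, obs[3:] the raw observation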
def main(args, logdir):
    """
    Model Based Reinforcement Learning
    1) Generate random trajectories
    2) Train the model on the generated data
    3) For each repetition:
        a) Generate new data using the MPC controller
        b) Retrain the model using the new data and the old data
        c) (Optional) Compute Mean Prediction Error
    """
    # SETUP
    train_envs = []
    test_envs = []
    if args.no_sunblaze:
        train_env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
        if 'PyBullet' in args.env_name and args.render:
            train_env.render()
        train_env.reset()
        # Append the envs so the training loop below also runs in the
        # no-sunblaze case (originally they were created but never used).
        train_envs.append(train_env)
        test_envs.append(test_env)
    elif args.test_type == 'interpolation':
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
    elif args.test_type == 'extrapolation':
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomExtreme-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
    else:
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))

    test_cnt = 0
    for train_env in train_envs:
        assert isinstance(train_env.observation_space, gym.spaces.Box)
        start_time = time.time()
        logger = Logger(logdir)
        is_discrete = isinstance(train_env.action_space, gym.spaces.Discrete)
        ob_dim = train_env.observation_space.shape[0]
        ac_dim = (train_env.action_space.n if is_discrete
                  else train_env.action_space.shape[0])
        reward_function = get_reward_function(train_env)
        train_env.reset()

        ensemble = Ensemble(ob_dim, ac_dim, is_discrete, args.pnn,
                            args.ensemble_size, args.lr, args.hidden_size,
                            device=nn_utils.DEVICE)

        # TRAIN
        # Instantiate policies
        mpc_policy = MPCPolicy(args, train_env, ensemble, reward_function,
                               nn_utils.DEVICE)
        random_policy = RandomPolicy(train_env)

        # Instantiate data generator
        data_generator = DataGenerator(args, train_env, nn_utils.DEVICE,
                                       mpc_policy, random_policy,
                                       max_size=args.max_memory_size)

        if args.weights_paths is not None:
            # If weights are given, visualize and quit
            ensemble.load_weights(args.weights_paths)
            current_episodes, rewards, lengths = \
                data_generator.generate_closed_loop_data(args.render)
            if args.mpe:
                MPE(train_env, current_episodes, ensemble, args.mpc_horizon,
                    label='Ensemble %s' % (args.weights_paths))
            print('avg reward episode %f' % (np.mean(rewards)))
            print('avg len %f' % (np.mean([len(ep) for ep in current_episodes])))
            return

        # Otherwise train model on random trajectories
        current_episodes, train_rewards, train_lengths = \
            data_generator.generate_random_data()

        # Train initial model using random trajectories
        train_loss, test_loss = ensemble.train_net(
            args.epochs_rand, args.batch_size, data_generator,
            samples_per_model=args.samples_per_model)

        if args.mpe:
            print('Computing MPE')
            for (i, model) in enumerate(ensemble.models):
                MPE(train_env, current_episodes, model, args.mpc_horizon,
                    label='random data, model %d' % (i))
            if len(ensemble.models) > 1:
                MPE(train_env, current_episodes, ensemble, args.mpc_horizon,
                    label='random data, ensemble')

        _, eval_rewards, eval_lengths = \
            data_generator.generate_evaluation_data(render=args.render)

        # TODO: keep test data only for test data
        for itr in range(args.repetitions):
            print('\nMPC Repetition %d / %d \n' % (itr + 1, args.repetitions))
            epsilon = mpc_policy.update_epsilon(itr)
            perform_logging(itr, logger, eval_rewards, train_rewards, test_loss,
                            train_loss, eval_lengths, train_lengths,
                            start_time, epsilon)

            current_episodes, train_rewards, train_lengths = \
                data_generator.generate_closed_loop_data()
            train_loss, test_loss = ensemble.train_net(
                args.epochs_rl, args.batch_size, data_generator,
                samples_per_model=args.samples_per_model)

            if args.mpe:
                print('Computing MPE')
                for (i, model) in enumerate(ensemble.models):
                    MPE(train_env, current_episodes, model, args.mpc_horizon,
                        label='rep %d, model %d' % (itr, i))
                if len(ensemble.models) > 1:
                    MPE(train_env, current_episodes, ensemble, args.mpc_horizon,
                        label='rep %d, ensemble' % (itr))

            _, eval_rewards, eval_lengths = \
                data_generator.generate_evaluation_data(render=args.render)

            if args.save_model:
                for (i, model) in enumerate(ensemble.models):
                    save_file = '%s/models/rep_%d_model_%d_%.4f.pt' % (
                        str(logdir), itr, i, test_loss[i][-1])
                    torch.save(model.state_dict(), save_file)

        # SUNBLAZE TEST
        for test_env in test_envs:
            test_name = test_env.unwrapped.spec.id
            train_name = train_env.unwrapped.spec.id
            if test_cnt < 3:
                print('\nTESTING: ' + train_name + ' on ' + test_name,
                      flush=True)
                success_function = get_success_function(test_env)
                num_success = 0
                rewards = []
                for ep_num in range(args.test_episodes):
                    success, ep_reward = run_test_episode(
                        test_env, mpc_policy, success_function, args.render)
                    rewards.append(ep_reward)
                    num_success += int(success)
                    print('Test episode: %2d / %2d \t Success: %d \t Reward: %d'
                          % (ep_num + 1, args.test_episodes, int(success),
                             ep_reward), flush=True)
                score = num_success / args.test_episodes * 100
                logger.log_scalar(score, test_name + '-' + train_name, 0)
                with open(train_name + '_' + test_name + '_score.txt',
                          'w+') as f:
                    f.write('Score for ' + train_name + ' tested on '
                            + test_name + ': ' + str(score))
                print('\nScore for ' + train_name + ' tested on ' + test_name
                      + ' testing: ', score, flush=True)
            test_cnt += 1
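
# Illustrative sketch (an assumption, not the repo's actual implementation) of
# the run_test_episode helper used in the sunblaze test loop above: roll one
# episode with the MPC policy and report the success flag plus total reward.
# The policy interface (get_action) and the success_function signature
# (sunblaze scores episodes by a reward threshold) are both assumed.
def run_test_episode_sketch(env, policy, success_function, render=False):
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        if render:
            env.render()
        action = policy.get_action(obs)  # assumed MPCPolicy method name
        obs, reward, done, info = env.step(action)
        total_reward += reward
    return success_function(total_reward), total_reward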