    def actor_forward(self, obs, deterministic=False):
        latent_pi, _ = self._get_latent(obs)
        _, action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
        return tf.stop_gradient(action).numpy()

    @tf.function
    def evaluate_actions(self, obs, action, deterministic=False):
        """
        Evaluate actions according to the current policy, given the observations.

        :param obs: (tf.Tensor)
        :param action: (tf.Tensor)
        :param deterministic: (bool)
        :return: (tf.Tensor, tf.Tensor, tf.Tensor) estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf = self._get_latent(obs)
        _, _, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
        log_prob = action_distribution.log_prob(action)
        value = self.value_net(latent_vf)
        return value, log_prob, action_distribution.entropy()

    def value_forward(self, obs):
        _, latent_vf, _ = self._get_latent(obs)
        return self.value_net(latent_vf)


MlpPolicy = PPOPolicy

register_policy("MlpPolicy", MlpPolicy)
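# Hedged usage sketch (added for illustration, not part of the original source): assuming
# `policy` is an instance of the PPOPolicy above, this is roughly how a PPO-style update
# could consume evaluate_actions(): the returned value, log-prob and entropy feed the
# clipped surrogate loss. The batch tensors (obs, actions, old_log_prob, advantages) and
# the helper name `ppo_surrogate_loss` are illustrative assumptions.
import tensorflow as tf


def ppo_surrogate_loss(policy, obs, actions, old_log_prob, advantages, clip_range=0.2):
    # Query the current policy for values, log-probabilities and entropy of a rollout batch.
    values, log_prob, entropy = policy.evaluate_actions(obs, actions)
    # Probability ratio between new and old policy, then the clipped surrogate objective.
    ratio = tf.exp(log_prob - old_log_prob)
    unclipped = ratio * advantages
    clipped = tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantages
    policy_loss = -tf.reduce_mean(tf.minimum(unclipped, clipped))
    return policy_loss, values, entropy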
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_SIR  # works also with SAC, DDPG and TD3

    env_kwargs = get_env_kwargs(args.env, random_ratio=args.random_ratio, sequential=args.sequential,
                                reward_type=args.reward_type, n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(args.num_workers)], reset_when_done=True)

    def make_thunk_aug(rank):
        return lambda: FlattenDictWrapper(make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
                                          ['observation', 'achieved_goal', 'desired_goal'])

    aug_env_kwargs = env_kwargs.copy()
    del aug_env_kwargs['max_episode_steps']
    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(args.num_workers)], reset_when_done=False)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape),
                                                    sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError

        train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise, eval_env, aug_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(eval_env, _locals["self"],
                                                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
                                                         fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
                                                          goal_idx=0,
                                                          fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps, mean_eval_reward2, file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args, **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac_sir", args)
        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)

        # Wrap the model
        model = HER2(args.policy, env, model_class, n_sampled_goal=4,
                     start_augment_time=args.start_augment,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(int(args.num_timesteps), seed=args.seed, callback=callback,
                    log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1

    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential, args.reward_type,
                                args.n_object, args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs)
    print(eval_env)

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

        train_kwargs = get_train_kwargs("ppo", args, parsed_action_noise=None, eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy, env, verbose=1, nminibatches=32, lam=0.95, noptepochs=10,
                     ent_coef=0.01, learning_rate=3e-4, cliprange=0.2, policy_kwargs=policy_kwargs,
                     **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir, 'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))
    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61 and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim], 'goal', obs[0][-goal_dim:])

        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) +
                             ', goal idx ' + str(np.argmax(env.get_attr('goal')[0][3:])))
            if 'FetchStack' in args.env:
                tasks = ['pick and place', 'stack']
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) +
                             ', task: ' + tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 * goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img)
            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break

        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env + '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(os.path.join(os.path.dirname(args.load_path), 'tempimg' + str(i) + '.png'))
                except:
                    pass
class CustomMlpPolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(CustomMlpPolicy, self).__init__(*args, **kwargs,
                                              layers=[16],
                                              feature_extraction="mlp")


class CustomSACPolicy(SACPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomSACPolicy, self).__init__(*args, **kwargs,
                                              layers=[256, 256],
                                              feature_extraction="mlp")


register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomDQNPolicy', CustomDQNPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)


def flatten_dict_observations(env):
    assert isinstance(env.observation_space, gym.spaces.Dict)
    keys = env.observation_space.spaces.keys()
    return gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))


def get_wrapper_class(hyperparams):
    """
    Get a Gym environment wrapper class specified as a hyper parameter
    "env_wrapper".
    e.g.
if __name__ == "__main__":
    # argparse to define the mode: train or predict
    args = parser.parse_args()
    mode = args.mode
    ####
    if mode == 'train':
        env = GymACRoom()
        learning_rate = [0.0001]
        # Register the policy, it will check that the name is not already taken
        register_policy('CustomPolicy', CustomPolicy)
        for lr in learning_rate:
            model = PPO2(policy='CustomPolicy', env=env, verbose=1, learning_rate=lr,
                         n_steps=1280, tensorboard_log="./AC_tensorboard/")
            model.learn(total_timesteps=1000000)
            # model.save("AC_PPO2_LR_exp" + str(lr) + ".zip")
            model.save("AC_PPO2_exp_neg_noOffset.zip")

        '''
        # scheduling learning rate
        # lr = 0.1
        model = PPO2('MlpPolicy', env, verbose=1, learning_rate=0.1, tensorboard_log="./AC_tensorboard/")
        model.learn(total_timesteps=100000)
        model.learning_rate = 0.01
        model.learn(total_timesteps=100000)
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1

    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential, args.reward_type,
                                args.n_object, args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env_kwargs = env_kwargs.copy()
    aug_env_kwargs['max_episode_steps'] = None

    def make_thunk_aug(rank):
        return lambda: make_env(env_id=aug_env_name, rank=rank, flatten_dict=True, kwargs=aug_env_kwargs)

    if not args.parallel:
        aug_env = make_env(env_id=aug_env_name, rank=0, flatten_dict=True, kwargs=aug_env_kwargs)
    else:
        aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(min(32, n_cpu))],
                                        reset_when_done=False)
    print(aug_env)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs)
    print(eval_env)

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo_sir", args)

        train_kwargs = get_train_kwargs("ppo_sir", args, parsed_action_noise=None,
                                        eval_env=eval_env, aug_env=aug_env)

        model = PPO2_SIR(args.policy, env, verbose=1, nminibatches=32, lam=0.95, gamma=0.99,
                         noptepochs=10, ent_coef=0.01, learning_rate=3e-4, cliprange=0.2,
                         policy_kwargs=policy_kwargs, horizon=env_kwargs['max_episode_steps'],
                         **train_kwargs)

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir, 'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))
        # with tf.variable_scope(scope, reuse=reuse):
        #     qf_h = tf.layers.flatten(obs)
        #     for i, layer_size in enumerate(self.layers):
        #         qf_h = tf.layers.dense(qf_h, layer_size, name='fc' + str(i))
        #         if self.layer_norm:
        #             qf_h = tf.contrib.layers.layer_norm(qf_h, center=True, scale=True)
        #         qf_h = self.activ(qf_h)
        #         if i == 0:
        #             qf_h = tf.concat([qf_h, action], axis=-1)
        #     # the name attribute is used in pop-art normalization
        #     qvalue_fn = tf.layers.dense(qf_h, 1, name='qf_output',
        #                                 kernel_initializer=tf.zeros_initializer)  # random_uniform_initializer(minval=-3e-3,
        #     self.qvalue_fn = qvalue_fn
        #     self._qvalue = qvalue_fn[:, 0]
        # return self.qvalue_fn

    # def step(self, obs, state=None, mask=None):
    #     return self.sess.run(self.policy, {self.obs_ph: obs})

    # def proba_step(self, obs, state=None, mask=None):
    #     return self.sess.run(self.policy, {self.obs_ph: obs})

    # def value(self, obs, action, state=None, mask=None):
    #     return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action})


register_policy("SoftPolicy", SoftPolicy)
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", layer_norm=True, **_kwargs)


register_policy("CnnPolicy", CnnPolicy)
register_policy("LnCnnPolicy", LnCnnPolicy)
register_policy("MlpPolicy", MlpPolicy)
register_policy("LnMlpPolicy", LnMlpPolicy)
register_policy("CustomSACPolicy", CustomSACPolicy)
                                         **kwargs,
                                         net_arch=params.net_arch,
                                         act_fun=params.act_fun,
                                         PF_linear_also=True)


class MLP_Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(MLP_Policy, self).__init__(*args, **kwargs,
                                         net_arch=params.net_arch,
                                         act_fun=params.act_fun,
                                         feature_extraction="mlp")


register_policy('SCN', SCN)
register_policy('SCN_PF_NOnly', SCN_PF_NOnly)
register_policy('SCN_PF_Both', SCN_PF_Both)
register_policy('MLP_Policy', MLP_Policy)


class CustomDQNPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args, **kwargs,
                                              layers=[64],
                                              layer_norm=True,
                                              feature_extraction="mlp")


class CustomMlpPolicy(BasePolicy):
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_parallel

    n_workers = args.num_workers if not args.play else 1

    env_kwargs = get_env_kwargs(args.env, random_ratio=args.random_ratio, sequential=args.sequential,
                                reward_type=args.reward_type, n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(n_workers)], reset_when_done=True)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape),
                                                    sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError

        train_kwargs = get_train_kwargs("sac", args, parsed_action_noise, eval_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(eval_env, _locals["self"],
                                                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
                                                         fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
                                                          goal_idx=0,
                                                          fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps, mean_eval_reward2, file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args, **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac", args)
        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)

        # Wrap the model
        model = HER2(args.policy, env, model_class, n_sampled_goal=4,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(int(args.num_timesteps), seed=args.seed, callback=callback,
                    log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    if args.play and rank == 0:
        assert args.load_path is not None
        model = HER2.load(args.load_path, env=env)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        if 'FetchStack' in args.env:
            env.env_method('set_task_array', [[(env.get_attr('n_object')[0], 0)]])
            obs = env.reset()
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPushWallObstacle' in args.env:
            while not (obs['observation'][0][4] > 0.7 and obs['observation'][0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', [np.array([1.18, 0.8, 0.425, 1, 0])])
            obs = env.env_method('get_obs')
            obs = {'observation': obs[0]['observation'][None],
                   'achieved_goal': obs[0]['achieved_goal'][None],
                   'desired_goal': obs[0]['desired_goal'][None]}
            # obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
            while np.argmax(obs['desired_goal'][0][3:]) != 0:
                obs = env.reset()
        elif 'MasspointMaze-v2' in args.env:
            while obs['observation'][0][0] < 3 or obs['observation'][0][1] < 3:
                obs = env.reset()
            env.env_method('set_goal', [np.array([1., 1., 0.15])])
            obs = env.env_method('get_obs')
            obs = {'observation': obs[0]['observation'][None],
                   'achieved_goal': obs[0]['achieved_goal'][None],
                   'desired_goal': obs[0]['desired_goal'][None]}

        print('goal', obs['desired_goal'][0], 'obs', obs['observation'][0])

        episode_reward = 0.0
        images = []
        frame_idx = 0
        num_episode = 0
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            tasks = ['pick and place', 'stack']
            ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) +
                         ', task: ' + tasks[np.argmax(obs['observation'][0][-2:])])
            images.append(img)
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if args.export_gif:
                plt.imsave(os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img)
            else:
                plt.pause(0.02)
            if done:
                print('episode_reward', episode_reward)
                obs = env.reset()
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
                    while np.argmax(obs['desired_goal'][0][3:]) != 0:
                        obs = env.reset()
                print('goal', obs['desired_goal'][0])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 1:
                    break
        exit()

        if args.export_gif:
            os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env + '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                # images.append(plt.imread('tempimg' + str(i) + '.png'))
                try:
                    os.remove(os.path.join(os.path.dirname(args.load_path), 'tempimg' + str(i) + '.png'))
                except:
                    pass
class CustomMlpAggregatePolicy(AggregatePolicy):
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(CustomMlpAggregatePolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                       reuse, layers=[16], feature_extraction="mlp",
                                                       **_kwargs)


register_policy('SACTwoLayerMlpAggregatePolicy', SACTwoLayerMlpAggregatePolicy)
register_policy('MlpAggregatePolicy', MlpAggregatePolicy)
register_policy('CustomMlpAggregatePolicy', CustomMlpAggregatePolicy)
register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomDQNPolicy', CustomDQNPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)
        self._qvalue = qvalue_fn[:, 0]
        return self.qvalue_fn

    def step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def value(self, obs, action, state=None, mask=None):
        return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action})


class LinearPolicy_MLPCritic(LinearPolicy):
    """
    Policy object that implements actor critic, using an MLP (2 layers of 64)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(LinearPolicy_MLPCritic, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                     reuse, feature_extraction="mlp", **_kwargs)


register_policy("LinearPolicy_MLPCritic", LinearPolicy_MLPCritic)
                                                   activation_fn=activ)
        # Output layer
        action_scores = tf.contrib.layers.fully_connected(action_out, num_outputs=self.n_actions,
                                                          activation_fn=None)

        assert not self.dueling, "Dueling currently not supported"
        q_out = action_scores

        self.q_values = q_out
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=True):
        q_values, actions_proba = self.sess.run([self.q_values, self.policy_proba], {self.obs_ph: obs})
        if deterministic:
            actions = np.argmax(q_values, axis=1)
        else:
            actions = np.zeros((len(obs),), dtype=np.int64)
            for action_idx in range(len(obs)):
                actions[action_idx] = np.random.choice(self.n_actions, p=actions_proba[action_idx])
        return actions, q_values, None

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})


register_policy("StatePlusImagePolicy", StatePlusImagePolicy)
        retro=False,
        resolution=84
    )
    env = AnimalSkip(env, skip=SKIP_FRAMES)
    env = AnimalWrapper(env)
    env = AnimalStack(env, VISUAL_FRAMES_COUNT, VEL_FRAMES_COUNT, greyscale=USE_GREYSCALE_OBSES)
        return env
    return env


# Define environments
env = create_env_fn(num_actors=1, inference=False, seed=0)
env = make_vec_env(env, n_envs=4)

# register policy
register_policy('MyPolicy', LstmPolicy)

# define algorithm
model = PPO2('MyPolicy', env, n_steps=256)


#########################
# Dataset concatenation #
#########################
def dataset_concatenation(dataset_path):
    '''
    Use only when you have datasets of separate environments. If not, and the code already has a
    concatenated all_data.npz, ***do not use the function***
    Input: Directory where expert trajectory per environment .npz files are present
    Output: An all_data.npz in the same directory
from stable_baselines.common.policies import register_policy

from baselines_lab.policies.cnn_policy import SimpleMazeCnnPolicy, GeneralCnnPolicy
from baselines_lab.policies.rnd_policy import RndPolicy
from baselines_lab.policies.deepq import FeedForwardPolicy

register_policy('RndPolicy', RndPolicy)
register_policy('SimpleMazeCnnPolicy', SimpleMazeCnnPolicy)
register_policy('GeneralCnnPolicy', GeneralCnnPolicy)
register_policy('GeneralDqnPolicy', FeedForwardPolicy)
import os
import sys
sys.path.insert(0, 'D:\\GitHub\\Quantitative-analysis-with-Deep-Learning\\quantitative_analysis_with_deep_learning')

import gym

from stable_baselines.common.policies import register_policy
from stable_baselines.ddpg.policies import FeedForwardPolicy as DDPG_FeedForwardPolicy
from stable_baselines.td3.policies import FeedForwardPolicy as TD3_FeedForwardPolicy


# Custom policy and value networks
class CustomDDPGPolicy(DDPG_FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDDPGPolicy, self).__init__(*args, **kwargs,
                                               layers=[256, 128, 128, 64],
                                               feature_extraction="mlp")


# Register the policy, it will check that the name is not already taken
register_policy('CustomDDPGPolicy', CustomDDPGPolicy)


class CustomTD3Policy(TD3_FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args, **kwargs,
                                              layers=[256, 128, 128, 64],
                                              feature_extraction="mlp")


register_policy('CustomTD3Policy', CustomTD3Policy)
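# Hedged usage sketch (added for illustration, not part of the original source): once a name
# has been registered via register_policy, it can be passed as the `policy` string when the
# corresponding model is constructed. The environment id and verbosity below are illustrative
# assumptions, not values taken from this project.
from stable_baselines import DDPG, TD3

ddpg_model = DDPG('CustomDDPGPolicy', 'Pendulum-v0', verbose=1)
td3_model = TD3('CustomTD3Policy', 'Pendulum-v0', verbose=1)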
# Custom LSTM policy with two MLP layers of size 64 each + a shared LSTM layer of size 4
class CustomLSTMPolicy(LstmPolicy):
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=4, reuse=False, **_kwargs):
        super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                         net_arch=['lstm', dict(pi=[64, 64], vf=[64, 64])],
                         layer_norm=True, feature_extraction="mlp", **_kwargs)


# Custom MLP policy of two layers of size 81 each
class CustomPolicy_2x81(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy_2x81, self).__init__(*args, **kwargs,
                                                net_arch=[dict(pi=[81, 81], vf=[81, 81])],
                                                feature_extraction="mlp")


# Custom MLP policy of three layers with variable size
class CustomPolicy_3_var(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy_3_var, self).__init__(*args, **kwargs,
                                                 net_arch=[dict(pi=[80, 49, 30], vf=[80, 28, 10])],
                                                 feature_extraction="mlp")


# Register the policies, it will check that the names are not already taken
register_policy('CustomPolicy_3x64', CustomPolicy_3x64)
register_policy('CustomPolicy_2x64_shared', CustomPolicy_2x64_shared)
register_policy('CustomPolicy_4x128', CustomPolicy_4x128)
register_policy('CustomLSTMPolicy', CustomLSTMPolicy)
register_policy('CustomPolicy_2x81', CustomPolicy_2x81)
register_policy('CustomPolicy_3_var', CustomPolicy_3_var)
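# Hedged usage sketch (added for illustration, not part of the original source): recurrent
# policies in stable-baselines need a vectorized environment, and the number of parallel
# environments should be a multiple of nminibatches. The environment id, n_envs and
# total_timesteps below are illustrative assumptions.
from stable_baselines import PPO2
from stable_baselines.common import make_vec_env

env = make_vec_env('CartPole-v1', n_envs=4)
model = PPO2('CustomLSTMPolicy', env, nminibatches=4, verbose=1)
model.learn(total_timesteps=10000)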
class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           net_arch=[dict(pi=[64], vf=[64])],
                                           feature_extraction="cnn",
                                           cnn_extractor=custom_cnn)


class DqnCnnPolicy(DqnFFPolicy):
    def __init__(self, *args, **kwargs):
        super(DqnCnnPolicy, self).__init__(*args, **kwargs,
                                           feature_extraction="cnn",
                                           cnn_extractor=custom_cnn)


# Register the policy, it will check that the name is not already taken
register_policy('CustomPolicy', CustomPolicy)
register_policy('DqnCnnPolicy', DqnCnnPolicy)


def ppo2train(args):
    with tf.device('/device:GPU:1'):
        # gpus = tf.config.experimental.list_physical_devices('GPU')
        # tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
        env = make_vec_env('python_1p-v0', n_envs=4)
        # env = gym.make('python_1p-v0')
        # env = Monitor(env, filename=None, allow_early_resets=True)
        # env = DummyVecEnv([lambda: env])
class LargeSACPolicy(SACPolicy):
    def __init__(self, *args, **kwargs):
        super(LargeSACPolicy, self).__init__(*args, **kwargs,
                                             layers=[256, 256, 256],
                                             feature_extraction="mlp")


class LargeBasePolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(LargeBasePolicy, self).__init__(*args, **kwargs,
                                              layers=[256, 256, 256],
                                              feature_extraction="mlp")


class MediumBasePolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(MediumBasePolicy, self).__init__(*args, **kwargs,
                                               layers=[256, 256],
                                               feature_extraction="mlp")


register_policy("CustomSACPolicy", CustomSACPolicy)
register_policy("TinySACPolicy", TinySACPolicy)
register_policy("LargeSACPolicy", LargeSACPolicy)
register_policy("LargeBasePolicy", LargeBasePolicy)
register_policy("MediumBasePolicy", MediumBasePolicy)
register_policy("TinyDQNPolicy", TinyDQNPolicy)
register_policy("MediumDQNPolicy", MediumDQNPolicy)
register_policy("LargeDQNPolicy", LargeDQNPolicy)
register_policy("HugeDQNPolicy", HugeDQNPolicy)
register_policy("BigBigDQNPolicy", BigBigDQNPolicy)
register_policy("BigBigBigDQNPolicy", BigBigBigDQNPolicy)
register_policy("CustomMlpPolicy", CustomMlpPolicy)


def linear_schedule(initial_value):
    """
                 n_steps,
                 n_batch,
                 num_actions,
                 distributed_single_stream=False,
                 reuse=False,
                 obs_phs=None,
                 dueling=True,
                 **_kwargs):
        super(CnnActPolicy, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch, num_actions,
            distributed_single_stream=distributed_single_stream,
            reuse=reuse,
            aggregator='reduceLocalMean',
            feature_extraction="cnn",
            obs_phs=obs_phs,
            dueling=dueling,
            layer_norm=False,
            **_kwargs)


register_policy("CnnActPolicy", CnnActPolicy)
register_policy("MlpActPolicy", MlpActPolicy)
register_policy("LnMlpActPolicy", LnMlpActPolicy)
register_policy("ActionBranching", ActionBranching)
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", layer_norm=True, **_kwargs)


register_policy("C51CnnPolicy", CnnPolicy)
register_policy("C51LnCnnPolicy", LnCnnPolicy)
register_policy("C51MlpPolicy", MlpPolicy)
register_policy("C51LnMlpPolicy", LnMlpPolicy)
        action = self.deterministic_action if deterministic else self.action
        feed_dict = self._make_feed_dict(obs, state, mask)
        outputs = [action, self.value_flat, self.state_out, self.neglogp]
        if extra_op is not None:
            outputs.append(extra_op)
            a, v, s, neglogp, ex = self.sess.run(outputs, feed_dict)
        else:
            a, v, s, neglogp = self.sess.run(outputs, feed_dict)

        state = []
        for x in s:
            state.append(x.c)
            state.append(x.h)
        state = np.array(state)
        state = np.transpose(state, (1, 0, 2))

        if extra_op is not None:
            return a, v, state, neglogp, ex
        else:
            return a, v, state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, self._make_feed_dict(obs, state, mask))

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, self._make_feed_dict(obs, state, mask))


register_policy('BansalMlpPolicy', MlpPolicyValue)
register_policy('BansalLstmPolicy', LSTMPolicy)
                                             **kwargs,
                                             layers=[32, 16],
                                             act_fun=tf.nn.elu,
                                             feature_extraction="mlp")


class CustomDDPGPolicy(DDPGPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDDPGPolicy, self).__init__(*args, **kwargs,
                                               layers=[32, 8],
                                               feature_extraction="mlp",
                                               layer_norm=True)


register_policy('CustomDDPGPolicy', CustomDDPGPolicy)
register_policy('LargeSACPolicy', LargeSACPolicy)
register_policy('TinySACPolicy', TinySACPolicy)
register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)


def load_vae(path=None, z_size=None):
    """
    :param path: (str)
    :param z_size: (int)
    :return: (VAEController)
    """
    # z_size will be recovered from saved model
    if z_size is None:
        assert path is not None
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 obs_phs=None,
                 dueling=True,
                 layer_norm=False,
                 l1_regularizer=0.,
                 l2_regularizer=0.,
                 **_kwargs):
        super(CustomRegularizedDQNMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                                            n_batch, reuse, feature_extraction="mlp",
                                                            obs_phs=obs_phs, dueling=dueling,
                                                            layer_norm=layer_norm,
                                                            l1_regularizer=l1_regularizer,
                                                            l2_regularizer=l2_regularizer, **_kwargs)


register_policy("CustomRegularizedDQNMlpPolicy", CustomRegularizedDQNMlpPolicy)
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", layer_norm=True, **_kwargs)


register_policy("CnnPolicy", CnnPolicy)
register_policy("LnCnnPolicy", LnCnnPolicy)
register_policy("MlpPolicy", MlpPolicy)
register_policy("LnMlpPolicy", LnMlpPolicy)
    def __init__(self, *args, **kwargs):
        super(CustomMlpPolicy, self).__init__(*args, **kwargs,
                                              layers=[16],
                                              feature_extraction="mlp")


class CustomSACPolicy(SACPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomSACPolicy, self).__init__(*args, **kwargs,
                                              layers=[256, 256],
                                              feature_extraction="mlp")


register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomDQNPolicy', CustomDQNPolicy)
register_policy('SmallMobileNetCnnPolicy', SmallMobileNetCnnPolicy)
register_policy('CustomLowerFlopCnnPolicy', CustomLowerFlopCnnPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)


def flatten_dict_observations(env):
    assert isinstance(env.observation_space, gym.spaces.Dict)
    keys = env.observation_space.spaces.keys()
    return gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))


def get_wrapper_class(hyperparams):
    """
    Get a Gym environment wrapper class specified as a hyper parameter