def main():
    # create environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=False,
                         useIK=1, isDiscrete=0, rnd_obj_pose=0,
                         maxSteps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # set up the log/monitor directory
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # create the agent: DDPG with Gaussian action noise
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                     sigma=0.5373 * np.ones(nb_actions))
    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99,
                 batch_size=16, normalize_observations=True,
                 normalize_returns=False, memory_limit=100000, verbose=1,
                 tensorboard_log=os.path.join(log_dir, 'tb'),
                 full_tensorboard_log=False)

    # start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving final_model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")
def td3(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines.ddpg.noise import NormalActionNoise

    env = gym.make(env_id)

    # the noise object for TD3: Gaussian noise on every action dimension
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    if load_weights is not None:
        model = TD3.load(load_weights, env, verbose=0)
    else:
        model = TD3(policy, env, action_noise=action_noise, verbose=1,
                    tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="td3", env_name=env_id)
    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=callback)
def run_experiment(verbose, tensorboard_log, learning_rate):
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env, osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    # Gaussian action noise to encourage exploration
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(learning_rate)))
    env.close()
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env, n_envs=1, monitor_dir=save_dir,
                       env_kwargs=env_config)

    # Gaussian action noise for TD3 exploration
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.5 * np.ones(n_actions))

    model = TD3(
        MlpPolicy,
        env,
        action_noise=action_noise,
        verbose=1,
        gamma=0.99,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=100,
        learning_rate=1e-3,
        train_freq=1000,
        gradient_steps=1000,
        policy_kwargs={"layers": [64, 64]},
        n_cpu_tf_sess=1,
    )

    num_epochs = 1
    total_steps = 5e5
    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(total_steps / num_epochs))
        model.save(save_dir + "/model.zip")
def get_TD3_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)

    # Gaussian action noise for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = TD3.load(ckpt_path, env=env, _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         action_noise=action_noise, verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = TD3(TD3MlpPolicy, env, _init_setup_model=True,
                    policy_kwargs=policy_kwargs, action_noise=action_noise,
                    **model_settings['train_configs'], verbose=1,
                    tensorboard_log=tb_path)
    return model, env
def create_action_noise(env, noise_type):
    # noise_type is a comma-separated spec string, e.g. 'normal_0.1' or 'ou_0.2',
    # where the suffix after '_' is the standard deviation.
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            # note: the adaptive parameter-noise spec is returned through the same
            # variable here; parse_noise_types() below returns it separately
            _, stddev = current_noise_type.split('_')
            action_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise
def parse_noise_types(noise_type, nb_actions):
    """ Parse noise types for policies """
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
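# Hedged usage sketch (not from the original sources): the two parsers above take a
# comma-separated spec string such as 'adaptive-param_0.2,normal_0.1', where the
# suffix after '_' is the standard deviation. The environment and values below are
# illustrative assumptions only.
import numpy as np
import gym

from stable_baselines.ddpg.noise import (AdaptiveParamNoiseSpec,
                                         NormalActionNoise,
                                         OrnsteinUhlenbeckActionNoise)

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

# Gaussian noise on actions plus adaptive parameter noise on the policy weights;
# a spec of 'ou_0.3' would instead yield an OrnsteinUhlenbeckActionNoise object.
action_noise, param_noise = parse_noise_types('adaptive-param_0.2,normal_0.1',
                                              nb_actions)
assert isinstance(action_noise, NormalActionNoise)
assert isinstance(param_noise, AdaptiveParamNoiseSpec)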
def train_TD3(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']

    # Parameter space noise: injects randomness directly into the parameters of the
    # agent, altering the kinds of decisions it makes so that they always fully
    # depend on what the agent currently senses.
    # (Not used here: TD3 in stable-baselines only takes action-space noise.)

    # the noise objects for TD3
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training from a previously saved model.
        # The policy was already removed from kwargs above and must not be changed.
        print("Loading pretrained agent")
        model = TD3.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = TD3(policy, env, action_noise=action_noise, seed=seed,
                    verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)
    return model
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8
    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')

    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  # , 5e-4, 1e-5
        logger = osp.join(
            expDir, name,
            'logs%s_%s' % (np.format_float_scientific(nIter),
                           np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env, osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)

        # Gaussian action noise to aid exploration
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()

    file.close()
    pool.close()
    pool.join()
def run(self): self._init() env = self.env model = self.model objective = self.objective if objective == "infogain": wenv = InfogainEnv(env, model) elif objective == "prederr": wenv = PrederrEnv(env, model) else: raise AttributeError( "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'" .format(objective)) wenv.max_episode_len = self.horizon wenv.end_episode_callback = self._end_episode dvenv = DummyVecEnv([lambda: wenv]) if self.rl_algo == "ddpg": self.logger.info("Setting up DDPG as model-free RL algorithm.") pn = AdaptiveParamNoiseSpec() an = NormalActionNoise(np.array([0]), np.array([1])) rl_model = DDPG(DDPGMlpPolicy, dvenv, verbose=1, render=False, action_noise=an, param_noise=pn, nb_rollout_steps=self.horizon, nb_train_steps=self.horizon) elif self.rl_algo == "sac": self.logger.info("Setting up SAC as model-free RL algorithm.") rl_model = SAC(SACMlpPolicy, dvenv, verbose=1, learning_starts=self.horizon) else: raise AttributeError( "Model-free RL algorithm '{}' is unknown.".format( self.rl_algo)) # Train the agent max_steps_total = self.horizon * self.n_episodes * 100 try: self.logger.info("Start the agent") rl_model.learn(total_timesteps=max_steps_total, seed=self.seed) except MaxEpisodesReachedException: print("Exploration finished.")
def td3(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None):
    from stable_baselines.ddpg.noise import NormalActionNoise

    env = gym.make(env_id)

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(policy, env, action_noise=action_noise, verbose=1,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)
    save_model_weights(model, "td3", env_id, policy, seed)
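# Hedged follow-up sketch (not part of the original code): a self-contained version
# of the TD3-with-Gaussian-noise recipe above, plus reloading the saved model for a
# deterministic evaluation episode. The environment and file name 'td3_pendulum'
# are illustrative assumptions.
import gym
import numpy as np
from stable_baselines import TD3
from stable_baselines.ddpg.noise import NormalActionNoise

env = gym.make('Pendulum-v0')
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = TD3('MlpPolicy', env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=10000)
model.save('td3_pendulum')

# reload and run one greedy episode
model = TD3.load('td3_pendulum', env=env)
obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)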
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS): """Uses the specified algorithm on the target environment""" print("Using algorithm : ", algo.__name__) print( "Model saved as : ", "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl") constrained = False # define the environment here env = gym.make(env_name) if NOISE_VALUE > 0: env = NoisyRealEnv(env, noise_value=NOISE_VALUE) if MUJOCO_NORMALIZE: env = MujocoNormalized(env) print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high) print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high) if TIMEWRAPPER: # env = TimeFeatureWrapper(env) env = TimeLimit(env, 1000) if algo.__name__ == "ACKTR": print('Using SubprovVecEnv') env = SubprocVecEnv([lambda: env for i in range(8)]) elif algo.__name__ == "SAC": print('Using standard gym environment') env = env else: print('Using Dummy Vec Env') env = DummyVecEnv([lambda: env]) if NORMALIZE: env = VecNormalize( env, training=True, norm_obs=True, norm_reward=False, clip_reward=1e6, ) with open('data/target_policy_params.yaml') as file: args = yaml.load(file, Loader=yaml.FullLoader) args = args[algo.__name__][PARAMS_ENV] print('~~ Loaded args file ~~') if algo.__name__ == "SAC": print('Initializing SAC with RLBaselinesZoo hyperparameters .. ') print('using 256 node architecture as in the paper') class CustomPolicy(ffp_sac): def __init__(self, *args, **kwargs): super(CustomPolicy, self).__init__(*args, **kwargs, feature_extraction="mlp", layers=[256, 256]) model = SAC( CustomPolicy, env, verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', batch_size=args['batch_size'], buffer_size=args['buffer_size'], ent_coef=args['ent_coef'], learning_starts=args['learning_starts'], learning_rate=args['learning_rate'], train_freq=args['train_freq'], ) elif algo.__name__ == "TD3": print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ') # hyperparameters suggestions from : # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml n_actions = env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=float(args['noise_std']) * np.ones(n_actions)) class CustomPolicy2(ffp_td3): def __init__(self, *args, **kwargs): super(CustomPolicy2, self).__init__(*args, **kwargs, feature_extraction="mlp", layers=[400, 300]) model = TD3( CustomPolicy2, env, verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', batch_size=args['batch_size'], buffer_size=args['buffer_size'], gamma=args['gamma'], gradient_steps=args['gradient_steps'], learning_rate=args['learning_rate'], learning_starts=args['learning_starts'], action_noise=action_noise, train_freq=args['train_freq'], ) elif algo.__name__ == "TRPO": print('Initializing TRPO with RLBaselinesZoo hyperparameters .. 
') # hyperparameters suggestions from : # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml model = TRPO(mlp_standard, env, verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', timesteps_per_batch=args['timesteps_per_batch'], lam=args['lam'], max_kl=args['max_kl'], gamma=args['gamma'], vf_iters=args['vf_iters'], vf_stepsize=args['vf_stepsize'], entcoeff=args['entcoeff'], cg_damping=args['cg_damping'], cg_iters=args['cg_iters']) elif algo.__name__ == "ACKTR": print('Initializing ACKTR') model = ACKTR(mlp_standard, env, verbose=1, n_steps=128, ent_coef=0.01, lr_schedule='constant', learning_rate=0.0217, max_grad_norm=0.5, gamma=0.99, vf_coef=0.946) elif algo.__name__ == "PPO2": print('Initializing PPO2') print('Num envs : ', env.num_envs) model = PPO2( mlp_standard, env, n_steps=int(args['n_steps'] / env.num_envs), nminibatches=args['nminibatches'], lam=args['lam'], gamma=args['gamma'], ent_coef=args['ent_coef'], noptepochs=args['noptepochs'], learning_rate=args['learning_rate'], cliprange=args['cliprange'], verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', ) elif algo.__name__ == "TRPO_lagrangian": print( 'Initializing TRPO-lagrangian with safety-starter-agents hyperparameters .. ' ) model = TRPO_lagrangian( MLPWithSafeValue, env, verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', timesteps_per_batch=args['timesteps_per_batch'], lam=args['lam'], max_kl=args['max_kl'], gamma=args['gamma'], vf_iters=args['vf_iters'], vf_stepsize=args['vf_stepsize'], entcoeff=args['entcoeff'], cg_damping=args['cg_damping'], cg_iters=args['cg_iters'], cost_lim=args['cost_lim'], penalty_init=args['penalty_init'], penalty_lr=args['penalty_lr']) constrained = True else: print('No algorithm matched. Using SAC .. ') model = SAC( CustomPolicy, env, verbose=1, batch_size=args['batch_size'], buffer_size=args['buffer_size'], ent_coef=args['ent_coef'], learning_starts=args['learning_starts'], learning_rate=args['learning_rate'], train_freq=args['train_freq'], ) # change model name if using normalization if NORMALIZE: model_name = model_name.replace('.pkl', 'normalized_.pkl') elif MUJOCO_NORMALIZE: model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl') if SAVE_BEST_FOR_20: model.learn(total_timesteps=time_steps, tb_log_name=model_name, log_interval=10, callback=eval_callback) save_the_model() model_name = model_name.replace('best_', '') model.save(model_name) else: model.learn( total_timesteps=time_steps, tb_log_name=model_name.split('/')[-1], log_interval=10, ) model.save(model_name) evaluate_policy_on_env(env, model, render=False, iters=10, constrained=constrained) # save the environment params if NORMALIZE: # env.save(model_name.replace('.pkl', 'stats_.pkl')) env.save('data/models/env_stats/' + env_name + '.pkl') print('done :: ', model_name) exit()
def train_initial_policy( model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS): """Uses the specified algorithm on the target environment""" print("Using algorithm : ", algo.__name__) print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl") # define the environment here env = gym.make(env_name) env.seed(SEED) if NOISE_VALUE>0 : env = NoisyRealEnv(env, noise_value=NOISE_VALUE) if MUJOCO_NORMALIZE: env = MujocoNormalized(env) print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high) print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high) if algo.__name__ == "ACKTR": print('Using SubprovVecEnv') env = SubprocVecEnv([lambda: env for i in range(8)]) elif algo.__name__ == "SAC": print('Using standard gym environment') env = env else: print('Using Dummy Vec Env') env = DummyVecEnv([lambda : env]) if NORMALIZE : env = VecNormalize(env, training=True, norm_obs=True, norm_reward=False, clip_reward=1e6, ) with open('data/target_policy_params.yaml') as file: args = yaml.load(file, Loader=yaml.FullLoader) args = args[algo.__name__][PARAMS_ENV] print('~~ Loaded args file ~~') if algo.__name__ == "SAC": print('Initializing SAC with RLBaselinesZoo hyperparameters .. ') print('using 256 node architecture as in the paper') class CustomPolicy(ffp_sac): def __init__(self, *args, **kwargs): super(CustomPolicy, self).__init__(*args, **kwargs, feature_extraction="mlp", layers=[256, 256]) model = SAC(CustomPolicy, env, verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', batch_size=args['batch_size'], buffer_size=args['buffer_size'], ent_coef=args['ent_coef'], learning_starts=args['learning_starts'], learning_rate=args['learning_rate'], train_freq=args['train_freq'], seed=SEED, ) elif algo.__name__ == "TD3": print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ') # hyperparameters suggestions from : # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml n_actions = env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=float(args['noise_std']) * np.ones(n_actions)) class CustomPolicy2(ffp_td3): def __init__(self, *args, **kwargs): super(CustomPolicy2, self).__init__(*args, **kwargs, feature_extraction="mlp", layers=[400, 300]) model = TD3(CustomPolicy2, env, verbose = 1, tensorboard_log = 'data/TBlogs/initial_policy_training', batch_size = args['batch_size'], buffer_size = args['buffer_size'], gamma = args['gamma'], gradient_steps = args['gradient_steps'], learning_rate = args['learning_rate'], learning_starts = args['learning_starts'], action_noise = action_noise, train_freq=args['train_freq'], seed=SEED, ) elif algo.__name__ == "TRPO": print('Initializing TRPO with RLBaselinesZoo hyperparameters .. 
') # hyperparameters suggestions from : # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml model = TRPO(mlp_standard, env, verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', timesteps_per_batch=args['timesteps_per_batch'], lam=args['lam'], max_kl=args['max_kl'], gamma=args['gamma'], vf_iters=args['vf_iters'], vf_stepsize=args['vf_stepsize'], entcoeff=args['entcoeff'], cg_damping=args['cg_damping'], cg_iters=args['cg_iters'], seed=SEED, ) elif algo.__name__ == "ACKTR": print('Initializing ACKTR') model = ACKTR(mlp_standard, env, verbose=1, n_steps=128, ent_coef=0.01, lr_schedule='constant', learning_rate=0.0217, max_grad_norm=0.5, gamma=0.99, vf_coef=0.946, seed=SEED) elif algo.__name__ == "PPO2": print('Initializing PPO2') print('Num envs : ', env.num_envs) model = PPO2(mlp_standard, env, n_steps=int(args['n_steps']/env.num_envs), nminibatches=args['nminibatches'], lam=args['lam'], gamma=args['gamma'], ent_coef=args['ent_coef'], noptepochs=args['noptepochs'], learning_rate=args['learning_rate'], cliprange=args['cliprange'], verbose=1, tensorboard_log='data/TBlogs/initial_policy_training', seed=SEED, ) else: print('No algorithm matched. Using SAC .. ') model = SAC(CustomPolicy, env, verbose=1, batch_size=args['batch_size'], buffer_size=args['buffer_size'], ent_coef=args['ent_coef'], learning_starts=args['learning_starts'], learning_rate=args['learning_rate'], train_freq=args['train_freq'], seed=SEED, ) # change model name if using normalization if NORMALIZE: model_name = model_name.replace('.pkl', 'normalized_.pkl') elif MUJOCO_NORMALIZE: model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl') if SAVE_BEST_FOR_20: model.learn(total_timesteps=time_steps, tb_log_name=model_name, log_interval=10, callback=eval_callback) save_the_model() model_name = model_name.replace('best_', '') model.save(model_name) elif SAVE_INTERMEDIATE: check_callback = CheckpointCallback(save_freq=SAVE_FREQ, save_path=model_name[:-4], name_prefix=ENV_NAME + '_' + str(SEED), verbose=1, ) eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)]) eval_env.seed(SEED) eval_callback = EvalCallback(eval_env, n_eval_episodes=10, eval_freq=SAVE_FREQ, log_path=model_name[:-4], deterministic=False, render=False, verbose=1) callbacks = CallbackList([check_callback, eval_callback]) model.learn(total_timesteps=time_steps, tb_log_name=model_name.split('/')[-1], log_interval=10, callback=callbacks) model.save(model_name) npzfile = np.load(model_name[:-4] + '/evaluations.npz') average_rewards = np.mean(npzfile['results'], axis=1)[:, 0] with open(model_name[:-4] + "/eval_results.txt", "a") as f: for i in range(np.shape(average_rewards)[0]): f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i])) evaluate_policy_on_env(env, model, render=False, iters=50) else: model.learn(total_timesteps=time_steps, tb_log_name=model_name.split('/')[-1], log_interval=10,) model.save(model_name) evaluate_policy_on_env(env, model, render=False, iters=50) # save the environment params if NORMALIZE: # env.save(model_name.replace('.pkl', 'stats_.pkl')) env.save('data/models/env_stats/'+env_name+'.pkl') print('done :: ', model_name) exit()
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines import TD3
from stable_baselines.common.env_checker import check_env
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines.td3.policies import MlpPolicy

from NormalizedActions import NormalizeActionWrapper
from LearningRocket import LearningRocket

env = LearningRocket(VISUALIZE=False)
env = NormalizeActionWrapper(env)
check_env(env, warn=True)

# Gaussian action noise for TD3
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1)
#model = SAC('MlpPolicy', env, verbose=1)
# load() is a classmethod that returns a new model, so rebind the result
model = TD3.load("sac_rocket", env=env)

obs = env.reset()
env.sim.VISUALIZE = True
done = False
actionList = []
obsList = []
rewardList = []
rewardSum = []
X = []
Y = []
Z = []
def RocketTrainer(): #env = SubprocVecEnv([make_env(LearningRocket, 'E:\Tobi\LearningRocket\TestHoverTD3\LearningRocketHover.py', i) for i in range(72)]) # multiprocess environment env = make_vec_env(LearningRocket, n_envs=1) #env = LearningRocket(visualize=False) eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1) #env = VecNormalize(env) #eval_env = VecNormalize(eval_env) #env = VecNormalize.load("TestHoverTD3_env",env) #eval_env = VecNormalize.load("TestHoverTD3_env",eval_env) eval_callback = EvalCallback(eval_env, best_model_save_path='Agent007', log_path='./logs/', eval_freq=10000, deterministic=True, render=False, n_eval_episodes=1) #model = PPO2(MlpPolicy, env, n_steps=1500, nminibatches=144, lam=0.98, gamma=0.999, learning_rate=2.5e-4, # noptepochs=4,ent_coef=0.01,verbose=1, tensorboard_log="./rocket_tensorboard/", # policy_kwargs = dict(layers=[400, 300])) #model = PPO1(MlpPolicy, env, lam=0.98, gamma=0.999,verbose=1, tensorboard_log="./rocket_tensorboard/", # policy_kwargs = dict(layers=[400, 300])) n_actions = env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.2 * np.ones(n_actions)) model = TD3(MlpPolicy, env, action_noise=action_noise, batch_size=256, gamma=0.95, target_policy_noise=0.01, target_noise_clip=0.02, train_freq=10, gradient_steps=10, learning_rate=1e-3, learning_starts=7500, verbose=1, tensorboard_log="./rocket_tensorboard/", policy_kwargs=dict(layers=[400, 300]), buffer_size=100000) #model = TD3(MlpPolicy,env,verbose=1) start = t.time() #model = PPO2.load("TestHoverTD3", env=env, tensorboard_log="./rocket_tensorboard/") #model = TD3.load("TestHoverTD3", env=env, tensorboard_log="./rocket_tensorboard/") #while True: #model.learning_rate = 2.5e-3 model.learn(total_timesteps=200000, callback=eval_callback) model.save("TestHoverTD3") #env.save("TestHoverTD3_env") del model # remove to demonstrate saving and loading duration = t.time() - start model = TD3.load("TestHoverTD3", env=eval_env) #model = PPO2.load("TestHoverTD3", env=eval_env) # Enjoy trained agent obs = eval_env.reset() data = [] time = [] actions = [] alt_reward = [] mix_reward = [] temp_reward = [] valveChange = [] speedPunishes = [] total_reward = [] alt_cumu = [] mix_cumu = [] temp_cumu = [] total_cumu = [] start = True modifiers = [1000, 1000, 200, 1, 200, 2000, 10, 1000, 1500, 1] for i in range(10): data.append([]) for i in range(3): actions.append([]) lastValves = [0.15, 0.2, 0.15] for i in range(600): action, _states = model.predict(obs, deterministic=True) obs, rewards, dones, info = eval_env.step(action) # Or_obs = eval_env.get_original_obs() time.append(i) for j in range(10): data[j].append(obs[0][j] * modifiers[j]) data[2][i] -= 100 for j in range(3): actions[j].append(action[0][j]) offset = abs(data[0][i] - data[1][i]) # if offset < 10: # alt_reward.append(1-offset/10) # else: alt_reward.append((offset / 2) / 1000) mixError = abs(data[6][i] - 5.5) mix_reward.append((mixError / 0.2) / 1000) if mixError > 0.3: mix_reward[i] -= 1 tempError = abs(data[5][i] - 900) temp_reward.append((tempError / 30) / 1000) if tempError > 50: temp_reward[i] -= 1 total_reward.append(alt_reward[i] + mix_reward[i] + temp_reward[i]) if start is True: alt_cumu.append(alt_reward[i]) mix_cumu.append(mix_reward[i]) temp_cumu.append(temp_reward[i]) total_cumu.append(total_reward[i]) start = False else: alt_cumu.append(alt_reward[i] + alt_cumu[i - 1]) mix_cumu.append(mix_reward[i] + mix_cumu[i - 1]) temp_cumu.append(temp_reward[i] + temp_cumu[i - 1]) 
total_cumu.append(total_reward[i] + total_cumu[i - 1]) plt.figure(figsize=(11, 8)) plt.subplot(4, 2, 1) plt.xlabel('Time(s)') plt.ylabel('Offset (m)') plt.plot(time, data[0], label='Z Position') plt.plot(time, data[1], label='Z Speed') plt.subplot(4, 2, 2) plt.xlabel('Time(s)') plt.ylabel('Actions') plt.plot(time, actions[0], 'b', label='LOX Command') plt.plot(time, actions[1], 'r', label='LH2 Command') plt.plot(time, actions[2], 'y', label='Mix Command') plt.legend(loc='best') plt.subplot(4, 2, 3) plt.xlabel('Time(s)') plt.ylabel('Engine State') plt.plot(time, data[5], label='Temp') plt.legend(loc='best') plt.subplot(4, 2, 5) plt.xlabel('Time(s)') plt.ylabel('Engine State') plt.plot(time, data[4], label='Pressure') plt.legend(loc='best') plt.subplot(4, 2, 4) plt.xlabel('Time(s)') plt.ylabel('Mixture') plt.plot(time, data[6], label='Mixture') plt.legend(loc='best') plt.subplot(4, 2, 6) plt.xlabel('Time(s)') plt.ylabel('Reward values. Valve Error REAL valves') plt.plot(time, alt_reward, label='Altitude Error') plt.plot(time, mix_reward, label='Mixture Error') plt.plot(time, temp_reward, label='Temperature Error') plt.plot(time, total_reward, label='Total Reward') plt.subplot(4, 2, 8) plt.xlabel('Time(s)') plt.ylabel('Reward values cumulative') plt.plot(time, alt_cumu, label='Altitude Error') plt.plot(time, mix_cumu, label='Mixture Error') plt.plot(time, temp_cumu, label='Temperature Error') plt.plot(time, total_cumu, label='Total Reward') plt.legend(loc='best') print(duration) plt.show()
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): """ run the training of DDPG :param env_id: (str) the environment ID :param seed: (int) the initial random seed :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by seperating them with commas :param layer_norm: (bool) use layer normalization :param evaluation: (bool) enable evaluation of DDPG training :param kwargs: (dict) extra keywords for the training.train function """ # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) if evaluation and rank == 0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. start_time = 0 if rank == 0: start_time = time.time() model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, memory_limit=int(1e6), layer_norm=layer_norm, verbose=2, **kwargs) model.learn(total_timesteps=10000) env.close() if eval_env is not None: eval_env.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
def __init__(self, sim_env_name='Hopper-v2', real_env_name='HopperModified-v2', frames=NUM_FRAMES_INPUT, num_cores=NUM_CORES, num_rl_threads=NUM_RL_THREADS, load_policy=None, algo=None): self.env_name = sim_env_name self.real_env_name = real_env_name self.frames = frames self.num_cores = num_cores self.fwd_norms_x = (0., 1.) self.fwd_norms_y = (0., 1.) self.inv_norms_x = (0., 1.) self.inv_norms_y = (0., 1.) self.num_rl_threads = num_rl_threads self.real_env = SubprocVecEnv([ lambda: gym.make(self.real_env_name) for i in range(self.num_cores) ]) print('MODIFIED ENV BODY_MASS : ', gym.make(self.real_env_name).model.body_mass) self.sim_env = SubprocVecEnv( [lambda: gym.make(self.env_name) for i in range(self.num_cores)]) print('SIMULATED ENV BODY_MASS : ', gym.make(self.env_name).model.body_mass) # lists to reuse experience from previous grounding steps self.fwd_model_x_list = [] self.fwd_model_y_list = [] self.inv_model_x_list = [] self.inv_model_y_list = [] # initialize target policy if load_policy is None: print('LOADING -RANDOM- INITIAL POLICY') self.target_policy = PPO2(MlpPolicy, env=self.sim_env, verbose=1, tensorboard_log='data/TBlogs/' + self.env_name) else: print('LOADING -PRETRAINED- INITIAL POLICY') # self.target_policy = SAC.load( # load_policy, # env=SubprocVecEnv([lambda: gym.make(self.env_name)]), # tensorboard_log='data/TBlogs/'+self.env_name, # verbose=1, # batch_size=256, # buffer_size=1000000, # ) # TODO: write easy way to switch algorithms # self.target_policy = PPO2.load( # load_policy, # env=SubprocVecEnv([lambda: gym.make(self.env_name)]), # tensorboard_log='TBlogs/'+self.env_name, # verbose=1, # n_steps=256, # # buffer_size=1000000, # ) n_actions = self.sim_env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) self.target_policy = TD3.load( load_policy, env=DummyVecEnv([lambda: gym.make(self.env_name)]), tensorboard_log='data/TBlogs/' + self.env_name, verbose=1, batch_size=128, gamma=0.99, learning_rate=0.001, action_noise=action_noise, buffer_size=1000000, ) # define the Grounded Action Transformer models here self._init_gat_models() self.grounded_sim_env = None
def main(args): if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) if args.env == 'ant_dir': ant_dir_tasks = pickle.load(open(f"{args.task_path}/ant_dir_tasks", "rb")) env = AntDirEnv(tasks=ant_dir_tasks, include_goal=args.include_goal) elif args.env == 'ant_goal': env = AntGoalEnv(include_goal = args.include_goal) elif args.env == 'cheetah_dir': cheetah_dir_tasks = pickle.load(open(f"{args.task_path}/cheetah_dir_tasks", "rb")) env = HalfCheetahDirEnv(tasks = cheetah_dir_tasks, include_goal = args.include_goal) elif args.env == 'cheetah_vel': cheetah_vel_tasks = pickle.load(open(f"{args.task_path}/cheetah_vel_tasks", "rb")) env = HalfCheetahVelEnv(tasks = cheetah_vel_tasks, include_goal = args.include_goal) elif args.env == 'humanoid_dir': env = HumanoidDirEnv(include_goal = args.include_goal) elif args.env == 'walker_param': walker_tasks = pickle.load(open(f"{args.task_path}/walker_params_tasks", "rb")) env = WalkerRandParamsWrappedEnv(tasks = walker_tasks, include_goal = args.include_goal) elif args.env == 'ml45': from metaworld.benchmarks.base import Benchmark from metaworld.envs.mujoco.multitask_env import MultiClassMultiTaskEnv from metaworld.envs.mujoco.env_dict import HARD_MODE_ARGS_KWARGS, HARD_MODE_CLS_DICT args.type = 'train' if args.task is None: args.task = list(HARD_MODE_ARGS_KWARGS[args.type].keys())[args.task_idx] args_kwargs = HARD_MODE_ARGS_KWARGS[args.type][args.task] args_kwargs['kwargs']['obs_type'] = 'with_goal' args_kwargs['task'] = args.task env = HARD_MODE_CLS_DICT[args.type][args.task](*args_kwargs['args'], **args_kwargs['kwargs']) if args.env == 'ml45': env = TimeLimit(env, max_episode_steps = 150) pickle.dump(args_kwargs, open(args.log_dir + '/env_{}_{}_task{}.pkl'.format(args.env, args.type, args.task_idx), "wb" )) else: env.observation_space = gym.spaces.box.Box(env.observation_space.low, env.observation_space.high) env.action_space = gym.spaces.box.Box(env.action_space.low, env.action_space.high) env = TimeLimit(env, max_episode_steps = 200) pickle.dump(env.unwrapped.tasks, open(args.log_dir + '/env_{}_task{}.pkl'.format(args.env, args.task_idx), "wb" )) if args.alg == 'td3': from stable_baselines.td3.policies import MlpPolicy from stable_baselines.ddpg.noise import NormalActionNoise from src.td3 import TD3 n_actions = env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log = args.log_dir + '/tensorboard/log_{}_task_{}'.format(args.env, args.task_idx), buffer_log = args.log_dir + '/buffers_{}_{}_'.format(args.env, args.task_idx), full_size = args.full_buffer_size, buffer_size = args.replay_buffer_size, batch_size = args.batch_size, policy_kwargs={'layers': [400, 300]}, learning_rate=args.outer_policy_lr ) print('###################################') print('###################################') print('## Running *TD3* data collection ##') print('###################################') print('###################################') model.learn(total_timesteps=args.full_buffer_size, log_interval=10) else: from stable_baselines.sac.policies import MlpPolicy from stable_baselines.sac.policies import FeedForwardPolicy from src.sac2 import SAC env.set_task_idx(args.task_idx) model = SAC(MlpPolicy, env, log_dir=args.log_dir, verbose=1, tensorboard_log = args.log_dir + '/tensorboard/log_{}_task_{}'.format(args.env, args.task_idx), buffer_log = args.log_dir + '/buffers_{}_{}_'.format(args.env, args.task_idx), buffer_size 
= args.replay_buffer_size, full_size = args.full_buffer_size, batch_size = args.batch_size, policy_kwargs={'layers': [300,300,300]}, learning_rate = 3e-4, gamma = 0.99) print('###################################') print('###################################') print('## Running *SAC* data collection ##') print('###################################') print('###################################') model.learn(total_timesteps = args.full_buffer_size, log_interval = 1) model.save(args.log_dir + '/model_{}_{}'.format(args.env, args.task_idx))
def train(self, args, callback, env_kwargs=None, train_kwargs=None): env = self.makeEnv(args, env_kwargs=env_kwargs) if train_kwargs is None: train_kwargs = {} # Parse noise_type action_noise = None param_noise = None n_actions = env.action_space.shape[-1] if args.noise_param: param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_param_sigma, desired_action_stddev=args.noise_param_sigma) if train_kwargs.get("noise_action", args.noise_action) == 'normal': action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=args.noise_action_sigma * np.ones(n_actions)) elif train_kwargs.get("noise_action", args.noise_action) == 'ou': action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=args.noise_action_sigma * np.ones(n_actions)) # filter the hyperparam, and set default values in case no hyperparam train_kwargs = {k: v for k, v in train_kwargs.items() if k not in ["noise_action_sigma", "noise_action"]} # get the associated policy for the architecture requested if args.srl_model == "raw_pixels": args.policy = "cnn" else: args.policy = "mlp" self.policy = args.policy self.ob_space = env.observation_space self.ac_space = env.action_space policy_fn = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[args.policy] param_kwargs = { "verbose": 1, "render_eval": False, "render": False, "reward_scale": 1., "param_noise": param_noise, "normalize_returns": False, "normalize_observations": (args.srl_model == "raw_pixels"), "critic_l2_reg": 1e-2, "actor_lr": 1e-4, "critic_lr": 1e-3, "action_noise": action_noise, "enable_popart": False, "gamma": 0.99, "clip_norm": None, "nb_train_steps": 100, "nb_rollout_steps": 100, "nb_eval_steps": 50, "batch_size": args.batch_size } self.model = self.model_class(policy_fn, env, **{**param_kwargs, **train_kwargs}) self.model.learn(total_timesteps=args.num_timesteps, seed=args.seed, callback=callback) env.close()
def train( task, alg, logdir, domain_name, *, random_seed=None, num_steps=int(2e3), log_every=int(10e3), num_parallel=8, load_policy=False, load_policy_dir="", **kwargs ): """Train and evaluate an agent Args: task (str): Jitterbug task to train on alg (str): Algorithm to train, one of; - 'ddpg': DDPG Algorithm - 'ppo2': PPO2 Algorithm - 'sac': SAC Algorithm logdir (str): Logging directory domain_name (str): Name of the DMC domain random_seed (int): Random seed to use, or None num_steps (int): Number of training steps to train for log_every (int): Save and log progress every this many timesteps num_parallel (int): Number of parallel environments to run. Only used load_policy (bool): Whether to load an existing or not. It Yes, the policy is loaded from logdir. for A2C and PPO2. """ assert alg in ('ddpg', 'sac', 'ppo2', 'td3'), "Invalid alg: {}".format(alg) assert domain_name in ('jitterbug', 'augmented_jitterbug'), "Invalid domain_name: {}".format(domain_name) # Cast args to types if random_seed is not None: random_seed = int(random_seed) else: random_seed = int(time.time()) # Fix random seed random.seed(random_seed) np.random.seed(random_seed) # Prepare the logging directory os.makedirs(logdir, exist_ok=True) print("Training {} on {} with seed {} for {} steps " "(log every {}), saving to {}".format( alg, task, random_seed, num_steps, log_every, logdir )) if domain_name == "augmented_jitterbug": augmented_jitterbug.augment_Jitterbug(modify_legs=True, modify_mass=True, modify_coreBody1=False, modify_coreBody2=False, modify_global_density=False, modify_gear=False, ) # Construct DMC env env_dmc = suite.load( domain_name=domain_name, task_name=task, task_kwargs=dict(random=random_seed, norm_obs=True), environment_kwargs=dict(flat_observation=True) ) # Wrap gym env in a dummy parallel vector if alg in ('ppo2'): if num_parallel > multiprocessing.cpu_count(): warnings.warn("Number of parallel workers " "({}) > CPU count ({}), setting to # CPUs - 1".format( num_parallel, multiprocessing.cpu_count() )) num_parallel = max( 1, multiprocessing.cpu_count() - 1 ) print("Using {} parallel environments".format(num_parallel)) # XXX ajs 13/Sep/19 Hack to create multiple monitors that don't write to the same file env_vec = SubprocVecEnv([ lambda: Monitor( gym.wrappers.FlattenDictWrapper( jitterbug_dmc.JitterbugGymEnv(env_dmc), dict_keys=["observations"] ), os.path.join(logdir, str(random.randint(0, 99999999))), allow_early_resets=True ) for n in range(num_parallel) ]) else: num_parallel = 1 env_vec = DummyVecEnv([ lambda: Monitor( gym.wrappers.FlattenDictWrapper( jitterbug_dmc.JitterbugGymEnv(env_dmc), dict_keys=["observations"] ), logdir, allow_early_resets=True ) ]) # Record start time start_time = datetime.datetime.now() def _cb(_locals, _globals): """Callback for during training""" if 'last_num_eps' not in _cb.__dict__: _cb.last_num_eps = 0 # Extract episode reward history based on model type if isinstance(_locals['self'], DDPG): ep_r_hist = list(_locals['episode_rewards_history']) elif isinstance(_locals['self'], PPO2): ep_r_hist = [d['r'] for d in _locals['ep_info_buf']] elif isinstance(_locals['self'], SAC): ep_r_hist = [d['r'] for d in _locals['ep_info_buf']] elif isinstance(_locals['self'], TD3): ep_r_hist = [d['r'] for d in _locals['ep_info_buf']] else: raise ValueError("Invalid algorithm: {}".format( _locals['self'] )) # Compute # elapsed steps based on # elapsed episodes ep_size = int( jitterbug_dmc.jitterbug.DEFAULT_TIME_LIMIT / jitterbug_dmc.jitterbug.DEFAULT_CONTROL_TIMESTEP ) num_eps = 
len(ep_r_hist) elapsed_steps = ep_size * num_eps # Compute elapsed time in seconds elapsed_time = (datetime.datetime.now() - start_time).total_seconds() # Log some info if num_eps != _cb.last_num_eps: _cb.last_num_eps = num_eps print("{:.2f}s | {}ep | {}#: episode reward = " "{:.2f}, last 5 episode reward = {:.2f}".format( elapsed_time, num_eps, elapsed_steps, ep_r_hist[-1], np.mean(ep_r_hist[-5:]) )) # Save model checkpoint model_path = os.path.join(logdir, "model.pkl") print("Saved checkpoint to {}".format(model_path)) _locals['self'].save(model_path) return True if alg == 'ddpg': # Default parameters for DDPG # kwargs.setdefault("normalize_returns", True) # kwargs.setdefault("return_range", (0., 1.)) # kwargs.setdefault("normalize_observations", True) # kwargs.setdefault("observation_range", (-1., 1.)) kwargs.setdefault("batch_size", 256) kwargs.setdefault("actor_lr", 1e-4) kwargs.setdefault("critic_lr", 1e-4) kwargs.setdefault("buffer_size", 1000000) kwargs.setdefault("action_noise", OrnsteinUhlenbeckActionNoise( mean=np.array([0.3]), sigma=0.3, theta=0.15 )) print("Constructing DDPG agent with settings:") pprint.pprint(kwargs) # Construct the agent if load_policy: print("Load DDPG agent from ", load_policy_dir) agent = DDPG.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"), policy=CustomPolicyDDPG, env=env_vec, verbose=1, tensorboard_log=logdir, **kwargs ) else: agent = DDPG( policy=CustomPolicyDDPG, env=env_vec, verbose=1, tensorboard_log=logdir, **kwargs ) # Train for a while (logging and saving checkpoints as we go) agent.learn( total_timesteps=num_steps, callback=_cb ) elif alg == 'ppo2': kwargs.setdefault("learning_rate", 1e-4) kwargs.setdefault("n_steps", 256 // num_parallel) kwargs.setdefault("ent_coef", 0.01) kwargs.setdefault("cliprange", 0.1) print("Constructing PPO2 agent with settings:") pprint.pprint(kwargs) if load_policy: print("Load PPO2 agent from ", load_policy_dir) agent = PPO2.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"), policy=CustomPolicyGeneral, env=env_vec, verbose=1, tensorboard_log=logdir, **kwargs ) else: agent = PPO2( policy=CustomPolicyGeneral, env=env_vec, verbose=1, tensorboard_log=logdir, **kwargs ) # Train for a while (logging and saving checkpoints as we go) agent.learn( total_timesteps=num_steps, callback=_cb, log_interval=10 ) elif alg == 'sac': # Default parameters for SAC kwargs.setdefault("learning_rate", 1e-4) kwargs.setdefault("buffer_size", 1000000) kwargs.setdefault("batch_size", 256) kwargs.setdefault("ent_coef", 'auto') # kwargs.setdefault("ent_coef", 'auto_0.1') kwargs.setdefault("action_noise", NormalActionNoise( mean=0, sigma=0.2, )) print("Constructing SAC agent with settings:") pprint.pprint(kwargs) # Construct the agent # XXX ajs 14/Sep/19 SAC in stable_baselines uses outdated policy # classes so we just use MlpPolicy and pass policy_kwargs if load_policy: print("Load SAC agent from ", load_policy_dir) kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu)) agent = SAC.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"), env=env_vec, verbose=1, tensorboard_log=logdir, **kwargs ) else: agent = SAC( policy='MlpPolicy', env=env_vec, verbose=1, tensorboard_log=logdir, policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu), **kwargs ) # Train for a while (logging and saving checkpoints as we go) agent.learn( total_timesteps=num_steps, callback=_cb ) elif alg == 'td3': # Default parameters for SAC kwargs.setdefault("learning_rate", 1e-4) 
kwargs.setdefault("buffer_size", 1000000) kwargs.setdefault("batch_size", 256) kwargs.setdefault("gradient_steps", 1000) kwargs.setdefault("learning_starts", 10000) kwargs.setdefault("train_freq", 1000) # kwargs.setdefault("ent_coef", 'auto_0.1') kwargs.setdefault("action_noise", NormalActionNoise( mean=0, sigma=0.2, )) print("Constructing TD3 agent with settings:") pprint.pprint(kwargs) # Construct the agent # XXX ajs 14/Sep/19 SAC in stable_baselines uses outdated policy # classes so we just use MlpPolicy and pass policy_kwargs if load_policy: print("Load TD3 agent from ", load_policy_dir) kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu)) agent = TD3.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"), env=env_vec, verbose=1, tensorboard_log=logdir, **kwargs ) else: agent = TD3( policy='MlpPolicy', env=env_vec, verbose=1, tensorboard_log=logdir, policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu), **kwargs ) # Train for a while (logging and saving checkpoints as we go) agent.learn( total_timesteps=num_steps, callback=_cb ) else: raise ValueError("Invalid alg: {}".format(alg)) # Save final model agent.save(os.path.join(logdir, 'model.final.pkl')) print("Done")
import numpy as np
import tensorflow as tf

from stable_baselines.td3 import TD3, LnCnnPolicy, LnMlpPolicy
#from stable_baselines.ddpg import DDPG, LnMlpPolicy
from stable_baselines.ddpg.noise import NormalActionNoise
from stable_baselines.common.vec_env import DummyVecEnv

from env import *
from config import config

policy = LnMlpPolicy
action_noise = NormalActionNoise(mean=np.zeros(config['ACTION_DIM']),
                                 sigma=0.1 * np.ones(config['ACTION_DIM']))

#env = SketchDesigner(SketchDiscriminator(config['SAVED_GAN']))
env = SketchDesigner(SketchClassifier(config['SAVED_CNN']))
#env = DummyVecEnv([lambda: env])

agent = TD3(
    policy,
    env,
    random_exploration=0.2,
    #action_noise=action_noise,
    #tensorboard_log='./log/',
    verbose=1)

#agent.get_env().env_method('get_policy', agent.policy_tf)
agent.get_env().get_policy(agent.policy_tf)

# train in 400 chunks of 1000 steps, saving a checkpoint after each chunk
for _ in range(400):
    agent.learn(1000, reset_num_timesteps=False)
    agent.save('./save/4/model')
def train_SAC(env, eval_env, out_dir, seed=None, **kwargs): # Delete keys so the dict can be pass to the model constructor policy = kwargs['policy'] n_timesteps = kwargs['n_timesteps'] noise_type = None if 'noise_type' in kwargs: noise_type = kwargs['noise_type'] del kwargs['noise_type'] del kwargs['policy'] del kwargs['n_timesteps'] save_frequency = 10000 eval_frequency = 50000 eval_episodes = 1000 if 'save_freq' in kwargs: save_frequency = kwargs['save_freq'] del kwargs['save_freq'] if 'eval_freq' in kwargs: eval_frequency = kwargs['eval_freq'] del kwargs['eval_freq'] if 'eval_episides' in kwargs: eval_episodes = kwargs['eval_episides'] del kwargs['eval_episides'] # the noise objects - usually not necessary for SAC but can help for hard exploration tasks nb_actions = env.action_space.shape[-1] action_noise = None if noise_type: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Create learning rate schedule for key in ['learning_rate', 'learning_rate_pi', 'cliprange']: if key in kwargs: if isinstance(kwargs[key], str): schedule, initial_value = kwargs[key].split('_') initial_value = float(initial_value) kwargs[key] = linear_schedule(initial_value) elif isinstance(kwargs[key], float): kwargs[key] = constfn(kwargs[key]) else: raise ValueError('Invalid valid for {}: {}'.format( key, kwargs[key])) if 'continue' in kwargs and kwargs['continue'] is True: print("Loading pretrained agent") list_of_models = glob.glob(os.path.join(out_dir, '*.zip')) last_saved_model = max(list_of_models, key=os.path.getctime) model = SAC_residual.load(last_saved_model, env=env, tensorboard_log=os.path.join(out_dir, 'tb'), verbose=1, **kwargs) reset_num_timesteps = False if 'num_timesteps' in kwargs: model.num_timesteps = kwargs['num_timesteps'] del kwargs['num_timesteps'] else: if 'continue' in kwargs: del kwargs['continue'] # create model model = SAC(policy, env, action_noise=action_noise, seed=seed, verbose=1, tensorboard_log=os.path.join(out_dir, 'tb'), full_tensorboard_log=False, **kwargs) reset_num_timesteps = True # start training train_callback = get_train_callback(eval_env, seed, out_dir, save_f=save_frequency, eval_f=eval_frequency, eval_ep=eval_episodes) model.learn(total_timesteps=n_timesteps, callback=train_callback, log_interval=10, reset_num_timesteps=reset_num_timesteps) return model
def main(args): log_dir = args.log_path if ( args.log_path is not None ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S') if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 configure_logger(log_dir) else: rank = MPI.COMM_WORLD.Get_rank() configure_logger(log_dir, format_strs=[]) set_global_seeds(args.seed) model_class = SAC_parallel n_workers = args.num_workers if not args.play else 1 env_kwargs = get_env_kwargs(args.env, random_ratio=args.random_ratio, sequential=args.sequential, reward_type=args.reward_type, n_object=args.n_object) def make_thunk(rank): return lambda: make_env( env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs) env = ParallelSubprocVecEnv([make_thunk(i) for i in range(n_workers)], reset_when_done=True) if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')): os.remove(os.path.join(logger.get_dir(), 'eval.csv')) print('Remove existing eval.csv') eval_env_kwargs = env_kwargs.copy() eval_env_kwargs['random_ratio'] = 0.0 eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs) eval_env = FlattenDictWrapper( eval_env, ['observation', 'achieved_goal', 'desired_goal']) if not args.play: os.makedirs(log_dir, exist_ok=True) # Available strategies (cf paper): future, final, episode, random goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE if not args.play: from stable_baselines.ddpg.noise import NormalActionNoise noise_type = args.action_noise.split('_')[0] if noise_type == 'none': parsed_action_noise = None elif noise_type == 'normal': sigma = float(args.action_noise.split('_')[1]) parsed_action_noise = NormalActionNoise( mean=np.zeros(env.action_space.shape), sigma=sigma * np.ones(env.action_space.shape)) else: raise NotImplementedError train_kwargs = get_train_kwargs("sac", args, parsed_action_noise, eval_env) def callback(_locals, _globals): if _locals['step'] % int(1e3) == 0: if 'FetchStack' in args.env: mean_eval_reward = stack_eval_model( eval_env, _locals["self"], init_on_table=(args.env == 'FetchStack-v2')) elif 'MasspointPushDoubleObstacle-v2' in args.env: mean_eval_reward = egonav_eval_model( eval_env, _locals["self"], env_kwargs["random_ratio"], fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.])) mean_eval_reward2 = egonav_eval_model( eval_env, _locals["self"], env_kwargs["random_ratio"], goal_idx=0, fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.])) log_eval(_locals['self'].num_timesteps, mean_eval_reward2, file_name="eval_box.csv") else: mean_eval_reward = eval_model(eval_env, _locals["self"]) log_eval(_locals['self'].num_timesteps, mean_eval_reward) if _locals['step'] % int(2e4) == 0: model_path = os.path.join( log_dir, 'model_' + str(_locals['step'] // int(2e4))) model.save(model_path) print('model saved to', model_path) return True class CustomSACPolicy(SACPolicy): def __init__(self, *model_args, **model_kwargs): super(CustomSACPolicy, self).__init__( *model_args, **model_kwargs, layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env else [256, 256, 256, 256], feature_extraction="mlp") register_policy('CustomSACPolicy', CustomSACPolicy) from utils.sac_attention_policy import AttentionPolicy register_policy('AttentionPolicy', AttentionPolicy) policy_kwargs = get_policy_kwargs("sac", args) if rank == 0: print('train_kwargs', train_kwargs) print('policy_kwargs', policy_kwargs) # Wrap the model model = HER2(args.policy, env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, num_workers=args.num_workers, policy_kwargs=policy_kwargs, verbose=1, 
**train_kwargs) print(model.get_parameter_list()) # Train the model model.learn( int(args.num_timesteps), seed=args.seed, callback=callback, log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10) if rank == 0: model.save(os.path.join(log_dir, 'final')) # WARNING: you must pass an env # or wrap your environment with HERGoalEnvWrapper to use the predict method if args.play and rank == 0: assert args.load_path is not None model = HER2.load(args.load_path, env=env) fig, ax = plt.subplots(1, 1, figsize=(8, 8)) obs = env.reset() if 'FetchStack' in args.env: env.env_method('set_task_array', [[(env.get_attr('n_object')[0], 0)]]) obs = env.reset() while env.get_attr('current_nobject')[0] != env.get_attr( 'n_object')[0] or env.get_attr('task_mode')[0] != 1: obs = env.reset() elif 'FetchPushWallObstacle' in args.env: while not (obs['observation'][0][4] > 0.7 and obs['observation'][0][4] < 0.8): obs = env.reset() env.env_method('set_goal', [np.array([1.18, 0.8, 0.425, 1, 0])]) obs = env.env_method('get_obs') obs = { 'observation': obs[0]['observation'][None], 'achieved_goal': obs[0]['achieved_goal'][None], 'desired_goal': obs[0]['desired_goal'][None] } # obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']]) elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env: while np.argmax(obs['desired_goal'][0][3:]) != 0: obs = env.reset() elif 'MasspointMaze-v2' in args.env: while obs['observation'][0][0] < 3 or obs['observation'][0][1] < 3: obs = env.reset() env.env_method('set_goal', [np.array([1., 1., 0.15])]) obs = env.env_method('get_obs') obs = { 'observation': obs[0]['observation'][None], 'achieved_goal': obs[0]['achieved_goal'][None], 'desired_goal': obs[0]['desired_goal'][None] } print('goal', obs['desired_goal'][0], 'obs', obs['observation'][0]) episode_reward = 0.0 images = [] frame_idx = 0 num_episode = 0 for i in range(env_kwargs['max_episode_steps'] * 10): img = env.render(mode='rgb_array') ax.cla() ax.imshow(img) tasks = ['pick and place', 'stack'] ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) + ', task: ' + tasks[np.argmax(obs['observation'][0][-2:])]) images.append(img) action, _ = model.predict(obs, deterministic=True) obs, reward, done, _ = env.step(action) episode_reward += reward frame_idx += 1 if args.export_gif: plt.imsave( os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img) else: plt.pause(0.02) if done: print('episode_reward', episode_reward) obs = env.reset() if 'FetchStack' in args.env: while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \ env.get_attr('task_mode')[0] != 1: obs = env.reset() elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env: while np.argmax(obs['desired_goal'][0][3:]) != 0: obs = env.reset() print('goal', obs['desired_goal'][0]) episode_reward = 0.0 frame_idx = 0 num_episode += 1 if num_episode >= 1: break exit() if args.export_gif: os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path) + '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' + os.path.join(os.path.dirname(args.load_path), args.env + '.mp4')) for i in range(env_kwargs['max_episode_steps'] * 10): # images.append(plt.imread('tempimg' + str(i) + '.png')) try: os.remove( os.path.join(os.path.dirname(args.load_path), 'tempimg' + str(i) + '.png')) except: pass
def train_HER(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    algo_name = kwargs['algo_name']
    n_timesteps = kwargs['n_timesteps']

    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']

    # HER available goal-selection strategies (cf. paper): future, final, episode, random
    goal_selection_strategy = kwargs['goal_selection_strategy']
    n_sampled_goal = kwargs['n_sampled_goal']

    del kwargs['policy']
    del kwargs['algo_name']
    del kwargs['n_timesteps']
    del kwargs['goal_selection_strategy']
    del kwargs['n_sampled_goal']

    # Set agent algorithm
    agent = set_agent(algo_name)
    if not agent:
        print("invalid algorithm for HER")
        return

    # The noise objects
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None
    if noise_type:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'adaptive-param' in current_noise_type and algo_name == 'ddpg':
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(key, kwargs[key]))

    kwargs['tensorboard_log'] = os.path.join(log_dir, 'tb')
    kwargs['full_tensorboard_log'] = False
    kwargs['seed'] = seed
    kwargs['action_noise'] = action_noise
    if algo_name == 'ddpg':
        kwargs['param_noise'] = param_noise

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training from a saved model
        print("Loading pretrained agent")
        # Policy should not be changed
        for key in ['policy', 'policy_kwargs']:
            if key in kwargs:
                del kwargs[key]
        model = HER.load(os.path.join(out_dir, 'final_model.pkl'), env=env, verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = HER(policy, env, agent,
                    goal_selection_strategy=goal_selection_strategy,
                    n_sampled_goal=n_sampled_goal,
                    verbose=1, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
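Several of train_HER's kwargs are string-encoded: noise_type follows a "<kind>_<stddev>" convention (e.g. "normal_0.1", or a comma-separated list such as "adaptive-param_0.2,ou_0.3"), and learning_rate may be either a float or a "linear_<initial>" string that is converted into a schedule. A minimal, hypothetical call is sketched below; the FetchPush-v1 environment id, output directory, and hyperparameter values are illustrative placeholders, not values from this codebase.

import gym

# Hypothetical invocation of train_HER defined above; all values are placeholders.
env = gym.make('FetchPush-v1')              # any goal-based (Dict observation) environment
model = train_HER(
    env,
    out_dir='/tmp/her_sac_fetchpush',
    seed=0,
    policy='MlpPolicy',
    algo_name='sac',                         # resolved to an algorithm class by set_agent()
    n_timesteps=int(1e6),
    noise_type='normal_0.1',                 # parsed into NormalActionNoise with sigma=0.1
    goal_selection_strategy='future',
    n_sampled_goal=4,
    learning_rate='linear_0.001',            # converted into a linear schedule
)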
# Parameters (for training)
tau = 0.1           # update rate for the target model
gamma = 0.95        # discount rate for the Q value
# batch_size = NUMCONC*5+3  # size of batch
batch_size = 10
alr = 0.003         # actor learning rate
clr = 0.003         # critic learning rate

# Noise (for better exploration)
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec()
# action_noise = None
# param_noise = None
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=float(0.5) * np.ones(n_actions))  # Gaussian action noise

# Model (DDPG)
# Deep Deterministic Policy Gradient.
# DDPG combines ideas from Nature DQN, Actor-Critic and DPG, and is designed for
# continuous action spaces.
# Policy learning: the policy network (actor) takes the state as input and is updated
# according to the policy gradient.
# Q-learning: the value network (critic) takes state and action as input and is adjusted
# to minimize the loss.
# Q-learning with a function approximator is largely based on minimizing this MSBE loss,
# with two main tricks: a replay buffer and a target network.
# The replay buffer stores past experience, since DDPG is an off-policy algorithm.
# A target network stabilizes the MSBE minimization.
# A target policy network computes an action that approximately maximizes Q_{\phi_{\text{targ}}}.
# An Ornstein-Uhlenbeck process is applied to add exploration noise during training so that
# DDPG policies explore better.
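As a concrete illustration of the setup described above (DDPG with a replay buffer, target networks, and exploration noise), the following sketch wires the same hyperparameters into stable-baselines' DDPG, using Ornstein-Uhlenbeck noise in place of the Gaussian noise; the Pendulum-v0 environment and the timestep budget are placeholder assumptions.

import gym
import numpy as np
from stable_baselines import DDPG
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

# Minimal DDPG sketch reusing the hyperparameters above; 'Pendulum-v0' is a stand-in env.
env = gym.make('Pendulum-v0')
n_actions = env.action_space.shape[-1]

# Temporally correlated exploration noise, as described in the comments above.
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.5 * np.ones(n_actions))

model = DDPG('MlpPolicy', env,
             gamma=0.95,        # discount rate, as above
             tau=0.1,           # target network update rate, as above
             batch_size=10,
             actor_lr=0.003,
             critic_lr=0.003,
             action_noise=action_noise,
             verbose=1)
model.learn(total_timesteps=100000)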
def main(args):
    log_dir = args.log_path if (args.log_path is not None) \
        else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])
    set_global_seeds(args.seed)

    model_class = SAC_SIR  # works also with SAC, DDPG and TD3

    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(args.num_workers)],
                                reset_when_done=True)

    def make_thunk_aug(rank):
        return lambda: FlattenDictWrapper(make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
                                          ['observation', 'achieved_goal', 'desired_goal'])

    aug_env_kwargs = env_kwargs.copy()
    del aug_env_kwargs['max_episode_steps']
    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(args.num_workers)],
                                    reset_when_done=False)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape),
                                                    sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError

        train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise, eval_env, aug_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(eval_env, _locals["self"],
                                                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
                                                         fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
                                                          goal_idx=0,
                                                          fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps, mean_eval_reward2, file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args, **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac_sir", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)

        # Wrap the model
        model = HER2(args.policy, env, model_class,
                     n_sampled_goal=4,
                     start_augment_time=args.start_augment,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(int(args.num_timesteps),
                    seed=args.seed, callback=callback,
                    log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))