def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])
    data = []  # accumulate learning results so the return value is defined
    if isinstance(training_tag, float):
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag, verbose=1,
                    policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            env.reset()
            # this SAC fork returns (model, results) from learn()
            (model, learning_results) = model.learn(
                total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data.append(learning_results)
            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                           "_s" + str(step) + "_t" + str(file_tag) +
                           "_i" + str(CURRENT_ITERATION) +
                           "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                       "_t" + str(file_tag) +
                       "_i" + str(CURRENT_ITERATION) +
                       "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model
    return data
def model_training_learning(env_train, model_name, timesteps=100000):
    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")
    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    print("Learning ", model_name, " time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed.")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("- ", model_name, " saved.")
    print("Training time ", model_name, " : ", (end - start) / 60, " minutes")
    os.chdir("./..")
    os.chdir("./..")
    return model
def run_experiment(verbose, tensorboard_log, learning_rate):
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env, osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(learning_rate)))
    env.close()
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7
    save_video_length = 200
    save_video_interval = 1000000
    env = VecVideoRecorder(
        env, osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
def explore(app, emulator, appium, timesteps, timer, save_policy,
            policy_dir, cycle, train_freq=5, target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy, env, verbose=1, train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
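# `TimerCallback` is not defined in this snippet. Below is a minimal sketch of
# what it presumably does (stop training once a wall-clock budget runs out),
# built on stable-baselines' BaseCallback; the attribute names and the
# "timer is a budget in seconds" interpretation are assumptions.
import time
from stable_baselines.common.callbacks import BaseCallback

class TimerCallback(BaseCallback):
    def __init__(self, timer, app, verbose=0):
        super(TimerCallback, self).__init__(verbose)
        self.timer = timer  # assumed: time budget in seconds
        self.app = app
        self.start_time = None

    def _on_training_start(self):
        self.start_time = time.time()

    def _on_step(self):
        # returning False stops model.learn() early
        return (time.time() - self.start_time) < self.timer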
def main(argv):
    fixed = True
    policy_name = "sac_reaching_policy"
    obj_pose_rnd_std = 0 if fixed else 0.05
    pandaenv = pandaReachGymEnv(renders=True, use_IK=0, numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]
    pandaenv = DummyVecEnv([lambda: pandaenv])
    model = SAC(MlpPolicy, pandaenv, gamma=0.9, batch_size=16, verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")
    model.learn(total_timesteps=1000000)
    model.save("../pybullet_logs/pandareach_sac/" + policy_name)
    del model  # remove to demonstrate saving and loading
def train_SAC(env_train, model_name, timesteps=50000):
    start = time.time()
    model = SAC('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()
    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq,
          ckpt_freq, load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)

    ############################
    #         Logging          #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         Callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name + '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_, n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq, log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #           Run            #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps), log_interval=20,
                callback=callback)
    model.save(os.path.join(path, "SAC_Walker2d"))
def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = SAC(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32,
                target_update_interval=32, tensorboard_log=dir_path + '/Logs/')
    model.learn(total_timesteps=2000, log_interval=1000000)
    model.save("Grasp_Model_Full_Pose")
def train(learning_rate, time_steps, env, model_path):
    # Reset the default graph to avoid conflicts with existing parameters;
    # not recommended if parameters are meant to be reused.
    tf.reset_default_graph()
    # default policy is MlpPolicy; pass the learning_rate argument through
    model = SAC(CustomSACPolicy, env, verbose=1, seed=10,
                learning_rate=learning_rate, n_cpu_tf_sess=16)
    model.learn(total_timesteps=int(time_steps), log_interval=1000,
                callback=callback)
    model.save(model_path)
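# The module-level `callback` referenced above is not shown. A minimal sketch
# using stable-baselines' legacy function-callback interface (an assumption
# about what the original looked like): it logs progress and never stops
# training early.
def callback(_locals, _globals):
    """
    :param _locals: (dict) local variables of the learn() loop
    :param _globals: (dict)
    :return: (bool) False stops training
    """
    self_ = _locals['self']
    if self_.num_timesteps % 10000 == 0:
        print('timesteps so far:', self_.num_timesteps)
    return True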
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8
    save_video_length = 200
    save_video_interval = 1000000

    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  # , 5e-4, 1e-5
        logger = osp.join(
            expDir, name,
            'logs%s_%s' % (np.format_float_scientific(nIter),
                           np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env, osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    # from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()
    # generate expert trajectories
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100,
                         n_episodes=10)
    # load dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz',
                            traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)
    model.learn(total_timesteps=timesteps)
    end = time.time()
    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
def train_SAC(env_train, model_name, timesteps=100000):
    # train SAC model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train SAC Model with MlpPolicy: ")
    model = SAC('MlpPolicy', env_train, verbose=0)
    print("SAC Learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("SAC Model learning completed.")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("SAC Model save finished.")
    print('Training time SAC: ', (end - start) / 60, ' minutes')
    os.chdir("./..")
    return model
def train_SAC(self, model_name, model_params=config.SAC_PARAMS):
    """SAC model"""
    from stable_baselines import SAC
    env_train = self.env
    start = time.time()
    model = SAC(
        'MlpPolicy',
        env_train,
        batch_size=model_params['batch_size'],
        buffer_size=model_params['buffer_size'],
        learning_rate=model_params['learning_rate'],
        learning_starts=model_params['learning_starts'],
        ent_coef=model_params['ent_coef'],
        verbose=model_params['verbose'],
        tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'],
                tb_log_name="SAC_run")
    end = time.time()
    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
                 specific_env_len=70, s_len=150, walls=True,
                 target_vel=params["target_vel"],
                 use_contacts=params["use_contacts"])
    model = SAC('MlpPolicy', env, learning_rate=3e-3, verbose=1,
                batch_size=64, tensorboard_log="/tmp", gamma=0.99)
    model.learn(total_timesteps=int(params["steps"]))
    print("Done learning, saving model")
    model.save("agents/SBL_{}".format(params["ID"]))
    print("Saved model, closing env")
    env.close()
    print("Finished training with ID: {}".format(ID))
else:
    env = env_id(params["env_list"], max_n_envs=1,
                 specific_env_len=70, s_len=150, walls=True,
                 target_vel=params["target_vel"],
                 use_contacts=params["use_contacts"])
    print("Testing")
    policy_name = "H02"  # LX3, 63W (tiles): joints + contacts + yaw
    policy_path = 'agents/SBL_{}'.format(policy_name)
import gym
import pandas as pd
from pandas.plotting import register_matplotlib_converters
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC
from env.cache_env import cache_env

register_matplotlib_converters()

df = pd.read_csv('./data/requests.csv')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: cache_env(df)])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=40000)
# model.save("SAC_new")

#%%
# load model
# model = SAC.load("SAC")

obs = env.reset()
for i in range(500):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
def train_initial_policy(
        model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ",
          "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low,
          env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low,
          env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env, training=True, norm_obs=True,
                           norm_reward=False, clip_reward=1e6)

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED,
                     )
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard, env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard, env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )
    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1)
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)
        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i],
                                          average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
def expert(obs):
    try:
        state = State(env_depth, env_width).load_obs(obs)
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)

# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# pretrain model
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()
reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1
    if done:
        print(reward_sum, i, reward_sum / i)
        reward_sum = 0
print('Environment created...')

# Check how many episodes the model has been trained for:
if os.path.exists(epFile):
    currentEp = read_file(epFile)
    print('Resuming training from episode {}'.format(currentEp))

# Check whether a log from a previous training run exists; load the model if it does
if currentEp <= maxEpisodes:
    if os.path.exists('logs/' + simName + '.txt'):
        print('Loading previous model...')
        model = SAC.load(simLogPath + simName, env,
                         tensorboard_log=tensorboardPath)
    else:
        print('Creating new model...')
        model = SAC(MlpPolicy, env, verbose=1,
                    tensorboard_log=tensorboardPath)
    model.learn(total_timesteps=timeStepsPerLoad, log_interval=1)
    print('Training finished...')
    model.save(simLogPath + 'models/' + simName)
    print('Model saved...')
    env.SaveAndQuit()

model = SAC.load(simName, env, tensorboard_log=tensorboardPath)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    if dones:
        env.reset()
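# `read_file` is a project helper not shown here. A plausible minimal sketch,
# assuming the episode counter is stored as a single integer in a text file
# (both the name and the file format are assumptions):
def read_file(path):
    with open(path) as f:
        return int(f.read().strip())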
def test_agent(agent_step):
    now = time.time()
    for coef_index in range(len(CLAC_COEFS)):
        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]
        training_timestep = 0

        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef,
                          verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])
        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])
        mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef,
                          coef_schedule=3.3e-3, verbose=1)

        for resample_step in range(0, NUM_RESAMPLES):
            features = pd.DataFrame()
            if agent_step == 1:
                print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ",
                      ENVIRONMENT_NAME, " ", FOLDER, " ", resample_step)

            # this CLAC/SAC fork returns (model, results) from learn()
            (clac_model, learning_results) = clac_model.learn(
                total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (sac_model, learning_results) = sac_model.learn(
                total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (mirl_model, learning_results) = mirl_model.learn(
                total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)

            # Save models (prefix each file with its own algorithm name so
            # the three saves do not collide)
            clac_model.save(FOLDER + "/Training/models/CLAC_" +
                            str(mut_coef).replace(".", "p") + "_" +
                            str(agent_step) + "_" + str(resample_step))
            sac_model.save(FOLDER + "/Training/models/SAC_" +
                           str(ent_coef).replace(".", "p") + "_" +
                           str(agent_step) + "_" + str(resample_step))
            mirl_model.save(FOLDER + "/Training/models/MIRL_" +
                            str(mut_coef).replace(".", "p") + "_" +
                            str(agent_step) + "_" + str(resample_step))

            training_timestep += NUM_TRAINING_STEPS

            # Test normal
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            # Test generalization
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            # Test generalization extreme
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            clac_env.env_method("reset_features")
            sac_env.env_method("reset_features")
            mirl_env.env_method("reset_features")

        del sac_model
        del sac_env
        del clac_model
        del clac_env
        del mirl_model
        del mirl_env

    later = time.time()
    difference = int(later - now)
    print("Tested Agent Time: ", difference)
import os

import gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC

# GPU isolation
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

env = gym.make('BipedalWalker-v2')
# env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1, learning_starts=1000)
# model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)
model.save("sac_bipedalwalker")
print("Model saved to sac_bipedalwalker")
del model  # remove to demonstrate saving and loading

for i in range(100):
    print("experiment id: ", i)
    model = SAC.load("sac_bipedalwalker", env=env,
                     tensorboard_log="./sac_bipedalwalker_tensorboard/")
    print("loaded")
    model.learn(total_timesteps=500000, log_interval=50)
    print("learned again")

obs = env.reset()
# while True:
import gym
import gym_turtlebot3
import rospy
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env_name = 'TurtleBot3_Circuit_Simple_Continuous-v0'
rospy.init_node(env_name.replace('-', '_'))
env = gym.make(env_name)
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=int(1e4), log_interval=10)
model.save(env_name)
    log.logger.warning(
        f"No model was found for version {model_name}. "
        f"Training a new model with name {model_name}."
    )
    mode = 'train'

if mode == 'train':
    env.reset()
    env.agg.case = 'rl_agg'
    model = SAC(LnMlpPolicy, env, learning_rate=0.03, verbose=1,
                tensorboard_log="tensorboard_logs")
    # note that the env won't record MPCCalc output for the training period
    model.learn(total_timesteps=5000, tb_log_name=model_name)
    model.save(model_name)

obs = env.reset()
env.agg.case = 'rl_agg'
for t in range(1, num_steps + 1):
    action, _state = model.predict(obs)
    obs, reward, done, info = env.step(action)
    if (t % checkpoint_interval == 0) or (t == num_steps):
        env.agg.write_outputs()

if 'dn' in run:
    env.agg.config['agg']['tou_enabled'] = False
    env.agg.config['agg']['base_price'] = 0.1
    env.agg._build_tou_price()
    env.agg.redis_add_all_data()
    for h in env.agg.all_homes_obj:
def objective(trial):
    kwargs = hyperparams.copy()
    trial.model_class = None
    kwargs.update(sample_sac_params(trial))

    def callback(_locals, _globals):
        """
        Callback for monitoring learning progress.
        :param _locals: (dict)
        :param _globals: (dict)
        :return: (bool) If False: stop training
        """
        self_ = _locals['self']
        trial = self_.trial

        # Initialize variables
        if not hasattr(self_, 'is_pruned'):
            self_.is_pruned = False
            self_.last_mean_test_reward = -np.inf
            self_.last_time_evaluated = 0
            self_.eval_idx = 0

        if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
            return True

        self_.last_time_evaluated = self_.num_timesteps

        # Evaluate the trained agent on the test env
        rewards = []
        n_steps_done, reward_sum = 0, 0.0

        # Sync the obs rms if using vecnormalize
        # NOTE: this does not cover all the possible cases
        if isinstance(self_.test_env, VecNormalize):
            self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
            self_.test_env.ret_rms = deepcopy(self_.env.ret_rms)
            # Do not normalize reward
            self_.test_env.norm_reward = False

        obs = self_.test_env.reset()
        while n_steps_done < n_test_steps:
            # Use default value for deterministic
            action, _ = self_.predict(obs)
            obs, reward, done, _ = self_.test_env.step(action)
            reward_sum += reward
            n_steps_done += 1
            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = self_.test_env.reset()
        rewards.append(reward_sum)
        mean_reward = np.mean(rewards)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag='evaluation', simple_value=mean_reward)
        ])
        _locals['writer'].add_summary(summary, self_.num_timesteps)

        self_.last_mean_test_reward = mean_reward
        self_.eval_idx += 1

        # report best or report current ?
        # report num_timesteps or elapsed time ?
        trial.report(-1 * mean_reward, self_.eval_idx)
        # Prune trial if needed
        if trial.should_prune(self_.eval_idx):
            self_.is_pruned = True
            return False

        return True

    commands = [[1, 0], [2, 0], [3, 0]]
    env = DummyVecEnv([
        lambda: e.AidaBulletEnv(commands,
                                render=False,
                                on_rack=False,
                                default_reward=2,
                                height_weight=5,
                                orientation_weight=3,
                                direction_weight=2,
                                speed_weight=4)
    ])
    model = SAC(MlpPolicy, env,
                gamma=kwargs['gamma'],
                learning_rate=kwargs['learning_rate'],
                batch_size=kwargs['batch_size'],
                buffer_size=kwargs['buffer_size'],
                learning_starts=kwargs['learning_starts'],
                train_freq=kwargs['train_freq'],
                gradient_steps=kwargs['gradient_steps'],
                ent_coef=kwargs['ent_coef'],
                target_entropy=kwargs['target_entropy'],
                policy_kwargs=kwargs['policy_kwargs'],
                tensorboard_log="./optimisationSAC/logOPTI")
    model.test_env = DummyVecEnv([
        lambda: e.AidaBulletEnv(commands,
                                render=False,
                                on_rack=False,
                                default_reward=2,
                                height_weight=5,
                                orientation_weight=3,
                                direction_weight=2,
                                speed_weight=4)
    ])
    model.trial = trial

    try:
        model.learn(n_timesteps, callback=callback,
                    tb_log_name="SAC_" + str(trial.number))
        # Free memory
        model.env.close()
        model.test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        # Free memory
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf
    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned
        cost = -1 * model.last_mean_test_reward

    try:
        os.mkdir("./optimisationSAC/resultats/" + str(trial.number))
    except FileExistsError:
        print("Directory already exists")
    model.save("./optimisationSAC/resultats/" + str(trial.number) + "/" +
               str(trial.number))

    del model.env, model.test_env
    del model

    if is_pruned:
        try:
            # Optuna >= 0.19.0
            raise optuna.exceptions.TrialPruned()
        except AttributeError:
            raise optuna.structs.TrialPruned()

    return cost
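# `sample_sac_params` is defined elsewhere in this project. A minimal sketch of
# what such an Optuna sampling function typically looks like; the exact ranges
# and choices below are assumptions, loosely following rl-baselines-zoo
# conventions, not the project's actual search space.
def sample_sac_params(trial):
    gamma = trial.suggest_categorical('gamma', [0.98, 0.99, 0.995])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1e-3)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])
    buffer_size = trial.suggest_categorical('buffer_size',
                                            [int(1e4), int(1e5), int(1e6)])
    learning_starts = trial.suggest_categorical('learning_starts', [1000, 10000])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100])
    ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.01, 0.1])
    net_arch = trial.suggest_categorical('net_arch', ['small', 'medium'])
    return {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'learning_starts': learning_starts,
        'train_freq': train_freq,
        'gradient_steps': train_freq,  # common convention: match train_freq
        'ent_coef': ent_coef,
        'target_entropy': 'auto',
        'policy_kwargs': dict(
            layers=[64, 64] if net_arch == 'small' else [256, 256]),
    }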
import os

import gym
import rlbench.gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC

dir_path = os.path.dirname(os.path.realpath(__file__))

env = gym.make("empty_container-state-v0", render_mode="human",
               observation_mode='vision')
model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=dir_path + '/Logs/')
model.learn(total_timesteps=1000)
model.save("sac_ec")
import gym
import pandas as pd
from pandas.plotting import register_matplotlib_converters
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC
from env.cache_env import cache_env

register_matplotlib_converters()

df = pd.read_csv('./data/requests.csv')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: cache_env(df)])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=40000)
model.save("SAC")

obs = env.reset()
for i in range(20000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
import gym
import numpy as np
import imageio
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)
model.save("../models/sac_pendulum")

del model  # remove to demonstrate saving and loading

model = SAC.load("../models/sac_pendulum")

# obs = env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME,
                         time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ",
          "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")
    constrained = False

    # define the environment here
    env = gym.make(env_name)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low,
          env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low,
          env.action_space.high)

    if TIMEWRAPPER:
        # env = TimeFeatureWrapper(env)
        env = TimeLimit(env, 1000)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(
            env,
            training=True,
            norm_obs=True,
            norm_reward=False,
            clip_reward=1e6,
        )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(
            CustomPolicy,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            ent_coef=args['ent_coef'],
            learning_starts=args['learning_starts'],
            learning_rate=args['learning_rate'],
            train_freq=args['train_freq'],
        )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(
            CustomPolicy2,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            gamma=args['gamma'],
            gradient_steps=args['gradient_steps'],
            learning_rate=args['learning_rate'],
            learning_starts=args['learning_starts'],
            action_noise=action_noise,
            train_freq=args['train_freq'],
        )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard,
                     env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'])
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(
            mlp_standard,
            env,
            n_steps=int(args['n_steps'] / env.num_envs),
            nminibatches=args['nminibatches'],
            lam=args['lam'],
            gamma=args['gamma'],
            ent_coef=args['ent_coef'],
            noptepochs=args['noptepochs'],
            learning_rate=args['learning_rate'],
            cliprange=args['cliprange'],
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
        )
    elif algo.__name__ == "TRPO_lagrangian":
        print('Initializing TRPO-lagrangian with safety-starter-agents '
              'hyperparameters .. ')
        model = TRPO_lagrangian(
            MLPWithSafeValue,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            timesteps_per_batch=args['timesteps_per_batch'],
            lam=args['lam'],
            max_kl=args['max_kl'],
            gamma=args['gamma'],
            vf_iters=args['vf_iters'],
            vf_stepsize=args['vf_stepsize'],
            entcoeff=args['entcoeff'],
            cg_damping=args['cg_damping'],
            cg_iters=args['cg_iters'],
            cost_lim=args['cost_lim'],
            penalty_init=args['penalty_init'],
            penalty_lr=args['penalty_lr'])
        constrained = True
    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(
            CustomPolicy,
            env,
            verbose=1,
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            ent_coef=args['ent_coef'],
            learning_starts=args['learning_starts'],
            learning_rate=args['learning_rate'],
            train_freq=args['train_freq'],
        )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    else:
        model.learn(
            total_timesteps=time_steps,
            tb_log_name=model_name.split('/')[-1],
            log_interval=10,
        )
        model.save(model_name)

    evaluate_policy_on_env(env, model, render=False, iters=10,
                           constrained=constrained)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "SAC":
    from stable_baselines.sac.policies import MlpPolicy
    from stable_baselines import SAC
    env = gym.make(env_name)
    model = SAC(MlpPolicy, env, verbose=1,
                tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_, log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "DDPG":
    if train:
        for i in range(model_num):
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
            from stable_baselines import DDPG
            env = gym.make(env_name)
            # the noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.5) * np.ones(n_actions))
    names, values = val[0], val[1]
    for i in range(len(values)):
        sql = ''' INSERT INTO parameters(simu, type, step, value)
                  VALUES(?,?,?,?) '''
        val = (model_name, names[i], 0, float(values[i]))
        cur.execute(sql, val)
        conn.commit()
    cur.close()
    conn.close()

for i in range(args.total_steps // args.save_every):
    model.learn(total_timesteps=args.save_every, tb_log_name=model_name,
                reset_num_timesteps=False, callback=callback)
    if normalize:
        env.save_running_average(workDirectory + "/resultats/" + model_name +
                                 "/normalizeData")
    model.save(workDirectory + "/resultats/" + model_name + "/" + model_name)
    os.system("python3 makegif.py --algo " + args.algo +
              " --dir ./server/assets/" + model_name + "_" +
              str((i + 1) * args.save_every) + "_steps.gif --name " + model_name)
    print("\n saved at " + str((i + 1) * args.save_every))

model.save(workDirectory + "/resultats/" + model_name + "/" + model_name)
if normalize:
    env.save_running_average(workDirectory + "/resultats/" + model_name +
                             "/normalizeData")

env = DummyVecEnv([lambda: e.AidaBulletEnv(
    commands,
    render=False,
    on_rack=False,
    default_reward=args.default_reward,
    height_weight=args.height_weight,
    orientation_weight=args.orientation_weight,
    direction_weight=args.direction_weight,
    speed_weight=args.speed_weight,
    mimic_weight=args.mimic_weight,