def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                policy_kwargs=policy_kwargs,
                **sac_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="sac",
                callback=checkpoint_callback)
    return
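# Hypothetical invocation of train_policy; all values below are illustrative
# assumptions, not taken from the source:
# train_policy(num_of_envs=1,
#              log_relative_path='logs/picking_sac',
#              maximum_episode_length=600,
#              skip_frame=3,
#              seed_num=0,
#              sac_config=dict(gamma=0.98, buffer_size=1000000),
#              total_time_steps=2000000,
#              validate_every_timesteps=500000,
#              task_name='picking')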
def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training from a saved agent
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,  # assumed module-level policy, e.g. 'MlpPolicy'
            env,
            # action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)
    # `n_timesteps` and `callback` are assumed to be defined at module level
    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
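# A minimal sketch of the module-level `callback` that train_SAC assumes,
# modeled on the legacy Stable Baselines 2 functional callback; the global
# names `best_mean_reward` and `n_steps` are assumptions, not from the source.
from stable_baselines.results_plotter import load_results, ts2xy

best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    global best_mean_reward, n_steps
    # Check the monitor logs every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(os.path.join(output_dir, 'log')),
                     'timesteps')
        if len(x) > 0:
            # Mean training reward over the last 100 episodes
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(os.path.join(output_dir,
                                                  'best_model.pkl'))
    n_steps += 1
    return True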
def test_predict_SAC():
    '''
    Visualize predictions from a random (untrained) policy.
    '''
    env = gym.make('KukaMujocoSAC-v0')
    model = SAC(SAC_MlpPolicy, env)
    obs = env.reset()
    while True:
        action, _ = model.predict(obs)
        # `render=True` relies on this custom env's extended step() signature
        obs, rew, done, info = env.step(action, render=True)
        if done:
            obs = env.reset()
def sac(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None):
    env = gym.make(env_id)
    model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)
    save_model_weights(model, "sac", env_id, policy, seed)
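# A minimal sketch of the `save_model_weights` helper the function above
# relies on; the directory layout and naming scheme are assumptions.
def save_model_weights(model, algo, env_id, policy, seed=None):
    os.makedirs("weights", exist_ok=True)
    suffix = "_{}".format(seed) if seed is not None else ""
    model.save(os.path.join("weights",
                            "{}_{}_{}{}".format(algo, env_id, policy, suffix)))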
def main(logdir):
    # params
    SLEEP_RATE = 100  # 100 Hz
    N_EPISODE = 1000
    EPISODE_TIME = 30
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir (note: this overrides the `logdir` argument)
    logdir = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2'
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # callbacks
    SAVE_FREQ = EPISODE_LENGTH * 20  # every 20 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # training got killed for some reason, so continue from the checkpoint
    model_path = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2/best_model.zip'
    model = SAC.load(model_path)
    model.set_env(env)

    print("---------- Start Learning -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)
    print("---------- Finish Learning ----------")
    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
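# A sketch of the `SaveOnBestTrainingRewardCallback` used above, following the
# callback example from the Stable Baselines 2 docs; treat it as an assumption
# about the original implementation.
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Mean reward over the last 100 episodes in monitor.csv
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True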
def main():
    env = gym.make("teaching-env-v0",
                   teacher_path=os.path.join(os.getcwd(), "../saved_models",
                                             sys.argv[1]),
                   validation_path=DATA_PATH,
                   max_queries=config.MAX_QUERIES)
    agent_model = SAC(MlpPolicy,
                      env,
                      train_freq=1,
                      batch_size=64,
                      learning_rate=3e-4,
                      learning_starts=0,
                      buffer_size=1000,
                      random_exploration=config.EPSILON_EXPLORATION,
                      gamma=config.GAMMA,
                      verbose=1)
    # agent_model.learn(total_timesteps=config.MAX_QUERIES * config.NUM_TRAIN_EPISODES)
    # agent_model.save('test_SAC')
    # SAC.load is a classmethod returning a new model, so its result must be
    # assigned; calling agent_model.load(...) alone would be a no-op
    agent_model = SAC.load('test_SAC', env=env)

    obs = env.reset()
    total_reward = float('-inf')
    prog = tqdm(range(config.MAX_QUERIES), postfix={'Reward': total_reward})
    actions = []  # For visualization
    total_reward = 0.0
    for i in prog:
        action = select_action(agent_model, obs,
                               epsilon=config.EPSILON_EXPLORATION)
        # action, _states = agent_model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        prog.set_postfix({'Reward': total_reward})
        actions.append(np.asscalar(action))

    plt.hist(actions, bins=config.NUM_BINS, range=(-5, 5), density=True)
    plt.savefig('./visualizations/histograms/SAC')
    plt.clf()

    # Plot the student's predicted function
    inputs = np.linspace(-5, 5, num=1000)
    outputs = env.student_model(inputs.reshape(-1, 1))
    plt.scatter(inputs, outputs, s=0.1, label='SAC')
    plt.title("SAC Student's Approximation")
    plt.ylim((-60, 100))
    plt.savefig('./visualizations/functions/SAC')
    plt.clf()
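# A plausible sketch of the `select_action` helper used above: epsilon-greedy
# exploration around the policy's own prediction. The original behavior is
# unknown; treat this as an illustrative assumption.
def select_action(model, obs, epsilon=0.1):
    if np.random.rand() < epsilon:
        # random exploratory action from the model's attached environment
        return model.env.action_space.sample()
    action, _states = model.predict(obs, deterministic=False)
    return action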
def rollout_policy(filename, traj_len, seed, env_name, n_trajs=1):
    model = SAC.load(filename)
    env = gym.make(env_name)
    env.seed(seed)
    trajs = []
    for _ in range(int(n_trajs)):
        obs_list, acts_list, rews_list = [], [], []
        obs = env.reset()
        obs_list.append(obs)
        for _ in range(traj_len):
            act = model.predict(obs, deterministic=True)[0]
            obs, r, done, _ = env.step(act)
            # assert not done
            acts_list.append(act)
            obs_list.append(obs)
            rews_list.append(r)
        infos = [{} for _ in range(traj_len)]
        traj = types.TrajectoryWithRew(
            obs=np.array(obs_list),
            acts=np.array(acts_list),
            infos=infos,
            rews=np.array(rews_list),
        )
        trajs.append(traj)
    return trajs
def get_new_weights():
    v_env = PortfolioEnv(settings['data_file'], settings['output_file'],
                         settings['strategy_name'], settings['total_steps'],
                         settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'],
                         settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'], settings['debug'])

    # Create the vectorized environment
    # v_env = DummyVecEnv([lambda: v_env])

    # Normalize environment
    # v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'],
    #                      norm_reward=settings['norm_reward'],
    #                      clip_obs=settings['clip_obs'],
    #                      clip_reward=settings['clip_reward'],
    #                      gamma=p_gamma, epsilon=EPS)

    model = SAC.load(MODELS_DIR + settings['model_name'])

    # Strategy
    obs = v_env.reset()
    dones = False
    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = v_env.step(action)
        # v_env.render(mode='ansi')

    weights = v_env.current_weights
    return weights
def load_model(path: str, env, desc: str):
    """
    Loads a model from a Stable Baselines checkpoint file into a memory
    representation.

    Args:
        path (str)   : Path to the Stable Baselines checkpoint file
        env (SB Env) : Environment to attach the loaded model to
        desc (str)   : Text description of which algorithm this is

    Returns:
        The loaded model
    """
    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]
    for s in seeds:
        # Load Models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]
        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]
        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)
            for i in range(50):
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)
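# A minimal sketch of the `prune_policy` helper assumed above. The original
# pruning criterion is unknown; this version randomly zeroes a `fraction` of
# each weight tensor, which would give the 50-trial loop above a different
# mask per call. `algo_name` is accepted only to match the call site.
def prune_policy(algo_name, params, fraction):
    pruned = {}
    for name, value in params.items():
        mask = np.random.rand(*np.shape(value)) >= fraction
        pruned[name] = np.asarray(value) * mask
    return pruned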
def example():
    # This tutorial shows how to view policies of trained actors
    task = generate_task(task_generator_id='picking')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 0
    stable_baselines_policy_path = "./model_2000000_steps.zip"
    model = SAC.load(stable_baselines_policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # Recording a video of the policy is done in one line
    viewer.record_video_of_policy(task=task,
                                  world_params=world_params,
                                  policy_fn=policy_fn,
                                  file_name="pushing_video",
                                  number_of_resets=10,
                                  max_time_steps=10 * 100)

    # Similarly for interactive visualization in the GUI
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40)
def evaluate_policy(policy_file, policy_type, envname, num_rollouts):
    if policy_type == "ppo":
        model = PPO2.load(policy_file)

        def get_action(obs):
            return model.predict(obs)[0]
    elif policy_type == "sac":
        model = SAC.load(policy_file)

        def get_action(obs):
            return model.predict(obs, deterministic=True)[0]
    else:
        raise NotImplementedError()

    env = gym.make(envname)
    returns = []
    for i in range(num_rollouts):
        # print("iter", i, end=" ")
        obs = env.reset()
        done = False
        totalr = 0.0
        while not done:
            action = get_action(obs)
            obs, r, done, _ = env.step(action)
            totalr += r
        returns.append(totalr)
    return np.mean(returns), np.std(returns)
def test_SAC(env, out_dir, seed=None, **kwargs):
    model = SAC.load(os.path.join(out_dir, 'final_model'), env=env)
    env.seed(seed)
    # Evaluate the trained agent
    mean_reward = evaluate(env, model, out_dir, num_episodes=20)
    return
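# A minimal sketch of the `evaluate` helper assumed by test_SAC: roll out the
# policy deterministically and average episode returns. The `out_dir` argument
# is kept only to match the call site; persisting results there is optional.
def evaluate(env, model, out_dir, num_episodes=20):
    returns = []
    for _ in range(num_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return float(np.mean(returns))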
def play():
    # `expDir`, `name` and `nIter` are assumed module-level globals; %s (not
    # %d) is needed since np.format_float_scientific returns a string
    model = SAC.load(expDir + "/%s/%s" % (name,
                                          np.format_float_scientific(nIter)))
    env = gym.make('PointMassDense-1-v1')
    obs = env.reset()  # obs must be initialized before the first predict()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='human')
def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])

    if isinstance(training_tag, float):
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag, verbose=1,
                    policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            env.reset()
            # SAC.learn returns the model itself, not a (model, results) tuple
            model = model.learn(total_timesteps=TRAINING_TIMESTEPS,
                                log_interval=100)
            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                           "_s" + str(step) + "_t" + str(file_tag) +
                           "_i" + str(CURRENT_ITERATION) +
                           "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                       "_t" + str(file_tag) +
                       "_i" + str(CURRENT_ITERATION) +
                       "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return
def __init__(self, env):
    self.env = env

    load_path_rl = "/home/icv/Trustworth/TiRL/models/sac-5"
    log_path_rl = "/home/icv/Trustworth/TiRL/data/sac-5"
    self.model_rl = SAC.load(load_path_rl, env=env,
                             tensorboard_log=log_path_rl)

    load_path_rule = "/home/icv/Trustworth/TiRL/models/sac_rule3"
    log_path_rule = "/home/icv/Trustworth/TiRL/data/sac_rule3"
    self.model_rule = SAC.load(load_path_rule, env=env,
                               tensorboard_log=log_path_rule)

    self.agent_rule = IDM(env)
    print("loaded models successfully")
    self.reset()
def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None

    # Some param noise for exploration (note: unused below; SAC in Stable
    # Baselines 2 does not accept parameter-space noise)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,
            # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)
    # `callback` is assumed to be defined at module level
    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
def sac(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None, load_weights=None):
    env = gym.make(env_id)
    if load_weights is not None:
        model = SAC.load(load_weights, env, verbose=0)
    else:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)
    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=callback)
def play(save_dir, env):
    model = SAC.load(save_dir + '/model_dir/sac/test_25_25_14_15',
                     env=env,
                     custom_objects=dict(learning_starts=0))  # ### ADD NUM
    for _ in range(2):
        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
def f_checkpoints_range_2_mean_performance(
        self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
    logging.debug(
        f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
    )
    rewards = np.zeros(len(checkpoints))
    s_rates = np.zeros(len(checkpoints))

    # Intent
    # - Iterate over this range, loading the associated Stable Baselines
    #   model checkpoint
    # - Pass that model to the `mean_eval` evaluation function, which
    #   evaluates the model on
    #   - a certain number of episodes
    #   - a certain env
    #   - continuous or not continuous space
    #   and returns reward and average success rate
    #
    # Evaluating N checkpoints on M queries, then averaging over M, finally
    # yields N rewards and N success rates
    j = 0
    # NOTE: i can range in any way, while j iterates over the numpy arrays
    for i in checkpoints:
        path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
        logging.debug(f"Evaluating model at {path}")
        if self.args.model['name'] == "ddpg":
            model = DDPG.load(path)
        elif self.args.model['name'] == "ppo":
            model = PPO2.load(path)
        elif self.args.model['name'] == "trpo":
            model = TRPO.load(path)
        elif self.args.model['name'] == "td3":
            model = TD3.load(path)
        elif self.args.model['name'] == "sac":
            model = SAC.load(path)
        else:
            # guard against `model` being unbound below
            raise ValueError(f"Unsupported model name {self.args.model['name']}")
        logging.debug(
            f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
        )
        rewards_list, success_rates_list = mean_eval(
            num_episodes=self.args.n_episodes,
            checkpoint_id=i,
            model=model,
            env=self.env,
            v=True,
            continuous=self.args.continuous,
            plots_dir=self.args.plots_dir)
        rewards_mean = np.mean(rewards_list)
        success_rates_mean = np.mean(success_rates_list)
        logging.debug(
            f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
        )
        rewards[j] = rewards_mean
        s_rates[j] = success_rates_mean
        j += 1
    return rewards, s_rates
def load_model(config):
    model = None
    # `args` is assumed to be a module-level dict of CLI arguments
    if config["algo_name"] == "TD3":
        model = TD3.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "A2C":
        model = A2C.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "SAC":
        model = SAC.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "PPO2":
        model = PPO2.load("agents/{}".format(args["test_agent_path"]))
    assert model is not None, "Alg name not found, cannot load model, exiting."
    return model
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7
    save_video_length = 200
    save_video_interval = 1000000

    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)

    exp_name = expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
def model_training_learning(env_train, model_name, timesteps=100000):
    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")
    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    else:
        raise ValueError("Unknown model name: " + model_name)
    print("Learning ", model_name, " time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("- ", model_name, " save finish :")
    print("Training time ", model_name, " : ", (end - start) / 60, " minutes")
    os.chdir("./..")
    os.chdir("./..")
    return model
def main(argv):
    fixed = True
    policy_name = "sac_reaching_policy"

    obj_pose_rnd_std = 0 if fixed else 0.05
    pandaenv = pandaReachGymEnv(renders=True,
                                use_IK=0,
                                numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]
    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy,
                pandaenv,
                gamma=0.9,
                batch_size=16,
                verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")
    model.learn(total_timesteps=1000000)
    model.save("../pybullet_logs/pandareach_sac/" + policy_name)

    del model  # remove to demonstrate saving and loading
def run_experiment(verbose, tensorboard_log, learning_rate):
    # pdb.set_trace()  # leftover debugging breakpoint, disabled
    # `num_objs`, `logger`, `save_video_interval`, `save_video_length`,
    # `nIter`, `expDir` and `name` are assumed module-level globals
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2  # note: unused; the noise below uses sigma=0.1
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
def explore(app, emulator, appium, timesteps, timer, save_policy, policy_dir,
            cycle, train_freq=5, target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy, env, verbose=1, train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
def load_model(model_path, params):
    env_cls = globals()[params['env']]
    orig_env = env_cls(**params['env_options'])
    env = DummyVecEnv([lambda: orig_env])
    if params['alg'] == 'PPO2':
        model = PPO2.load(model_path, env=env)
    elif params['alg'] == 'SAC':
        model = SAC.load(model_path, env=env)
    else:
        raise NotImplementedError
    return orig_env, model
def sac(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.1) * np.ones(n_actions))
    return SAC('MlpPolicy',
               env,
               learning_rate=0.001,
               action_noise=action_noise,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
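# Hypothetical usage of the factory above; the environment and step count are
# illustrative assumptions, not from the source:
# model = sac(gym.make('Pendulum-v0'), seed=0)
# model.learn(total_timesteps=100000)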
def get_SAC_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = SAC.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         verbose=1,
                         tensorboard_log=tb_path)
        # resume the timestep counter from the checkpoint
        model.num_timesteps = ckpt_step
    else:
        model = SAC(SACMlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)
    return model, env
def load_model(self):
    model_path = "data/saved_models/"
    if self.folder:  # was `if folder:`, which referenced an undefined name
        model_path = model_path + self.folder + "/"
    else:
        model_path = model_path + self.model_name + "/"
    model_path = model_path + self.model_name
    if self.episode:
        model_path = model_path + "_" + self.episode + ".pkl"
    self.model = SAC.load(model_path)