def test_predict_SAC():
    '''Visualize predictions from a randomly initialized (untrained) SAC policy.'''
    env = gym.make('KukaMujocoSAC-v0')
    model = SAC(SAC_MlpPolicy, env)
    obs = env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, rew, done, info = env.step(action, render=True)
        if done:
            # reset when an episode terminates so the loop keeps producing valid steps
            obs = env.reset()
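# Hedged sketch (assumption): gym.make can only resolve 'KukaMujocoSAC-v0' if the
# environment has been registered first. The entry_point and max_episode_steps below
# are hypothetical and only illustrate the registration pattern.
from gym.envs.registration import register

register(
    id='KukaMujocoSAC-v0',
    entry_point='kuka_mujoco.envs:KukaMujocoSACEnv',  # hypothetical module path
    max_episode_steps=1000,                           # hypothetical episode cap
)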
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('checkpoint_path', type=str, help='Path to checkpoint')
    parser.add_argument('--host', default="192.168.2.121", type=str,
                        help='IP address of the server (default is the Windows #2 machine)')
    parser.add_argument('--port', default=9090, type=int,
                        help='Port that should be used to connect to the server')
    parser.add_argument('--use_coord', action="store_true",
                        help=('If set, the environment\'s observation space will be '
                              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # gym.make forwards keyword arguments to the environment constructor
    env = gym.make('insertion-v0',
                   host=args.host,
                   port=args.port,
                   use_coord=args.use_coord)
    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")

    if args.use_coord:
        model = SAC('MlpPolicy', env, verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy', env, verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    # SAC.load is a classmethod that returns the restored model, so the result must be
    # assigned back; calling model.load(...) alone would discard the loaded weights.
    model = SAC.load(args.checkpoint_path, env=env)

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
"death_rate": 0.0, } for i in range(num_of_paths): # Path storage buckets episode_path = { "s": [], "r": [], "s_": [], "state_of_interest": [], "reference": [], } # while not dones[0]: s = env.reset() for j in range(max_ep_steps): action, _states = model.predict(s) s_, rewards, dones, infos = env.step(action) # Store observations episode_path["s"].append(s) episode_path["r"].append(rewards) episode_path["s_"].append(s_) info = infos[0] if "state_of_interest" in info.keys(): episode_path["state_of_interest"].append( np.array([info["state_of_interest"]])) if "reference" in info.keys(): episode_path["reference"].append(np.array(info["reference"])) # Terminate if max step has been reached if j == (max_ep_steps - 1): dones[0] = True
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)

# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# Pretrain model on the recorded expert trajectories
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()
reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1
    if done:
        print(reward_sum, i, reward_sum / i)
        reward_sum = 0
        i = 0
        obs = env.reset()
env.close()
def main():
    global model, best_model_path, last_model_path, sim_joy

    mission = 'PushStonesEnv'  # Change according to algorithm
    env = gym.make(mission + '-v0').unwrapped

    # Create log and model dir
    # dir = 'stable_bl/' + mission
    dir = 'stable_bl/PushMultipleStones'
    os.makedirs(dir + '/model_dir/sac', exist_ok=True)

    jobs = ['train', 'record', 'record-w/hm', 'BC_agent', 'play']
    job = jobs[1]
    pretrain = True

    if job == 'train':
        # create new folder
        try:
            tests = os.listdir(dir + '/model_dir/sac')
            indexes = []
            for item in tests:
                indexes.append(int(item.split('_')[1]))
            if not bool(indexes):
                k = 0
            else:
                k = max(indexes) + 1
        except FileNotFoundError:
            os.makedirs(dir + '/log_dir/sac')
            k = 0

        model_dir = os.getcwd() + '/' + dir + '/model_dir/sac/test_{}'.format(str(k))
        best_model_path = model_dir
        last_model_path = model_dir
        log_dir = dir + '/log_dir/sac/test_{}'.format(str(k))
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])

        num_timesteps = int(1e6)
        policy_kwargs = dict(layers=[64, 64, 64])

        # SAC - start learning from scratch
        model = SAC(sac_MlpPolicy, env, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
                    learning_starts=0, train_freq=1, batch_size=64, tau=0.01, ent_coef='auto',
                    target_update_interval=1, gradient_steps=1, target_entropy='auto',
                    action_noise=None, random_exploration=0.0, verbose=2,
                    tensorboard_log=log_dir, _init_setup_model=True, full_tensorboard_log=True,
                    seed=None, n_cpu_tf_sess=None)

        # Load best model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_rew = (model for model in models if 'rew' in model)
        # ind, reward = [], []
        # for model in models_rew:
        #     ind.append(model.split('_')[1])
        #     reward.append(model.split('_')[3])
        # best_reward = max(reward)
        # best_model_ind = reward.index(best_reward)
        # k = ind[best_model_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_rew_' + best_reward, env=env,
        #                  custom_objects=dict(learning_starts=0))

        # Load last saved model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_time = (model for model in models if 'rew' not in model)
        # ind, hour, min = [], [], []
        # for model in models_time:
        #     ind.append(model.split('_')[1])
        #     hour.append(model.split('_')[3])
        #     min.append(model.split('_')[4])
        # date = models_time[0].split('_')[2]
        # latest_hour = max(hour)
        # latest_hour_ind = [i for i, n in enumerate(hour) if n == latest_hour]
        # latest_min = max(min[latest_hour_ind])
        # latest_min_ind = min(latest_min)
        # k = ind[latest_min_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_' + date + '_' + latest_hour[0] + '_' + latest_min + 'zip',
        #                  env=env, custom_objects=dict(learning_starts=0))

        # model = SAC.load(dir + '/model_dir/sac/test_53_rew_24383.0',
        #                  env=env, tensorboard_log=log_dir,
        #                  custom_objects=dict(learning_starts=0, learning_rate=2e-4,
        #                                      train_freq=8, gradient_steps=4, target_update_interval=4))
        #                                      # batch_size=32))

        # pretrain
        if pretrain:
            # load dataset only once
            # expert_dataset('3_rocks_40_episodes')
            dataset = ExpertDataset(expert_path=(os.getcwd() + '/dataset.npz'), traj_limitation=-1)
            model.pretrain(dataset, n_epochs=2000)

            # Test the pre-trained model
            # env = model.get_env()
            # obs = env.reset()
            #
            # reward_sum = 0.0
            # for _ in range(1000):
            #     action, _ = model.predict(obs)
            #     obs, reward, done, _ = env.step(action)
            #     reward_sum += reward
            #     if done:
            #         print(reward_sum)
            #         reward_sum = 0.0
            #         obs = env.reset()
            #
            # env.close()

        # learn
        model.learn(total_timesteps=num_timesteps, callback=save_fn)

        # PPO1
        # model = PPO1(Common_MlpPolicy, env,
        #              gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
        #              optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5,
        #              schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
        #              policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)

        # TRPO
        # model = TRPO(MlpPolicy, env, timesteps_per_batch=4096, tensorboard_log=log_dir, verbose=1)
        # model.learn(total_timesteps=500000)
        # model.save(log_dir)

    elif job == 'record':
        mission = 'PushStonesHeatMapEnv'
        env = gym.make(mission + '-v0').unwrapped

        obs = []
        actions = []
        rewards = []
        dones = []
        episode_rewards = []
        num_episodes = 30

        listener = keyboard.Listener(on_press=on_press)
        listener.start()

        for episode in range(num_episodes):
            ob = env.reset()
            done = False
            print('Episode number ', episode + 1)
            episode_reward = 0

            while not done:
                act = "recording"
                # act = sim_joy
                # act = [0, 1, 0.5]
                new_ob, reward, done, info = env.step(act)
                # print(info['action'])
                # print(ob)
                if recorder_on:
                    obs.append(ob)
                    actions.append(info['action'])
                    rewards.append(reward)
                    dones.append(done)
                    episode_reward = episode_reward + reward
                ob = new_ob

            episode_rewards.append(episode_reward)
            if info['reset reason'] == 'out of boarders' or info['reset reason'] == 'limit time steps':
                episode -= 1
            else:
                print('saving data')
                data_saver(obs, actions, rewards, dones, episode_rewards)

    elif job == 'play':
        # env = gym.make('PickUpEnv-v0')
        model = SAC.load(dir + '/model_dir/sac/test_25_25_14_15', env=env,
                         custom_objects=dict(learning_starts=0))  ### ADD NUM

        for _ in range(2):
            obs = env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
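# Hedged sketch (assumption): save_fn, passed to model.learn(callback=save_fn) above,
# is not shown in this snippet. In stable-baselines 2 a callback is a callable that
# receives the training loop's local and global namespaces and returns True to keep
# training. The reward bookkeeping below is illustrative only; it reuses the
# best_model_path / last_model_path globals set in main().
import numpy as np

best_reward = -np.inf

def save_fn(_locals, _globals):
    global best_reward
    model_ = _locals['self']                        # the SAC instance being trained
    ep_rewards = _locals.get('episode_rewards', [])
    if len(ep_rewards) > 1:                         # at least one finished episode
        last_reward = ep_rewards[-2]
        model_.save(last_model_path)                # always keep the latest weights
        if last_reward > best_reward:               # also keep the best-scoring model
            best_reward = last_reward
            model_.save(best_model_path + '_rew_{}'.format(last_reward))
    return True                                     # returning False would stop training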
if mode == 'train':
    env.reset()
    env.agg.case = 'rl_agg'
    model = SAC(LnMlpPolicy, env, learning_rate=0.03, verbose=1,
                tensorboard_log="tensorboard_logs")
    # note that the env won't record MPCCalc output for the training period
    model.learn(total_timesteps=5000, tb_log_name=model_name)
    model.save(model_name)

obs = env.reset()
env.agg.case = 'rl_agg'
for t in range(1, num_steps + 1):
    action, _state = model.predict(obs)
    obs, reward, done, info = env.step(action)
    if (t % checkpoint_interval == 0) or (t == num_steps):
        env.agg.write_outputs()

if 'dn' in run:
    env.agg.config['agg']['tou_enabled'] = False
    env.agg.config['agg']['base_price'] = 0.1
    env.agg._build_tou_price()
    env.agg.redis_add_all_data()
    for h in env.agg.all_homes_obj:
        h.initialize_environmental_variables()
    obs = env.reset()
    env.agg.case = 'baseline'
                  max_n_envs=1,
                  specific_env_len=70,
                  s_len=150,
                  walls=True,
                  target_vel=params["target_vel"],
                  use_contacts=params["use_contacts"])

print("Testing")
policy_name = "H02"  # LX3, 63W (tiles): joints + contacts + yaw
policy_path = 'agents/SBL_{}'.format(policy_name)
print("Loading policy from: {}".format(policy_path))
model = SAC.load(policy_path)

obs = env.reset()
for _ in range(100):
    cum_rew = 0
    t1 = time.time()
    for i in range(800):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action, render=True)
        cum_rew += reward
        # env.render()
        if done:
            t2 = time.time()
            print("Time taken for episode: {}".format(t2 - t1))
            obs = env.reset()
            print(cum_rew)
            break
env.close()
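# Hedged sketch (assumption): with stable-baselines >= 2.10 the hand-written rollout
# loop above can be replaced by the built-in evaluation helper. Note that it calls
# env.step(action) without the custom render=True flag used above.
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10,
                                          deterministic=True)
print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))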
import gym
import numpy as np

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

from env import OsmoEnv

if __name__ == "__main__":
    env = DummyVecEnv([lambda: OsmoEnv()])
    model = SAC(MlpPolicy, env, verbose=1, learning_rate=1e-4)
    model.learn(total_timesteps=30000)
    model.save("SAC_baselines")

    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, _, done, info = env.step(action)
        else:
            # the while/else branch runs once the episode ends (there is no break),
            # so the final info dict of each episode is printed
            print(info)
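    # Hedged sketch: reload the checkpoint saved above ("SAC_baselines.zip") for later
    # evaluation without retraining; passing an env to SAC.load is only required when
    # training is to be continued.
    trained = SAC.load("SAC_baselines")
    observation = OsmoEnv().reset()
    action, _ = trained.predict(observation, deterministic=True)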
class SAC_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1, save='saves/m1'):
        env_id = "default"
        num_e = 32  # Number of processes to use
        # Create the vectorized environment
        # env = DummyVecEnv([lambda: env])  # Ramona
        # self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        env = Template_Gym()
        self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        # self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./test6")
        self.model = SAC(CustomPolicy_sac, self.env, verbose=1, learning_rate=1e-5,
                         tensorboard_log="./m1lstm1")
        # self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")

        n_timesteps = int(n_timesteps * save_fraction)
        training_loop = int(1 / save_fraction)
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))

    def evaluate(self, num_env=32, num_steps=50, load="saves/defaultlstmday", runs=10):
        """
        Evaluate a RL agent.

        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
        # self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default")
        self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)

        for i in range(runs):
            # note: this loads PPO2 checkpoints; models saved by train() above would need SAC.load
            self.model = PPO2.load(load + str(i), self.env, policy=CustomPolicy_2,
                                   tensorboard_log="./default/")
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            # self.total_pips = []
            obs = self.env.reset()
            for i in range(num_steps):
                # _states are only useful when using LSTM policies
                actions, _states = self.model.predict(obs)
                # here, actions, rewards and dones are arrays
                # because we are using a vectorized env
                obs, rewards, dones, info = self.env.step(actions)
                # self.total_pips.append(self.env.player.placement)

                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)

            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])

            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
            policy_kwargs={
                'layers': [64, 64],
                'reg_weight': 1e-32
            })
model.learn(total_timesteps=100000, log_interval=10)

obs, act = [], []
nb_rollouts, nb_steps = 25, 200
for n in range(nb_rollouts):
    _obs = np.empty((nb_steps, dm_obs))
    _act = np.empty((nb_steps, dm_act))

    x = env.reset()
    for t in range(nb_steps):
        u, _ = model.predict(x)
        _obs[t, :], _act[t, :] = x, u
        u = np.clip(u, -ulim, ulim)
        x, r, _, _ = env.step(u)

    obs.append(_obs)
    act.append(_act)

import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=dm_obs + dm_act, figsize=(12, 4))
for _obs, _act in zip(obs, act):
    for k, col in enumerate(ax[:-1]):
        col.plot(_obs[:, k])
    ax[-1].plot(_act)
plt.show()