def main():
    global model, best_model_path, last_model_path, sim_joy

    mission = 'PushStonesEnv'  # Change according to algorithm
    env = gym.make(mission + '-v0').unwrapped

    # Create log and model dirs
    # dir = 'stable_bl/' + mission
    dir = 'stable_bl/PushMultipleStones'
    os.makedirs(dir + '/model_dir/sac', exist_ok=True)

    jobs = ['train', 'record', 'record-w/hm', 'BC_agent', 'play']
    job = jobs[1]
    pretrain = True

    if job == 'train':
        # create a new test_<k> folder, numbered after the highest existing index
        try:
            tests = os.listdir(dir + '/model_dir/sac')
            indexes = [int(item.split('_')[1]) for item in tests]
            k = 0 if not indexes else max(indexes) + 1
        except FileNotFoundError:
            os.makedirs(dir + '/log_dir/sac')
            k = 0
        model_dir = os.getcwd() + '/' + dir + '/model_dir/sac/test_{}'.format(k)
        best_model_path = model_dir
        last_model_path = model_dir
        log_dir = dir + '/log_dir/sac/test_{}'.format(k)
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])

        num_timesteps = int(1e6)
        policy_kwargs = dict(layers=[64, 64, 64])  # note: currently not passed to SAC below

        # SAC - start learning from scratch
        model = SAC(sac_MlpPolicy, env, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
                    learning_starts=0, train_freq=1, batch_size=64,
                    tau=0.01, ent_coef='auto', target_update_interval=1,
                    gradient_steps=1, target_entropy='auto', action_noise=None,
                    random_exploration=0.0, verbose=2, tensorboard_log=log_dir,
                    _init_setup_model=True, full_tensorboard_log=True,
                    seed=None, n_cpu_tf_sess=None)

        # Load best model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_rew = (model for model in models if 'rew' in model)
        # ind, reward = [], []
        # for model in models_rew:
        #     ind.append(model.split('_')[1])
        #     reward.append(model.split('_')[3])
        # best_reward = max(reward)
        # best_model_ind = reward.index(best_reward)
        # k = ind[best_model_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_rew_' + best_reward, env=env,
        #                  custom_objects=dict(learning_starts=0))

        # Load last saved model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_time = (model for model in models if 'rew' not in model)
        # ind, hour, min = [], [], []
        # for model in models_time:
        #     ind.append(model.split('_')[1])
        #     hour.append(model.split('_')[3])
        #     min.append(model.split('_')[4])
        # date = models_time[0].split('_')[2]
        # latest_hour = max(hour)
        # latest_hour_ind = [i for i, n in enumerate(hour) if n == latest_hour]
        # latest_min = max(min[latest_hour_ind])
        # latest_min_ind = min(latest_min)
        # k = ind[latest_min_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_' + date + '_' + latest_hour[0] + '_' + latest_min + 'zip',
        #                  env=env, custom_objects=dict(learning_starts=0))

        # model = SAC.load(dir + '/model_dir/sac/test_53_rew_24383.0',
        #                  env=env, tensorboard_log=log_dir,
        #                  custom_objects=dict(learning_starts=0, learning_rate=2e-4,
        #                                      train_freq=8, gradient_steps=4, target_update_interval=4))
        #                                      # batch_size=32))

        # pretrain
        if pretrain:
            # load dataset only once
            # expert_dataset('3_rocks_40_episodes')
            dataset = ExpertDataset(expert_path=(os.getcwd() + '/dataset.npz'), traj_limitation=-1)
            model.pretrain(dataset, n_epochs=2000)

            # Test the pre-trained model
            # env = model.get_env()
            # obs = env.reset()
            #
            # reward_sum = 0.0
            # for _ in range(1000):
            #     action, _ = model.predict(obs)
            #     obs, reward, done, _ = env.step(action)
            #     reward_sum += reward
            #     if done:
            #         print(reward_sum)
            #         reward_sum = 0.0
            #         obs = env.reset()
            #
            # env.close()

        # learn
        model.learn(total_timesteps=num_timesteps, callback=save_fn)

        # PPO1
        # model = PPO1(Common_MlpPolicy, env,
        #              gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
        #              optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95,
        #              adam_epsilon=1e-5, schedule='linear', verbose=0, tensorboard_log=None,
        #              _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False,
        #              seed=None, n_cpu_tf_sess=1)

        # TRPO
        # model = TRPO(MlpPolicy, env, timesteps_per_batch=4096, tensorboard_log=log_dir, verbose=1)
        # model.learn(total_timesteps=500000)
        # model.save(log_dir)

    elif job == 'record':
        mission = 'PushStonesHeatMapEnv'
        env = gym.make(mission + '-v0').unwrapped
        obs = []
        actions = []
        rewards = []
        dones = []
        episode_rewards = []
        num_episodes = 30

        listener = keyboard.Listener(on_press=on_press)
        listener.start()

        # a while loop (rather than "for episode in range(...)") so that episodes
        # cut short by the environment can be re-recorded without being counted
        episode = 0
        while episode < num_episodes:
            ob = env.reset()
            done = False
            print('Episode number ', episode + 1)
            episode_reward = 0
            while not done:
                act = "recording"
                # act = sim_joy
                # act = [0, 1, 0.5]
                new_ob, reward, done, info = env.step(act)
                # print(info['action'])
                # print(ob)
                if recorder_on:
                    obs.append(ob)
                    actions.append(info['action'])
                    rewards.append(reward)
                    dones.append(done)
                    episode_reward = episode_reward + reward
                ob = new_ob
            episode_rewards.append(episode_reward)
            # 'boarders' [sic] matches the string returned by the environment
            if info['reset reason'] == 'out of boarders' or info['reset reason'] == 'limit time steps':
                continue  # discard this episode and record a replacement
            print('saving data')
            data_saver(obs, actions, rewards, dones, episode_rewards)
            episode += 1

    elif job == 'play':
        # env = gym.make('PickUpEnv-v0')
        model = SAC.load(dir + '/model_dir/sac/test_25_25_14_15', env=env,
                         custom_objects=dict(learning_starts=0))  ### ADD NUM
        for _ in range(2):
            obs = env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
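
# `save_fn`, `on_press`, `recorder_on` and `data_saver` are defined elsewhere in
# this repo. For reference, below is a minimal sketch of what a checkpointing
# callback like `save_fn` could look like under stable-baselines 2, which calls
# callback(locals(), globals()) once per training step and stops when it returns
# False. The 10k-step cadence and 100-episode reward window are assumptions, not
# the repo's actual values.
best_mean_reward = -float('inf')

def save_fn_sketch(locals_, globals_):
    global best_mean_reward
    model_ = locals_['self']                      # the model being trained
    rewards = locals_.get('episode_rewards', [])  # maintained by SAC.learn()
    if rewards and model_.num_timesteps % 10000 == 0:
        window = rewards[-100:]
        mean_reward = sum(window) / len(window)
        model_.save(last_model_path)              # always overwrite the latest checkpoint
        if mean_reward > best_mean_reward:        # keep the best-so-far separately
            best_mean_reward = mean_reward
            model_.save(best_model_path + '_rew_' + str(round(mean_reward, 1)))
    return True                                   # returning False aborts training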
def expert(obs):
    try:
        state = State(env_depth, env_width).load_obs(obs)
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)


# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# pretrain model
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()
reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1
    if done:
        print(reward_sum, i, reward_sum / i)
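
# The 'expert.npz' consumed by ExpertDataset above follows stable-baselines'
# expert-data layout (the same one generate_expert_traj writes): arrays keyed
# 'obs', 'actions', 'rewards', 'episode_returns' and 'episode_starts'. A minimal
# sketch of a saver producing that layout from flat per-step lists, for
# illustration only (this is not this repo's data_saver):
import numpy as np

def save_expert_npz(path, obs, actions, rewards, dones, episode_returns):
    # step i starts a new episode exactly when step i-1 ended one
    episode_starts = [True] + list(dones[:-1])
    np.savez(path,
             obs=np.array(obs),
             actions=np.array(actions),
             rewards=np.array(rewards),
             episode_returns=np.array(episode_returns),
             episode_starts=np.array(episode_starts))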
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log, expert_path,
          pretrain, pretrain_epochs, mdpo_update_steps, num_trajectories, expert_model,
          exploration_bonus, bonus_coef, random_action_len, is_action_features, dir_name,
          neural, lipschitz, args):
    """
    Train an imitation-learning model (MDAL / GAIL variants) on a MuJoCo
    environment, for testing purposes.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()  # strip the '-vN' version suffix
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/' \
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)
        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        # dump the experiment arguments next to the logs
        args = args.__dict__
        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
            file.write("Experiment Arguments:\n")
            for key, val in args.items():
                print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)

        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        env = DummyVecEnv([make_env])  # every branch below expects `env` to exist
        # env = VecNormalize(env)

        do_train = algo == 'Train'
        do_eval = algo == 'Evaluate'

        if do_train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            if num_timesteps > 0:
                model = SAC('MlpPolicy', env_id, verbose=1, buffer_size=1000000,
                            batch_size=256, ent_coef='auto', train_freq=1, tau=0.01,
                            gradient_steps=1, learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif do_eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=10, evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10, verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/mdal/",
                                      seed=seed, buffer_size=1000000, ent_coef=0.0,
                                      learning_starts=10000, batch_size=256, tau=0.01,
                                      gamma=0.99, gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, d_step=10, tsallis_q=1,
                                      reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural, lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy', env, dataset, verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" + env_name + "/mdal_mdpo_on/",
                                     seed=seed, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                                     entcoeff=0.0, adversary_entcoeff=0.001, gamma=0.99,
                                     lam=0.95, vf_iters=5, vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps, klcoeff=1.0,
                                     method="multistep-SGD", tsallis_q=1.0,
                                     t_pi=t_pi, t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features, neural=neural)
            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy', env, dataset, verbose=1,
                                  tensorboard_log="./experiments/" + env_name + "/mdal_trpo/",
                                  seed=seed, gamma=0.99, g_step=3, d_step=5, sgd_steps=1,
                                  d_stepsize=9e-5, entcoeff=0.0, adversary_entcoeff=0.001,
                                  max_kl=t_pi, t_pi=t_pi, t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features, neural=neural,
                                  lam=0.98, timesteps_per_batch=2000, lipschitz=lipschitz)
            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL
                model = GAIL('MlpPolicy', env, dataset, verbose=1,
                             tensorboard_log="./experiments/" + env_name + "/gail/",
                             seed=seed, entcoeff=0.0, adversary_entcoeff=0.001,
                             lipschitz=lipschitz)
            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF
                model = GAIL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/gail_mdpo_off/",
                                      seed=seed, ent_coef=0.0, adversary_entcoeff=0.001,
                                      buffer_size=1000000, learning_starts=10000,
                                      batch_size=256, tau=0.01, gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, tsallis_q=1, reparameterize=True,
                                      t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
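
# train() expects an argparse-style namespace for `args` (it dumps args.__dict__
# to args.txt). A hypothetical driver is sketched below; the flag names and
# default values are illustrative assumptions, not this repo's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='Hopper-v2')
    parser.add_argument('--algo', default='MDAL')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    cli_args = parser.parse_args()

    train(env_id=cli_args.env, algo=cli_args.algo,
          num_timesteps=cli_args.num_timesteps, seed=cli_args.seed,
          sgd_steps=1, t_pi=0.5, t_c=0.1, lam=0.0, log=True,
          expert_path='hopper', pretrain=False, pretrain_epochs=0,
          mdpo_update_steps=1, num_trajectories=10, expert_model='sac_hopper',
          exploration_bonus=False, bonus_coef=0.0, random_action_len=0,
          is_action_features=True, dir_name='run0', neural=False,
          lipschitz=0.0, args=cli_args)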