def train(config, args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    import pybulletgym
    warnings.filterwarnings("ignore")

    eps_bounds = args.reacher_epsilon_bounds  # alias with a shorter variable name
    utils_object = utils.GeneralUtils(args)

    if args.tune_run:
        if args.prioritized_replay:
            args.alpha = float(config["alpha"])
            args.beta = float(config["beta"])
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
        elif args.custom_env and args.use_hindsight:
            eps_bounds = [float(config["epsilons"][0]), float(config["epsilons"][1])]
            args.seed = int(config["seed"])
        else:
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))

    if args.custom_env:
        gym.envs.register(
            id='OurReacher-v0',
            entry_point='our_reacher_env:OurReacherEnv',
            max_episode_steps=50,
            reward_threshold=100.0,
        )
        # This assumes epsilon is only used for the custom env or FetchReach,
        # where an episode lasts 50 timesteps.
        max_episode_steps = 50
        # Retrieve epsilon range [a, b]
        [a, b] = eps_bounds
        epsilons = utils_object.epsilon_calc(a, b, max_episode_steps)
        env = gym.make('OurReacher-v0', epsilon=epsilons[0], render=False)
    else:
        env = gym.make(args.env)

    if utils_object.fetch_reach and utils_object.args.fetch_reach_dense:
        env.env.reward_type = "dense"

    # Set seeds
    env.seed(int(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if utils_object.fetch_reach:
        state_dim = env.reset()["observation"].shape[0]
    else:
        state_dim = env.observation_space.shape[0]

    if args.use_hindsight:
        # Include both the current state and the goal state
        if args.custom_env:
            state_dim += 2   # reacher goal = (x, y)
        elif utils_object.fetch_reach:
            state_dim += 3   # FetchReach goal state = (x, y, z) position
        else:
            state_dim *= 2

    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["prioritized_replay"] = args.prioritized_replay
        kwargs["use_rank"] = args.use_rank
        kwargs["use_hindsight"] = args.use_hindsight
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    exp_descriptors = [
        args.policy,
        'CustomReacher' if args.custom_env else args.env,
        f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '',
        'HER' if args.use_hindsight else '',
        f"{args.decay_type}decay-eps{f'{eps_bounds[0]}-{eps_bounds[1]}' if eps_bounds[0] != eps_bounds[1] else f'{eps_bounds[0]}'}" if args.custom_env else "",
        f"k{args.k}",
        datetime.now().strftime('%Y%m%d%H%M')
    ]
    if args.tune_run:
        # Fudgy: assumes tune_run is only used for non-HER experiments
        exp_descriptors = [
            args.policy,
            'CustomReacher' if args.custom_env else args.env,
            f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '',
            f"tau{args.tau}",
            f"discount{args.discount}",
            f"alpha{args.alpha}" if args.prioritized_replay else '',
            f"beta{args.beta}" if args.prioritized_replay else '',
            f"k{args.k}",
            datetime.now().strftime('%Y%m%d%H%M')
        ]
    exp_descriptors = [x for x in exp_descriptors if len(x) > 0]

    file_name = "_".join(exp_descriptors)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    if args.prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(
            state_dim, action_dim, args.max_timesteps, args.start_timesteps,
            alpha=args.alpha, beta=args.beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed, utils_object=utils_object)]

    state, done = env.reset(), False
    original_episode_reward = 0
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    trajectory = []

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1

        x, goal = utils_object.compute_x_goal(state, env)

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(x))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        if args.use_hindsight:
            if utils_object.fetch_reach:
                goal = state["desired_goal"]
                next_x = np.concatenate([np.array(next_state["observation"]), goal])
            else:
                # env.set_goal(goal)
                next_x = np.concatenate([np.array(next_state), goal])
        elif utils_object.fetch_reach:
            next_x = np.array(next_state["observation"])
        else:
            next_x = next_state

        # Store data in replay buffer
        if not args.use_hindsight:
            replay_buffer.add(x, action, next_x, reward, done_bool)
        trajectory.append((state, action, next_state, reward, done_bool))

        state = next_state
        episode_reward += reward
        if args.custom_env:
            original_episode_reward += env.original_rewards

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            if args.use_hindsight:
                replay_buffer.add_hindsight(trajectory, goal, env, k=args.k,
                                            fetch_reach=utils_object.fetch_reach)
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Original Reward: {original_episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            original_episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            if args.custom_env:
                epsilon = epsilons[episode_num]
                env.set_epsilon(epsilon)
            trajectory = []

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaled_policy = eval_policy(policy, args.env, args.seed, utils_object=utils_object)
            evaluations.append(evaled_policy)
            np.save(f"./results/{file_name}", evaluations)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            if args.plot:
                plotter.plot(file_name, args.custom_env)
            if args.tune_run:
                tune.report(episode_reward_mean=evaled_policy[0])
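# The training loop above calls eval_policy(...) but does not define it. The helper below
# is a hedged, minimal sketch of what such an evaluation routine typically looks like; the
# goal handling via utils_object and the return shape (a list whose first entry is the mean
# return, matching the evaled_policy[0] used above) are assumptions, not this repository's
# actual implementation.
def eval_policy_sketch(policy, env_name, seed, eval_episodes=10, utils_object=None):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)  # offset the seed so evaluation episodes differ from training

    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            # If a goal-augmented input is needed (HER / FetchReach), utils_object would be
            # responsible for building it; here we assume the raw state is enough otherwise.
            x = state if utils_object is None else utils_object.compute_x_goal(state, eval_env)[0]
            action = policy.select_action(np.array(x))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes

    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    return [avg_reward]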
filename = args.filename

print("---------------------------------------")
print("Loading model from: %s" % filename)
print("---------------------------------------")

env = gym.make(args.env_name)
if args.visualize:
    env.render(mode="human")

# Set seeds
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action)

# Load model
policy.load(filename, './pytorch_models/')

# Start evaluation
_ = evaluate_policy(policy, eval_episodes=args.eval_episodes, visualize=args.visualize)
env = gym.make(args.env_name)

# Set seeds
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action)

# Set up wandb
wandb.init(project="TD3", config=args)
wandb.watch((policy.actor, policy.critic))

replay_buffer = utils.ReplayBuffer()

# Evaluate untrained policy
evaluations = [evaluate_policy(policy)]

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
def load_policy(load_from):
    # Initialize policy
    start_step = 0
    if args.policy == "TD3":
        import TD3
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * kwargs['max_action']
        kwargs["noise_clip"] = args.noise_clip * kwargs['max_action']
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        import OurDDPG
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        import DDPG
        policy = DDPG.DDPG(**kwargs)

    # Create experiment directory (may not be used)
    exp_cnt = 0
    load_model_path = ''
    results_dir = os.path.join(args.savedir, args.exp_name + '%02d' % exp_cnt)
    while os.path.exists(results_dir):
        exp_cnt += 1
        results_dir = os.path.join(args.savedir, args.exp_name + '%02d' % exp_cnt)

    # Load model if necessary
    if load_from != "":
        if os.path.isdir(load_from):
            print("loading latest model from dir: {}".format(load_from))
            # Find the latest checkpoint file
            search_path = os.path.join(load_from, '*.pt')
            model_files = glob(search_path)
            if not len(model_files):
                print('could not find model exp files at {}'.format(search_path))
                raise FileNotFoundError(search_path)
            else:
                load_model_path = sorted(model_files)[-1]
        else:
            load_model_path = load_from
        print("loading model from file: {}".format(load_model_path))
        policy.load(load_model_path)
        # TODO
        # utils.load_info_dict(load_model_base)
        try:
            start_step = int(load_model_path[-13:-3])
        except ValueError:
            try:
                start_step = policy.step
            except AttributeError:
                print('unable to get start step from name - set it manually')
        # Store in the old directory unless asked to continue in a new one
        if not args.continue_in_new_dir:
            results_dir = os.path.split(load_model_path)[0]
            print("continuing in loaded directory")
            print(results_dir)
        else:
            print("resuming in new directory")
            print(results_dir)
    else:
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        print('storing results in: {}'.format(results_dir))

    return policy, start_step, results_dir, load_model_path
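# Hedged usage sketch for load_policy above: this assumes the surrounding script has already
# built the module-level `args` and `kwargs` that load_policy closes over (it takes neither
# as a parameter), and that args.load_model is either '' for a fresh run, a single checkpoint
# file, or a directory containing *.pt checkpoints.
policy, start_step, results_dir, load_model_path = load_policy(args.load_model)
print("resuming from step {} in {}".format(start_step, results_dir))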
"action_dim": action_dim, "max_action": max_action, "discount": args.discount, "tau": args.tau, # "trained_model": "data_cube_5_trained_model_10_07_19_1749.pt" } # Initialize policy if args.policy_name == "TD3": # Target policy smoothing is scaled wrt the action scale kwargs["policy_noise"] = args.policy_noise * max_action kwargs["noise_clip"] = args.noise_clip * max_action kwargs["policy_freq"] = args.policy_freq policy = TD3.TD3(**kwargs) elif args.policy_name == "OurDDPG": policy = OurDDPG.DDPG(**kwargs) elif args.policy_name == "DDPG": policy = DDPG.DDPG(**kwargs) replay_buffer = utils.ReplayBuffer(state_dim, action_dim) # Add expert data into replay buffer from expert_data import generate_Data replay_buffer = generate_Data(env, 300, "random", replay_buffer) # Evaluate untrained policy evaluations = [eval_policy(policy, args.env_name, args.seed)] state, done = env.reset(), False episode_reward = 0 episode_timesteps = 0
eval_envs = 100 + args.eval_episodes
eval_env = make_vec_envs(args.env_name, eval_envs)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

if args.policy_name == 'TD3':
    args.swap_criterion = None

# Initialize policy
if args.policy_name == "TD3" or args.policy_name == 'TD3-swap':
    policy = TD3.TD3(state_dim, action_dim, 1, args.target_q, args.target_distance_weight)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, 1)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, 1)
else:
    raise NotImplementedError

replay_buffer = utils.ReplayBuffer()

total_timesteps = 0
total_timesteps_with_eval = 0
timesteps_since_eval = 0
timesteps_since_swapped = 0
episode_num = 0
done = True
episode_reward = 0
episode_timesteps = 0
transfer_model = None  # stays None when no transfer environment is given
transfer_state_dim = 0
transfer_action_dim = 0
if args.transfer_env is not None:
    transfer_model = args.policy_name + "_" + args.transfer_env + "_" + str(args.seed)
    env_t = gym.make(args.transfer_env)
    transfer_state_dim = env_t.observation_space.shape[0]
    transfer_action_dim = env_t.action_space.shape[0]

if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action, transfer_model,
                     transfer_state_dim, transfer_action_dim)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action, transfer_model,
                          transfer_state_dim, transfer_action_dim)

replay_buffer = utils.ReplayBuffer()

# Evaluate untrained policy
evaluations = [evaluate_policy(policy)]

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()

while total_timesteps < args.max_timesteps:
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": variant['discount'],
        "tau": variant['tau'],
        'network_class': NETWORK_CLASSES[variant['network_class']]
    }

    # Custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(
        n_hidden=variant['n_hidden'],
        hidden_dim=variant['hidden_dim'],
        fourier_dim=variant['fourier_dim'],
        sigma=variant['sigma'],
        concatenate_fourier=variant['concatenate_fourier'],
        train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])

    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise'] * max_action
        kwargs["noise_clip"] = variant['noise_clip'] * max_action
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [
            float(env.action_space.low.min()),
            float(env.action_space.high.max())
        ]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = args.critic_freq
            kwargs['fr_weight'] = args.fr_weight
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        policy_file = variant['load_model']
        # policy_file = file_name if variant['load_model'] == "default" else variant['load_model']
        policy.load(policy_file)

    replay_buffer = CustomReplayBuffer(state_dim, action_dim,
                                       max_size=int(variant['max_timesteps']))

    # Fill the replay buffer, saving immediately afterwards
    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    curr_time = datetime.now()

    for t in trange(int(variant['max_timesteps'])):
        episode_timesteps += 1
        action = policy.select_action(np.array(state), evaluate=False)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action)

        state = next_state
        episode_reward += reward

        if done or episode_timesteps > env._max_episode_steps:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

    # Save the replay buffer
    folder = os.path.dirname(policy_file)
    torch.save(replay_buffer, os.path.join(folder, 'generated_replay_buffer.pt'))
    assert replay_buffer.max_size == replay_buffer.size

    # Label the items in the replay buffer with the loaded Q-networks and policy
    with torch.no_grad():
        for start_idx in trange(0, replay_buffer.max_size, variant['batch_size']):
            end_idx = start_idx + variant['batch_size']
            obs = torch.tensor(replay_buffer.state[start_idx:end_idx],
                               device=DEVICE, dtype=torch.float32)
            action = torch.tensor(replay_buffer.action[start_idx:end_idx],
                                  device=DEVICE, dtype=torch.float32)
            actor_Q1, actor_Q2 = policy.critic(obs, action)
            actor_Q = torch.min(actor_Q1, actor_Q2)
            action = policy.actor(obs).mean.clamp(*policy.action_range)
            replay_buffer.set_values(start_idx, end_idx, to_np(actor_Q), to_np(action))

    # Overwrite the bad replay buffer
    torch.save(replay_buffer, os.path.join(folder, 'generated_replay_buffer.pt'))
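# The labelling loop above relies on a DEVICE constant and a to_np helper that are not shown
# in this excerpt. A hedged, minimal sketch of what they might look like (assumed definitions,
# not this repository's actual utilities):
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # assumed definition

def to_np(tensor):
    # Detach from the graph and move to CPU before handing the values to the replay buffer.
    return tensor.detach().cpu().numpy()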
env = gym.make(args.env_name)

# Set seeds
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action,
                     actor_lr=args.actor_lr, is_ro=args.is_ro)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action,
                          actor_lr=args.actor_lr, is_ro=args.is_ro)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action)
    # policy.load("good_start_from_ddpg", "models")

replay_buffer = utils.ReplayBuffer()

# Evaluate untrained policy
evaluations = [evaluate_policy(policy)]

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
collected_datas = []
def main(env_name, seed, algo, idx):
    # algo: TD3, DDPG, OurDDPG
    # seed: int
    # env_name: str
    class args:
        policy_name = "algo"
        env_name = "env_name"
        seed = 0
        start_timesteps = int(1e4)
        eval_freq = int(5e3)
        max_timesteps = int(1e6)
        save_models = True
        expl_noise = 0.1
        batch_size = 100
        discount = 0.99
        tau = 0.005
        policy_noise = 0.2
        noise_clip = 0.5
        policy_freq = 2

    args.policy_name = algo
    args.env_name = env_name
    args.seed = seed

    file_name = "%s-%s-seed-%s--reward.csv" % (args.policy_name, args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" %
                      (total_timesteps, episode_num, episode_timesteps, episode_reward))
                if args.policy_name == "TD3":
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, args.policy_noise,
                                 args.noise_clip, args.policy_freq)
                else:
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy))
                if args.save_models:
                    policy.save(file_name, directory="./pytorch_models")
                np.save("./results/%s" % (file_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy))
    if args.save_models:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)

    return True
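# Several of the scripts in this file construct utils.ReplayBuffer() with no arguments and
# then call replay_buffer.add((obs, new_obs, action, reward, done_bool)). The class below is
# a hedged, minimal sketch of a tuple-based buffer compatible with that calling convention;
# the class name, max_size default, and the sample() return order are assumptions modelled on
# the older TD3 reference implementation, not necessarily this project's utils module.
import numpy as np

class TupleReplayBufferSketch(object):
    def __init__(self, max_size=int(1e6)):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, data):
        # data = (obs, new_obs, action, reward, done_bool)
        if len(self.storage) == self.max_size:
            # Overwrite the oldest entry once the buffer is full (ring-buffer behaviour)
            self.storage[self.ptr] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        x, y, u, r, d = [], [], [], [], []
        for i in ind:
            X, Y, U, R, D = self.storage[i]
            x.append(np.array(X, copy=False))
            y.append(np.array(Y, copy=False))
            u.append(np.array(U, copy=False))
            r.append(np.array(R, copy=False))
            d.append(np.array(D, copy=False))
        return (np.array(x), np.array(y), np.array(u),
                np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1))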
def experiment(variant):
    from rlkit_logging import logger

    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": variant['discount'],
        "tau": variant['tau'],
        'network_class': NETWORK_CLASSES[variant['network_class']]
    }

    # Custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(
        n_hidden=variant['n_hidden'],
        hidden_dim=variant['hidden_dim'],
        fourier_dim=variant['fourier_dim'],
        sigma=variant['sigma'],
        concatenate_fourier=variant['concatenate_fourier'],
        train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])

    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise'] * max_action
        kwargs["noise_clip"] = variant['noise_clip'] * max_action
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [
            float(env.action_space.low.min()),
            float(env.action_space.high.max())
        ]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = args.critic_freq
            kwargs['fr_weight'] = args.fr_weight
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        policy_file = file_name if variant['load_model'] == "default" else variant['load_model']
        policy.load(f"./models/{policy_file}")

    # Change the kwargs for logging and plotting purposes
    kwargs['network_kwargs'] = {
        **mlp_network_kwargs,
        **dropout_mlp_network_kwargs,
        **fourier_network_kwargs,
        **siren_network_kwargs
    }
    kwargs['expID'] = variant['expID']
    kwargs['seed'] = variant['seed']
    kwargs['first_dim'] = max(variant['hidden_dim'], variant['first_dim'])
    kwargs['env'] = variant['env']

    # Set up logging
    # log_dir = create_env_folder(args.env, args.expID, args.policy, args.network_class, test=args.test)
    # save_kwargs(kwargs, log_dir)
    # tabular_log_path = osp.join(log_dir, 'progress.csv')
    # text_log_path = osp.join(log_dir, 'debug.log')
    # logger.add_text_output(text_log_path)
    # logger.add_tabular_output(tabular_log_path)
    # exp_name = f'{args.env}-td3-exp{args.expID}'
    # logger.push_prefix("[%s] " % exp_name)

    policy.save(osp.join(logger.get_snapshot_dir(), 'itr0'))

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, variant['env'], variant['seed'])]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    curr_time = datetime.now()

    for t in range(int(variant['max_timesteps'])):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < variant['start_timesteps']:
            action = env.action_space.sample()
        elif variant['policy'] in {'TD3', 'DDPG', 'OurDDPG'}:
            action = (policy.select_action(np.array(state), evaluate=False)
                      + np.random.normal(0, max_action * variant['expl_noise'],
                                         size=action_dim)).clip(-max_action, max_action)
        elif variant['policy'] in {
                'SAC', 'PytorchSAC', 'RandomNoisePytorchSAC',
                'SmoothedPytorchSAC', 'FuncRegPytorchSAC'
        }:
            action = policy.select_action(np.array(state), evaluate=False)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= variant['start_timesteps']:
            policy.train_mode(training=True)
            policy.train(replay_buffer, variant['batch_size'])
            policy.train_mode(training=False)

        if done or episode_timesteps > env._max_episode_steps:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % variant['eval_freq'] == 0:
            evaluations.append(eval_policy(policy, variant['env'], variant['seed']))
            new_time = datetime.now()
            time_elapsed = (new_time - curr_time).total_seconds()
            curr_time = new_time
            logger.record_tabular('Timestep', t)
            logger.record_tabular('Eval returns', evaluations[-1])
            logger.record_tabular('Time since last eval (s)', time_elapsed)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)

        if (t + 1) % 250000 == 0:
            policy.save(osp.join(logger.get_snapshot_dir(), f'itr{t + 1}'))

    policy.save(osp.join(logger.get_snapshot_dir(), 'final'))  # might be unnecessary if everything divides properly
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy_name", default="TD3")               # Policy name
    parser.add_argument("--env_name", default="Reacher-v2")
    parser.add_argument("--seed", default=0, type=int)                # Sets Gym, PyTorch and Numpy seeds
    parser.add_argument("--start_timesteps", default=1e3, type=int)   # How many time steps purely random policy is run for
    parser.add_argument("--eval_freq", default=5e3, type=float)       # How often (time steps) we evaluate
    parser.add_argument("--max_timesteps", default=1e6, type=float)   # Max time steps to run environment for
    parser.add_argument("--save_models", action="store_true")         # Whether or not models are saved
    parser.add_argument("--expl_noise", default=0.1, type=float)      # Std of Gaussian exploration noise
    parser.add_argument("--batch_size", default=100, type=int)        # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99, type=float)       # Discount factor
    parser.add_argument("--tau", default=0.005, type=float)           # Target network update rate
    parser.add_argument("--policy_noise", default=0.2, type=float)    # Noise added to target policy during critic update
    parser.add_argument("--noise_clip", default=0.5, type=float)      # Range to clip target policy noise
    parser.add_argument("--policy_freq", default=2, type=int)         # Frequency of delayed policy updates
    args = parser.parse_args()

    file_name = "%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    for dirname in ("./results", "./rewards"):
        os.makedirs(dirname, exist_ok=True)
    if args.save_models:
        os.makedirs("./pytorch_models", exist_ok=True)

    unity = UnityEnvironment(file_name=executable(), no_graphics=True)
    env = UnityWrapper(unity, train_mode=True)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space
    action_dim = env.action_space
    max_action = 1

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "mDDPG":
        policy = mDDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy).mean()]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    episode_timesteps = 0
    done = True
    rewards = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" %
                      (total_timesteps, episode_num, episode_timesteps, episode_reward))
                rewards.append(episode_reward)
                if args.policy_name == "TD3":
                    policy.train(
                        replay_buffer,
                        episode_timesteps,
                        args.batch_size,
                        args.discount,
                        args.tau,
                        args.policy_noise,
                        args.noise_clip,
                        args.policy_freq,
                    )
                else:
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy).mean())
                if args.save_models:
                    policy.save(file_name, directory="./pytorch_models")
                np.save("./results/%s" % (file_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space)).clip(
                        env.action_space_low, env.action_space_high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy, 100).mean())
    if args.save_models:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)
    np.save("./rewards/%s" % (file_name), rewards)