def __init__(self, agent_dict={}, model_dict={}):
    """
    Initialize Agent object

    Params
    ======
        agent_dict(dict): dictionary containing parameters for the agent
        model_dict(dict): dictionary containing parameters for the agent's model
    """
    if agent_dict.get("enable_gpu", False):
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        self.device = torch.device("cpu")

    self.num_episodes = agent_dict.get("num_episodes", 10000)
    self.save_after = agent_dict.get("save_after", -1)
    self.name = agent_dict.get("name", "banana_collector")
    self.gamma = agent_dict.get("gamma", 0.9)
    self.epsilon = agent_dict.get("epsilon_start", 1.0)
    self.epsilon_decay = agent_dict.get("epsilon_decay", 0.9)
    self.epsilon_min = agent_dict.get("epsilon_min", 0.1)
    self.tau = agent_dict.get("tau", 0.1)
    self.num_replays = agent_dict.get("num_replays", 1)
    self.criterion = nn.MSELoss()

    memory_size = agent_dict.get("memory_size", 2**14)
    batchsize = agent_dict.get("batchsize", 2**10)
    replay_reg = agent_dict.get("replay_reg", 0.0)
    self.replay_buffer = utils.PrioritizedReplayBuffer(memory_size,
                                                       batchsize,
                                                       epsilon=replay_reg)

    self.decision_model = model.Model(model_dict).to(self.device)
    self.policy_model = model.Model(model_dict).to(self.device)
    self.optimizer = optim.Adam(self.decision_model.parameters(), lr=1E-3)
    utils.copy_model(self.decision_model, self.policy_model, tau=1.0)

    seed = agent_dict.get("seed", 0)
    torch.manual_seed(seed)
    np.random.seed(seed)
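# utils.copy_model() is defined elsewhere in the repo; a minimal sketch of the soft
# (Polyak-averaging) update it is expected to perform, assuming the same
# (source, target, tau) argument order -- with tau=1.0 this reduces to a hard copy:
def soft_update(source_model, target_model, tau):
    """Blend source parameters into target: target <- tau*source + (1-tau)*target."""
    for src, tgt in zip(source_model.parameters(), target_model.parameters()):
        tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)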
def train(config, start_timesteps, max_timesteps, policy_noise, expl_noise,
          noise_clip, policy_freq, batch_size, seed, policy,
          prioritized_replay, env_name, eval_freq, discount, tau, use_rank):
    if prioritized_replay:
        alpha = float(config["alpha"])
        beta = float(config["beta"])
    else:
        discount = float(config["discount"])
        tau = float(config["tau"])

    import pybulletgym
    warnings.filterwarnings("ignore")

    env = gym.make(env_name)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": discount,
        "tau": tau,
    }

    # Initialize policy
    if policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = policy_noise * max_action
        kwargs["noise_clip"] = noise_clip * max_action
        kwargs["policy_freq"] = policy_freq
        kwargs["prioritized_replay"] = prioritized_replay
        kwargs["use_rank"] = use_rank
        policy = TD3.TD3(**kwargs)
    elif policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      max_timesteps,
                                                      start_timesteps,
                                                      alpha=alpha, beta=beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * expl_noise, size=action_dim)).clip(
                    -max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            avg_reward = eval_policy(policy, env_name, seed)
            tune.report(episode_reward_mean=avg_reward)
            evaluations.append(avg_reward)
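# A hedged sketch of how the train() function above could be launched as a Ray Tune
# trainable (it already calls tune.report). The search space, environment name, and
# sample budget below are illustrative assumptions, not the project's actual setup:
from functools import partial

from ray import tune

search_space = {"alpha": tune.uniform(0.4, 0.8), "beta": tune.uniform(0.4, 1.0)}
trainable = partial(
    train,
    start_timesteps=25_000, max_timesteps=1_000_000,
    policy_noise=0.2, expl_noise=0.1, noise_clip=0.5, policy_freq=2,
    batch_size=256, seed=0, policy="TD3", prioritized_replay=True,
    env_name="HalfCheetahPyBulletEnv-v0", eval_freq=5_000,
    discount=0.99, tau=0.005, use_rank=False,
)
tune.run(trainable, config=search_space, num_samples=4)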
# state_dim = env.observation_space.shape[0]
state_dim = env.observation_space.shape
action_dim = env.action_space.n

# Initialize policy
if args.policy == "SAC":
    policy = SAC.SAC(state_dim, action_dim, max_action)
elif args.policy == "DDPG":
    policy = DDPG(state_dim, action_dim, max_action)
elif args.policy == "DQN":
    policy = DQN(state_dim, action_dim)
else:
    raise ValueError("invalid policy {}".format(args.policy))

replay_buffer = utils.PrioritizedReplayBuffer()

# Evaluate untrained policy
evaluations = [evaluate_policy(policy)]

timesteps_since_eval = 0
episode_num = 0
done = False
episode_reward = 0
episode_timesteps = 0

checkpoint = tf.train.Checkpoint(policy=policy)
checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                directory=args.parameter,
                                                max_to_keep=5)
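# A brief sketch of how the CheckpointManager above is typically used: restore the
# latest checkpoint (if any) before training, then save periodically inside the loop.
# The 10_000-step interval is an illustrative assumption:
if checkpoint_manager.latest_checkpoint:
    checkpoint.restore(checkpoint_manager.latest_checkpoint)

# inside the training loop:
#     if (total_timesteps + 1) % 10_000 == 0:
#         checkpoint_manager.save(checkpoint_number=total_timesteps + 1)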
def train(config, args):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    import pybulletgym
    warnings.filterwarnings("ignore")

    eps_bounds = args.reacher_epsilon_bounds  # just aliasing with shorter variable name
    utils_object = utils.GeneralUtils(args)

    if args.tune_run:
        if args.prioritized_replay:
            args.alpha = float(config["alpha"])
            args.beta = float(config["beta"])
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
        elif args.custom_env and args.use_hindsight:
            eps_bounds = [float(config["epsilons"][0]),
                          float(config["epsilons"][1])]
            args.seed = int(config["seed"])
        else:
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))

    if args.custom_env:
        gym.envs.register(
            id='OurReacher-v0',
            entry_point='our_reacher_env:OurReacherEnv',
            max_episode_steps=50,
            reward_threshold=100.0,
        )
        # this is assuming we only use epsilon for custom env or fetch reach, where episode tsteps is 50 !!!!
        max_episode_steps = 50
        # retrieve epsilon range
        [a, b] = eps_bounds
        epsilons = utils_object.epsilon_calc(a, b, max_episode_steps)
        env = gym.make('OurReacher-v0', epsilon=epsilons[0], render=False)
    else:
        env = gym.make(args.env)

    if utils_object.fetch_reach and utils_object.args.fetch_reach_dense:
        env.env.reward_type = "dense"

    # Set seeds
    env.seed(int(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if utils_object.fetch_reach:
        state_dim = env.reset()["observation"].shape[0]
    else:
        state_dim = env.observation_space.shape[0]

    if args.use_hindsight:
        # include both current state and goal state
        if args.custom_env:
            state_dim += 2   # reacher nonsense; goal = (x, y)
        elif utils_object.fetch_reach:
            state_dim += 3   # include fetchreach goal state (x,y,z position)
        else:
            state_dim *= 2

    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["prioritized_replay"] = args.prioritized_replay
        kwargs["use_rank"] = args.use_rank
        kwargs["use_hindsight"] = args.use_hindsight
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    exp_descriptors = [
        args.policy,
        'CustomReacher' if args.custom_env else args.env,
        f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '',
        'HER' if args.use_hindsight else '',
        f"{args.decay_type}decay-eps{f'{eps_bounds[0]}-{eps_bounds[1]}' if eps_bounds[0] != eps_bounds[1] else f'{eps_bounds[0]}'}" if args.custom_env else "",
        f"k{args.k}",
        datetime.now().strftime('%Y%m%d%H%M')
    ]

    if args.tune_run:
        # fudgy: assumes tune_run for non-HER experiments
        exp_descriptors = [
            args.policy,
            'CustomReacher' if args.custom_env else args.env,
            f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '',
            f"tau{args.tau}",
            f"discount{args.discount}",
            f"alpha{args.alpha}" if args.prioritized_replay else '',
            f"beta{args.beta}" if args.prioritized_replay else '',
            f"k{args.k}",
            datetime.now().strftime('%Y%m%d%H%M')
        ]

    exp_descriptors = [x for x in exp_descriptors if len(x) > 0]
    file_name = "_".join(exp_descriptors)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    if args.prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      args.max_timesteps,
                                                      args.start_timesteps,
                                                      alpha=args.alpha,
                                                      beta=args.beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed, utils_object=utils_object)]

    state, done = env.reset(), False
    original_episode_reward = 0
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    trajectory = []

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1

        x, goal = utils_object.compute_x_goal(state, env)

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(x))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        if args.use_hindsight:
            if utils_object.fetch_reach:
                goal = state["desired_goal"]
                next_x = np.concatenate([np.array(next_state["observation"]), goal])
            else:
                # env.set_goal(goal)
                next_x = np.concatenate([np.array(next_state), goal])
        elif utils_object.fetch_reach:
            next_x = np.array(next_state["observation"])
        else:
            next_x = next_state

        # Store data in replay buffer
        if not args.use_hindsight:
            replay_buffer.add(x, action, next_x, reward, done_bool)
        trajectory.append((state, action, next_state, reward, done_bool))

        state = next_state
        episode_reward += reward
        if args.custom_env:
            original_episode_reward += env.original_rewards

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            if args.use_hindsight:
                replay_buffer.add_hindsight(trajectory, goal, env, k=args.k,
                                            fetch_reach=utils_object.fetch_reach)
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Original Reward: {original_episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            original_episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            if args.custom_env:
                epsilon = epsilons[episode_num]
                env.set_epsilon(epsilon)
            trajectory = []

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaled_policy = eval_policy(policy, args.env, args.seed, utils_object=utils_object)
            evaluations.append(evaled_policy)
            np.save(f"./results/{file_name}", evaluations)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            if args.plot:
                plotter.plot(file_name, args.custom_env)
            if args.tune_run:
                tune.report(episode_reward_mean=evaled_policy[0])
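# replay_buffer.add_hindsight() lives in utils and is not shown here; a minimal sketch
# of the hindsight relabeling it is expected to perform (HER-style goal substitution).
# compute_reward is a hypothetical callback, since the recomputed reward depends on the
# environment, and the real method also handles the k and fetch_reach options:
import numpy as np

def add_hindsight_sketch(replay_buffer, trajectory, achieved_goal, compute_reward):
    """Re-store each transition with the episode's achieved goal as the target goal."""
    for state, action, next_state, reward, done_bool in trajectory:
        x = np.concatenate([np.asarray(state), achieved_goal])
        next_x = np.concatenate([np.asarray(next_state), achieved_goal])
        new_reward = compute_reward(next_state, achieved_goal)
        replay_buffer.add(x, action, next_x, new_reward, done_bool)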
"max_action": max_action, "discount": args.discount, "tau": args.tau, "policy_noise": args.policy_noise * max_action, "noise_clip": args.noise_clip * max_action, "policy_freq": args.policy_freq } # Initialize policy and replay buffer if args.algorithm == "TD3": policy = TD3.TD3(**kwargs) replay_buffer = utils.ReplayBuffer(state_dim, action_dim) elif args.algorithm == "PER_TD3": policy = PER_TD3.PER_TD3(**kwargs) replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim) kwargs["alpha"] = args.alpha kwargs["min_priority"] = args.min_priority if args.algorithm == "LAP_TD3": policy = LAP_TD3.LAP_TD3(**kwargs) replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim) elif args.algorithm == "PAL_TD3": policy = PAL_TD3.PAL_TD3(**kwargs) replay_buffer = utils.ReplayBuffer(state_dim, action_dim) # Evaluate untrained policy evaluations = [eval_policy(policy, args.env, args.seed)]