def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = env
    self.action_range = [env.action_space.low, env.action_space.high]
    self.obs_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    # hyperparameters
    self.gamma = gamma
    self.tau = tau

    # initialize networks
    self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
    self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
    self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)

    # copy params to target param
    for target_param, param in zip(self.target_value_net.parameters(),
                                   self.value_net.parameters()):
        target_param.data.copy_(param)

    # initialize optimizers
    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
    self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
    self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

    self.replay_buffer = Buffer(buffer_maxlen)
class DecoupledA3CAgent:

    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env

        self.gamma = gamma
        self.lr = lr
        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        self.global_value_network = ValueNetwork(self.env.observation_space.shape[0], 1)
        self.global_value_network.share_memory()
        self.global_policy_network = PolicyNetwork(self.env.observation_space.shape[0],
                                                   self.env.action_space.n)
        self.global_policy_network.share_memory()

        self.global_value_optimizer = optim.Adam(self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(self.global_policy_network.parameters(), lr=lr)

        self.workers = [
            DecoupledWorker(i, env, self.gamma, self.global_value_network,
                            self.global_policy_network, self.global_value_optimizer,
                            self.global_policy_optimizer, self.global_episode,
                            self.GLOBAL_MAX_EPISODE)
            for i in range(mp.cpu_count())
        ]

    def train(self):
        print("Training on {} cores".format(mp.cpu_count()))
        input("Enter to start")

        [worker.start() for worker in self.workers]
        [worker.join() for worker in self.workers]

    def save_model(self):
        torch.save(self.global_value_network.state_dict(), "a3c_value_model.pth")
        torch.save(self.global_policy_network.state_dict(), "a3c_policy_model.pth")
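# Usage sketch (not part of the original source): how the DecoupledA3CAgent above
# might be driven. Assumes a discrete-action gym environment and the hyperparameter
# values shown here; the DecoupledWorker, ValueNetwork and PolicyNetwork classes are
# the ones used elsewhere in this listing.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")  # hypothetical choice of environment
    agent = DecoupledA3CAgent(env, gamma=0.99, lr=1e-3, global_max_episode=2000)
    agent.train()       # spawns one worker process per CPU core
    agent.save_model()  # writes the shared global networks to disk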
def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create actor, critic, EnvSampler() and TRPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size, action_size,
                          hidden_sizes=args.hidden_sizes,
                          init_std=args.init_std)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    trpo = TRPO(actor, critic, args.value_lr, args.value_steps_per_update,
                args.cg_steps, args.linesearch_steps, args.gamma, args.tau,
                args.damping, args.max_kl, device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = trpo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss
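# Usage sketch (not part of the original source): main(args) is a generator, so a
# caller has to iterate it to drive training. Every field of `args` below is an
# assumed placeholder value; the real script presumably builds this Namespace with
# argparse.
from argparse import Namespace

args = Namespace(env_name="Pendulum-v0", device="cpu", seed=0,
                 hidden_sizes=(64, 64), init_std=1.0, max_episode_step=1000,
                 value_lr=1e-3, value_steps_per_update=80, cg_steps=10,
                 linesearch_steps=10, gamma=0.99, tau=0.97, damping=0.1,
                 max_kl=0.01, episodes=500, batch_size=2048)

for total_step, episode_reward, actor_loss, value_loss in main(args):
    print(total_step, episode_reward, actor_loss, value_loss)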
def __init__(self, env, gamma, lr, global_max_episode):
    self.env = env

    self.gamma = gamma
    self.lr = lr
    self.global_episode = mp.Value('i', 0)
    self.GLOBAL_MAX_EPISODE = global_max_episode

    self.global_value_network = ValueNetwork(self.env.observation_space.shape[0], 1)
    self.global_policy_network = PolicyNetwork(self.env.observation_space.shape[0],
                                               self.env.action_space.n)

    self.global_value_optimizer = optim.Adam(self.global_value_network.parameters(), lr=lr)
    self.global_policy_optimizer = optim.Adam(self.global_policy_network.parameters(), lr=lr)

    self.workers = [
        DecoupledWorker(i, env, self.gamma, self.global_value_network,
                        self.global_policy_network, self.global_value_optimizer,
                        self.global_policy_optimizer, self.global_episode,
                        self.GLOBAL_MAX_EPISODE)
        for i in range(mp.cpu_count())
    ]
def train_reinforce(args):
    '''
    Parse arguments and construct objects for training a REINFORCE model
    with no baseline.
    '''
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    token_tables = op.build_token_tables()

    # initialize tensorboard for logging output
    from os import path
    train_logger = None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)

    # Load Models
    policy = RobustFill(string_size=len(op.CHARACTER),
                        string_embedding_size=args.embedding_size,
                        decoder_inp_size=args.embedding_size,
                        hidden_size=args.hidden_size,
                        program_size=len(token_tables.op_token_table),
                        device=device)
    value = ValueNetwork(args.embedding_size, args.hidden_size).to(device)
    if args.continue_training_policy:
        policy.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.checkpoint_filename),
                       map_location=device))
    elif args.continue_training:
        policy.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.checkpoint_filename),
                       map_location=device))
        value.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.val_checkpoint_filename),
                       map_location=device))
    policy = policy.to(device)
    value = value.to(device)

    # Initialize Optimizer
    if args.optimizer == 'sgd':
        pol_opt = optim.SGD(policy.parameters(), lr=args.lr)
        val_opt = optim.SGD(value.parameters(), lr=args.lr)
    else:
        pol_opt = optim.Adam(policy.parameters(), lr=args.lr)
        val_opt = optim.Adam(value.parameters(), lr=args.lr)

    # Load Environment
    env = RobustFillEnv()
    train_reinforce_(
        args,
        policy=policy,
        value=value,
        pol_opt=pol_opt,
        value_opt=val_opt,
        env=env,
        train_logger=train_logger,
        checkpoint_filename=args.checkpoint_filename,
        checkpoint_step_size=args.checkpoint_step_size,
        checkpoint_print_tensors=args.print_tensors,
    )
def run_learned_baseline(discount_factors, learn_rates, hidden_dims, init_temps,
                         stochasticity, n_runs, n_episodes):
    # learned baseline
    best_result = 0
    best_settings = dict()
    results_file = f'results/s{stochasticity}_learned_baseline.csv'
    best_settings_file = f'results/s{stochasticity}_learned_baseline_best_settings.pkl'
    with open(results_file, 'w') as f:
        f.write('discount_factor,learn_rate_policy,learn_rate_value,'
                'hidden_dim_policy,hidden_dim_value,init_temp,result' + '\n')

    for discount_factor in discount_factors:
        for learn_rate_policy in learn_rates:
            for learn_rate_value in learn_rates:
                for hidden_dim_policy in hidden_dims:
                    for hidden_dim_value in hidden_dims:
                        for init_temp in init_temps:
                            print('#' * 30)
                            print('#' * 9 + ' NEW SEARCH ' + '#' * 9)
                            print('#' * 30)
                            print()

                            st = time()
                            # change this for learned baseline
                            print(f'Search settings: baseline=run_episodes_with_learned_baseline, '
                                  f'discount_factor={discount_factor}, '
                                  f'learn_rate_policy={learn_rate_policy}, '
                                  f'learn_rate_value={learn_rate_value}, '
                                  f'hidden_dim_policy={hidden_dim_policy}, '
                                  f'hidden_dim_value={hidden_dim_value}, '
                                  f'init_temp={init_temp}')

                            # initialize the environment
                            env = gym.make('CartPole-v1')

                            result = 0
                            for i in range(n_runs):
                                start_time = time()
                                # change input_ and output_dim for gridworld env
                                policy_model = PolicyNetwork(input_dim=4,
                                                             hidden_dim=hidden_dim_policy,
                                                             output_dim=2)
                                # change input_dim for gridworld env
                                value_model = ValueNetwork(input_dim=4,
                                                           hidden_dim=hidden_dim_value)
                                seed = 40 + i
                                set_seeds(env, seed)
                                episode_durations, _, _ = run_episodes_with_learned_baseline(
                                    policy_model, value_model, env, n_episodes,
                                    discount_factor, learn_rate_policy, learn_rate_value,
                                    init_temp, stochasticity)
                                result += np.mean(episode_durations)
                                del policy_model
                                del value_model
                                end_time = time()
                                h, m, s = get_running_time(end_time - start_time)
                                print(f'Done with run {i+1}/{n_runs} in '
                                      f'{f"{h} hours, " if h else ""}'
                                      f'{f"{m} minutes and " if m else ""}{s} seconds')

                            env.close()
                            result /= n_runs
                            with open(results_file, 'a') as f:
                                f.write(f'{discount_factor},{learn_rate_policy},'
                                        f'{learn_rate_value},{hidden_dim_policy},'
                                        f'{hidden_dim_value},{init_temp},{result}' + '\n')

                            et = time()
                            h, m, s = get_running_time(et - st)
                            print(f'Done with search in '
                                  f'{f"{h} hours, " if h else ""}'
                                  f'{f"{m} minutes and " if m else ""}{s} seconds')
                            print(f'Average number of steps per episode: {result}')

                            if result > best_result:
                                best_result = result
                                best_settings['discount_factor'] = discount_factor
                                best_settings['learn_rate_policy'] = learn_rate_policy
                                best_settings['learn_rate_value'] = learn_rate_value
                                best_settings['hidden_dim_policy'] = hidden_dim_policy
                                best_settings['hidden_dim_value'] = hidden_dim_value
                                best_settings['init_temp'] = init_temp
                                best_settings['result'] = best_result
                                pkl.dump(best_settings, open(best_settings_file, 'wb'))
                                print(f'New best result!: {result}')
                                print(f'New best settings!: {best_settings}')

                            print()
                            print()
                            print()

    print(f'Best settings after completing grid search: {best_settings}')
def run_learned_baseline(stochasticity, n_runs, n_episodes):
    # learned baseline
    dir_path = os.path.dirname(os.path.realpath(__file__))
    best_settings_file = dir_path + \
        f'/cart_pole_parameter_search/s{stochasticity}_learned_baseline_best_settings.pkl'
    eval_file = f'cart_evals/s{stochasticity}_learned_baseline.pkl'
    with open(best_settings_file, 'rb') as pickle_file:
        best_settings = pkl.load(pickle_file)

    discount_factor = best_settings['discount_factor']
    learn_rate_policy = best_settings['learn_rate_policy']
    learn_rate_value = best_settings['learn_rate_value']
    hidden_dim_policy = best_settings['hidden_dim_policy']
    hidden_dim_value = best_settings['hidden_dim_value']
    init_temp = best_settings['init_temp']

    st = time()
    # change this for learned baseline
    print(f'Run settings: baseline=run_episodes_with_learned_baseline, '
          f'discount_factor={discount_factor}, learn_rate_policy={learn_rate_policy}, '
          f'learn_rate_value={learn_rate_value}, hidden_dim_policy={hidden_dim_policy}, '
          f'hidden_dim_value={hidden_dim_value}, init_temp={init_temp}')

    # initialize the environment
    env = gym.make('CartPole-v1')

    episode_durations_list = []
    reinforce_loss_list = []
    value_loss_list = []
    for i in range(n_runs):
        start_time = time()
        # change input_ and output_dim for gridworld env
        policy_model = PolicyNetwork(input_dim=4, hidden_dim=hidden_dim_policy, output_dim=2)
        # change input_dim for gridworld env
        value_model = ValueNetwork(input_dim=4, hidden_dim=hidden_dim_value)
        seed = 40 + i
        set_seeds(env, seed)
        episode_durations, reinforce_loss, value_loss = run_episodes_with_learned_baseline(
            policy_model, value_model, env, n_episodes, discount_factor,
            learn_rate_policy, learn_rate_value, init_temp, stochasticity)
        episode_durations_list.append(episode_durations)
        reinforce_loss_list.append(reinforce_loss)
        value_loss_list.append(value_loss)
        del policy_model
        del value_model
        end_time = time()
        h, m, s = get_running_time(end_time - start_time)
        print(f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}'
              f'{f"{m} minutes and " if m else ""}{s} seconds')

    env.close()
    et = time()
    h, m, s = get_running_time(et - st)

    evals = {}
    evals['episode_durations'] = episode_durations_list
    evals['reinforce_loss'] = reinforce_loss_list
    evals['value_loss'] = value_loss_list
    pkl.dump(evals, open(eval_file, 'wb'))

    print(f'Done with run in {f"{h} hours, " if h else ""}'
          f'{f"{m} minutes and " if m else ""}{s} seconds')
class TRPOAgent:

    TRAJECTORY_SIZE = 1024
    VF_BATCHSIZE = 64
    MAX_KL = 0.01
    GAMMA = 0.99
    GAE_LAMBDA = 0.98
    ENV_ID = "Pendulum-v0"
    OBS_SPACE = 3
    ACTION_SPACE = 1

    def __init__(self):
        self.policy = PolicyNetwork(action_space=self.ACTION_SPACE)
        self.value_network = ValueNetwork()
        self.env = gym.make(self.ENV_ID)
        self.global_steps = 0
        self.history = []
        self.hiscore = None

    def play(self, n_iters):
        self.epi_reward = 0
        self.epi_steps = 0
        self.state = self.env.reset()

        for _ in range(n_iters):
            trajectory = self.generate_trajectory()
            trajectory = self.compute_advantage(trajectory)
            self.update_policy(trajectory)
            self.update_vf(trajectory)

        return self.history

    def generate_trajectory(self):
        """Generate a trajectory on the current policy."""
        trajectory = {
            "s": np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "a": np.zeros((self.TRAJECTORY_SIZE, self.ACTION_SPACE), dtype=np.float32),
            "r": np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32),
            "s2": np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "done": np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32),
        }

        state = self.state

        for i in range(self.TRAJECTORY_SIZE):
            action = self.policy.sample_action(state)
            next_state, reward, done, _ = self.env.step(action)

            trajectory["s"][i] = state
            trajectory["a"][i] = action
            trajectory["r"][i] = reward
            trajectory["s2"][i] = next_state
            trajectory["done"][i] = done

            self.epi_reward += reward
            self.epi_steps += 1
            self.global_steps += 1

            if done:
                state = self.env.reset()
                self.history.append(self.epi_reward)
                recent_score = sum(self.history[-10:]) / 10
                print("====" * 5)
                print("Episode:", len(self.history))
                print("Episode reward:", self.epi_reward)
                print("Global steps:", self.global_steps)
                if len(self.history) > 100 and (self.hiscore is None
                                                or recent_score > self.hiscore):
                    print("*HISCORE UPDATED:", recent_score)
                    self.save_model()
                    self.hiscore = recent_score
                self.epi_reward = 0
                self.epi_steps = 0
            else:
                state = next_state

        self.state = state

        return trajectory

    def compute_advantage(self, trajectory):
        """Compute GAE advantages and value-function targets for a trajectory."""
        trajectory["vpred"] = self.value_network(trajectory["s"]).numpy()
        trajectory["vpred_next"] = self.value_network(trajectory["s2"]).numpy()

        is_nonterminals = 1 - trajectory["done"]

        deltas = (trajectory["r"]
                  + self.GAMMA * is_nonterminals * trajectory["vpred_next"]
                  - trajectory["vpred"])

        advantages = np.zeros_like(deltas, dtype=np.float32)
        lastgae = 0
        for i in reversed(range(len(deltas))):
            lastgae = deltas[i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae
            advantages[i] = lastgae

        trajectory["adv"] = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        #trajectory["adv"] = advantages
        trajectory["vftarget"] = trajectory["adv"] + trajectory["vpred"]

        return trajectory

    def update_policy(self, trajectory):

        def flattengrads(grads):
            flatgrads_list = [tf.reshape(grad, shape=[1, -1]) for grad in grads]
            flatgrads = tf.concat(flatgrads_list, axis=1)
            return flatgrads

        actions = tf.convert_to_tensor(trajectory["a"], dtype=tf.float32)
        states = tf.convert_to_tensor(trajectory["s"], dtype=tf.float32)
        advantages = tf.convert_to_tensor(trajectory["adv"], dtype=tf.float32)

        old_means, old_stdevs = self.policy(states)
        old_logp = compute_logprob(old_means, old_stdevs, actions)

        with tf.GradientTape() as tape:
            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)
            loss = tf.exp(new_logp - old_logp) * advantages
            loss = tf.reduce_mean(loss)

        g = tape.gradient(loss, self.policy.trainable_variables)
        g = tf.transpose(flattengrads(g))

        @tf.function
        def hvp_func(vector):
            """Compute the Hessian-vector product of the mean KL."""
            with tf.GradientTape() as t2:
                with tf.GradientTape() as t1:
                    new_means, new_stdevs = self.policy(states)
                    kl = compute_kl(old_means, old_stdevs, new_means, new_stdevs)
                    meankl = tf.reduce_mean(kl)
                kl_grads = t1.gradient(meankl, self.policy.trainable_variables)
                kl_grads = flattengrads(kl_grads)
                grads_vector_product = tf.matmul(kl_grads, vector)
            hvp = t2.gradient(grads_vector_product, self.policy.trainable_variables)
            hvp = tf.transpose(flattengrads(hvp))
            return hvp + vector * 1e-2  # add a small damping term to stabilize conjugate gradient

        step_direction = cg(hvp_func, g)

        shs = tf.matmul(tf.transpose(step_direction), hvp_func(step_direction))
        lm = tf.sqrt(2 * self.MAX_KL / shs)
        fullstep = lm * step_direction

        expected_improve = tf.matmul(tf.transpose(g), fullstep)
        fullstep = restore_shape(fullstep, self.policy.trainable_variables)

        params_old = [var.numpy() for var in self.policy.trainable_variables]
        old_loss = loss

        for stepsize in [0.5**i for i in range(10)]:
            params_new = [p + step * stepsize for p, step in zip(params_old, fullstep)]
            self.policy.set_weights(params_new)

            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)
            new_loss = tf.reduce_mean(tf.exp(new_logp - old_logp) * advantages)
            improve = new_loss - old_loss

            kl = compute_kl(old_means, old_stdevs, new_means, new_stdevs)
            mean_kl = tf.reduce_mean(kl)

            print(f"Expected: {expected_improve} Actual: {improve}")
            print(f"KL {mean_kl}")

            if mean_kl > self.MAX_KL * 1.5:
                print("violated KL constraint. shrinking step.")
            elif improve < 0:
                print("surrogate didn't improve. shrinking step.")
            else:
                print("Stepsize OK!")
                break
        else:
            print("Line search failed; restoring old parameters.")
            self.policy.set_weights(params_old)

    def update_vf(self, trajectory):
        for _ in range(self.TRAJECTORY_SIZE // self.VF_BATCHSIZE):
            indx = np.random.choice(self.TRAJECTORY_SIZE, self.VF_BATCHSIZE, replace=True)

            with tf.GradientTape() as tape:
                vpred = self.value_network(trajectory["s"][indx])
                vtarget = trajectory["vftarget"][indx]
                loss = tf.reduce_mean(tf.square(vtarget - vpred))

            variables = self.value_network.trainable_variables
            grads = tape.gradient(loss, variables)
            self.value_network.optimizer.apply_gradients(zip(grads, variables))

    def save_model(self):
        self.policy.save_weights("checkpoints/actor")
        self.value_network.save_weights("checkpoints/critic")
        print()
        print("Model Saved")
        print()

    def load_model(self):
        self.policy.load_weights("checkpoints/actor")
        self.value_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):
        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID), monitordir, force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):
            total_reward = 0
            steps = 0
            done = False
            state = env.reset()
            while not done:
                action = self.policy.sample_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1
            print()
            print(f"Test Play {i}: {total_reward}")
            print("Steps:", steps)
            print()
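# Sketch (not part of the original source): one plausible implementation of the `cg`
# helper used in update_policy above -- a standard conjugate-gradient solver that
# approximately solves hvp_func(x) = b. Shapes assume column vectors, matching the
# caller; the iteration count and tolerance are assumed defaults.
import tensorflow as tf

def cg(hvp_func, b, iters=10, residual_tol=1e-10):
    x = tf.zeros_like(b)
    r = tf.identity(b)   # residual b - A x, with x initialized to zero
    p = tf.identity(r)
    r_dot_r = tf.reduce_sum(r * r)
    for _ in range(iters):
        Ap = hvp_func(p)
        alpha = r_dot_r / tf.reduce_sum(p * Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_r_dot_r = tf.reduce_sum(r * r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x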
class DRTRPOAgent():
    """
    DR TRPO
    """

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_adv_mc(self, trajectory):
        """
        Compute the advantage of every (st, at) in the trajectory.
        The advantage is estimated with MC: the discounted reward sum (from the
        trajectory) minus the value predicted by the value network.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        advantages = value_targets - values
        return advantages, value_loss

    def compute_adv_td(self, state, next_state, reward):
        """
        Compute the advantage of a single (s, a) using TD: r + v(s') - v(s).
        Depends heavily on the accuracy of the value network.
        """
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        reward = torch.as_tensor(reward)

        state_value = self.value_network.forward(state)
        next_state_value = self.value_network.forward(next_state)
        value_target = reward + next_state_value

        advantage = value_target - state_value
        value_loss = F.mse_loss(state_value, value_target)
        return advantage, value_loss

    def compute_policy_loss_kl(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (KL constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits

        state_adv = torch.FloatTensor(state_adv).to(self.device)
        denom = torch.sum(torch.exp(state_adv / beta) * pi_dist)
        new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom
        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_policy_loss_wass(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (Wasserstein constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits

        state_adv = torch.FloatTensor(state_adv).to(self.device)

        # Find argmax_j {A(s, aj) - beta * d(aj, ai)} for each action ai.
        best_j = []
        for i in range(self.action_dim):
            opt_j = 0
            opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i)
            for j in range(self.action_dim):
                cur_val = state_adv[j] - beta * self.compute_distance(j, i)
                if cur_val > opt_val:
                    opt_j = j
                    opt_val = cur_val
            best_j.append(opt_j)

        new_pi_dist = torch.zeros(self.action_dim)
        for j in range(self.action_dim):
            for i in range(self.action_dim):
                if j == best_j[i]:
                    new_pi_dist[j] += pi_dist[i]
        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_distance(self, a1, a2):
        if a1 == a2:
            return 0
        else:
            return 1

    def update(self, value_loss, policy_loss):
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted return from each step onwards
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = rewards.view(-1, 1) + \
            torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) \
            * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory, adv_method):
        """
        When gamma is large, the value-network loss does not converge, so MC should
        be used to estimate the advantage. When gamma is small (e.g. 0.9), the loss
        decreases during training and TD can be used to estimate the advantage.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted return from each step onwards
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = logits
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        # 0 for MC, 1 for TD
        if adv_method == 0:
            advantages = value_targets - values
        if adv_method == 1:
            # reshape rewards to (N, 1) so the TD advantage broadcasts per step
            advantages = rewards.view(-1, 1) - values + self.gamma * torch.cat(
                (values[1:], torch.FloatTensor([[0]])), dim=0)

        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) \
            * advantages.detach()
        policy_loss = policy_loss.sum() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory, adv_method):
        value_loss, policy_loss = self.compute_loss(trajectory, adv_method)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
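# Usage sketch (not part of the original source): collecting one episode as a list of
# [s, a, r, s', done] transitions and updating the A2CAgent above. The environment
# choice, episode count, and hyperparameters are assumptions.
import gym

env = gym.make("CartPole-v1")
agent = A2CAgent(env, gamma=0.99, lr=1e-3)

for episode in range(500):
    trajectory = []
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append([state, action, reward, next_state, done])
        episode_reward += reward
        state = next_state
    agent.update(trajectory, adv_method=0)  # 0 = MC advantage, 1 = TD advantage
    print(f"Episode {episode}: reward {episode_reward}")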
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value net
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # soft update of the target value network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
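# Usage sketch (not part of the original source): a minimal off-policy training loop
# for the SACAgent above. Assumes a continuous-control gym env such as "Pendulum-v0",
# that the replay buffer exposes push(...) and __len__, and the episode/step counts
# and batch size shown here.
def train_sac(env, agent, max_episodes=100, max_steps=500, batch_size=64):
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        for step in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward
            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)
            if done:
                break
            state = next_state
        print(f"Episode {episode}: reward {episode_reward}")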
class DecoupledWorker(mp.Process):

    def __init__(self, id, env, gamma, global_value_network, global_policy_network,
                 global_value_optimizer, global_policy_optimizer, global_episode,
                 GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.local_policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted return from each step onwards
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = rewards.view(-1, 1) + \
            torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.local_value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.local_policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) \
            * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update_global(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.global_value_optimizer.zero_grad()
        value_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(self.local_value_network.parameters(),
                                               self.global_value_network.parameters()):
            global_params._grad = local_params._grad
        self.global_value_optimizer.step()

        self.global_policy_optimizer.zero_grad()
        policy_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(self.local_policy_network.parameters(),
                                               self.global_policy_network.parameters()):
            global_params._grad = local_params._grad
            #print(global_params._grad)
        self.global_policy_optimizer.step()

    def sync_with_global(self):
        self.local_value_network.load_state_dict(self.global_value_network.state_dict())
        self.local_policy_network.load_state_dict(self.global_policy_network.state_dict())

    def run(self):
        state = self.env.reset()
        trajectory = []  # [[s, a, r, s', done], [], ...]
        episode_reward = 0

        while self.global_episode.value < self.GLOBAL_MAX_EPISODE:
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                with self.global_episode.get_lock():
                    self.global_episode.value += 1
                print(self.name + " | episode: " + str(self.global_episode.value)
                      + " " + str(episode_reward))

                self.update_global(trajectory)
                self.sync_with_global()

                trajectory = []
                episode_reward = 0
                state = self.env.reset()
            else:
                state = next_state
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  # 1
        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2], self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size, self.action_dim).to(self.device)
        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)
        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(f"Invalid size, expected shape {self.img_size}, got {state.shape}")
            return None

        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # states and next_states are lists of ndarrays; np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1, 2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # process images; properly shaped due to batching
        features = self.feature_net(states)  #.contiguous()
        next_features = self.feature_net(next_states)  #.contiguous()
        # flatten conv features to (batch_size, input_size);
        # the original hard-coded the batch size as 64 here
        features = torch.reshape(features, (batch_size, self.input_size))
        next_features = torch.reshape(next_features, (batch_size, self.input_size))

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi

        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target value network
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # soft update of the target value network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
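# Sketch (not part of the original source): one plausible FeatureExtractor matching
# the calls above -- a small conv encoder whose get_output_size reports the (C, H, W)
# shape of its output for an (H, W, C) input image. The layer choices (single conv,
# max-pooling factor) are assumptions, not the original architecture.
import torch
import torch.nn as nn

class FeatureExtractor(nn.Module):
    def __init__(self, in_channels, conv_channels, kernel_size):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, conv_channels, kernel_size)
        self.pool = nn.MaxPool2d(4)  # assumed downsampling to keep input_size manageable

    def forward(self, x):
        # x: (batch, channels, height, width)
        return self.pool(torch.relu(self.conv(x)))

    def get_output_size(self, img_size):
        # img_size is (height, width, channels), as used in SACAgent.__init__
        h, w, c = img_size
        with torch.no_grad():
            dummy = torch.zeros(1, c, h, w)
            out = self.forward(dummy)
        return tuple(out.shape[1:])  # (channels, height, width)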