class SAC(algorithms):
    """Soft Actor-Critic agent: one Gaussian actor and twin Q-critics with targets.

    NOTE(review): this class was recovered from a whitespace-collapsed source.
    Block nesting (most notably inside ``train``) was reconstructed from the
    statement order and should be confirmed against the upstream file.
    """

    def __init__(self, args):
        """Create the networks, their optimizers and the replay buffer.

        Args:
            args: configuration object forwarded to the base class. This
                class reads ``lr``, ``capacity``, ``update_iteration``,
                ``batch_size``, ``alpha``, ``gamma``, ``tau``,
                ``max_episode``, ``max_length_trajectory`` and ``print_log``.
        """
        super().__init__(args)
        # self.env and self.args are presumably set by the base class -- confirm.
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64,
                                    self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(),
                                             self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(),
                                             self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.global_steps = 0

    def _soft_update(self, source, target):
        """Polyak-average the parameters of ``source`` into ``target`` in place."""
        tau = self.args.tau
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_((1 - tau) * target_param.data
                                    + tau * param.data)

    def update(self):
        """Run ``args.update_iteration`` gradient steps on critics and actor."""
        for _ in range(self.args.update_iteration):
            # The buffer yields (state, next_state, action, reward, done),
            # matching the tuple order used by ``train`` when pushing.
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # Entropy-regularised TD target from the minimum of both targets.
            with torch.no_grad():
                next_action, log_prob, _ = self.actor.sample(next_state)
                target_Q1 = self.critic_target_1(next_state, next_action)
                target_Q2 = self.critic_target_2(next_state, next_action)
                target_Q = (torch.min(target_Q1, target_Q2)
                            - self.args.alpha * log_prob)
                y_Q = reward + self.args.gamma * (1 - done) * target_Q

            # Each critic is regressed onto the shared target independently.
            current_Q1 = self.critic_1(state, action)
            critic_loss1 = F.mse_loss(current_Q1, y_Q)
            self.critic_optimizer_1.zero_grad()
            critic_loss1.backward()
            self.critic_optimizer_1.step()

            current_Q2 = self.critic_2(state, action)
            critic_loss2 = F.mse_loss(current_Q2, y_Q)
            self.critic_optimizer_2.zero_grad()
            critic_loss2.backward()
            self.critic_optimizer_2.step()

            # Actor maximises min(Q1, Q2) minus the entropy penalty.
            actor_action, actor_log_prob, _ = self.actor.sample(state)
            Q1 = self.critic_1(state, actor_action)
            Q2 = self.critic_2(state, actor_action)
            actor_loss = -(torch.min(Q1, Q2)
                           - self.args.alpha * actor_log_prob).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self._soft_update(self.critic_1, self.critic_target_1)
            self._soft_update(self.critic_2, self.critic_target_2)

    def train(self):
        """Collect experience for ``args.max_episode`` episodes and learn.

        NOTE(review): the reconstructed nesting runs ``update`` once per
        episode (only when the buffer is full) and saves once after the last
        episode; confirm this matches the intended schedule.
        """
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                # ``np.float`` was removed from NumPy; the builtin is equivalent.
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    ep_r = 0
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()
        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        """Roll out ``number`` episodes with the current actor.

        Args:
            number: how many evaluation episodes to run.
            render: when true, render each step and print each episode's
                return; when false, pickle ``(global_steps, rewards)`` to
                ``self.log_file`` instead.

        Returns:
            Tuple ``(max, min, mean)`` of the episode returns.
        """
        rewards = []
        for _ in range(number):
            state = self.env.reset()
            done = False
            total_rews = 0
            while not done:
                with torch.no_grad():
                    # NOTE(review): the original comment said "use the mean
                    # action", but the first value returned by ``sample`` is
                    # used here, exactly as during training -- confirm which
                    # output of ``GaussianPolicy.sample`` is the mean.
                    action, _, _ = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)

        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        return rewards.max(), rewards.min(), rewards.mean()

    def save(self, episode):
        """Write all five network state dicts to ``self.weights_file(episode)``."""
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
                'critic_target_1': self.critic_target_1.state_dict(),
                'critic_target_2': self.critic_target_2.state_dict()
            }, file_name)
        print("save model to " + file_name)

    def load(self, episode):
        """Restore all five networks from ``self.weights_file(episode)``."""
        file_name = self.weights_file(episode)
        # map_location lets a checkpoint written on GPU be restored on CPU.
        checkpoint = torch.load(file_name, map_location=device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        self.critic_target_1.load_state_dict(checkpoint['critic_target_1'])
        self.critic_target_2.load_state_dict(checkpoint['critic_target_2'])
        print("successfully load model from " + file_name)
class BaseAgent:
    """Base class for agents that act with a pre-trained behaviour network.

    The class owns an online ``policy_net``, a periodically refreshed
    ``target_net``, a FIFO replay buffer and a frozen behaviour network
    (``bpolicy_net``) that is used for action selection.

    NOTE(review): recovered from a whitespace-collapsed source; nesting was
    reconstructed from the statement order.
    """

    # Checkpoint used for the behaviour policy when ``params`` does not
    # provide ``'bpolicy_path'``.
    DEFAULT_BPOLICY_PATH = "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"

    # Number of transitions drawn per learning step; the buffer must hold
    # strictly more than this before learning starts.
    MINIBATCH_SIZE = 200

    def __init__(self, features, actions, params):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            features: input feature count handed to ``Network``.
            actions: number of actions handed to ``Network``.
            params: dict with required keys ``alpha``, ``epsilon``,
                ``target_refresh``, ``buffer_size``, ``h1`` and ``h2``.
                The optional key ``bpolicy_path`` overrides
                ``DEFAULT_BPOLICY_PATH``.
        """
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']
        self.h1 = params['h1']
        self.h2 = params['h2']

        # One "online" network that is trained, one fixed target network.
        self.policy_net = Network(features, self.h1, self.h2, actions).to(device)
        self.target_net = Network(features, self.h1, self.h2, actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)

        # Behaviour policy: loaded from disk and never trained here.
        self.bpolicy_net = Network(features, self.h1, self.h2, actions).to(device)
        bpolicy_path = params.get('bpolicy_path', self.DEFAULT_BPOLICY_PATH)
        self.bpolicy_net.load_state_dict(
            torch.load(bpolicy_path, map_location=device))

        # Only the policy network is optimised; the target network receives
        # its weights by periodic copy.
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # Start with the target network equal to the policy network.
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        """Pick actions from the behaviour network, perturbing action 1.

        The greedy action of ``bpolicy_net`` is taken for every row of ``x``.
        Whenever that action is ``1`` it is replaced, with probability
        ``epsilon``, by a uniform draw from ``{0, 2}``.

        Args:
            x: a single state or a batch of states accepted by ``bpolicy_net``.

        Returns:
            Tensor on ``device`` with one action index per input row.
        """
        with torch.no_grad():
            q_s, _ = self.bpolicy_net(x)
        # A single state yields a 1-D vector of action values; promote it to
        # a batch of one. (Testing ``shape[0] == 3`` would also catch a
        # genuine batch of three states.)
        if q_s.dim() == 1:
            q_s = q_s.unsqueeze(0)

        # ``.cpu()`` is required before ``.numpy()`` when running on CUDA.
        act = torch.max(q_s, 1).indices.detach().cpu().numpy()
        for i, action in enumerate(act):
            if action == 1 and np.random.rand() < self.epsilon:
                act[i] = np.random.choice([0, 2])

        return torch.from_numpy(act).detach().to(device)

    def updateNetwork(self, samples):
        """Learning step on a mini-batch; a no-op hook for subclasses."""
        pass

    def update(self, s, a, sp, r, gamma):
        """Store one transition and, if enough data exists, learn from a batch.

        Args:
            s: state.
            a: action taken in ``s``.
            sp: next state.
            r: reward.
            gamma: discount attached to this transition.
        """
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # Refresh the target network before the learning step.
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # Learn only once the buffer can supply a full mini-batch.
        if len(self.buffer) > self.MINIBATCH_SIZE:
            samples, _ = self.buffer.sample(self.MINIBATCH_SIZE)
            self.updateNetwork(samples)
class NAF:
    """Normalized Advantage Functions agent built on a TensorFlow 1.x graph.

    NOTE(review): recovered from a whitespace-collapsed source; nesting
    (notably in ``run_episode``) was reconstructed from the statement order.
    """

    MODEL_NAME = "NAF"
    TARGET_MODEL_NAME = "target-NAF"

    class Build(Enum):
        """How the mu / L / value heads share the state encoder."""
        SINGLE = 1
        MULTIPLE = 2
        HYDRA = 3

    @staticmethod
    def _tril_size(action_dim):
        """Return the entry count of a lower-triangular ``action_dim`` matrix.

        Integer division keeps the result an ``int`` so that it can be used
        as a layer width (true division would give a float on Python 3).
        """
        return action_dim * (action_dim + 1) // 2

    def __init__(self, prep, build, policy, state_dim, action_dim,
                 monitor_directory, buffer_size=10000, batch_size=32,
                 steps_before_train=100, train_freq=1, num_steps=1000000,
                 learning_rate=1e-3, update_rate=1e-3, max_reward=None,
                 detailed_summary=False):
        """Build the graph, open a session and set up summaries.

        Args:
            prep: preprocessor; ``prep.process(state)`` returns
                ``(state, skip)``.
            build: a ``NAF.Build`` member selecting the network layout.
            policy: exploration policy exposing ``reset`` and ``add_noise``.
            state_dim: width of the state placeholder.
            action_dim: width of the action placeholder.
            monitor_directory: parent directory of the summary directory.
            buffer_size: replay buffer capacity.
            batch_size: mini-batch size used by ``learn``.
            steps_before_train: steps to collect before learning starts.
            train_freq: learning updates per environment step.
            num_steps: step count at which the model is saved.
            learning_rate: Adam learning rate.
            update_rate: soft target-update rate.
            max_reward: episode return at which learning is paused, or
                ``None`` to never pause.
            detailed_summary: forwarded to ``architect.dense_block`` for the
                online network.
        """
        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps

        self.step = 0
        self.solved = False

        self.state_layers = [64, 32]
        self.mu_layers = [16, 8, self.action_dim]
        self.l_layers = [16, 8, self._tril_size(self.action_dim)]
        self.v_layers = [16, 8, 1]

        # Graph handles, filled in by ``build``.
        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

        self.build()
        self.merged = tf.summary.merge_all()
        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)
        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    def build(self):
        """Create placeholders, both networks, the loss and the train ops."""
        self.action_inputs = tf.placeholder(tf.float32, (None, self.action_dim))
        self.reward_inputs = tf.placeholder(tf.float32, (None, ))
        self.done = tf.placeholder(tf.float32, (None, ))

        (self.state_inputs, self.state_outputs, self.mu_outputs,
         self.l_outputs, self.value_outputs) = self.build_network(self.MODEL_NAME)
        (self.next_state_inputs, self.next_state_outputs, _, _,
         self.target_value_outputs) = self.build_network(self.TARGET_MODEL_NAME)

        # One-step TD target; terminal transitions do not bootstrap.
        self.target = tf.expand_dims(self.reward_inputs, 1) + self.discount * (
            1 - tf.expand_dims(self.done, 1)) * self.target_value_outputs

        # taken from https://github.com/carpedm20/NAF-tensorflow/blob/master/src/network.py
        # Unpack the flat L output row by row; the first element of each row
        # is exponentiated and placed on the diagonal.
        pivot = 0
        rows = []
        for idx in range(self.action_dim):
            row_len = self.action_dim - idx
            diag_elem = tf.exp(tf.slice(self.l_outputs, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(self.l_outputs, (0, pivot + 1),
                                      (-1, row_len - 1))
            row = tf.pad(tf.concat((diag_elem, non_diag_elems), 1),
                         ((0, 0), (idx, 0)))
            rows.append(row)
            pivot += row_len

        L = tf.transpose(tf.stack(rows, axis=1), (0, 2, 1))
        P = tf.matmul(L, tf.transpose(L, (0, 2, 1)))

        # A(s, a) = -1/2 (a - mu)^T P (a - mu)
        adv_term = tf.expand_dims(self.action_inputs - self.mu_outputs, -1)
        self.advantages = -tf.matmul(tf.transpose(adv_term, [0, 2, 1]),
                                     tf.matmul(P, adv_term)) / 2
        self.advantages = tf.reshape(self.advantages, [-1, 1])

        self.q_values = self.advantages + self.value_outputs

        self.loss = tf.reduce_mean(
            architect.huber_loss(self.q_values - tf.stop_gradient(self.target)))
        tf.summary.scalar("training_loss", self.loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        self.create_target_update_op()

    def build_network(self, name):
        """Build one network (online or target) under variable scope ``name``.

        Args:
            name: variable scope; detailed summaries are always disabled when
                it equals ``TARGET_MODEL_NAME``.

        Returns:
            Tuple ``(state_inputs, state_outputs, mu_outputs, l_outputs,
            value_outputs)``. ``state_outputs`` is ``None`` in ``MULTIPLE``
            mode, where each head has its own encoder.

        Raises:
            ValueError: if ``self.build_mode`` is not a known ``Build`` member.
        """
        detailed_summary = self.detailed_summary
        if name == self.TARGET_MODEL_NAME:
            detailed_summary = False

        with tf.variable_scope(name):
            state_inputs = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim))

            mode = self.build_mode
            if mode in (self.Build.SINGLE, self.Build.HYDRA):
                # One shared encoder feeds all three heads.
                state_outputs = architect.dense_block(
                    state_inputs, self.state_layers, name="state_branch",
                    detailed_summary=detailed_summary)
                mu_in = l_in = value_in = state_outputs
            elif mode == self.Build.MULTIPLE:
                # Each head gets a private encoder.
                state_outputs = None
                mu_in = architect.dense_block(
                    state_inputs, self.state_layers, name="mu_state",
                    detailed_summary=detailed_summary)
                l_in = architect.dense_block(
                    state_inputs, self.state_layers, name="l_state",
                    detailed_summary=detailed_summary)
                value_in = architect.dense_block(
                    state_inputs, self.state_layers, name="value_state",
                    detailed_summary=detailed_summary)
            else:
                raise ValueError("Wrong build type.")

            # HYDRA uses the full head stacks; the other modes only use the
            # final (output) layer of each head.
            if mode == self.Build.HYDRA:
                mu_head, l_head, v_head = (self.mu_layers, self.l_layers,
                                           self.v_layers)
            else:
                mu_head = [self.mu_layers[-1]]
                l_head = [self.l_layers[-1]]
                v_head = [self.v_layers[-1]]

            mu_outputs = architect.dense_block(
                mu_in, mu_head, "mu_branch", detailed_summary=detailed_summary)
            l_outputs = architect.dense_block(
                l_in, l_head, "l_branch", detailed_summary=detailed_summary)
            value_outputs = architect.dense_block(
                value_in, v_head, "value_branch",
                detailed_summary=detailed_summary)

        return state_inputs, state_outputs, mu_outputs, l_outputs, value_outputs

    def create_target_update_op(self):
        """Create the grouped op that soft-updates the target network."""
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.MODEL_NAME)
        target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.TARGET_MODEL_NAME)

        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_ops = [
            v_target.assign_sub(self.target_update_rate * (v_target - v_source))
            for v_source, v_target in zip(net_vars, target_net_vars)
        ]
        self.target_update = tf.group(*update_ops)

    def learn(self):
        """Run one optimisation step on a sampled batch, then update the target."""
        batch = self.buffer.sample(self.batch_size)

        merged, _, _ = self.session.run(
            [self.merged, self.target, self.train_op],
            feed_dict={
                self.state_inputs: batch["states"],
                self.action_inputs: batch["actions"],
                self.reward_inputs: batch["rewards"],
                self.next_state_inputs: batch["next_states"],
                self.done: batch["done"]
            })
        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update
        self.session.run(self.target_update)

    def _advance_step(self):
        """Increment the global step and cache its new value in ``self.step``.

        The value is taken from the assign op itself: fetching the variable
        in the same ``run`` call has no ordering guarantee relative to the
        increment.
        """
        self.step = self.session.run(self.inc_global_step)

    def run_episode(self, env):
        """Play one episode, storing transitions and learning along the way.

        Args:
            env: Gym-style environment with ``reset``, ``step`` and
                ``action_space.sample``.

        Returns:
            Tuple ``(total_reward, step)`` with the episode return and the
            global step count after the episode.
        """
        self.policy.reset()
        state = env.reset()
        state, skip = self.prep.process(state)
        total_reward = 0

        while True:
            # play
            if skip:
                # The preprocessor has no usable state yet: act randomly.
                action = env.action_space.sample()
            else:
                action = self.session.run(
                    self.mu_outputs,
                    feed_dict={self.state_inputs: state})[0]
                action = self.policy.add_noise(action)

            tmp_state = state
            tmp_skip = skip

            state, reward, done, _ = env.step(action)
            state, skip = self.prep.process(state)
            total_reward += reward

            # Store the transition only when both of its endpoints are
            # usable states (the original tested ``tmp_skip`` twice).
            if not tmp_skip and not skip:
                self.buffer.add({
                    "state": tmp_state[0],
                    "action": action,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_train and not self.solved:
                # learn
                for _ in range(self.train_freq):
                    self.learn()
                    self._advance_step()
            else:
                self._advance_step()

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        # Learning is paused while the last episode reached ``max_reward``.
        if self.max_reward is not None:
            self.solved = total_reward >= self.max_reward

        # NOTE(review): this is only checked at episode boundaries, so the
        # checkpoint is skipped if ``step`` jumps past ``max_iters``.
        if self.step == self.max_iters:
            self.saver.save(self.session, self.summary_dir,
                            global_step=self.step)

        return total_reward, self.step

    def close(self):
        """Close the TensorFlow session."""
        self.session.close()
class DQN(BaseAgent):
    """DQN agent that also trains one single-output Q-network per action.

    Besides the shared ``policy_net`` inherited from ``BaseAgent``, three
    baseline networks (back / stay / forward) are trained from
    action-specific replay buffers so that their value estimates can be
    compared with those of the shared network.

    NOTE(review): recovered from a whitespace-collapsed source; nesting was
    reconstructed from the statement order.
    """

    def __init__(self, features, actions, state_array, params):
        """Create the per-action buffers, networks, optimizers and logs.

        Args:
            features: input feature count.
            actions: number of actions of the shared network.
            state_array: batch of states on which mean Q-values are logged
                after every update.
            params: parameter dict; in addition to the keys read by
                ``BaseAgent`` this class reads ``ratioMap`` and
                ``sampleSize``.
        """
        super(DQN, self).__init__(features, actions, params)

        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(),
                                              lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(),
                                              lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(),
                                                 lr=self.alpha, betas=(0.9, 0.999))

        # Logged quantities, one entry per update.
        self.back_values = []
        self.stay_values = []
        self.forward_values = []
        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []
        self.td_loss = []

        self.state_array = state_array
        self.penultimate_features = []
        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        """Perform one semi-gradient Q-learning step on the shared network.

        Args:
            samples: transitions accepted by ``getBatchColumns``.
        """
        # organize the mini-batch so that we can request "columns" from the data
        batch = getBatchColumns(samples)

        # compute Q(s, a) for each sample in mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze(-1)
        # Store a detached copy: keeping the graph-attached tensor would pin
        # every batch's autograd graph in memory.
        self.penultimate_features.append(x.detach())

        # by default Q(s', a') = 0 unless the next states are non-terminal
        Qspap = torch.zeros(batch.size, device=device)

        # if we don't have any non-terminal next states, then no need to bootstrap
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)
            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # Empirical MSBE for this mini-batch; the bootstrap term is detached.
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        td_loss.backward()
        self.td_loss.append(td_loss.detach().cpu().numpy())

        # Log the mean Q-value per action over the monitoring states. This is
        # monitoring only, so no graph is built for it.
        with torch.no_grad():
            Qs_state_array, _ = self.policy_net(self.state_array)
            Qsa_mean_states = torch.mean(Qs_state_array, 0)
        self.back_values.append(Qsa_mean_states[0].cpu().numpy())
        self.stay_values.append(Qsa_mean_states[1].cpu().numpy())
        self.forward_values.append(Qsa_mean_states[2].cpu().numpy())

        # update the *policy network* using the combined gradients
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        """Perform one Q-learning step on a single-action baseline network.

        Args:
            samples: transitions that all took the action ``q_net`` models.
            q_net: the per-action network being trained.
            target_q_net: target network paired with ``q_net`` (only its
                gradients are cleared here).
            optimizer: optimizer for ``q_net``.
            storeList: list that receives the mean value of ``q_net`` over
                ``self.state_array`` after the gradient computation.
        """
        batch = getBatchColumns(samples)

        # The per-action networks have a single output, which is Q(s, a).
        # (This assignment was commented out, leaving ``Qsa`` undefined.)
        Qs, _ = q_net(batch.states)
        Qsa = Qs.squeeze(-1)

        Qspap = torch.zeros(batch.size, device=device)

        # NOTE(review): the original marked this bootstrap with "CHECK / is
        # this correct?". It maximises over all three per-action target
        # networks rather than only ``target_q_net``.
        if batch.nterm_sp.shape[0] > 0:
            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)
            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        td_loss.backward()

        with torch.no_grad():
            Qs_state_array, _ = q_net(self.state_array)
            Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].cpu().numpy())

        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        """Store a transition and train all networks when data suffices.

        Args:
            s: state.
            a: action tensor holding a single action index (0, 1 or 2).
            sp: next state.
            r: reward.
            gamma: discount attached to this transition.
        """
        transition = (s, a, sp, r, gamma)

        # Route the transition to the buffer of the action that was taken.
        per_action_buffers = {
            0: self.buffer_BACK,
            1: self.buffer_STAY,
            2: self.buffer_FORWARD,
        }
        action_buffer = per_action_buffers.get(a.item())
        if action_buffer is not None:
            action_buffer.add(transition)

        # the "online" sample gets tossed into the replay buffer
        self.buffer.add(transition)
        self.steps += 1

        # Refresh every target network before the learning step.
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)
            self.back_q_net.cloneWeightsTo(self.back_target_q_net)
            self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)
            self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        back_sample_count = math.floor(
            self.ratioMap.backward_ratio * self.sampleSize)
        stay_sample_count = math.floor(
            self.ratioMap.stay_ratio * self.sampleSize)
        forward_sample_count = math.floor(
            self.ratioMap.forward_ratio * self.sampleSize)

        # Learn only when every per-action buffer can supply its share.
        if len(self.buffer_BACK) > back_sample_count \
                and len(self.buffer_STAY) > stay_sample_count \
                and len(self.buffer_FORWARD) > forward_sample_count:
            samplesBack, _ = self.buffer_BACK.sample(back_sample_count)
            samplesStay, _ = self.buffer_STAY.sample(stay_sample_count)
            samplesForward, _ = self.buffer_FORWARD.sample(forward_sample_count)

            self.updateActionNet(samplesBack, self.back_q_net,
                                 self.back_target_q_net, self.optimizerBack,
                                 self.back_values_baseline)
            self.updateActionNet(samplesStay, self.stay_q_net,
                                 self.stay_target_q_net, self.optimizerStay,
                                 self.stay_values_baseline)
            self.updateActionNet(samplesForward, self.forward_q_net,
                                 self.forward_target_q_net,
                                 self.optimizerForward,
                                 self.forward_values_baseline)

            # The shared network trains on the ratio-balanced union.
            samples = samplesBack + samplesStay + samplesForward
            self.updateNetwork(samples)
class DDPG():
    """DDPG agent acting on an ``(obs, local_goal)`` observation pair.

    Observations, goals and actions are divided by fixed scales before they
    reach the networks or the replay buffer.

    NOTE(review): recovered from a whitespace-collapsed source; nesting
    (notably inside ``train``) was reconstructed from the statement order.
    """

    # Divisors applied to raw environment values before the networks see them.
    OBS_SCALE = 4.0
    GOAL_SCALE = 20.0
    ACTION_SCALE = (20, 1)

    def __init__(self, args, env = None):
        """Create the actor/critic pairs, their optimizers and the buffer.

        Args:
            args: configuration; this class reads ``lr``, ``capacity``,
                ``update_iteration``, ``batch_size``, ``gamma``, ``tau``,
                ``max_episode``, ``max_length_trajectory`` and ``print_log``.
            env: environment whose ``reset`` returns ``(obs, local_goal)``
                and whose ``step`` returns ``(obs, local_goal, done, reward)``.
        """
        self.args = args

        # actor
        self.actor = DeterministicPolicy(128).to(device)
        self.actor_target = DeterministicPolicy(128).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        # critics
        self.critic = QNetwork(128).to(device)
        self.critic_target = QNetwork(128).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        self.action_scale = torch.FloatTensor([list(self.ACTION_SCALE)]).to(device)
        self.env = env

    def _soft_update(self, source, target):
        """Polyak-average the parameters of ``source`` into ``target`` in place."""
        tau = self.args.tau
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data
                                    + (1 - tau) * target_param.data)

    def update(self):
        """Run ``args.update_iteration`` critic and actor updates."""
        for _ in range(self.args.update_iteration):
            # sample from replay buffer (values are already scaled)
            obs, local_goal, next_obs, next_goal, action, reward, done = \
                self.replay_buffer.sample(self.args.batch_size)
            obs = torch.FloatTensor(obs).to(device)
            local_goal = torch.FloatTensor(local_goal).to(device)
            next_obs = torch.FloatTensor(next_obs).to(device)
            next_goal = torch.FloatTensor(next_goal).to(device)
            action = torch.FloatTensor(action).to(device)
            reward = torch.FloatTensor(reward).to(device)
            done = torch.FloatTensor(done).to(device)

            # TD target; no graph is needed through the target networks.
            with torch.no_grad():
                next_action, _ = self.actor_target.sample(next_obs, next_goal)
                target_Q = self.critic_target(next_obs, next_goal,
                                              next_action / self.action_scale)
                target_Q = reward + (1 - done) * self.args.gamma * target_Q

            # critic regression onto the TD target
            current_Q = self.critic(obs, local_goal, action)
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # actor ascends the critic's value of its own (scaled) action
            actor_action, _ = self.actor.sample(obs, local_goal)
            actor_loss = -self.critic(obs, local_goal,
                                      actor_action / self.action_scale).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target models
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        """Collect experience for ``args.max_episode`` episodes and learn.

        NOTE(review): the reconstructed nesting runs ``update`` once per
        episode (once the buffer is 20% full) and saves once after the last
        episode; confirm this matches the intended schedule.
        """
        action_scale = np.array(self.ACTION_SCALE)
        for i in range(self.args.max_episode):
            obs, local_goal = self.env.reset()
            ep_r = 0
            for t in count():
                # Feed the actor the same scaled inputs that ``update`` and
                # ``evaluate`` use; the original passed raw values here.
                action, _ = self.actor.sample(
                    torch.FloatTensor(obs / self.OBS_SCALE).to(device),
                    torch.FloatTensor(local_goal / self.GOAL_SCALE).to(device))
                action = action.cpu().detach().numpy()[0]
                next_obs, next_goal, done, reward = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                # ``np.float`` was removed from NumPy; the builtin is equivalent.
                self.replay_buffer.push((obs / self.OBS_SCALE,
                                         local_goal / self.GOAL_SCALE,
                                         next_obs / self.OBS_SCALE,
                                         next_goal / self.GOAL_SCALE,
                                         action / action_scale,
                                         reward,
                                         float(done)))
                obs = next_obs
                local_goal = next_goal

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity * 0.2:
                self.update()
        self.save()

    def evaluate(self, number = 1, render = True):
        """Roll out ``number`` episodes with ``predict`` and print statistics.

        Args:
            number: how many evaluation episodes to run.
            render: when true, render the environment after every step.
        """
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            obs, local_goal = self.env.reset()
            while not done:
                action = self.predict(obs / self.OBS_SCALE,
                                      local_goal / self.GOAL_SCALE)
                obs, local_goal, done, reward = self.env.step(action)
                if render:
                    self.env.render()
                total_rews += reward
                time_step += 1
                # Cap the episode length so evaluation always terminates.
                if time_step > self.args.max_length_trajectory:
                    break
            rewards.append(total_rews)
        rewards = np.array(rewards)
        print("mean reward {}, max reward {}, min reward {}".format(rewards.mean(), rewards.max(), rewards.min()))

    def predict(self, obs, local_goal):
        """Return the actor's forward output for one scaled observation.

        Args:
            obs: scaled observation, convertible by ``torch.FloatTensor``.
            local_goal: scaled goal, convertible by ``torch.FloatTensor``.

        Returns:
            NumPy array holding the first row of the actor's output.
        """
        with torch.no_grad():
            action = self.actor.forward(torch.FloatTensor(obs).to(device),
                                        torch.FloatTensor(local_goal).to(device))
        return action.cpu().detach().numpy()[0]

    def load(self, episode = None):
        """Restore all four networks from ``weights/DDPG.pt``.

        Args:
            episode: unused; kept for interface compatibility.
        """
        file_name = "weights/DDPG.pt"
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        # The original loaded these weights into ``self.critic``, overwriting
        # the critic and leaving the target critic at its initial weights.
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode = None):
        """Write all four network state dicts to ``weights/DDPG.pt``.

        Args:
            episode: unused; kept for interface compatibility.
        """
        import os

        file_name = "weights/DDPG.pt"
        # Create the target directory so a fresh checkout can save.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        torch.save({'actor' : self.actor.state_dict(),
                    'critic' : self.critic.state_dict(),
                    'actor_target' : self.actor_target.state_dict(),
                    'critic_target' : self.critic_target.state_dict()},
                   file_name)
        print("save model to " + file_name)
class DDPG_Agent(Agent):
    """Interacts with and learns from the environment.

    NOTE(review): recovered from a whitespace-collapsed source; nesting was
    reconstructed from the statement order.
    """
    policy_type = "DDPG"

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        # ``random.seed`` returns None, so keep the seed value itself.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size,
                                      random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size,
                                       random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size,
                                         random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Statistics
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (np.ndarray): current state
            add_noise (bool): add a sample of the noise process to the action

        Returns the action clipped to [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        # Evaluate in eval mode, then restore train mode for learning.
        self.actor_local.eval()
        action = self.actor_local.select_action(state)
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Besides the parameter updates, every call appends the post-update
        target-network losses to ``saveDDPG_critic-actor_loss.csv`` and
        records the training losses through ``save_stats``.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic update, so it is built
        # without a graph; this keeps gradients out of the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # --------------------------- for the plot ----------------------------- #
        with torch.no_grad():
            actions_pred_target = self.actor_target(states)
            actor_loss_target = -self.critic_target(
                states, actions_pred_target).mean()
            Q_expected_target = self.critic_target(states, actions)
            critic_loss_target = F.mse_loss(Q_expected_target, Q_targets)

        with open("saveDDPG_critic-actor_loss.csv", "a") as log_file:
            log_file.write(str(critic_loss_target.item()) + "," +
                           str(actor_loss_target.item()) + "\n")

        self.save_stats(actor_loss=actor_loss.item(),
                        critic_loss=critic_loss.item(),
                        reward_sum=rewards.sum().item())

    def store_policy(self, env_name, score):
        """Script the target actor with TorchScript and save it to disk.

        Params
        ======
            env_name: environment identifier used in the file name
            score: score used in the file name
        """
        scripted = torch.jit.script(self.actor_target)
        torch.jit.save(
            scripted, "data/policies/" + "DDPGAgent" + str(env_name) + "#" +
            str(score) + ".zip")

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class BaseAgent:
    """Common scaffolding for value-based agents trained from a replay buffer.

    The constructor wires together a value network, its optimizer, a replay
    buffer (prioritized when ``params['buffer'] == 'per'``), a target network
    and the action-selection policy returned by ``createEpsilonGreedy``.
    Subclasses provide ``updateNetwork``, ``forward`` and ``bootstrap``.
    """

    def __init__(self, features: int, actions: int, params: Dict, seed: int,
                 collector: Collector):
        """Build networks, optimizer, buffer and policy from ``params``.

        Required keys of ``params``: ``gamma``, ``optimizer``, ``buffer_size``
        and ``batch`` (plus ``prioritization`` when ``buffer`` is ``'per'``).
        Optional keys and their defaults: ``epsilon`` (0), ``omega`` (1.0),
        ``buffer`` (``'standard'``) and ``target_refresh`` (1).
        """
        self.features = features
        self.actions = actions
        self.params = params
        self.collector = collector
        self.seed = seed

        # parameter contract
        self.gamma = params['gamma']
        self.epsilon = params.get('epsilon', 0)
        # the mellowmax parameter
        self.omega = params.get('omega', 1.0)

        # network estimating Q(s, a)
        self.value_net = Network(features, actions, params, seed).to(device)

        # optimizer over the value network's parameters
        self.optimizer_params = params['optimizer']
        self.optimizer = deserializeOptimizer(self.value_net.parameters(),
                                              self.optimizer_params)

        self.steps = 0

        # replay buffer: prioritized only when explicitly requested
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch']
        self.buffer_type = params.get('buffer', 'standard')
        if self.buffer_type == 'per':
            priority_cfg = params['prioritization']
            self.buffer = PrioritizedReplayMemory(self.buffer_size,
                                                  priority_cfg)
        else:
            self.buffer = ReplayBuffer(self.buffer_size)

        # target network
        self.target_refresh = params.get('target_refresh', 1)
        self.target_net = copy.deepcopy(self.value_net)
        self.initializeTargetNet()

        def getValues(x: torch.Tensor):
            # Q-values as a numpy array with the leading axis squeezed out
            q_tensor = self.values(x)
            return q_tensor.detach().cpu().squeeze(0).numpy()

        self.policy = createEpsilonGreedy(seed, self.epsilon, getValues)

    def values(self, x):
        """Return the first output of the value network for ``x``."""
        outputs = self.value_net(x)
        return outputs[0]

    def selectAction(self, x):
        """Sample an action for ``x`` according to the agent's policy."""
        return self.policy.selectAction(x)

    def initializeTargetNet(self):
        """Create the target network, or alias it to the value network.

        With ``target_refresh <= 1`` target networks are effectively disabled,
        so the value network itself is reused to save compute.
        """
        if self.target_refresh <= 1:
            self.target_net = self.value_net
            return

        self.target_net = copy.deepcopy(self.value_net)
        cloneNetworkWeights(self.value_net, self.target_net)

    @abstractmethod
    def updateNetwork(self, batch: Batch, predictions: Dict):
        """Apply one learning step; ``update`` forwards the result as priorities."""

    @abstractmethod
    def forward(self, batch: Batch) -> Dict[str, torch.Tensor]:
        """Compute the predictions that ``updateNetwork`` consumes."""

    @abstractmethod
    def bootstrap(self, batch: Batch,
                  next_values: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Subclass hook taking a batch and next-state values."""

    def combineTargetGrads(self):
        """Add target-net gradients into the value net when the two differ.

        Lets callers skip gradient combination whenever target networks are
        disabled (i.e. both attributes refer to the same network).
        """
        if not (self.target_net == self.value_net):
            addGradients_(self.value_net, self.target_net)

    def update(self, s, a, sp, r, gamma):
        """Store one transition and, once enough are buffered, learn from a batch."""
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # periodic hard refresh of the target network
        if self.steps % self.target_refresh == 0 and self.target_refresh > 1:
            cloneNetworkWeights(self.value_net, self.target_net)

        # wait until the buffer holds more than one mini-batch
        if len(self.buffer) <= self.batch_size + 1:
            return

        transitions, indices = self.buffer.sample(self.batch_size)
        columns = getBatchColumns(transitions)
        outputs = self.forward(columns)
        td_errors = self.updateNetwork(columns, outputs)
        self.buffer.update_priorities(indices, td_errors)
class BaseAgent:
    """Base class for a DQN-style agent with a policy net and a target net.

    Transitions are stored in a FIFO replay buffer; every ``target_refresh``
    steps the policy network's weights are cloned into the target network.
    Subclasses implement ``updateNetwork`` to perform the learning step.
    """

    # Mini-batch size used for each replay update; override in a subclass to
    # change it.
    BATCH_SIZE = 32

    def __init__(self, features, actions, params):
        """Build both networks, the optimizer and the replay buffer.

        ``params`` must provide ``alpha`` (learning rate), ``epsilon``,
        ``target_refresh``, ``buffer_size``, ``h1`` and ``h2``.
        """
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']
        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # NOTE(review): `env` is not a parameter; this relies on a
        # module-level `env` being defined before construction — confirm.
        self.actionCounter = np.zeros(
            (env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        """Return an epsilon-greedy action for state ``x`` as a tensor."""
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            a = np.random.randint(self.actions)
            return torch.tensor(a, device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        return q_s.argmax().detach()

    def updateNetwork(self, samples):
        """Learning step on a sampled mini-batch; a no-op in the base class."""
        pass

    def update(self, s, a, r, sp, gamma):
        """Record a transition, refresh the target net, and maybe learn.

        ``s`` and ``a`` are tensors; ``s[0][0]`` and ``s[0][1]`` are used as
        the first two indices of ``actionCounter`` and ``a`` as the third.
        """
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1

        # selectAction creates tensors on `device`; Tensor.numpy() only works
        # for CPU tensors that do not require grad, so move/detach first.
        a = a.detach().cpu().numpy()
        s = s.detach().cpu().numpy()
        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > self.BATCH_SIZE:
            samples, _ = self.buffer.sample(self.BATCH_SIZE)
            self.updateNetwork(samples)
class DDPG(algorithms):
    """Deep Deterministic Policy Gradient agent (PyTorch).

    Holds an actor/critic pair with soft-updated target copies, a replay
    buffer, and the training / evaluation / checkpoint routines. The
    environment, hyper-parameters (``self.args``), ``weights_file`` and
    ``log_file`` are provided by the ``algorithms`` base class.
    """

    def __init__(self, args):
        """Build networks, optimizers and buffer; resume if ``last_episode > 0``."""
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(
            state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)

    def update(self):
        """Run ``args.update_iteration`` critic/actor updates from replay."""
        for it in range(self.args.update_iteration):
            # sample from replay buffer (same field order as pushed in train())
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # compute the target Q value; detached so no gradient reaches
            # the target networks
            next_action, _, _ = self.actor_target.sample(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (
                (1 - done) * self.args.gamma * target_Q).detach()

            # get current Q estimate
            current_Q = self.critic(state, action)

            # compute critic loss and update
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute actor loss: maximize the critic's value of the
            # actor's own actions
            actor_action, _, _ = self.actor.sample(state)
            actor_loss = -self.critic(state, actor_action).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target models toward the online models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)
            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        """Collect experience for ``args.max_episode`` episodes and learn."""
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]

                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                # `np.float` was removed from NumPy; the builtin float is
                # the documented replacement.
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            # start learning only once the buffer is (almost) full
            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()

        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        """Roll out ``number`` episodes with the actor's mean action.

        When ``render`` is false the ``(global_steps, rewards)`` pair is
        pickled to ``self.log_file``; in both cases a summary is printed.
        """
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            state = self.env.reset()
            while not done:
                with torch.no_grad():
                    # use the mean action
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)

        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        print("mean reward {}, max reward {}".format(rewards.mean(),
                                                     rewards.max()))

    def load(self, episode=None):
        """Restore all four networks from the checkpoint for ``episode``."""
        file_name = self.weights_file(episode)
        # map_location lets a checkpoint saved on another device be loaded here
        checkpoint = torch.load(file_name, map_location=device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        # Previously the target weights were loaded into `self.critic`,
        # clobbering the critic and leaving the target critic unrestored.
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode=None):
        """Write all four networks' state dicts to the checkpoint file."""
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("save model to " + file_name)
class DDPG:
    """DDPG agent built as a TensorFlow 1.x static graph.

    The graph contains an actor and a critic plus a target copy of each;
    batch normalization is optional on the input and on hidden layers.
    Public entry points are ``learn``, ``act``, ``perceive``, ``log_scalar``,
    ``save`` and ``close``.
    """

    CRITIC_NAME = "critic"
    TARGET_CRITIC_NAME = "target_critic"
    ACTOR_NAME = "actor"
    TARGET_ACTOR_NAME = "target_actor"

    def __init__(self,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 actor_learning_rate=1e-5,
                 critic_learning_rate=1e-3,
                 critic_target_update_rate=1e-3,
                 actor_target_update_rate=1e-3,
                 discount=0.99,
                 l2_decay=1e-2,
                 buffer_size=1000000,
                 batch_size=64,
                 detail_summary=False,
                 tanh_action=True,
                 input_batch_norm=True,
                 all_batch_norm=True,
                 log_frequency=10):
        """Store hyper-parameters, build the graph and start a session."""
        # problem dimensions
        self.state_dim = state_dim
        self.action_dim = action_dim

        # optimization hyper-parameters
        self.critic_learning_rate = critic_learning_rate
        self.actor_learning_rate = actor_learning_rate
        self.critic_target_update_rate = critic_target_update_rate
        self.actor_target_update_rate = actor_target_update_rate
        self.discount = discount
        self.batch_size = batch_size
        self.l2_decay = l2_decay
        self.buffer_size = buffer_size

        # logging / architecture switches
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detail_summary = detail_summary
        self.tanh_action = tanh_action
        self.input_batch_norm = input_batch_norm
        self.all_batch_norm = all_batch_norm
        self.log_frequency = log_frequency

        self.step = 0
        self.solved = False

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.__build()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        logged_params = {
            "actor learning rate": self.actor_learning_rate,
            "critic learning rate": self.critic_learning_rate,
            "batch size": self.batch_size,
            "actor update rate": self.actor_target_update_rate,
            "critic update rate": self.critic_target_update_rate,
            "buffer size": self.buffer_size,
        }
        utils.log_params(self.summary_dir, logged_params)

        self.saver = tf.train.Saver(max_to_keep=None)
        init_op = tf.global_variables_initializer()
        self.session = tf.Session()
        self.merged = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)
        self.session.run(init_op)

    # ------------------------------ public ------------------------------ #

    def learn(self):
        """Sample one batch, train critic then actor, then update targets."""
        batch = self.buffer.sample(self.batch_size)
        states = batch["states"]

        self.__train_critic(states, batch["actions"], batch["rewards"],
                            batch["next_states"], batch["done"])
        self.__train_actor(states)

        post_train_ops = [
            self.target_critic_update, self.target_actor_update,
            self.inc_global_step
        ]
        self.session.run(post_train_ops)

    def act(self, state):
        """Return the first action the actor produces for ``state``."""
        feed = {self.state_input: state, self.is_training: False}
        batch_actions = self.session.run(self.action, feed_dict=feed)
        return batch_actions[0]

    def perceive(self, transition):
        """Append ``transition`` to the replay buffer."""
        self.buffer.add(transition)

    def log_scalar(self, name, value, index):
        """Write a single scalar summary tagged ``name`` at step ``index``."""
        entry = summary_pb2.Summary.Value(tag=name, simple_value=value)
        scalar_summary = summary_pb2.Summary(value=[entry])
        self.summary_writer.add_summary(scalar_summary, global_step=index)

    def save(self):
        """Checkpoint the session, tagged with the current global step."""
        current_step = self.session.run(self.global_step)
        self.saver.save(self.session,
                        self.summary_dir,
                        global_step=current_step)

    def close(self):
        """Close the TensorFlow session."""
        self.session.close()

    # ------------------------------ private ----------------------------- #

    def __build_critic(self, name, state_input, action_input):
        """Create a critic under scope ``name``.

        Returns the Q-value output tensor and the L2 weight-decay term over
        all of this critic's weights and biases.
        """
        # the target critic always runs batch norm in inference mode
        if name == self.TARGET_CRITIC_NAME:
            bn_training = False
        else:
            bn_training = self.is_training

        with tf.variable_scope(name):
            # weights and biases (creation order is kept as-is)
            w1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            bias1 = self.__get_weights((400, ), self.state_dim, name="b1")
            w2 = self.__get_weights((400, 300),
                                    400 + self.action_dim,
                                    name="W2")
            bias2 = self.__get_weights((300, ),
                                       400 + self.action_dim,
                                       name="b2")
            w2_action = self.__get_weights((self.action_dim, 300),
                                           400 + self.action_dim,
                                           name="W2_action")
            w3 = tf.Variable(tf.random_uniform((300, 1), -3e-3, 3e-3),
                             name="W3")
            bias3 = tf.Variable(tf.random_uniform((1, ), -3e-3, 3e-3),
                                name="b3")

            # layers: the action joins the network at the second layer
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            hidden_1 = tf.matmul(state_input, w1) + bias1
            if self.all_batch_norm:
                hidden_1 = tf.layers.batch_normalization(
                    hidden_1, training=bn_training)
            hidden_1 = tf.nn.relu(hidden_1)

            pre_activation_2 = (tf.matmul(hidden_1, w2) +
                                tf.matmul(action_input, w2_action) + bias2)
            hidden_2 = tf.nn.relu(pre_activation_2)

            q_out = tf.matmul(hidden_2, w3) + bias3

            # summary (only for the online critic)
            if name == self.CRITIC_NAME:
                self.critic_summaries = [
                    tf.summary.histogram("W1", w1),
                    tf.summary.histogram("b1", bias1),
                    tf.summary.histogram("W2", w2),
                    tf.summary.histogram("b2", bias2),
                    tf.summary.histogram("W2_action", w2_action),
                    tf.summary.histogram("W3", w3),
                    tf.summary.histogram("b3", bias3),
                    tf.summary.histogram("layer_1", hidden_1),
                    tf.summary.histogram("layer_2", hidden_2),
                    tf.summary.histogram("output_layer", q_out)
                ]

            # weight decay
            decayed = [w1, bias1, w2, bias2, w2_action, w3, bias3]
            weight_decay = tf.add_n(
                [self.l2_decay * tf.nn.l2_loss(var) for var in decayed])

            return q_out, weight_decay

    def __build_actor(self, name, state_input):
        """Create an actor under scope ``name`` and return its action tensor."""
        # the target actor always runs batch norm in inference mode
        if name == self.TARGET_ACTOR_NAME:
            bn_training = False
        else:
            bn_training = self.is_training

        with tf.variable_scope(name):
            # weights and biases (creation order is kept as-is)
            w1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            bias1 = self.__get_weights((400, ), self.state_dim, name="b1")
            w2 = self.__get_weights((400, 300), 400, name="W2")
            bias2 = self.__get_weights((300, ), 400, name="b2")
            w3 = tf.Variable(tf.random_uniform((300, self.action_dim),
                                               minval=-3e-3,
                                               maxval=3e-3),
                             name="W3")
            bias3 = tf.Variable(tf.random_uniform((self.action_dim, ), -3e-3,
                                                  3e-3),
                                name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            hidden_1 = tf.matmul(state_input, w1) + bias1
            if self.all_batch_norm:
                hidden_1 = tf.layers.batch_normalization(
                    hidden_1, training=bn_training)
            hidden_1 = tf.nn.relu(hidden_1)

            hidden_2 = tf.matmul(hidden_1, w2) + bias2
            if self.all_batch_norm:
                hidden_2 = tf.layers.batch_normalization(
                    hidden_2, training=bn_training)
            hidden_2 = tf.nn.relu(hidden_2)

            raw_action = tf.matmul(hidden_2, w3) + bias3

            # summary (only for the online actor)
            if name == self.ACTOR_NAME:
                self.actor_summaries = [
                    tf.summary.histogram("W1", w1),
                    tf.summary.histogram("b1", bias1),
                    tf.summary.histogram("W2", w2),
                    tf.summary.histogram("b2", bias2),
                    tf.summary.histogram("W3", w3),
                    tf.summary.histogram("b3", bias3),
                    tf.summary.histogram("layer_1", hidden_1),
                    tf.summary.histogram("layer_2", hidden_2),
                    tf.summary.histogram("output_layer", raw_action)
                ]

            if not self.tanh_action:
                return raw_action
            return tf.nn.tanh(raw_action)

    def __build(self):
        """Assemble placeholders, networks, losses, train ops and summaries."""
        # ---- placeholders ----
        self.state_input = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim),
                                          name="state_input")
        self.next_state_input = tf.placeholder(tf.float32,
                                               shape=(None, self.state_dim),
                                               name="next_state_input")
        self.action_input = tf.placeholder(tf.float32,
                                           shape=(None, self.action_dim),
                                           name="action_input")
        self.reward_input = tf.placeholder(tf.float32,
                                           shape=(None, ),
                                           name="reward_input")
        self.done_input = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name="done_input")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # inputs summary
        if self.detail_summary:
            self.input_summaries = [
                tf.summary.histogram("state", self.state_input),
                tf.summary.histogram("next_state", self.next_state_input),
                tf.summary.histogram("action", self.action_input),
                tf.summary.histogram("reward", self.reward_input),
                tf.summary.histogram("done", self.done_input)
            ]

        # ---- critic side ----
        # the target actor feeds the target critic with next-state actions
        self.target_action = self.__build_actor(self.TARGET_ACTOR_NAME,
                                                self.next_state_input)
        self.q_value, weight_decay = self.__build_critic(
            self.CRITIC_NAME, self.state_input, self.action_input)
        self.target_q_value, _ = self.__build_critic(
            self.TARGET_CRITIC_NAME, self.next_state_input,
            self.target_action)

        self.tmp = tf.expand_dims(self.reward_input, 1)
        self.targets = tf.expand_dims(self.reward_input, 1) + self.discount * (
            1 - tf.expand_dims(self.done_input, 1)) * self.target_q_value
        self.diff = self.targets - self.q_value

        # no gradient flows into the targets
        self.loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.targets) -
                      self.q_value)) + weight_decay
        self.loss_summary = tf.summary.scalar("critic_loss", self.loss)

        critic_optimizer = tf.train.AdamOptimizer(self.critic_learning_rate)
        self.critic_train_op = critic_optimizer.minimize(self.loss)

        # add critic batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            critic_bn_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                                  scope=self.CRITIC_NAME)
            self.critic_bn_update_op = tf.group(*critic_bn_updates)
            self.critic_train_op = tf.group(self.critic_train_op,
                                            self.critic_bn_update_op)

        # ---- actor side ----
        self.action = self.__build_actor(self.ACTOR_NAME, self.state_input)
        self.actor_params = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.ACTOR_NAME)

        # gradient of the action w.r.t. actor parameters, weighted by the
        # negated gradient of Q w.r.t. the action input
        self.action_gradients = tf.gradients(self.q_value,
                                             self.action_input)[0]
        self.actor_params_gradient = tf.gradients(self.action,
                                                  self.actor_params,
                                                  -self.action_gradients)

        # actor gradients summary
        if self.detail_summary:
            self.actor_summaries.append(
                tf.summary.histogram("action_gradient",
                                     self.action_gradients))
            for grad in self.actor_params_gradient:
                self.actor_summaries.append(
                    tf.summary.histogram("actor_parameter_gradients", grad))

        actor_optimizer = tf.train.AdamOptimizer(self.actor_learning_rate)
        self.actor_train_op = actor_optimizer.apply_gradients(
            zip(self.actor_params_gradient, self.actor_params))

        # add actor batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            actor_bn_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                                 scope=self.ACTOR_NAME)
            self.actor_bn_update_op = tf.group(*actor_bn_updates)
            self.actor_train_op = tf.group(self.actor_train_op,
                                           self.actor_bn_update_op)

        # ---- target updates and step counter ----
        self.target_critic_update = architect.create_target_update_ops(
            self.CRITIC_NAME, self.TARGET_CRITIC_NAME,
            self.critic_target_update_rate)
        self.target_actor_update = architect.create_target_update_ops(
            self.ACTOR_NAME, self.TARGET_ACTOR_NAME,
            self.actor_target_update_rate)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        # group summaries
        self.critic_summaries = tf.summary.merge(self.critic_summaries)
        if self.detail_summary:
            self.actor_summaries = tf.summary.merge(self.actor_summaries)
            self.input_summaries = tf.summary.merge(self.input_summaries)

    @staticmethod
    def __get_weights(shape, input_shape, name="var"):
        """Variable drawn uniformly from +/- 1/sqrt(``input_shape``)."""
        limit = 1 / math.sqrt(input_shape)
        initial = tf.random_uniform(shape, -limit, limit)
        return tf.Variable(initial, name=name)

    def __train_actor(self, states):
        """Compute the actor's actions for ``states`` and run its train op."""
        policy_actions = self.session.run(self.action,
                                          feed_dict={
                                              self.state_input: states,
                                              self.is_training: True
                                          })

        actor_feed = {
            self.state_input: states,
            self.action_input: policy_actions,
            self.is_training: True
        }
        self.session.run(self.actor_train_op, feed_dict=actor_feed)

    def __train_critic(self, states, actions, rewards, next_states, done):
        """Run one critic step; write summaries every ``log_frequency`` steps."""
        feed_dict = {
            self.state_input: states,
            self.action_input: actions,
            self.reward_input: rewards,
            self.next_state_input: next_states,
            self.done_input: done,
            self.is_training: True
        }

        step = self.session.run(self.global_step)

        if step % self.log_frequency != 0:
            # ordinary step: train without fetching any summaries
            self.session.run(self.critic_train_op, feed_dict=feed_dict)
            return

        fetches = [self.critic_train_op, self.loss_summary]
        if self.detail_summary:
            fetches += [self.actor_summaries, self.input_summaries]

        results = self.session.run(fetches, feed_dict=feed_dict)

        # results[0] belongs to the train op; everything after is a summary
        for summary in results[1:len(fetches)]:
            self.summary_writer.add_summary(summary, global_step=step)
class Agents():
    """Group of per-agent Q-networks mixed by a hyper-network.

    Holds one ``Q_net`` per agent, a ``HyperNet`` that combines their
    Q-values, target copies of both, a replay buffer, a preference pool and a
    single Adam optimizer (with a step learning-rate schedule) over all
    online parameters.
    """

    def __init__(self, args):
        """Build networks, targets, buffers and the optimizer from ``args``."""
        self.args = args
        self.policy = [Q_net(args) for _ in range(args.n_agents)]
        self.hyperNet = HyperNet(args)
        self.policy_target = [copy.deepcopy(p) for p in self.policy]
        self.hyperNet_target = copy.deepcopy(self.hyperNet)
        self.replayBuffer = ReplayBuffer(args)
        self.preference_pool = Preference(args)

        # one optimizer over every agent network plus the hyper-network
        policy_param = [policy.parameters() for policy in self.policy]
        self.optim = torch.optim.Adam(itertools.chain(
            *policy_param, self.hyperNet.parameters()),
                                      lr=self.args.learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optim,
                                                            step_size=100,
                                                            gamma=0.9,
                                                            last_epoch=-1)
        self.step = 0

    def choose_action(self, obs, preference, epsilon):
        """Query every agent's policy and return the stacked actions.

        ``obs`` and ``preference`` have their first two axes swapped so the
        agent axis comes first; the stacked result is transposed with
        ``(2, 0, 1)`` before being returned.
        """
        obs = np.array(obs).transpose((1, 0, 2))
        preference = np.array(preference).transpose((1, 0, 2))
        act = np.array([
            self.policy[i].choose_action(obs[i], preference[i], epsilon)
            for i in range(self.args.n_agents)
        ])
        return act.transpose((2, 0, 1))

    def learn(self):
        """Run one optimization step on a sampled batch and preference set."""

        def combine(obs, pref):
            # Pair one observation with every preference vector; the result
            # gains a leading axis of size 1.
            rows = [
                torch.cat([obs, pref[k]]).unsqueeze(0).to(self.args.device)
                for k in range(len(pref))
            ]
            return torch.cat(rows, dim=0).unsqueeze(0)

        sample = self.replayBuffer.sample(self.args.batch_size)
        batch_w = self.preference_pool.sample(self.args.batch_size_p,
                                              train=True)
        obs = sample["obs"]
        obs_ = sample["next_obs"]
        act = sample["act"]
        rew = sample["rew"]
        state = sample["state"]
        state_ = sample["next_state"]

        # Next-observation inputs for the target Q-values. The per-sample
        # tensor does not depend on the preference index, so it is built once
        # and repeated batch_size_p times rather than rebuilt each time.
        Q_ = []
        for i in range(self.args.batch_size):
            per_sample = torch.cat([
                combine(obs_[a][i], batch_w[a])
                for a in range(self.args.n_agents)
            ],
                                   dim=0).unsqueeze(0)
            Q_.append(
                torch.cat([per_sample] * self.args.batch_size_p, dim=0))
        Q_ = torch.cat(Q_, dim=0).permute(1, 0, 2, 3)

        # NOTE(review): target Q-values are requested from `self.policy`,
        # not `self.policy_target` — presumably Q_net handles its own
        # target internally; confirm.
        Q_ = torch.cat([
            self.policy[a].get_target_q(Q_[a], batch_w[a][0]).unsqueeze(0)
            for a in range(self.args.n_agents)
        ],
                       dim=0)
        # NOTE(review): the literal 3 looks like a per-agent output width —
        # confirm whether it should come from args.
        Q_ = Q_.squeeze(-1).permute(2, 0, 1).view(-1, self.args.n_agents * 3)

        # tile observations once per sampled preference
        obs = [
            torch.cat([obs[i] for _ in range(self.args.batch_size_p)])
            for i in range(self.args.n_agents)
        ]
        w = copy.deepcopy(batch_w[0])
        batch_w = [
            batch_w[i].data.cpu().numpy().repeat(self.args.batch_size, axis=0)
            for i in range(self.args.n_agents)
        ]
        Q = torch.cat([
            self.policy[i].get_q(obs[i], batch_w[i], act[i])
            for i in range(self.args.n_agents)
        ],
                      dim=-1)

        Q_tot = self.hyperNet.get_Q_tot(state, w, Q)
        Q_tot_target = self.hyperNet_target.get_Q_tot(state_, w, Q_).detach()
        rew = rew.unsqueeze(0).repeat([self.args.batch_size_p, 1,
                                       1]).view(-1, self.args.n_obj)

        loss = self.loss_func(Q_tot, Q_tot_target, rew, w)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        self.lr_scheduler.step()

    def loss_func(self, Q, Q_target, R, w):
        """Return the mean L2 norm of ``(R + Q_target) - Q``.

        A second, preference-weighted term ``Lb`` is still computed but is
        not part of the returned loss.
        """
        R = self.convert_type(R)
        w = self.convert_type(w)
        y = R + Q_target
        w = w.repeat([self.args.batch_size, 1]).view(-1, self.args.n_obj)

        La = torch.norm(y - Q, p=2, dim=-1).mean()

        # Kept so the alternative objective `La + Lb` stays one edit away.
        wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))
        wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))
        Lb = torch.abs(wy - wq).mean()

        loss = La
        return loss

    def push(self, traj):
        """Store one trajectory dict in the replay buffer."""
        self.replayBuffer.push(traj["obs"], traj["acts"], traj["rew"],
                               traj["next_obs"], traj["done"], traj["state"],
                               traj["next_state"], traj["pref"])

    def update_target(self):
        """Count a step and hard-copy online weights to targets every 1000."""
        self.step += 1
        if self.step % 1000 == 0:
            print("updating target nets")
            self.hyperNet_target.load_state_dict(self.hyperNet.state_dict())
            for online, target in zip(self.policy, self.policy_target):
                target.load_state_dict(online.state_dict())

    def convert_type(self, input):
        """Return ``input`` as a ``torch.Tensor`` on ``args.device``."""
        if not isinstance(input, torch.Tensor):
            input = torch.Tensor(input)
        if input.device != torch.device(self.args.device):
            input = input.to(self.args.device)
        return input

    def save_model(self, ep, path='./model/MOQMIX/'):
        """Save every network, the optimizer, the scheduler and ``ep``.

        The checkpoint is written to ``path + "model.pth"``.
        """
        print("saving model")
        state = {}
        for i in range(len(self.policy)):
            state['policy{0}'.format(i)] = self.policy[i].state_dict()
            state['target_policy{0}'.format(
                i)] = self.policy_target[i].state_dict()
        state['hyperNet'] = self.hyperNet.state_dict()
        state['target_hyperNet'] = self.hyperNet_target.state_dict()
        state['optim'] = self.optim.state_dict()
        state['lr_scheduler'] = self.lr_scheduler.state_dict()
        state['epoch'] = ep
        torch.save(state, path + "model.pth")

    def load_model(self, path='./model/MOQMIX', device='cpu'):
        """Restore a checkpoint written by ``save_model``; return its epoch.

        ``path + "model.pth"`` is tried first. If that file does not exist,
        ``path`` is treated as a directory, so the default here resolves to
        the file that ``save_model`` writes by default.
        """
        import os

        file_path = path + "model.pth"
        if not os.path.isfile(file_path):
            file_path = os.path.join(path, "model.pth")

        state = torch.load(file_path, map_location=device)
        for i in range(len(self.policy)):
            self.policy[i].load_state_dict(state['policy{0}'.format(i)])
            self.policy_target[i].load_state_dict(
                state['target_policy{0}'.format(i)])

        # The checkpoint holds state dicts, not objects: load them into the
        # existing modules/optimizer/scheduler instead of replacing those
        # attributes with plain dictionaries.
        self.hyperNet.load_state_dict(state['hyperNet'])
        self.hyperNet_target.load_state_dict(state['target_hyperNet'])
        self.optim.load_state_dict(state['optim'])
        self.lr_scheduler.load_state_dict(state['lr_scheduler'])
        return state['epoch']
class DeepQNetwork:
    """Deep Q-learning agent built as a TensorFlow 1.x static graph.

    Builds an online and a target Q-network through ``network.build``,
    trains on mini-batches from a replay buffer, and keeps the target
    network in sync with either hard (periodic copy) or soft (interpolated)
    updates.
    """

    ACTION_VALUE_NET_NAME = "q-network"
    TARGET_ACTION_VALUE_NET_NAME = "target-q-network"

    def __init__(self,
                 network,
                 prep,
                 exp_policy,
                 state_dim,
                 action_dim,
                 name,
                 learning_rate=1e-3,
                 hard_update_frequency=500,
                 soft_update_rate=None,
                 buffer_size=50000,
                 batch_size=32,
                 num_steps=200000,
                 discount=0.99,
                 use_huber_loss=True,
                 detailed_summary=False,
                 max_reward=200,
                 steps_before_learn=1000,
                 train_freq=1,
                 save_end=True):
        """Store settings, build the graph, open a session, create the buffer.

        Soft target updates are used when ``soft_update_rate`` is not
        ``None``; otherwise the target is hard-copied every
        ``hard_update_frequency`` steps.
        """
        self.network = network
        self.prep = prep
        self.exp_policy = exp_policy
        self.greedy_policy = policy.Greedy()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discount = discount
        self.summary_dir = os.path.join(name, "summary")
        self.use_huber_loss = use_huber_loss
        self.detailed_summary = detailed_summary

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.hard_update_frequency = hard_update_frequency
        self.soft_update_rate = soft_update_rate
        self.num_steps = num_steps
        self.step = 0
        self.steps_before_learn = steps_before_learn
        self.train_freq = train_freq
        self.solved = False
        self.max_reward = max_reward
        # NOTE(review): save_end is stored but never read inside this class;
        # the final save in run_episode is unconditional — confirm intent.
        self.save_end = save_end

        # graph handles, filled in by build_all()
        self.actions = None
        self.rewards = None
        self.done = None
        self.action_q_values = None
        self.max_target_q_values = None
        self.targets = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.states = None
        self.q_values = None
        self.next_states = None
        self.target_q_values = None
        self.target_update = None

        self.build_all()

        self.merged = tf.summary.merge_all()
        self.session = tf.Session()
        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)
        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

    def build_all(self):
        """Create placeholders, both networks, the loss and the train op."""
        self.actions = tf.placeholder(tf.float32, (None, self.action_dim),
                                      name="actions")
        self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards")
        self.done = tf.placeholder(tf.float32, (None, ), name="done")

        self.build_network()
        self.build_target_network()

        if self.soft_update_rate is not None:
            self.create_soft_target_update_op()
        else:
            self.create_hard_target_update_op()

        # Q-value of the taken action: the q-values are multiplied by the
        # `actions` input and summed over the action axis
        self.action_q_values = tf.reduce_sum(self.q_values * self.actions,
                                             axis=1)
        self.max_target_q_values = tf.reduce_max(self.target_q_values, axis=1)

        # one-step TD target; bootstrapping is cut off at terminal states
        self.targets = self.rewards + (1 - self.done) * (
            self.discount * self.max_target_q_values)

        if self.detailed_summary:
            architect.variable_summaries(self.targets, name="targets")

        td_diff = self.action_q_values - tf.stop_gradient(self.targets)

        if self.use_huber_loss:
            loss = tf.reduce_mean(architect.huber_loss(td_diff))
        else:
            loss = tf.reduce_mean(tf.pow(td_diff, 2))

        tf.summary.scalar("loss", loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        self.train_op = tf.train.AdamOptimizer(
            self.learning_rate).minimize(loss)

    def build_network(self):
        """Build the online Q-network and keep its input/output tensors."""
        self.states, self.q_values = self.network.build(
            self.state_dim, self.action_dim, self.ACTION_VALUE_NET_NAME)

    def build_target_network(self):
        """Build the target Q-network and keep its input/output tensors."""
        self.next_states, self.target_q_values = self.network.build(
            self.state_dim, self.action_dim,
            self.TARGET_ACTION_VALUE_NET_NAME)

    def create_soft_target_update_op(self):
        """Create the op that moves target weights toward the online weights."""
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.ACTION_VALUE_NET_NAME)
        target_net_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope=self.TARGET_ACTION_VALUE_NET_NAME)

        self.target_update = []
        for v_source, v_target in zip(net_vars, target_net_vars):
            # this is equivalent to target = (1-alpha) * target + alpha * source
            update_op = v_target.assign_sub(self.soft_update_rate *
                                            (v_target - v_source))
            self.target_update.append(update_op)

        self.target_update = tf.group(*self.target_update)

    def create_hard_target_update_op(self):
        """Create the op that copies the online weights into the target."""
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.ACTION_VALUE_NET_NAME)
        target_net_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope=self.TARGET_ACTION_VALUE_NET_NAME)

        self.target_update = []
        for v_source, v_target in zip(net_vars, target_net_vars):
            update_op = v_target.assign(v_source)
            self.target_update.append(update_op)

        self.target_update = tf.group(*self.target_update)

    def learn(self):
        """Train on one sampled batch, log summaries, update the target."""
        batch = self.buffer.sample(self.batch_size)

        merged, _ = self.session.run(
            [self.merged, self.train_op],
            feed_dict={
                self.states: batch["states"],
                self.actions: batch["actions"],
                self.rewards: batch["rewards"],
                self.next_states: batch["next_states"],
                self.done: batch["done"]
            })
        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update: every call for soft updates, periodically for hard
        if (self.soft_update_rate is not None
                or self.step % self.hard_update_frequency == 0):
            self.session.run(self.target_update)

    def run_episode(self, env):
        """Play one episode, learning along the way.

        Returns ``(total_reward, step)``. After the episode ``self.solved``
        records whether the episode reward reached ``max_reward``; while
        solved, actions are greedy and no learning happens.
        """
        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play
            if skip:
                action = env.action_space.sample()
            else:
                q_values = self.session.run(self.q_values,
                                            feed_dict={self.states: state})[0]
                if self.solved:
                    action = self.greedy_policy.select_action(q_values)
                else:
                    action = self.exp_policy.select_action(q_values)

            action_one_hot = np.zeros(self.action_dim)
            action_one_hot[action] = 1

            tmp_state = state
            tmp_skip = skip

            state, reward, done, info = env.step(action)
            state, skip = self.prep.process(state)

            total_reward += reward

            # Store the transition only when BOTH ends are usable. The
            # original tested `tmp_skip` twice, so the skip flag of the new
            # state was never checked.
            if not tmp_skip and not skip:
                self.buffer.add({
                    "state": tmp_state[0],
                    "action": action_one_hot,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if (self.step >= self.steps_before_learn
                    and self.step % self.train_freq == 0 and not self.solved):
                # learn
                self.learn()

            _, self.step = self.session.run(
                [self.inc_global_step, self.global_step])

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        self.solved = bool(total_reward >= self.max_reward)

        if self.step == self.num_steps:
            self.saver.save(self.session,
                            self.summary_dir,
                            global_step=self.step)

        return total_reward, self.step

    def close(self):
        """Close the TensorFlow session."""
        self.session.close()
class BaseDQN(bp.Policy):
    """
    An abstract base class that implements Deep Q-Learning and allows for
    customization - to be extended by other policies that we wrote.

    Subclasses provide the network via ``_build_model`` and may override the
    hyper-parameters set in ``cast_string_args`` via ``_additional_args``.
    """

    def cast_string_args(self, policy_args):
        """Cast string arguments, set default hyper-parameters, return args.

        ``epsilon`` and ``gamma`` are converted to ``float`` when supplied and
        otherwise fall back to ``EPSILON`` / ``GAMMA``.  The remaining
        hyper-parameters are stored as attributes before ``_additional_args``
        runs, so a subclass can override any of them there.

        Args:
            policy_args: dict of policy arguments (modified in place).

        Returns:
            The dict returned by ``_additional_args``.
        """
        policy_args['epsilon'] = float(
            policy_args['epsilon']) if 'epsilon' in policy_args else EPSILON
        policy_args['gamma'] = float(
            policy_args['gamma']) if 'gamma' in policy_args else GAMMA

        self.huber_loss = False
        self.use_softmax_sampling = True
        self.epsilon_decay = 0.90
        self.min_epsilon = MIN_EPSILON
        self.learning_rate = 1e-4
        self.batch_size = 96
        self.state_radius = 5
        self.state_rep = SQUARE
        self.step_forward = True
        self.flatten = FULL
        self.doubleDQN = False
        self.save_model_round = 250
        self.augment_after_normaliztion = False

        policy_args = self._additional_args(policy_args)
        return policy_args

    def _save_model(self):
        """Copy the online model's weights into the target (``old_model``)."""
        self.old_model.set_weights(self.model.get_weights())

    def init_run(self):
        """Build the state processor, networks, replay memory and counters."""
        self.log("Starting init")
        self.r_sum = 0

        if self.state_rep == SQUARE:
            self.state_proc = SquareAroundHeadState(
                radius=self.state_radius,
                step_forward=self.step_forward,
                flatten=self.flatten)
        elif self.state_rep == DIAMOND:
            self.state_proc = DiamondAroundHeadState(
                radius=self.state_radius,
                step_forward=self.step_forward,
                flatten=self.flatten)
        elif self.state_rep == RADAR:
            self.state_proc = RadarState(num_per_type=NUM_PER_TYPE)

        self.input_shape = self.state_proc.get_shape()
        self.model = self._build_model()
        self.model.summary()

        loss = huber_loss if self.huber_loss else 'mse'
        opt = Adam(self.learning_rate)
        self.model.compile(loss=loss, optimizer=opt)

        # The target network starts as an exact copy of the online network.
        self.old_model = keras.models.clone_model(self.model)
        self._save_model()

        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.log("Init finished!")

        self.num_of_samples = 0
        self.sum_of_loss = 0

    def _log_progress(self, round, reward):
        """Accumulate ``reward`` and emit a summary line every 100 rounds.

        Best-effort: any failure is logged and never interrupts training.
        """
        try:
            # Count the reward first so the round on which we report is part
            # of the reported window instead of being silently dropped.
            self.r_sum += reward
            if round % 100 != 0:
                return

            if round > self.game_duration - self.score_scope:
                self.log(
                    "Rewards in last 100 rounds which counts towards the score: {}, eps={:.2f}, "
                    "db_size={}".format(str(self.r_sum), self.epsilon,
                                        len(self.memory)), 'VALUE')
            else:
                # No fit recorded yet (e.g. round 0): report NaN instead of
                # raising ZeroDivisionError.
                if self.num_of_samples:
                    total_loss = self.sum_of_loss / self.num_of_samples
                else:
                    total_loss = float('nan')
                self.num_of_samples = self.sum_of_loss = 0
                self.log(
                    "Rewards in last 100 rounds: {}, eps={:.2f}, db_size={}, loss={:.3f}"
                    .format(str(self.r_sum), self.epsilon, len(self.memory),
                            total_loss), 'VALUE')
            self.r_sum = 0
        except Exception as e:
            self.log("Something Went Wrong...", 'EXCEPTION')
            self.log(e, 'EXCEPTION')

    def learn(self, round, prev_state, prev_action, reward, new_state,
              too_slow):
        """Log progress and perform one Q-learning update from replay memory.

        Args:
            round: index of the current game round.
            prev_state, prev_action, new_state, too_slow: unused here;
                transitions are recorded in ``act``.
            reward: reward of the current round, used for progress logging.
        """
        self._log_progress(round, reward)

        prev, actions, rewards, new = self.memory.sample(self.batch_size)

        # Bootstrap from the target network; with double DQN the online
        # network picks the action and the target network evaluates it.
        next_q = self.old_model.predict(new)
        if self.doubleDQN:
            best_actions = np.argmax(self.model.predict(new), axis=1)
            bootstrap = next_q[np.arange(len(new)), best_actions]
        else:
            bootstrap = np.amax(next_q, axis=1)
        target = rewards + self.gamma * bootstrap

        # Regress only the taken action's Q-value; the other outputs keep
        # their current predictions and so contribute zero error.
        target_f = self.model.predict(prev)
        try:
            target_f[np.arange(len(actions)), actions] = target
            hist = self.model.fit(prev,
                                  target_f,
                                  epochs=1,
                                  verbose=0,
                                  batch_size=len(prev),
                                  shuffle=True)
            self.sum_of_loss += np.sum(hist.history['loss'])
            self.num_of_samples += len(hist.history['loss'])
        except Exception as e:
            # Keep playing on a failed update, but report it through the
            # policy log like every other error in this class.
            self.log(e, 'EXCEPTION')

        if round % self.save_model_round == 0 and round > 0:
            self._save_model()

        if round % 200 == 0 and round > 0 and self.epsilon > 0:
            self.epsilon = max(self.epsilon * self.epsilon_decay,
                               self.min_epsilon)

    def act(self, round, prev_state, prev_action, reward, new_state,
            too_slow):
        """Record the last transition and choose the next action.

        Args:
            round: index of the current game round.
            prev_state: previous raw state, or ``None`` when there is none.
            prev_action: action taken in ``prev_state``.
            reward: reward received for that action.
            new_state: current raw state.
            too_slow: unused here.

        Returns:
            One element of ``bp.Policy.ACTIONS``.
        """
        if round > self.game_duration - self.score_scope:
            # cancel exploration during "money-time"
            self.use_softmax_sampling = False
            self.epsilon = 0

        new_state_repr = self.state_proc.get_state_repr(new_state)

        if prev_state is not None:
            prev_state_repr = self.state_proc.get_state_repr(prev_state)
            action_idx = bp.Policy.ACTIONS.index(prev_action)
            self.memory.record(prev_state_repr, action_idx, reward,
                               new_state_repr)
            if (self.augment_after_normaliztion
                    and prev_state[1][1] == new_state[1][1]):
                self.memory.record(*augment_after_normaliztion(
                    prev_state_repr, prev_state[1][1], action_idx, reward,
                    new_state_repr, new_state[1][1], self.state_radius))

        # epsilon doubles as the softmax temperature; a temperature of zero
        # would divide by zero and yield NaN probabilities, so fall through
        # to the epsilon-greedy branch (greedy when epsilon is 0).
        if self.use_softmax_sampling and self.epsilon > 0:
            q_values = self.model.predict(new_state_repr[np.newaxis])
            return np.random.choice(bp.Policy.ACTIONS,
                                    p=softmax(q_values /
                                              self.epsilon).squeeze())

        # use epsilon-greedy
        if np.random.rand() < self.epsilon:
            return np.random.choice(bp.Policy.ACTIONS)
        prediction = self.model.predict(new_state_repr[np.newaxis])[0]
        return bp.Policy.ACTIONS[np.argmax(prediction)]

    @abstractmethod
    def _build_model(self) -> Model:
        """Return the (uncompiled) Q-network; implemented by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def _additional_args(self, policy_args):
        """Hook for subclasses to adjust hyper-parameters; returns the args."""
        raise NotImplementedError