for i_episode in range(MAX_EPISODE):
    observation = env.reset()
    t = 0
    track_reward = []
    while True:
        if RENDER:
            env.render()
        action = actor.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        if done:
            reward = -20  # penalize ending the episode
        track_reward.append(reward)

        td_error = critic.learn(observation, reward, observation_)  # TD error from the critic
        actor.learn(observation, action, td_error)                  # used as the actor's advantage

        observation = observation_
        t += 1

        if done or t > MAX_EP_STEP:
            ep_rs_sum = sum(track_reward)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                # exponential moving average of the episode return
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_THRESHOLD:
                RENDER = True
            print("Episode: %d | Reward: %d" % (i_episode, running_reward))
            break
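# --- Illustrative sketch, not the original implementation ---
# The loop above assumes an Actor with choose_action(s) and learn(s, a, td_error).
# Below is a minimal TF 1.x sketch of such a softmax policy, which scales the
# log-probability of the taken action by the critic's TD error; the layer size,
# hidden activation, and default learning rate are assumptions.
import numpy as np
import tensorflow as tf

class Actor:
    def __init__(self, sess, State_dim, Action_dim, lr=0.001):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, State_dim], name='state')
        self.a = tf.placeholder(tf.int32, None, name='action')
        self.td_error = tf.placeholder(tf.float32, None, name='td_error')

        hidden = tf.layers.dense(self.s, 20, activation=tf.nn.relu)
        self.acts_prob = tf.layers.dense(hidden, Action_dim, activation=tf.nn.softmax)

        # Policy-gradient objective: advantage-weighted log-probability of the action taken
        log_prob = tf.log(self.acts_prob[0, self.a])
        self.loss = -tf.reduce_mean(log_prob * self.td_error)
        self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def choose_action(self, s):
        probs = self.sess.run(self.acts_prob, {self.s: s[np.newaxis, :]})
        return np.random.choice(range(probs.shape[1]), p=probs.ravel())

    def learn(self, s, a, td_error):
        self.sess.run(self.train_op,
                      {self.s: s[np.newaxis, :], self.a: a, self.td_error: td_error})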
Action_dim = 4
sess = tf.Session()
actor = Actor(sess, State_dim=State_dim, Action_dim=Action_dim, lr=Actor_lr)
critic = Critic(sess, State_dim=State_dim, lr=Critic_lr)
sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = 0
    total_action = []
    done = False
    while (not done and t < 200):
        a = actor.choose_action(s)
        s_, r, done = env.step(env.t_action[a])
        total_action.append(env.t_action[a])
        if done:
            r = -200
        td_error = critic.learn(s, -r, s_)
        actor.learn(s, a, td_error)
        s = s_
        track_r += r
        t += 1
    print("episode:", i_episode, " tracked actions to attempt goal:", total_action)
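# --- Illustrative sketch, not the original implementation ---
# Both loops above call critic.learn(s, r, s_) and feed the returned TD error to
# the actor as its advantage. A minimal TF 1.x state-value critic consistent with
# that interface might look like the following; GAMMA, the layer size, and the
# default learning rate are assumptions.
import numpy as np
import tensorflow as tf

GAMMA = 0.9  # assumed discount factor

class Critic:
    def __init__(self, sess, State_dim, lr=0.01):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, State_dim], name='state')
        self.v_ = tf.placeholder(tf.float32, [1, 1], name='v_next')
        self.r = tf.placeholder(tf.float32, None, name='reward')

        hidden = tf.layers.dense(self.s, 20, activation=tf.nn.relu)
        self.v = tf.layers.dense(hidden, 1)  # state-value estimate V(s)

        # TD error: r + gamma * V(s') - V(s); its square is the critic loss
        self.td_error = self.r + GAMMA * self.v_ - self.v
        self.loss = tf.square(self.td_error)
        self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_next = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_next, self.r: r})
        return td_error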
import os
import numpy as np
from keras.utils import to_categorical  # assumes standalone Keras; with tf.keras use tensorflow.keras.utils


class Agent:
    def __init__(self, input_dim, output_dim, lr, gamma, tau, clipnorm, verbose):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.tau = tau

        # Buffer for experience replay
        self.S = []
        self.A = []
        self.R = []
        self.S1 = []
        self.D = []
        self.memory_size = 10**3

        # Make actor and critic
        self.actor = Actor(input_dim, output_dim, lr, gamma, tau, clipnorm, verbose)
        self.critic = Critic(input_dim, output_dim, lr, gamma, tau, clipnorm, verbose)

    def learn(self):
        # Get batch
        S, A, R, S1, D = self.get_batch()

        # Find advantage
        # G = self.find_discounted_return(R)
        # V = self.critic.model.predict(S)
        # V.resize(len(V))
        # adv = G - V

        # Train critic
        D0, R0 = np.array([[x] for x in D]), np.array([[x] for x in R])  # hack to get working for now
        V1 = self.critic.model.predict(S1)
        self.critic.learn(S, R0, D0, V1)
        self.soft_update_target_network(self.critic)

        # Find advantage
        V, V1 = self.critic.model.predict(S), self.critic.model.predict(S1)
        adv = R0 + self.gamma * V1 - V
        adv = adv.flatten()

        # Train actor
        self.actor.learn(S, A, adv)
        self.soft_update_target_network(self.actor)

        # Clear memory
        self.S, self.A, self.R, self.S1, self.D = [], [], [], [], []

    def find_discounted_return(self, rewards):
        R = np.zeros_like(rewards)
        rolling_sum = 0
        for t in reversed(range(len(R))):
            rolling_sum = rolling_sum * self.gamma + rewards[t]
            R[t] = rolling_sum

        # Normalize rewards
        R -= np.mean(R)
        R /= np.std(R)
        return np.array(R)

    def remember(self, state, action, reward, next_state, done):
        """ Add experience to buffer """
        self.S.append(state)
        action_onehot = to_categorical(action, self.output_dim)  # optimizers use one-hot actions
        self.A.append(action_onehot)
        self.R.append(reward)
        self.S1.append(next_state)
        self.D.append(done * 1.0)

    def get_batch(self):
        # indices = np.random.choice(range(len(self.S)), self.batchsize)
        indices = range(len(self.S))
        S = np.array(self.S)[indices]
        A = np.array(self.A)[indices]
        R = np.array(self.R)[indices]
        S1 = np.array(self.S1)[indices]
        D = np.array(self.D)[indices]
        return S, A, R, S1, D

    def act(self, state):
        """ Choose action according to softmax """
        probs = self.actor.model.predict(state)[0]
        action = np.random.choice(self.actions, p=probs)
        return action

    def make_tensor(self, vec):
        """
        Turns a 1D array, x, into a 2D array / tensor [x]
        so that Keras can read it.
        """
        vec = np.reshape(vec, (1, len(vec)))
        return vec

    def save_target_weights(self):
        """
        Saves the weights of the target networks (only the target networks are
        used during testing, so there is no need to save the behavior networks).
        """
        # Create directory if it doesn't exist
        dir_name = 'network_weights/'
        if not os.path.exists(os.path.dirname(dir_name)):
            os.makedirs(os.path.dirname(dir_name))

        # Now save the weights. I'm choosing the file ID by gamma, lr, tau.
        pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                   + '_tau_' + str(self.tau)  # np.save attaches the .npy extension

        # Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = self.actor.target_model.get_weights()
        np.save(filename + pars_tag, actor_pars)

        # Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = self.critic.target_model.get_weights()
        np.save(filename + pars_tag, critic_pars)

    def load_target_weights(self, gamma, lr, tau):
        """
        Loads the weights of the target networks, previously created with
        save_target_weights().
        """
        # The file ID is built from gamma, lr, tau
        pars_tag = '_gamma_' + str(gamma) + '_lr_' + str(lr) + '_tau_' + str(tau) + '.npy'

        # Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = np.load(filename + pars_tag)
        self.actor.target_model.set_weights(actor_pars)

        # Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = np.load(filename + pars_tag)
        self.critic.target_model.set_weights(critic_pars)

    def soft_update_target_network(self, net):
        """
        Updates the parameters of the target network according to

            theta_target = (1 - tau) * theta_target + tau * theta_behavior

        where tau is a hyperparameter.

        Input: net = Actor or Critic class
        """
        pars_behavior = net.model.get_weights()       # these have form [W1, b1, W2, b2, ...],
        pars_target = net.target_model.get_weights()  # Wi = weights, bi = biases in layer i
        ctr = 0
        for par_behavior, par_target in zip(pars_behavior, pars_target):
            par_target = par_target * (1 - self.tau) + par_behavior * self.tau
            pars_target[ctr] = par_target
            ctr += 1
        net.target_model.set_weights(pars_target)
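# --- Illustrative usage, not from the source ---
# The Agent above accumulates a whole episode via remember() and then calls
# learn(), which updates both networks and clears the buffer. A minimal driver
# might look like this; CartPole-v0 and all hyperparameter values are assumed.
import gym

env = gym.make('CartPole-v0')
agent = Agent(input_dim=4, output_dim=2, lr=0.001, gamma=0.99,
              tau=0.1, clipnorm=1.0, verbose=0)

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(agent.make_tensor(state))
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
    agent.learn()  # one on-policy update per episode, then the buffer is emptied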
import os
import random
import numpy as np
from numpy.random import seed
from tensorflow import set_random_seed  # TF 1.x; in TF 2.x use tf.random.set_seed


class Agent:
    def __init__(self, input_dim, output_dim, lr, gamma, seed_num=False):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.tau = 0.1
        self.seed_num = seed_num

        # For experience replay
        self.memory = []
        self.memory_size = 10000
        self.batchsize = 32

        # Actor & critic
        self.actor = Actor(input_dim, output_dim, self.lr)
        self.critic = Critic(input_dim, output_dim, self.lr, self.gamma)

        if seed_num != False:
            set_random_seed(seed_num)  # seed tensorflow
            seed(seed_num)             # seed numpy

    def remember(self, state, action, reward, next_state, done):
        event = (state, action, reward, next_state, done)
        if len(self.memory) <= self.memory_size:
            self.memory.append(event)
        else:
            self.memory[0] = event

    def act(self, state):
        # The softmax-Gumbel trick outputs an almost one-hot vector (i.e. the
        # elements sum to one, with one element much bigger than the others).
        # I need to turn this into a 'hard' one-hot vector.
        action_soft_onehot = self.actor.model.predict(state)[0]
        action_index = np.argmax(action_soft_onehot)
        action_hard_onehot = np.array([
            1 if i == action_index else 0
            for i in range(len(action_soft_onehot))
        ])
        return action_hard_onehot

    def extract_from_batch(self, batch):
        states, actions = [], []
        for event in batch:
            state, action, reward, next_state, done = event
            states.append(state)
            actions.append(action)
        return np.array(states), np.array(actions)

    def train_models(self):
        # Do experience replay
        if len(self.memory) < self.batchsize:
            minibatch = self.memory
        else:
            minibatch = random.sample(self.memory, self.batchsize)

        # Actor update
        states, actions = self.extract_from_batch(minibatch)
        grad_actions = self.critic.find_action_grads([states, actions])[0]
        self.actor.learn(states, grad_actions)
        self.soft_update_target_network(self.actor)

        # Critic update
        self.critic.learn(minibatch)
        self.soft_update_target_network(self.critic)

    def soft_update_target_network(self, net):
        """
        Updates the parameters of the target network according to

            theta_target = (1 - tau) * theta_target + tau * theta_behavior

        where tau is a hyperparameter.

        Input: net = Actor or Critic class
        """
        pars_behavior = net.model.get_weights()       # these have form [W1, b1, W2, b2, ...],
        pars_target = net.target_model.get_weights()  # Wi = weights, bi = biases in layer i
        ctr = 0
        for par_behavior, par_target in zip(pars_behavior, pars_target):
            par_target = par_target * (1 - self.tau) + par_behavior * self.tau
            pars_target[ctr] = par_target
            ctr += 1
        net.target_model.set_weights(pars_target)

    def save_target_weights(self):
        """
        Saves the weights of the target networks (only the target networks are
        used during testing, so there is no need to save the behavior networks).
        """
        # Create directory if it doesn't exist
        dir_name = 'network_weights/'
        if not os.path.exists(os.path.dirname(dir_name)):
            os.makedirs(os.path.dirname(dir_name))

        # Now save the weights. I'm choosing the file ID by gamma, lr, tau (and seed, if set).
        if self.seed_num == False:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                       + '_tau_' + str(self.tau)
        else:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                       + '_tau_' + str(self.tau) + '_seed_' + str(self.seed_num)

        # Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = self.actor.target_model.get_weights()
        np.save(filename + pars_tag, actor_pars)

        # Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = self.critic.target_model.get_weights()
        np.save(filename + pars_tag, critic_pars)

    def load_target_weights(self, gamma, lr, tau):
        """
        Loads the weights of the target networks, previously created with
        save_target_weights().
        """
        # The file ID is built from gamma, lr, tau (and seed, if set)
        if self.seed_num == False:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                       + '_tau_' + str(self.tau) + '.npy'
        else:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                       + '_tau_' + str(self.tau) + '_seed_' + str(self.seed_num) + '.npy'

        # Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = np.load(filename + pars_tag)
        self.actor.target_model.set_weights(actor_pars)

        # Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = np.load(filename + pars_tag)
        self.critic.target_model.set_weights(critic_pars)
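# --- Illustrative usage, not from the source ---
# The replay-based Agent above is updated once per environment step via
# train_models(). Its act() returns a (near) one-hot action vector from the
# Gumbel-softmax actor, so it is converted to an index before env.step().
# CartPole-v0, the hyperparameters, and storing the one-hot action in memory
# are assumptions, not taken from the source.
import numpy as np
import gym

env = gym.make('CartPole-v0')
agent = Agent(input_dim=4, output_dim=2, lr=0.001, gamma=0.99, seed_num=42)

for episode in range(300):
    state = env.reset()
    done = False
    while not done:
        state_tensor = np.reshape(state, (1, agent.input_dim))
        action_onehot = agent.act(state_tensor)
        next_state, reward, done, _ = env.step(int(np.argmax(action_onehot)))
        agent.remember(state, action_onehot, reward, next_state, done)
        agent.train_models()  # one replay update per environment step
        state = next_state

agent.save_target_weights()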