class NeuralQLearner(object):

    def __init__(self, session,
                       optimizer,
                       q_network,
                       state_dim,
                       num_actions,
                       batch_size=32,
                       init_exp=0.5,        # initial exploration prob
                       final_exp=0.1,       # final exploration prob
                       anneal_steps=10000,  # N steps for annealing exploration
                       replay_buffer_size=10000,
                       store_replay_every=5,  # how frequent to store experience
                       discount_factor=0.9,   # discount future rewards
                       target_update_rate=0.01,
                       reg_param=0.01,      # regularization constants
                       max_gradient=5,      # max gradient norms
                       double_q_learning=False,
                       summary_writer=None,
                       summary_every=100):

        # tensorflow machinery
        self.session = session
        self.optimizer = optimizer
        self.summary_writer = summary_writer

        # model components
        self.q_network = q_network
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # Q learning parameters
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps
        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate
        self.double_q_learning = double_q_learning

        # training parameters
        self.max_gradient = max_gradient
        self.reg_param = reg_param

        # counters
        self.store_replay_every = store_replay_every
        self.store_experience_cnt = 0
        self.train_iteration = 0

        # create and initialize variables
        self.create_variables()
        var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
        self.session.run(tf.initialize_variables(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        if self.summary_writer is not None:
            # graph was not available when journalist was created
            self.summary_writer.add_graph(self.session.graph)
            self.summary_every = summary_every

    def create_variables(self):
        # compute action from a state: a* = argmax_a Q(s_t,a)
        with tf.name_scope("predict_actions"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
            # initialize Q network
            with tf.variable_scope("q_network"):
                self.q_outputs = self.q_network(self.states)
            # predict actions from Q network
            self.action_scores = tf.identity(self.q_outputs, name="action_scores")
            tf.histogram_summary("action_scores", self.action_scores)
            self.predicted_actions = tf.argmax(self.action_scores, dimension=1, name="predicted_actions")

        # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a)
        with tf.name_scope("estimate_future_rewards"):
            self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
            self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")

            if self.double_q_learning:
                # reuse Q network for action selection
                with tf.variable_scope("q_network", reuse=True):
                    self.q_next_outputs = self.q_network(self.next_states)
                self.action_selection = tf.argmax(tf.stop_gradient(self.q_next_outputs), 1, name="action_selection")
                tf.histogram_summary("action_selection", self.action_selection)
                self.action_selection_mask = tf.one_hot(self.action_selection, self.num_actions, 1, 0)
                # use target network for action evaluation
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(self.next_states) * tf.cast(self.action_selection_mask, tf.float32)
                self.action_evaluation = tf.reduce_sum(self.target_outputs, reduction_indices=[1,])
                tf.histogram_summary("action_evaluation", self.action_evaluation)
                self.target_values = self.action_evaluation * self.next_state_mask
            else:
                # initialize target network
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(self.next_states)
                # compute future rewards
                self.next_action_scores = tf.stop_gradient(self.target_outputs)
                self.target_values = tf.reduce_max(self.next_action_scores, reduction_indices=[1,]) * self.next_state_mask
                tf.histogram_summary("next_action_scores", self.next_action_scores)

            self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
            self.future_rewards = self.rewards + self.discount_factor * self.target_values

        # compute loss and gradients
        with tf.name_scope("compute_temporal_differences"):
            # compute temporal difference loss
            self.action_mask = tf.placeholder(tf.float32, (None, self.num_actions), name="action_mask")
            self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, reduction_indices=[1,])
            self.temp_diff = self.masked_action_scores - self.future_rewards
            self.td_loss = tf.reduce_mean(tf.square(self.temp_diff))
            # regularization loss
            q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            self.reg_loss = self.reg_param * tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in q_network_variables])
            # compute total loss and gradients
            self.loss = self.td_loss + self.reg_loss
            gradients = self.optimizer.compute_gradients(self.loss)
            # clip gradients by norm
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)
            # add histograms for gradients.
            for grad, var in gradients:
                tf.histogram_summary(var.name, var)
                if grad is not None:
                    tf.histogram_summary(var.name + '/gradients', grad)
            self.train_op = self.optimizer.apply_gradients(gradients)

        # update target network with Q network
        with tf.name_scope("update_target_network"):
            self.target_network_update = []
            # slowly update target network parameters with Q network parameters
            q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
            for v_source, v_target in zip(q_network_variables, target_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)
            self.target_network_update = tf.group(*self.target_network_update)

        # scalar summaries
        tf.scalar_summary("td_loss", self.td_loss)
        tf.scalar_summary("reg_loss", self.reg_loss)
        tf.scalar_summary("total_loss", self.loss)
        tf.scalar_summary("exploration", self.exploration)

        self.summarize = tf.merge_all_summaries()
        self.no_op = tf.no_op()

    def storeExperience(self, state, action, reward, next_state, done):
        # always store end states
        if self.store_experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.store_experience_cnt += 1

    def eGreedyAction(self, states, explore=True):
        if explore and self.exploration > random.random():
            return random.randint(0, self.num_actions - 1)
        else:
            return self.session.run(self.predicted_actions, {self.states: states})[0]

    def annealExploration(self, stategy='linear'):
        ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

    def updateModel(self):
        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        batch = self.replay_buffer.getBatch(self.batch_size)
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size,))
        action_mask = np.zeros((self.batch_size, self.num_actions))
        next_states = np.zeros((self.batch_size, self.state_dim))
        next_state_mask = np.zeros((self.batch_size,))

        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_mask[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                next_state_mask[k] = 1

        # whether to calculate summaries
        calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None

        # perform one update of training
        cost, _, summary_str = self.session.run([
            self.loss,
            self.train_op,
            self.summarize if calculate_summaries else self.no_op
        ], {
            self.states: states,
            self.next_states: next_states,
            self.next_state_mask: next_state_mask,
            self.action_mask: action_mask,
            self.rewards: rewards
        })

        # update target network using Q-network
        self.session.run(self.target_network_update)

        # emit summaries
        if calculate_summaries:
            self.summary_writer.add_summary(summary_str, self.train_iteration)

        self.annealExploration()
        self.train_iteration += 1
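# Illustrative usage sketch (hypothetical, not taken from the surrounding code): one way to
# drive NeuralQLearner on a Gym-style discrete-action environment. `env`, `state_dim`,
# `num_actions`, and the `q_network` callable are assumed to exist; the module-level
# `tf`/`np` imports used above are reused.
#
#   sess = tf.Session()
#   learner = NeuralQLearner(sess, tf.train.AdamOptimizer(1e-3), q_network,
#                            state_dim, num_actions)
#   state = env.reset()
#   for t in range(10000):
#       action = learner.eGreedyAction(state[np.newaxis, :])
#       next_state, reward, done, _ = env.step(action)
#       learner.storeExperience(state, action, reward, next_state, done)
#       learner.updateModel()                      # also anneals exploration
#       state = env.reset() if done else next_state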
class DeepDeterministicPolicyGradient(object):

    def __init__(self, session,
                       optimizer,
                       actor_network,
                       critic_network,
                       state_dim,
                       action_dim,
                       batch_size=32,
                       replay_buffer_size=1000000,  # size of replay buffer
                       store_replay_every=1,        # how frequent to store experience
                       discount_factor=0.99,        # discount future rewards
                       target_update_rate=0.01,
                       reg_param=0.01,              # regularization constants
                       max_gradient=5,              # max gradient norms
                       noise_sigma=0.20,
                       noise_theta=0.15,
                       summary_writer=None,
                       summary_every=100):

        # tensorflow machinery
        self.session = session
        self.optimizer = optimizer
        self.summary_writer = summary_writer

        # model components
        self.actor_network = actor_network
        self.critic_network = critic_network
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # training parameters
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate
        self.max_gradient = max_gradient
        self.reg_param = reg_param

        # Ornstein-Uhlenbeck noise for exploration
        self.noise_var = tf.Variable(tf.zeros([1, action_dim]))
        noise_random = tf.random_normal([1, action_dim], stddev=noise_sigma)
        self.noise = self.noise_var.assign_sub((noise_theta) * self.noise_var - noise_random)

        # counters
        self.store_replay_every = store_replay_every
        self.store_experience_cnt = 0
        self.train_iteration = 0

        # create and initialize variables
        self.create_variables()
        var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
        self.session.run(tf.initialize_variables(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        if self.summary_writer is not None:
            # graph was not available when journalist was created
            self.summary_writer.add_graph(self.session.graph)
            self.summary_every = summary_every

    def create_variables(self):

        with tf.name_scope("model_inputs"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
            # action input used by critic network
            self.action = tf.placeholder(tf.float32, (None, self.action_dim), name="action")

        # define outputs from the actor and the critic
        with tf.name_scope("predict_actions"):
            # initialize actor-critic network
            with tf.variable_scope("actor_network"):
                self.policy_outputs = self.actor_network(self.states)
            with tf.variable_scope("critic_network"):
                self.value_outputs = self.critic_network(self.states, self.action)
                self.action_gradients = tf.gradients(self.value_outputs, self.action)[0]

            # predict actions from policy network
            self.predicted_actions = tf.identity(self.policy_outputs, name="predicted_actions")
            tf.histogram_summary("predicted_actions", self.predicted_actions)
            tf.histogram_summary("action_scores", self.value_outputs)

        # get variable list
        actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
        critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")

        # estimate rewards using the next state: r + argmax_a Q'(s_{t+1}, u'(a))
        with tf.name_scope("estimate_future_rewards"):
            self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
            self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")
            self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

            # initialize target network
            with tf.variable_scope("target_actor_network"):
                self.target_actor_outputs = self.actor_network(self.next_states)
            with tf.variable_scope("target_critic_network"):
                self.target_critic_outputs = self.critic_network(self.next_states, self.target_actor_outputs)

            # compute future rewards
            self.next_action_scores = tf.stop_gradient(self.target_critic_outputs)[:, 0] * self.next_state_mask
            tf.histogram_summary("next_action_scores", self.next_action_scores)
            self.future_rewards = self.rewards + self.discount_factor * self.next_action_scores

        # compute loss and gradients
        with tf.name_scope("compute_pg_gradients"):
            # compute gradients for critic network
            self.temp_diff = self.value_outputs[:, 0] - self.future_rewards
            self.mean_square_loss = tf.reduce_mean(tf.square(self.temp_diff))
            self.critic_reg_loss = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in critic_network_variables])
            self.critic_loss = self.mean_square_loss + self.reg_param * self.critic_reg_loss
            self.critic_gradients = self.optimizer.compute_gradients(self.critic_loss, critic_network_variables)

            # compute actor gradients (we don't do weight decay for actor network)
            self.q_action_grad = tf.placeholder(tf.float32, (None, self.action_dim), name="q_action_grad")
            actor_policy_gradients = tf.gradients(self.policy_outputs, actor_network_variables, -self.q_action_grad)
            self.actor_gradients = list(zip(actor_policy_gradients, actor_network_variables))

            # collect all gradients
            self.gradients = self.actor_gradients + self.critic_gradients

            # clip gradients
            for i, (grad, var) in enumerate(self.gradients):
                # clip gradients by norm
                if grad is not None:
                    self.gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)

            # summarize gradients
            for grad, var in self.gradients:
                tf.histogram_summary(var.name, var)
                if grad is not None:
                    tf.histogram_summary(var.name + '/gradients', grad)

            # emit summaries
            tf.scalar_summary("critic_loss", self.critic_loss)
            tf.scalar_summary("critic_td_loss", self.mean_square_loss)
            tf.scalar_summary("critic_reg_loss", self.critic_reg_loss)

            # apply gradients to update actor network
            self.train_op = self.optimizer.apply_gradients(self.gradients)

        # update target network with Q network
        with tf.name_scope("update_target_network"):
            self.target_network_update = []

            # slowly update target network parameters with the actor network parameters
            actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
            target_actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_actor_network")
            for v_source, v_target in zip(actor_network_variables, target_actor_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)

            # same for the critic network
            critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")
            target_critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_critic_network")
            for v_source, v_target in zip(critic_network_variables, target_critic_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)

            # group all assignment operations together
            self.target_network_update = tf.group(*self.target_network_update)

        self.summarize = tf.merge_all_summaries()
        self.no_op = tf.no_op()

    def sampleAction(self, states, exploration=True):
        policy_outs, ou_noise = self.session.run([
            self.policy_outputs,
            self.noise
        ], {
            self.states: states
        })
        # add OU noise for exploration
        policy_outs = policy_outs + ou_noise if exploration else policy_outs
        return policy_outs

    def updateModel(self):
        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        batch = self.replay_buffer.getBatch(self.batch_size)
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size,))
        actions = np.zeros((self.batch_size, self.action_dim))
        next_states = np.zeros((self.batch_size, self.state_dim))
        next_state_mask = np.zeros((self.batch_size,))

        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            actions[k] = a
            if not done:
                next_states[k] = s1
                next_state_mask[k] = 1

        # whether to calculate summaries
        calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None

        # compute a = u(s)
        policy_outs = self.session.run(self.policy_outputs, {
            self.states: states
        })

        # compute d_a Q(s,a) where s=s_i, a=u(s)
        action_grads = self.session.run(self.action_gradients, {
            self.states: states,
            self.action: policy_outs
        })

        critic_loss, _, summary_str = self.session.run([
            self.critic_loss,
            self.train_op,
            self.summarize if calculate_summaries else self.no_op
        ], {
            self.states: states,
            self.next_states: next_states,
            self.next_state_mask: next_state_mask,
            self.action: actions,
            self.rewards: rewards,
            self.q_action_grad: action_grads
        })

        # update target network using Q-network
        self.session.run(self.target_network_update)

        # emit summaries
        if calculate_summaries:
            self.summary_writer.add_summary(summary_str, self.train_iteration)

        self.train_iteration += 1

    def storeExperience(self, state, action, reward, next_state, done):
        # always store end states
        if self.store_experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.store_experience_cnt += 1
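# Illustrative usage sketch (hypothetical): the DDPG learner above is driven the same way as
# the Q-learner, except that actions are continuous and sampleAction() already adds the
# in-graph OU noise. `make_actor(states)` and `make_critic(states, action)` are assumed
# user-supplied network builders.
#
#   ddpg = DeepDeterministicPolicyGradient(sess, tf.train.AdamOptimizer(1e-4),
#                                          make_actor, make_critic, state_dim, action_dim)
#   state = env.reset()
#   for t in range(100000):
#       action = ddpg.sampleAction(state[np.newaxis, :], exploration=True)[0]
#       next_state, reward, done, _ = env.step(action)
#       ddpg.storeExperience(state, action, reward, next_state, done)
#       ddpg.updateModel()
#       state = env.reset() if done else next_state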
def run_ddpg(amodel, cmodel, train_indicator=0, seeded=1337, track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=track_name)

    if not train_indicator:
        # Now load the weight
        #logging.info("Now we load the weight")
        print("Now we load the weight")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            #logging.info(" Weight load successfully")
            print("Weight load successfully")
        except:
            #logging.info("Cannot find the weight")
            print("Cannot find the weight")
            exit()

    #logging.info("TORCS Experiment Start.")
    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        if np.mod(i_episode, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                         ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                              ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if ob.lastLapTime > 0:
                    if best_lap < ob.lastLapTime:
                        best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5", overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5", overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
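# Hypothetical sketch of the FunctionOU helper that run_ddpg() instantiates but that is
# defined elsewhere in the repository. A common Ornstein-Uhlenbeck exploration term used with
# DDPG has the form theta*(mu - x) + sigma*N(0, 1); if the real helper follows that
# convention, a minimal stand-in (given an example name here to avoid clashing with the real
# class, and reusing the module-level numpy import) could look like this:
class ExampleFunctionOU(object):
    def function(self, x, mu, theta, sigma):
        # drift back toward mu plus Gaussian diffusion
        return theta * (mu - x) + sigma * np.random.randn(1)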
class DeepQLearner(object):

    def __init__(self,
                 session,
                 optimizer,
                 q_network,
                 state_dim,
                 num_actions,
                 batch_size=32,
                 init_exp=0.5,        # initial exploration prob
                 final_exp=0.1,       # final exploration prob
                 anneal_steps=10000,  # N steps for annealing exploration
                 replay_buffer_size=10000,
                 store_replay_every=5,  # how frequent to store experience
                 discount_factor=0.9,   # discount future rewards
                 target_update_rate=0.01,
                 name="DeepQLearner"):
        """
        Initializes the Deep Q Network.

        Args:
            session: A TensorFlow session.
            optimizer: A TensorFlow optimizer.
            q_network: A TensorFlow network that takes in a state and outputs the
                Q-values over all actions.
            state_dim: Dimension of states.
            num_actions: Number of actions.
            batch_size: Batch size for training with experience replay.
            init_exp: Initial exploration probability for eps-greedy policy.
            final_exp: Final exploration probability for eps-greedy policy.
            anneal_steps: Number of steps to anneal from init_exp to final_exp.
            replay_buffer_size: Size of replay buffer.
            store_replay_every: Frequency with which to store replay.
            discount_factor: For discounting future rewards.
            target_update_rate: For the slow update of the target network.
            name: Used to create a variable scope. Useful for creating multiple
                networks.
        """
        self.session = session
        self.optimizer = optimizer
        self.q_network = q_network  # tensorflow constructor for Q network
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.batch_size = batch_size

        # initialize exploration
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps

        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate

        # Initialize the replay buffer.
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.store_replay_every = store_replay_every
        self.experience_cnt = 0

        self.name = name

        self.train_iteration = 0
        self.constructModel()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def constructModel(self):
        """
        Constructs the model to do Q-learning.
        """
        # ensure that we don't have conflicts when initializing multiple models
        with tf.variable_scope(self.name):

            # this part of the model is for predicting actions using the learned Q_network.
            with tf.name_scope("predict_actions"):
                # input: vectors of states (in a batch)
                self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
                # use new scope to differentiate this q_network from one used for target evaluation
                # note that this will differentiate the weights, for example "learn_q_network/W1"
                with tf.variable_scope("learn_q_network"):
                    # the current q_network that we train
                    self.action_scores = self.q_network(self.states, self.state_dim, self.num_actions)
                self.predicted_actions = tf.argmax(self.action_scores, axis=1, name="predicted_actions")

            # this part of the model is for estimating future rewards, to be used for the Q-learning
            # update for estimating the target Q-value.
            with tf.name_scope("estimate_future_rewards"):
                # input: vectors of next states (in a batch)
                self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
                # input: binary inputs that indicate whether states are unfinished or terminal
                # this is important to compute the target and do the Bellman update correctly, since
                # it tells us whether to include the optimal Q value for the next state or not.
                self.unfinished_states_flags = tf.placeholder(tf.float32, (None,), name="unfinished_states_flags")
                # input: rewards from last state and action
                self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
                # use new scope to differentiate this q_network from one we are training
                # note that this will differentiate the weights, for example "target_q_network/W1"
                with tf.variable_scope("target_q_network"):
                    # the q_network used for evaluation
                    self.eval_q_vals = self.q_network(self.next_states, self.state_dim, self.num_actions)
                # note that this term is only non-zero for a state if it is non-terminal
                # also note the use of stop_gradient to make sure we don't train this q_network
                self.best_future_q_vals = tf.reduce_max(tf.stop_gradient(self.eval_q_vals), axis=1) * self.unfinished_states_flags
                # future rewards given by Bellman equation
                self.future_rewards = self.rewards + self.discount_factor * self.best_future_q_vals

            # this part of the model is for computing the loss and gradients
            with tf.name_scope("loss"):
                # input: one-hot vectors that give the current actions to evaluate the loss for
                self.action_selects = tf.placeholder(tf.float32, (None, self.num_actions), name="action_select")
                # get Q-values for the actions that we took
                self.selected_action_scores = tf.reduce_sum(self.action_scores * self.action_selects, axis=1)
                # temporal difference loss
                self.td_loss = tf.reduce_mean(tf.reduce_sum(tf.square(self.future_rewards - self.selected_action_scores)))
                # cross-entropy loss for adversarial example generation
                self.cross_entropy_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(logits=self.action_scores, labels=self.action_selects))
                # TODO: regularization loss
                # TODO: gradient clipping
                self.train_op = self.optimizer.minimize(self.td_loss)

            # this part of the model is for updating the target Q network
            with tf.name_scope("eval_q_network_update"):
                target_network_update = []
                # slowly update target network parameters with Q network parameters
                # we do this by grabbing all the parameters in both networks and manually defining
                # update operations
                self.q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="learn_q_network")
                self.target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_q_network")
                for v_source, v_target in zip(self.q_network_variables, self.target_network_variables):
                    # this is equivalent to target = (1-alpha) * target + alpha * source
                    update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                    target_network_update.append(update_op)
                # this groups all operations to run together
                # this operation will update all of the target Q network variables
                self.target_network_update = tf.group(*target_network_update)

    def store_experience(self, state, action, reward, next_state, done):
        """
        Adds an experience to the replay buffer.
        """
        if self.experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.experience_cnt += 1

    def greedy_policy(self, states):
        """
        Executes the greedy policy. Useful for executing a learned agent.
        """
        return self.session.run(self.predicted_actions, {self.states: states})[0]

    def e_greedy_policy(self, states):
        """
        Executes the epsilon greedy policy.
        """
        # with probability exploration, choose random action
        if random.random() < self.exploration:
            return random.randint(0, self.num_actions - 1)
        # choose greedy action given by current Q network
        else:
            return self.greedy_policy(states)

    def annealExploration(self):
        """
        Anneals the exploration probability linearly with training iteration.
        """
        ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

    def updateModel(self):
        """
        Update the model by sampling a batch from the replay buffer and performing
        Q-learning updates on the network parameters.
        """
        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        # sample a random batch from the replay buffer
        batch = self.replay_buffer.getBatch(self.batch_size)

        # keep track of these inputs to the Q networks for the batch
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size,))
        action_selects = np.zeros((self.batch_size, self.num_actions))
        next_states = np.zeros((self.batch_size, self.state_dim))
        unfinished_states_flags = np.zeros((self.batch_size,))

        # train on the experiences in this batch
        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_selects[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                unfinished_states_flags[k] = 1

        # perform one update of training
        cost, _ = self.session.run([self.td_loss, self.train_op], {
            self.states: states,
            self.next_states: next_states,
            self.unfinished_states_flags: unfinished_states_flags,
            self.action_selects: action_selects,
            self.rewards: rewards
        })

        # update target network using learned Q-network
        self.session.run(self.target_network_update)

        self.annealExploration()
        self.train_iteration += 1

    # saves the trained model
    def saveModel(self, name):
        self.saver.save(self.session, name)

    def restoreModel(self, name):
        self.saver.restore(self.session, './' + name)

    def reset(self):
        # initialize exploration
        self.exploration = self.init_exp

        # Initialize the replay buffer.
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.experience_cnt = 0

        self.train_iteration = 0
        self.session.run(tf.global_variables_initializer())
def main(config_dict):
    train = config_dict['train']
    network = config_dict['network']
    experiment_name = config_dict['experiment_name']
    EXPERIMENTS_PATH = config_dict['EXPERIMENTS_PATH']

    actor_weights_file = "%s%s/%s_actor.h5" % (EXPERIMENTS_PATH, network, network)
    critic_weights_file = "%s%s/%s_critic.h5" % (EXPERIMENTS_PATH, network, network)
    log_directory = "%s%s/%s/" % (EXPERIMENTS_PATH, network, experiment_name)

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001
    LRA = 0.0001
    LRC = 0.001

    action_dim = 3  # Steering / Acceleration / Brake
    state_dim = 29  # Dimension of sensor inputs

    #np.random.seed(42)
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    exp_logger = TORCS_ExperimentLogger(log_directory, experiment_name)

    #directory = "%s%s/" % (EXPERIMENTS_PATH, experiment)
    #actor_weights_file = "%s%s_%s" % (directory, experiment, "actor.h5")
    #critic_weights_file = "%s%s_%s" % (directory, experiment, "critic.h5")

    # TensorFlow GPU
    config = tf.ConfigProto()
    # Not sure if this is really necessary, since we only have a single GPU
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Weight loading
    if not train:
        try:
            actor.model.load_weights(actor_weights_file)
            critic.model.load_weights(critic_weights_file)
            actor.target_model.load_weights(actor_weights_file)
            critic.target_model.load_weights(critic_weights_file)
            print("Weights loaded successfully")
            time.sleep(2)
        except:
            print("Error in loading weights")
            print('-' * 60)
            traceback.print_exc(file=sys.stdout)
            print('-' * 60)
            assert (False)

    for i in range(episode_count):
        print("Episode: %i; Replay Buffer: %i" % (i, buff.count()))

        if np.mod(i, 3) == 0:
            # Relaunch TORCS every 3 episodes; memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        state_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                             ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.

        # Compute rewards
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE  # exploration factor

            action_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # this call to reshape seems suboptimal
            action_t_raw = actor.model.predict(state_t.reshape(1, state_t.shape[0]))

            noise_t[0][0] = train * max(epsilon, 0) * OU.run(action_t_raw[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * OU.run(action_t_raw[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], -0.1, 1.00, 0.05)
            # stochastic brake
            #if random.random() <= 0.1:
            #    noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], 0.2, 1.00, 0.10)

            # May be able to do this a bit more concisely with NumPy vectorization
            action_t[0][0] = action_t_raw[0][0] + noise_t[0][0]
            action_t[0][1] = action_t_raw[0][1] + noise_t[0][1]
            action_t[0][2] = action_t_raw[0][2] + noise_t[0][2]

            # raw_reward_t is the raw reward computed by the gym_torcs script.
            # We will compute our own reward metric from the ob object
            ob, raw_reward_t, done, info = env.step(action_t[0])

            state_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                                  ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            #reward_t = lng_trans(ob)
            reward_t = raw_reward_t

            buff.add(state_t, action_t[0], reward_t, state_t1, done)  # Add replay buffer

            # Batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            done_indicators = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            # Can't we just use BATCH_SIZE here
            for k in range(len(batch)):
                if done_indicators[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_net()
                critic.train_target_net()

            exp_logger.log(ob, action_t[0], reward_t, loss)

            total_reward += reward_t
            state_t = state_t1

            print("Episode", i, "Step", step, "Action", action_t, "Reward", reward_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train):
                print("Now we save model")
                actor.model.save_weights(actor_weights_file, overwrite=True)
                #with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights(critic_weights_file, overwrite=True)
                #with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
class NeuralAgent():

    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001      # Target Network HyperParameters
        LRA = 0.0001     # Learning rate for Actor
        LRC = 0.001      # Learning rate for Critic
        state_dim = 29   # of sensors input
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[], total_step=[], ave_reward=[],
                         distRaced=[], distFromStart=[], lastLapTime=[], curLapTime=[],
                         lapTimes=[], avelapTime=[], ave_sp=[], max_sp=[], min_sp=[],
                         test_total_reward=[], test_total_step=[], test_ave_reward=[],
                         test_distRaced=[], test_distFromStart=[], test_lastLapTime=[],
                         test_curLapTime=[], test_lapTimes=[], test_avelapTime=[],
                         test_ave_sp=[], test_max_sp=[], test_min_sp=[])

    def rollout(self, env):
        max_steps = 10000
        vision = False

        # zhichen: it is not stable to have two torcs env and UDP connections
        # env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name)

        ob = env.reset(relaunch=True)
        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                         ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.
        sp = []
        lastLapTime = []

        for j_iter in range(max_steps):
            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t = a_t[0]
            # print('test a_t:', a_t)
            a_t[0] = clip(a_t[0], -1, 1)
            a_t[1] = clip(a_t[1], 0, 1)
            a_t[2] = clip(a_t[2], 0, 1)

            ob, r_t, done, info = env.step(a_t)
            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info['lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            if np.mod(j_iter + 1, 20) == 0:
                logging.info('step: ' + str(j_iter + 1))
                print('\n ob: ', ob)

            s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                             ob.rpm, ob.wheelSpinVel / 100.0, ob.track))
            total_reward += r_t

            if done:
                break

        logging.info("Test Episode Reward: " + str(total_reward) +
                     " Episode Length: " + str(j_iter + 1) +
                     " Ave Reward: " + str(total_reward / (j_iter + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) +
                     "\n Last Lap Times: " + str(info['lastLapTime']) +
                     " Cur Lap Times: " + str(info['curLapTime']) +
                     " lastLaptime: " + str(lastLapTime) +
                     "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        #logging.info(" Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) +
        #             " Episode Length: " + str(j_iter+1) + " Distance" + str(ob.distRaced) + " Lap Times: " + str(ob.lastLapTime))
        #env.end()  # This is for shutting down TORCS

        ave_sp = np.mean(sp)
        max_sp = np.max(sp)
        min_sp = np.min(sp)
        return total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp, lastLapTime

    def update_neural(self, controllers, episode_count=200, tree=False, seed=1337):
        OU = FunctionOU()
        vision = False
        GAMMA = 0.99
        EXPLORE = 100000.
        max_steps = 10000
        reward = 0
        done = False
        step = 0
        epsilon = 1

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Experiment Start with Lambda = " + str(self.lambda_mix))

        for i_episode in range(episode_count):
            logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(self.buff.count()))
            if np.mod(i_episode, 3) == 0:
                logging.info('relaunch TORCS')
                # relaunch TORCS every 3 episodes because of the memory leak error
                ob = env.reset(relaunch=True)
            else:
                logging.info('reset TORCS')
                ob = env.reset()

            #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)]
            s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                             ob.rpm, ob.wheelSpinVel / 100.0, ob.track))
            total_reward = 0.

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0), list(ob.track), [0, 0, 0]]
            window_list = [tempObs[:] for _ in range(window)]

            sp = []
            lastLapTime = []

            for j_iter in range(max_steps):
                if tree:
                    tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs]
                    act_tree = controllers.predict([tree_obs])
                    steer_action = clip_to_range(act_tree[0][0], -1, 1)
                    accel_action = clip_to_range(act_tree[0][1], 0, 1)
                    brake_action = clip_to_range(act_tree[0][2], 0, 1)
                else:
                    steer_action = clip_to_range(steer_prog.pid_execute(window_list), -1, 1)
                    accel_action = clip_to_range(accel_prog.pid_execute(window_list), 0, 1)
                    brake_action = clip_to_range(brake_prog.pid_execute(window_list), 0, 1)
                action_prior = [steer_action, accel_action, brake_action]

                tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm],
                           list(ob.wheelSpinVel / 100.0), list(ob.track), action_prior]
                window_list.pop(0)
                window_list.append(tempObs[:])

                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, self.action_dim])
                noise_t = np.zeros([1, self.action_dim])

                a_t_original = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = max(epsilon, 0) * OU.function(a_t_original[0][2], 0, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

                mixed_act = [a_t[0][k_iter] / (1 + self.lambda_mix) +
                             (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter]
                             for k_iter in range(3)]

                ob, r_t, done, info = env.step(mixed_act)
                sp.append(info['speed'])

                if lastLapTime == []:
                    if info['lastLapTime'] > 0:
                        lastLapTime.append(info['lastLapTime'])
                elif info['lastLapTime'] > 0 and lastLapTime[-1] != info['lastLapTime']:
                    lastLapTime.append(info['lastLapTime'])

                s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                                  ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

                self.buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

                # Do the batch update
                batch = self.buff.getBatch(self.batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.zeros((states.shape[0], 1))

                target_q_values = self.critic.target_model.predict(
                    [new_states, self.actor.target_model.predict(new_states)])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                loss += self.critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads)
                self.actor.target_train()
                self.critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Control prior mixing term
                if j_iter > 0 and i_episode > 50:
                    lambda_track = lambda_max * (1 - np.exp(-factor * np.abs(
                        r_t + GAMMA * np.mean(target_q_values[-1] - base_q[-1]))))
                    lambda_track = np.squeeze(lambda_track)
                else:
                    lambda_track = 10.
                lambda_store[j_iter] = lambda_track
                base_q = copy.deepcopy(target_q_values)

                if np.mod(step, 2000) == 0:
                    logging.info("Episode " + str(i_episode) + " Distance " + str(ob.distRaced) +
                                 " Lap Times " + str(ob.lastLapTime))

                step += 1
                if done:
                    break
                #else:
                #    env.end()

            self.lambda_mix = np.mean(lambda_store)

            logging.info('Episode ends! \n' +
                         "Total Steps: " + str(step) + " " + str(i_episode) +
                         "-th Episode Reward: " + str(total_reward) +
                         " Episode Length: " + str(j_iter + 1) +
                         " Ave Reward: " + str(total_reward / (j_iter + 1)) +
                         "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) +
                         "\n Last Lap Times: " + str(info['lastLapTime']) +
                         " Cur Lap Times: " + str(info['curLapTime']) +
                         " lastLaptime: " + str(lastLapTime) +
                         "\n ave sp: " + str(np.mean(sp)) +
                         " max sp: " + str(np.max(sp)))
            #logging.info(" Lambda Mix: " + str(self.lambda_mix))

            self.save['total_reward'].append(total_reward)
            self.save['total_step'].append(j_iter + 1)
            self.save['ave_reward'].append(total_reward / (j_iter + 1))
            self.save['distRaced'].append(info['distRaced'])
            self.save['distFromStart'].append(info['distFromStart'])
            self.save['lastLapTime'].append(info['lastLapTime'])
            self.save['curLapTime'].append(info['curLapTime'])
            self.save['lapTimes'].append(lastLapTime)
            if lastLapTime == []:
                self.save['avelapTime'].append(0)
            else:
                self.save['avelapTime'].append(np.mean(lastLapTime))
            self.save['ave_sp'].append(np.mean(sp))
            self.save['max_sp'].append(np.max(sp))
            self.save['min_sp'].append(np.min(sp))

            # test
            if np.mod(i_episode + 1, 10) == 0:
                logging.info("Start Testing!")
                test_total_reward, test_step, test_info, test_ave_sp, test_max_sp, test_min_sp, test_lastLapTime = self.rollout(env)
                self.save['test_total_reward'].append(test_total_reward)
                self.save['test_total_step'].append(test_step)
                self.save['test_ave_reward'].append(test_total_reward / test_step)
                self.save['test_distRaced'].append(test_info['distRaced'])
                self.save['test_distFromStart'].append(test_info['distFromStart'])
                self.save['test_lastLapTime'].append(test_info['lastLapTime'])
                self.save['test_curLapTime'].append(test_info['curLapTime'])
                self.save['test_lapTimes'].append(test_lastLapTime)
                if test_lastLapTime == []:
                    self.save['test_avelapTime'].append(0)
                else:
                    self.save['test_avelapTime'].append(np.mean(test_lastLapTime))
                self.save['test_ave_sp'].append(test_ave_sp)
                self.save['test_max_sp'].append(test_max_sp)
                self.save['test_min_sp'].append(test_min_sp)

            if np.mod(i_episode + 1, 5) == 0:
                print("Now we save model")
                #os.remove("actormodel.h5")
                self.actor.model.save_weights("actormodel_" + str(seed) + ".h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)
                #os.remove("criticmodel.h5")
                self.critic.model.save_weights("criticmodel_" + str(seed) + ".h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)

                filename = "./model/actormodel_" + str(seed) + '_' + str(i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.actor.model.save_weights(filename, overwrite=True)

                filename = "./model/criticmodel_" + str(seed) + '_' + str(i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.critic.model.save_weights(filename, overwrite=True)

            if np.mod(i_episode + 1, 10) == 0:
                filename = "./Fig/iprl_save_" + str(seed)
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename, 'wb') as f:
                    pickle.dump(self.save, f)

            if i_episode > 1000 and all(np.array(self.save['total_reward'][-20:]) < 20):
                print('model degenerated. Stop at Episode ' + str(i_episode))
                break

        env.end()  # This is for shutting down TORCS
        logging.info("Neural Policy Update Finish.")
        return None

    def collect_data(self, controllers, tree=False):
        vision = False
        max_steps = 10000
        step = 0

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name)
        ob = env.reset(relaunch=True)
        print("S0=", ob)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Collection started with Lambda = " + str(self.lambda_mix))

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                         ob.rpm, ob.wheelSpinVel / 100.0, ob.track))
        total_reward = 0.

        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0), list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        observation_list = []
        actions_list = []
        lastLapTime = []
        sp = []

        for j_iter in range(max_steps):
            if tree:
                tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs]
                act_tree = controllers.predict([tree_obs])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
            else:
                steer_action = clip_to_range(steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(brake_prog.pid_execute(window_list), 0, 1)
            action_prior = [steer_action, accel_action, brake_action]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0), list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            mixed_act = [a_t[0][k_iter] / (1 + self.lambda_mix) +
                         (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter]
                         for k_iter in range(3)]

            if tree:
                newobs = [item for sublist in tempObs[:-1] for item in sublist]
                observation_list.append(newobs[:])
            else:
                observation_list.append(window_list[:])
            actions_list.append(mixed_act[:])

            ob, r_t, done, info = env.step(mixed_act)
            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info['lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                              ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t
            s_t = s_t1
            #if np.mod(step, 2000) == 0:
            #    logging.info(" Distance " + str(ob.distRaced) + " Lap Times " + str(ob.lastLapTime))

            step += 1
            if done:
                break

        logging.info("Data Collection Finished!")
        logging.info('Episode ends! \n' +
                     "Episode Reward: " + str(total_reward) +
                     " Episode Length: " + str(j_iter + 1) +
                     " Ave Reward: " + str(total_reward / (j_iter + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) +
                     "\n Last Lap Times: " + str(info['lastLapTime']) +
                     " Cur Lap Times: " + str(info['curLapTime']) +
                     " lastLaptime: " + str(lastLapTime) +
                     "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        env.end()

        return observation_list, actions_list

    def label_data(self, controllers, observation_list, tree=False):
        if not tree:
            steer_prog, accel_prog, brake_prog = controllers
        actions_list = []
        net_obs_list = []
        logging.info("Data labelling started with Lambda = " + str(self.lambda_mix))

        for window_list in observation_list:
            if tree:
                act_tree = controllers.predict([window_list])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
                net_obs_list.append(window_list)
            else:
                steer_action = clip_to_range(steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(brake_prog.pid_execute(window_list), 0, 1)
                net_obs = [sensor for obs in window_list[-1] for sensor in obs]
                net_obs_list.append(net_obs[:29])
            action_prior = [steer_action, accel_action, brake_action]

            s_t = np.hstack([[net_obs[:29]]])
            a_t = self.actor.model.predict(s_t.reshape(1, 29))
            mixed_act = [a_t[0][k_iter] / (1 + self.lambda_mix) +
                         (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter]
                         for k_iter in range(3)]
            actions_list.append(mixed_act[:])

        return net_obs_list, observation_list, actions_list
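# Hypothetical stand-ins for the clip() and clip_to_range() helpers that NeuralAgent calls but
# that are defined elsewhere in the repository; assuming they are plain saturating clamps,
# minimal versions would look like this:
def clip(v, lo, hi):
    # clamp v into [lo, hi]
    return max(lo, min(v, hi))

def clip_to_range(v, lo, hi):
    # same clamp, under the name used by the PID/tree controller code paths
    return max(lo, min(v, hi))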
class NeuralQLearner(object):

    def __init__(self,
                 session,
                 optimizer,
                 q_network,
                 restore_net_path,
                 state_dim,
                 num_actions,
                 batch_size,
                 init_exp,            # initial exploration prob
                 final_exp,           # final exploration prob
                 anneal_steps,        # N steps for annealing exploration
                 replay_buffer_size,
                 store_replay_every,  # how frequent to store experience
                 discount_factor,     # discount future rewards
                 target_update_rate,
                 reg_param,           # regularization constants
                 max_gradient,        # max gradient norms
                 double_q_learning,
                 summary_writer,
                 summary_every):

        # tensorflow machinery
        self.session = session
        self.optimizer = optimizer
        self.summary_writer = summary_writer

        # model components
        self.q_network = q_network
        self.restore_net_path = restore_net_path
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # Q learning parameters
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps
        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate
        self.double_q_learning = double_q_learning

        # training parameters
        self.max_gradient = max_gradient
        self.reg_param = reg_param

        # counters
        self.store_replay_every = store_replay_every
        self.store_experience_cnt = 0
        self.train_iteration = 0

        # create and initialize variables
        self.create_variables()

        if self.restore_net_path is not None:
            saver = tf.train.Saver()
            saver.restore(self.session, self.restore_net_path)
        else:
            var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
            self.session.run(tf.initialize_variables(var_lists))
            #var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            #self.session.run(tf.variables_initializer(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        self.summary_every = summary_every
        if self.summary_writer is not None:
            # graph was not available when journalist was created
            self.summary_writer.add_graph(self.session.graph)
            self.summary_every = summary_every

    def create_variables(self):
        # compute action from a state: a* = argmax_a Q(s_t,a)
        with tf.name_scope("predict_actions"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
            # initialize Q network
            with tf.variable_scope("q_network"):
                self.q_outputs = self.q_network(self.states)
            # predict actions from Q network
            self.action_scores = tf.identity(self.q_outputs, name="action_scores")
            tf.summary.histogram("action_scores", self.action_scores)
            self.predicted_actions = tf.argmax(self.action_scores, dimension=1, name="predicted_actions")

        # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a)
        with tf.name_scope("estimate_future_rewards"):
            self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
            self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")

            if self.double_q_learning:
                # reuse Q network for action selection
                with tf.variable_scope("q_network", reuse=True):
                    self.q_next_outputs = self.q_network(self.next_states)
                self.action_selection = tf.argmax(tf.stop_gradient(self.q_next_outputs), 1, name="action_selection")
                tf.summary.histogram("action_selection", self.action_selection)
                self.action_selection_mask = tf.one_hot(self.action_selection, self.num_actions, 1, 0)
                # use target network for action evaluation
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(self.next_states) * tf.cast(self.action_selection_mask, tf.float32)
                self.action_evaluation = tf.reduce_sum(self.target_outputs, axis=[1,])
                tf.summary.histogram("action_evaluation", self.action_evaluation)
                self.target_values = self.action_evaluation * self.next_state_mask
            else:
                # initialize target network
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(self.next_states)
                # compute future rewards
                self.next_action_scores = tf.stop_gradient(self.target_outputs)
                #self.target_values = tf.reduce_max(self.next_action_scores, axis=[1,]) * self.next_state_mask
                self.target_values = tf.reduce_max(self.next_action_scores, reduction_indices=[1,]) * self.next_state_mask
                tf.summary.histogram("next_action_scores", self.next_action_scores)

            self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
            self.future_rewards = self.rewards + self.discount_factor * self.target_values

        # compute loss and gradients
        with tf.name_scope("compute_temporal_differences"):
            # compute temporal difference loss
            self.action_mask = tf.placeholder(tf.float32, (None, self.num_actions), name="action_mask")
            #self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, axis=[1,])
            self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, reduction_indices=[1,])
            self.temp_diff = self.masked_action_scores - self.future_rewards
            self.norm_diff = tf.square(tf.sigmoid(self.masked_action_scores / 100.0) - tf.sigmoid(self.future_rewards / 100.0))
            #self.norm_diff = tf.nn.sigmoid(tf.square(self.temp_diff)/40000.0)
            self.td_loss = tf.reduce_mean(self.norm_diff) * 20000.0
            # regularization loss
            q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            self.reg_loss = self.reg_param * tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in q_network_variables])
            # compute total loss and gradients
            self.loss = self.td_loss + self.reg_loss
            gradients = self.optimizer.compute_gradients(self.loss)
            # clip gradients by norm
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)
            # add histograms for gradients.
            for grad, var in gradients:
                tf.summary.histogram(var.name, var)
                if grad is not None:
                    tf.summary.histogram(var.name + '/gradients', grad)
            self.train_op = self.optimizer.apply_gradients(gradients)

        # update target network with Q network
        with tf.name_scope("update_target_network"):
            self.target_network_update = []
            # slowly update target network parameters with Q network parameters
            q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
            for v_source, v_target in zip(q_network_variables, target_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)
            self.target_network_update = tf.group(*self.target_network_update)

        # scalar summaries
        tf.summary.scalar("td_loss", self.td_loss)
        #tf.summary.scalar("reg_loss", self.reg_loss)
        tf.summary.scalar("total_loss", self.loss)
        tf.summary.scalar("exploration", self.exploration)

        self.summarize = tf.summary.merge_all()
        self.no_op = tf.no_op()

    def storeExperience(self, state, action, reward, next_state, done):
        # always store end states
        if self.store_experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.store_experience_cnt += 1

    def eGreedyAction(self, states, explore=True):
        if explore and self.exploration > random.random():
            return random.randint(0, self.num_actions - 1)
        else:
            return self.session.run(self.predicted_actions, {self.states: states})[0]

    def annealExploration(self, stategy='linear'):
        ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

    def updateModel(self, episode=-1):
        # not enough experiences yet
        print("compare ", self.replay_buffer.count(), self.batch_size)
        if self.replay_buffer.count() < self.batch_size:
            return

        batch = self.replay_buffer.getBatch(self.batch_size)
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size,))
        action_mask = np.zeros((self.batch_size, self.num_actions))
        next_states = np.zeros((self.batch_size, self.state_dim))
        next_state_mask = np.zeros((self.batch_size,))

        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_mask[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                next_state_mask[k] = 1

        # whether to calculate summaries
        calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None

        # perform one update of training
        #direct_r, nxt_r, label_r, now_net_r, diff, norm_diff, cost, td_cost, reg_cost, _, summary_str = self.session.run([
        cost, td_cost, reg_cost, _, summary_str = self.session.run([
            #self.rewards,
            #self.target_values * self.discount_factor,
            #self.future_rewards,
            #self.masked_action_scores,
            #self.temp_diff,
            #self.norm_diff,
            self.loss,
            self.td_loss,
            self.reg_loss,
            self.train_op,
            self.summarize if calculate_summaries else self.no_op
        ], {
            self.states: states,
            self.next_states: next_states,
            self.next_state_mask: next_state_mask,
            self.action_mask: action_mask,
            self.rewards: rewards
        })

        '''
        rewards_out = open(rewards_out_path, 'a+')
        if self.train_iteration % 100 == 0:
            for i in range(len(direct_r)):
                print("episode: ", episode, "iter: ", self.train_iteration,
                      "mini batch --- ", i,
                      "direct_r ", direct_r[i],
                      "nxt_r: ", nxt_r[i],
                      "label_r: ", label_r[i],
                      "now_net_r: ", now_net_r[i],
                      "tmpdiff: ", diff[i],
                      "norm_diff", norm_diff[i],
                      #"loss", cost[i],
                      #"state: ", states[i],
                      file=rewards_out)
        sys.stdout.flush()
        rewards_out.close()
        '''
        #if self.train_iteration % 500:
        #    print('0000 : ', diff, file=logf)
        #    print('llll : ', norm_diff, file=logf)

        loss_out = open(loss_out_path, "a+")
        print("episode: ", episode, "iter: ", self.train_iteration,
              "hjk loss is ----- ", cost,
              "hjk td_loss is ----- ", td_cost,
              "hjk reg_loss is ----- ", reg_cost,
              file=loss_out)
        sys.stdout.flush()
        loss_out.close()

        # update target network using Q-network
        self.session.run(self.target_network_update)

        '''
        # emit summaries
        if calculate_summaries:
            self.summary_writer.add_summary(summary_str, self.train_iteration)
        '''

        self.annealExploration()
        self.train_iteration += 1

        del batch, states, rewards, action_mask, next_states, next_state_mask
        #del direct_r, nxt_r, label_r, now_net_r, diff, norm_diff
        gc.collect()
        #objgraph.show_most_common_types(limit=50)

    def save_net(self, path):
        saver = tf.train.Saver()
        save_path = saver.save(self.session, path)
        print("Save to path: " + save_path)