class NeuralQLearner(object): def __init__( self, session, optimizer, q_network, restore_net_path, state_dim, num_actions, batch_size, init_exp, # initial exploration prob final_exp, # final exploration prob anneal_steps, # N steps for annealing exploration replay_buffer_size, store_replay_every, # how frequent to store experience discount_factor, # discount future rewards target_update_rate, reg_param, # regularization constants max_gradient, # max gradient norms double_q_learning, summary_writer, summary_every): # tensorflow machinery self.session = session self.optimizer = optimizer self.summary_writer = summary_writer # model components self.q_network = q_network self.restore_net_path = restore_net_path self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size) # Q learning parameters self.batch_size = batch_size self.state_dim = state_dim self.num_actions = num_actions self.exploration = init_exp self.init_exp = init_exp self.final_exp = final_exp self.anneal_steps = anneal_steps self.discount_factor = discount_factor self.target_update_rate = target_update_rate self.double_q_learning = double_q_learning # training parameters self.max_gradient = max_gradient self.reg_param = reg_param # counters self.store_replay_every = store_replay_every self.store_experience_cnt = 0 self.train_iteration = 0 # create and initialize variables self.create_variables() if self.restore_net_path is not None: saver = tf.train.Saver() saver.restore(self.session, self.restore_net_path) else: var_lists = tf.get_collection(tf.GraphKeys.VARIABLES) self.session.run(tf.initialize_variables(var_lists)) #var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) #self.session.run(tf.variables_initializer(var_lists)) # make sure all variables are initialized self.session.run(tf.assert_variables_initialized()) self.summary_every = summary_every if self.summary_writer is not None: # graph was not available when journalist was created self.summary_writer.add_graph(self.session.graph) self.summary_every = summary_every def create_variables(self): # compute action from a state: a* = argmax_a Q(s_t,a) with tf.name_scope("predict_actions"): # raw state representation self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states") # initialize Q network with tf.variable_scope("q_network"): self.q_outputs = self.q_network(self.states) # predict actions from Q network self.action_scores = tf.identity(self.q_outputs, name="action_scores") tf.summary.histogram("action_scores", self.action_scores) self.predicted_actions = tf.argmax(self.action_scores, dimension=1, name="predicted_actions") # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a) with tf.name_scope("estimate_future_rewards"): self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states") self.next_state_mask = tf.placeholder(tf.float32, (None, ), name="next_state_masks") if self.double_q_learning: # reuse Q network for action selection with tf.variable_scope("q_network", reuse=True): self.q_next_outputs = self.q_network(self.next_states) self.action_selection = tf.argmax(tf.stop_gradient( self.q_next_outputs), 1, name="action_selection") tf.summary.histogram("action_selection", self.action_selection) self.action_selection_mask = tf.one_hot( self.action_selection, self.num_actions, 1, 0) # use target network for action evaluation with tf.variable_scope("target_network"): self.target_outputs = self.q_network( self.next_states) * tf.cast(self.action_selection_mask, tf.float32) 
self.action_evaluation = tf.reduce_sum(self.target_outputs, axis=[ 1, ]) tf.summary.histogram("action_evaluation", self.action_evaluation) self.target_values = self.action_evaluation * self.next_state_mask else: # initialize target network with tf.variable_scope("target_network"): self.target_outputs = self.q_network(self.next_states) # compute future rewards self.next_action_scores = tf.stop_gradient(self.target_outputs) #self.target_values = tf.reduce_max(self.next_action_scores, axis=[1, ]) * self.next_state_mask self.target_values = tf.reduce_max(self.next_action_scores, reduction_indices=[ 1, ]) * self.next_state_mask tf.summary.histogram("next_action_scores", self.next_action_scores) self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards") self.future_rewards = self.rewards + self.discount_factor * self.target_values # compute loss and gradients with tf.name_scope("compute_temporal_differences"): # compute temporal difference loss self.action_mask = tf.placeholder(tf.float32, (None, self.num_actions), name="action_mask") #self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, axis=[1, ]) self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, reduction_indices=[ 1, ]) self.temp_diff = self.masked_action_scores - self.future_rewards self.norm_diff = tf.square( tf.sigmoid(self.masked_action_scores / 100.0) - tf.sigmoid(self.future_rewards / 100.0)) #self.norm_diff = tf.nn.sigmoid(tf.square(self.temp_diff)/40000.0) self.td_loss = tf.reduce_mean(self.norm_diff) * 20000.0 # regularization loss q_network_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network") self.reg_loss = self.reg_param * tf.reduce_sum( [tf.reduce_sum(tf.square(x)) for x in q_network_variables]) # compute total loss and gradients self.loss = self.td_loss + self.reg_loss gradients = self.optimizer.compute_gradients(self.loss) # clip gradients by norm for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var) # add histograms for gradients. 
for grad, var in gradients: tf.summary.histogram(var.name, var) if grad is not None: tf.summary.histogram(var.name + '/gradients', grad) self.train_op = self.optimizer.apply_gradients(gradients) # update target network with Q network with tf.name_scope("update_target_network"): self.target_network_update = [] # slowly update target network parameters with Q network parameters q_network_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network") target_network_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network") for v_source, v_target in zip(q_network_variables, target_network_variables): # this is equivalent to target = (1-alpha) * target + alpha * source update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source)) self.target_network_update.append(update_op) self.target_network_update = tf.group(*self.target_network_update) # scalar summaries tf.summary.scalar("td_loss", self.td_loss) #tf.summary.scalar("reg_loss", self.reg_loss) tf.summary.scalar("total_loss", self.loss) tf.summary.scalar("exploration", self.exploration) self.summarize = tf.summary.merge_all() self.no_op = tf.no_op() def storeExperience(self, state, action, reward, next_state, done): # always store end states if self.store_experience_cnt % self.store_replay_every == 0 or done: self.replay_buffer.add(state, action, reward, next_state, done) self.store_experience_cnt += 1 def eGreedyAction(self, states, explore=True): if explore and self.exploration > random.random(): return random.randint(0, self.num_actions - 1) else: return self.session.run(self.predicted_actions, {self.states: states})[0] def annealExploration(self, stategy='linear'): ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0) self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp def updateModel(self, episode=-1): # not enough experiences yet print("compare ", self.replay_buffer.count(), self.batch_size) if self.replay_buffer.count() < self.batch_size: return batch = self.replay_buffer.getBatch(self.batch_size) states = np.zeros((self.batch_size, self.state_dim)) rewards = np.zeros((self.batch_size, )) action_mask = np.zeros((self.batch_size, self.num_actions)) next_states = np.zeros((self.batch_size, self.state_dim)) next_state_mask = np.zeros((self.batch_size, )) for k, (s0, a, r, s1, done) in enumerate(batch): states[k] = s0 rewards[k] = r action_mask[k][a] = 1 # check terminal state if not done: next_states[k] = s1 next_state_mask[k] = 1 # whether to calculate summaries calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None # perform one update of training #direct_r, nxt_r, label_r, now_net_r, diff, norm_diff, cost, td_cost, reg_cost, _, summary_str = self.session.run([ cost, td_cost, reg_cost, _, summary_str = self.session.run( [ #self.rewards, #self.target_values * self.discount_factor, #self.future_rewards, #self.masked_action_scores, #self.temp_diff, #self.norm_diff, self.loss, self.td_loss, self.reg_loss, self.train_op, self.summarize if calculate_summaries else self.no_op ], { self.states: states, self.next_states: next_states, self.next_state_mask: next_state_mask, self.action_mask: action_mask, self.rewards: rewards }) ''' rewards_out = open(rewards_out_path, 'a+') if self.train_iteration % 100 == 0: for i in range(len(direct_r)): print("episode: ", episode, "iter: ", self.train_iteration, "mini batch --- ", i, "direct_r ", direct_r[i], "nxt_r: ", nxt_r[i], 
"label_r: ", label_r[i], "now_net_r: ", now_net_r[i], "tmpdiff: ", diff[i], "norm_diff", norm_diff[i], #"loss", cost[i], #"state: ", states[i], file=rewards_out) sys.stdout.flush() rewards_out.close() ''' #if self.train_iteration % 500: # print('0000 : ', diff, file=logf) # print('llll : ', norm_diff, file=logf) loss_out = open(loss_out_path, "a+") print("episode: ", episode, "iter: ", self.train_iteration, "hjk loss is ----- ", cost, "hjk td_loss is ----- ", td_cost, "hjk reg_loss is ----- ", reg_cost, file=loss_out) sys.stdout.flush() loss_out.close() # update target network using Q-network self.session.run(self.target_network_update) ''' # emit summaries if calculate_summaries: self.summary_writer.add_summary(summary_str, self.train_iteration) ''' self.annealExploration() self.train_iteration += 1 del batch, states, rewards, action_mask, next_states, next_state_mask #del direct_r, nxt_r, label_r, now_net_r, diff, norm_diff gc.collect() #objgraph.show_most_common_types(limit=50) def save_net(self, path): saver = tf.train.Saver() save_path = saver.save(self.session, path) print("Save to path: " + save_path)
class DDPG(nn.Module):

    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate_a=1e-3,
                 learning_rate_c=1e-3,
                 gamma=0.99,
                 update_tau=1e-3,
                 batch_size=100,
                 buffer_size=10000,
                 training_start=1000):
        super(DDPG, self).__init__()
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr_a = learning_rate_a
        self.lr_c = learning_rate_c
        self.gamma = gamma
        self.update_tau = update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.training_start = training_start
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.actor = Actor(input_dim=self.s_dim,
                           output_dim=self.a_dim,
                           update_tau=self.update_tau).to(self.device)
        self.critic = Critic(state_dim=self.s_dim,
                             action_dim=self.a_dim,
                             update_tau=self.update_tau).to(self.device)
        self.buffer = ReplayBuffer(buffer_size=self.buffer_size)
        self.loss_actor = 0
        self.loss_critic = 0
        self.optimizer_a = optim.Adam(self.actor.eval_net.parameters(), lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

    def choose_action(self, s):
        s = torch.Tensor(s).to(self.device)
        return self.actor.get_eval(s).to(torch.device('cpu')).detach().numpy().tolist()

    def percive(self, state, action, reward, state_, done):
        # store a transition and train once enough samples are buffered
        self.buffer.add(state, action, reward, state_, done)
        if self.training_start < self.buffer.count():
            self.Train()

    def get_critic_loss(self, reward, state_next, state, action, done):
        # TD target: r + gamma * Q'(s', mu'(s')) * (1 - done)
        action_next = self.actor.get_target(state_next)
        q_next_tar = self.critic.get_target(s=state_next, a=action_next)
        Q_target = reward + self.gamma * q_next_tar * (1 - done)
        Q_eval = self.critic.get_eval(s=state, a=action)
        # detach the target so no gradient flows into the target networks
        return F.mse_loss(Q_eval, Q_target.detach())

    def Train(self):
        minibatch = self.buffer.get_batch(batch_size=self.batch_size)
        state_batch = torch.Tensor([data[0] for data in minibatch]).to(self.device)
        action_batch = torch.Tensor([data[1] for data in minibatch]).to(self.device)
        reward_batch = torch.Tensor([data[2] for data in minibatch]).to(self.device)
        state_next_batch = torch.Tensor([data[3] for data in minibatch]).to(self.device)
        done_batch = torch.Tensor([data[4] for data in minibatch]).to(self.device)

        # train critic
        self.loss_critic = self.get_critic_loss(reward_batch, state_next_batch,
                                                state_batch, action_batch, done_batch)
        self.optimizer_c.zero_grad()
        self.loss_critic.backward()
        self.optimizer_c.step()

        # train actor: maximize Q(s, mu(s)); the action must come from the
        # current policy (not the replay buffer) so the gradient reaches the actor
        self.loss_actor = -self.critic.get_eval(state_batch,
                                                self.actor.get_eval(state_batch)).mean()
        self.optimizer_a.zero_grad()
        self.loss_actor.backward()
        self.optimizer_a.step()

        # update the target nets
        self.actor.soft_update()
        self.critic.soft_update()
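# Actor.soft_update() and Critic.soft_update() are not shown in this file. A
# common way to implement them, given the update_tau passed in above, is a
# Polyak ("soft") update of the target parameters,
# target <- (1 - tau) * target + tau * eval. A minimal PyTorch sketch under
# that assumption (the target_net attribute name is hypothetical; the Actor
# above does expose an eval_net):

import torch


def soft_update(target_net, eval_net, tau):
    """Move each target parameter a small step toward the eval parameter."""
    with torch.no_grad():
        for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * e_param)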
class DDPG:
    """docstring for DDPG"""

    def __init__(self, state_space, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.sess = tf.Session()

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_space = state_space
        self.action_dim = action_dim  # 1

        self.ac_network = ActorCriticNetwork(self.sess, self.state_space,
                                             self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Get the Q target label: maxQ(s',a')
        q_value_batch = self.ac_network.target_q(next_state_batch)

        # Calculate the target: y = reward + GAMMA * maxQ(s',a')
        y_batch = []
        batch_size = len(minibatch)
        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [batch_size, 1])

        # Update the eval critic network by minimizing the loss L
        cost = self.ac_network.train_critic(y_batch, state_batch, action_batch)
        print('step_%d critic cost:' % self.ac_network.time_step, cost)

        # Update the eval actor policy using the sampled gradient
        self.ac_network.train_actor(state_batch)

        # Update the target networks
        self.ac_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.ac_network.actions(state)
        return action[0] + self.exploration_noise.noise()

    def action(self, state):
        action = self.ac_network.actions([state])
        return action[0]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions up to the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def sparse_tensor(self, state_batch, state_space):
        # build a one-hot style SparseTensorValue: row r gets a 1.0 at every
        # column index listed in state_batch[r]
        row = len(state_batch)
        indices = []
        for r in range(row):
            indices += [(r, c) for c in state_batch[r]]
        values = [1.0 for i in range(len(indices))]
        return tf.SparseTensorValue(indices=indices, values=values,
                                    dense_shape=[row, state_space])
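# The per-sample Python loop that builds y_batch above can also be written as
# one vectorized expression. A small NumPy sketch, assuming rewards, next-state
# Q values, and done flags are 1-D arrays of equal length (illustrative only):

import numpy as np


def td_targets(rewards, q_next, dones, gamma):
    """y_i = r_i if done_i else r_i + gamma * Q'(s'_i, a'_i)."""
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
    q_next = np.asarray(q_next, dtype=np.float32).reshape(-1)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32).reshape(-1)
    return (rewards + gamma * q_next * not_done).reshape(-1, 1)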
class RDPG:
    """docstring for RDPG"""

    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)

        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)

        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(observations,
                                                         action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observations and actions
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store histories up to the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # A stored history corresponds to a finished episode, so re-initialize
        # the random process for the next episode
        self.exploration_noise.reset()

    def pad(self, input):
        # zero-pad a variable-length sequence of vectors to a fixed length of 1000
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
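# The ReplayBuffer class used throughout this file is not shown; the
# transition-based agents only rely on add(state, action, reward, next_state,
# done), get_batch(batch_size) / getBatch(batch_size), and count(). A minimal
# deque-based sketch of that interface (an assumption, not the original
# implementation; the RDPG buffer above stores whole episode histories rather
# than single transitions):

import random
from collections import deque


class SimpleReplayBuffer(object):

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # uniform sample without replacement; callers check count() first
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)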
def main(args): if VERBOSE: print '***The Replay Buffer currently always returns the most recent experiences (instead of random), so the batches are constant between the tf and torch nets.' state_dim = 3 action_dim = 1 net = ActorCriticNet(state_dim, action_dim) target_net = copy.deepcopy(net) memory = ReplayBuffer(REPLAY_BUFFER_SIZE) noise = OUNoise(action_dim) criterion = nn.MSELoss() optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, weight_decay=L2) target_optim = optim.Optimizer(target_net.parameters(), {}) # to iterate over target params if VERBOSE: print '***Making gym env (only used to setup TF net).' # load tf net (restoring saved parameters) dtf = ddpg_tf.DDPG_TF(filter_env.makeFilteredEnv(gym.make('Pendulum-v0')), loadfilename='tf_params-0', printVars=False) if VERBOSE: print '***TF net restore complete.' # load control data (only using a every fourth data), and tf net results control_states = np.load('control_states.npy')[::4] control_rewards = np.load('control_rewards.npy')[::4] tf_record = np.load('tf_control_record.npy') # replace torch params with tf params, and run control data, collecting torch net results # first optimization step will occur at i == 50, upon which extra data is recorded to compare tf and torch # using: no bn, REPLAY_BUFFER_SIZE=200, REPLAY_START_SIZE=50, BATCH_SIZE=50, constant replay_buffer_batches (always the most recent experiences) replaceNetParams(dtf, net, target_net) if VERBOSE: print '***Torch net params initialized to TF net params.' original_net = copy.deepcopy(net) # save original net original_target_net = copy.deepcopy(target_net) torch_record = [] loss = -1 first_step = True for i in xrange(len(control_rewards) - 1): state = torch.from_numpy(control_states[i].reshape(1, state_dim)).float() action = net.getAction(Variable(state)).data target_action = target_net.getAction(Variable(state)).data reward = torch.FloatTensor([[control_rewards[i]]]).float() new_state = torch.from_numpy(control_states[i + 1].reshape( 1, state_dim)).float() memory.add(state, action, reward, new_state, True) if memory.count() > REPLAY_START_SIZE: minibatch = memory.get_batch(BATCH_SIZE) state_batch = torch.cat([data[0] for data in minibatch], dim=0) action_batch = torch.cat([data[1] for data in minibatch], dim=0) reward_batch = torch.cat([data[2] for data in minibatch]) next_state_batch = torch.cat([data[3] for data in minibatch], dim=0) done_batch = Tensor([data[4] for data in minibatch]) # calculate y_batch from targets #next_action_batch = target_net.getAction(Variable(next_state_batch)) value_batch = target_net.getValue(Variable(next_state_batch)).data y_batch = reward_batch + GAMMA * value_batch * done_batch if first_step: if VERBOSE: print '***First Optimization Step complete.' 
torch_ys = y_batch torch_batch = minibatch torch_outs = net.getValue(Variable(state_batch)).data # optimize net 1 step loss = criterion(net.getValue(Variable(state_batch)), Variable(y_batch)) optimizer.zero_grad() loss.backward() optimizer.step() loss = loss.data[0] # update targets - using exponential moving averages for group, target_group in zip(optimizer.param_groups, target_optim.param_groups): for param, target_param in zip(group['params'], target_group['params']): target_param.data.mul_(1 - TAU) target_param.data.add_(TAU, param.data) if first_step: first_step_net = copy.deepcopy(net) first_step_target_net = copy.deepcopy(target_net) first_step = False torch_record.append( [action.numpy()[0][0], target_action.numpy()[0][0], loss]) loss = -1 torch_record = np.array(torch_record) torch_outs = torch_outs.numpy().T[0] torch_ys = torch_ys.numpy().T[0] if VERBOSE: print '***Control Data run complete.' # compare torch and tf results # results for each net have 3 columns: [net action prediction, target net action prediction, loss (-1 if there was no training)] sel = np.arange(45, 55) #print calc_error(tf_record[sel,:], torch_record[sel,:]) print 'Result comparison:' print 'control_data_index | tf_net_action | tf_target_net_action | tf_loss | torch_net_action | torch_target_net_action | torch_loss' print np.hstack( [sel[:, np.newaxis], tf_record[sel, :], torch_record[sel, :]]) print '\t(a loss of -1 means no training occured in that step)' # load all tf results from before taking first optimization step tf_ys = np.load('tf_first_step_y_batch.npy') tf_rs = np.load('tf_first_step_reward_batch.npy') tf_ds = np.load('tf_first_step_done_batch.npy') tf_vs = np.load('tf_first_step_value_batch.npy') tf_outs = np.load('tf_first_step_output_values.npy') torch_wd = 1.36607 # weight decay loss of tf net at first optimization step - recorded directly from terminal output of tf net if VERBOSE: print '***Comparing first step stats' # compare tf and torch data from before taking first optimization step # including calculation of manual loss print '\terror in ys (between tf and torch)', calc_error( torch_ys, tf_ys) print '\terror in predictions (between tf and torch)', calc_error( torch_outs, tf_outs) print '\ttorch loss (manually calculated)', np.mean( (torch_ys - torch_outs)**2) print '\ttf loss (manually calculated)', np.mean((tf_ys - tf_outs)**2) print '\ttorch loss', torch_record[50, 2], '(not including weight decay)' print '\ttf loss', tf_record[ 50, 2] - torch_wd, '(not including weight decay)' return 0
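# calc_error is called above but not defined in this excerpt. A plausible,
# purely illustrative helper would report the mean absolute difference between
# the two arrays, normalized by the magnitude of the reference:

import numpy as np


def calc_error(a, b):
    """Relative mean absolute error between two arrays of the same shape."""
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return np.mean(np.abs(a - b)) / (np.mean(np.abs(b)) + 1e-12)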
class NeuralQLearner(object): def __init__(self, session, optimizer, q_network, state_dim, num_actions, batch_size=32, init_exp=0.5, # initial exploration prob final_exp=0.1, # final exploration prob anneal_steps=10000, # N steps for annealing exploration replay_buffer_size=10000, store_replay_every=5, # how frequent to store experience discount_factor=0.9, # discount future rewards target_update_rate=0.01, reg_param=0.01, # regularization constants max_gradient=5, # max gradient norms double_q_learning=False, summary_writer=None, summary_every=100): # tensorflow machinery self.session = session self.optimizer = optimizer self.summary_writer = summary_writer # model components self.q_network = q_network self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size) # Q learning parameters self.batch_size = batch_size self.state_dim = state_dim self.num_actions = num_actions self.exploration = init_exp self.init_exp = init_exp self.final_exp = final_exp self.anneal_steps = anneal_steps self.discount_factor = discount_factor self.target_update_rate = target_update_rate self.double_q_learning = double_q_learning # training parameters self.max_gradient = max_gradient self.reg_param = reg_param # counters self.store_replay_every = store_replay_every self.store_experience_cnt = 0 self.train_iteration = 0 # create and initialize variables self.create_variables() var_lists = tf.get_collection(tf.GraphKeys.VARIABLES) self.session.run(tf.initialize_variables(var_lists)) # make sure all variables are initialized self.session.run(tf.assert_variables_initialized()) if self.summary_writer is not None: # graph was not available when journalist was created self.summary_writer.add_graph(self.session.graph) self.summary_every = summary_every def create_variables(self): # compute action from a state: a* = argmax_a Q(s_t,a) with tf.name_scope("predict_actions"): # raw state representation self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states") # initialize Q network with tf.variable_scope("q_network"): self.q_outputs = self.q_network(self.states) # predict actions from Q network self.action_scores = tf.identity(self.q_outputs, name="action_scores") tf.histogram_summary("action_scores", self.action_scores) self.predicted_actions = tf.argmax(self.action_scores, dimension=1, name="predicted_actions") # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a) with tf.name_scope("estimate_future_rewards"): self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states") self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks") if self.double_q_learning: # reuse Q network for action selection with tf.variable_scope("q_network", reuse=True): self.q_next_outputs = self.q_network(self.next_states) self.action_selection = tf.argmax(tf.stop_gradient(self.q_next_outputs), 1, name="action_selection") tf.histogram_summary("action_selection", self.action_selection) self.action_selection_mask = tf.one_hot(self.action_selection, self.num_actions, 1, 0) # use target network for action evaluation with tf.variable_scope("target_network"): self.target_outputs = self.q_network(self.next_states) * tf.cast(self.action_selection_mask, tf.float32) self.action_evaluation = tf.reduce_sum(self.target_outputs, reduction_indices=[1,]) tf.histogram_summary("action_evaluation", self.action_evaluation) self.target_values = self.action_evaluation * self.next_state_mask else: # initialize target network with tf.variable_scope("target_network"): 
self.target_outputs = self.q_network(self.next_states) # compute future rewards self.next_action_scores = tf.stop_gradient(self.target_outputs) self.target_values = tf.reduce_max(self.next_action_scores, reduction_indices=[1,]) * self.next_state_mask tf.histogram_summary("next_action_scores", self.next_action_scores) self.rewards = tf.placeholder(tf.float32, (None,), name="rewards") self.future_rewards = self.rewards + self.discount_factor * self.target_values # compute loss and gradients with tf.name_scope("compute_temporal_differences"): # compute temporal difference loss self.action_mask = tf.placeholder(tf.float32, (None, self.num_actions), name="action_mask") self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, reduction_indices=[1,]) self.temp_diff = self.masked_action_scores - self.future_rewards self.td_loss = tf.reduce_mean(tf.square(self.temp_diff)) # regularization loss q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network") self.reg_loss = self.reg_param * tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in q_network_variables]) # compute total loss and gradients self.loss = self.td_loss + self.reg_loss gradients = self.optimizer.compute_gradients(self.loss) # clip gradients by norm for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var) # add histograms for gradients. for grad, var in gradients: tf.histogram_summary(var.name, var) if grad is not None: tf.histogram_summary(var.name + '/gradients', grad) self.train_op = self.optimizer.apply_gradients(gradients) # update target network with Q network with tf.name_scope("update_target_network"): self.target_network_update = [] # slowly update target network parameters with Q network parameters q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network") target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network") for v_source, v_target in zip(q_network_variables, target_network_variables): # this is equivalent to target = (1-alpha) * target + alpha * source update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source)) self.target_network_update.append(update_op) self.target_network_update = tf.group(*self.target_network_update) # scalar summaries tf.scalar_summary("td_loss", self.td_loss) tf.scalar_summary("reg_loss", self.reg_loss) tf.scalar_summary("total_loss", self.loss) tf.scalar_summary("exploration", self.exploration) self.summarize = tf.merge_all_summaries() self.no_op = tf.no_op() def storeExperience(self, state, action, reward, next_state, done): # always store end states if self.store_experience_cnt % self.store_replay_every == 0 or done: self.replay_buffer.add(state, action, reward, next_state, done) self.store_experience_cnt += 1 def eGreedyAction(self, states, explore=True): if explore and self.exploration > random.random(): return random.randint(0, self.num_actions-1) else: return self.session.run(self.predicted_actions, {self.states: states})[0] def annealExploration(self, stategy='linear'): ratio = max((self.anneal_steps - self.train_iteration)/float(self.anneal_steps), 0) self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp def updateModel(self): # not enough experiences yet if self.replay_buffer.count() < self.batch_size: return batch = self.replay_buffer.getBatch(self.batch_size) states = np.zeros((self.batch_size, self.state_dim)) rewards = 
np.zeros((self.batch_size,)) action_mask = np.zeros((self.batch_size, self.num_actions)) next_states = np.zeros((self.batch_size, self.state_dim)) next_state_mask = np.zeros((self.batch_size,)) for k, (s0, a, r, s1, done) in enumerate(batch): states[k] = s0 rewards[k] = r action_mask[k][a] = 1 # check terminal state if not done: next_states[k] = s1 next_state_mask[k] = 1 # whether to calculate summaries calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None # perform one update of training cost, _, summary_str = self.session.run([ self.loss, self.train_op, self.summarize if calculate_summaries else self.no_op ], { self.states: states, self.next_states: next_states, self.next_state_mask: next_state_mask, self.action_mask: action_mask, self.rewards: rewards }) # update target network using Q-network self.session.run(self.target_network_update) # emit summaries if calculate_summaries: self.summary_writer.add_summary(summary_str, self.train_iteration) self.annealExploration() self.train_iteration += 1
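# annealExploration above interpolates the exploration probability linearly
# from init_exp down to final_exp over anneal_steps training iterations. A
# standalone sketch of the same schedule (illustrative):

def linear_epsilon(step, anneal_steps, init_exp, final_exp):
    """Linearly annealed exploration probability, clamped at final_exp."""
    ratio = max((anneal_steps - step) / float(anneal_steps), 0.0)
    return (init_exp - final_exp) * ratio + final_exp

# With the defaults above (init_exp=0.5, final_exp=0.1, anneal_steps=10000)
# this returns 0.5 at step 0, about 0.3 halfway through annealing, and stays
# at 0.1 from step 10000 onward.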
class DDPG: """docstring for DDPG""" def __init__(self, state_dim, action_dim): """name for uploading resuults""" self.name = 'DDPG' self.time_step = 0 # self.atten_rate = 1 """Randomly initialize actor network and critic network""" """and both their target networks""" self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) """initialize replay buffer""" self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) """Initialize a random process the Ornstein-Uhlenbeck process for action exploration""" self.exploration_noise = OUNoise(self.action_dim) """Initialize a Treading""" self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') def train(self): # if self.time_step ==0: # print("Begins Training!!!") #print("Training Begins") self.time_step += 1 """Sample a random minibatch of N transitions from replay buffer""" """take out BATCH_SIZE sets of data""" minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) """resize the action_batch shape to [BATCH_SIZE, self.action_dim]""" action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) """Calculate y_batch(reward)""" next_action_batch = self.actor_network.target_action(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) """Update critic by minimizing the loss L (training)""" self.critic_network.train(y_batch, state_batch, action_batch) """Update the actor policy using the sampled gradient:""" action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) """Update the target networks""" self.actor_network.update_target() self.critic_network.update_target() #print("Training Finished") def noise_action(self, state): """Select action a_t according to the current policy and exploration noise""" action = self.actor_network.action(state) exp_noise = self.exploration_noise.noise() action += exp_noise # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def action(self, state): action = self.actor_network.action(state) # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def perceive(self, state, action, reward, next_state, done): """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer""" self.replay_buffer.add(state, action, reward, next_state, done) """Store transitions to replay start size then start training""" # if self.replay_buffer.count() % 1000 == 0: # print("The buffer count is ", self.replay_buffer.count()) if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # self.atten_rate *= 0.99995 if not self.threading.is_alive(): self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') self.threading.start() """SAVE NETWORK""" if 
self.time_step % 100 == 0: print("Training_time_step:", self.time_step) if self.time_step % 1000 == 0: print("!!!!!!!save model success!!!!!!!!") self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) """Re-iniitialize the random process when an episode ends""" if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, sess, data_fname): self.name = 'DDPG' # Randomly initialize actor network and critic network # with both their target networks self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = Hp.state_dim self.action_dim = Hp.action_dim print(self.state_dim, self.action_dim) self.sess = sess self.state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.target_state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.state_network = StateEnc(self.sess, self.state_input, self.target_state_input) state_batch = self.state_network.encoding next_state_batch = self.state_network.target_encoding weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters( ) state_network_params = weights + biases + [ w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 ] self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim, self.state_input, state_batch, next_state_batch, state_network_params) self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim, state_batch, next_state_batch) # initialize replay buffer self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname) self.summary_str2 = None # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN) print("######### TRAINING #############") for k in range(Hp.N_TRAIN): minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size] state_batch_r = np.asarray([data[0] for data in minibatch]) state_batch = [] for j in range(Hp.categories): new_cat = np.stack(state_batch_r[:, j], axis=0) state_batch.append(new_cat) #state_batch = [np.expand_dims(state_batch, axis=1)] action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch_r = np.asarray([data[3] for data in minibatch]) next_state_batch = [] for j in range(Hp.categories): new_cat = np.stack(next_state_batch_r[:, j], axis=0) next_state_batch.append(new_cat) #next_state_batch = [np.expand_dims(next_state_batch, axis=1)] done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [Hp.batch_size, self.action_dim]) next_action_batch = self.actor_network.target_actions( self.target_state_input, next_state_batch) q_value_batch = self.critic_network.target_q( self.target_state_input, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [Hp.batch_size, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, self.state_input, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.state_input, state_batch) q_gradient_batch = self.critic_network.gradients( self.state_input, state_batch, 
action_batch_for_gradients) self.summary_str2 = self.actor_network.train( q_gradient_batch, self.state_input, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() self.state_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) print("no noise ", action) return np.clip( action + self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]), [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0]) def action(self, state): state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > Hp.REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
def main(config_dict): train = config_dict['train'] network = config_dict['network'] experiment_name = config_dict['experiment_name'] EXPERIMENTS_PATH = config_dict['EXPERIMENTS_PATH'] actor_weights_file = "%s%s/%s_actor.h5" % (EXPERIMENTS_PATH, network, network) critic_weights_file = "%s%s/%s_critic.h5" % (EXPERIMENTS_PATH, network, network) log_directory = "%s%s/%s/" % (EXPERIMENTS_PATH, network, experiment_name) BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 LRA = 0.0001 LRC = 0.001 action_dim = 3 # Steering / Acceleration / Blake state_dim = 29 # Dimension of sensor inputs #np.random.seed(42) vision = False EXPLORE = 100000. episode_count = 2000 max_steps = 100000 done = False step = 0 epsilon = 1 exp_logger = TORCS_ExperimentLogger(log_directory, experiment_name) #directory = "%s%s/" % (EXPERIMENTS_PATH, experiment) #actor_weights_file = "%s%s_%s" % (directory, experiment, "actor.h5") #critic_weights_file = "%s%s_%s" % (directory, experiment, "critic.h5") # TensorFlow GPU config = tf.ConfigProto() # Not sure if this is really necessary, since we only have a single GPU config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) env = TorcsEnv(vision=vision, throttle=True, gear_change=False) # Weight loading if not train: try: actor.model.load_weights(actor_weights_file) critic.model.load_weights(critic_weights_file) actor.target_model.load_weights(actor_weights_file) critic.target_model.load_weights(critic_weights_file) print "Weights loaded successfully" time.sleep(2) except: print "Error in loading weights" print '-' * 60 traceback.print_exc(file=sys.stdout) print '-' * 60 assert (False) for i in xrange(episode_count): print "Episode: %i; Replay Buffer: %i" % (i, buff.count()) if np.mod(i, 3) == 0: # Relaunch TORCS every 3 episodes; memory leak error ob = env.reset(relaunch=True) else: ob = env.reset() state_t = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) total_reward = 0. # Compute rewards for j in xrange(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE # exploration factor action_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) action_t_raw = actor.model.predict( state_t.reshape( 1, state_t.shape[0])) # this call to reshape seems suboptimal noise_t[0][0] = train * max(epsilon, 0) * OU.run( action_t_raw[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train * max(epsilon, 0) * OU.run( action_t_raw[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train * max(epsilon, 0) * OU.run( action_t_raw[0][2], -0.1, 1.00, 0.05) # stochastic brake #if random.random() <= 0.1: # noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], 0.2, 1.00, 0.10) # May be able to do this a bit more concisely with NumPy vectorization action_t[0][0] = action_t_raw[0][0] + noise_t[0][0] action_t[0][1] = action_t_raw[0][1] + noise_t[0][1] action_t[0][2] = action_t_raw[0][2] + noise_t[0][2] # Raw_reward_t is the raw reward computed by the gym_torcs script. 
# We will compute our own reward metric from the ob object ob, raw_reward_t, done, info = env.step(action_t[0]) state_t1 = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) #reward_t = lng_trans(ob) reward_t = raw_reward_t buff.add(state_t, action_t[0], reward_t, state_t1, done) # Add replay buffer # Batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) done_indicators = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) # Can't we just use BATCH_SIZE here for k in xrange(len(batch)): if done_indicators[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.train_target_net() critic.train_target_net() exp_logger.log(ob, action_t[0], reward_t, loss) total_reward += reward_t state_t = state_t1 print("Episode", i, "Step", step, "Action", action_t, "Reward", reward_t, "Loss", loss) step += 1 if done: break if np.mod(i, 3) == 0: if (train): print("Now we save model") actor.model.save_weights(actor_weights_file, overwrite=True) #with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights(critic_weights_file, overwrite=True) #with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down TORCS print("Finish.")
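# OU.run(x, mu, theta, sigma), used for the exploration noise above, is not
# defined in this excerpt. In similar TORCS DDPG setups the Ornstein-Uhlenbeck
# helper returns a single noise increment that pulls the action toward mu with
# strength theta plus Gaussian noise scaled by sigma. A sketch under that
# assumption (illustrative, not the original helper):

import numpy as np


class OU(object):

    @staticmethod
    def run(x, mu, theta, sigma):
        """One Ornstein-Uhlenbeck increment: theta * (mu - x) + sigma * N(0, 1)."""
        return theta * (mu - x) + sigma * np.random.randn()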
class DDPG:

    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,
                                                         action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        # print(a_linear, a_angular)
        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
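# OUNoise is constructed throughout this file either as OUNoise(dim) or, as
# above, as OUNoise(dim, mu, theta, sigma), and exposes noise() and reset(),
# but its definition is not included here. A minimal sketch of a stateful
# Ornstein-Uhlenbeck process with that interface (an assumption about the
# original class, with made-up default parameters):

import numpy as np


class SimpleOUNoise(object):

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state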
class DDQN: def __init__(self, model_name, action_dim): self.device = configure.DEVICE self.model_name = model_name self.action_dim = action_dim self.episode = 0 # self.timeStep = 0 self.STARTtrain = False self.epsilon = INITIAL_EPSILON self.img_width = configure.IMAGE_WIDTH self.img_height = configure.IMAGE_HEIGHT self.img_channels = configure.STACKED_FRAMES * 4 self.learning_rate = configure.LEARNING_RATE_START self.tau = configure.TargetNet_Tau self.replaybuffer = ReplayBuffer(REPLAY_MEMORY) self.graph = tf.Graph() with self.graph.as_default() as g: with tf.device(self.device): with tf.variable_scope('Main_net'): self.imageIn, self.conv1, self.conv2, self.conv3, self.pool1, self.conv4, \ self.Advantage, self.Value, self.Qout, self.predict \ = self.__create_graph() with tf.variable_scope('Target_net'): self.imageInT, _, _, _, _, _, _, _, self.QoutT, _ = self.__create_graph( ) self.MainNet_vars = get_variables('Main_net') self.TargetNet_vars = get_variables('Target_net') self.createTrainingMethod() self.createupdateTargetNetOp() self.sess = tf.Session( graph=self.graph, config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, gpu_options=tf.GPUOptions(allow_growth=True))) self.sess.run(tf.global_variables_initializer()) if configure.TENSORBOARD: self._create_tensor_board() # if configure.LOAD_CHECKPOINT or configure.SAVE_MODELS: # vars = tf.global_variables() # self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0) self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(self.model_name) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path mypath = str(checkpoint.model_checkpoint_path) stepmatch = re.split('-', mypath)[2] self.episode = int(stepmatch) # pass else: print "Could not find old network weights" # def __create_main_graph(self): # self.imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='imgIn') # # self.conv1 = self.conv2d_layer(self.imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1]) # self.conv2 = self.conv2d_layer(self.conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1]) # self.conv3 = self.conv2d_layer(self.conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1]) # self.conv4 = self.conv2d_layer(self.conv3, self.conv3.get_shape()[1].value, 512, 'conv4', strides=[1,1,1,1]) # with tf.variable_scope('A_V'): # self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3) # self.streamA = tf.contrib.layers.flatten(self.streamAC) # self.streamV = tf.contrib.layers.flatten(self.streamVC) # # self.AW = tf.Variable(tf.random_normal([self.streamA, self.action_dim]), name='AW') # self.VW = tf.Variable(tf.random_normal([self.streamV, 1]), name='VW') # self.Advantage = tf.matmul(self.streamA, self.AW, name='Advantage') # self.Value = tf.matmul(self.streamV, self.VW, name='Value') # # with tf.variable_scope('Qout'): # self.Qout = self.Value + tf.subtract( # self.Advantage, tf.reduce_mean(self.Advantage, reduction_indices=1, keep_dims=True)) # # with tf.variable_scope('Predict'): # self.predict = tf.argmax(self.Qout, 1) def __create_graph(self): imageIn = tf.placeholder( tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='imgIn') conv1 = self.conv2d_layer(imageIn, 8, 128, 'conv1', strides=[1, 4, 4, 1]) conv2 = self.conv2d_layer(conv1, 4, 128, 'conv2', strides=[1, 2, 2, 1]) conv3 = self.conv2d_layer(conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1]) pool1 = 
self.mpool_layer(conv3, 2, [1, 2, 2, 1], name='pool1') conv4 = self.conv2d_layer(pool1, pool1.get_shape()[1].value, 1024, 'conv4', strides=[1, 1, 1, 1], padding='VALID') streamAC, streamVC = tf.split(conv4, 2, 3) streamA = tf.contrib.layers.flatten(streamAC) streamV = tf.contrib.layers.flatten(streamVC) Advantage = self.fc_layer(streamA, self.action_dim, 'Advantage', func=None) Value = self.fc_layer(streamV, 1, 'Value', func=None) # AW = tf.Variable(tf.random_normal([streamA.get_shape()[1].value, self.action_dim]), name='AW') # VW = tf.Variable(tf.random_normal([streamV.get_shape()[1].value, 1]), name='VW') # Advantage = tf.matmul(streamA, AW, name='Advantage') # Value = tf.matmul(streamV, VW, name='Value') with tf.variable_scope('Qout'): Qout = Value + tf.subtract( Advantage, tf.reduce_mean(Advantage, reduction_indices=1, keep_dims=True)) with tf.variable_scope('Predict'): predict = tf.argmax(Qout, 1) return imageIn, conv1, conv2, conv3, pool1, conv4, Advantage, Value, Qout, predict # def __create_target_graph(self): # self.target_imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], # name='imgIn') # self.target_conv1 = self.conv2d_layer(self.target_imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1]) # self.target_conv2 = self.conv2d_layer(self.target_conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1]) # self.target_conv3 = self.conv2d_layer(self.target_conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1]) # self.target_conv4 = self.conv2d_layer(self.target_conv3, self.target_conv3.get_shape()[1].value, 512, 'conv4', strides=[1, 1, 1, 1]) # with tf.variable_scope('A_V'): # self.target_streamAC, self.target_streamVC = tf.split(self.target_conv4, 2, 3) # self.target_streamA = tf.contrib.layers.flatten(self.target_streamAC) # self.target_streamV = tf.contrib.layers.flatten(self.target_streamVC) # # self.target_AW = tf.Variable(tf.random_normal([self.target_streamA, self.action_dim]), name='AW') # self.target_VW = tf.Variable(tf.random_normal([self.target_streamV, 1]), name='VW') # self.target_Advantage = tf.matmul(self.target_streamA, self.target_AW, name='Advantage') # self.target_Value = tf.matmul(self.target_streamV, self.target_VW, name='Value') # # with tf.variable_scope('Qout'): # self.Qout = self.target_Value + tf.subtract( # self.target_Advantage, tf.reduce_mean(self.target_Advantage, reduction_indices=1, keep_dims=True)) def createTrainingMethod(self): self.global_step = tf.Variable(0, trainable=False, name='step') self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[]) self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32, name='targetQ') self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name='actions') self.actions_onehot = tf.one_hot(self.actions, self.action_dim, dtype=tf.float32, name='act_onehot') self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), reduction_indices=1, name='Q') self.td_error = tf.square(self.targetQ - self.Q, name='td_error') self.loss = tf.reduce_mean(self.td_error, name='loss') self.trainer = tf.train.AdamOptimizer( learning_rate=self.var_learning_rate) self.train_op = self.trainer.minimize(self.loss, global_step=self.global_step, name='train_update') def createupdateTargetNetOp(self): self.assign_op = {} for from_, to_ in zip(self.MainNet_vars, self.TargetNet_vars): self.assign_op[to_.name] = to_.assign(self.tau * from_ + (1 - self.tau) * to_) def updateTargetNet(self): for var in self.TargetNet_vars: self.sess.run(self.assign_op[var.name]) def conv2d_layer(self, input, filter_size, 
out_dim, name, strides, func=tf.nn.relu, padding='SAME'): in_dim = input.get_shape()[-1].value # in_dim = input.get_shape()[-1].value d = 1.0 / np.sqrt(filter_size * filter_size * in_dim) with tf.variable_scope(name): w_init = tf.random_uniform_initializer(-d, d) b_init = tf.random_uniform_initializer(-d, d) w = tf.get_variable( 'w', shape=[filter_size, filter_size, in_dim, out_dim], dtype=tf.float32, initializer=w_init) b = tf.get_variable('b', shape=[out_dim], initializer=b_init) output = tf.nn.conv2d(input, w, strides=strides, padding=padding) + b if func is not None: output = func(output) return output def mpool_layer(self, input_op, mpool_size, strides, name): with tf.variable_scope(name): output = tf.nn.max_pool(input_op, ksize=[1, mpool_size, mpool_size, 1], strides=strides, padding="SAME") return output def fc_layer(self, input, out_dim, name, func=tf.nn.relu): in_dim = input.get_shape()[-1].value d = 1.0 / np.sqrt(in_dim) with tf.variable_scope(name): w_init = tf.random_uniform_initializer(-d, d) b_init = tf.random_uniform_initializer(-d, d) w = tf.get_variable('w', dtype=tf.float32, shape=[in_dim, out_dim], initializer=w_init) b = tf.get_variable('b', dtype=tf.float32, shape=[out_dim], initializer=b_init) output = tf.matmul(input, w) + b if func is not None: output = func(output) return output def _create_tensor_board(self): summaries = tf.get_collection(tf.GraphKeys.SUMMARIES) summaries.append(tf.summary.scalar("Loss", self.loss)) for var in tf.trainable_variables(): summaries.append(tf.summary.histogram("W_%s" % var.name, var)) summaries.append(tf.summary.histogram("conv1", self.conv1)) summaries.append(tf.summary.histogram("conv2", self.conv2)) summaries.append(tf.summary.histogram("conv3", self.conv3)) summaries.append(tf.summary.histogram("pool1", self.pool1)) summaries.append(tf.summary.histogram("conv4", self.conv4)) summaries.append(tf.summary.histogram("Advantage", self.Advantage)) summaries.append(tf.summary.histogram("Value", self.Value)) summaries.append(tf.summary.histogram("Qout", self.Qout)) summaries.append(tf.summary.histogram("Q", self.Q)) self.summary_op = tf.summary.merge(summaries) self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name, self.sess.graph) def log(self, y_batch, action_batch, state_batch): feed_dict = { self.targetQ: y_batch, self.actions: action_batch, self.imageIn: state_batch, self.var_learning_rate: self.learning_rate } step, summary = self.sess.run([self.global_step, self.summary_op], feed_dict=feed_dict) self.log_writer.add_summary(summary, step) def trainQNetwork(self): minibatch = self.replaybuffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) action_batch = np.resize(action_batch, [BATCH_SIZE]) A = self.sess.run(self.predict, feed_dict={self.imageIn: next_state_batch}) Q = self.sess.run(self.QoutT, feed_dict={self.imageInT: next_state_batch}) doubleQ = Q[range(BATCH_SIZE), A] targetQ = [] for i in range(len(minibatch)): if done_batch[i]: targetQ.append(reward_batch[i]) else: targetQ.append(reward_batch[i] + GAMMA * doubleQ[i]) # targetQ = np.resize(targetQ, [BATCH_SIZE, 1]) self.sess.run(self.train_op, feed_dict={ self.imageIn: state_batch, self.targetQ: targetQ, self.actions: action_batch, self.var_learning_rate: self.learning_rate }) 
self.updateTargetNet() if self.episode % configure.SAVE_NET == 0 and self.episode != 0: self.saver.save(self.sess, self.model_name + '/network' + '-dqn', global_step=self.episode) if configure.TENSORBOARD and self.episode % configure.TENSORBOARD_UPDATE_FREQUENCY == 0 and self.episode != 0: self.log(targetQ, action_batch, state_batch) self.episode += 1 self.STARTtrain = True def setPerception(self, nextObservation, action, reward, terminal): newState = np.concatenate( (self.currentState[:, :, 4:], nextObservation), axis=2) self.replaybuffer.add(self.currentState, action, reward, newState, terminal) # self.replayMemory.append((self.currentState, action, reward, newState, terminal)) if self.episode <= OBSERVE: state = "observe" elif self.episode > OBSERVE and self.episode <= OBSERVE + EXPLORE: state = "explore" else: state = "train" if self.episode % 100 == 0 and self.STARTtrain: print("episode", self.episode, "/ STATE", state, "/ EPSILON", self.epsilon) self.currentState = newState def Perce_Train(self): if self.replaybuffer.count() > configure.REPLAY_START_SIZE: self.trainQNetwork() def getAction(self): if np.random.rand(1) < self.epsilon: action_get = np.random.randint(0, self.action_dim) else: action_get = self.sess.run( self.predict, feed_dict={self.imageIn: [self.currentState]}) if self.epsilon > FINAL_EPSILON and self.episode > OBSERVE: self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE return action_get def setInitState_rgb(self, observation): self.currentState = observation for i in range(configure.STACKED_FRAMES - 1): self.currentState = np.concatenate( (self.currentState, observation), axis=2)
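# The trainQNetwork step above uses the Double-DQN target: the online network
# (self.predict) selects the greedy next action and the target network
# (self.QoutT) evaluates it. A minimal standalone sketch of that target
# computation, assuming hypothetical batch arrays in place of the values
# produced inside trainQNetwork:
import numpy as np


def double_dqn_targets(rewards, dones, q_online_next, q_target_next, gamma=0.99):
    # a* = argmax_a Q_online(s', a); y = r + gamma * Q_target(s', a*) unless terminal
    greedy_actions = np.argmax(q_online_next, axis=1)
    evaluated_q = q_target_next[np.arange(len(greedy_actions)), greedy_actions]
    return np.where(dones, rewards, rewards + gamma * evaluated_q)


# Example with 3 transitions and 2 actions:
# double_dqn_targets(np.array([1.0, 0.0, 0.5]), np.array([False, True, False]),
#                    np.random.rand(3, 2), np.random.rand(3, 2))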
class DDPG(object): def __init__(self, a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim): self.time_step = 1 self.memory = ReplayBuffer(MEMORY_CAPACITY) self.exploration_noise = OUNoise(a_dim) self.pointer = 0 self.sess = tf.Session() writer = tf.summary.FileWriter("logs/", self.sess.graph) self.a_dim, self.s_dim, self.a_bound, self.m_dim, self.pixel_meter, self.att_dim = \ a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim self.S = tf.placeholder(tf.float32, [None, s_dim], 's') self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.GM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'gm') self.LM = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm') self.LM_ = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm_') self.a = self._build_a(self.S, self.GM, self.LM, ) q = self._build_c(self.S, self.GM, self.LM, self.a, ) a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor') c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic') ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement def ema_getter(getter, name, *args, **kwargs): return ema.average(getter(name, *args, **kwargs)) target_update = [ema.apply(a_params), ema.apply(c_params)] # soft update operation a_ = self._build_a(self.S_, self.GM, self.LM_, reuse=True, custom_getter=ema_getter) # replaced target parameters q_ = self._build_c(self.S_, self.GM, self.LM_, a_, reuse=True, custom_getter=ema_getter) a_loss = - tf.reduce_mean(q) # maximize the q self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params) with tf.control_dependencies(target_update): # soft replacement happened at here q_target = self.R + GAMMA * q_ td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params) self.sess.run(tf.global_variables_initializer()) def noise_action(self, s1, gm1, loc1): locm = np.zeros([1, self.att_dim*2+1, self.att_dim*2+1, 4]) for j in range(self.att_dim * 2 + 1): for k in range(self.att_dim * 2 + 1): locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0]) return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis], self.LM: locm})[0] + self.exploration_noise.noise() def action(self, s1, gm1, loc1): locm = np.zeros([1, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4]) for j in range(self.att_dim * 2 + 1): for k in range(self.att_dim * 2 + 1): locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0]) return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis], self.LM: locm})[0] def perceive(self, sd, p, loc, s, a_store, r, s_, loc_, done): self.memory.add(sd, p, loc, s, a_store, r, s_, loc_, done) if self.memory.count() > REPLAY_START: self.learn() if self.time_step % 500000 == 0: self.save_network() def learn(self): self.time_step += 1 replay = self.memory.get_batch(BATCH_SIZE) bm_sd = np.asarray([data[0] for data in replay]) bp = np.asarray([data[1] for data in replay]) bloc = np.asarray([data[2] for data in replay]) bs = np.asarray([data[3] for data in replay]) ba = np.asarray([data[4] for data in replay]) br = np.reshape(np.asarray([data[5] for data in replay]), [-1, 1]) bs_ = np.asarray([data[6] for data in replay]) bloc_ = np.asarray([data[7] for data in replay]) bgm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1]) for 
batch in range(BATCH_SIZE): sd1 = bm_sd[batch] terrian_map = grid_map(sd1, self.m_dim, self.pixel_meter, bp[batch]) bgm[batch, :, :, 0] = terrian_map.map_matrix blocm = np.zeros([BATCH_SIZE, self.att_dim*2+1, self.att_dim*2+1, 4]) blocm_ = np.zeros([BATCH_SIZE, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4]) for i in range(BATCH_SIZE): for j in range(self.att_dim*2+1): for k in range(self.att_dim*2+1): blocm[i, j, k, :] = np.array([i, bloc[i, 0]-self.att_dim+j, bloc[i, 1]-self.att_dim+k, 0]) blocm_[i, j, k, :] = np.array([i, bloc_[i, 0] - self.att_dim + j, bloc_[i, 1] - self.att_dim + k, 0]) self.sess.run(self.atrain, {self.S: bs, self.GM: bgm, self.LM: blocm}) self.sess.run(self.ctrain, {self.GM: bgm, self.S: bs, self.LM: blocm, self.a: ba, self.R: br, self.S_: bs_, self.LM_: blocm_}) def _build_a(self, s, gm, locm, reuse=None, custom_getter=None): def _conv2d_keep_size(x, y, kernel_size, name, use_bias=False, reuse_conv=None, trainable_conv=True): return tf.layers.conv2d(inputs=x, filters=y, kernel_size=kernel_size, padding="same", use_bias=use_bias, kernel_initializer=tf.truncated_normal_initializer(stddev=0.01), bias_initializer=tf.truncated_normal_initializer(stddev=0.01), reuse=reuse_conv, name=name, trainable=trainable_conv) def _build_vin(mat, name, reuse, trainable_vin): h1 = _conv2d_keep_size(mat, 150, 3, name+"_h1", use_bias=True, reuse_conv=reuse, trainable_conv=trainable_vin) r = _conv2d_keep_size(h1, 1, 1, name+"_r", reuse_conv=reuse, trainable_conv=trainable_vin) q0 = _conv2d_keep_size(r, 10, 9, name+"_q0", reuse_conv=reuse, trainable_conv=trainable_vin) v = tf.reduce_max(q0, axis=3, keep_dims=True, name=name+"_v") rv = tf.concat([r, v], axis=3) q = _conv2d_keep_size(rv, 10, 9, name + "_q", reuse_conv=False, trainable_conv=trainable_vin) v = tf.reduce_max(q, axis=3, keep_dims=True, name=name + "_v") for k in range(30): rv = tf.concat([r, v], axis=3) q = _conv2d_keep_size(rv, 10, 9, name+"_q", reuse_conv=True, trainable_conv=trainable_vin) v = tf.reduce_max(q, axis=3, keep_dims=True, name=name+"_v") return v trainable = True if reuse is None else False with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter): gv = _build_vin(gm, name="global_map_vin", reuse=reuse, trainable_vin=trainable) att = tf.reshape(tf.gather_nd(gv, locm), [-1, (self.att_dim*2+1)**2]) layer_1 = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) layer_2a = tf.layers.dense(layer_1, 600, name='l2a', trainable=trainable) layer_2att = tf.layers.dense(att, 600, name='l2att', trainable=trainable) layer_2 = tf.add(layer_2a, layer_2att, name="l2") layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable) a = tf.layers.dense(layer_3, 7, activation=tf.nn.tanh, name='a1', trainable=trainable) return a def _build_c(self, s, gm, loc, a, reuse=None, custom_getter=None): trainable = True if reuse is None else False with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter): gm_flat = tf.reshape(gm, [-1, self.m_dim**2]) layer_gm = tf.layers.dense(gm_flat, self.s_dim, activation=tf.nn.relu, name='lgm', trainable=trainable) s_all = tf.concat([layer_gm, s], axis=1) layer_1 = tf.layers.dense(s_all, 300, activation=tf.nn.relu, name='l1', trainable=trainable) layer_2s = tf.layers.dense(layer_1, 600, activation=None, name='l2s', trainable=trainable) layer_2a = tf.layers.dense(a, 600, activation=None, name='l2a', trainable=trainable) layer_2 = tf.add(layer_2s, layer_2a, name="l2") layer_3 = tf.layers.dense(layer_2, 600, 
activation=tf.nn.relu, name='l3', trainable=trainable) return tf.layers.dense(layer_3, 1, trainable=trainable) # Q(s,a) def save_network(self): self.saver = tf.train.Saver() print("save ddpg-network...", self.time_step) self.saver.save(self.sess, 'saved_ddpg_networks/' + "ddpg-network", global_step=self.time_step) def load_network(self): self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state("saved_ddpg_networks") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old network weights")
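# The DDPG class above performs its soft target update through
# tf.train.ExponentialMovingAverage with decay 1 - TAU, which is equivalent to
# the explicit Polyak rule target <- (1 - tau) * target + tau * online.
# A minimal TF1-style sketch of that rule, assuming hypothetical online/target
# variable lists:
import tensorflow as tf


def make_soft_update_ops(online_vars, target_vars, tau=0.001):
    # One assign op per target variable; run them after each training step.
    return [t.assign((1.0 - tau) * t + tau * o)
            for o, t in zip(online_vars, target_vars)]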
class NeuralAgent(): def __init__(self, track_name='practgt2.xml'): BUFFER_SIZE = 100000 TAU = 0.001 # Target Network HyperParameters LRA = 0.0001 # Learning rate for Actor LRC = 0.001 # Lerning rate for Critic state_dim = 29 # of sensors input self.batch_size = 32 self.lambda_mix = 10.0 self.action_dim = 3 # Steering/Acceleration/Brake # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) self.actor = ActorNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRA) self.critic = CriticNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRC) self.buff = ReplayBuffer(BUFFER_SIZE) # Create replay buffer self.track_name = track_name self.save = dict(total_reward=[], total_step=[], ave_reward=[], distRaced=[], distFromStart=[], lastLapTime=[], curLapTime=[], lapTimes=[], avelapTime=[], ave_sp=[], max_sp=[], min_sp=[], test_total_reward=[], test_total_step=[], test_ave_reward=[], test_distRaced=[], test_distFromStart=[], test_lastLapTime=[], test_curLapTime=[], test_lapTimes=[], test_avelapTime=[], test_ave_sp=[], test_max_sp=[], test_min_sp=[]) def rollout(self, env): max_steps = 10000 vision = False # zhichen: it is not stable to have two torcs env and UDP connections # env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name) ob = env.reset(relaunch=True) s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. sp = [] lastLapTime = [] for j_iter in range(max_steps): a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0])) a_t = a_t[0] # print('test a_t:', a_t) a_t[0] = clip(a_t[0], -1, 1) a_t[1] = clip(a_t[1], 0, 1) a_t[2] = clip(a_t[2], 0, 1) ob, r_t, done, info = env.step(a_t) sp.append(info['speed']) if lastLapTime == []: if info['lastLapTime'] > 0: lastLapTime.append(info['lastLapTime']) elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[ 'lastLapTime']: lastLapTime.append(info['lastLapTime']) if np.mod(j_iter + 1, 20) == 0: logging.info('step: ' + str(j_iter + 1)) print('\n ob: ', ob) s_t = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward += r_t if done: break logging.info("Test Episode Reward: " + str(total_reward) + " Episode Length: " + str(j_iter + 1) + " Ave Reward: " + str(total_reward / (j_iter + 1)) + "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) + "\n Last Lap Times: " + str(info['lastLapTime']) + " Cur Lap Times: " + str(info['curLapTime']) + " lastLaptime: " + str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) + " max sp: " + str(np.max(sp))) #logging.info(" Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) + # " Episode Length: " + str(j_iter+1) + " Distance" + str(ob.distRaced) + " Lap Times: " + str(ob.lastLapTime)) #env.end() # This is for shutting down TORCS ave_sp = np.mean(sp) max_sp = np.max(sp) min_sp = np.min(sp) return total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp, lastLapTime def update_neural(self, controllers, episode_count=200, tree=False, seed=1337): OU = FunctionOU() vision = False GAMMA = 0.99 EXPLORE = 100000. 
max_steps = 10000 reward = 0 done = False step = 0 epsilon = 1 if not tree: steer_prog, accel_prog, brake_prog = controllers # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name) window = 5 lambda_store = np.zeros((max_steps, 1)) lambda_max = 40. factor = 0.8 logging.info("TORCS Experiment Start with Lambda = " + str(self.lambda_mix)) for i_episode in range(episode_count): logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(self.buff.count())) if np.mod(i_episode, 3) == 0: logging.info('relaunch TORCS') ob = env.reset( relaunch=True ) # relaunch TORCS every 3 episode because of the memory leak error else: logging.info('reset TORCS') ob = env.reset() #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)] s_t = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), [0, 0, 0]] window_list = [tempObs[:] for _ in range(window)] sp = [] lastLapTime = [] for j_iter in range(max_steps): if tree: tree_obs = [ sensor for obs in tempObs[:-1] for sensor in obs ] act_tree = controllers.predict([tree_obs]) steer_action = clip_to_range(act_tree[0][0], -1, 1) accel_action = clip_to_range(act_tree[0][1], 0, 1) brake_action = clip_to_range(act_tree[0][2], 0, 1) else: steer_action = clip_to_range( steer_prog.pid_execute(window_list), -1, 1) accel_action = clip_to_range( accel_prog.pid_execute(window_list), 0, 1) brake_action = clip_to_range( brake_prog.pid_execute(window_list), 0, 1) action_prior = [steer_action, accel_action, brake_action] tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), action_prior] window_list.pop(0) window_list.append(tempObs[:]) loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, self.action_dim]) noise_t = np.zeros([1, self.action_dim]) a_t_original = self.actor.model.predict( s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = max(epsilon, 0) * OU.function( a_t_original[0][2], 0, 1.00, 0.05) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] mixed_act = [ a_t[0][k_iter] / (1 + self.lambda_mix) + (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter] for k_iter in range(3) ] ob, r_t, done, info = env.step(mixed_act) sp.append(info['speed']) if lastLapTime == []: if info['lastLapTime'] > 0: lastLapTime.append(info['lastLapTime']) elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[ 'lastLapTime']: lastLapTime.append(info['lastLapTime']) s_t1 = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) self.buff.add(s_t, a_t[0], r_t, s_t1, done) # Add replay buffer # Do the batch update batch = self.buff.getBatch(self.batch_size) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.zeros((states.shape[0], 1)) target_q_values = self.critic.target_model.predict( 
[new_states, self.actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] loss += self.critic.model.train_on_batch([states, actions], y_t) a_for_grad = self.actor.model.predict(states) grads = self.critic.gradients(states, a_for_grad) self.actor.train(states, grads) self.actor.target_train() self.critic.target_train() total_reward += r_t s_t = s_t1 # Control prior mixing term if j_iter > 0 and i_episode > 50: lambda_track = lambda_max * (1 - np.exp(-factor * np.abs( r_t + GAMMA * np.mean(target_q_values[-1] - base_q[-1])))) lambda_track = np.squeeze(lambda_track) else: lambda_track = 10. lambda_store[j_iter] = lambda_track base_q = copy.deepcopy(target_q_values) if np.mod(step, 2000) == 0: logging.info("Episode " + str(i_episode) + " Distance " + str(ob.distRaced) + " Lap Times " + str(ob.lastLapTime)) step += 1 if done: break #else: # env.end() self.lambda_mix = np.mean(lambda_store) logging.info('Episode ends! \n' + "Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) + " Episode Length: " + str(j_iter + 1) + " Ave Reward: " + str(total_reward / (j_iter + 1)) + "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) + "\n Last Lap Times: " + str(info['lastLapTime']) + " Cur Lap Times: " + str(info['curLapTime']) + " lastLaptime: " + str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) + " max sp: " + str(np.max(sp))) #logging.info(" Lambda Mix: " + str(self.lambda_mix)) self.save['total_reward'].append(total_reward) self.save['total_step'].append(j_iter + 1) self.save['ave_reward'].append(total_reward / (j_iter + 1)) self.save['distRaced'].append(info['distRaced']) self.save['distFromStart'].append(info['distFromStart']) self.save['lastLapTime'].append(info['lastLapTime']) self.save['curLapTime'].append(info['curLapTime']) self.save['lapTimes'].append(lastLapTime) if lastLapTime == []: self.save['avelapTime'].append(0) else: self.save['avelapTime'].append(np.mean(lastLapTime)) self.save['ave_sp'].append(np.mean(sp)) self.save['max_sp'].append(np.max(sp)) self.save['min_sp'].append(np.min(sp)) # test if np.mod(i_episode + 1, 10) == 0: logging.info("Start Testing!") test_total_reward, test_step, test_info, test_ave_sp, test_max_sp, test_min_sp, test_lastLapTime = self.rollout( env) self.save['test_total_reward'].append(test_total_reward) self.save['test_total_step'].append(test_step) self.save['test_ave_reward'].append(test_total_reward / test_step) self.save['test_distRaced'].append(test_info['distRaced']) self.save['test_distFromStart'].append( test_info['distFromStart']) self.save['test_lastLapTime'].append(test_info['lastLapTime']) self.save['test_curLapTime'].append(test_info['curLapTime']) self.save['test_lapTimes'].append(test_lastLapTime) if test_lastLapTime == []: self.save['test_avelapTime'].append(0) else: self.save['test_avelapTime'].append( np.mean(test_lastLapTime)) self.save['test_ave_sp'].append(test_ave_sp) self.save['test_max_sp'].append(test_max_sp) self.save['test_min_sp'].append(test_min_sp) if np.mod(i_episode + 1, 5) == 0: print("Now we save model") #os.remove("actormodel.h5") self.actor.model.save_weights("actormodel_" + str(seed) + ".h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(self.actor.model.to_json(), outfile) #os.remove("criticmodel.h5") self.critic.model.save_weights("criticmodel_" + str(seed) + ".h5", overwrite=True) with open("criticmodel.json", "w") as outfile: 
json.dump(self.critic.model.to_json(), outfile) filename = "./model/actormodel_" + str(seed) + '_' + str( i_episode + 1) + ".h5" dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) self.actor.model.save_weights(filename, overwrite=True) filename = "./model/criticmodel_" + str(seed) + '_' + str( i_episode + 1) + ".h5" dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) self.critic.model.save_weights(filename, overwrite=True) if np.mod(i_episode + 1, 10) == 0: filename = "./Fig/iprl_save_" + str(seed) dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename, 'wb') as f: pickle.dump(self.save, f) if i_episode > 1000 and all( np.array(self.save['total_reward'][-20:]) < 20): print('model degenerated. Stop at Epsisode ' + str(i_episode)) break env.end() # This is for shutting down TORCS logging.info("Neural Policy Update Finish.") return None def collect_data(self, controllers, tree=False): vision = False max_steps = 10000 step = 0 if not tree: steer_prog, accel_prog, brake_prog = controllers # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name) ob = env.reset(relaunch=True) print("S0=", ob) window = 5 lambda_store = np.zeros((max_steps, 1)) lambda_max = 40. factor = 0.8 logging.info("TORCS Collection started with Lambda = " + str(self.lambda_mix)) s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), [0, 0, 0]] window_list = [tempObs[:] for _ in range(window)] observation_list = [] actions_list = [] lastLapTime = [] sp = [] for j_iter in range(max_steps): if tree: tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs] act_tree = controllers.predict([tree_obs]) steer_action = clip_to_range(act_tree[0][0], -1, 1) accel_action = clip_to_range(act_tree[0][1], 0, 1) brake_action = clip_to_range(act_tree[0][2], 0, 1) else: steer_action = clip_to_range( steer_prog.pid_execute(window_list), -1, 1) accel_action = clip_to_range( accel_prog.pid_execute(window_list), 0, 1) brake_action = clip_to_range( brake_prog.pid_execute(window_list), 0, 1) action_prior = [steer_action, accel_action, brake_action] tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), action_prior] window_list.pop(0) window_list.append(tempObs[:]) a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0])) mixed_act = [ a_t[0][k_iter] / (1 + self.lambda_mix) + (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter] for k_iter in range(3) ] if tree: newobs = [item for sublist in tempObs[:-1] for item in sublist] observation_list.append(newobs[:]) else: observation_list.append(window_list[:]) actions_list.append(mixed_act[:]) ob, r_t, done, info = env.step(mixed_act) sp.append(info['speed']) if lastLapTime == []: if info['lastLapTime'] > 0: lastLapTime.append(info['lastLapTime']) elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[ 'lastLapTime']: lastLapTime.append(info['lastLapTime']) s_t1 = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward += r_t s_t = s_t1 #if np.mod(step, 2000) == 0: # logging.info(" Distance " + str(ob.distRaced) + " Lap Times " + 
str(ob.lastLapTime)) step += 1 if done: break logging.info("Data Collection Finished!") logging.info('Episode ends! \n' + "Episode Reward: " + str(total_reward) + " Episode Length: " + str(j_iter + 1) + " Ave Reward: " + str(total_reward / (j_iter + 1)) + "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) + "\n Last Lap Times: " + str(info['lastLapTime']) + " Cur Lap Times: " + str(info['curLapTime']) + " lastLaptime: " + str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) + " max sp: " + str(np.max(sp))) env.end() return observation_list, actions_list def label_data(self, controllers, observation_list, tree=False): if not tree: steer_prog, accel_prog, brake_prog = controllers actions_list = [] net_obs_list = [] logging.info("Data labelling started with Lambda = " + str(self.lambda_mix)) for window_list in observation_list: if tree: act_tree = controllers.predict([window_list]) steer_action = clip_to_range(act_tree[0][0], -1, 1) accel_action = clip_to_range(act_tree[0][1], 0, 1) brake_action = clip_to_range(act_tree[0][2], 0, 1) net_obs_list.append(window_list) else: steer_action = clip_to_range( steer_prog.pid_execute(window_list), -1, 1) accel_action = clip_to_range( accel_prog.pid_execute(window_list), 0, 1) brake_action = clip_to_range( brake_prog.pid_execute(window_list), 0, 1) net_obs = [sensor for obs in window_list[-1] for sensor in obs] net_obs_list.append(net_obs[:29]) action_prior = [steer_action, accel_action, brake_action] s_t = np.hstack([[net_obs[:29]]]) a_t = self.actor.model.predict(s_t.reshape(1, 29)) mixed_act = [ a_t[0][k_iter] / (1 + self.lambda_mix) + (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter] for k_iter in range(3) ] actions_list.append(mixed_act[:]) return net_obs_list, observation_list, actions_list
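# Both update_neural and collect_data above blend the actor's action with the
# programmatic controller's prior using the same weighting,
# a_mixed = a_net / (1 + lambda) + (lambda / (1 + lambda)) * a_prior.
# A pure-Python sketch of that mixing step with hypothetical inputs:
def mix_actions(a_network, a_prior, lambda_mix):
    w = 1.0 / (1.0 + lambda_mix)  # weight on the learned action
    return [w * a_n + (1.0 - w) * a_p for a_n, a_p in zip(a_network, a_prior)]


# mix_actions([0.2, 0.8, 0.0], [0.0, 1.0, 0.0], lambda_mix=10.0)
# leans heavily on the prior for large lambda_mix and on the network as it shrinks.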
class DDPG: def __init__(self, env, state_dim, action_dim): self.name = 'DDPG' self.environment = env self.time_step = 0 self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() == REPLAY_START_SIZE: print('\n---------------Start training---------------') # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % 10000 == 0 and self.time_step > 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) return self.time_step
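# The train() method above builds the standard DDPG critic target,
# y_i = r_i for terminal transitions and r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1}))
# otherwise, with mu' and Q' the target actor and critic. A NumPy sketch with
# hypothetical callables standing in for target_actions() and target_q():
import numpy as np


def ddpg_critic_targets(rewards, dones, next_states, target_actor, target_critic, gamma=0.99):
    next_actions = target_actor(next_states)                      # mu'(s')
    next_q = np.asarray(target_critic(next_states, next_actions)).reshape(-1)
    return np.where(dones, rewards, rewards + gamma * next_q).reshape(-1, 1)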
class DDPG: def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] # self.state_dim = env.observation_space.shape[0] * 2 self.action_dim = env.action_space.shape[0] self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) self.exploration_noise = OUNoise() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path)) else: my_config.logger.error("Could not find old network weights") def train(self): # my_config.logger.debug("......enter tain......") # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) noise = self.exploration_noise.noise(action) # if random.random() <= 0.5: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5]) # else: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75]) noise_action = action + noise clipped_noise_action = np.clip(noise_action, 0, 1) # if (self.time_step < 5): # my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action)) return clipped_noise_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, 
reward, next_state, done) self.time_step += 1 # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends # if done: # self.exploration_noise.reset() def saveNetwork(self): # my_config.logger.warn("time step: %s, save model" % (self.time_step)) ckpt_file = os.path.join(MODEL_PATH, 'ltr') self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
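# This agent's __init__ restores the newest checkpoint under MODEL_PATH when one
# exists, and saveNetwork() writes a new checkpoint tagged with the current time
# step. A compact TF1-style sketch of that pattern, with hypothetical
# model_dir/prefix arguments:
import os
import tensorflow as tf


def restore_latest(saver, sess, model_dir):
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        return ckpt.model_checkpoint_path
    return None  # caller decides whether missing weights are fatal


def save_checkpoint(saver, sess, model_dir, prefix, step):
    saver.save(sess, os.path.join(model_dir, prefix), global_step=step)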
class DeepQLearner(object): def __init__(self, session, optimizer, q_network, state_dim, num_actions, batch_size=32, init_exp=0.5, # initial exploration prob final_exp=0.1, # final exploration prob anneal_steps=10000, # N steps for annealing exploration replay_buffer_size=10000, store_replay_every=5, # how frequent to store experience discount_factor=0.9, # discount future rewards target_update_rate=0.01, name="DeepQLearner" ): """ Initializes the Deep Q Network. Args: session: A TensorFlow session. optimizer: A TensorFlow optimizer. q_network: A TensorFlow network that takes in a state and output the Q-values over all actions. state_dim: Dimension of states. num_actions: Number of actions. batch_size: Batch size for training with experience replay. init_exp: Initial exploration probability for eps-greedy policy. final_exp: Final exploration probability for eps-greedy policy. anneal_steps: Number of steps to anneal from init_exp to final_exp. replay_buffer_size: Size of replay buffer. store_replay_every: Frequency with which to store replay. discount_factor: For discounting future rewards. target_update_rate: For the slow update of the target network. name: Used to create a variable scope. Useful for creating multiple networks. """ self.session = session self.optimizer = optimizer self.q_network = q_network # tensorflow constructor for Q network self.state_dim = state_dim self.num_actions = num_actions self.batch_size = batch_size # initialize exploration self.exploration = init_exp self.init_exp = init_exp self.final_exp = final_exp self.anneal_steps = anneal_steps self.discount_factor = discount_factor self.target_update_rate = target_update_rate # Initialize the replay buffer. self.replay_buffer_size = replay_buffer_size self.replay_buffer = ReplayBuffer(replay_buffer_size) self.store_replay_every = store_replay_every self.experience_cnt = 0 self.name = name self.train_iteration = 0 self.constructModel() self.session.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def constructModel(self): """ Constructs the model to do Q-learning. """ # ensure that we don't have conflicts when initializing multiple models with tf.variable_scope(self.name): # this part of the model is for predicting actions using the learned Q_network. with tf.name_scope("predict_actions"): # input: vectors of states (in a batch) self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states") # use new scope to differentiate this q_network from one used for target evaluation # note that this will differentiate the weights, for example "learn_q_network/W1" with tf.variable_scope("learn_q_network"): # the current q_network that we train self.action_scores = self.q_network(self.states, self.state_dim, self.num_actions) self.predicted_actions = tf.argmax(self.action_scores, axis=1, name="predicted_actions") # this part of the model is for estimating future rewards, to be used for the Q-learning # update for estimating the target Q-value. with tf.name_scope("estimate_future_rewards"): # input: vectors of next states (in a batch) self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states") # input: binary inputs that indicate whether states are unfinished or terminal # this is important to compute the target and do the Bellman update correctly, since # it tells us whether to include the optimal Q value for the next state or not. 
self.unfinished_states_flags = tf.placeholder(tf.float32, (None,), name="unfinished_states_flags") # input: rewards from last state and action self.rewards = tf.placeholder(tf.float32, (None,), name="rewards") # use new scope to differentiate this q_network from one we are training # note that this will differentiate the weights, for example "target_q_network/W1" with tf.variable_scope("target_q_network"): # the q_network used for evaluation self.eval_q_vals = self.q_network(self.next_states, self.state_dim, self.num_actions) # note that this term is only non-zero for a state if it is non-terminal # also note the use of stop_gradient to make sure we don't train this q_network self.best_future_q_vals = tf.reduce_max(tf.stop_gradient(self.eval_q_vals), axis=1) * self.unfinished_states_flags # future rewards given by Bellman equation self.future_rewards = self.rewards + self.discount_factor * self.best_future_q_vals # this part of the model is for computing the loss and gradients with tf.name_scope("loss"): # input: one-hot vectors that give the current actions to evaluate the loss for self.action_selects = tf.placeholder(tf.float32, (None, self.num_actions), name="action_select") # get Q-values for the actions that we took self.selected_action_scores = tf.reduce_sum(self.action_scores * self.action_selects, axis=1) # temporal difference loss self.td_loss = tf.reduce_mean(tf.reduce_sum(tf.square(self.future_rewards - self.selected_action_scores))) # cross-entropy loss for adversarial example generation self.cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self.action_scores, self.action_selects)) # TODO: regularization loss # TODO: gradient clipping self.train_op = self.optimizer.minimize(self.td_loss) # this part of the model is for updating the target Q network with tf.name_scope("eval_q_network_update"): target_network_update = [] # slowly update target network parameters with Q network parameters # we do this by grabbing all the parameters in both networks and manually defining # update operations self.q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="learn_q_network") self.target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_q_network") for v_source, v_target in zip(self.q_network_variables, self.target_network_variables): # this is equivalent to target = (1-alpha) * target + alpha * source update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source)) target_network_update.append(update_op) # this groups all operations to run together # this operation will update all of the target Q network variables self.target_network_update = tf.group(*target_network_update) def store_experience(self, state, action, reward, next_state, done): """ Adds an experience to the replay buffer. """ if self.experience_cnt % self.store_replay_every == 0 or done: self.replay_buffer.add(state, action, reward, next_state, done) self.experience_cnt += 1 def greedy_policy(self, states): """ Executes the greedy policy. Useful for executing a learned agent. """ return self.session.run(self.predicted_actions, {self.states: states})[0] def e_greedy_policy(self, states): """ Executes the epsilon greedy policy. 
""" # with probability exploration, choose random action if random.random() < self.exploration: return random.randint(0, self.num_actions-1) # choose greedy action given by current Q network else: return self.greedy_policy(states) def annealExploration(self): """ Anneals the exploration probability linearly with training iteration. """ ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0) self.exploration = (self.init_exp- self.final_exp) * ratio + self.final_exp def updateModel(self): """ Update the model by sampling a batch from the replay buffer and performing Q-learning updates on the network parameters. """ # not enough experiences yet if self.replay_buffer.count() < self.batch_size: return # sample a random batch from the replay buffer batch = self.replay_buffer.getBatch(self.batch_size) # keep track of these inputs to the Q networks for the batch states = np.zeros((self.batch_size, self.state_dim)) rewards = np.zeros((self.batch_size,)) action_selects = np.zeros((self.batch_size, self.num_actions)) next_states = np.zeros((self.batch_size, self.state_dim)) unfinished_states_flags = np.zeros((self.batch_size,)) # train on the experiences in this batch for k, (s0, a, r, s1, done) in enumerate(batch): states[k] = s0 rewards[k] = r action_selects[k][a] = 1 # check terminal state if not done: next_states[k] = s1 unfinished_states_flags[k] = 1 # perform one update of training cost, _ = self.session.run([self.td_loss, self.train_op], { self.states : states, self.next_states : next_states, self.unfinished_states_flags : unfinished_states_flags, self.action_selects : action_selects, self.rewards : rewards }) # update target network using learned Q-network self.session.run(self.target_network_update) self.annealExploration() self.train_iteration += 1 # saves the trained model def saveModel(self, name): self.saver.save(self.session, name) def restoreModel(self, name): self.saver.restore(self.session, './' + name) def reset(self): # initialize exploration self.exploration = self.init_exp # Initialize the replay buffer. self.replay_buffer = ReplayBuffer(self.replay_buffer_size) self.experience_cnt = 0 self.train_iteration = 0 self.session.run(tf.global_variables_initializer())
class DDPG: """docstring for DDPG""" def __init__(self): self.name = 'DDPG' # name for uploading results # self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = 12 self.action_dim = 10 self.has_kicked = False self.laststep_haskicked = False self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) self.saver = tf.train.Saver(max_to_keep=1) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # print(minibatch) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) # print(q_value_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) with open('/home/ruizhao/Desktop/a.txt', 'a') as f: print("action_batch[0]", file=f) print(action_batch[0], file=f) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) with open('/home/ruizhao/Desktop/a.txt', 'a') as f: print("q_gradient_batch[0]", file=f) print(q_gradient_batch[0], file=f) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action2(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def noise_action(self, state): action = self.actor_network.action(state) random_action = np.zeros(10, float) random_action[random.randint(0, 3)] = 1 random_action[4] = random.uniform(-100, 100) #DASH POWER random_action[5] = random.uniform(-180, 180) #DASH DEGREES random_action[6] = random.uniform(-180, 180) #TURN DEGREES random_action[7] = random.uniform(-180, 180) #TACKLE DEGREES random_action[8] = random.uniform(0, 100) #KICK POWER random_action[9] = random.uniform(-180, 180) #KICK DEGREES if np.random.uniform() < EPSILON: return action else: return random_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if 
self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset()
class MaDDPG: def __init__(self, num_agents, state_dim, action_dim): # track training times self.time_step = 0 # use set session use GPU #self.sess = tf.InteractiveSession() self.sess = tf.Session(config=tf.ConfigProto( log_device_placement=True)) self.num_agents = num_agents self.state_dim = state_dim self.action_dim = action_dim self.agents = self.create_multi_agents(self.sess, num_agents, self.state_dim, self.action_dim) # make sure create Criticnetwork later, summarise mean Q value inside self.critic = CriticNetwork(self.sess, state_dim, action_dim) self.exploration_noise = OUNoise((self.num_agents, action_dim)) self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # for store checkpoint self.saver = tf.train.Saver() def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.zeros((BATCH_SIZE, self.num_agents, self.state_dim)) action_batch = np.zeros((BATCH_SIZE, self.num_agents, self.action_dim)) reward_batch = np.zeros((BATCH_SIZE, self.num_agents)) next_state_batch = np.zeros( (BATCH_SIZE, self.num_agents, self.state_dim)) done_batch = np.zeros((BATCH_SIZE)) for ii in range(BATCH_SIZE): state_batch[ii, :, :] = minibatch[ii][0] action_batch[ii, :, :] = minibatch[ii][1] reward_batch[ii, :] = minibatch[ii][2] next_state_batch[ii, :, :] = minibatch[ii][3] done_batch[ii] = minibatch[ii][4] # calculate Gt batch next_action_batch = self.target_actions(next_state_batch) q_value_batch = self.critic.target_q(next_state_batch, next_action_batch) gt = np.zeros((BATCH_SIZE, self.num_agents)) for ii in range(BATCH_SIZE): if done_batch[ii]: gt[ii, :] = reward_batch[ii, :] else: gt[ii, :] = reward_batch[ii, :] + GAMMA * q_value_batch[ii, :] #update critic by minimizing the loss self.critic.train(gt, state_batch, action_batch) # update policy using the sampling gradients actions_for_grad = self.actions(state_batch) q_gradients_batch = self.critic.gradients(state_batch, actions_for_grad) self.train_agents(q_gradients_batch, state_batch) # update critic target network self.critic.update_target() # update actor target self.update_agents_target() def summary(self, record_num): if self.replay_buffer.count() > SUMMARY_BATCH_SIZE: mini_batch = self.replay_buffer.popn(SUMMARY_BATCH_SIZE) state_batch = np.zeros( (SUMMARY_BATCH_SIZE, self.num_agents, self.state_dim)) for ii in range(SUMMARY_BATCH_SIZE): state_batch[ii, :, :] = mini_batch[ii][0] actions_for_summary = self.actions(state_batch) self.critic.write_summaries(state_batch, actions_for_summary, record_num) def update_agents_target(self): for agent in self.agents: agent.update_target() def train_agents(self, gradients_batch, state_batch): # gradients_batch = [batchsize* agents* action_dim] # state_batch = [batchsize* agents * state_dim ] for ii in range(self.num_agents): grad = gradients_batch[:, ii, :] state = state_batch[:, ii, :] self.agents[ii].train(grad, state) def create_multi_agents(self, sess, num_agents, state_dim, action_dim): agents = [] nets = None for ii in range(num_agents): agent_name = 'agent' + str(ii) agents.append( ActorNetwork(sess, state_dim, action_dim, agent_name, nets)) nets = agents[-1].nets return agents def add_agents(self, add_num): for ii in range(add_num): #self.num_agents+=1 agent_name = 'agent' + str(self.num_agents) self.agents.append( ActorNetwork(self.sess, self.state_dim, self.action_dim, agent_name, self.agents[-1].nets)) # the agents' name is from 0-num_agents-1 self.num_agents += 1 # if add a new agent then reset the noise and replay buffer self.exploration_noise = 
OUNoise((self.num_agents, self.action_dim)) #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.replay_buffer.erase() # re-create a saver # the new saver will contains all the savable variables. # otherwise only contains the initially created agents self.saver = tf.train.Saver() # reset the time step # self.time_step = 0 def action( self, state ): # here is action, for one state on agent, not batch_sized actions # state = [num_agents * state_dim] # actions = [num_agents * action_dim] action = np.zeros((self.num_agents, self.action_dim)) for ii in range(self.num_agents): action[ii, :] = self.agents[ii].action(state[ii, :]) return action def actions(self, state_batch): #state = batch_size*numOfagents*state_dim #actions = batch_size*numOfagents*action_dim batch_size = state_batch.shape[0] actions = np.zeros((batch_size, self.num_agents, self.action_dim)) for ii in range(self.num_agents): actions[:, ii, :] = self.agents[ii].actions(state_batch[:, ii, :]) return actions def target_actions(self, state_batch): # the state size is batch_size* num_agents * state_dimension actions = np.zeros( (state_batch.shape[0], self.num_agents, self.action_dim)) for ii in range(self.num_agents): actions[:, ii, :] = self.agents[ii].target_actions(state_batch[:, ii, :]) return actions def noise_action(self, state): action = self.action(state) # clip the action, action \in [-1,+1] return np.clip(action + self.exploration_noise.noise(), -1, 1) def close_session(self): self.sess.close() def perceive(self, state, action, reward, next_state, done): # store {st,at,Rt+1,st+1} self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % SAVE_STEPS == 0: self.save_network() # if self.time_step % 10000 == 0: # self.actor_network.save_network(self.time_step) # self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset() def load_network(self): checkpoint = tf.train.get_checkpoint_state("saved_network") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print('Could not find old network weights') def save_network(self): # do not processing under Dropbox # exit drop box then run print('save network...', self.time_step) self.saver.save(self.sess, 'saved_network/' + 'network', global_step=self.time_step)
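# MaDDPG's actions() and target_actions() fan a batched state tensor out to the
# individual agents: each agent sees its own slice of the state, and the results
# are stacked back into (batch, num_agents, action_dim). A NumPy sketch with a
# hypothetical list of per-agent policy callables:
import numpy as np


def batched_agent_actions(state_batch, agent_policies, action_dim):
    batch_size, num_agents = state_batch.shape[0], len(agent_policies)
    actions = np.zeros((batch_size, num_agents, action_dim))
    for ii, policy in enumerate(agent_policies):
        actions[:, ii, :] = policy(state_batch[:, ii, :])  # one actor per agent
    return actions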
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
def run_ddpg(amodel, cmodel, train_indicator=0, seeded=1337, track_name='practgt2.xml'): OU = FunctionOU() BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 # Target Network HyperParameters LRA = 0.0001 # Learning rate for Actor LRC = 0.001 # Lerning rate for Critic ALPHA = 0.9 action_dim = 3 # Steering/Acceleration/Brake state_dim = 29 # of sensors input np.random.seed(seeded) vision = False EXPLORE = 100000. if train_indicator: episode_count = 600 else: episode_count = 3 max_steps = 20000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) # Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=track_name) if not train_indicator: # Now load the weight #logging.info("Now we load the weight") print("Now we load the weight") try: actor.model.load_weights(amodel) critic.model.load_weights(cmodel) actor.target_model.load_weights(amodel) critic.target_model.load_weights(cmodel) #logging.info(" Weight load successfully") print("Weight load successfully") except: #ogging.info("Cannot find the weight") print("Cannot find the weight") exit() #logging.info("TORCS Experiment Start.") print("TORCS Experiment Start.") best_lap = 500 for i_episode in range(episode_count): print("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count())) #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count())) if np.mod(i_episode, 3) == 0: ob = env.reset( relaunch=True ) # relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. 
for j_iter in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][2], -0.1, 1.00, 0.05) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] ob, r_t, done, info = env.step(a_t[0]) s_t1 = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) buff.add(s_t, a_t[0], r_t, s_t1, done) # Add replay buffer # Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if train_indicator: loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 print("Episode", i_episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) if np.mod(step, 1000) == 0: logging.info("Episode {}, Distance {}, Last Lap {}".format( i_episode, ob.distRaced, ob.lastLapTime)) if ob.lastLapTime > 0: if best_lap < ob.lastLapTime: best_lap = ob.lastLapTime step += 1 if done: break if train_indicator and i_episode > 20: if np.mod(i_episode, 3) == 0: logging.info("Now we save model") actor.model.save_weights("ddpg_actor_weights_periodic.h5", overwrite=True) critic.model.save_weights("ddpg_critic_weights_periodic.h5", overwrite=True) print("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("Best Lap {}".format(best_lap)) print("") logging.info("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " + str(total_reward)) logging.info("Best Lap {}".format(best_lap)) env.end() # This is for shutting down TORCS logging.info("Finish.")
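# run_ddpg scales every OU noise component by train_indicator * max(epsilon, 0),
# so exploration is disabled at test time (train_indicator == 0) and decays
# linearly during training as epsilon drops by 1.0 / EXPLORE per step.
# A standalone sketch with a hypothetical ou_noise callable in place of OU.function:
def noisy_action_component(a_raw, epsilon, train_indicator, ou_noise):
    scale = train_indicator * max(epsilon, 0.0)  # 0 at test time, annealed during training
    return a_raw + scale * ou_noise(a_raw)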
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'
        self.environment = env
        self.episode = 0
        self.epsilon = 0.98
        self.one_number = 1
        self.mean = []
        self.state_dim = len(obs2state(env.reset().observation))
        self.action_dim = env.action_spec().shape[0]

        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the TD loss
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled policy gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Return the policy action perturbed by exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise()
        return action + noise

    def action(self, state):
        if np.random.rand() <= self.epsilon:
            act = self.noise_action(state)
            z = np.array(act)
        else:
            act = self.actor_network.action(state)
            z = np.array(act)
        self.mean.append(z[0])
        g = np.tanh(z)
        return g

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        if self.epsilon > 0.1:
            self.epsilon *= 0.99999
        if done:
            self.exploration_noise.reset()
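# Hedged usage sketch for the agent above, assuming a dm_control-style `env`
# (reset()/step() return timesteps, action_spec() gives the action shape) and the
# obs2state helper referenced in __init__; everything else here is illustrative.
def run_episode_sketch(agent, env, max_steps=1000):
    timestep = env.reset()
    state = obs2state(timestep.observation)
    total_reward = 0.0
    for _ in range(max_steps):
        action = agent.action(state)              # tanh-squashed, possibly noisy action
        timestep = env.step(action)
        next_state = obs2state(timestep.observation)
        reward = timestep.reward if timestep.reward is not None else 0.0
        agent.perceive(state, action, reward, next_state, timestep.last())
        total_reward += reward
        state = next_state
        if timestep.last():
            break
    return total_reward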
class DDPG:
    """DDPG agent (MXNet implementation)."""

    def __init__(self, env):
        mx.random.seed(seed)
        np.random.seed(seed)
        self.env = env
        if flg_gpu:
            self.ctx = mx.gpu(0)
        else:
            self.ctx = mx.cpu()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.ddpgnet = DDPGNet(self.state_dim, self.action_dim)
        self.exploration_noise = OUNoise(self.action_dim)
        self.replay_buffer = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.ddpgnet.init()
        self.train_step = 0

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # needed when action_dim = 1
        action_batch = np.resize(action_batch, [self.batch_size, self.action_dim])

        # Calculate y_batch
        next_qvals = self.ddpgnet.get_target_q(next_state_batch).asnumpy()
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * next_qvals[i][0])
        y_batch = np.resize(y_batch, [self.batch_size, 1])

        # Update critic by minimizing the loss L
        self.ddpgnet.update_critic(state_batch, action_batch, y_batch)

        # Update actor by maximizing Q
        self.ddpgnet.update_actor(state_batch)

        self.train_step += 1

        # Update target networks
        self.ddpgnet.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer reaches the replay start size
        if self.replay_buffer.count() > memory_start_size:
            self.train()
        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
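# DDPGNet.update_target() above is assumed to perform the usual soft ("Polyak")
# update of the target parameters; a minimal numpy sketch with a hypothetical tau:
def soft_update_sketch(target_params, source_params, tau=0.001):
    # target <- (1 - tau) * target + tau * source, applied parameter by parameter
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]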
class DDPG(object):
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]

        # Randomly initialize actor network and critic network
        # together with their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                    self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1])
            self.epsilon_random -= (self.epsilon_random_range[0] -
                                    self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1])
            logger.warn("Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                        % (path, self.time_step, self.epsilon_expert, self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0

    def train(self):
        self.time_step = self.time_step + 1
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1])
        logger.debug("step: %d, epsilon_expert: %s, epsilon_random: %s"
                     % (self.time_step, self.epsilon_expert, self.epsilon_random))

        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # needed when action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
            # if done_batch[i]:
            #     y_batch.append(reward_batch[i])
            # else:
            #     y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_cost = self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    # def noise_action(self, state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = self.exploration_noise.noise(action)
    #     noise_action = action + noise
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    # def noise_action(self, state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = np.zeros(self.action_dim)
    #     noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10)
    #     noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
    #     noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10)
    #     noise_action = action + noise
    #     logger.debug("action: %s, noise: %s" % (action, noise))
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        return action

    def opposite_action(self, state):
        logger.debug("state: %s" % (state))
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        action[0] = 1 - action[0]
        logger.debug("opposite action: %s" % (action))
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        # self.time_step = self.time_step + 1

        # Start training once the buffer reaches the replay start size
        if self.replay_buffer.count() >= REPLAY_START_SIZE:
            # logger.debug("train...")
            self.train()
        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
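# Both epsilon_expert and epsilon_random above follow the same linear schedule:
# after `step` training steps the value has moved step/EXPLORE_COUNT of the way
# from its start to its end, clamped at the end value. A standalone restatement:
def annealed_epsilon(start, end, step, explore_count):
    return max(end, start - (start - end) * step / explore_count)

# e.g. annealed_epsilon(1.0, 0.1, 50000, 100000) == 0.55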
class DeepDeterministicPolicyGradient(object):
    def __init__(self,
                 session,
                 optimizer,
                 actor_network,
                 critic_network,
                 state_dim,
                 action_dim,
                 batch_size=32,
                 replay_buffer_size=1000000,  # size of replay buffer
                 store_replay_every=1,        # how frequently to store experience
                 discount_factor=0.99,        # discount future rewards
                 target_update_rate=0.01,
                 reg_param=0.01,              # regularization constants
                 max_gradient=5,              # max gradient norms
                 noise_sigma=0.20,
                 noise_theta=0.15,
                 summary_writer=None,
                 summary_every=100):

        # tensorflow machinery
        self.session = session
        self.optimizer = optimizer
        self.summary_writer = summary_writer

        # model components
        self.actor_network = actor_network
        self.critic_network = critic_network
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # training parameters
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate
        self.max_gradient = max_gradient
        self.reg_param = reg_param

        # Ornstein-Uhlenbeck noise for exploration
        self.noise_var = tf.Variable(tf.zeros([1, action_dim]))
        noise_random = tf.random_normal([1, action_dim], stddev=noise_sigma)
        self.noise = self.noise_var.assign_sub((noise_theta) * self.noise_var - noise_random)

        # counters
        self.store_replay_every = store_replay_every
        self.store_experience_cnt = 0
        self.train_iteration = 0

        # create and initialize variables
        self.create_variables()
        var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
        self.session.run(tf.initialize_variables(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        self.summary_every = summary_every
        if self.summary_writer is not None:
            # graph was not available when journalist was created
            self.summary_writer.add_graph(self.session.graph)

    def create_variables(self):

        with tf.name_scope("model_inputs"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
            # action input used by the critic network
            self.action = tf.placeholder(tf.float32, (None, self.action_dim), name="action")

        # define outputs from the actor and the critic
        with tf.name_scope("predict_actions"):
            # initialize actor-critic network
            with tf.variable_scope("actor_network"):
                self.policy_outputs = self.actor_network(self.states)
            with tf.variable_scope("critic_network"):
                self.value_outputs = self.critic_network(self.states, self.action)
                self.action_gradients = tf.gradients(self.value_outputs, self.action)[0]

            # predict actions from the policy network
            self.predicted_actions = tf.identity(self.policy_outputs, name="predicted_actions")
            tf.histogram_summary("predicted_actions", self.predicted_actions)
            tf.histogram_summary("action_scores", self.value_outputs)

        # get variable lists
        actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                    scope="actor_network")
        critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                     scope="critic_network")

        # estimate rewards using the next state: r + argmax_a Q'(s_{t+1}, u'(a))
        with tf.name_scope("estimate_future_rewards"):
            self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
            self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")
            self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

            # initialize target networks
            with tf.variable_scope("target_actor_network"):
                self.target_actor_outputs = self.actor_network(self.next_states)
            with tf.variable_scope("target_critic_network"):
                self.target_critic_outputs = self.critic_network(self.next_states,
                                                                 self.target_actor_outputs)
            # compute future rewards
            self.next_action_scores = tf.stop_gradient(self.target_critic_outputs)[:, 0] * self.next_state_mask
            tf.histogram_summary("next_action_scores", self.next_action_scores)
            self.future_rewards = self.rewards + self.discount_factor * self.next_action_scores

        # compute loss and gradients
        with tf.name_scope("compute_pg_gradients"):
            # compute gradients for the critic network
            self.temp_diff = self.value_outputs[:, 0] - self.future_rewards
            self.mean_square_loss = tf.reduce_mean(tf.square(self.temp_diff))
            self.critic_reg_loss = tf.reduce_sum([tf.reduce_sum(tf.square(x))
                                                  for x in critic_network_variables])
            self.critic_loss = self.mean_square_loss + self.reg_param * self.critic_reg_loss
            self.critic_gradients = self.optimizer.compute_gradients(self.critic_loss,
                                                                     critic_network_variables)

            # compute actor gradients (we don't do weight decay for the actor network)
            self.q_action_grad = tf.placeholder(tf.float32, (None, self.action_dim),
                                                name="q_action_grad")
            actor_policy_gradients = tf.gradients(self.policy_outputs,
                                                  actor_network_variables,
                                                  -self.q_action_grad)
            # materialize as a list so it can be concatenated and indexed below
            self.actor_gradients = list(zip(actor_policy_gradients, actor_network_variables))

            # collect all gradients
            self.gradients = self.actor_gradients + self.critic_gradients

            # clip gradients by norm
            for i, (grad, var) in enumerate(self.gradients):
                if grad is not None:
                    self.gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)

            # summarize gradients
            for grad, var in self.gradients:
                tf.histogram_summary(var.name, var)
                if grad is not None:
                    tf.histogram_summary(var.name + '/gradients', grad)

            # emit summaries
            tf.scalar_summary("critic_loss", self.critic_loss)
            tf.scalar_summary("critic_td_loss", self.mean_square_loss)
            tf.scalar_summary("critic_reg_loss", self.critic_reg_loss)

            # apply gradients to update the actor and critic networks
            self.train_op = self.optimizer.apply_gradients(self.gradients)

        # update target networks from the online networks
        with tf.name_scope("update_target_network"):
            self.target_network_update = []

            # slowly update target network parameters with the actor network parameters
            actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                        scope="actor_network")
            target_actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                               scope="target_actor_network")
            for v_source, v_target in zip(actor_network_variables, target_actor_network_variables):
                # this is equivalent to target = (1 - alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)

            # same for the critic network
            critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                         scope="critic_network")
            target_critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                                scope="target_critic_network")
            for v_source, v_target in zip(critic_network_variables, target_critic_network_variables):
                # this is equivalent to target = (1 - alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)

            # group all assignment operations together
            self.target_network_update = tf.group(*self.target_network_update)

        self.summarize = tf.merge_all_summaries()
        self.no_op = tf.no_op()

    def sampleAction(self, states, exploration=True):
        policy_outs, ou_noise = self.session.run(
            [self.policy_outputs, self.noise],
            {self.states: states})
        # add OU noise for exploration
        policy_outs = policy_outs + ou_noise if exploration else policy_outs
        return policy_outs

    def updateModel(self):
        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        batch = self.replay_buffer.getBatch(self.batch_size)
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size,))
        actions = np.zeros((self.batch_size, self.action_dim))
        next_states = np.zeros((self.batch_size, self.state_dim))
        next_state_mask = np.zeros((self.batch_size,))

        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            actions[k] = a
            if not done:
                next_states[k] = s1
                next_state_mask[k] = 1

        # whether to calculate summaries
        calculate_summaries = (self.train_iteration % self.summary_every == 0
                               and self.summary_writer is not None)

        # compute a = u(s)
        policy_outs = self.session.run(self.policy_outputs, {self.states: states})

        # compute d_a Q(s,a) where s = s_i, a = u(s)
        action_grads = self.session.run(self.action_gradients,
                                        {self.states: states, self.action: policy_outs})

        critic_loss, _, summary_str = self.session.run(
            [self.critic_loss,
             self.train_op,
             self.summarize if calculate_summaries else self.no_op],
            {self.states: states,
             self.next_states: next_states,
             self.next_state_mask: next_state_mask,
             self.action: actions,
             self.rewards: rewards,
             self.q_action_grad: action_grads})

        # update the target networks
        self.session.run(self.target_network_update)

        # emit summaries
        if calculate_summaries:
            self.summary_writer.add_summary(summary_str, self.train_iteration)

        self.train_iteration += 1

    def storeExperience(self, state, action, reward, next_state, done):
        # always store end states
        if self.store_experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.store_experience_cnt += 1
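# The masked TD target assembled in create_variables()/updateModel() above reduces,
# in numpy terms, to the expression below: terminal transitions keep a mask of 0,
# so only the immediate reward survives. Assumes numpy is imported as np.
def td_targets_sketch(rewards, next_q_values, next_state_mask, discount_factor=0.99):
    return rewards + discount_factor * next_state_mask * next_q_values

# e.g. td_targets_sketch(np.array([1.0, 0.5]), np.array([2.0, 3.0]), np.array([1.0, 0.0]))
# -> array([2.98, 0.5])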
class Worker:
    """DDPG worker whose local actor/critic are synchronized from global networks."""

    def __init__(self, sess, number, model_path, global_episodes, explore, decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # together with their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph('global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph('global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # needed when action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess, next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(self.sess, state_batch,
                                                         action_batch_for_gradients)
        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        # if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess, self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy plus exploration noise
        # that gradually vanishes as `decay` goes to zero
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        action = self.actor_network.action(self.sess, state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer reaches the replay start size
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()
        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                if np.random.rand() < 0.9:  # stochastically apply noise
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                s = process_frame(state)
                print("episode:", episode_count)

                # Train
                for step in range(self.env.spec.timestep_limit):
                    state = process_frame(state)
                    if noisy:
                        # decay the noise; no noise once episode_count exceeds self.explore
                        action = np.clip(self.noise_action(state, np.maximum(self.decay, 0)),
                                         0.0, 1.0)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state, done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print("episode reward:", episode_reward)

                # Testing
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in range(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in range(self.env.spec.timestep_limit):
                            action = self.action(process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            if done:
                                break
                            reward_per_step += (reward - reward_per_step) / (j + 1)
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)
                    print('episode:', episode_count,
                          'Evaluation Average Return:', ave_return,
                          'Evaluation Average Reward:', ave_reward)

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

        # All done; stop the trial and confirm exit
        print('Done ' + self.name)
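# The test loop above accumulates reward_per_step += (reward - reward_per_step) / (j + 1),
# which is the standard incremental-mean update; a standalone sketch of that identity:
def incremental_mean_sketch(values):
    mean = 0.0
    for n, v in enumerate(values, start=1):
        mean += (v - mean) / n
    return mean

# incremental_mean_sketch([1.0, 2.0, 3.0]) == 2.0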
class DDPG:
    """DDPG agent that writes the actor settings to a results file."""

    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # together with their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # needed when action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer reaches the replay start size
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
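# The OUNoise used for exploration by the DDPG agents in this file is assumed to be
# the usual discretized Ornstein-Uhlenbeck process; a sketch with common (not
# confirmed) defaults, assuming numpy is imported as np:
class _OUNoiseSketch:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim, self.mu, self.theta, self.sigma = action_dim, mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # mean-reverting step plus Gaussian diffusion, one value per action dimension
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state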
class DDPG:
    """DDPG agent for multipath-TCP congestion control."""

    def __init__(self, a_dim, s_dim):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # together with their target networks
        self.state_dim = s_dim
        self.action_dim = a_dim
        self.time_step = 0
        self.max_bw = 0.0
        self.max_cwnd = 0.0
        self.min_rtt = 9999999.0

        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def learn(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # needed when action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        self.time_step += 1
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise()
        return action + noise

    def choose_action(self, state):
        self.time_step += 1
        action = self.actor_network.action(state)
        return action

    def store_transition(self, s, a, r, s_, done, episode_count):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(s, a, r, s_, done)

        # Start training once the buffer reaches the replay start size;
        # every 100th episode, save the networks instead of training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            if (episode_count + 1) % 100 != 0:
                self.learn()
            else:
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def extract_observation(self, dataRecorder, subflow_index, state_before):
        value_dic = dataRecorder.get_latest_data()
        state_after = state_before.reshape(10, 5)
        observation = np.zeros((5))
        t_cWnd = [0, 0]
        t_thr = [0, 0]
        t_rtt = [0, 0]
        t_loss_rate = [0, 0]
        t_unAck = [0, 0]
        s0 = [0, 0, 0, 0, 0]
        state = np.zeros(1)
        for i in range(value_dic["nbOfSubflows"]):
            name = "cWnd" + str(i)
            t_cWnd[i] = value_dic[name]
            name = "rtt" + str(i)
            t_rtt[i] = value_dic[name]
            name = "unAck" + str(i)
            t_unAck[i] = value_dic[name]
            name = "loss_rate" + str(i)
            t_loss_rate[i] = value_dic[name]
            name = "throughput" + str(i)
            t_thr[i] = value_dic[name]

        thr = t_thr[subflow_index]
        s0[0] = t_thr[subflow_index]
        rtt = t_rtt[subflow_index]
        s0[1] = t_rtt[subflow_index]
        cwnd = t_cWnd[subflow_index]
        s0[2] = t_cWnd[subflow_index]
        loss_rate = t_loss_rate[subflow_index]
        s0[3] = t_loss_rate[subflow_index]
        unAck = t_unAck[subflow_index]
        s0[4] = t_unAck[subflow_index]

        s0 = np.array(s0)
        min_ = s0 - s0
        thr_n = s0[0]
        thr_n_min = s0[0] - min_[0]
        rtt_min = s0[1] - min_[1]
        cwnd_n_min = s0[2] - min_[2]
        loss_rate_n_min = s0[3] - min_[3]
        unAck_n_min = s0[4] - min_[4]

        # track the running extremes used for normalization
        if self.max_bw < thr_n_min:
            self.max_bw = thr_n_min
        if self.max_cwnd < cwnd_n_min:
            self.max_cwnd = cwnd_n_min
        if self.min_rtt > rtt_min:
            self.min_rtt = rtt_min

        # reward: favor throughput, penalize RTT inflation and loss
        reward = thr_n_min - 5 * (rtt_min - self.min_rtt) - 10 * loss_rate_n_min
        print("reward:" + str(reward) + " thr_n_min:" + str(thr_n_min) +
              " rtt_min:" + str(rtt_min) + " self.min_rtt:" + str(self.min_rtt) +
              " delta_rtt:" + str(rtt_min - self.min_rtt))

        if self.max_bw != 0:
            state[0] = thr_n_min / self.max_bw
            state = np.append(state, [5 * loss_rate_n_min])
            state = np.append(state, [unAck_n_min])
        else:
            state[0] = 0
            state = np.append(state, [0])
            state = np.append(state, [0])
        state = np.append(state, [1400 / cwnd])
        state = np.append(state, [self.min_rtt / rtt_min])

        # slide the observation window: drop the oldest row, append the new features
        state_after = np.delete(state_after, [0], axis=0)
        state_after = np.append(state_after, state)
        return state_after, reward, thr_n_min, rtt_min
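# Standalone restatement of the reward used in extract_observation() above:
# throughput is rewarded while RTT inflation over the observed minimum and the loss
# rate are penalized with the same weights as in the method.
def mptcp_reward_sketch(throughput, rtt, min_rtt, loss_rate):
    return throughput - 5.0 * (rtt - min_rtt) - 10.0 * loss_rate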