def __init__(self, global_step):
    device = "/cpu:0"  # use CPU for display tool
    network_scope = TASK_TYPE
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    global_network = DRLNetwork(action_size=4,
                                device=device,
                                network_scope=network_scope,
                                scene_scopes=scene_scopes)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()

    # Read network from checkpoint file
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")

    # Read checkpoint directly, 'meta' file
    # saver.restore(sess, checkpoint.model_checkpoint_path)

    scene_stats = dict()
    self.results = dict()
    for scene_scope in scene_scopes:
        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:
            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': int(task_scope)
            })

            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]

            for i_episode in range(NUM_EVAL_EPISODES):
                env.reset()
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0

                while not terminal:
                    pi_values = global_network.run_policy(
                        sess, env.s_t, env.target, scopes)
                    pi_values = np.array(pi_values) / np.sum(pi_values)
                    action = np.random.choice(np.arange(len(pi_values)), p=pi_values)
                    env.step(action)

                    terminal = env.terminal
                    if ep_t == 10000:
                        break
                    if env.collided:
                        ep_collision += 1
                    ep_reward += env.reward
                    ep_t += 1

                ep_lengths.append(ep_t)
                ep_rewards.append(ep_reward)
                ep_collisions.append(ep_collision)
                if VERBOSE:
                    print("episode #{} ends after {} steps".format(i_episode, ep_t))

            print('evaluation: %s %s' % (scene_scope, task_scope))
            print('mean episode reward: %.2f' % np.mean(ep_rewards))
            print('mean episode length: %.2f' % np.mean(ep_lengths))
            print('mean episode collision: %.2f' % np.mean(ep_collisions))

            scene_stats[scene_scope].extend(ep_lengths)

    print('\nResults (average trajectory length):')
    for scene_scope in scene_stats:
        self.results[scene_scope] = np.mean(scene_stats[scene_scope])
        print('%s: %.2f steps' % (scene_scope, self.results[scene_scope]))

    with open("./Evaluation/result_%d.txt" % global_step, 'wb') as fp:
        pickle.dump(self.results, fp)
class A3C_Thread(object):

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, network_scope="network", scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index  # Number the thread
        self._set_local_network(device, network_scope, scene_scope, task_scope)  # Set local network
        self.sync = self.local_network.sync_from(global_network)  # Synchronize from the global network
        self.learning_rate_input = learning_rate_input  # Set learning rate
        self.max_global_time_step = max_global_time_step  # Set maximum global time step
        self._set_trainer_optimizer(device, global_network, grad_applier)  # Set trainer
        self._set_environment(initial_learning_rate)  # Set environment

    # Create local network
    def _set_local_network(self, device, network_scope, scene_scope, task_scope):
        self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                        device=device,
                                        network_scope=network_scope,
                                        scene_scopes=[scene_scope])
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Set the Actor-Critic gradient accumulator and optimizer
    # (uses the accumulating trainer from Zhu et al.)
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]
        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    # Set environment and per-episode bookkeeping
    def _set_environment(self, initial_learning_rate):
        self.episode_max_q = -np.inf
        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0

    # Choose one action according to the pi values
    def choose_action(self, pi_values):
        action = np.random.choice(np.arange(len(pi_values)), p=pi_values)
        return action

    # Take LOCAL_T_MAX steps in one process
    # and update the accumulated gradients
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        start_local_t = self.local_t

        # Initialization
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        terminal_end = False

        # Reset accumulated gradient variables
        sess.run(self.reset_gradients)

        # Obtain shared parameters from global network
        sess.run(self.sync)

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            pi_ = np.array(pi_) / np.sum(pi_)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            # reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q = %.3f\n" %
                    (self.thread_index, global_t, self.thread_index,
                     self.scene_scope, self.task_scope,
                     self.scene_scope, self.task_scope, self.episode_reward,
                     self.scene_scope, self.task_scope, self.episode_length,
                     self.scene_scope, self.task_scope, self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values, targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" %
                             (self.thread_index, self.scene_scope, self.local_t))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate
class Train(object):

    def __init__(self):
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

        # self.evluation_gap = 10**6
        print(MAX_TIME_STEP)
        self.device = "/gpu:0" if USE_GPU else "/cpu:0"
        self.network_scope = TASK_TYPE
        self.list_of_tasks = TASK_LIST
        self.scene_scopes = self.list_of_tasks.keys()
        self.global_t = 0
        self.stop_requested = False

        self.initial_learning_rate = self.log_uniform(LR_ALPHA_LOW,
                                                      LR_ALPHA_HIGH,
                                                      LR_ALPHA_LOG_RATE)

        self.global_network = DRLNetwork(action_size=ACTION_SIZE,
                                         device=self.device,
                                         network_scope=self.network_scope,
                                         scene_scopes=self.scene_scopes)

        self.branches = []
        for scene in self.scene_scopes:
            for task in self.list_of_tasks[scene]:
                self.branches.append((scene, task))

        self.NUM_TASKS = len(self.branches)
        assert NUM_THREADS >= self.NUM_TASKS, \
            "Not enough threads for multitasking: at least {} threads needed.".format(self.NUM_TASKS)

        self.learning_rate_input = tf.placeholder("float")
        self.grad_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=RMSP_ALPHA,
            momentum=0.0,
            epsilon=RMSP_EPSILON,
            clip_norm=GRAD_NORM_CLIP,
            device=self.device)

        # instantiate each training thread
        # each thread is training for one target in one scene
        self.training_threads = []
        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            training_thread = ADQN_Thread(i,
                                          self.global_network,
                                          self.initial_learning_rate,
                                          self.learning_rate_input,
                                          self.grad_applier,
                                          MAX_TIME_STEP,
                                          device=self.device,
                                          network_scope="thread-%d" % (i + 1),
                                          scene_scope=scene,
                                          task_scope=task)
            self.training_threads.append(training_thread)

    def log_uniform(self, lo, hi, rate):
        log_lo = np.log(lo)
        log_hi = np.log(hi)
        v = log_lo * (1 - rate) + log_hi * rate
        return np.exp(v)

    def train(self):
        # prepare session
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        init = tf.global_variables_initializer()
        self.sess.run(init)

        # create tensorboard summaries
        self.create_summary()
        self.summary_writer = tf.summary.FileWriter(LOG_FILE, self.sess.graph)

        # init or load checkpoint with saver
        # if you don't need to be able to resume training, use the next line instead.
        # it will result in a much smaller checkpoint file.
        self.saver = tf.train.Saver(max_to_keep=10,
                                    var_list=self.global_network.get_vars())
        # saver = tf.train.Saver(max_to_keep=10)

        self.checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
        if self.checkpoint and self.checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, self.checkpoint.model_checkpoint_path)
            print("checkpoint loaded: {}".format(
                self.checkpoint.model_checkpoint_path))
            tokens = self.checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: {}".format(self.global_t))
        else:
            print("Could not find old checkpoint")

        train_threads = []
        for i in range(NUM_THREADS):
            train_threads.append(
                threading.Thread(target=self.train_function, args=(i, )))

        signal.signal(signal.SIGINT, self.signal_handler)

        # start each training thread
        for t in train_threads:
            t.start()

        print('Press Ctrl+C to stop.')
        signal.pause()

        # wait for all threads to finish
        for t in train_threads:
            t.join()

        print('Now saving data. Please wait.')
        self.saver.save(self.sess,
                        CHECKPOINT_DIR + '/' + 'checkpoint',
                        global_step=self.global_t)
        self.summary_writer.close()

    def create_summary(self):
        self.summary_op = dict()
        self.summary_placeholders = dict()

        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            key = scene + "-" + task

            # summary for tensorboard
            episode_reward_input = tf.placeholder("float")
            episode_length_input = tf.placeholder("float")
            # episode_max_q_input = tf.placeholder("float")

            scalar_summaries = [
                tf.summary.scalar(key + "/Episode Reward", episode_reward_input),
                tf.summary.scalar(key + "/Episode Length", episode_length_input)
                # tf.summary.scalar(key + "/Episode Max Q", episode_max_q_input)
            ]

            self.summary_op[key] = tf.summary.merge(scalar_summaries)
            self.summary_placeholders[key] = {
                "episode_reward_input": episode_reward_input,
                "episode_length_input": episode_length_input,
                # "episode_max_q_input": episode_max_q_input,
                "learning_rate_input": self.learning_rate_input
            }

    def train_function(self, parallel_index):
        training_thread = self.training_threads[parallel_index]
        last_global_t = 0

        scene, task = self.branches[parallel_index % self.NUM_TASKS]
        key = scene + "-" + task

        while self.global_t < MAX_TIME_STEP and not self.stop_requested:
            diff_global_t = training_thread.process(
                self.sess, self.global_t, self.summary_writer,
                self.summary_op[key], self.summary_placeholders[key])
            self.global_t += diff_global_t

            # periodically save checkpoints to disk
            if parallel_index == 0 and self.global_t - last_global_t > 1000000:
                print('Save checkpoint at timestamp %d' % self.global_t)
                self.saver.save(self.sess,
                                CHECKPOINT_DIR + '/' + 'checkpoint',
                                global_step=self.global_t)
                last_global_t = self.global_t

    def signal_handler(self, signal, frame):
        print('You pressed Ctrl+C!')
        self.stop_requested = True
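# A minimal, hypothetical entry-point sketch (not part of the original listing).
# It assumes this module defines the Train class above and that constants such
# as CHECKPOINT_DIR and MAX_TIME_STEP come from the project's constants module.
if __name__ == '__main__':
    trainer = Train()   # builds the global network and one thread per (scene, target) branch
    trainer.train()     # runs until Ctrl+C, then saves a final checkpoint and closes the summary writer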
scene_scopes = list_of_tasks.keys()

global_t = 0
stop_requested = False

if not os.path.exists(CHECKPOINT_DIR):
    os.mkdir(CHECKPOINT_DIR)

# Initialize learning rate (log-uniform between LR_ALPHA_LOW and LR_ALPHA_HIGH)
log_lo = np.log(LR_ALPHA_LOW)
log_hi = np.log(LR_ALPHA_HIGH)
v = log_lo * (1 - LR_ALPHA_LOG_RATE) + log_hi * LR_ALPHA_LOG_RATE
initial_learning_rate = np.exp(v)

# Create global network
global_network = DRLNetwork(action_size=ACTION_SIZE,
                            device=device,
                            network_scope=network_scope,
                            scene_scopes=scene_scopes)

# Initialize scene-task specific branches
branches = []
for scene in scene_scopes:
    for task in list_of_tasks[scene]:
        branches.append((scene, task))

NUM_TASKS = len(branches)
assert NUM_THREADS >= NUM_TASKS, \
    "Not enough threads for multitasking: at least {} threads needed.".format(NUM_TASKS)

# Create gradient applier (arguments after learning_rate completed from the
# identical call in Train.__init__, as the original line was truncated here)
learning_rate_input = tf.placeholder("float")
grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)
class ADQN_Thread(object):

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, network_scope="network", scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index  # Number the thread
        self._set_local_network(device, network_scope, scene_scope, task_scope)  # Set local network
        self.sync = self.local_network.sync_from(global_network)  # Synchronize from the global network
        self.learning_rate_input = learning_rate_input  # Set learning rate
        self.max_global_time_step = max_global_time_step  # Set maximum global time step
        self._set_trainer_optimizer(device, global_network, grad_applier)  # Set trainer
        self._set_environment(initial_learning_rate)  # Set environment

        self.memory_size = MEMORY_SIZE  # memory size for replay buffer
        # initialize zero memory; each row stores [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, 2048 * 4 * 2 + 2))
        self.replace_target_iter = DQN_REPLACE_TARGET_ITER
        self.batch_size = DQN_BATCH_SIZE
        self.gamma = REWARD_DECAY

    # Create local network
    def _set_local_network(self, device, network_scope, scene_scope, task_scope):
        self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                        device=device,
                                        network_scope=network_scope,
                                        scene_scopes=[scene_scope])
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Set the gradient accumulator and optimizer
    # (uses the accumulating trainer from Zhu et al.)
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        accum_grad_names = [self._local_var_name(x)
                            for x in self.trainer.get_accum_grad_list()]
        global_net_vars = [x for x in global_network.get_vars()
                           if self._get_accum_grad_name(x) in accum_grad_names]
        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    # Set environment and per-episode bookkeeping
    def _set_environment(self, initial_learning_rate):
        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0

    # epsilon-greedy action selection
    def choose_action(self, actions_value):
        if np.random.uniform() < EPSILON:
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, ACTION_SIZE)
        return action

    # Take LOCAL_T_MAX steps in one process
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        start_local_t = self.local_t

        # Reset accumulated gradient variables
        sess.run(self.reset_gradients)

        # Obtain shared parameters from global network
        sess.run(self.sync)

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            old_s_t = self.env.s_t
            actions_value = self.local_network.run_DQN(sess, self.env.s_t,
                                                       self.env.target,
                                                       self.scopes)
            action = self.choose_action(actions_value)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Q = {0} action = {1}\n".format(actions_value, action))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            # reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1

            """
            print("Local t: {0:d}".format(self.local_t))
            print("Reward: {0:f}".format(reward))
            print("Episode reward: {0:f}".format(self.episode_reward))
            print("Episode length: {0:d}".format(self.episode_length))
            """

            self.local_t += 1

            # store transition to replay buffer
            self.store_transition(old_s_t, action, reward, self.env.s_t)

            if terminal:
                sys.stdout.write(
                    "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s \n" %
                    (self.thread_index, global_t, self.thread_index,
                     self.scene_scope, self.task_scope,
                     self.scene_scope, self.task_scope, self.episode_reward,
                     self.scene_scope, self.task_scope, self.episode_length,
                     self.scene_scope, self.task_scope))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.env.reset()
                break

        # update target network
        if self.local_t % self.replace_target_iter == 0:
            sess.run(self.local_network.replace_target_op)
            # print('\ntarget_params_replaced\n')

        # sample a batch of transitions from the replay buffer
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        batch_memory_s_ = np.reshape(batch_memory[:, -2048 * 4:], (-1, 2048, 4))
        batch_memory_s = np.reshape(batch_memory[:, :2048 * 4], (-1, 2048, 4))
        batch_memory_t = np.reshape(
            np.tile(self.env.target, [self.batch_size, 1]), (-1, 2048, 4))

        q_next, q_eval = sess.run(
            [self.local_network.q_next, self.local_network.q_eval],
            feed_dict={
                self.local_network.s_: batch_memory_s_,  # fixed params
                self.local_network.s: batch_memory_s,    # newest params
                self.local_network.t: batch_memory_t
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, 2048 * 4].astype(int)
        reward = batch_memory[:, 2048 * 4 + 1]

        key_eval = self.network_scope + '/' + self.scene_scope + '/eval'
        if terminal:
            q_target[key_eval][batch_index, eval_act_index] = reward
        else:
            key_target = self.network_scope + '/' + self.scene_scope + '/target'
            q_target[key_eval][batch_index, eval_act_index] = \
                reward + self.gamma * np.max(q_next[key_target], axis=1)

        # train eval network
        for idx in batch_index:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: [batch_memory_s[idx]],
                         self.local_network.t: [batch_memory_t[idx]],
                         self.local_network.q_target: [q_target[key_eval][idx]]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # update global network
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" %
                             (self.thread_index, self.scene_scope, self.local_t))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((np.reshape(s, -1), [a, r], np.reshape(s_, -1)))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
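# Illustrative sketch (not part of the original listing) of the replay-buffer
# row layout that process() and store_transition() assume: each row packs
# [flattened s (2048*4) | a | r | flattened s_ (2048*4)], hence the
# 2048*4*2 + 2 columns allocated in __init__. The names below are hypothetical.
import numpy as np

FEAT = 2048 * 4                      # flattened state size (2048 features x 4 frames)
row = np.zeros(FEAT * 2 + 2)

s = np.random.rand(2048, 4)          # current state
s_next = np.random.rand(2048, 4)     # next state
a, r = 3, -0.01                      # action index and reward

row[:FEAT] = s.reshape(-1)           # columns [0, FEAT)     -> s
row[FEAT] = a                        # column  FEAT          -> action
row[FEAT + 1] = r                    # column  FEAT + 1      -> reward
row[-FEAT:] = s_next.reshape(-1)     # last FEAT columns     -> s_

# unpacking mirrors the slicing used when sampling a batch in process()
s_rec = row[:FEAT].reshape(2048, 4)
a_rec = int(row[FEAT])
r_rec = row[FEAT + 1]
s_next_rec = row[-FEAT:].reshape(2048, 4)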