class A3CTrainingThread(object):
    """Actor-learner worker that trains a shared network from its own rollouts.

    Every ``process`` call copies the global weights into the local network,
    plays at most ``LOCAL_T_MAX`` steps of the game, and runs the
    ``apply_gradients`` op once on the collected batch.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        """Create the local network, its gradient/sync ops and the game state.

        Args:
            thread_index: index of this worker; also scales the game seed.
            global_network: network whose variables receive the gradients.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the rate reaches 0.
            device: device string handed to the network and ``tf.device``.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.initial_learning_rate = initial_learning_rate

        # Both network flavours take the same constructor arguments here.
        network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
        self.local_network = network_cls(ACTION_SIZE, thread_index, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(
                self.local_network.total_loss,
                var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        # Gradients come from the local loss but are applied to the global
        # network's variables.
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)
        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.episode_reward = 0
        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        remaining = self.max_global_time_step - global_time_step
        rate = self.initial_learning_rate * remaining / self.max_global_time_step
        return 0.0 if rate < 0.0 else rate

    def choose_action(self, pi_values):
        """Sample an action index with probabilities ``pi_values``."""
        return np.random.choice(len(pi_values), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        """Store the reference time used by the performance log."""
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """Run one rollout and apply its gradients.

        Returns:
            Number of local steps advanced during this call.
        """
        network = self.local_network
        game = self.game_state

        states, actions, rewards, values = [], [], [], []
        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = network.lstm_state_out

        # NOTE(review): the reward counter restarts on every rollout, so the
        # "score" printed/recorded below covers only this call -- confirm
        # that is intended.
        self.episode_reward = 0

        # t_max times loop
        for _ in range(LOCAL_T_MAX):
            pi_, value_ = network.run_policy_and_value(sess, game.s_t)
            action = self.choose_action(pi_)

            states.append(game.s_t)
            actions.append(action)
            values.append(value_)

            if self.thread_index == 0 and self.local_t % LOG_INTERVAL == 0:
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # process game and receive its result
            game.process(action)
            reward = game.reward
            terminal = game.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            game.update()

            if not terminal:
                continue

            terminal_end = True
            print("score={}".format(self.episode_reward))
            game.reset()
            if USE_LSTM:
                network.reset_state()
            break

        # NOTE(review): placed after the loop (once per call); the flattened
        # source does not show the nesting -- confirm.
        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)

        # Bootstrap from the value estimate unless the episode ended.
        R = network.run_value(sess, game.s_t) if not terminal_end else 0.0

        batch_si, batch_a, batch_td, batch_R = [], [], [], []

        # compute and accumulate gradients, walking the rollout backwards
        for ai, ri, si, Vi in zip(reversed(actions), reversed(rewards),
                                  reversed(states), reversed(values)):
            R = ri + GAMMA * R
            one_hot = np.zeros([ACTION_SIZE])
            one_hot[ai] = 1

            batch_si.append(si)
            batch_a.append(one_hot)
            batch_td.append(R - Vi)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            # The LSTM path is fed in forward time order.
            for batch in (batch_si, batch_a, batch_td, batch_R):
                batch.reverse()

        feed_dict = {
            network.s: batch_si,
            network.a: batch_a,
            network.td: batch_td,
            network.r: batch_R,
            self.learning_rate_input: cur_learning_rate,
        }
        if USE_LSTM:
            feed_dict[network.initial_lstm_state] = start_lstm_state
            feed_dict[network.step_size] = [len(batch_a)]

        sess.run(self.apply_gradients, feed_dict=feed_dict)

        steps_since_log = self.local_t - self.prev_local_t
        if self.thread_index == 0 and steps_since_log >= PERFORMANCE_LOG_INTERVAL:
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        return self.local_t - start_local_t
class A3CTrainingThread(object):
    """Actor-learner worker that accumulates gradients with ``AccumTrainer``.

    Every ``process`` call resets the accumulated gradients, copies the
    global weights into the local network, plays at most ``LOCAL_T_MAX``
    steps, accumulates the gradients of that batch and applies them to the
    global network's variables.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        """Create the local network, trainer ops and the game state.

        Args:
            thread_index: index of this worker; also scales the game seed.
            global_network: network whose variables receive the gradients.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the rate reaches 0.
            device: device string handed to the network and the trainer.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step
        ) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index proportionally to ``pi_values``.

        A uniform draw scaled by the total weight is compared against the
        running cumulative sums, so ``pi_values`` need not sum to exactly 1.
        Returns ``len(pi_values) - 1`` if no cumulative sum reaches the draw
        (which is -1 for an empty input).
        """
        cumulative = []
        total = 0.0
        for rate in pi_values:
            total = total + rate
            cumulative.append(total)

        threshold = random.random() * total
        for index, upper in enumerate(cumulative):
            if upper >= threshold:
                return index
        # fail safe
        return len(cumulative) - 1

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """Run one rollout, accumulate its gradients and apply them.

        Returns:
            Number of local steps advanced during this call.
        """
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print("pi=", pi_)
                print(" V=", value_)

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score=", self.episode_reward)

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Bootstrap from the value estimate unless the episode ended.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients, walking the rollout backwards
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            # The LSTM path is fed in forward time order.
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]
                     })
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
    """Options-driven actor-learner worker with episode history replay.

    Every ``process`` call resets the accumulated gradients, syncs the local
    network from the global one, plays up to ``options.local_t_max`` steps
    and, unless training is skipped, accumulates and applies gradients.

    NOTE(review): this class was recovered from whitespace-flattened source.
    Block nesting was reconstructed from the statement order and from which
    attributes each branch needs; spots that are ambiguous are marked below.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, options):
        """Create the local network, trainer ops, game state and counters.

        Args:
            thread_index: index of this worker.
            global_network: network whose variables receive the gradients.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the rate reaches 0.
            device: device string handed to the network and the trainer.
            options: configuration object read throughout this class.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.options = options

        if options.use_lstm:
            self.local_network = GameACLSTMNetwork(options.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(options.action_size, device)

        self.local_network.prepare_loss(options.entropy_beta)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(random.randint(0, 2**16), options,
                                    thread_index=thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # Prefix that visually separates the log lines of different threads.
        self.indent = " |" * self.thread_index
        self.steps = 0
        self.no_reward_steps = 0
        # Thread 0 never terminates an episode on a lost life.
        self.terminate_on_lives_lost = options.terminate_on_lives_lost and (
            self.thread_index != 0)

        # NOTE(review): if everything down to max_history really sits inside
        # this `if`, process() raises AttributeError on self.tes /
        # self.initial_lives when train_episode_steps <= 0 -- confirm the
        # original nesting.
        if self.options.train_episode_steps > 0:
            self.max_reward = 0.0
            self.max_episode_reward = 0.0
            self.episode_states = []
            self.episode_actions = []
            self.episode_rewards = []
            self.episode_values = []
            self.episode_liveses = []
            self.episode_scores = Episode_scores(options)
            self.tes = self.options.train_episode_steps
            if self.options.tes_list is not None:
                self.tes = self.options.tes_list[thread_index]
                print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
            self.initial_lives = self.game_state.initial_lives
            self.max_history = int(
                self.tes * self.options.tes_extend_ratio * 2.1)

        if self.options.record_new_record_dir is not None:
            # Only thread 0 creates the directory; all threads keep screens.
            if self.thread_index == 0:
                if not os.path.exists(self.options.record_new_record_dir):
                    os.makedirs(self.options.record_new_record_dir)
            self.episode_screens = []

        if self.options.record_new_room_dir is not None:
            if self.thread_index == 0:
                if not os.path.exists(self.options.record_new_room_dir):
                    os.makedirs(self.options.record_new_room_dir)
            self.episode_screens = []

        self.greediness = options.greediness
        self.repeat_action_ratio = options.repeat_action_ratio
        self.prev_action = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step
        ) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values, global_t):
        """Pick the next action index and remember it in ``prev_action``.

        With a uniform draw ``r``:
          * ``r < greediness``: a uniformly random action;
          * ``r < repeat_action_ratio``: the previous action again;
          * otherwise: the most frequent outcome of
            ``options.num_experiments`` multinomial draws from the policy,
            flattened towards uniform after a long run of zero-reward steps.

        Args:
            pi_values: policy probabilities; the array is not modified.
            global_t: global step, used only for logging.

        Returns:
            The chosen action index.
        """
        # Add greediness for broader exploration
        r = random.random()
        if r < self.greediness:
            # int(r * len(pi_values)) could only reach the lowest indices
            # because r < greediness here; draw over all actions instead.
            action = random.randrange(len(pi_values))
        elif r < self.repeat_action_ratio:
            action = self.prev_action
        else:
            # Work on a copy: the caller logs the policy after this call.
            probs = np.array(pi_values)

            # Increase randomness of choice if no reward term is too long
            if self.no_reward_steps > self.options.no_reward_steps:
                randomness = (self.no_reward_steps -
                              self.options.no_reward_steps
                              ) * self.options.randomness
                probs += randomness
                probs /= sum(probs)
                if self.local_t % self.options.randomness_log_interval == 0:
                    elapsed_time = time.time() - self.start_time
                    print("t={:6.0f},s={:9d},th={}:{}randomness={:.8f}".format(
                        elapsed_time, global_t, self.thread_index,
                        self.indent, randomness))

            # Shave one float32 epsneg off each entry so the total cannot
            # exceed 1; floor at 0 because multinomial rejects negative
            # probabilities.
            probs -= np.finfo(np.float32).epsneg
            np.maximum(probs, 0.0, out=probs)
            action_samples = np.random.multinomial(
                self.options.num_experiments, probs)
            action = action_samples.argmax(0)

        self.prev_action = action
        return action

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def set_start_time(self, start_time):
        """Store the reference time used by the log lines."""
        self.start_time = start_time

    #@profile
    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """Run one rollout and, unless training is skipped, apply gradients.

        Returns:
            ``(steps, terminal_end)``. ``steps`` is the number of local steps
            advanced, or 0 on the "don't train" path; ``terminal_end`` tells
            whether the rollout ended the episode.
        """
        states = []
        actions = []
        rewards = []
        values = []
        # Lives before the rollout, then one entry per step (appended before
        # the game processes that step's action).
        liveses = [self.game_state.lives]
        if self.tes > 0:
            if self.episode_liveses == []:
                self.episode_liveses.append(self.game_state.lives)

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if self.options.use_lstm:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(self.options.local_t_max):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_, global_t)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)
            liveses.append(self.game_state.lives)

            if (self.thread_index == 0) and (
                    self.local_t % self.options.log_interval == 0):
                print("pi={} (thread{})".format(pi_, self.thread_index))
                print(" V={} (thread{})".format(value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            if reward > 0 and \
               (self.options.rom == "montezuma_revenge.bin" or
                    self.options.gym_env == "MontezumaRevenge-v0"):
                elapsed_time = time.time() - self.start_time
                print("t={:6.0f},s={:4.0f},th={}:{}r={:3.0f}RM{:02d}| NEW-SCORE".format(
                    elapsed_time, global_t, self.thread_index, self.indent,
                    self.episode_reward, self.game_state.room_no))

            # pseudo-count reward
            if self.options.psc_use:
                reward += self.game_state.psc_reward

            # add basic income after some no reward steps
            if self.no_reward_steps > self.options.no_reward_steps:
                reward += self.options.basic_income

            # clip reward
            if self.options.reward_clip > 0.0:
                reward = np.clip(reward, -self.options.reward_clip,
                                 self.options.reward_clip)
            rewards.append(reward)

            # collect episode log
            if self.tes > 0:
                self.episode_states.append(self.game_state.s_t)
                self.episode_actions.append(action)
                self.episode_rewards.append(reward)
                self.episode_values.append(value_)
                self.episode_liveses.append(self.game_state.lives)
                # Trim the history once it reaches twice the kept length;
                # liveses keeps one extra leading entry.
                if len(self.episode_states) > self.max_history * 2:
                    self.episode_states = self.episode_states[-self.max_history:]
                    self.episode_actions = self.episode_actions[-self.max_history:]
                    self.episode_rewards = self.episode_rewards[-self.max_history:]
                    self.episode_values = self.episode_values[-self.max_history:]
                    self.episode_liveses = self.episode_liveses[-self.max_history-1:]
                # requirement for OpenAI Gym: --clear-history-on-death=False
                if self.options.clear_history_on_death and (
                        liveses[-2] > liveses[-1]):
                    self.episode_states = []
                    self.episode_actions = []
                    self.episode_rewards = []
                    self.episode_values = []
                    self.episode_liveses = self.episode_liveses[-2:]

            self.local_t += 1

            if self.options.record_new_record_dir is not None \
               or self.options.record_new_room_dir is not None:
                screen = self.game_state.uncropped_screen
                if self.options.compress_frame:
                    screen = lzma.compress(screen.tobytes(), preset=0)
                self.episode_screens.append(screen)

            # terminate if the play time is too long
            self.steps += 1
            if self.steps > self.options.max_play_steps:
                terminal = True

            # requirement for OpenAI Gym: --terminate-on-lives-lost=False
            # terminate if lives lost
            if self.terminate_on_lives_lost and (liveses[-2] > liveses[-1]):
                terminal = True

            # count no reward steps
            if self.game_state.reward == 0.0:
                self.no_reward_steps += 1
            else:
                self.no_reward_steps = 0

            # s_t1 -> s_t
            self.game_state.update()

            if self.local_t % self.options.score_log_interval == 0:
                elapsed_time = time.time() - self.start_time
                print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
                    elapsed_time, global_t, self.thread_index, self.indent,
                    self.episode_reward, self.game_state.room_no,
                    self.game_state.lives, value_, self.game_state.psc_reward))

            # if self.game_state.room_no != self.game_state.prev_room_no:
            #     elapsed_time = time.time() - self.start_time
            #     print("t={:6.0f},s={:9d},th={}:{}RM{:02d}>RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
            #         elapsed_time, global_t, self.thread_index, self.indent,
            #         self.game_state.prev_room_no, self.game_state.room_no,
            #         self.game_state.lives, value_, self.game_state.psc_reward))

            if self.tes > 0:
                if self.game_state.lives < self.episode_liveses[-2]:
                    elapsed_time = time.time() - self.start_time
                    print("t={:6.0f},s={:9d},th={}:{}l={:.0f}>{:.0f}RM{:02d}|".format(
                        elapsed_time, global_t, self.thread_index, self.indent,
                        self.episode_liveses[-2], self.game_state.lives,
                        self.game_state.room_no))

            # separate steps after getting reward
            # NOTE(review): assumed to be outside the `if self.tes > 0`
            # block above -- confirm.
            if self.game_state.reward > 0:
                if not terminal:
                    break

            if terminal:
                terminal_end = True
                elapsed_time = time.time() - self.start_time
                end_mark = "end" if self.terminate_on_lives_lost else "END"
                print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}@{}|".format(
                    elapsed_time, global_t, self.thread_index, self.indent,
                    self.episode_reward, end_mark))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                if self.tes > 0:
                    if self.options.record_new_room_dir is not None \
                       and self.game_state.new_room >= 0:
                        dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(
                            global_t, self.thread_index,
                            self.episode_reward, self.game_state.new_room)
                        dirname = os.path.join(
                            self.options.record_new_room_dir, dirname)
                        os.makedirs(dirname)
                        for index, screen in enumerate(self.episode_screens):
                            filename = "{:06d}.png".format(index)
                            filename = os.path.join(dirname, filename)
                            screen_image = screen
                            if self.options.compress_frame:
                                screen_image = np.frombuffer(
                                    lzma.decompress(screen),
                                    dtype=np.uint8).reshape((210, 160))
                            cv2.imwrite(filename, screen_image)
                        print("@@@ New Room record screens saved to {}".format(dirname))

                    if self.episode_reward > self.max_episode_reward:
                        if self.options.record_new_record_dir is not None:
                            dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(
                                global_t, self.thread_index,
                                self.episode_reward, self.game_state.room_no)
                            dirname = os.path.join(
                                self.options.record_new_record_dir, dirname)
                            os.makedirs(dirname)
                            for index, screen in enumerate(self.episode_screens):
                                filename = "{:06d}.png".format(index)
                                filename = os.path.join(dirname, filename)
                                screen_image = screen
                                if self.options.compress_frame:
                                    screen_image = np.frombuffer(
                                        lzma.decompress(screen),
                                        dtype=np.uint8).reshape((210, 160))
                                cv2.imwrite(filename, screen_image)
                            print("@@@ New Record screens saved to {}".format(dirname))
                        self.max_episode_reward = self.episode_reward
                        # With this option every non-zero episode counts as
                        # a new record.
                        if self.options.record_all_non0_record:
                            self.max_episode_reward = 0

                    # NOTE(review): the resets below are assumed to belong
                    # to the `if self.tes > 0` block (episode_scores only
                    # exists in that mode) -- confirm.
                    self.max_reward = 0.0
                    self.episode_states = []
                    self.episode_actions = []
                    self.episode_rewards = []
                    self.episode_values = []
                    self.episode_liveses = []
                    self.episode_scores.add(self.episode_reward, global_t,
                                            self.thread_index)
                    if self.options.record_new_record_dir is not None \
                       or self.options.record_new_room_dir is not None:
                        self.episode_screens = []

                self.episode_reward = 0
                self.steps = 0
                self.no_reward_steps = 0
                self.game_state.reset()
                if self.options.use_lstm:
                    self.local_network.reset_state()
                break

        if self.thread_index == 0 and \
           self.local_t % self.options.performance_log_interval < self.options.local_t_max:
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec,
                steps_per_sec * 3600 / 1000000.))

        if self.options.gym_eval:
            diff_local_t = self.local_t - start_local_t
            return diff_local_t, terminal_end

        # don't train if following condition
        # requirement for OpenAI Gym: --terminate-on-lives-lost=False
        if self.options.terminate_on_lives_lost and (self.thread_index == 0) \
           and (not self.options.train_in_eval):
            return 0, terminal_end
        else:
            if self.tes > 0:
                _ = self.episode_scores.is_highscore(self.episode_reward)
                if self.episode_reward > self.max_reward:
                    self.max_reward = self.episode_reward
                    if True:
                        # New best reward: train on the tail of the episode
                        # history instead of this rollout only.
                        tes = self.tes
                        # requirement for OpenAI Gym: --test-extend=False
                        if self.options.tes_extend and self.initial_lives != 0:
                            tes *= self.options.tes_extend_ratio * (
                                self.game_state.lives / self.initial_lives)
                            if self.game_state.lives == self.initial_lives:
                                tes *= 2
                            tes = int(tes)
                        tes = min(tes, len(self.episode_states))
                        print("[OHL]SCORE={:3.0f},s={:9d},th={},lives={},steps={},tes={},RM{:02d}".format(
                            self.episode_reward, global_t, self.thread_index,
                            self.game_state.lives, self.steps, tes,
                            self.game_state.room_no))
                        if tes == 0:
                            states = []
                            actions = []
                            rewards = []
                            values = []
                            liveses = self.episode_liveses[-1:]
                        else:
                            states = self.episode_states[-tes:]
                            actions = self.episode_actions[-tes:]
                            rewards = self.episode_rewards[-tes:]
                            values = self.episode_values[-tes:]
                            liveses = self.episode_liveses[-tes-1:]
                        if self.options.clear_history_after_ohl:
                            self.episode_states = []
                            self.episode_actions = []
                            self.episode_rewards = []
                            self.episode_values = []
                            self.episode_liveses = self.episode_liveses[-2:]

            # NOTE(review): everything down to apply_gradients is assumed to
            # be skipped when there are no states -- confirm.
            if len(states) > 0:
                # Bootstrap from the value estimate unless the episode ended.
                R = 0.0
                if not terminal_end:
                    R = self.local_network.run_value(sess, self.game_state.s_t)

                actions.reverse()
                states.reverse()
                rewards.reverse()
                values.reverse()

                batch_si = []
                batch_a = []
                batch_td = []
                batch_R = []

                lives = liveses.pop()
                # compute and accumulate gradients
                for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
                    # Consider the number of lives
                    if (not self.options.use_gym) and self.initial_lives != 0.0 \
                       and not self.terminate_on_lives_lost:
                        prev_lives = liveses.pop()
                        if prev_lives > lives:
                            # A life was lost at this step: scale the return
                            # and substitute the configured reward.
                            weight = self.options.lives_lost_weight
                            rratio = self.options.lives_lost_rratio
                            R *= rratio * ((1.0 - weight) +
                                           weight * (lives / prev_lives))
                            ri = self.options.lives_lost_reward
                        lives = prev_lives

                    R = ri + self.options.gamma * R
                    td = R - Vi
                    a = np.zeros([self.options.action_size])
                    a[ai] = 1

                    batch_si.append(si)
                    batch_a.append(a)
                    batch_td.append(td)
                    batch_R.append(R)

                if self.options.use_lstm:
                    # The LSTM path is fed in forward time order.
                    batch_si.reverse()
                    batch_a.reverse()
                    batch_td.reverse()
                    batch_R.reverse()

                    sess.run(self.accum_gradients,
                             feed_dict={
                                 self.local_network.s: batch_si,
                                 self.local_network.a: batch_a,
                                 self.local_network.td: batch_td,
                                 self.local_network.r: batch_R,
                                 self.local_network.initial_lstm_state: start_lstm_state,
                                 self.local_network.step_size: [len(batch_a)]})
                else:
                    sess.run(self.accum_gradients,
                             feed_dict={
                                 self.local_network.s: batch_si,
                                 self.local_network.a: batch_a,
                                 self.local_network.td: batch_td,
                                 self.local_network.r: batch_R})

                cur_learning_rate = self._anneal_learning_rate(global_t)

                sess.run(self.apply_gradients,
                         feed_dict={self.learning_rate_input: cur_learning_rate})

            # return advanced local step size
            diff_local_t = self.local_t - start_local_t
            return diff_local_t, terminal_end
class A3CTrainingThread(object):
    """Actor-learner worker that accumulates gradients with ``AccumTrainer``.

    Every ``process`` call resets the accumulated gradients, copies the
    global weights into the local network, plays at most ``LOCAL_T_MAX``
    steps, accumulates the gradients of that batch and applies them to the
    global network's variables.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        """Create the local network, trainer ops and the game state.

        Args:
            thread_index: index of this worker; also scales the game seed.
            global_network: network whose variables receive the gradients.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the rate reaches 0.
            device: device string handed to the network and the trainer.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0.

        The rate used to be guarded by ``assert learning_rate > 0``, which
        aborted the worker as soon as ``global_time_step`` reached
        ``max_global_time_step`` and vanished entirely under ``python -O``.
        A non-positive rate is now clamped to 0.0 instead.
        """
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
            self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """Run one rollout, accumulate its gradients and apply them.

        Returns:
            Number of local steps advanced during this call.
        """
        states = []
        actions = []
        rewards = []
        values = []

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        start_local_t = self.local_t
        terminal_end = False
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            # NOTE(review): this is a bare function call, not a method of
            # this class; it has to be defined at module level -- confirm.
            action = choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # Debug output for progress
            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print(('local_t = {:10} pi = ' + '{:7.5f} ' * len(pi_) +
                       ' V = {:8.4f} (thread {})').format(
                           self.local_t, *pi_, value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            # TODO: Does this make sense?
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score=", self.episode_reward)

                self._record_score(
                    sess, summary_writer, summary_op, score_input,
                    self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Compute and accumulate gradients.
        # Bootstrap from the value estimate unless the episode ended.
        R = 0.0 if terminal_end else self.local_network.run_value(
            sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # Per-step training inputs: state, one-hot action, R - V, and R.
        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            # The LSTM path is fed in forward time order.
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
    """Actor-learner worker that accumulates gradients with ``AccumTrainer``.

    Every ``process`` call resets the accumulated gradients, copies the
    global weights into the local network, plays at most ``LOCAL_T_MAX``
    steps, accumulates the gradients of that batch and applies them to the
    global network's variables.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        """Create the local network, trainer ops and the game state.

        Args:
            thread_index: index of this worker; also scales the game seed.
            global_network: network whose variables receive the gradients.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the rate reaches 0.
            device: device string handed to the network and the trainer.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                                   device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step
        ) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index proportionally to ``pi_values``.

        A uniform draw scaled by the total weight is compared against the
        running cumulative sums, so ``pi_values`` need not sum to exactly 1.
        Returns ``len(pi_values) - 1`` if no cumulative sum reaches the draw
        (which is -1 for an empty input).
        """
        cumulative = []
        total = 0.0
        for rate in pi_values:
            total = total + rate
            cumulative.append(total)

        threshold = random.random() * total
        for index, upper in enumerate(cumulative):
            if upper >= threshold:
                return index
        # fail safe
        return len(cumulative) - 1

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """Run one rollout, accumulate its gradients and apply them.

        Log lines use single-argument ``print(...)`` calls so they parse and
        print the same text on both Python 2 and Python 3.

        Returns:
            Number of local steps advanced during this call.
        """
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print("pi= {}".format(pi_))
                print(" V= {}".format(value_))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score= {}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Bootstrap from the value estimate unless the episode ended.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients, walking the rollout backwards
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            # The LSTM path is fed in forward time order.
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP {}".format(self.local_t))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
    """Worker that rolls out a local actor-critic network on a ``Game``.

    On every ``process`` call the worker copies the global network's weights
    into its own ``GameACFFNetwork``, plays up to ``LOCAL_T_MAX`` steps and
    applies the resulting gradients to the global network's variables through
    an RMSProp optimizer.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, max_global_time_step, device):
        """Build the local network, its gradient/apply/sync ops and the game.

        Args:
            thread_index: Index of this worker; forwarded to the local network.
            global_network: Shared network whose variables receive the updates.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Tensor that is fed the annealed learning rate.
            max_global_time_step: Global step at which the rate reaches zero.
            device: Device string for the local network and gradient ops.
        """
        # Plain bookkeeping state.
        self.thread_index = thread_index
        self.initial_learning_rate = initial_learning_rate
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.learn_rate = 0  # last annealed rate, printed at episode end
        self.local_t = 0
        self.prev_local_t = 0
        self.episode_reward = 0

        # Local copy of the model and its loss.
        network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
        network.prepare_loss(ENTROPY_BETA)
        self.local_network = network

        with tf.device(device):
            local_var_refs = [var._ref() for var in network.get_vars()]
            self.gradients = tf.gradients(
                network.total_loss,
                local_var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        # Local gradients are paired with the *global* variables, so the
        # optimizer step updates the shared network.
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate_input)
        grads_and_global_vars = zip(self.gradients,
                                    global_network.get_vars())
        self.apply_gradients = optimizer.apply_gradients(grads_and_global_vars)

        self.sync = network.sync_from(global_network)
        self.game_state = Game()

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        remaining_steps = self.max_global_time_step - global_time_step
        learning_rate = (self.initial_learning_rate * remaining_steps
                         / self.max_global_time_step)
        return 0.0 if learning_rate < 0.0 else learning_rate

    def choose_action(self, pi_values):
        """Sample an action index using ``pi_values`` as probabilities."""
        action_indices = range(len(pi_values))
        return np.random.choice(action_indices, p=pi_values)

    def process(self, sess, global_t):
        """Run one rollout and apply its gradients to the global network.

        Args:
            sess: Session used to run the sync, inference and update ops.
            global_t: Global step count, used to anneal the learning rate.

        Returns:
            A ``(steps, reward)`` tuple: the number of local steps taken in
            this call and the episode reward as of the last step played
            (captured before the reset that follows a terminal step).
        """
        states, actions, rewards, values = [], [], [], []
        temp_reward = 0
        terminal_end = False

        # Start the rollout from the current global weights.
        sess.run(self.sync)
        start_local_t = self.local_t

        for _ in range(LOCAL_T_MAX):
            observation = self.game_state.s_t
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, observation)
            action = self.choose_action(pi_)

            states.append(observation)
            actions.append(action)
            values.append(value_)

            # The game is driven with a one-hot list of length 3.
            one_hot_action = [0, 0, 0]
            one_hot_action[action] = 1
            self.game_state.process(one_hot_action)

            # Read back the result of the step.
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            temp_reward = self.episode_reward

            # The raw reward is stored; no clipping is applied here.
            rewards.append(reward)
            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if not terminal:
                continue

            terminal_end = True
            print("score={}".format(self.episode_reward))
            print("process:", self.thread_index,
                  " learn_rate:", self.learn_rate)
            self.episode_reward = 0
            self.game_state.reset()
            break

        # Bootstrap from the critic unless the rollout ended the episode.
        if terminal_end:
            R = 0.0
        else:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        batch_si, batch_a, batch_td, batch_R = [], [], [], []

        # Walk the rollout backwards, accumulating the discounted return.
        # The batches are fed in this reversed (latest step first) order.
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(values))
        for ai, ri, si, Vi in backwards:
            R = ri + GAMMA * R
            one_hot = np.zeros([ACTION_SIZE])
            one_hot[ai] = 1

            batch_si.append(si)
            batch_a.append(one_hot)
            batch_td.append(R - Vi)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)
        self.learn_rate = cur_learning_rate

        feed = {
            self.local_network.s: batch_si,
            self.local_network.a: batch_a,
            self.local_network.td: batch_td,
            self.local_network.r: batch_R,
            self.learning_rate_input: cur_learning_rate,
        }
        sess.run(self.apply_gradients, feed_dict=feed)

        return self.local_t - start_local_t, temp_reward
class A3CTrainingThread(object):
    """A3C worker whose hyper-parameters are passed in at construction time.

    The worker owns a local network (LSTM or feed-forward, chosen by
    ``agent_type``) and a ``GameState``.  Each ``process`` call syncs the
    local weights from the global network, plays up to ``local_t_max`` steps
    and applies the resulting gradients to the global network through
    ``grad_applier``.

    All console output uses the ``print(...)`` call form so the class parses
    and behaves the same on Python 2 and Python 3.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, action_size, gamma, local_t_max, entropy_beta,
                 agent_type, performance_log_interval, log_level,
                 random_seed):
        """Build the local network, its gradient/apply/sync ops and the game.

        Args:
            thread_index: Index of this worker; forwarded to the network and
                multiplied into the game seed.
            global_network: Shared network whose variables receive updates.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Tensor that is fed the annealed learning rate.
            grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
                the update op.
            max_global_time_step: Global step at which the rate reaches zero.
            device: Device string for the local network and gradient ops.
            action_size: Number of discrete actions.
            gamma: Discount factor for the n-step return.
            local_t_max: Maximum number of steps per ``process`` call.
            entropy_beta: Entropy weight passed to ``prepare_loss``.
            agent_type: ``'LSTM'`` selects the LSTM network; any other value
                selects the feed-forward network.
            performance_log_interval: Local-step interval between throughput
                log lines (worker 0 only).
            log_level: Console logging is enabled when this equals ``'FULL'``.
            random_seed: Seed for NumPy's global RNG and (times
                ``thread_index``) for the game.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.action_size = action_size
        self.gamma = gamma
        self.local_t_max = local_t_max
        self.agent_type = agent_type
        self.performance_log_interval = performance_log_interval
        self.log_level = log_level

        if self.agent_type == 'LSTM':
            self.local_network = GameACLSTMNetwork(self.action_size,
                                                   thread_index, device)
        else:
            self.local_network = GameACFFNetwork(self.action_size,
                                                 thread_index, device)

        self.local_network.prepare_loss(entropy_beta)

        with tf.device(device):
            # The variables themselves (not ``_ref()`` handles) are used as
            # the differentiation targets in this variant.
            var_refs = list(self.local_network.get_vars())

            self.gradients = tf.gradients(
                self.local_network.total_loss, var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        # NOTE: this seeds NumPy's process-wide RNG, not a per-worker one.
        np.random.seed(random_seed)
        self.game_state = GameState(random_seed * thread_index,
                                    self.action_size)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate
        self.learn_rate = self.initial_learning_rate

        self.reset_counters()

        self.episode = 0

        # variable controlling log output
        self.prev_local_t = 0

    def reset_counters(self):
        """Clear the per-episode accumulators reported to ``statistics``."""
        self.total_q_max = 0
        self.episode_reward = 0
        self.episode_actions = []
        self.passed_obst = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0.

        The result is also stored in ``self.learn_rate`` so it can be
        reported with the episode statistics.
        """
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step
        ) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        self.learn_rate = learning_rate
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index using ``pi_values`` as probabilities."""
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def set_start_time(self, start_time):
        """Record the wall-clock start used for the throughput log."""
        self.start_time = start_time

    def process(self, sess, global_t, statistics):
        """Run one rollout and apply its gradients to the global network.

        Args:
            sess: Session used to run the sync, inference and update ops.
            global_t: Global step count; anneals the learning rate and is
                reported in logs and statistics.
            statistics: Object whose ``update(...)`` is called once per
                finished episode.

        Returns:
            The number of local steps taken during this call.
        """
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if self.agent_type == 'LSTM':
            # Remember the recurrent state at the start of the rollout so the
            # training pass can replay the same sequence.
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # process game
            try:
                # Bitblt may raise error, but we can safely ignore it,
                # otherwise thread will die
                self.game_state.process(action)
            except Exception as e:
                # ``print(e)`` works on Python 2 and 3.  ``e.message`` no
                # longer exists on Python 3 and would raise from inside this
                # handler, killing the thread the handler is meant to protect.
                print(e)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal
            # Read before ``update()`` below, exactly as the step produced it.
            steps = self.game_state.steps

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            self.total_q_max += np.max(pi_)
            self.episode_actions.append(action)
            self.passed_obst = self.game_state.passed_obst

            if terminal:
                terminal_end = True
                self.episode += 1

                if self.log_level == 'FULL':
                    reward_steps = format(
                        float(self.episode_reward) / float(steps), '.4f')
                    print(
                        "THREAD: {} / EPISODE: {} / TOTAL STEPS: {} / STEPS: {} / PASSED OBST: {} / REWARD: {} / REWARD/STEP: {}".format(
                            self.thread_index, self.episode, global_t, steps,
                            self.passed_obst, self.episode_reward,
                            reward_steps))

                statistics.update(global_t, self.episode_reward,
                                  self.total_q_max, steps,
                                  self.episode_actions, self.learn_rate,
                                  self.passed_obst)

                self.reset_counters()

                self.game_state.reset()
                if self.agent_type == 'LSTM':
                    self.local_network.reset_state()
                break

        # Bootstrap from the critic unless the rollout ended the episode.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients (latest step first)
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            td = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if self.agent_type == 'LSTM':
            # The recurrent network is unrolled in time order, so undo the
            # reversal before feeding the batch.
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.learning_rate_input: cur_learning_rate
                     })

        if (self.thread_index == 0) and (
                self.local_t - self.prev_local_t >= self.performance_log_interval
        ) and (self.log_level == 'FULL'):
            self.prev_local_t += self.performance_log_interval
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
    """A3C worker: rolls out a local network and updates the global one.

    The network type (LSTM or feed-forward) is selected by the module-level
    ``USE_LSTM`` flag.  Each ``process`` call syncs weights from the global
    network, plays up to ``LOCAL_T_MAX`` steps and applies the gradients of
    that rollout to the global network through ``grad_applier``.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        """Build the local network, its gradient/apply/sync ops and the game.

        Args:
            thread_index: Index of this worker; forwarded to the network and
                used to derive the game seed (``113 * thread_index``).
            global_network: Shared network whose variables receive updates.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Tensor that is fed the annealed learning rate.
            grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
                the update op.
            max_global_time_step: Global step at which the rate reaches zero.
            device: Device string for the local network and gradient ops.
        """
        # Plain bookkeeping state.
        self.thread_index = thread_index
        self.initial_learning_rate = initial_learning_rate
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.local_t = 0
        self.episode_reward = 0
        self.prev_local_t = 0  # local step of the last throughput log line

        network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
        network = network_cls(ACTION_SIZE, thread_index, device)
        network.prepare_loss(ENTROPY_BETA)
        self.local_network = network

        with tf.device(device):
            local_var_refs = [var._ref() for var in network.get_vars()]
            self.gradients = tf.gradients(
                network.total_loss,
                local_var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        # Local gradients are applied to the *global* network's variables.
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)
        self.sync = network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        remaining_steps = self.max_global_time_step - global_time_step
        learning_rate = (self.initial_learning_rate * remaining_steps
                         / self.max_global_time_step)
        return 0.0 if learning_rate < 0.0 else learning_rate

    def choose_action(self, pi_values):
        """Sample an action index using ``pi_values`` as probabilities."""
        action_indices = range(len(pi_values))
        return np.random.choice(action_indices, p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        """Record the wall-clock start used for the throughput log."""
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op,
                score_input):
        """Run one rollout and apply its gradients to the global network.

        Args:
            sess: Session used to run the sync, inference and update ops.
            global_t: Global step count; anneals the learning rate and tags
                the score summary.
            summary_writer: Writer that receives the episode score summary.
            summary_op: Summary op evaluated when an episode ends.
            score_input: Tensor fed with the episode score for ``summary_op``.

        Returns:
            The number of local steps taken during this call.
        """
        network = self.local_network
        states, actions, rewards, values = [], [], [], []
        terminal_end = False

        # Start the rollout from the current global weights.
        sess.run(self.sync)
        start_local_t = self.local_t

        if USE_LSTM:
            # Recurrent state at the start of the rollout, replayed when the
            # gradients are computed.
            start_lstm_state = network.lstm_state_out

        for _ in range(LOCAL_T_MAX):
            observation = self.game_state.s_t
            pi_, value_ = network.run_policy_and_value(sess, observation)
            action = self.choose_action(pi_)

            states.append(observation)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Advance the game and read back the result.
            self.game_state.process(action)
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            # Rewards are clipped to [-1, 1] for training only.
            rewards.append(np.clip(reward, -1, 1))
            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if not terminal:
                continue

            terminal_end = True
            print("score={}".format(self.episode_reward))
            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, global_t)
            self.episode_reward = 0
            self.game_state.reset()
            if USE_LSTM:
                network.reset_state()
            break

        # Bootstrap from the critic unless the rollout ended the episode.
        if terminal_end:
            R = 0.0
        else:
            R = network.run_value(sess, self.game_state.s_t)

        batch_si, batch_a, batch_td, batch_R = [], [], [], []

        # Walk the rollout backwards, accumulating the discounted return.
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(values))
        for ai, ri, si, Vi in backwards:
            R = ri + GAMMA * R
            one_hot = np.zeros([ACTION_SIZE])
            one_hot[ai] = 1

            batch_si.append(si)
            batch_a.append(one_hot)
            batch_td.append(R - Vi)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            # The recurrent network is unrolled in time order, so the batches
            # are flipped back to chronological order; the feed-forward path
            # feeds them latest-step-first.
            for batch in (batch_si, batch_a, batch_td, batch_R):
                batch.reverse()

        feed = {
            network.s: batch_si,
            network.a: batch_a,
            network.td: batch_td,
            network.r: batch_R,
            self.learning_rate_input: cur_learning_rate,
        }
        if USE_LSTM:
            feed[network.initial_lstm_state] = start_lstm_state
            feed[network.step_size] = [len(batch_a)]

        sess.run(self.apply_gradients, feed_dict=feed)

        steps_since_log = self.local_t - self.prev_local_t
        if (self.thread_index == 0) and (
                steps_since_log >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec,
                steps_per_sec * 3600 / 1000000.))

        # Number of local steps advanced by this call.
        return self.local_t - start_local_t
class A3CTrainingThread(object):
    """Per-machine actor-critic worker driving a ``JspEnv``.

    ``thread_index`` doubles as the machine number.  Unlike a classic A3C
    worker there is no global network here: gradients are applied to this
    worker's own network, and no weight sync is performed.
    """

    def __init__(self, thread_index, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_episode,
                 device, arrived_jobs, condition):
        """Build the local network, its gradient/apply ops and the environment.

        Args:
            thread_index: Machine number; selects the operations and is
                forwarded to the network and the environment.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Tensor that is fed the annealed learning rate.
            grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
                the update op.
            max_global_time_episode: Global count at which the rate reaches
                zero.
            device: Device string for the local network and gradient ops.
            arrived_jobs: Forwarded to ``JspEnv``.
            condition: Stored on the instance; not used inside this class.
        """
        # Plain bookkeeping state.
        self.thread_index = thread_index
        self.initial_learning_rate = initial_learning_rate
        self.learning_rate_input = learning_rate_input
        self.max_global_time_episode = max_global_time_episode
        self.condition = condition
        self.is_terminal_counted = False
        self.last_episode_reward = 0
        self.local_t = 0
        self.episode_reward = 0
        self.prev_local_t = 0

        # Use thread_index (i.e. the machine number) to fetch every operation
        # that is processed on this machine.
        self.operations = get_data_by_machine(thread_index)

        # The first argument is the action size.
        # NOTE(review): the earlier comment said the number of operations on
        # this machine is passed here, but the code passes ACTION_SIZE.
        network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
        network = network_cls(ACTION_SIZE, thread_index, device)
        network.prepare_loss(ENTROPY_BETA)
        self.local_network = network

        with tf.device(device):
            local_var_refs = [var._ref() for var in network.get_vars()]
            self.gradients = tf.gradients(
                network.total_loss,
                local_var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        # Gradients are applied to the local network's own variables.
        self.apply_gradients = grad_applier.apply_gradients(
            network.get_vars(), self.gradients)

        # Create the environment for this machine.
        self.env = JspEnv(self.operations, thread_index, arrived_jobs)

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        remaining = self.max_global_time_episode - global_time_step
        learning_rate = (self.initial_learning_rate * remaining
                         / self.max_global_time_episode)
        return 0.0 if learning_rate < 0.0 else learning_rate

    def choose_action(self, pi_values, use_max_choice):
        """Sample an action index using ``pi_values`` as probabilities.

        ``use_max_choice`` is accepted for interface compatibility but is
        ignored: the action is always sampled.
        """
        action_indices = range(len(pi_values))
        return np.random.choice(action_indices, p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        """Record a wall-clock start time on the instance."""
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op,
                score_input, use_max_choice):
        """Play one full episode and apply its gradients.

        Args:
            sess: Session used to run the inference and update ops.
            global_t: Global count; anneals the learning rate and tags the
                score summary.
            summary_writer: Writer that receives the episode score summary.
            summary_op: Summary op evaluated when the episode ends.
            score_input: Tensor fed with the episode score for ``summary_op``.
            use_max_choice: Forwarded to ``choose_action`` (ignored there).

        Returns:
            A ``(steps, complete_time, episode_reward)`` tuple: local steps
            taken, the environment clock at the end of the episode, and the
            total reward of that episode.
        """
        network = self.local_network
        states, actions, rewards, values = [], [], [], []
        terminal_end = False

        start_local_t = self.local_t

        if USE_LSTM:
            # Recurrent state at the start of the rollout, replayed when the
            # gradients are computed.
            start_lstm_state = network.lstm_state_out

        # The rollout is not capped at LOCAL_T_MAX: it runs until the
        # environment reports a terminal step.
        while True:
            pi_, value_ = network.run_policy_and_value(
                sess, self.env.local_state)
            action = self.choose_action(pi_, use_max_choice)

            states.append(self.env.local_state)
            actions.append(action)
            values.append(value_)

            # Only the reward and terminal flag of the step result are used.
            new_state, reward, terminal, info = self.env.step(action)

            self.episode_reward += reward
            # The raw reward is stored; no clipping is applied here.
            rewards.append(reward)
            self.local_t += 1

            if not terminal:
                continue

            terminal_end = True
            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, global_t)

            # Keep the episode results for the return value before resetting.
            self.complete_time = self.env.clock
            self.last_episode_reward = self.episode_reward
            self.episode_reward = 0

            self.env.reset()
            if USE_LSTM:
                network.reset_state()
            break

        # The loop above only exits on a terminal step, so ``terminal_end`` is
        # always True here and the critic bootstrap is never taken as written.
        if terminal_end:
            R = 0.0
        else:
            R = network.run_value(sess, self.env.local_state)

        batch_si, batch_a, batch_td, batch_R = [], [], [], []

        # Walk the episode backwards, accumulating the discounted return.
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(values))
        for ai, ri, si, Vi in backwards:
            R = ri + GAMMA * R
            one_hot = np.zeros([ACTION_SIZE])
            one_hot[ai] = 1

            batch_si.append(si)
            batch_a.append(one_hot)
            batch_td.append(R - Vi)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            # The recurrent network is unrolled in time order, so the batches
            # are flipped back to chronological order; the feed-forward path
            # feeds them latest-step-first.
            for batch in (batch_si, batch_a, batch_td, batch_R):
                batch.reverse()

        feed = {
            network.s: batch_si,
            network.a: batch_a,
            network.td: batch_td,
            network.r: batch_R,
            self.learning_rate_input: cur_learning_rate,
        }
        if USE_LSTM:
            feed[network.initial_lstm_state] = start_lstm_state
            feed[network.step_size] = [len(batch_a)]

        sess.run(self.apply_gradients, feed_dict=feed)

        # Number of local steps advanced by this call.
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, self.complete_time, self.last_episode_reward
class A3CTrainingThread(object):
    """A3C worker controlling two agents that share one network and reward.

    Every step, both agents' observations (their state with the current
    episode step appended) are run through the same local network, both
    sampled actions are sent to the game together, and the single reward the
    game reports is credited to both agents.  After the rollout, one gradient
    update per agent is applied to the global network.
    """

    # Number of steps after which an episode is ended and the game is reset.
    EPISODE_STEP_LIMIT = 100

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):
        """Build the local network, its gradient/apply/sync ops and the game.

        Args:
            thread_index: Index of this worker; forwarded to the network and
                used to derive the game seed (``113 * thread_index``).
            global_network: Shared network whose variables receive updates.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Tensor that is fed the annealed learning rate.
            grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
                the update op.
            max_global_time_step: Global step at which the rate reaches zero.
            device: Device string for the local network and gradient ops.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # STATE_SIZE = 6 - 3 Landmarks + 5 (comm-size)
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(
                self.local_network.total_loss, var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.epSteps = 0  # steps taken in the current episode

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly decayed learning rate, floored at 0.0."""
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
            self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index using ``pi_values`` as probabilities."""
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Evaluate ``summary_op`` for ``score`` and write it at ``global_t``."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        """Record the wall-clock start used for the throughput log."""
        self.start_time = start_time

    def _observe(self, agent_state):
        """Return ``agent_state`` with the current episode step appended."""
        return np.concatenate([agent_state, [self.epSteps]])

    @staticmethod
    def _build_batch(actions, rewards, states, values, bootstrap_value,
                     gamma, action_size):
        """Turn one agent's rollout into chronological training batches.

        Args:
            actions: Action indices, in the order they were taken.
            rewards: Rewards, aligned with ``actions``.
            states: Observations, aligned with ``actions``.
            values: Critic estimates, aligned with ``actions``.
            bootstrap_value: Return assumed after the last step.
            gamma: Discount factor.
            action_size: Width of the one-hot action vectors.

        Returns:
            ``(batch_si, batch_a, batch_td, batch_R)`` lists in chronological
            order: states, one-hot actions, advantages and discounted returns.
        """
        batch_si, batch_a, batch_td, batch_R = [], [], [], []
        R = bootstrap_value

        # Accumulate the discounted return from the last step backwards.
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(values))
        for ai, ri, si, Vi in backwards:
            R = ri + gamma * R
            one_hot = np.zeros([action_size])
            one_hot[ai] = 1

            batch_si.append(si)
            batch_a.append(one_hot)
            batch_td.append(R - Vi)
            batch_R.append(R)

        # Restore chronological order for feeding.
        for batch in (batch_si, batch_a, batch_td, batch_R):
            batch.reverse()
        return batch_si, batch_a, batch_td, batch_R

    def process(self, sess, global_t, summary_writer, summary_op,
                score_input):
        """Run one rollout and apply both agents' gradients.

        Args:
            sess: Session used to run the sync, inference and update ops.
            global_t: Global step count; anneals the learning rate and tags
                the score summary.
            summary_writer: Writer that receives the episode score summary.
            summary_op: Summary op evaluated when an episode ends.
            score_input: Tensor fed with the episode score for ``summary_op``.

        Returns:
            The number of local steps taken during this call.
        """
        states, states2 = [], []
        actions, actions2 = [], []
        rewards = []
        values, values2 = [], []

        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for _ in range(LOCAL_T_MAX):
            # Build each agent's observation once and reuse it for both the
            # forward pass and the stored training sample.
            obs1 = self._observe(self.game_state.s1_t)
            obs2 = self._observe(self.game_state.s2_t)

            pi_, value_ = self.local_network.run_policy_and_value(sess, obs1)
            pi2_, value2_ = self.local_network.run_policy_and_value(sess, obs2)
            action = self.choose_action(pi_)
            action2 = self.choose_action(pi2_)

            states.append(obs1)
            states2.append(obs2)
            actions.append(action)
            actions2.append(action2)
            values.append(value_)
            values2.append(value2_)

            # process game
            self.game_state.process([action, action2])

            # receive game result (one reward shared by both agents)
            reward = self.game_state.reward
            self.episode_reward += reward

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))
                print(" R={}".format(reward))

            # The raw reward is stored; clipping is disabled in this variant.
            rewards.append(reward)

            self.local_t += 1
            self.epSteps += 1

            # s_t1 -> s_t
            self.game_state.update()

            if self.epSteps >= self.EPISODE_STEP_LIMIT:
                terminal_end = True
                self.epSteps = 0
                if (self.thread_index == 0 and
                        self.local_t % LOG_INTERVAL == 0):
                    print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                # NOTE(review): the local network is a GameACFFNetwork;
                # confirm that it defines reset_state().
                self.local_network.reset_state()
                break

        if terminal_end:
            # The episode is over and the game has already been reset, so
            # there is nothing to bootstrap from.  Evaluating the critic here
            # would leak the value of the next episode's first state into
            # this episode's returns.
            R = 0.0
            R2 = 0.0
        else:
            R = self.local_network.run_value(
                sess, self._observe(self.game_state.s1_t))
            R2 = self.local_network.run_value(
                sess, self._observe(self.game_state.s2_t))

        # compute and accumulate gradients
        batch_si, batch_a, batch_td, batch_R = self._build_batch(
            actions, rewards, states, values, R, GAMMA, ACTION_SIZE)
        batch_s2i, batch_a2, batch_td2, batch_R2 = self._build_batch(
            actions2, rewards, states2, values2, R2, GAMMA, ACTION_SIZE)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # One update per agent, first agent first.
        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.learning_rate_input: cur_learning_rate
                 })
        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_s2i,
                     self.local_network.a: batch_a2,
                     self.local_network.td: batch_td2,
                     self.local_network.r: batch_R2,
                     self.learning_rate_input: cur_learning_rate
                 })

        if (self.thread_index == 0) and \
           (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            # Adjacent literals are joined by the parser, so no source
            # indentation ends up inside the message (a backslash
            # continuation inside the string did embed it).
            print(("### Performance : {} STEPS in {:.0f} sec. "
                   "{:.0f} STEPS/sec. {:.2f}M STEPS/hour").format(
                       global_t, elapsed_time, steps_per_sec,
                       steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t