def main(args):
    """Visualize the first conv-layer weights of a trained UNREAL model.

    Builds the global network on CPU, restores the latest checkpoint from
    ``flags.checkpoint_dir``, fetches ``W_base_conv1`` and renders each
    (input-channel, output-channel) 2-D filter slice as a grayscale image
    in a 3x16 grid.  ``args`` is unused (tf.app.run-style entry point).
    """
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    global_network = UnrealModel(action_size,
                                 objective_size,
                                 -1,
                                 flags.use_lstm,
                                 flags.use_pixel_change,
                                 flags.use_value_replay,
                                 flags.use_reward_prediction,
                                 0.0,
                                 0.0,
                                 "/cpu:0")  # use CPU for weight visualize tool
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    # FIX: was `vars = {}` + append loop — `vars` shadows the builtin;
    # build the name->variable map with a comprehension instead.
    var_map = {v.name: v for v in global_network.get_vars()}
    W_conv1 = sess.run(var_map['net_-1/base_conv/W_base_conv1:0'])

    # show graph of W_conv1: W_conv1 is indexed [h, w, in_ch, out_ch],
    # so 3 input channels x 16 output channels fills the 3x16 grid.
    fig, axes = plt.subplots(3, 16, figsize=(12, 6),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(3 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()
class Trainer(object):
    """A3C/UNREAL worker thread.

    Owns one local copy of the UNREAL network plus one environment
    instance; repeatedly syncs weights from the shared global network,
    rolls out up to ``local_t_max`` steps, and applies gradients for the
    base A3C loss plus the enabled auxiliary tasks (pixel change, value
    replay, reward prediction).
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        # Per-thread replica of the shared model; gradients computed on it
        # are applied to global_network's variables via minimize_local.
        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())
        # Op that copies global weights into this thread's local network.
        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0

    def prepare(self):
        # Environment creation is deferred until the worker thread starts.
        self.environment = Environment.create_environment(
            self.env_type, self.env_name)

    def stop(self):
        self.environment.stop()

    def _anneal_learning_rate(self, global_time_step):
        # Linear decay from initial_learning_rate to 0 over
        # max_global_time_step steps, clamped at 0.
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        # Sample an action index from the policy distribution.
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        # Write the episode score to TensorBoard at global step global_t.
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
        Fill experience buffer until buffer is full.
        """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        # Thread-0 only: report throughput every PERFORMANCE_LOG_INTERVAL
        # local steps.
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                  .format(global_t, elapsed_time, steps_per_sec,
                          steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        """Roll out up to local_t_max steps and build the base A3C batch.

        Returns (states, last_action_rewards, one-hot actions, advantages,
        n-step returns, LSTM state at rollout start).
        """
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # LSTM state is captured BEFORE the rollout so the training pass
        # can replay the same sequence from the same recurrent state.
        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            # Modify Last State - with attention
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            # Modify New State - with attention
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))
                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)
                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # Bootstrap the return from V(s_T) unless the episode terminated.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        # Walk the rollout backwards to accumulate discounted returns,
        # then reverse again so the batch is in chronological order.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        """Build the pixel-change auxiliary batch from replayed frames."""
        # [pixel change]
        # Sample 20+1 frame (+1 for last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        # 20x20 spatial map of discounted pixel-change returns; bootstrap
        # from max-Q of the newest frame unless it is terminal.
        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        """Build the value-replay auxiliary batch from replayed frames."""
        # [Value replay]
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    def _process_rp(self):
        """Build the reward-prediction batch: 3 frames + 3-way reward class."""
        # [Reward prediction]
        rp_experience_frames = self.experience.sample_rp_sequence()
        # 4 frames

        batch_rp_si = []
        batch_rp_c = []

        for i in range(3):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward (zero / positive / negative).
        # NOTE(review): exact float equality r == 0 here; the segnet
        # variant of this class uses an epsilon band instead.
        r = rp_experience_frames[3].reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """One training iteration; returns the number of local steps taken."""
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
            self._process_base(sess, global_t, summary_writer, summary_op,
                               score_input)

        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)
            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)
            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c = self._process_rp()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # Calculate gradients and copy them to global network.
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        self._print_log(global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class Trainer(object):
    """UNREAL worker thread, segnet-extended variant.

    Extends the base A3C/UNREAL trainer with: optional LSTM, a semantic
    segmentation decoder head (``segnet_mode >= 2``), success-rate
    tracking, per-loss TensorBoard summaries, and optional GPU timeline
    profiling.  States are dicts (at least ``'image'`` and, for segnet,
    ``'objectType'`` keys).
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_lstm, use_pixel_change, use_value_replay,
                 use_reward_prediction, pixel_change_lambda, entropy_beta,
                 local_t_max, n_step_TD, gamma, gamma_pc,
                 experience_history_size, max_global_time_step, device,
                 segnet_param_dict, image_shape, is_training, n_classes,
                 random_state, termination_time, segnet_lambda, dropout):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_lstm = use_lstm
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        # Rollout length for the base A3C update (replay sampling still
        # uses local_t_max).
        self.n_step_TD = n_step_TD
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.objective_size = Environment.get_objective_size(
            env_type, env_name)
        self.segnet_param_dict = segnet_param_dict
        self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None)
        self.is_training = is_training
        self.n_classes = n_classes
        self.segnet_lambda = segnet_lambda
        self.run_metadata = tf.RunMetadata()
        self.many_runs_timeline = TimeLiner()
        # Shared RandomState so action sampling / replay sampling are
        # reproducible per run.
        self.random_state = random_state
        self.termination_time = termination_time
        self.dropout = dropout
        try:
            self.local_network = UnrealModel(
                self.action_size, self.objective_size, thread_index, use_lstm,
                use_pixel_change, use_value_replay, use_reward_prediction,
                pixel_change_lambda, entropy_beta, device,
                segnet_param_dict=self.segnet_param_dict,
                image_shape=image_shape,
                is_training=is_training,
                n_classes=n_classes,
                segnet_lambda=self.segnet_lambda,
                dropout=dropout)
            self.local_network.prepare_loss()

            self.apply_gradients = grad_applier.minimize_local(
                self.local_network.total_loss, global_network.get_vars(),
                self.local_network.get_vars(), self.thread_index)

            self.sync = self.local_network.sync_from(global_network)
            self.experience = Experience(self.experience_history_size,
                                         random_state=self.random_state)
            self.local_t = 0
            self.initial_learning_rate = initial_learning_rate
            self.episode_reward = 0
            # For log output; -1 is a sentinel meaning "evaluation vars not
            # yet reset" (see process()).
            self.prev_local_t = -1
            self.prev_local_t_loss = 0
            # Success rate over a sliding window of the last 50 episodes.
            self.sr_size = 50
            self.success_rates = deque(maxlen=self.sr_size)
        except Exception as e:
            print(str(e))  # , flush=True)
            raise Exception(
                "Problem in Trainer {} initialization".format(thread_index))

    def prepare(self, termination_time=50.0, termination_dist_value=-10.0):
        # NOTE(review): the two keyword parameters are unused — the stored
        # self.termination_time is passed instead.
        self.environment = Environment.create_environment(
            self.env_type, self.env_name, self.termination_time,
            thread_index=self.thread_index)

    def stop(self):
        self.environment.stop()

    def _anneal_learning_rate(self, global_time_step):
        # Linear decay to 0 over max_global_time_step, clamped at 0.
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        # Sample from the policy using the shared RandomState (not
        # np.random) for reproducibility.
        return self.random_state.choice(len(pi_values), p=pi_values)

    def _record_one(self, sess, summary_writer, summary_op, score_input,
                    score, global_t):
        # Write a single scalar summary to every writer in summary_writer.
        if self.thread_index >= 0:
            summary_str = sess.run(summary_op, feed_dict={score_input: score})
            for sum_wr in summary_writer:
                sum_wr.add_summary(summary_str, global_t)

    def _record_all(self, sess, summary_writer, summary_op, dict_input,
                    dict_eval, global_t):
        # Write a batch of scalar summaries; dict_input maps names to
        # placeholders, dict_eval maps the same names to values.
        if self.thread_index >= 0:
            # NOTE(review): the assert message is `print(...)`, which
            # evaluates to None — the keys are printed as a side effect
            # before the AssertionError is raised.
            assert set(dict_input.keys()) == set(dict_eval.keys()), print(
                dict_input.keys(), dict_eval.keys())
            feed_dict = {}
            for key in dict_input.keys():
                feed_dict.update({dict_input[key]: dict_eval[key]})
            summary_str = sess.run(summary_op, feed_dict=feed_dict)
            for sum_wr in summary_writer:
                sum_wr.add_summary(summary_str, global_t)

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
        Fill experience buffer until buffer is full.
        """
        #print("Start experience filling", flush=True)
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, prev_state)

        #print("Local network run base policy, value!", flush=True)
        pi_, _, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action, flag=0)

        # Segmentation masks ('objectType') are stripped before storing to
        # keep the replay buffer small; only _process_base needs them.
        frame = ExperienceFrame(
            {
                key: val
                for key, val in prev_state.items() if 'objectType' not in key
            }, reward, action, terminal, pixel_change, last_action,
            last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        # Thread-0 only: report throughput every PERFORMANCE_LOG_INTERVAL
        # local steps.
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                  .format(global_t, elapsed_time, steps_per_sec,
                          steps_per_sec * 3600 / 1000000.))  # , flush=True)
            # print("### Experience : {}".format(self.experience.get_debug_string()))

    def _process_base(self, sess, global_t, summary_writer, summary_op_dict,
                      summary_dict):  # , losses_input):
        """Roll out up to n_step_TD steps and build the base A3C batch.

        On episode end, stores score / success-rate values into
        ``summary_dict['values']`` for the caller to record.
        """
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = None
        if self.use_lstm:
            # Capture recurrent state BEFORE the rollout for replay.
            start_lstm_state = self.local_network.base_lstm_state_out

        mode = "segnet" if self.segnet_mode >= 2 else ""

        # t_max times loop
        flag = 0
        for _ in range(self.n_step_TD):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward,
                self.environment.last_state)

            pi_, value_, losses = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward, mode)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("Trainer {}>>> Local step {}:".format(
                    self.thread_index, self.local_t))
                print("Trainer {}>>> pi={}".format(self.thread_index, pi_))
                print("Trainer {}>>> V={}".format(self.thread_index, value_))
                flag = 1

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action, flag=flag)

            # Segmentation masks stripped before storage (see
            # _fill_experience).
            frame = ExperienceFrame(
                {
                    key: val
                    for key, val in prev_state.items()
                    if 'objectType' not in key
                }, reward, action, terminal, pixel_change, last_action,
                last_reward)

            # Store to experience
            self.experience.add_frame(frame)
            # Use to know about Experience collection
            #print(self.experience.get_debug_string())

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if terminal:
                terminal_end = True
                print("Trainer {}>>> score={}".format(
                    self.thread_index, self.episode_reward))  # , flush=True)

                summary_dict['values'].update(
                    {'score_input': self.episode_reward})

                success = 1 if self.environment._last_full_state[
                    "success"] else 0
                #print("Type:", type(self.environment._last_full_state["success"]), len(self.success_rates), success)
                self.success_rates.append(success)
                # Success rate is reported only once the window is full.
                summary_dict['values'].update({
                    'sr_input':
                    np.mean(self.success_rates)
                    if len(self.success_rates) == self.sr_size else 0
                })

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                if flag:
                    flag = 0
                break

        # Bootstrap the return from V(s_T) unless the episode terminated.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        # Accumulate discounted returns backwards, then restore
        # chronological order with the second reverse.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []
        batch_sobjT = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si['image'])
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)
            if self.segnet_param_dict["segnet_mode"] >= 2:
                batch_sobjT.append(si['objectType'])

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()
        batch_sobjT.reverse()

        #print(np.unique(batch_sobjT))
        ## HERE Mathematical Error A3C: only last values should be used for base/ or aggregate with last made
        return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        """Build the pixel-change auxiliary batch from replayed frames."""
        # [pixel change]
        # Sample 20+1 frame (+1 for last next state)
        #print(">>> Process run!", flush=True)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        # pc_experience_frames.reverse()
        pc_experience_frames = pc_experience_frames[::-1]
        #print(">>> Process ran!", flush=True)

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        # 20x20 map of discounted pixel-change returns; bootstrap from
        # max-Q of the newest frame unless terminal.
        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        #print(">>> Process run!", flush=True)
        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state['image'])
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()
        #print(">>> Process ended!", flush=True)

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        """Build the value-replay auxiliary batch from replayed frames."""
        # [Value replay]
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state['image'])
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    def _process_rp(self):
        """Build the reward-prediction batch: 3 frames + 3-way reward class."""
        # [Reward prediction]
        rp_experience_frames = self.experience.sample_rp_sequence()
        # 4 frames

        batch_rp_si = []
        batch_rp_c = []

        for i in range(3):
            batch_rp_si.append(rp_experience_frames[i].state['image'])

        # one hot vector for target reward; epsilon band instead of exact
        # float equality for the "zero" class.
        r = rp_experience_frames[3].reward
        rp_c = [0.0, 0.0, 0.0]
        if -1e-10 < r < 1e-10:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c

    def process(self, sess, global_t, summary_writer, summary_op_dict,
                score_input, sr_input, eval_input, entropy_input,
                term_global_t, losses_input):
        """One training iteration.

        Returns (local steps advanced, episode score or None).
        NOTE(review): ``term_global_t`` is only referenced from
        commented-out recording code.
        """
        # First call with segnet: reset streaming evaluation (mIoU) vars.
        if self.prev_local_t == -1 and self.segnet_mode >= 2:
            self.prev_local_t = 0
            sess.run(self.local_network.reset_evaluation_vars)
        # Fill experience replay buffer
        #print("Inside train process of thread!", flush=True)
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0, None

        start_local_t = self.local_t
        episode_score = None

        cur_learning_rate = self._anneal_learning_rate(global_t)

        #print("Weights copying!", flush=True)
        # Copy weights from shared to local
        sess.run(self.sync)
        #print("Weights copied successfully!", flush=True)

        summary_dict = {'placeholders': {}, 'values': {}}
        summary_dict['placeholders'].update(losses_input)

        # [Base]
        #print("[Base]", flush=True)
        batch_si, batch_sobjT, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state, = \
            self._process_base(sess, global_t, summary_writer,
                               summary_op_dict, summary_dict)

        # Record per-episode score / success-rate if an episode finished.
        if summary_dict['values'].get('score_input', None) is not None:
            self._record_one(sess, summary_writer,
                             summary_op_dict['score_input'], score_input,
                             summary_dict['values']['score_input'], global_t)
            self._record_one(sess, summary_writer,
                             summary_op_dict['sr_input'], sr_input,
                             summary_dict['values']['sr_input'], global_t)
            #self._record_one(sess, summary_writer, summary_op_dict['term_global_t'], term_global_t,
            #                 global_t, global_t)
            #summary_writer[0].flush()
            # summary_writer[1].flush()

        # Return advanced local step size
        episode_score = summary_dict['values'].get('score_input', None)
        summary_dict['values'] = {}

        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            # [common]
            self.learning_rate_input: cur_learning_rate,
            self.is_training: True
        }
        if self.use_lstm:
            feed_dict[
                self.local_network.base_initial_lstm_state] = start_lstm_state
        if self.segnet_param_dict["segnet_mode"] >= 2:
            feed_dict[self.local_network.base_segm_mask] = batch_sobjT

        #print("[Pixel change]", flush=True)
        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)
            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        #print("[Value replay]", flush=True)
        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)
            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        #print("[Reward prediction]", flush=True)
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c = self._process_rp()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)
            #print(len(batch_rp_c), batch_rp_c)

        grad_check = None
        #if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
        #    grad_check = [tf.add_check_numerics_ops()]

        #print("Applying gradients in train!", flush=True)
        # Calculate gradients and copy them to global network.
        # Fetch order below must match the unpacking order further down.
        out_list = [self.apply_gradients]
        out_list += [
            self.local_network.total_loss, self.local_network.base_loss,
            self.local_network.policy_loss, self.local_network.value_loss,
            self.local_network.entropy
        ]
        if self.segnet_mode >= 2:
            out_list += [self.local_network.decoder_loss]
            out_list += [self.local_network.regul_loss]
        if self.use_pixel_change:
            out_list += [self.local_network.pc_loss]
        if self.use_value_replay:
            out_list += [self.local_network.vr_loss]
        if self.use_reward_prediction:
            out_list += [self.local_network.rp_loss]
        if self.segnet_mode >= 2:
            out_list += [self.local_network.update_evaluation_vars]
            if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
                out_list += [self.local_network.evaluation]

        import time
        now = time.time()
        with tf.control_dependencies(grad_check):
            if GPU_LOG:
                return_list = sess.run(out_list,
                                       feed_dict=feed_dict,
                                       options=run_options,
                                       run_metadata=self.run_metadata)
            else:
                return_list = sess.run(out_list,
                                       feed_dict=feed_dict,
                                       options=run_options)
        if time.time() - now > 30.0:
            # NOTE(review): sys.exit(0) makes the following raise
            # unreachable in normal operation.
            print("Too much time on sess.run: check tensorflow")  # , flush=True)
            sys.exit(0)
            raise ValueError("More than 100 seconds update in tensorflow!")

        # apply_gradients fetch yields a tuple whose [1] is the grad norm.
        gradients_tuple, total_loss, base_loss, policy_loss, value_loss, entropy = return_list[:6]
        grad_norm = gradients_tuple[1]
        return_list = return_list[6:]
        return_string = "Trainer {}>>> Total loss: {}, Base loss: {}\n".format(
            self.thread_index, total_loss, base_loss)
        return_string += "\t\tPolicy loss: {}, Value loss: {}, Grad norm: {}\nEntropy: {}\n".format(
            policy_loss, value_loss, grad_norm, entropy)
        losses_eval = {
            'all/total_loss': total_loss,
            'all/base_loss': base_loss,
            'all/policy_loss': policy_loss,
            'all/value_loss': value_loss,
            'all/loss/grad_norm': grad_norm
        }
        # Pop optional fetches in the same order they were appended.
        if self.segnet_mode >= 2:
            decoder_loss, l2_loss = return_list[:2]
            return_list = return_list[2:]
            return_string += "\t\tDecoder loss: {}, L2 weights loss: {}\n".format(
                decoder_loss, l2_loss)
            losses_eval.update({
                'all/decoder_loss': decoder_loss,
                'all/l2_weights_loss': l2_loss
            })
        if self.use_pixel_change:
            pc_loss = return_list[0]
            return_list = return_list[1:]
            return_string += "\t\tPC loss: {}\n".format(pc_loss)
            losses_eval.update({'all/pc_loss': pc_loss})
        if self.use_value_replay:
            vr_loss = return_list[0]
            return_list = return_list[1:]
            return_string += "\t\tVR loss: {}\n".format(vr_loss)
            losses_eval.update({'all/vr_loss': vr_loss})
        if self.use_reward_prediction:
            rp_loss = return_list[0]
            return_list = return_list[1:]
            return_string += "\t\tRP loss: {}\n".format(rp_loss)
            losses_eval.update({'all/rp_loss': rp_loss})
        if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
            if self.segnet_mode >= 2:
                return_string += "\t\tmIoU: {}\n".format(return_list[-1])

        summary_dict['values'].update(losses_eval)

        # Printing losses
        if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
            if self.segnet_mode >= 2:
                self._record_one(sess, summary_writer,
                                 summary_op_dict['eval_input'], eval_input,
                                 return_list[-1], global_t)
            self._record_one(sess, summary_writer,
                             summary_op_dict['entropy'], entropy_input,
                             entropy, global_t)
            # summary_writer[0].flush()
            # summary_writer[1].flush()
            print(return_string)
            self.prev_local_t_loss += LOSS_AND_EVAL_LOG_INTERVAL

        if GPU_LOG:
            fetched_timeline = timeline.Timeline(self.run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            self.many_runs_timeline.update_timeline(chrome_trace)

        self._print_log(global_t)

        # Recording score and losses
        self._record_all(sess, summary_writer,
                         summary_op_dict['losses_input'],
                         summary_dict['placeholders'],
                         summary_dict['values'], global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, episode_score
# --- Top-level training-script setup fragment ---
# NOTE(review): `config`, `global_network`, `LOG_FILE`, `CHECKPOINT_DIR`
# and `SAVE_INTERVAL_STEP` are defined outside this fragment — confirm
# against the full script.
sess = tf.Session(config=config)
init = tf.global_variables_initializer()
sess.run(init)

# summary for tensorboard
score_input = tf.placeholder(tf.int32)
average_entropy = tf.placeholder(tf.float32)
tf.summary.scalar("score", score_input)
tf.summary.scalar("entropy", average_entropy)
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(LOG_FILE, sess.graph)

# init or load checkpoint with saver
saver = tf.train.Saver(global_network.get_vars())
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    # Checkpoint paths end in "-<global_step>", so the step is recovered
    # by splitting on '-'.
    tokens = checkpoint.model_checkpoint_path.split("-")
    # set global step
    global_t = int(tokens[1])
    print(">>> global step set: ", global_t)
    # set wall time: training scripts persist elapsed wall time alongside
    # each checkpoint in a 'wall_t.<step>' file.
    wall_t_fname = CHECKPOINT_DIR + '/' + 'wall_t.' + str(global_t)
    with open(wall_t_fname, 'r') as f:
        wall_t = float(f.read())
    # Next multiple of SAVE_INTERVAL_STEP strictly above global_t.
    next_save_steps = (global_t + SAVE_INTERVAL_STEP
                       ) // SAVE_INTERVAL_STEP * SAVE_INTERVAL_STEP
class Application(object):
    """Driver for parallel A3C/UNREAL training.

    Builds one global UnrealModel plus one Trainer (with its own local
    network) per parallel thread, then runs each Trainer in a Python
    thread. Thread 0 additionally owns checkpointing: it pauses the other
    threads, saves, and restarts them.
    """

    def __init__(self):
        # All real setup happens in run(); __init__ is intentionally empty.
        pass

    def train_function(self, parallel_index, preparing):
        """
        Train each environment.

        Runs in its own thread. `preparing` is True on the first launch
        (the trainer must create its environment) and False when the
        thread is restarted after a checkpoint save.
        """
        trainer = self.trainers[parallel_index]
        if preparing:
            trainer.prepare()

        # set start_time
        trainer.set_start_time(self.start_time)

        while True:
            # stop_requested: temporary pause for checkpointing (thread exits;
            # save() restarts it). terminate_reqested: permanent shutdown.
            # NOTE(review): "reqested" is a typo, but it is used consistently
            # throughout this class, so it is kept for compatibility.
            if self.stop_requested:
                break
            if self.terminate_reqested:
                trainer.stop()
                break
            if self.global_t > flags.max_time_step:
                trainer.stop()
                break
            # Only thread 0 triggers checkpoint saves.
            if parallel_index == 0 and self.global_t > self.next_save_steps:
                # Save checkpoint
                self.save()

            #Each env calls its own process
            #Process has sub tasks called within
            diff_global_t = trainer.process(self.sess, self.global_t,
                                            self.summary_writer,
                                            self.summary_op, self.score_input)
            # NOTE(review): += on self.global_t from multiple threads is not
            # atomic-safe in general; under CPython's GIL updates are not lost
            # per-bytecode, but the read-modify-write can still interleave.
            self.global_t += diff_global_t

    def run(self):
        """Build the graph, restore any checkpoint, and start all threads."""
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"

        # Learning rate is sampled log-uniformly once per run.
        initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                            flags.initial_alpha_high,
                                            flags.initial_alpha_log_rate)
        self.global_t = 0

        self.stop_requested = False
        self.terminate_reqested = False

        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)
        # Thread index -1 marks the global (shared) network.
        self.global_network = UnrealModel(action_size, -1,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          flags.pixel_change_lambda,
                                          flags.entropy_beta, device)
        self.trainers = []

        learning_rate_input = tf.placeholder("float")

        # Shared gradient applier: each trainer computes local gradients and
        # applies them to the global network's variables.
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=flags.rmsp_alpha,
                                      momentum=0.0,
                                      epsilon=flags.rmsp_epsilon,
                                      clip_norm=flags.grad_norm_clip,
                                      device=device)

        for i in range(
                flags.parallel_size):  #Trainer creates a UnrealModel in init
            trainer = Trainer(i, self.global_network, initial_learning_rate,
                              learning_rate_input, grad_applier,
                              flags.env_type, flags.env_name,
                              flags.use_pixel_change, flags.use_value_replay,
                              flags.use_reward_prediction,
                              flags.pixel_change_lambda, flags.entropy_beta,
                              flags.local_t_max, flags.gamma, flags.gamma_pc,
                              flags.experience_history_size,
                              flags.max_time_step, device)
            self.trainers.append(trainer)

        # prepare session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        # summary for tensorboard
        self.score_input = tf.placeholder(tf.int32)
        tf.summary.scalar("score", self.score_input)

        self.summary_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                    self.sess.graph)

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())
        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            # Checkpoint paths look like ".../checkpoint-<global_t>"; recover
            # the step counter from the suffix after the last "-".
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
            # set wall time: a sidecar file written at save time holds the
            # accumulated wall-clock seconds, so elapsed-time stats survive
            # restarts.
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                # Round the next save threshold up to the next multiple of
                # save_interval_step.
                self.next_save_steps = (
                    self.global_t + SAVE_INTERVAL_STEP
                ) // flags.save_interval_step * flags.save_interval_step if False else (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step
        else:
            print("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        # run training threads
        ## Each Env is Running Here Parallel
        self.train_threads = []
        for i in range(flags.parallel_size):
            self.train_threads.append(
                threading.Thread(target=self.train_function, args=(i, True)))

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time (backdated by restored wall time so rate stats are
        # continuous across restarts)
        self.start_time = time.time() - self.wall_t

        for t in self.train_threads:
            t.start()

        print('Press Ctrl+C to stop')
        signal.pause()

    def save(self):
        """ Save checkpoint. Called from thread-0. """
        self.stop_requested = True

        # Wait for all other threads to stop
        for (i, t) in enumerate(self.train_threads):
            if i != 0:
                t.join()

        # Save
        if not os.path.exists(flags.checkpoint_dir):
            os.mkdir(flags.checkpoint_dir)

        # Write wall time alongside the checkpoint so run() can restore it.
        wall_t = time.time() - self.start_time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(wall_t_fname, 'w') as f:
            f.write(str(wall_t))

        print('Start saving.')
        self.saver.save(self.sess,
                        flags.checkpoint_dir + '/' + 'checkpoint',
                        global_step=self.global_t)
        print('End saving.')

        self.stop_requested = False
        self.next_save_steps += flags.save_interval_step

        # Restart other threads (thread 0, the caller, never stopped).
        for i in range(flags.parallel_size):
            if i != 0:
                thread = threading.Thread(target=self.train_function,
                                          args=(i, False))
                self.train_threads[i] = thread
                thread.start()

    def signal_handler(self, signal, frame):
        """SIGINT handler: request a permanent shutdown of all threads."""
        print('You pressed Ctrl+C!')
        self.terminate_reqested = True
class Application(object):
    """Driver for UNREAL training with one base trainer and several
    auxiliary-task trainers.

    run() builds the global network, an environment, a RunnerThread, a
    shared Experience buffer, one BaseTrainer and `flags.parallel_size`
    AuxTrainers, then starts one thread per trainer. The base thread owns
    checkpointing and process shutdown.
    """

    def __init__(self):
        # All real setup happens in run(); __init__ is intentionally empty.
        pass

    def base_train_function(self):
        """ Train routine for base_trainer.

        Loops until a stop/terminate request or the step limit, then stops
        the environment and hard-exits the process.
        """
        trainer = self.base_trainer

        # set start_time
        trainer.set_start_time(self.start_time, self.global_t)

        while True:
            if self.stop_requested:
                break
            if self.terminate_requested:
                break
            if self.global_t > flags.max_time_step:
                break
            if self.global_t > self.next_save_steps:
                # Save checkpoint
                logger.debug("Steps:{}".format(self.global_t))
                logger.debug(self.next_save_steps)
                self.save()

            diff_global_t = trainer.process(self.sess, self.global_t,
                                            self.summary_writer,
                                            self.summary_op,
                                            self.summary_values,
                                            flags.base_lambda)
            self.global_t += diff_global_t

        logger.warn("exiting training!")
        self.environment.stop()
        #sys.exit(0)
        # Give the environment a moment to shut down, then force-exit so the
        # daemon-less aux threads cannot keep the process alive.
        time.sleep(1)
        os._exit(0)

    def aux_train_function(self, aux_index):
        """ Train routine for aux_trainer.

        Waits until the base trainer has advanced past a short warm-up
        (global_t >= 500), then repeatedly runs the aux trainer's process
        step, pausing while a checkpoint save is in progress.
        """
        trainer = self.aux_trainers[aux_index]

        while True:
            if self.global_t < 500:
                # FIX: the original `continue` busy-spun at 100% CPU during
                # warm-up; sleep briefly while waiting for the base trainer.
                time.sleep(0.1)
                continue
            if self.stop_requested:
                # FIX: same busy-wait issue while a checkpoint save pauses us.
                time.sleep(0.1)
                continue
            if self.terminate_requested:
                break
            if self.global_t > flags.max_time_step:
                break

            diff_aux_t = trainer.process(self.sess, self.global_t, self.aux_t,
                                         self.summary_writer,
                                         self.summary_op_aux,
                                         self.summary_aux)
            self.aux_t += diff_aux_t
            #logger.debug("aux_t:{}".format(self.aux_t))

    def run(self):
        """Build networks, trainers and session; restore checkpoint; start
        the runner, base and aux threads."""
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"
        logger.debug("start App")

        initial_learning_rate = flags.initial_learning_rate
        self.global_t = 0
        self.aux_t = 0
        self.stop_requested = False
        self.terminate_requested = False

        logger.debug("getting action size...")
        # Vision configuration: channel selection plus input height/width.
        visinput = [flags.vision, flags.vis_h, flags.vis_w]
        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)

        # Setup Global Network (thread index -1 marks the shared network)
        logger.debug("loading global model...")
        self.global_network = UnrealModel(
            action_size, visinput, -1, flags.entropy_beta, device,
            flags.use_pixel_change, flags.use_value_replay,
            flags.use_reward_prediction, flags.use_temporal_coherence,
            flags.pixel_change_lambda, flags.temporal_coherence_lambda)
        logger.debug("done loading global model")

        learning_rate_input = tf.placeholder("float")

        # Setup gradient calculator
        #"""
        grad_applier = RMSPropApplier(
            learning_rate=learning_rate_input,
            #decay = flags.rmsp_alpha,
            momentum=0.0,
            #epsilon = flags.rmsp_epsilon,
            clip_norm=flags.grad_norm_clip,
            device=device)
        """
        grad_applier = AdamApplier(learning_rate = learning_rate_input,
                                   clip_norm=flags.grad_norm_clip,
                                   device=device)
        """

        # Start environment
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name, visinput)
        logger.debug("done loading environment")

        # Setup runner
        self.runner = RunnerThread(flags, self.environment,
                                   self.global_network, action_size, visinput,
                                   device, visualise)
        logger.debug("done setting up RunnerTread")

        # Setup experience (shared replay buffer between base and aux trainers)
        self.experience = Experience(flags.experience_history_size)

        #@TODO check device usage: should we build a cluster?
        # Setup Base Network
        self.base_trainer = BaseTrainer(
            self.runner, self.global_network, initial_learning_rate,
            learning_rate_input, grad_applier, visinput, flags.env_type,
            flags.env_name, flags.entropy_beta, flags.gamma, self.experience,
            flags.max_time_step, device)

        # Setup Aux Networks
        self.aux_trainers = []
        for k in range(flags.parallel_size):
            self.aux_trainers.append(
                AuxTrainer(
                    self.global_network,
                    k + 2,  #-1 is global, 0 is runnerthread, 1 is base
                    flags.use_pixel_change, flags.use_value_replay,
                    flags.use_reward_prediction,
                    flags.use_temporal_coherence, flags.pixel_change_lambda,
                    flags.temporal_coherence_lambda,
                    flags.aux_initial_learning_rate, learning_rate_input,
                    grad_applier, visinput, self.aux_t, flags.env_type,
                    flags.env_name, flags.entropy_beta, flags.local_t_max,
                    flags.gamma, flags.aux_lambda, flags.gamma_pc,
                    self.experience, flags.max_time_step, device))

        # Start tensorflow session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        self.init_tensorboard()

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())
        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            checkpointpath = checkpoint.model_checkpoint_path.replace(
                "/", "\\")
            logger.info("checkpoint loaded: {}".format(checkpointpath))
            # Path looks like ".../checkpoint-<global_t>"; recover the step.
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            logger.info(">>> global step set: {}".format(self.global_t))
            logger.info(">>> aux step: {}".format(self.aux_t))
            # set wall time from the sidecar file written by save()
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                # Round the next save threshold up to the next multiple of
                # save_interval_step.
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step
                logger.debug("next save steps:{}".format(self.next_save_steps))
        else:
            logger.info("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time (backdated so elapsed-time stats continue across runs)
        self.start_time = time.time() - self.wall_t

        # Start runner
        self.runner.start_runner(self.sess)

        # Start base_network thread
        self.base_train_thread = threading.Thread(
            target=self.base_train_function, args=())
        self.base_train_thread.start()

        # Start aux_network threads
        self.aux_train_threads = []
        for k in range(flags.parallel_size):
            self.aux_train_threads.append(
                threading.Thread(target=self.aux_train_function, args=(k, )))
            self.aux_train_threads[k].start()

        logger.debug(threading.enumerate())

        logger.info('Press Ctrl+C to stop')
        #signal.pause()

    def init_tensorboard(self):
        """Create all TensorBoard placeholders, scalar/image summaries and
        the FileWriter for both the base and aux trainers."""
        # tensorboard summary for base
        self.score_input = tf.placeholder(tf.int32)
        self.epl_input = tf.placeholder(tf.int32)
        self.policy_loss = tf.placeholder(tf.float32)
        self.value_loss = tf.placeholder(tf.float32)
        self.base_entropy = tf.placeholder(tf.float32)
        self.base_gradient = tf.placeholder(tf.float32)
        self.base_lr = tf.placeholder(tf.float32)
        self.laststate = tf.placeholder(
            tf.float32, [1, flags.vis_w, flags.vis_h, len(flags.vision)],
            name="laststate")
        score = tf.summary.scalar("env/score", self.score_input)
        epl = tf.summary.scalar("env/ep_length", self.epl_input)
        policy_loss = tf.summary.scalar("base/policy_loss", self.policy_loss)
        value_loss = tf.summary.scalar("base/value_loss", self.value_loss)
        entropy = tf.summary.scalar("base/entropy", self.base_entropy)
        gradient = tf.summary.scalar("base/gradient", self.base_gradient)
        lr = tf.summary.scalar("base/learning_rate", self.base_lr)
        laststate = tf.summary.image("base/laststate", self.laststate)
        # Placeholder list in the exact order the base trainer feeds them.
        self.summary_values = [
            self.score_input, self.epl_input, self.policy_loss,
            self.value_loss, self.base_entropy, self.base_gradient,
            self.base_lr, self.laststate
        ]
        self.summary_op = tf.summary.merge_all(
        )  # we want to merge model histograms as well here

        # tensorboard summary for aux
        self.summary_aux = []
        aux_losses = []
        self.aux_basep_loss = tf.placeholder(tf.float32)
        self.aux_basev_loss = tf.placeholder(tf.float32)
        self.aux_entropy = tf.placeholder(tf.float32)
        self.aux_gradient = tf.placeholder(tf.float32)
        self.summary_aux.append(self.aux_basep_loss)
        self.summary_aux.append(self.aux_basev_loss)
        aux_losses.append(
            tf.summary.scalar("aux/basep_loss", self.aux_basep_loss))
        aux_losses.append(
            tf.summary.scalar("aux/basev_loss", self.aux_basev_loss))
        # Optional aux-task losses are only created when the task is enabled,
        # so summary_aux's length varies with the flag configuration.
        if flags.use_pixel_change:
            self.pc_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.pc_loss)
            aux_losses.append(tf.summary.scalar("aux/pc_loss", self.pc_loss))
        if flags.use_value_replay:
            self.vr_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.vr_loss)
            aux_losses.append(tf.summary.scalar("aux/vr_loss", self.vr_loss))
        if flags.use_reward_prediction:
            self.rp_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.rp_loss)
            aux_losses.append(tf.summary.scalar("aux/rp_loss", self.rp_loss))
        if flags.use_temporal_coherence:
            self.tc_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.tc_loss)
            aux_losses.append(tf.summary.scalar("aux/tc_loss", self.tc_loss))
        # append entropy and gradient last
        self.summary_aux.append(self.aux_entropy)
        self.summary_aux.append(self.aux_gradient)
        aux_losses.append(tf.summary.scalar("aux/entropy", self.aux_entropy))
        aux_losses.append(tf.summary.scalar("aux/gradient", self.aux_gradient))
        self.summary_op_aux = tf.summary.merge(aux_losses)
        #self.summary_op = tf.summary.merge_all()

        tensorboard_path = flags.temp_dir + TRAINING_NAME + "/"
        logger.info("tensorboard path:" + tensorboard_path)
        if not os.path.exists(tensorboard_path):
            os.makedirs(tensorboard_path)
        self.summary_writer = tf.summary.FileWriter(tensorboard_path)
        self.summary_writer.add_graph(self.sess.graph)

    def save(self):
        """ Save checkpoint. Called from base_trainer. """
        # Pause the aux threads (they poll stop_requested) while saving.
        self.stop_requested = True

        # Save
        if not os.path.exists(flags.checkpoint_dir):
            os.mkdir(flags.checkpoint_dir)

        # Write wall time alongside the checkpoint so run() can restore it.
        wall_t = time.time() - self.start_time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(wall_t_fname, 'w') as f:
            f.write(str(wall_t))

        logger.info('Start saving.')
        self.saver.save(self.sess,
                        flags.checkpoint_dir + '/' + 'checkpoint',
                        global_step=self.global_t)
        logger.info('End saving.')

        self.stop_requested = False
        self.next_save_steps += flags.save_interval_step

    def signal_handler(self, signal, frame):
        """SIGINT handler: request a permanent shutdown of all threads."""
        logger.warn('Ctrl+C detected, shutting down...')
        logger.info('run name: {} -- terminated'.format(TRAINING_NAME))
        self.terminate_requested = True
sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") vars = {} var_list = global_network.get_vars() for v in var_list: vars[v.name] = v W_conv1 = sess.run(vars['net_-1/base_conv/W_base_conv1:0']) # show graph of W_conv1 fig, axes = plt.subplots(3, 16, figsize=(12, 6), subplot_kw={ 'xticks': [], 'yticks': [] }) fig.subplots_adjust(hspace=0.1, wspace=0.1)
class Trainer(object):
    """A3C/UNREAL worker thread.

    Owns a local UnrealModel synced from a shared global network, an
    environment, and an experience replay buffer. process() performs one
    on-policy A3C rollout plus the enabled auxiliary tasks (pixel change,
    value replay, reward prediction, future reward prediction, autoencoder)
    and applies the combined gradients to the global network.
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 use_future_reward_prediction, use_autoencoder, reward_length,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device, log_file, skip_step):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.use_future_reward_prediction = use_future_reward_prediction
        self.use_autoencoder = use_autoencoder
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.skip_step = skip_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        # Local copy of the model; thread_index namespaces its variables.
        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         use_future_reward_prediction,
                                         use_autoencoder,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()

        # Gradients computed on the local vars are applied to the global vars.
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        # Op that copies global weights into the local network.
        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size,
                                     reward_length)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0
        self.log_file = log_file
        # Pickle file accumulating periodic autoencoder prediction snapshots.
        self.prediction_res_file = log_file + '/' + 'res.pkl'

    def prepare(self):
        """Create this worker's environment (deferred from __init__)."""
        self.environment = Environment.create_environment(
            self.env_type, self.env_name, self.skip_step)

    def stop(self):
        """Shut down this worker's environment."""
        self.environment.stop()

    def add_summary(self, step, name, value, writer):
        """Write a single scalar `value` under tag `name` at `step` without
        going through a placeholder/summary-op round trip."""
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = float(value)
        summary_value.tag = name
        writer.add_summary(summary, step)

    def _anneal_learning_rate(self, global_time_step):
        """Linearly decay the learning rate to 0 at max_global_time_step."""
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index from the policy distribution pi_values."""
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """Run the episode-score summary op and flush it to TensorBoard."""
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """ Fill experience buffer until buffer is full. """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        # Policy input includes a one-hot of the last action plus the reward.
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        """Periodically print throughput stats (thread 0 only)."""
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        """Roll out up to local_t_max on-policy steps and build the A3C batch.

        Returns (states, last_action_rewards, one-hot actions, advantages,
        discounted returns, LSTM state at rollout start).
        """
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # LSTM state before the rollout; training replays from this state.
        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # Bootstrap the return from V(s_T) unless the episode terminated.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        # Walk the rollout backwards accumulating discounted returns.
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        # Restore chronological order for the feed dict.
        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        """Build the pixel-change auxiliary batch from replayed frames."""
        # [pixel change]
        # Sample 20+1 frame (+1 for last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        # 20x20 is the pixel-change grid; bootstrap from max-Q unless the
        # sampled sequence ends at a terminal frame.
        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        """Build the value-replay auxiliary batch from replayed frames."""
        # [Value replay]
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        # Bootstrap from V(s) unless the sequence ends at a terminal frame.
        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    # NOTE(review): dead code kept as a bare string literal (not a comment);
    # superseded by _process_replay below.
    '''
    def _process_rp(self):
        # [Reward prediction]
        rp_experience_frames, total_raw_reward, _ = self.experience.sample_rp_sequence()
        # 4 frames
        batch_rp_si = []
        batch_rp_c = []
        for i in range(4):
            batch_rp_si.append(rp_experience_frames[i].state)
        # one hot vector for target reward
        r = total_raw_reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0 # zero
        elif r > 0:
            rp_c[1] = 1.0 # positive
        else:
            rp_c[2] = 1.0 # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c
    '''

    def _process_replay(self, action=False):
        """Build a reward-prediction batch (4 stacked frames, 3-class
        reward target). With action=True, also return the one-hot action of
        the last frame for future-reward prediction.

        Returns [states, targets, next_frame] (+ [action_one_hot] if
        action=True).
        """
        # [Reward prediction]
        rp_experience_frames, total_raw_reward, next_frame = self.experience.sample_rp_sequence(
            flag=True)
        # 4 frames
        batch_rp_si = []
        batch_rp_c = []

        for i in range(4):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward
        r = total_raw_reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        result = [batch_rp_si, batch_rp_c, next_frame]
        if action:
            batch_rp_action = []
            action_index = rp_experience_frames[3].action
            action_one_hot = np.zeros([self.action_size])
            action_one_hot[action_index] = 1.0
            batch_rp_action.append(action_one_hot)
            result.append(batch_rp_action)
        return result

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        """Run one training iteration: sync from the global network, roll out
        the base A3C batch, attach enabled auxiliary-task batches, apply
        gradients, log losses. Returns the number of local steps advanced
        (0 while the replay buffer is still filling)."""
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
            self._process_base(sess, global_t, summary_writer, summary_op, score_input)

        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        next_frame = None
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c, next_frame = self._process_replay()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # [Future reward prediction]
        if self.use_future_reward_prediction:
            batch_frp_si, batch_frp_c, next_frame, batch_frp_action = self._process_replay(
                action=True)
            frp_feed_dict = {
                self.local_network.frp_input: batch_frp_si,
                self.local_network.frp_c_target: batch_frp_c,
                self.local_network.frp_action_input: batch_frp_action
            }
            feed_dict.update(frp_feed_dict)

        # NOTE(review): relies on ExperienceFrame truthiness (default True
        # for any object) — presumably next_frame is None or a frame; verify.
        if next_frame and self.use_autoencoder:
            ae_feed_dict = {
                self.local_network.ground_truth:
                np.expand_dims(next_frame.state, axis=0)
            }
            feed_dict.update(ae_feed_dict)

        # Calculate gradients and copy them to global network.
        #sess.run( self.apply_gradients, feed_dict=feed_dict)
        ln = self.local_network
        if self.use_future_reward_prediction:
            if self.use_autoencoder:
                frp_c, decoder_loss, frp_loss, value_loss, policy_loss, _ = sess.run(
                    [
                        ln.frp_c, ln.decoder_loss, ln.frp_loss, ln.value_loss,
                        ln.policy_loss, self.apply_gradients
                    ],
                    feed_dict=feed_dict)
                self.add_summary(global_t, 'decoder_loss', decoder_loss,
                                 summary_writer)
                self.add_summary(global_t, 'frp_loss', frp_loss,
                                 summary_writer)
            else:
                frp_c, value_loss, policy_loss, _ = sess.run(
                    [
                        ln.frp_c, ln.value_loss, ln.policy_loss,
                        self.apply_gradients
                    ],
                    feed_dict=feed_dict)
            # Fraction-style accuracy: 1.0 when the argmax class matches the
            # one-hot target (batch size is 1 here).
            acc = ((frp_c == frp_c.max()) * batch_frp_c).sum()
            self.add_summary(global_t, 'reward prediction accuracy', acc,
                             summary_writer)
        else:
            value_loss, policy_loss, _ = sess.run(
                [ln.value_loss, ln.policy_loss, self.apply_gradients],
                feed_dict=feed_dict)
        self.add_summary(global_t, 'value_loss', value_loss, summary_writer)
        self.add_summary(global_t, 'policy_loss', policy_loss, summary_writer)
        self.add_summary(global_t, 'base_loss', policy_loss + value_loss,
                         summary_writer)

        # Periodically snapshot autoencoder predictions to a pickle file.
        if self.use_autoencoder and global_t % 25000 == 0:
            current_res = {
                'next_frame_ground_truth': next_frame,
                'step': global_t
            }
            # NOTE(review): if use_autoencoder is set while BOTH reward
            # prediction flags are off, predicted_frame/predicted_reward are
            # never assigned below and this raises NameError — confirm that
            # configuration is disallowed upstream.
            if self.use_reward_prediction:
                predicted_frame, predicted_reward = sess.run(
                    [
                        self.local_network.encoder_output,
                        self.local_network.rp_c
                    ],
                    feed_dict=feed_dict)
                current_res['states'] = batch_rp_si
                current_res['target_reward'] = batch_rp_c
            elif self.use_future_reward_prediction:
                predicted_frame, predicted_reward = sess.run(
                    [
                        self.local_network.encoder_output,
                        self.local_network.frp_c
                    ],
                    feed_dict=feed_dict)
                current_res['states'] = batch_frp_si
                current_res['target_reward'] = batch_frp_c
                current_res['action'] = batch_frp_action
            current_res['next_frame_prediction'] = predicted_frame
            current_res['next_reward_prediction'] = predicted_reward
            # Append to the existing pickle (read-modify-write of the whole
            # list; file grows over time).
            if os.path.exists(self.prediction_res_file) and os.path.getsize(
                    self.prediction_res_file) > 0:
                with open(self.prediction_res_file, 'rb') as f:
                    res = pickle.load(f)
            else:
                res = []
            res.append(current_res)
            with open(self.prediction_res_file, 'wb') as f:
                pickle.dump(res, f)

        self._print_log(global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t