def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             action_size, gamma, local_t_max, entropy_beta, agent_type,
             performance_log_interval, log_level, random_seed):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.action_size = action_size
    self.gamma = gamma
    self.local_t_max = local_t_max
    self.agent_type = agent_type
    self.performance_log_interval = performance_log_interval
    self.log_level = log_level

    if self.agent_type == 'LSTM':
        self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, thread_index, device)
    self.local_network.prepare_loss(entropy_beta)

    with tf.device(device):
        var_refs = []
        variables = self.local_network.get_vars()
        for v in variables:
            var_refs.append(v)
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    np.random.seed(random_seed)
    self.game_state = GameState(random_seed * thread_index, self.action_size)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = self.initial_learning_rate
    self.reset_counters()
    self.episode = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             task_index=""):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    if global_network:
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)
        self.sync = self.local_network.sync_from(global_network)
        self.mode = "threading"
    else:
        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(),
            self.gradients)
        self.mode = "dist_tensor"

    if not task_index:
        self.game_state = GameState(113 * thread_index)
    else:
        self.game_state = GameState(113 * task_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, initial_learning_rate, learning_rate_input,
             grad_applier, max_global_time_episode, device, arrived_jobs, condition):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_episode = max_global_time_episode
    # Use thread_index (the machine id) to fetch all operations processed on this machine.
    self.operations = get_data_by_machine(thread_index)
    self.condition = condition
    self.is_terminal_counted = False
    self.last_episode_reward = 0

    if USE_LSTM:
        # The first argument is the action size: the number of operations waiting on this machine.
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        # The first argument is the action size: the number of operations waiting on this machine.
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(),
        self.gradients)
    # self.sync = self.local_network.sync_from(global_network)
    # self.game_state = GameState(113 * thread_index)

    # Build the job-shop scheduling environment for these operations.
    self.env = JspEnv(self.operations, thread_index, arrived_jobs)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v.ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

    tempdir = os.path.join(os.getcwd(), "results")
    self.res_file = os.path.join(tempdir, RESULTS_FILE)
    with open(self.res_file, 'w') as res_file:
        res_file.write('itr,mean_score,max,min,std,runs,test_steps\n')
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if NETWORK_TYPE == 'LSTM':
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    elif NETWORK_TYPE == 'DILATED':
        self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
    elif NETWORK_TYPE == 'CONV':
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    # watch out: update global_network
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
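# Note (not part of the original source): a sketch of how the three ops built
# above are typically driven per rollout when an accumulating trainer is used --
# clear the accumulators, add one gradient contribution per step, then apply:
#
#   sess.run(self.reset_gradients)
#   for each step of the rollout:
#       sess.run(self.accum_gradients, feed_dict=...)   # adds this step's grads
#   sess.run(self.apply_gradients,
#            feed_dict={self.learning_rate_input: cur_learning_rate})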
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device):
        """
        Trains the local actor-critic (AC) network of the A3C algorithm.

        :param thread_index: thread id; -1 denotes the global AC network
        :param global_network:
        :param initial_learning_rate:
        :param learning_rate_input:
        :param grad_applier: gradient-applier object; the paper uses RMSProp
        :param max_global_time_step:
        :param device:
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # Build the local network and its loss.
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # Collect the gradients of the loss w.r.t. each trainable parameter.
        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(
                self.local_network.total_loss, var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        # TF op that applies the gradients to the global network.
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)

        # Each local AC network syncs its parameters from the global AC network.
        self.sync = self.local_network.sync_from(global_network)

        # Wrap the game.
        self.game_state = GameState()

        # Counts local time steps.
        self.local_t = 0

        # Miscellaneous training parameters.
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        # Controls log output.
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        """
        Linearly anneal the learning rate, mainly to avoid oscillating back and
        forth around the minimum of the loss.

        :param global_time_step: number of time steps played so far
        :return: the annealed learning rate
        """
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        """
        Sample an action from the policy's output distribution.

        :param pi_values: policy probabilities
        :return: a sampled action
        """
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def set_start_time(self, start_time):
        """
        Set the start time.
        """
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op,
                learning_rate_input, score_input):
        """
        Run one training iteration of the local AC network.

        :param sess:
        :param global_t:
        :param summary_writer:
        :param summary_op:
        :param learning_rate_input:
        :param score_input:
        :return: number of local steps advanced
        """
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # Copy parameters from the global AC network.
        sess.run(self.sync)

        start_local_t = self.local_t
        start_lstm_state = self.local_network.lstm_state_out

        # Roll out at most LOCAL_T_MAX (t_max) steps.
        for i in range(LOCAL_T_MAX):
            # Query policy and value for the current game state.
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            # Accumulate the rollout.
            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # Only the first local AC network logs, and only periodically.
            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Execute the action.
            self.game_state.process(action)

            # Get the reward; GameState also handles stacking 4 consecutive frames.
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            # Accumulate the reward; clip it to [-1, 1] for training.
            self.episode_reward += reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # Advance the state (s_t1 -> s_t).
            self.game_state.update()

            # See Algorithm 3 in the paper's appendix.
            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))
                self.game_state.reset()
                # Reset the LSTM state carried across steps.
                self.local_network.reset_state()
                break

        # Compute the discounted return R: bootstrap from the value of the last
        # state if the episode did not terminate.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # Iterate over the (a, r, s, V) tuples; as in the paper, time runs backwards here.
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if terminal_end:
            summary_str = sess.run(summary_op, feed_dict={
                score_input: self.episode_reward,
                learning_rate_input: cur_learning_rate})
            summary_writer.add_summary(summary_str, global_t)
            summary_writer.flush()
            # Reset the episode record.
            self.episode_reward = 0

        batch_si.reverse()
        batch_a.reverse()
        batch_td.reverse()
        batch_R.reverse()

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                     self.learning_rate_input: cur_learning_rate})

        # Wall clock time, as reported on page 6 of the paper.
        if (self.thread_index == 0) and \
                (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # Number of local steps advanced during this rollout of up to LOCAL_T_MAX samples.
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
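# A minimal worked example (hypothetical numbers, not from a real run) of the
# reversed n-step return computed in process() above: rewards are consumed
# newest-to-oldest, starting from the bootstrapped value R of the final state,
# and the advantage fed to the policy loss is td_i = R_i - V_i.
GAMMA_EXAMPLE = 0.99
rewards_example = [0.0, 0.0, 1.0]   # r_0, r_1, r_2 of a three-step rollout
values_example = [0.5, 0.6, 0.9]    # V(s_0), V(s_1), V(s_2) from the critic
R_example = 1.0                     # bootstrap: V(s_3) of the last state
for r, v in zip(reversed(rewards_example), reversed(values_example)):
    R_example = r + GAMMA_EXAMPLE * R_example
    print("R={:.4f}  td={:.4f}".format(R_example, R_example - v))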
def display(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip, agent_type, action_size, rand_seed, checkpoint_dir, display_time_sleep, display_episodes, display_log_level, display_save_log, show_max): # use CPU for display tool device = "/cpu:0" LOG_FILE = 'log_{}-{}.txt'.format(experiment_name, agent_type) if agent_type == 'LSTM': global_network = GameACLSTMNetwork(action_size, -1, device) else: global_network = GameACFFNetwork(action_size, -1, device) learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=rmsp_alpha, momentum=0.0, epsilon=rmsp_epsilon, clip_norm=grad_norm_clip, device=device) sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") episode = 0 terminal = False episode_rewards = [] episode_steps = [] episode_passed_obsts = [] print ' ' print 'DISPLAYING {} EPISODES'.format(display_episodes) print '--------------------------------------------------- ' while not episode == display_episodes: episode_reward = 0 episode_passed_obst = 0 game_state = GameState(rand_seed, action_size, show_score=True) if display_log_level == 'FULL': print 'EPISODE {}'.format(episode) full_frame = None while True: pi_values, value = global_network.run_policy_and_value( sess, game_state.s_t) action = choose_action(pi_values) game_state.process(action) terminal = game_state.terminal episode_step = game_state.steps reward = game_state.reward passed_obst = game_state.passed_obst if len(episode_passed_obsts) == 0 and show_max: if passed_obst > 0: full_frame = game_state.full_frame elif episode_passed_obst > np.max( episode_passed_obsts) and show_max: full_frame = game_state.full_frame episode_reward += reward episode_passed_obst = passed_obst if display_log_level == 'FULL': print 'step / pi_values: {} / value: {} / action: {} / reward: {} / passed_obst: {}'.format( pi_values, value, action, reward, passed_obst) time.sleep(display_time_sleep) if not terminal: game_state.update() else: break episode_rewards.append(episode_reward) episode_steps.append(episode_step) episode_passed_obsts.append(episode_passed_obst) if not display_log_level == 'NONE': reward_steps = format( float(episode_reward) / float(episode_step), '.4f') print "EPISODE: {} / STEPS: {} / PASSED OBST: {} / REWARD: {} / REWARD/STEP: {}".format( episode, episode_step, passed_obst, episode_reward, reward_steps) if display_save_log: with open(LOG_FILE, "a") as text_file: text_file.write('{},{},{},{},{}\n'.format( episode, episode_step, passed_obst, episode_reward, reward_steps)) episode += 1 print '--------------------------------------------------- ' print 'DISPLAY SESSION FINISHED' print 'TOTAL EPISODES: {}'.format(display_episodes) print ' ' print 'MIN' print 'REWARD: {} / STEPS: {} / PASSED OBST: {}'.format( np.min(episode_rewards), np.min(episode_steps), np.min(episode_passed_obsts)) print ' ' print 'AVERAGE' print 'REWARD: {} / STEPS: {} / PASSED OBST: {}'.format( np.average(episode_rewards), np.average(episode_steps), np.average(episode_passed_obsts)) print ' ' print 'MAX' print 'REWARD: {} / STEPS: {} / PASSED OBST: {}'.format( np.max(episode_rewards), np.max(episode_steps), np.max(episode_passed_obsts)) if show_max and not full_frame == None: 
plt.imshow(full_frame, origin='lower') plt.show()
def run_a3c_test(args): """Run A3C testing.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = args.folder else: folder = 'results/a3c/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 \ or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' if args.padding == 'SAME': end_str += '_same' folder += end_str folder = pathlib.Path(folder) demo_memory_cam = None demo_cam_human = False if args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME) demo_memory_folder = pathlib.Path(demo_memory_folder) if args.demo_cam_id is not None: demo_cam_human = True demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] logger.info("loaded demo {} for testing CAM".format( args.demo_cam_id)) else: demo_cam_folder = pathlib.Path(args.demo_cam_folder) demo_cam = ReplayMemory() demo_cam.load(name='test_cam', folder=demo_cam_folder) logger.info("loaded demo {} for testing CAM".format( str(demo_cam_folder / 'test_cam'))) demo_memory_cam = np.zeros( (len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0, _, _, _, _, _, t1, _ = demo_cam[i] demo_memory_cam[i] = np.copy(s0) del demo_cam device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( 
learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = \ args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" if args.use_lstm: local_network = GameACLSTMNetwork(action_size, 0, device) else: local_network = GameACFFNetwork( action_size, 0, device, padding=args.padding, in_shape=input_shape) testing_thread = A3CTrainingThread( 0, global_network, local_network, initial_learning_rate, learning_rate_input, grad_applier, 0, device=device) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' # TODO: make this an argument transfer_folder += end_str transfer_folder = pathlib.Path(transfer_folder) transfer_folder /= 'transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list, ) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f] if len(not_initialized_vars): 
sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) else: logger.warning("Could not find old checkpoint") def test_function(): nonlocal global_t if args.use_transfer: from_folder = str(transfer_folder).split('/')[-2] else: from_folder = str(folder).split('/')[-1] from_folder = pathlib.Path(from_folder) save_folder = 'results/test_model/a3c' / from_folder prepare_dir(str(save_folder), empty=False) prepare_dir(str(save_folder / 'frames'), empty=False) # Evaluate model before training if not stop_requested: testing_thread.testing_model( sess, args.eval_max_steps, global_t, save_folder, demo_memory_cam=demo_memory_cam, demo_cam_human=demo_cam_human) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) test_thread = threading.Thread(target=test_function, args=()) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) test_thread.start() print('Press Ctrl+C to stop') test_thread.join() sess.close()
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device, options):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
        self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(options.action_size, device)
    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(random.randint(0, 2**16), options,
                                thread_index=thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.indent = " |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
        self.max_reward = 0.0
        self.max_episode_reward = 0.0
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        self.episode_values = []
        self.episode_liveses = []
        self.episode_scores = Episode_scores(options)
        self.tes = self.options.train_episode_steps
        if self.options.tes_list is not None:
            self.tes = self.options.tes_list[thread_index]
            print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
        self.initial_lives = self.game_state.initial_lives
        self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
        if self.thread_index == 0:
            if not os.path.exists(self.options.record_new_record_dir):
                os.makedirs(self.options.record_new_record_dir)
        self.episode_screens = []

    if self.options.record_new_room_dir is not None:
        if self.thread_index == 0:
            if not os.path.exists(self.options.record_new_room_dir):
                os.makedirs(self.options.record_new_room_dir)
        self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0
device = "/cpu:0" if USE_GPU: device = "/gpu:0" initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) global_t = 0 stop_requested = False global_game = DoomGameState(scenario_path="scenarios/cig.cfg") if USE_LSTM: global_network = GameACLSTMNetwork(global_game.get_action_size(), -1, device) else: global_network = GameACFFNetwork(global_game.get_action_size(), -1, device) del global_game training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device)
# -*- coding: utf-8 -*-
import tensorflow as tf
import matplotlib.pyplot as plt

from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier

import options
options = options.options

# use CPU for weight visualize tool
device = "/cpu:0"

if options.use_lstm:
    global_network = GameACLSTMNetwork(options.action_size, -1, device)
else:
    global_network = GameACFFNetwork(options.action_size, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=options.rmsp_alpha,
                              momentum=0.0,
                              epsilon=options.rmsp_epsilon,
                              clip_norm=options.grad_norm_clip,
                              device=device)

sess = tf.Session()
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, 11, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        with tf.device(device):
            var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(
                self.local_network.total_loss, var_refs,
                gate_gradients=False,
                aggregation_method=None,
                colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0
        self.epSteps = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
            self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        states2 = []
        actions = []
        comms = []
        rewards = []
        values = []

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t
        start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, comm_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t, self.game_state.s2)
            action = self.choose_action(pi_)
            comm = self.choose_action(comm_)

            states.append(self.game_state.s_t)
            states2.append(self.game_state.s2)
            actions.append(action)
            comms.append(comm)
            values.append(value_)

            # process game
            self.game_state.process(action, comm)

            # receive game result
            reward = self.game_state.reward
            self.episode_reward += reward

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))
                print(" R={}".format(reward))

            # clip reward
            # rewards.append(np.clip(reward, -1, 1))
            rewards.append(reward)

            self.local_t += 1
            self.epSteps += 1

            # s_t1 -> s_t
            self.game_state.update()

            if self.epSteps >= 100:
                self.epSteps = 0
                if (self.thread_index == 0 and self.local_t % LOG_INTERVAL == 0):
                    print("score={}".format(self.episode_reward))
                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)
                self.episode_reward = 0
                self.game_state.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        states2.reverse()
        rewards.reverse()
        values.reverse()
        comms.reverse()

        batch_si = []
        batch_s2 = []
        batch_a = []
        batch_c = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ci, s2i) in zip(actions, rewards, states, values,
                                             comms, states2):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1
            c = np.zeros([5])
            c[ci] = 1

            batch_si.append(si)
            batch_s2.append(s2i)
            batch_a.append(a)
            batch_c.append(c)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        batch_si.reverse()
        batch_s2.reverse()
        batch_a.reverse()
        batch_c.reverse()
        batch_td.reverse()
        batch_R.reverse()

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.comm: batch_c,
                     self.local_network.s2: batch_s2,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                     self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and \
                (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
from constants import ACTION_SIZE
from constants import CHECKPOINT_DIR
from constants import RMSP_EPSILON
from constants import RMSP_ALPHA
from constants import GRAD_NORM_CLIP


def choose_action(pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)


# Use the CPU so the model can be inspected while training is running.
device = "/cpu:0"

global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
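# Once the saver exists, the latest training checkpoint can be restored so the
# global network can be inspected; a short sketch following the same restore
# pattern used by the display tool elsewhere in this code (CHECKPOINT_DIR is
# imported above):
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
else:
    print("Could not find old checkpoint")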
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device=None, pretrained_model=None, pretrained_model_sess=None,
             advice=False, reward_shaping=False):
    assert self.action_size != -1

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.use_pretrained_model_as_advice = advice
    self.use_pretrained_model_as_reward_shaping = reward_shaping

    logger.info("thread_index: {}".format(self.thread_index))
    logger.info("local_t_max: {}".format(self.local_t_max))
    logger.info("use_lstm: {}".format(
        colored(self.use_lstm, "green" if self.use_lstm else "red")))
    logger.info("action_size: {}".format(self.action_size))
    logger.info("entropy_beta: {}".format(self.entropy_beta))
    logger.info("gamma: {}".format(self.gamma))
    logger.info("reward_type: {}".format(self.reward_type))
    logger.info("finetune_upper_layers_only: {}".format(
        colored(self.finetune_upper_layers_only,
                "green" if self.finetune_upper_layers_only else "red")))
    logger.info("use_pretrained_model_as_advice: {}".format(
        colored(self.use_pretrained_model_as_advice,
                "green" if self.use_pretrained_model_as_advice else "red")))
    logger.info("use_pretrained_model_as_reward_shaping: {}".format(
        colored(self.use_pretrained_model_as_reward_shaping,
                "green" if self.use_pretrained_model_as_reward_shaping else "red")))
    logger.info("transformed_bellman: {}".format(
        colored(self.transformed_bellman,
                "green" if self.transformed_bellman else "red")))
    logger.info("clip_norm: {}".format(self.clip_norm))
    logger.info("use_grad_cam: {}".format(
        colored(self.use_grad_cam, "green" if self.use_grad_cam else "red")))

    if self.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015
        self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device)
    else:
        GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015
        self.local_network = GameACFFNetwork(self.action_size, thread_index, device)

    with tf.device(device):
        self.local_network.prepare_loss(
            entropy_beta=self.entropy_beta, critic_lr=0.5)
        local_vars = self.local_network.get_vars
        if self.finetune_upper_layers_only:
            local_vars = self.local_network.get_vars_upper
        var_refs = [v._ref() for v in local_vars()]
        self.gradients = tf.gradients(self.local_network.total_loss, var_refs)

    global_vars = global_network.get_vars
    if self.finetune_upper_layers_only:
        global_vars = global_network.get_vars_upper

    with tf.device(device):
        if self.clip_norm is not None:
            self.gradients, grad_norm = tf.clip_by_global_norm(
                self.gradients, self.clip_norm)
        self.gradients = list(zip(self.gradients, global_vars()))
        self.apply_gradients = grad_applier.apply_gradients(self.gradients)
        # self.apply_gradients = grad_applier.apply_gradients(
        #     global_vars(),
        #     self.gradients)

    self.sync = self.local_network.sync_from(
        global_network, upper_layers_only=self.finetune_upper_layers_only)

    self.game_state = GameState(env_id=self.env_id, display=False,
                                no_op_max=30, human_demo=False,
                                episode_life=True)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.episode_steps = 0

    # variable controlling log output
    self.prev_local_t = 0

    self.is_demo_thread = False

    with tf.device(device):
        if self.use_grad_cam:
            self.action_meaning = self.game_state.env.unwrapped.get_action_meanings()
            self.local_network.build_grad_cam_grads()

    self.pretrained_model = pretrained_model
    self.pretrained_model_sess = pretrained_model_sess
    self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0
    self.advice_ctr = 0
    self.shaping_ctr = 0
    self.last_rho = 0.

    if self.use_pretrained_model_as_advice \
            or self.use_pretrained_model_as_reward_shaping:
        assert self.pretrained_model is not None
from game_ac_network import GameACLSTMNetwork
from constants import ACTION_SIZE
from constants import CHECKPOINT_DIR
from constants import USE_GPU
from game_state import GameState

device = "/cpu:0"
if USE_GPU:
    device = "/gpu:0"

global_t = 0

stop_requested = False

global_network = GameACLSTMNetwork(ACTION_SIZE, -1, 11, device)

training_threads = []

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=True))

init = tf.global_variables_initializer()
sess.run(init)

# summary for tensorboard


def choose_action(pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)
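# The "summary for tensorboard" section above is otherwise empty in this
# snippet; a minimal sketch (placeholder name and log directory are
# assumptions) of the usual scalar-score summary consumed by
# A3CTrainingThread.process():
score_input = tf.placeholder(tf.float32, name="score")
tf.summary.scalar("score", score_input)
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(CHECKPOINT_DIR, sess.graph)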
def make_network():
    if USE_LSTM:
        return GameACLSTMNetwork(ACTION_SIZE, -1, device)
    else:
        return GameACFFNetwork(ACTION_SIZE, device)
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device, training, cooperative, delay_delta): self.delay_delta = delay_delta logging.info(" ".join( map(str, ("delay_delta", delay_delta, "cooperative", cooperative)))) self.training = training self.cooperative = cooperative self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACLSTMNetwork(thread_index, device) self.local_network.prepare_loss() with tf.device(device): var_refs = [v._ref() for v in self.local_network.get_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) self.apply_gradients = grad_applier.apply_gradients( zip(self.gradients, global_network.get_vars())) self.sync = self.local_network.sync_from(global_network) self.episode_count = 0 self.backup_vars = self.local_network.backup_vars() self.restore_backup = self.local_network.restore_backup() self.initial_learning_rate = initial_learning_rate def inverse_sigmoid(self, x): return (1 + math.exp(-self.delay_delta)) / ( 1 + math.exp(SIGMOID_ALPHA * (x - self.delay_delta))) def reset_state_and_reinitialize(self, sess): self.local_network.reset_state() # action, value_ = self.local_network.run_action_and_value(sess, [0.0]*STATE_SIZE) def get_network_vars(self): return self.local_network.get_vars() def _anneal_learning_rate(self, global_time_step): # learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step # if learning_rate < 0.0: # learning_rate = 0.0 # return learning_rate return self.initial_learning_rate def _record_score(self, sess, summary_writer, summary_op, summary_inputs, things, global_t): # print("window in _record_score", self.windows, self.time_differences) feed_dict = {} for key in things.keys(): feed_dict[summary_inputs[key]] = things[key] summary_str = sess.run(summary_op, feed_dict=feed_dict) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def start_anew(self): # print("self.windows", self.windows) assert (len(self.windows) > 0) current_index = 0 while current_index < len(self.windows): # print("current_index", current_index) if current_index + math.floor( A3CTrainingThread.get_actual_window( self.windows[current_index] + self.actions[current_index])) == len(self.windows): return True current_index = current_index + math.floor( A3CTrainingThread.get_actual_window( self.windows[current_index] + self.actions[current_index])) # print("current_index afterwards", current_index) return False def start_anew_index(self): assert (len(self.windows) > 0) current_index = 0 while current_index < len(self.windows): current_index = current_index + math.floor( A3CTrainingThread.get_actual_window( self.windows[current_index] + self.actions[current_index])) # print("current_index in start_anew", current_index) return current_index def action_step(self, sess, state, tickno, window): # print(self.thread_index, "in action") # Run this still with the old weights, before syncing them # print("state", state) assert (np.all(np.isfinite(np.array(state, dtype=np.float32)))) # print(self.thread_index, "state", state) if self.training: self.estimated_values.append( self.local_network.run_value(sess, state)) # if len(self.actions) % LOCAL_T_MAX == 0: if not (len(self.start_lstm_states) == 0) 
== (len(self.actions) == 0): print("Oh no, something went pretty wrong:", self.start_lstm_states, self.actions) assert ((len(self.start_lstm_states) == 0) == (len( self.actions) == 0)) if ('LOCAL_T_MAX' in globals() and len(self.actions) % globals()["LOCAL_T_MAX"] == 0) or (not 'LOCAL_T_MAX' in globals() and (len(self.actions) == 0 or self.start_anew())): # print("Starting new period") self.time_differences.append(None) # Sync for the next iteration sess.run(self.sync) self.start_lstm_states.append( (self.local_network.lstm_state_out_action, self.local_network.lstm_state_out_value, self.local_network.lstm_state_out_duration)) self.variable_snapshots.append( sess.run(self.local_network.get_vars())) if self.training: action, value_ = self.local_network.run_action_and_value( sess, state) else: assert (False) action = self.local_network.run_action(sess, state) # logging.debug(" ".join(map(str,(self.thread_index,"pi_values:",pi_)))) if self.training: self.states.append(state) self.ticknos.append(tickno) self.windows.append(window) self.actions.append(action) self.values.append(value_) # if self.local_t % LOG_INTERVAL == 0: # logging.debug("{}: pi={}".format(self.thread_index, pi_)) # logging.debug("{}: V={}".format(self.thread_index, value_)) # print(self.thread_index, action[0]) return action def reward_step(self, sess, global_t, summary_writer, summary_op, summary_inputs, reward_throughput, reward_delay, duration, sent): # print(self.thread_index, "in reward") assert (reward_throughput >= 0) assert (reward_delay >= 0) # print("duration", duration) assert (duration >= 0) assert (sent >= 0) assert (len(self.rewards) <= 2 * MAX_WINDOW) self.rewards.append((reward_throughput, reward_delay, duration, sent)) # if len(self.rewards)>=LOCAL_T_MAX or (len([item for item in self.actions[:LOCAL_T_MAX] if item is not None]) == len(self.rewards) and len(self.rewards) > 0 and self.time_differences[0] is not None): if ('LOCAL_T_MAX' in globals() and (len(self.rewards) >= globals()["LOCAL_T_MAX"] or (len([ item for item in self.actions[:globals()["LOCAL_T_MAX"]] if item is not None ]) == len(self.rewards) and len(self.rewards) > 0 and self.time_differences[0] is not None)) ) or ( not 'LOCAL_T_MAX' in globals() and (len(self.rewards) >= math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0])) or (len([ item for item in self.actions[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] if item is not None ]) == len(self.rewards) and len(self.rewards) > 0 and self.time_differences[0] is not None))): if not 'LOCAL_T_MAX' in globals(): assert (len(self.rewards) == math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]) ) or (len([ item for item in self.actions[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] if item is not None ]) == len(self.rewards) and len(self.rewards) > 0 and self.time_differences[0] is not None)) # print(self.thread_index, "rewards", self.rewards, "actions", self.actions, "time_diffs", self.time_differences) # assert(len(self.rewards) <= LOCAL_T_MAX) # print(len([item for item in self.actions[:LOCAL_T_MAX] if item is not None]), len(self.rewards[:LOCAL_T_MAX])) # assert(len([item for item in self.actions[:LOCAL_T_MAX] if item is not None]) == len(self.rewards[:LOCAL_T_MAX])) if not len([ item for item in self.actions[:len(self.rewards)] if item is not None ]) == len(self.rewards[:len(self.rewards)]): print("actions", self.actions, "rewards", self.rewards) assert (len([ 
item for item in self.actions[:len(self.rewards)] if item is not None ]) == len(self.rewards[:len(self.rewards)])) result = self.process(sess, global_t, summary_writer, summary_op, summary_inputs, self.time_differences[0]) return result else: return 0 def final_step(self, sess, global_t, summary_writer, summary_op, summary_inputs, actions_to_remove, time_difference, window): # print(self.thread_index, "self.time_differences", self.time_differences) # print("self.actions", len(self.actions)) # print("self.states", len(self.states)) # print("self.values", len(self.values)) # print("self.rewards", len(self.rewards)) # print("self.estimated_values", len(self.estimated_values)) # print("self.time_differences", len(self.time_differences)) # print("self.start_lstm_states", len(self.start_lstm_states)) # print("self.variable_snapshots", len(self.variable_snapshots)) # self.actions = self.actions[:-actions_to_remove] # self.states = self.states[:-actions_to_remove] # self.values = self.values[:-actions_to_remove] # self.estimated_values = self.estimated_values[:-actions_to_remove+1] # Sure that you have to remove one less? # print("Final step is called") if self.training: if len(self.actions) > 0: self.time_differences = self.time_differences[:-1] self.time_differences.append(time_difference) # self.windows = self.windows[:-1] # self.windows.append(window) if 'LOCAL_T_MAX' in globals(): nones_to_add = [None] * ( (LOCAL_T_MAX - (len(self.actions) % LOCAL_T_MAX)) % LOCAL_T_MAX) else: # Sure this makes sense? nones_to_add = [None] * (self.start_anew_index() - len(self.actions)) self.actions += nones_to_add self.states += nones_to_add self.values += nones_to_add self.estimated_values += nones_to_add self.windows += nones_to_add self.ticknos += nones_to_add # TODO: Is this useful? I guess only the `local_t' is actually needed... else: self.local_t = 0 self.episode_count += 1 self.episode_reward_throughput = 0 self.episode_reward_delay = 0 self.episode_reward_sent = 0 # If, for some strange reason, absolutely nothing happened in this episode, don't do anything... # Or if you're actually in testing mode :) # if len(self.rewards)>0: # time_diff = self.process(sess, global_t, summary_writer, summary_op, summary_inputs, time_difference) # else: # time_diff = 0 # self.states = [] # self.actions = [] # self.rewards = [] # self.values = [] # self.estimated_values = [] # self.start_lstm_states = [] # self.variable_snapshots = [] # FIXME: Not resetting state any longer!!! Is that bad? 
sess.run(self.sync) self.reset_state_and_reinitialize(sess) @staticmethod def get_actual_window(x): return max(min(x, MAX_WINDOW), MIN_WINDOW) def process(self, sess, global_t, summary_writer, summary_op, summary_inputs, time_difference=None): # print(self.thread_index, "in process") assert (len(self.rewards) > 0) # print(len(self.rewards)) if not len(self.start_lstm_states) <= len(self.actions): print(len(self.start_lstm_states), len(self.actions)) assert (len(self.start_lstm_states) <= len(self.actions)) if self.local_t <= 0: self.start_time = time.time() final = time_difference is not None start_local_t = self.local_t # logging.debug(" ".join(map(str,(self.thread_index, "In process: len(rewards)", len(self.rewards), "len(states)", len(self.states), "len(actions)", len(self.actions), "len(values)", len(self.values))))) if 'LOCAL_T_MAX' in globals(): actions = self.actions[:LOCAL_T_MAX] ticknos = self.ticknos[:LOCAL_T_MAX] windows = self.windows[:LOCAL_T_MAX] states = self.states[:LOCAL_T_MAX] rewards = self.rewards[:LOCAL_T_MAX] values = self.values[:LOCAL_T_MAX] else: # actions = self.actions[:len(self.rewards)] # ticknos = self.ticknos[:len(self.rewards)] # windows = self.windows[:len(self.rewards)] # states = self.states[:len(self.rewards)] # values = self.values[:len(self.rewards)] # rewards = self.rewards[:len(self.rewards)] actions = self.actions[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] ticknos = self.ticknos[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] windows = self.windows[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] states = self.states[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] values = self.values[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] rewards = self.rewards[:math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0]))] actions = [item for item in actions if item is not None] ticknos = [item for item in ticknos if item is not None] windows = [item for item in windows if item is not None] states = [item for item in states if item is not None] rewards = [item for item in rewards if item is not None] values = [item for item in values if item is not None] assert (len(actions) > 0) assert (len(ticknos) > 0) assert (len(windows) > 0) assert (len(states) > 0) assert (len(rewards) > 0) assert (len(values) > 0) if not (len(actions) == len(ticknos) == len(windows) == len(states) == len(rewards) == len(values)): print(len(self.actions), len(self.ticknos), len(self.windows), len(self.states), len(self.rewards), len(self.values)) print(self.actions, self.ticknos, self.windows, self.states, self.rewards, self.values) print(len(actions), len(ticknos), len(windows), len(states), len(rewards), len(values)) print(actions, ticknos, windows, states, rewards, values) assert (len(actions) == len(ticknos) == len(windows) == len(states) == len(rewards) == len(values)) # if not len(self.actions) == len(self.ticknos) == len(self.windows) == len(self.states) == len(self.values) == len(self.estimated_values): # print("In thread:", self.thread_index, "rewards:", len(self.rewards), ";", len(self.actions), len(self.ticknos), len(self.windows), len(self.states), len(self.values), len(self.estimated_values)) # print("In thread:", self.thread_index, "rewards:", len(self.rewards), ";", len(self.actions), "lstm_states:", len(self.start_lstm_states)) assert (len(self.actions) 
== len(self.ticknos) == len(self.windows) == len(self.states) == len(self.values) == len( self.estimated_values)) assert (len(self.time_differences) == len(self.start_lstm_states) == len(self.variable_snapshots)) # logging.debug(" ".join(map(str,(self.thread_index, "In process: rewards", rewards, "states", states, "actions", actions, "values", values)))) # get estimated value of step n+1 # assert((not len(self.estimated_values) <= len(rewards)) or final) # print("self.estimated_values", self.estimated_values) # print("Spam and eggs") R_packets, R_duration, R_sent = self.estimated_values[len( rewards )] if len(self.estimated_values) > len( rewards) and self.estimated_values[len( rewards)] is not None and not final else self.estimated_values[ len(rewards) - 1] R_packets_initial, R_duration_initial, R_sent_initial = R_packets, R_duration, R_sent R_packets, R_duration, R_sent = R_packets, 1 / R_duration, R_sent # R_packets, R_accumulated_delay, R_duration, R_sent = (R_packets)/(1-GAMMA), (R_accumulated_delay)/(1-GAMMA), (R_duration)/(1-GAMMA), (R_sent)/(1-GAMMA) # logging.debug(" ".join(map(str,("exp(R_packets)", R_packets, "exp(R_accumulated_delay)", R_accumulated_delay, "exp(R_duration)", R_duration)))) if not (R_duration > 0): print("R_duration", R_duration) if not np.isfinite(R_duration): R_duration = 0.0 assert (np.isfinite(R_duration)) # Pretty dumb assert (np.isfinite(R_packets)) # assert(np.isfinite(R_accumulated_delay)) assert (np.isfinite(R_sent)) actions.reverse() states.reverse() rewards.reverse() values.reverse() windows.reverse() # logging.debug(" ".join(map(str,("values", values)))) batch_si = [] batch_ai = [] batch_td = [] batch_R_duration = [] batch_R_packets = [] # batch_R_accumulated_delay = [] batch_R_sent = [] # compute and accmulate gradients for (ai, ri, si, Vi, wi) in zip(actions, rewards, states, values, windows): # FIXME: Make sure that it actually works with how the roll-off factor gets normalized. # assert(False) # The GAMMA_FACTOR increases the influence that following observations have on this one. # GAMMA = (1 - 2/(A3CTrainingThread.get_actual_window(wi+ai) + 1)) GAMMA = 0.99 # R_duration = ((1-GAMMA)*ri[2] + GAMMA*R_duration) # R_packets = ((1-GAMMA)*ri[0] + GAMMA*R_packets) # R_sent = ((1-GAMMA)*ri[3] + GAMMA*R_sent) # R_accumulated_delay = ((1-GAMMA)*ri[1] + GAMMA*R_accumulated_delay) R_duration = ((1 - GAMMA) * ri[2] + GAMMA * R_duration) R_packets = ((1 - GAMMA) * ri[0] + GAMMA * R_packets) R_sent = ((1 - GAMMA) * ri[3] + GAMMA * R_sent) # R_accumulated_delay = ((1-GAMMA)*ri[1] + GAMMA*R_accumulated_delay) # R_delay = R_accumulated_delay/R_packets # td_delay = -(np.log(R_accumulated_delay/R_packets/DELAY_MULTIPLIER) - np.log(Vi[1]/Vi[0]/DELAY_MULTIPLIER)) # td -= self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # td_delay = -(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # Doesn't work... 
# td = R_packets/R_duration/(R_accumulated_delay/R_packets+self.delay_delta) - Vi[0]/Vi[2]/(Vi[1]/Vi[0]+self.delay_delta) # td = R_packets - Vi[0] - self.delay_delta*(R_sent - Vi[3]) # - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # td = R_packets/R_duration - Vi[0]/Vi[2] - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # td = R_packets/R_duration - Vi[0]/(Vi[2]) - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) - (R_sent/R_duration - Vi[3]/(Vi[2])) # td = inverse_sigmoid(self.delay_delta, R_sent/(R_packets+R_sent))*R_packets/R_duration - inverse_sigmoid(self.delay_delta, Vi[3]/(Vi[0]+Vi[3]))*Vi[0]/Vi[2] - (R_sent/R_duration - Vi[3]/(Vi[2])) # td = inverse_sigmoid(self.delay_delta, R_sent/(R_packets+R_sent) - 0.05)*R_packets/R_duration - inverse_sigmoid(self.delay_delta, Vi[3]/(Vi[0]+Vi[3]) - 0.05)*Vi[0]/Vi[2] - (R_sent/R_duration - Vi[3]/(Vi[2])) - (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # if environ.get('reward_type') == "PCC": # # PCC # td = self.inverse_sigmoid(((R_sent - R_packets)/R_sent))*R_packets/R_duration - self.inverse_sigmoid(((Vi[2]-Vi[0])/Vi[2]))*Vi[0]/(1/Vi[1]) - ((R_sent - R_packets)/R_duration - (Vi[2] - Vi[0])/(1/Vi[1])) #- (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # if environ.get('reward_type') is None or environ.get('reward_type') == "no_cutoff": # PCC without cutoff td = R_packets / R_duration - Vi[0] / ( 1 / Vi[1]) - self.delay_delta * ( (R_sent - R_packets) / R_duration - (Vi[2] - Vi[0]) / (1 / Vi[1]) ) #- (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # elif environ.get('reward_type') == "modified": # # PCC modified # td = (1 - (R_sent - R_packets)/R_sent)*R_packets/R_duration - (1 - (Vi[2]-Vi[0])/Vi[2])*Vi[0]/(1/Vi[1]) - ((R_sent - R_packets)/R_duration - (Vi[2] - Vi[0])/(1/Vi[1])) #- (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # td = R_packets*(1-GAMMA)*inverse_sigmoid(SIGMOID_ALPHA * R_sent/(R_packets+R_sent) - self.delay_delta) - Vi[0]*inverse_sigmoid(SIGMOID_ALPHA * Vi[3]/(Vi[0]+Vi[3]) - self.delay_delta) - (R_sent*(1-GAMMA) - Vi[3]) - (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # PCC modified # td = R_packets*inverse_sigmoid(SIGMOID_ALPHA * R_sent/(R_packets+R_sent) - self.delay_delta) - Vi[0]*inverse_sigmoid(SIGMOID_ALPHA * Vi[3]/(Vi[0]+Vi[3]) - self.delay_delta) - (R_sent - Vi[3])# - (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # td = R_packets/R_duration/(R_accumulated_delay/R_packets) - Vi[0]/Vi[2]/(Vi[1]/Vi[0]) - (R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # td = (np.log(R_packets/R_duration) - np.log(Vi[0]/Vi[2])) - self.delay_delta*(R_accumulated_delay/R_packets - Vi[1]/Vi[0]) # R_packets, R_accumulated_delay, R_duration, R_sent = (R_packets)/(1-GAMMA), (R_accumulated_delay)/(1-GAMMA), (R_duration)/(1-GAMMA), (R_sent)/(1-GAMMA) batch_si.append(si) batch_ai.append(ai) batch_td.append(td) batch_R_duration.append(1.0 / R_duration) batch_R_packets.append(R_packets) # batch_R_accumulated_delay.append(R_accumulated_delay) batch_R_sent.append(R_sent) # batch_R_duration.append(R_duration/(1-GAMMA)) # batch_R_packets.append(R_packets/(1-GAMMA)) # batch_R_accumulated_delay.append(R_accumulated_delay/(1-GAMMA)) # batch_R_sent.append(R_sent/(1-GAMMA)) # logging.debug(" ".join(map(str,("batch_td_throughput[-1]", batch_td_throughput[-1], "batch_td_delay[-1]", batch_td_delay[-1], "batch_R_packets[-1]", batch_R_packets[-1], "batch_R_accumulated_delay[-1]", batch_R_accumulated_delay[-1], "batch_R_duration[-1]", batch_R_duration[-1])))) self.episode_reward_throughput += ri[0] self.episode_reward_sent += 
ri[3] self.episode_reward_delay += ri[1] old_local_t = self.local_t self.local_t += len(rewards) # # if final or self.local_t % LOG_INTERVAL == 0: # print(self.thread_index, "windows", len(self.windows), "\nticknos", len(self.ticknos), "\nstates", len(self.states), "\nactions", len(self.actions), "\nrewards", len(self.rewards), "\nvalues", len(self.values), "\nestimated_values", len(self.estimated_values)) # print(self.thread_index, "windows", self.windows, "\nticknos", self.ticknos, "\nstates", self.states, "\nactions", self.actions, "\nrewards", self.rewards, "\nvalues", self.values, "\nestimated_values", self.estimated_values) # return advanced local step size diff_local_t = self.local_t - start_local_t cur_learning_rate = self._anneal_learning_rate(global_t) # logging.info(" ".join(map(str,("All the batch stuff", "batch_si", batch_si, "batch_ai", batch_ai,"batch_R_packets", batch_R_packets, "batch_R_accumulated_delay", batch_R_accumulated_delay, "batch_R_duration", batch_R_duration)))) self.backup_vars() batch_si.reverse() batch_ai.reverse() batch_td.reverse() batch_R_duration.reverse() batch_R_packets.reverse() # batch_R_accumulated_delay.reverse() batch_R_sent.reverse() windows.reverse() # print([A3CTrainingThread.get_actual_window(w+a) for w, a in zip(windows, batch_ai)]) feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_ai, self.local_network.td: batch_td, self.local_network.w: [ A3CTrainingThread.get_actual_window(w + a) for w, a in zip(windows, batch_ai) ], self.local_network.r_duration: batch_R_duration, self.local_network.r_packets: batch_R_packets, self.local_network.r_sent: batch_R_sent, self.local_network.initial_lstm_state_action: self.start_lstm_states[0][0], self.local_network.initial_lstm_state_value: self.start_lstm_states[0][1], self.local_network.initial_lstm_state_duration: self.start_lstm_states[0][2], self.local_network.step_size: [len(batch_ai)], self.learning_rate_input: cur_learning_rate } var_dict = dict( zip(self.local_network.get_vars(), self.variable_snapshots[0])) feed_dict.update(var_dict) sess.run(self.apply_gradients, feed_dict=feed_dict) # if len(ticknos) == 0: # print(self.thread_index, "actions", self.actions, "rewards", self.rewards, "values", self.values, "estimated_values", self.estimated_values, "ticknos", self.ticknos) # if final or self.local_t % LOG_INTERVAL == 0: if final or (self.local_t >= math.floor(self.local_t / LOG_INTERVAL) * LOG_INTERVAL and old_local_t < math.floor(self.local_t / LOG_INTERVAL) * LOG_INTERVAL): # if final: # if ticknos[-1]-ticknos[0] > 0 and self.episode_reward_throughput > 0: if self.episode_reward_throughput > 0: # print(ticknos) # print(self.episode_reward_throughput, ticknos[0], ticknos[-1]) # normalized_final_score_throughput = self.episode_reward_throughput/(ticknos[-1]-ticknos[0]) # logging.info("{}: self.episode_reward_throughput={}, time_difference={}".format(self.thread_index, self.episode_reward_throughput, time_difference)) normalized_final_score_delay = self.episode_reward_delay / self.episode_reward_throughput loss_score = ( self.episode_reward_sent - self.episode_reward_throughput) / self.episode_reward_sent # print(self.windows) # logging.info("{}: score_throughput={}, score_delay={}, measured throughput beginning={}, measured delay beginning={}, measured throughput end={}, measured delay end={}".format(self.thread_index, normalized_final_score_throughput, normalized_final_score_delay, batch_R_packets[0]/batch_R_duration[0], batch_R_accumulated_delay[0]/batch_R_packets[0], 
batch_R_packets[-1]/batch_R_duration[-1], batch_R_accumulated_delay[-1]/batch_R_packets[-1])) # logging.info("{}: score_delay={}, measured throughput beginning={}, measured delay beginning={}, measured throughput end={}, measured delay end={} {}".format(self.thread_index, normalized_final_score_delay, batch_R_packets[0]/batch_R_duration[0], batch_R_accumulated_delay[0]/batch_R_packets[0], batch_R_packets[-1]/batch_R_duration[-1], batch_R_accumulated_delay[-1]/batch_R_packets[-1], ("final:"+str(final)+", delta:"+str(self.delay_delta)+"; "+" ".join(map(str,("R_packets", R_packets_initial, "R_accumulated_delay", R_accumulated_delay_initial, "R_duration", R_duration_initial))), "state", batch_si[0], "action", batch_ai[0][0]))) logging.info( "{}: score_delay={}, measured throughput beginning={}, {}". format(self.thread_index, normalized_final_score_delay, batch_R_packets[0] / (1 / batch_R_duration[0]), ("final:" + str(final) + ", delta:" + str(self.delay_delta) + "; " + " ".join( map(str, ("R_packets", R_packets_initial, "R_duration", R_duration_initial, "R_sent", R_sent_initial))), "state", batch_si[0], "action", batch_ai[0]))) # time_difference > 0 because of a bug in Unicorn.cc that makes it possible for time_difference to be smaller than 0. # elapsed_time = time.time() - self.start_time # steps_per_sec = self.local_t / elapsed_time # logging.info("### {}: Performance: {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(self.thread_index, self.local_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # print([[A3CTrainingThread.get_actual_window(w+a) for w, a in zip(windows, batch_ai)][0]]) feed_dict = { self.local_network.s: [batch_si[0]], self.local_network.a: [batch_ai[0]], self.local_network.td: [batch_td[0]], self.local_network.w: [[ A3CTrainingThread.get_actual_window(w + a) for w, a in zip(windows, batch_ai) ][0]], self.local_network.r_duration: [batch_R_duration[0]], self.local_network.r_packets: [batch_R_packets[0]], self.local_network.r_sent: [batch_R_sent[0]], self.local_network.initial_lstm_state_action: self.start_lstm_states[0][0], self.local_network.initial_lstm_state_value: self.start_lstm_states[0][1], self.local_network.initial_lstm_state_duration: self.start_lstm_states[0][2], self.local_network.step_size: [1] } feed_dict.update(var_dict) entropy, actor_loss, value_loss, total_loss, window_increase, std = self.local_network.run_loss( sess, feed_dict) # print(entropy, actor_loss, value_loss, total_loss, window_increase, std) things = { "estimated_throughput": batch_R_packets[0] / (1 / batch_R_duration[0]), "estimated_loss_rate": (batch_R_sent[0] - batch_R_packets[0]) / batch_R_sent[0], "R_duration": batch_R_duration[0], "R_packets": batch_R_packets[0], "R_sent": batch_R_sent[0], "v_estimated_throughput": values[0][0] / (1 / values[0][1]), "v_estimated_loss_rate": (values[0][2] - values[0][0]) / values[0][2], "v_duration": values[0][1], "v_packets": values[0][0], "v_sent": values[0][2], "score_delay": normalized_final_score_delay, "score_lost": loss_score, "actor_loss": actor_loss.item(), "value_loss": value_loss, "entropy": entropy.item(), "total_loss": total_loss, "window_increase": window_increase.item(), "window": windows[0], "std": std.item(), "lstm_state_action_mean": np.mean(self.start_lstm_states[0][0]), "lstm_state_action_std": np.std(self.start_lstm_states[0][0]), "lstm_state_value_mean": np.mean(self.start_lstm_states[0][1]), "lstm_state_value_std": np.std(self.start_lstm_states[0][1]), "lstm_state_duration_mean": 
np.mean(self.start_lstm_states[0][2]), "lstm_state_duration_std": np.std(self.start_lstm_states[0][2]), # "speed": steps_per_sec } # logging.debug(" ".join(map(str,("things", things)))) self._record_score(sess, summary_writer, summary_op, summary_inputs, things, ticknos[0]) # if final: self.episode_count += 1 self.local_t = 0 self.episode_reward_throughput = 0 self.episode_reward_delay = 0 self.episode_reward_sent = 0 self.restore_backup() if 'LOCAL_T_MAX' in globals(): self.actions = self.actions[LOCAL_T_MAX:] self.ticknos = self.ticknos[LOCAL_T_MAX:] self.windows = self.windows[LOCAL_T_MAX:] self.states = self.states[LOCAL_T_MAX:] self.values = self.values[LOCAL_T_MAX:] self.rewards = self.rewards[LOCAL_T_MAX:] self.estimated_values = self.estimated_values[LOCAL_T_MAX:] else: items_to_remove = math.floor( A3CTrainingThread.get_actual_window(self.windows[0] + self.actions[0])) self.actions = self.actions[items_to_remove:] self.ticknos = self.ticknos[items_to_remove:] self.windows = self.windows[items_to_remove:] self.states = self.states[items_to_remove:] self.values = self.values[items_to_remove:] self.estimated_values = self.estimated_values[items_to_remove:] self.rewards = self.rewards[items_to_remove:] self.time_differences = self.time_differences[1:] self.start_lstm_states = self.start_lstm_states[1:] self.variable_snapshots = self.variable_snapshots[1:] if final: assert (len(self.rewards) <= 0) return diff_local_t
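# --- Hedged sketch (not part of the original file) ---------------------------
# The process() pass above walks the rollout backwards, keeping exponentially
# weighted running returns per reward component (R = (1-GAMMA)*r + GAMMA*R) and
# building a PCC-style advantage td from throughput and loss-rate terms (the
# "no_cutoff" branch that is left uncommented). The standalone sketch below
# mirrors that recursion with hypothetical plain-Python inputs; the reward tuple
# layout (packets, delay, duration, sent) and the value tuple layout
# (packets, duration, sent) are inferred from the indices used above, and the
# bootstrap inversion of R_duration done in the original is omitted here.
def pcc_advantage(rewards, bootstrap, value_estimates, delay_delta, gamma=0.99):
    R_packets, R_duration, R_sent = bootstrap
    tds = []
    for ri, Vi in zip(reversed(rewards), reversed(value_estimates)):
        # exponentially weighted running returns; newest observation weighted (1 - gamma)
        R_duration = (1 - gamma) * ri[2] + gamma * R_duration
        R_packets = (1 - gamma) * ri[0] + gamma * R_packets
        R_sent = (1 - gamma) * ri[3] + gamma * R_sent
        # throughput advantage penalized by the loss-rate advantage, as in the
        # uncommented td expression above
        td = (R_packets / R_duration - Vi[0] / (1 / Vi[1])
              - delay_delta * ((R_sent - R_packets) / R_duration
                               - (Vi[2] - Vi[0]) / (1 / Vi[1])))
        tds.append(td)
    return list(reversed(tds))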
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             action_size, gamma, local_t_max, entropy_beta, agent_type,
             performance_log_interval, log_level, random_seed):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input  # different for each worker
    self.max_global_time_step = max_global_time_step  # 40 million steps
    self.action_size = action_size  # 2
    self.gamma = gamma  # 0.99
    self.local_t_max = local_t_max  # 256
    self.agent_type = agent_type  # FF
    self.performance_log_interval = performance_log_interval
    self.log_level = log_level
    # initialize the worker's network
    if self.agent_type == 'LSTM':
        self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, thread_index, device)
    # build the loss-related variables
    self.local_network.prepare_loss(entropy_beta)
    with tf.device(device):
        # get the worker network's parameters
        # [self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2, self.W_fc1, self.b_fc1,
        #  self.W_fc2, self.b_fc2, self.W_fc3, self.b_fc3]
        var_refs = []
        variables = self.local_network.get_vars()
        for v in variables:
            var_refs.append(v)
        # compute the gradients of the loss w.r.t. the worker's parameters
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      var_refs,
                                      gate_gradients=False,
                                      aggregation_method=None,
                                      colocate_gradients_with_ops=False)
    # apply the worker's gradients to the global network
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(), self.gradients)
    # pull the global network's parameters into the worker
    self.sync = self.local_network.sync_from(global_network)
    # initialize the game environment
    np.random.seed(random_seed)
    self.game_state = GameState(random_seed * thread_index, self.action_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = self.initial_learning_rate
    # reset some counters
    self.reset_counters()
    self.episode = 0
    # variable controlling log output
    self.prev_local_t = 0
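# --- Hedged sketch (not part of the original file) ---------------------------
# process() above calls self._anneal_learning_rate(global_t) to obtain the value
# fed into learning_rate_input. The method itself is not shown in this excerpt;
# a minimal sketch, assuming the usual linear decay to zero over
# max_global_time_step:
def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * \
        (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
        learning_rate = 0.0
    return learning_rate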
import os

import numpy as np
import tensorflow as tf

from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
from rmsprop_applier import RMSPropApplier
import options

options = options.options


def choose_action(pi_values):
    # Nudge the probabilities below a sum of 1 so np.random.multinomial accepts
    # them, then return the most frequently sampled action.
    pi_values -= np.finfo(np.float32).epsneg
    action_samples = np.random.multinomial(options.num_experiments, pi_values)
    return action_samples.argmax(0)


# use CPU for display tool
device = "/cpu:0"
if options.use_lstm:
    global_network = GameACLSTMNetwork(options.action_size, -1, device)
else:
    global_network = GameACFFNetwork(options.action_size, device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
model_checkpoint_path = None
checkpoint = tf.train.get_checkpoint_state(options.checkpoint_dir)
if checkpoint is None:
    checkpoint = tf.train.get_checkpoint_state(os.path.dirname(options.checkpoint_dir))
    model_checkpoint_path = os.path.join(os.path.dirname(options.checkpoint_dir),
                                         os.path.basename(options.checkpoint_dir))

# for pseudo-count
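# --- Hedged usage sketch (not part of the original file) ----------------------
# choose_action() draws options.num_experiments samples from the policy
# distribution and returns the most frequently drawn action; the epsneg
# subtraction keeps the probabilities summing to strictly less than 1 so that
# np.random.multinomial does not reject them. The values below are hypothetical;
# in the display tool pi_values would come from the policy network.
import numpy as np

pi_values = np.array([0.1, 0.6, 0.2, 0.1], dtype=np.float32)
pi_values -= np.finfo(np.float32).epsneg
samples = np.random.multinomial(20, pi_values)    # e.g. options.num_experiments == 20
action = samples.argmax(0)                        # most frequently sampled action
print(action)                                     # usually 1 for this distribution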
def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip, agent_type, action_size, rand_seed, checkpoint_dir): # use CPU for weight visualize tool device = "/cpu:0" if agent_type == 'LSTM': global_network = GameACLSTMNetwork(action_size, -1, device) else: global_network = GameACFFNetwork(action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=rmsp_alpha, momentum=0.0, epsilon=rmsp_epsilon, clip_norm=grad_norm_clip, device=device) game = GameState(rand_seed, action_size) game.process(0) x_t = game.x_t plt.imshow(x_t, interpolation="nearest", cmap=plt.cm.gray) sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") W_conv1 = sess.run(global_network.W_conv1) # show graph of W_conv1 fig, axes = plt.subplots(4, 16, figsize=(12, 6), subplot_kw={ 'xticks': [], 'yticks': [] }) fig.subplots_adjust(hspace=0.1, wspace=0.1) for ax, i in zip(axes.flat, range(4 * 16)): inch = i // 16 outch = i % 16 img = W_conv1[:, :, inch, outch] ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest') ax.set_title(str(inch) + "," + str(outch)) plt.show() W_conv2 = sess.run(global_network.W_conv2) # show graph of W_conv2 fig, axes = plt.subplots(2, 32, figsize=(27, 6), subplot_kw={ 'xticks': [], 'yticks': [] }) fig.subplots_adjust(hspace=0.1, wspace=0.1) for ax, i in zip(axes.flat, range(2 * 32)): inch = i // 32 outch = i % 32 img = W_conv2[:, :, inch, outch] ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest') ax.set_title(str(inch) + "," + str(outch)) plt.show() arr = sess.run(global_network.get_vars()) s = tf.placeholder("float", [None, 84, 84, 4]) b_conv1 = sess.run(global_network.b_conv1) b_conv2 = sess.run(global_network.b_conv2) inp_1 = tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID") h_conv1 = tf.nn.relu(inp_1 + b_conv1) inp_2 = tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding="VALID") h_conv2 = tf.nn.relu(inp_2 + b_conv2) s_t = game.s_t getActivations(sess, s, h_conv1, s_t, 16) getActivations(sess, s, h_conv2, s_t, 32)
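# --- Hedged sketch (not part of the original file) ----------------------------
# visualize() ends by calling getActivations(), which is not defined in this
# excerpt. A plausible sketch, assuming it evaluates the given conv layer for a
# single stacked frame and plots each feature map; the signature
# (sess, placeholder, layer, state, num_filters) is inferred from the call sites
# above.
import math
import matplotlib.pyplot as plt

def getActivations(sess, s, layer, s_t, num_filters):
    units = sess.run(layer, feed_dict={s: [s_t]})   # shape (1, H, W, num_filters)
    cols = 8
    rows = math.ceil(num_filters / cols)
    fig, axes = plt.subplots(rows, cols, figsize=(2 * cols, 2 * rows),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.1, wspace=0.1)
    for i, ax in enumerate(axes.flat):
        if i < num_filters:
            ax.imshow(units[0, :, :, i], cmap=plt.cm.gray, interpolation='nearest')
        else:
            ax.axis('off')
    plt.show()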
def run_a3c(args): """ python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping """ from game_ac_network import GameACFFNetwork, GameACLSTMNetwork from a3c_training_thread import A3CTrainingThread if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf def log_uniform(lo, hi, rate): log_lo = math.log(lo) log_hi = math.log(hi) v = log_lo * (1 - rate) + log_hi * rate return math.exp(v) if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = 'results/a3c/{}_{}'.format(args.gym_env.replace('-', '_'), args.folder) else: folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' folder += end_str if args.append_experiment_num is not None: folder += '_' + args.append_experiment_num if False: from common.util import LogFormatter fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w') fh.setLevel(logging.DEBUG) formatter = LogFormatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) demo_memory = None num_demos = 0 max_reward = 0. 
if args.load_memory or args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format( args.gym_env.replace('-', '_')) if args.load_memory: # FIXME: use new load_memory function demo_memory, actions_ctr, max_reward = load_memory( args.gym_env, demo_memory_folder, imgs_normalized=True) #, create_symmetry=True) action_freq = [ actions_ctr[a] for a in range(demo_memory[0].num_actions) ] num_demos = len(demo_memory) demo_memory_cam = None if args.load_demo_cam: demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0 = (demo_cam[i])[0] demo_memory_cam[i] = np.copy(s0) del demo_cam logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id)) device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 pretrain_global_t = 0 pretrain_epoch = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state config = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) pretrained_model = None pretrained_model_sess = None if args.load_pretrained_model: if args.onevsall_mtl: from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork elif args.onevsall_mtl_linear: from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork else: from game_class_network import MultiClassNetwork as PretrainedModelNetwork logger.error("Not supported yet!") assert False if args.pretrained_model_folder is not None: pretrained_model_folder = args.pretrained_model_folder else: pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format( args.gym_env.replace('-', '_')) PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015 pretrained_model = PretrainedModelNetwork(action_size, -1, device) pretrained_model_sess = tf.Session(config=config, graph=pretrained_model.graph) pretrained_model.load( pretrained_model_sess, '{}/{}_checkpoint'.format(pretrained_model_folder, args.gym_env.replace('-', '_'))) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork(action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta 
A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" n_shapers = args.parallel_size #int(args.parallel_size * .25) mod = args.parallel_size // n_shapers for i in range(args.parallel_size): is_reward_shape = False is_advice = False if i % mod == 0: is_reward_shape = args.use_pretrained_model_as_reward_shaping is_advice = args.use_pretrained_model_as_advice training_thread = A3CTrainingThread( i, global_network, initial_learning_rate, learning_rate_input, grad_applier, args.max_time_step, device=device, pretrained_model=pretrained_model, pretrained_model_sess=pretrained_model_sess, advice=is_advice, reward_shaping=is_reward_shape) training_threads.append(training_thread) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format( args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' #TODO: make this an argument transfer_folder += end_str transfer_folder += '/transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1 ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2 ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f ] if len(not_initialized_vars): sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # summary writer for tensorboard summary_op = 
tf.summary.merge_all() summary_writer = tf.summary.FileWriter( 'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_')) + folder[12:], sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=6) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(folder) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) # set wall time wall_t_fname = folder + '/' + 'wall_t.' + str(global_t) with open(wall_t_fname, 'r') as f: wall_t = float(f.read()) with open(folder + '/pretrain_global_t', 'r') as f: pretrain_global_t = int(f.read()) with open(folder + '/model_best/best_model_reward', 'r') as f_best_model_reward: best_model_reward = float(f_best_model_reward.read()) rewards = pickle.load( open( folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl', 'rb')) else: logger.warning("Could not find old checkpoint") # set wall time wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder + '/model_checkpoints', empty=True) prepare_dir(folder + '/model_best', empty=True) prepare_dir(folder + '/frames', empty=True) lock = threading.Lock() test_lock = False if global_t == 0: test_lock = True last_temp_global_t = global_t ispretrain_markers = [False] * args.parallel_size num_demo_thread = 0 ctr_demo_thread = 0 def train_function(parallel_index): nonlocal global_t, pretrain_global_t, pretrain_epoch, \ rewards, test_lock, lock, \ last_temp_global_t, ispretrain_markers, num_demo_thread, \ ctr_demo_thread training_thread = training_threads[parallel_index] training_thread.set_summary_writer(summary_writer) # set all threads as demo threads training_thread.is_demo_thread = args.load_memory and args.use_demo_threads if training_thread.is_demo_thread or args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs: training_thread.pretrain_init(demo_memory) if global_t == 0 and ( args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0) and parallel_index < 2: ispretrain_markers[parallel_index] = True training_thread.replay_mem_reset() # Pretraining with demo memory logger.info("t_idx={} pretrain starting".format(parallel_index)) while ispretrain_markers[parallel_index]: if stop_requested: return if pretrain_global_t > args.train_with_demo_num_steps and pretrain_epoch > args.train_with_demo_num_epochs: # At end of pretraining, reset state training_thread.replay_mem_reset() training_thread.episode_reward = 0 training_thread.local_t = 0 if args.use_lstm: training_thread.local_network.reset_state() ispretrain_markers[parallel_index] = False logger.info( "t_idx={} pretrain ended".format(parallel_index)) break diff_pretrain_global_t, _ = training_thread.demo_process( sess, pretrain_global_t) for _ in range(diff_pretrain_global_t): pretrain_global_t += 1 if pretrain_global_t % 10000 == 0: logger.debug( "pretrain_global_t={}".format(pretrain_global_t)) pretrain_epoch += 1 if pretrain_epoch % 1000 == 0: logger.debug("pretrain_epoch={}".format(pretrain_epoch)) # Waits for all threads to finish pretraining while not stop_requested and any(ispretrain_markers): time.sleep(0.01) # Evaluate model before training if not stop_requested and global_t == 0: with lock: if parallel_index == 0: test_reward, 
test_steps, test_episodes = training_threads[ 0].testing(sess, args.eval_max_steps, global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][global_t] = (test_reward, test_steps, test_episodes) saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t) save_best_model(test_reward) test_lock = False # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) # set start_time start_time = time.time() - wall_t training_thread.set_start_time(start_time) episode_end = True use_demo_thread = False while True: if stop_requested: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if args.use_demo_threads and global_t < args.max_steps_threads_as_demo and episode_end and num_demo_thread < 16: #if num_demo_thread < 2: demo_rate = 1.0 * (args.max_steps_threads_as_demo - global_t) / args.max_steps_threads_as_demo if demo_rate < 0.0333: demo_rate = 0.0333 if np.random.random() <= demo_rate and num_demo_thread < 16: ctr_demo_thread += 1 training_thread.replay_mem_reset(D_idx=ctr_demo_thread % num_demos) num_demo_thread += 1 logger.info( "idx={} as demo thread started ({}/16) rate={}".format( parallel_index, num_demo_thread, demo_rate)) use_demo_thread = True if use_demo_thread: diff_global_t, episode_end = training_thread.demo_process( sess, global_t) if episode_end: num_demo_thread -= 1 use_demo_thread = False logger.info("idx={} demo thread concluded ({}/16)".format( parallel_index, num_demo_thread)) else: diff_global_t, episode_end = training_thread.process( sess, global_t, rewards) for _ in range(diff_global_t): global_t += 1 if global_t % args.eval_freq == 0: temp_global_t = global_t lock.acquire() try: # catch multiple threads getting in at the same time if last_temp_global_t == temp_global_t: logger.info("Threading race problem averted!") continue test_lock = True test_reward, test_steps, n_episodes = training_thread.testing( sess, args.eval_max_steps, temp_global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][temp_global_t] = (test_reward, test_steps, n_episodes) if temp_global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save(sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format( args.gym_env.replace('-', '_')), global_step=temp_global_t, write_meta_graph=False) if test_reward > best_model_reward: save_best_model(test_reward) test_lock = False last_temp_global_t = temp_global_t finally: lock.release() if global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t, write_meta_graph=False) # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) def save_best_model(test_reward): nonlocal best_model_reward best_model_reward = test_reward with open(folder + '/model_best/best_model_reward', 'w') as f_best_model_reward: f_best_model_reward.write(str(best_model_reward)) best_saver.save( sess, folder + '/model_best/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_'))) train_threads = [] for i in range(args.parallel_size): train_threads.append( threading.Thread(target=train_function, args=(i, ))) signal.signal(signal.SIGINT, signal_handler) 
signal.signal(signal.SIGTERM, signal_handler) # set start time start_time = time.time() - wall_t for t in train_threads: t.start() print('Press Ctrl+C to stop') for t in train_threads: t.join() logger.info('Now saving data. Please wait') # write wall time wall_t = time.time() - start_time wall_t_fname = folder + '/' + 'wall_t.' + str(global_t) with open(wall_t_fname, 'w') as f: f.write(str(wall_t)) with open(folder + '/pretrain_global_t', 'w') as f: f.write(str(pretrain_global_t)) root_saver.save( sess, folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')), global_step=global_t) pickle.dump( rewards, open( folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) logger.info('Data saved!') sess.close()
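# --- Hedged usage sketch (not part of the original file) ----------------------
# run_a3c() defines log_uniform(lo, hi, rate), which interpolates between two
# learning rates on a log scale. The numbers below are illustrative only:
import math

def log_uniform(lo, hi, rate):
    log_lo = math.log(lo)
    log_hi = math.log(hi)
    v = log_lo * (1 - rate) + log_hi * rate
    return math.exp(v)

print(log_uniform(1e-4, 1e-2, 0.0))   # 0.0001 -> rate 0 returns the lower bound
print(log_uniform(1e-4, 1e-2, 0.5))   # 0.001  -> geometric midpoint of the bounds
print(log_uniform(1e-4, 1e-2, 1.0))   # 0.01   -> rate 1 returns the upper bound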
if not settings.mode == 'display' and not settings.mode == 'visualize': device = "/cpu:0" if settings.use_gpu: device = "/gpu:0" initial_learning_rates = log_uniform(settings.initial_alpha_low, settings.initial_alpha_high, settings.parallel_agent_size) global_t = 0 stop_requested = False if settings.agent_type == 'LSTM': global_network = GameACLSTMNetwork(settings.action_size, -1, device) else: global_network = GameACFFNetwork(settings.action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=settings.rmsp_alpha, momentum=0.0, epsilon=settings.rmsp_epsilon, clip_norm=settings.grad_norm_clip, device=device) for i in range(settings.parallel_agent_size):
from game_state import GameState from game_ac_network import GameACFFNetwork, GameACLSTMNetwork, GameACDilatedNetwork from constants import ACTION_SIZE from constants import PARALLEL_SIZE from constants import CHECKPOINT_DIR from constants import USE_GPU from constants import NETWORK_TYPE from constants import TESTING_DAYS # use CPU for display tool device = "/cpu:0" if NETWORK_TYPE == 'LSTM': global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device) elif NETWORK_TYPE == 'DILATED': global_network = GameACDilatedNetwork(ACTION_SIZE, device) elif NETWORK_TYPE == 'CONV': global_network = GameACFFNetwork(ACTION_SIZE, device) else: raise SystemExit('NETWORK_TYPE must be LSTM, CONV or DILATED.') sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path)
def __init__(self, thread_index, global_network, pinitial_learning_rate,
             plearning_rate_input, pgrad_applier, vinitial_learning_rate,
             vlearning_rate_input, vgrad_applier, max_global_time_step, device,
             task_index=""):
    self.thread_index = thread_index
    self.plearning_rate_input = plearning_rate_input
    self.vlearning_rate_input = vlearning_rate_input
    self.max_global_time_step = max_global_time_step

    self.game_state = GameState()
    state = self.game_state.reset()
    self.game_state.reset_gs(state)
    self.action_size = self.game_state.action_size
    self.state_size = self.game_state.state_size
    self.local_max_iter = self.game_state.local_max_iter

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(self.action_size, self.state_size,
                                               self.game_state.action_low,
                                               self.game_state.action_high,
                                               thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, self.state_size,
                                             self.game_state.action_low,
                                             self.game_state.action_high,
                                             thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
        self.policy_gradients = tf.gradients(self.local_network.policy_loss,
                                             pvar_refs,
                                             gate_gradients=False,
                                             aggregation_method=None,
                                             colocate_gradients_with_ops=False)
        vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
        self.value_gradients = tf.gradients(self.local_network.value_loss,
                                            vvar_refs,
                                            gate_gradients=False,
                                            aggregation_method=None,
                                            colocate_gradients_with_ops=False)

    self.apply_policy_gradients = pgrad_applier.apply_gradients(
        self.local_network.get_pvars(), self.policy_gradients)
    self.apply_value_gradients = vgrad_applier.apply_gradients(
        self.local_network.get_vvars(), self.value_gradients)

    self.local_t = 0
    self.pinitial_learning_rate = pinitial_learning_rate
    self.vinitial_learning_rate = vinitial_learning_rate
    self.episode_reward = 0
    # variable controlling log output
    self.prev_local_t = 0
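# --- Hedged sketch (not part of the original file) ----------------------------
# Unlike the other worker constructors in this file, the one above applies its
# policy and value gradients with separate optimizers and does not build a sync
# op here; the other workers rely on self.local_network.sync_from(global_network).
# sync_from() is defined in the network classes and not shown in this excerpt; a
# minimal sketch, assuming both networks expose their trainables via get_vars():
import tensorflow as tf

def sync_from(self, src_network, name=None):
    src_vars = src_network.get_vars()
    dst_vars = self.get_vars()
    sync_ops = []
    with tf.name_scope(name, "GameACNetwork", []):
        # copy every global (src) variable into the corresponding local variable
        for src_var, dst_var in zip(src_vars, dst_vars):
            sync_ops.append(tf.assign(dst_var, src_var))
        return tf.group(*sync_ops, name=name)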
from constants import ACTION_SIZE from constants import PARALLEL_SIZE from constants import MAX_TIME_STEP from constants import CHECKPOINT_DIR from constants import RMSP_EPSILON from constants import RMSP_ALPHA from constants import GRAD_NORM_CLIP from constants import USE_GPU from constants import USE_LSTM # use CPU for weight visualize tool device = "/cpu:0" if USE_LSTM: global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device) else: global_network = GameACFFNetwork(ACTION_SIZE, -1, device) training_threads = [] learning_rate_input = tf.placeholder(PRECISION) grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = RMSP_ALPHA, momentum = 0.0, epsilon = RMSP_EPSILON, clip_norm = GRAD_NORM_CLIP, device = device) sess = tf.Session()
[str(i.name) for i in not_initialized_vars])))) # only for testing if len(not_initialized_vars) > 0: sess.run(tf.variables_initializer(not_initialized_vars)) # initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, # INITIAL_ALPHA_HIGH, # INITIAL_ALPHA_LOG_RATE) initial_learning_rate = INITIAL_RATE global_t = 0 if cooperative: global_network = GameACLSTMNetwork(0, device) else: num_threads = int(environ.get('num_threads')) logging.info(" ".join(map(str, ("num_threads", num_threads)))) assert (num_threads is not None) global_network = [] for i in range(1, int(num_threads) + 1): global_network.append(GameACLSTMNetwork(-i, device)) learning_rate_input = tf.placeholder(PRECISION) # grad_applier = RMSPropApplier(learning_rate = learning_rate_input, # decay = RMSP_ALPHA, # momentum = 0.0, # epsilon = RMSP_EPSILON,