def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, max_global_time_step, device):
    self.learn_rate = 0
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      var_refs,
                                      gate_gradients=False,
                                      aggregation_method=None,
                                      colocate_gradients_with_ops=False)

    self.apply_gradients = tf.train.RMSPropOptimizer(
        self.learning_rate_input).apply_gradients(
            zip(self.gradients, global_network.get_vars()))

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = Game()

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             action_size, gamma, local_t_max, entropy_beta, agent_type,
             performance_log_interval, log_level, random_seed):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.action_size = action_size
    self.gamma = gamma
    self.local_t_max = local_t_max
    self.agent_type = agent_type
    self.performance_log_interval = performance_log_interval
    self.log_level = log_level

    if self.agent_type == 'LSTM':
        self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, thread_index, device)
    self.local_network.prepare_loss(entropy_beta)

    with tf.device(device):
        var_refs = []
        variables = self.local_network.get_vars()
        for v in variables:
            var_refs.append(v)
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      var_refs,
                                      gate_gradients=False,
                                      aggregation_method=None,
                                      colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    np.random.seed(random_seed)
    self.game_state = GameState(random_seed * thread_index, self.action_size)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = self.initial_learning_rate
    self.reset_counters()

    self.episode = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             task_index=""):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    if global_network:
        # threading mode: apply local gradients to the shared global network
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)
        self.sync = self.local_network.sync_from(global_network)
        self.mode = "threading"
    else:
        # distributed-TensorFlow mode: apply gradients to the local (replicated) network
        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(),
            self.gradients)
        self.mode = "dist_tensor"

    if not task_index:
        self.game_state = GameState(113 * thread_index)
    else:
        self.game_state = GameState(113 * task_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v.ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

    # write the CSV header for the per-iteration test results
    tempdir = os.path.join(os.getcwd(), "results")
    self.res_file = os.path.join(tempdir, RESULTS_FILE)
    with open(self.res_file, 'w') as f:
        f.write('itr,mean_score,max,min,std,runs,test_steps\n')
def __init__(self, thread_index, initial_learning_rate, learning_rate_input,
             grad_applier, max_global_time_episode, device, arrived_jobs, condition):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_episode = max_global_time_episode

    # use thread_index (i.e. the machine id) to fetch all operations processed on this machine
    self.operations = get_data_by_machine(thread_index)
    self.condition = condition
    self.is_terminal_counted = False
    self.last_episode_reward = 0

    if USE_LSTM:
        # the first argument is the action size; here it is the number of operations
        # waiting to be processed on this machine
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        # the first argument is the action size; here it is the number of operations
        # waiting to be processed on this machine
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      var_refs,
                                      gate_gradients=False,
                                      aggregation_method=None,
                                      colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(),
        self.gradients)
    # self.sync = self.local_network.sync_from(global_network)
    # self.game_state = GameState(113 * thread_index)

    # create the job-shop environment for these operations
    self.env = JspEnv(self.operations, thread_index, arrived_jobs)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if NETWORK_TYPE == 'LSTM':
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    elif NETWORK_TYPE == 'DILATED':
        self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
    elif NETWORK_TYPE == 'CONV':
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        # watch out: update global_network
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    # STATE_SIZE = 6 - 3 Landmarks + 5 (comm-size)
    self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      var_refs,
                                      gate_gradients=False,
                                      aggregation_method=None,
                                      colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.epSteps = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
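# The constructors above all wire up the same A3C pattern: gradients of the per-thread
# local network's loss are computed against the local variables, applied to the shared
# global network's variables through grad_applier, and a sync op copies the global
# weights back into the local copy. A minimal framework-free sketch of that data flow
# (class and variable names here are illustrative only, not part of the variants above):
import numpy as np

class SharedParams(object):
    """Stand-in for the global network's parameters."""
    def __init__(self, size, lr=7e-4):
        self.theta = np.zeros(size)
        self.lr = lr

    def apply_gradients(self, grads):
        # analogous to grad_applier.apply_gradients(global_network.get_vars(), gradients)
        self.theta -= self.lr * grads

class LocalWorker(object):
    """Stand-in for one training thread's local network."""
    def __init__(self, shared):
        self.shared = shared
        self.theta = shared.theta.copy()  # local replica of the global weights

    def sync(self):
        # analogous to self.sync = self.local_network.sync_from(global_network)
        self.theta = self.shared.theta.copy()

    def step(self, grads):
        self.shared.apply_gradients(grads)  # push locally computed gradients to global
        self.sync()                         # pull the updated global weights back

shared = SharedParams(4)
worker = LocalWorker(shared)
worker.step(np.array([0.1, -0.2, 0.0, 0.3]))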
def run_a3c(args): """Run A3C experiment.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') GAME_NAME = args.gym_env.replace('NoFrameskip-v4','') # setup folder name and path to folder folder = pathlib.Path(setup_folder(args, GYM_ENV_NAME)) # setup GPU (if applicable) import tensorflow as tf gpu_options = setup_gpu(tf, args.use_gpu, args.gpu_fraction) ###################################################### # setup default device device = "/cpu:0" global_t = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) if args.load_pretrained_model: class_rewards = {'class_eval': {}} # setup logging info for analysis, see Section 4.2 of the paper sil_dict = { # count number of SIL updates "sil_ctr":{}, # total number of butter D sampled during SIL "sil_a3c_sampled":{}, # total number of buffer D samples (i.e., generated by A3C workers) used during SIL (i.e., passed max op) "sil_a3c_used":{}, # the return of used samples for buffer D "sil_a3c_used_return":{}, # total number of buffer R sampled during SIL "sil_rollout_sampled":{}, # total number of buffer R samples (i.e., generated by refresher worker) used during SIL (i.e., passed max op) "sil_rollout_used":{}, # the return of used samples for buffer R "sil_rollout_used_return":{}, # number of old samples still used (even after refreshing) "sil_old_used":{} } sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return = 0, 0, 0, 0 sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return = 0, 0, 0 sil_old_used = 0 rollout_dict = { # total number of rollout performed "rollout_ctr": {}, # total number of successful rollout (i.e., Gnew > G) "rollout_added_ctr":{}, # the return of Gnew "rollout_new_return":{}, # the return of G "rollout_old_return":{} } rollout_ctr, rollout_added_ctr = 0, 0 rollout_new_return = 0 # this records the total, avg = this / rollout_added_ctr rollout_old_return = 0 # this records the total, avg = this / rollout_added_ctr # setup file names reward_fname = folder / '{}-a3c-rewards.pkl'.format(GYM_ENV_NAME) sil_fname = folder / '{}-a3c-dict-sil.pkl'.format(GYM_ENV_NAME) rollout_fname = folder / '{}-a3c-dict-rollout.pkl'.format(GYM_ENV_NAME) if args.load_pretrained_model: class_reward_fname = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) sharedmem_fname = folder / '{}-sharedmem.pkl'.format(GYM_ENV_NAME) sharedmem_params_fname = folder / '{}-sharedmem-params.pkl'.format(GYM_ENV_NAME) sharedmem_trees_fname = folder / '{}-sharedmem-trees.pkl'.format(GYM_ENV_NAME) rolloutmem_fname = folder / '{}-rolloutmem.pkl'.format(GYM_ENV_NAME) rolloutmem_params_fname = folder / '{}-rolloutmem-params.pkl'.format(GYM_ENV_NAME) rolloutmem_trees_fname = folder / '{}-rolloutmem-trees.pkl'.format(GYM_ENV_NAME) # for removing older ckpt, save mem space prev_ckpt_t = -1 stop_req = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state input_shape = (args.input_shape, args.input_shape, 4) ####################################################### # setup global A3C GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) logger.info('A3C Initial Learning Rate={}'.format(args.initial_learn_rate)) # setup pretrained model global_pretrained_model = None local_pretrained_model = None pretrain_graph = None # if use pretrained model to refresh # then must load pretrained model # otherwise, don't load model if args.use_lider and args.nstep_bc > 0: 
assert args.load_pretrained_model, "refreshing with other policies, must load a pre-trained model (TA or BC)" else: assert not args.load_pretrained_model, "refreshing with the current policy, don't load pre-trained models" if args.load_pretrained_model: pretrain_graph, global_pretrained_model = setup_pretrained_model(tf, args, action_size, input_shape, device="/gpu:0" if args.use_gpu else device) assert global_pretrained_model is not None assert pretrain_graph is not None time.sleep(2.0) # setup experience memory shared_memory = None # => this is BufferD rollout_buffer = None # => this is BufferR if args.use_sil: shared_memory = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) if args.use_lider and not args.onebuffer: rollout_buffer = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) # log memory information shared_memory.log() if args.use_lider and not args.onebuffer: rollout_buffer.log() ############## Setup Thread Workers BEGIN ################ # 17 total number of threads for all experiments assert args.parallel_size ==17, "use 17 workers for all experiments" startIndex = 0 all_workers = [] # a3c and sil learning rate and optimizer learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) setup_common_worker(CommonWorker, args, action_size) # setup SIL worker sil_worker = None if args.use_sil: _device = "/gpu:0" if args.use_gpu else device sil_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) sil_worker = SILTrainingThread(startIndex, global_network, sil_network, args.initial_learn_rate, learning_rate_input, grad_applier, device=_device, batch_size=args.batch_size, use_rollout=args.use_lider, one_buffer=args.onebuffer, sampleR=args.sampleR) all_workers.append(sil_worker) startIndex += 1 # setup refresh worker refresh_worker = None if args.use_lider: _device = "/gpu:0" if args.use_gpu else device refresh_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) refresh_local_pretrained_model = None # if refreshing with other polies if args.nstep_bc > 0: refresh_local_pretrained_model = PretrainedModelNetwork( pretrain_graph, action_size, startIndex, padding=args.padding, in_shape=input_shape, sae=False, tied_weights=False, use_denoising=False, noise_factor=0.3, loss_function='mse', use_slv=False, device=_device) refresh_worker = RefreshThread( thread_index=startIndex, action_size=action_size, env_id=args.gym_env, global_a3c=global_network, local_a3c=refresh_network, update_in_rollout=args.update_in_rollout, nstep_bc=args.nstep_bc, global_pretrained_model=global_pretrained_model, local_pretrained_model=refresh_local_pretrained_model, transformed_bellman = args.transformed_bellman, device=_device, entropy_beta=args.entropy_beta, clip_norm=args.grad_norm_clip, grad_applier=grad_applier, initial_learn_rate=args.initial_learn_rate, learning_rate_input=learning_rate_input) all_workers.append(refresh_worker) startIndex 
+= 1 # setup a3c workers setup_a3c_worker(A3CTrainingThread, args, startIndex) for i in range(startIndex, args.parallel_size): local_network = GameACFFNetwork( action_size, i, device="/cpu:0", padding=args.padding, in_shape=input_shape) a3c_worker = A3CTrainingThread( i, global_network, local_network, args.initial_learn_rate, learning_rate_input, grad_applier, device="/cpu:0", no_op_max=30) all_workers.append(a3c_worker) ############## Setup Thread Workers END ################ # setup config for tensorflow config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) # prepare sessions sess = tf.Session(config=config) pretrain_sess = None if global_pretrained_model: pretrain_sess = tf.Session(config=config, graph=pretrain_graph) # initial pretrained model if pretrain_sess: assert args.pretrained_model_folder is not None global_pretrained_model.load( pretrain_sess, args.pretrained_model_folder) sess.run(tf.global_variables_initializer()) if global_pretrained_model: initialize_uninitialized(tf, pretrain_sess, global_pretrained_model) if local_pretrained_model: initialize_uninitialized(tf, pretrain_sess, local_pretrained_model) # summary writer for tensorboard summ_file = args.save_to+'log/a3c/{}/'.format(GYM_ENV_NAME) + str(folder)[58:] # str(folder)[12:] summary_writer = tf.summary.FileWriter(summ_file, sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=1) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)+'/model_checkpoints') if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) tmp_t = (global_t // args.eval_freq) * args.eval_freq logger.info(">>> tmp_t: {}".format(tmp_t)) # set wall time wall_t = 0. 
# set up reward files best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('r') as f: best_model_reward = float(f.read()) # restore rewards rewards = restore_dict(reward_fname, global_t) logger.info(">>> restored: rewards") # restore loggings sil_dict = restore_dict(sil_fname, global_t) sil_ctr = sil_dict['sil_ctr'][tmp_t] sil_a3c_sampled = sil_dict['sil_a3c_sampled'][tmp_t] sil_a3c_used = sil_dict['sil_a3c_used'][tmp_t] sil_a3c_used_return = sil_dict['sil_a3c_used_return'][tmp_t] sil_rollout_sampled = sil_dict['sil_rollout_sampled'][tmp_t] sil_rollout_used = sil_dict['sil_rollout_used'][tmp_t] sil_rollout_used_return = sil_dict['sil_rollout_used_return'][tmp_t] sil_old_used = sil_dict['sil_old_used'][tmp_t] logger.info(">>> restored: sil_dict") rollout_dict = restore_dict(rollout_fname, global_t) rollout_ctr = rollout_dict['rollout_ctr'][tmp_t] rollout_added_ctr = rollout_dict['rollout_added_ctr'][tmp_t] rollout_new_return = rollout_dict['rollout_new_return'][tmp_t] rollout_old_return = rollout_dict['rollout_old_return'][tmp_t] logger.info(">>> restored: rollout_dict") if args.load_pretrained_model: class_reward_file = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) class_rewards = restore_dict(class_reward_file, global_t) # restore replay buffers (if saved) if args.checkpoint_buffer: # restore buffer D if args.use_sil and args.priority_memory: shared_memory = restore_buffer(sharedmem_fname, shared_memory, global_t) shared_memory = restore_buffer_trees(sharedmem_trees_fname, shared_memory, global_t) shared_memory = restore_buffer_params(sharedmem_params_fname, shared_memory, global_t) logger.info(">>> restored: shared_memory (Buffer D)") shared_memory.log() # restore buffer R if args.use_lider and not args.onebuffer: rollout_buffer = restore_buffer(rolloutmem_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_trees(rolloutmem_trees_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_params(rolloutmem_params_fname, rollout_buffer, global_t) logger.info(">>> restored: rollout_buffer (Buffer R)") rollout_buffer.log() # if all restores okay, remove old ckpt to save storage space prev_ckpt_t = global_t else: logger.warning("Could not find old checkpoint") wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder / 'model_checkpoints', empty=True) prepare_dir(folder / 'model_best', empty=True) prepare_dir(folder / 'frames', empty=True) lock = threading.Lock() # next saving global_t def next_t(current_t, freq): return np.ceil((current_t + 0.00001) / freq) * freq next_global_t = next_t(global_t, args.eval_freq) next_save_t = next_t( global_t, args.eval_freq*args.checkpoint_freq) step_t = 0 def train_function(parallel_idx, th_ctr, ep_queue, net_updates): nonlocal global_t, step_t, rewards, class_rewards, lock, \ next_save_t, next_global_t, prev_ckpt_t nonlocal shared_memory, rollout_buffer nonlocal sil_dict, sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return, \ sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return, \ sil_old_used nonlocal rollout_dict, rollout_ctr, rollout_added_ctr, \ rollout_new_return, rollout_old_return parallel_worker = all_workers[parallel_idx] parallel_worker.set_summary_writer(summary_writer) with lock: # Evaluate model before training if not stop_req and global_t == 0 and step_t == 0: rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, global_t, folder, worker=all_workers[-1]) # testing pretrained TA or BC in game if args.load_pretrained_model: assert 
pretrain_sess is not None assert global_pretrained_model is not None class_rewards['class_eval'][step_t] = \ parallel_worker.test_loaded_classifier(global_t=global_t, max_eps=50, # testing 50 episodes sess=pretrain_sess, worker=all_workers[-1], model=global_pretrained_model) # log pretrained model performance class_eval_file = pathlib.Path(args.pretrained_model_folder[:21]+\ str(GAME_NAME)+"/"+str(GAME_NAME)+'-model-eval.txt') class_std = np.std(class_rewards['class_eval'][step_t][-1]) class_mean = np.mean(class_rewards['class_eval'][step_t][-1]) with class_eval_file.open('w') as f: f.write("class_mean: \n" + str(class_mean) + "\n") f.write("class_std: \n" + str(class_std) + "\n") f.write("class_rewards: \n" + str(class_rewards['class_eval'][step_t][-1]) + "\n") checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t) save_best_model(rewards['eval'][global_t][0]) # saving worker info to dicts for analysis sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works under priority mem) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saving shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saving rollout_buffer') prev_ckpt_t = global_t step_t = 1 # set start_time start_time = time.time() - wall_t parallel_worker.set_start_time(start_time) if parallel_worker.is_sil_thread: sil_interval = 0 # bigger number => slower SIL updates m_repeat = 4 min_mem = args.batch_size * m_repeat sil_train_flag = len(shared_memory) >= min_mem while True: if stop_req: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if parallel_worker.is_sil_thread: # before sil starts, init local count local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 if net_updates.qsize() >= sil_interval \ and 
len(shared_memory) >= min_mem: sil_train_flag = True if sil_train_flag: sil_train_flag = False th_ctr.get() train_out = parallel_worker.sil_train( sess, global_t, shared_memory, m_repeat, rollout_buffer=rollout_buffer) local_sil_ctr, local_sil_a3c_sampled, local_sil_a3c_used, \ local_sil_a3c_used_return, \ local_sil_rollout_sampled, local_sil_rollout_used, \ local_sil_rollout_used_return, \ local_sil_old_used = train_out th_ctr.put(1) with net_updates.mutex: net_updates.queue.clear() if args.use_lider: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, a3c_used_return=sil_a3c_used_return/(sil_a3c_used+1),#add one in case divide by zero rollout_used=sil_rollout_used, rollout_used_return=sil_rollout_used_return/(sil_rollout_used+1), old_used=sil_old_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: rollout_buffsize = 0 if not args.onebuffer: rollout_buffsize = len(rollout_buffer) log_data = (sil_ctr, len(shared_memory), rollout_buffsize, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, sil_a3c_used_return/(sil_a3c_used+1), sil_rollout_used, sil_rollout_used_return/(sil_rollout_used+1), sil_old_used) logger.info("SIL: sil_ctr={0:}" " sil_memory_size={1:}" " rollout_buffer_size={2:}" " total_sample_used={3:}/{4:}" " a3c_used={5:}" " a3c_used_return_avg={6:.2f}" " rollout_used={7:}" " rollout_used_return_avg={8:.2f}" " old_used={9:}".format(*log_data)) else: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, rollout_used=sil_rollout_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: log_data = (sil_ctr, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, len(shared_memory)) logger.info("SIL: sil_ctr={0:}" " total_sample_used={1:}/{2:}" " a3c_used={3:}" " sil_memory_size={4:}".format(*log_data)) # Adding episodes to SIL memory is centralize to ensure # sampling and updating of priorities does not become a problem # since we add new episodes to SIL at once and during # SIL training it is guaranteed that SIL memory is untouched. 
max = args.parallel_size while not ep_queue.empty(): data = ep_queue.get() parallel_worker.episode.set_data(*data) shared_memory.extend(parallel_worker.episode) parallel_worker.episode.reset() max -= 1 if max <= 0: # This ensures that SIL has a chance to train break diff_global_t = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 elif parallel_worker.is_refresh_thread: # before refresh starts, init local count diff_global_t = 0 local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 if len(shared_memory) >= 1: th_ctr.get() # randomly sample a state from buffer D sample = shared_memory.sample_one_random() # after sample, flip refreshed to True # TODO: fix this so that only *succesful* refresh is flipped to True # currently counting *all* refresh as True assert sample[-1] == True train_out = parallel_worker.rollout(sess, folder, pretrain_sess, global_t, sample, args.addall, args.max_ep_step, args.nstep_bc, args.update_in_rollout) diff_global_t, episode_end, part_end, local_rollout_ctr, \ local_rollout_added_ctr, add, local_rollout_new_return, \ local_rollout_old_return = train_out th_ctr.put(1) if rollout_ctr % 20 == 0 and rollout_ctr > 0: log_msg = "ROLLOUT: rollout_ctr={} added_rollout_ct={} worker={}".format( rollout_ctr, rollout_added_ctr, parallel_worker.thread_idx) logger.info(log_msg) logger.info("ROLLOUT Gnew: {}, G: {}".format(local_rollout_new_return, local_rollout_old_return)) # should always part_end, i.e., end of episode # and only add if new return is better (if not LiDER-AddAll) if part_end and add: if not args.onebuffer: # directly put into Buffer R rollout_buffer.extend(parallel_worker.episode) else: # Buffer D add sample is centralized when OneBuffer ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # a3c training thread worker else: th_ctr.get() train_out = parallel_worker.train(sess, global_t, rewards) diff_global_t, episode_end, part_end = train_out th_ctr.put(1) if args.use_sil: net_updates.put(1) if part_end: ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 # ensure only one thread is updating global_t at a time with lock: global_t += diff_global_t # centralize increasing count for SIL and Rollout sil_ctr += local_sil_ctr sil_a3c_sampled += local_sil_a3c_sampled sil_a3c_used += local_sil_a3c_used sil_a3c_used_return += local_sil_a3c_used_return sil_rollout_sampled += local_sil_rollout_sampled sil_rollout_used += local_sil_rollout_used sil_rollout_used_return += local_sil_rollout_used_return sil_old_used += local_sil_old_used rollout_ctr += local_rollout_ctr rollout_added_ctr += local_rollout_added_ctr rollout_new_return += local_rollout_new_return rollout_old_return += local_rollout_old_return # if during a thread's update, global_t has reached a evaluation 
interval if global_t > next_global_t: next_global_t = next_t(global_t, args.eval_freq) step_t = int(next_global_t - args.eval_freq) # wait for all threads to be done before testing while not stop_req and th_ctr.qsize() < len(all_workers): time.sleep(0.001) step_t = int(next_global_t - args.eval_freq) # Evaluate for 125,000 steps rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, step_t, folder, worker=all_workers[-1]) save_best_model(rewards['eval'][step_t][0]) last_reward = rewards['eval'][step_t][0] # saving worker info to dicts # SIL sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used # ROLLOUT rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # save ckpt after done with eval if global_t > next_save_t: next_save_t = next_t(global_t, args.eval_freq*args.checkpoint_freq) # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works for priority mem for now) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saved shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saved rollout_buffer') # save a3c after saving buffer -- in case saving buffer OOM # so that at least we can revert back to the previous ckpt checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t, write_meta_graph=False) logger.info('Saved model ckpt') # if everything saves okay, clean up previous ckpt to save space remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Removed ckpt from step {}'.format(prev_ckpt_t)) prev_ckpt_t = global_t def signal_handler(signal, frame): nonlocal stop_req logger.info('You pressed Ctrl+C!') stop_req = True if stop_req and global_t == 0: sys.exit(1) def save_best_model(test_reward): 
nonlocal best_model_reward if test_reward > best_model_reward: best_model_reward = test_reward best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('w') as f: f.write(str(best_model_reward)) best_checkpt_file = folder / 'model_best' best_checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) best_saver.save(sess, str(best_checkpt_file)) train_threads = [] th_ctr = Queue() for i in range(args.parallel_size): th_ctr.put(1) episodes_queue = None net_updates = None if args.use_sil: episodes_queue = Queue() net_updates = Queue() for i in range(args.parallel_size): worker_thread = Thread( target=train_function, args=(i, th_ctr, episodes_queue, net_updates,)) train_threads.append(worker_thread) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) # set start time start_time = time.time() - wall_t for t in train_threads: t.start() print('Press Ctrl+C to stop') for t in train_threads: t.join() logger.info('Now saving data. Please wait') # write wall time wall_t = time.time() - start_time wall_t_fname = folder / 'wall_t.{}'.format(global_t) with wall_t_fname.open('w') as f: f.write(str(wall_t)) # save final model checkpoint_file = str(folder / '{}_checkpoint_a3c'.format(GYM_ENV_NAME)) root_saver.save(sess, checkpoint_file, global_step=global_t) dump_final_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname]) logger.info('Data saved!') # if everything saves okay & is done training (not because of pressed Ctrl+C), # clean up previous ckpt to save space if global_t >= (args.max_time_step * args.max_time_step_fraction): remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Done training, removed ckpt from step {}'.format(prev_ckpt_t)) sess.close() if pretrain_sess: pretrain_sess.close()
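# run_a3c() above gates evaluation and checkpointing on th_ctr, a Queue used as a pool of
# tokens: each worker takes a token while it is updating and returns it when done, so a
# full queue means no worker is mid-update. A small standalone sketch of that pattern,
# assuming Python 3's queue/threading modules (the helper names here are illustrative):
import threading
import time
from queue import Queue

def make_token_counter(n_workers):
    ctr = Queue()
    for _ in range(n_workers):
        ctr.put(1)
    return ctr

def worker_step(ctr, do_update):
    ctr.get()          # take a token: this worker is now training
    try:
        do_update()
    finally:
        ctr.put(1)     # return the token: this worker is idle again

def wait_until_all_idle(ctr, n_workers):
    # mirrors: while not stop_req and th_ctr.qsize() < len(all_workers): time.sleep(0.001)
    while ctr.qsize() < n_workers:
        time.sleep(0.001)

ctr = make_token_counter(2)
t = threading.Thread(target=worker_step, args=(ctr, lambda: time.sleep(0.01)))
t.start()
wait_until_all_idle(ctr, 2)  # returns once every token is back in the queue
t.join()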
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if USE_LSTM: self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: self.local_network = GameACFFNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "pi=", pi_ print " V=", value_ # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append( np.clip(reward, -1, 1) ) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print "score=", self.episode_reward self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() if USE_LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if USE_LSTM: batch_si.reverse() batch_a.reverse() 
batch_td.reverse() batch_R.reverse() sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size : [len(batch_a)] } ) else: sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R} ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "TIMESTEP", self.local_t # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
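# A small standalone example of the return/advantage computation in process() above:
# after reversing the rollout, R is accumulated backwards as R = r_i + GAMMA * R
# (bootstrapped with V(s_T) when the episode did not terminate), and td = R - V_i is
# the advantage fed to the policy loss. Function and variable names are illustrative:
def nstep_returns_and_advantages(rewards, values, bootstrap_R, gamma=0.99):
    batch_td, batch_R = [], []
    R = bootstrap_R
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        batch_td.append(R - v)
        batch_R.append(R)
    # return in time order (the LSTM branch above re-reverses its batches the same way)
    return batch_td[::-1], batch_R[::-1]

# e.g. a 3-step rollout bootstrapped with V(s_T) = 0.5:
tds, Rs = nstep_returns_and_advantages([0.0, 0.0, 1.0], [0.4, 0.5, 0.6], 0.5)
print(Rs)  # -> approximately [1.4652, 1.4801, 1.495]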
import tensorflow as tf
import matplotlib.pyplot as plt

from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier

import options
options = options.options

# use CPU for the weight visualization tool
device = "/cpu:0"

if options.use_lstm:
    global_network = GameACLSTMNetwork(options.action_size, -1, device)
else:
    global_network = GameACFFNetwork(options.action_size, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=options.rmsp_alpha,
                              momentum=0.0,
                              epsilon=options.rmsp_epsilon,
                              clip_norm=options.grad_norm_clip,
                              device=device)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
def __init__(self, thread_index, global_network, pinitial_learning_rate,
             plearning_rate_input, pgrad_applier, vinitial_learning_rate,
             vlearning_rate_input, vgrad_applier, max_global_time_step, device,
             task_index=""):
    self.thread_index = thread_index
    self.plearning_rate_input = plearning_rate_input
    self.vlearning_rate_input = vlearning_rate_input
    self.max_global_time_step = max_global_time_step

    self.game_state = GameState()
    state = self.game_state.reset()
    self.game_state.reset_gs(state)
    self.action_size = self.game_state.action_size
    self.state_size = self.game_state.state_size
    self.local_max_iter = self.game_state.local_max_iter

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(self.action_size, self.state_size,
                                               self.game_state.action_low,
                                               self.game_state.action_high,
                                               thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, self.state_size,
                                             self.game_state.action_low,
                                             self.game_state.action_high,
                                             thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
        self.policy_gradients = tf.gradients(
            self.local_network.policy_loss, pvar_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)
        vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
        self.value_gradients = tf.gradients(
            self.local_network.value_loss, vvar_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_policy_gradients = pgrad_applier.apply_gradients(
        self.local_network.get_pvars(),
        self.policy_gradients)
    self.apply_value_gradients = vgrad_applier.apply_gradients(
        self.local_network.get_vvars(),
        self.value_gradients)

    self.local_t = 0
    self.pinitial_learning_rate = pinitial_learning_rate
    self.vinitial_learning_rate = vinitial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if USE_LSTM: self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device) self.local_network.prepare_loss(ENTROPY_BETA) with tf.device(device): var_refs = [v._ref() for v in self.local_network.get_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.gradients) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # variable controling log output self.prev_local_t = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop self.episode_reward = 0 for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # print(self.episode_reward) # clip reward rewards.append(np.clip(reward, -1, 1)) self.local_t += 1 # s_t1 -> s_t self.game_state.update() # self._record_score(sess, summary_writer, summary_op, score_input, # self.episode_reward, global_t) if terminal: terminal_end = True print("score={}".format(self.episode_reward)) # self._record_score(sess, summary_writer, summary_op, score_input, # self.episode_reward, global_t) # self.episode_reward = 0 self.game_state.reset() if USE_LSTM: self.local_network.reset_state() break self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = 
np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) cur_learning_rate = self._anneal_learning_rate(global_t) if USE_LSTM: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_a)], self.learning_rate_input: cur_learning_rate }) else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.learning_rate_input: cur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
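# Worked example of the linear learning-rate schedule implemented by
# _anneal_learning_rate above: the rate decays linearly from initial_learning_rate at
# global_time_step = 0 down to 0 at max_global_time_step, and is clipped at 0 afterwards.
# The values below are illustrative only:
def anneal(initial_learning_rate, max_global_time_step, global_time_step):
    lr = initial_learning_rate * (max_global_time_step - global_time_step) / max_global_time_step
    return max(lr, 0.0)

assert anneal(0.5, 100.0, 0.0) == 0.5      # start of training
assert anneal(0.5, 100.0, 25.0) == 0.375   # a quarter of the way through
assert anneal(0.5, 100.0, 150.0) == 0.0    # clipped to zero past max_global_time_step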
def display(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip, agent_type, action_size, rand_seed, checkpoint_dir, display_time_sleep, display_episodes, display_log_level, display_save_log, show_max): # use CPU for display tool device = "/cpu:0" LOG_FILE = 'log_{}-{}.txt'.format(experiment_name, agent_type) if agent_type == 'LSTM': global_network = GameACLSTMNetwork(action_size, -1, device) else: global_network = GameACFFNetwork(action_size, -1, device) learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=rmsp_alpha, momentum=0.0, epsilon=rmsp_epsilon, clip_norm=grad_norm_clip, device=device) sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") episode = 0 terminal = False episode_rewards = [] episode_steps = [] episode_passed_obsts = [] print ' ' print 'DISPLAYING {} EPISODES'.format(display_episodes) print '--------------------------------------------------- ' while not episode == display_episodes: episode_reward = 0 episode_passed_obst = 0 game_state = GameState(rand_seed, action_size, show_score=True) if display_log_level == 'FULL': print 'EPISODE {}'.format(episode) full_frame = None while True: pi_values, value = global_network.run_policy_and_value( sess, game_state.s_t) action = choose_action(pi_values) game_state.process(action) terminal = game_state.terminal episode_step = game_state.steps reward = game_state.reward passed_obst = game_state.passed_obst if len(episode_passed_obsts) == 0 and show_max: if passed_obst > 0: full_frame = game_state.full_frame elif episode_passed_obst > np.max( episode_passed_obsts) and show_max: full_frame = game_state.full_frame episode_reward += reward episode_passed_obst = passed_obst if display_log_level == 'FULL': print 'step / pi_values: {} / value: {} / action: {} / reward: {} / passed_obst: {}'.format( pi_values, value, action, reward, passed_obst) time.sleep(display_time_sleep) if not terminal: game_state.update() else: break episode_rewards.append(episode_reward) episode_steps.append(episode_step) episode_passed_obsts.append(episode_passed_obst) if not display_log_level == 'NONE': reward_steps = format( float(episode_reward) / float(episode_step), '.4f') print "EPISODE: {} / STEPS: {} / PASSED OBST: {} / REWARD: {} / REWARD/STEP: {}".format( episode, episode_step, passed_obst, episode_reward, reward_steps) if display_save_log: with open(LOG_FILE, "a") as text_file: text_file.write('{},{},{},{},{}\n'.format( episode, episode_step, passed_obst, episode_reward, reward_steps)) episode += 1 print '--------------------------------------------------- ' print 'DISPLAY SESSION FINISHED' print 'TOTAL EPISODES: {}'.format(display_episodes) print ' ' print 'MIN' print 'REWARD: {} / STEPS: {} / PASSED OBST: {}'.format( np.min(episode_rewards), np.min(episode_steps), np.min(episode_passed_obsts)) print ' ' print 'AVERAGE' print 'REWARD: {} / STEPS: {} / PASSED OBST: {}'.format( np.average(episode_rewards), np.average(episode_steps), np.average(episode_passed_obsts)) print ' ' print 'MAX' print 'REWARD: {} / STEPS: {} / PASSED OBST: {}'.format( np.max(episode_rewards), np.max(episode_steps), np.max(episode_passed_obsts)) if show_max and not full_frame == None: 
plt.imshow(full_frame, origin='lower') plt.show()
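# display() above calls a module-level choose_action(pi_values) helper that is not shown
# in this excerpt. A minimal sketch consistent with the probability-matching sampler used
# by A3CTrainingThread.choose_action elsewhere in these variants (an assumption, not the
# original helper):
import numpy as np

def choose_action(pi_values):
    # sample an action index in proportion to the policy's output probabilities
    pi = np.asarray(pi_values, dtype=np.float64)
    pi = pi / pi.sum()  # guard against probabilities that do not sum exactly to 1
    return np.random.choice(len(pi), p=pi)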
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             action_size, gamma, local_t_max, entropy_beta, agent_type,
             performance_log_interval, log_level, random_seed):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input  # different for each worker
    self.max_global_time_step = max_global_time_step  # 40M steps
    self.action_size = action_size  # 2
    self.gamma = gamma  # 0.99
    self.local_t_max = local_t_max  # 256
    self.agent_type = agent_type  # FF
    self.performance_log_interval = performance_log_interval
    self.log_level = log_level

    # initialize this worker's network
    if self.agent_type == 'LSTM':
        self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, thread_index, device)
    # set up the loss-related variables
    self.local_network.prepare_loss(entropy_beta)

    with tf.device(device):
        # get the worker network's parameters:
        # [self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2, self.W_fc1, self.b_fc1,
        #  self.W_fc2, self.b_fc2, self.W_fc3, self.b_fc3]
        var_refs = []
        variables = self.local_network.get_vars()
        for v in variables:
            var_refs.append(v)
        # compute the gradients
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      var_refs,
                                      gate_gradients=False,
                                      aggregation_method=None,
                                      colocate_gradients_with_ops=False)

    # update the global network
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    # pull the global network's parameters
    self.sync = self.local_network.sync_from(global_network)

    # initialize the game environment
    np.random.seed(random_seed)
    self.game_state = GameState(random_seed * thread_index, self.action_size)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = self.initial_learning_rate

    # reset some counters
    self.reset_counters()

    self.episode = 0

    # variable controlling log output
    self.prev_local_t = 0
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device, options): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.options = options if options.use_lstm: self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device) else: self.local_network = GameACFFNetwork(options.action_size, device) self.local_network.prepare_loss(options.entropy_beta) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.indent = " |" * self.thread_index self.steps = 0 self.no_reward_steps = 0 self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0) if self.options.train_episode_steps > 0: self.max_reward = 0.0 self.max_episode_reward = 0.0 self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = [] self.episode_scores = Episode_scores(options) self.tes = self.options.train_episode_steps if self.options.tes_list is not None: self.tes = self.options.tes_list[thread_index] print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes)) self.initial_lives = self.game_state.initial_lives self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1) if self.options.record_new_record_dir is not None: if self.thread_index == 0: if not os.path.exists(self.options.record_new_record_dir): os.makedirs(self.options.record_new_record_dir) self.episode_screens = [] if self.options.record_new_room_dir is not None: if self.thread_index == 0: if not os.path.exists(self.options.record_new_room_dir): os.makedirs(self.options.record_new_room_dir) self.episode_screens = [] self.greediness = options.greediness self.repeat_action_ratio = options.repeat_action_ratio self.prev_action = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values, global_t): # Add greediness for broader exploration r = random.random() if r < self.greediness: action = int(r * len(pi_values)) elif r < self.repeat_action_ratio: action = self.prev_action else: # Increase randomness of choice if no reward term is too long if self.no_reward_steps > self.options.no_reward_steps: randomness = (self.no_reward_steps - self.options.no_reward_steps) * self.options.randomness pi_values += randomness pi_values /= sum(pi_values) if self.local_t % self.options.randomness_log_interval == 0: elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:9d},th={}:{}randomness={:.8f}".format( elapsed_time, global_t, self.thread_index, self.indent, randomness)) pi_values -= np.finfo(np.float32).epsneg action_samples = 
np.random.multinomial(self.options.num_experiments, pi_values) action = action_samples.argmax(0) self.prev_action = action return action def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def set_start_time(self, start_time): self.start_time = start_time #@profile def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] liveses = [self.game_state.lives] if self.tes > 0: if self.episode_liveses == []: self.episode_liveses.append(self.game_state.lives) terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t if self.options.use_lstm: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.options.local_t_max): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = self.choose_action(pi_, global_t) states.append(self.game_state.s_t) actions.append(action) values.append(value_) liveses.append(self.game_state.lives) if (self.thread_index == 0) and (self.local_t % self.options.log_interval == 0): print("pi={} (thread{})".format(pi_, self.thread_index)) print(" V={} (thread{})".format(value_, self.thread_index)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward if reward > 0 and \ (self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0"): elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:4.0f},th={}:{}r={:3.0f}RM{:02d}| NEW-SCORE".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, self.game_state.room_no)) # pseudo-count reward if self.options.psc_use: reward += self.game_state.psc_reward # add basic income after some no reward steps if self.no_reward_steps > self.options.no_reward_steps: reward += self.options.basic_income # clip reward if self.options.reward_clip > 0.0: reward = np.clip(reward, -self.options.reward_clip, self.options.reward_clip) rewards.append( reward ) # collect episode log if self.tes > 0: self.episode_states.append(self.game_state.s_t) self.episode_actions.append(action) self.episode_rewards.append(reward) self.episode_values.append(value_) self.episode_liveses.append(self.game_state.lives) if len(self.episode_states) > self.max_history * 2: self.episode_states = self.episode_states[-self.max_history:] self.episode_actions = self.episode_actions[-self.max_history:] self.episode_rewards = self.episode_rewards[-self.max_history:] self.episode_values = self.episode_values[-self.max_history:] self.episode_liveses = self.episode_liveses[-self.max_history-1:] # requirement for OpenAI Gym: --clear-history-on-death=False if self.options.clear_history_on_death and (liveses[-2] > liveses[-1]): self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = self.episode_liveses[-2:] self.local_t += 1 if self.options.record_new_record_dir is not None \ or self.options.record_new_room_dir is not None: screen = self.game_state.uncropped_screen if self.options.compress_frame: screen = lzma.compress(screen.tobytes(), preset=0) self.episode_screens.append(screen) # terminate if the play time is too long self.steps += 1 if 
self.steps > self.options.max_play_steps: terminal = True # requirement for OpenAI Gym: --terminate-on-lives-lost=False # terminate if lives lost if self.terminate_on_lives_lost and (liveses[-2] > liveses[-1]): terminal = True # count no reward steps if self.game_state.reward == 0.0: self.no_reward_steps += 1 else: self.no_reward_steps = 0 # s_t1 -> s_t self.game_state.update() if self.local_t % self.options.score_log_interval == 0: elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, self.game_state.room_no, self.game_state.lives, value_, self.game_state.psc_reward)) # if self.game_state.room_no != self.game_state.prev_room_no: # elapsed_time = time.time() - self.start_time # print("t={:6.0f},s={:9d},th={}:{}RM{:02d}>RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format( # elapsed_time, global_t, self.thread_index, self.indent, # self.game_state.prev_room_no, self.game_state.room_no, # self.game_state.lives, value_, self.game_state.psc_reward)) if self.tes > 0: if self.game_state.lives < self.episode_liveses[-2]: elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:9d},th={}:{}l={:.0f}>{:.0f}RM{:02d}|".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_liveses[-2], self.game_state.lives, self.game_state.room_no)) # seperate steps after getting reward if self.game_state.reward > 0: if not terminal: break if terminal: terminal_end = True elapsed_time = time.time() - self.start_time end_mark = "end" if self.terminate_on_lives_lost else "END" print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}@{}|".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, end_mark)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) if self.tes > 0: if self.options.record_new_room_dir is not None \ and self.game_state.new_room >= 0: dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t, self.thread_index,\ self.episode_reward, self.game_state.new_room) dirname = os.path.join(self.options.record_new_room_dir, dirname) os.makedirs(dirname) for index, screen in enumerate(self.episode_screens): filename = "{:06d}.png".format(index) filename = os.path.join(dirname, filename) screen_image = screen if self.options.compress_frame: screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160)) cv2.imwrite(filename, screen_image) print("@@@ New Room record screens saved to {}".format(dirname)) if self.episode_reward > self.max_episode_reward: if self.options.record_new_record_dir is not None: dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t, self.thread_index,\ self.episode_reward, self.game_state.room_no) dirname = os.path.join(self.options.record_new_record_dir, dirname) os.makedirs(dirname) for index, screen in enumerate(self.episode_screens): filename = "{:06d}.png".format(index) filename = os.path.join(dirname, filename) screen_image = screen if self.options.compress_frame: screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160)) cv2.imwrite(filename, screen_image) print("@@@ New Record screens saved to {}".format(dirname)) self.max_episode_reward = self.episode_reward if self.options.record_all_non0_record: self.max_episode_reward = 0 self.max_reward = 0.0 self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = [] 
self.episode_scores.add(self.episode_reward, global_t, self.thread_index) if self.options.record_new_record_dir is not None \ or self.options.record_new_room_dir is not None: self.episode_screens= [] self.episode_reward = 0 self.steps = 0 self.no_reward_steps = 0 self.game_state.reset() if self.options.use_lstm: self.local_network.reset_state() break if self.thread_index == 0 and self.local_t % self.options.performance_log_interval < self.options.local_t_max: elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) if self.options.gym_eval: diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end # don't train if following condition # requirement for OpenAI Gym: --terminate-on-lives-lost=False if self.options.terminate_on_lives_lost and (self.thread_index == 0) and (not self.options.train_in_eval): return 0, terminal_end else: if self.tes > 0: _ = self.episode_scores.is_highscore(self.episode_reward) if self.episode_reward > self.max_reward: self.max_reward = self.episode_reward if True: tes = self.tes # requirement for OpenAI Gym: --test-extend=False if self.options.tes_extend and self.initial_lives != 0: tes *= self.options.tes_extend_ratio * (self.game_state.lives / self.initial_lives) if self.game_state.lives == self.initial_lives: tes *= 2 tes = int(tes) tes = min(tes, len(self.episode_states)) print("[OHL]SCORE={:3.0f},s={:9d},th={},lives={},steps={},tes={},RM{:02d}".format(self.episode_reward, global_t, self.thread_index, self.game_state.lives, self.steps, tes, self.game_state.room_no)) if tes == 0: states = [] actions = [] rewards = [] values = [] liveses = self.episode_liveses[-1:] else: states = self.episode_states[-tes:] actions = self.episode_actions[-tes:] rewards = self.episode_rewards[-tes:] values = self.episode_values[-tes:] liveses = self.episode_liveses[-tes-1:] if self.options.clear_history_after_ohl: self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = self.episode_liveses[-2:] if len(states) > 0: R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] lives = liveses.pop() # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): # Consider the number of lives if (not self.options.use_gym) and self.initial_lives != 0.0 and not self.terminate_on_lives_lost: prev_lives = liveses.pop() if prev_lives > lives: weight = self.options.lives_lost_weight rratio = self.options.lives_lost_rratio R *= rratio * ( (1.0 - weight) + weight * (lives / prev_lives) ) ri = self.options.lives_lost_reward lives = prev_lives R = ri + self.options.gamma * R td = R - Vi a = np.zeros([self.options.action_size]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if self.options.use_lstm: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size : [len(batch_a)] } ) else: sess.run( self.accum_gradients, feed_dict = { 
self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R} ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end
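# --- Illustrative sketch (not taken from the snippets above) ---
# A self-contained version of the exploration scheme used by choose_action()
# in the preceding variant: with probability `greediness` take a pseudo-random
# action, with probability `repeat_action_ratio` repeat the previous action,
# otherwise draw from the policy with a multinomial sample. All names and
# default values here are illustrative assumptions.
import random
import numpy as np

def sample_action(pi_values, prev_action, greediness=0.01,
                  repeat_action_ratio=0.0, num_experiments=1):
    r = random.random()
    if r < greediness:
        # map the uniform draw onto an action index
        return int(r * len(pi_values))
    if r < repeat_action_ratio:
        return prev_action
    pi = np.asarray(pi_values, dtype=np.float64)
    pi = pi / pi.sum()                                    # renormalize the policy
    pi = np.clip(pi - np.finfo(np.float32).epsneg, 0.0, None)  # keep sum(pvals) < 1
    counts = np.random.multinomial(num_experiments, pi)
    return int(counts.argmax())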
class A3CTrainingThread(object): log_interval = 100 performance_log_interval = 1000 local_t_max = 20 demo_t_max = 20 use_lstm = False action_size = -1 entropy_beta = 0.01 demo_entropy_beta = 0.01 gamma = 0.99 use_mnih_2015 = False env_id = None reward_type = 'CLIP' # CLIP | LOG | RAW finetune_upper_layers_only = False shaping_reward = 0.001 shaping_factor = 1. shaping_gamma = 0.85 advice_confidence = 0.8 shaping_actions = -1 # -1 all actions, 0 exclude noop transformed_bellman = False clip_norm = 0.5 use_grad_cam = False def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device=None, pretrained_model=None, pretrained_model_sess=None, advice=False, reward_shaping=False): assert self.action_size != -1 self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.use_pretrained_model_as_advice = advice self.use_pretrained_model_as_reward_shaping = reward_shaping logger.info("thread_index: {}".format(self.thread_index)) logger.info("local_t_max: {}".format(self.local_t_max)) logger.info("use_lstm: {}".format( colored(self.use_lstm, "green" if self.use_lstm else "red"))) logger.info("action_size: {}".format(self.action_size)) logger.info("entropy_beta: {}".format(self.entropy_beta)) logger.info("gamma: {}".format(self.gamma)) logger.info("reward_type: {}".format(self.reward_type)) logger.info("finetune_upper_layers_only: {}".format( colored(self.finetune_upper_layers_only, "green" if self.finetune_upper_layers_only else "red"))) logger.info("use_pretrained_model_as_advice: {}".format( colored( self.use_pretrained_model_as_advice, "green" if self.use_pretrained_model_as_advice else "red"))) logger.info("use_pretrained_model_as_reward_shaping: {}".format( colored( self.use_pretrained_model_as_reward_shaping, "green" if self.use_pretrained_model_as_reward_shaping else "red"))) logger.info("transformed_bellman: {}".format( colored(self.transformed_bellman, "green" if self.transformed_bellman else "red"))) logger.info("clip_norm: {}".format(self.clip_norm)) logger.info("use_grad_cam: {}".format( colored(self.use_grad_cam, "green" if self.use_grad_cam else "red"))) if self.use_lstm: GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015 self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device) else: GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015 self.local_network = GameACFFNetwork(self.action_size, thread_index, device) with tf.device(device): self.local_network.prepare_loss(entropy_beta=self.entropy_beta, critic_lr=0.5) local_vars = self.local_network.get_vars if self.finetune_upper_layers_only: local_vars = self.local_network.get_vars_upper var_refs = [v._ref() for v in local_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs) global_vars = global_network.get_vars if self.finetune_upper_layers_only: global_vars = global_network.get_vars_upper with tf.device(device): if self.clip_norm is not None: self.gradients, grad_norm = tf.clip_by_global_norm( self.gradients, self.clip_norm) self.gradients = list(zip(self.gradients, global_vars())) self.apply_gradients = grad_applier.apply_gradients(self.gradients) #self.apply_gradients = grad_applier.apply_gradients( # global_vars(), # self.gradients) self.sync = self.local_network.sync_from( global_network, upper_layers_only=self.finetune_upper_layers_only) self.game_state = GameState(env_id=self.env_id, display=False, no_op_max=30, human_demo=False,
episode_life=True) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.episode_steps = 0 # variable controlling log output self.prev_local_t = 0 self.is_demo_thread = False with tf.device(device): if self.use_grad_cam: self.action_meaning = self.game_state.env.unwrapped.get_action_meanings( ) self.local_network.build_grad_cam_grads() self.pretrained_model = pretrained_model self.pretrained_model_sess = pretrained_model_sess self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0 self.advice_ctr = 0 self.shaping_ctr = 0 self.last_rho = 0. if self.use_pretrained_model_as_advice or self.use_pretrained_model_as_reward_shaping: assert self.pretrained_model is not None def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, logits): """sample() in https://github.com/ppyht2/tf-a2c/blob/master/src/policy.py""" noise = np.random.uniform(0, 1, np.shape(logits)) return np.argmax(logits - np.log(-np.log(noise))) def choose_action_with_high_confidence(self, pi_values, exclude_noop=True): actions_confidence = [] # exclude NOOP action for action in range(1 if exclude_noop else 0, self.action_size): actions_confidence.append(pi_values[action][0][0]) max_confidence_action = np.argmax(actions_confidence) confidence = actions_confidence[max_confidence_action] return (max_confidence_action + (1 if exclude_noop else 0)), confidence def set_summary_writer(self, writer): self.writer = writer def record_summary(self, score=0, steps=0, episodes=None, global_t=0, mode='Test'): summary = tf.Summary() summary.value.add(tag='{}/score'.format(mode), simple_value=float(score)) summary.value.add(tag='{}/steps'.format(mode), simple_value=float(steps)) if episodes is not None: summary.value.add(tag='{}/episodes'.format(mode), simple_value=float(episodes)) self.writer.add_summary(summary, global_t) self.writer.flush() def set_start_time(self, start_time): self.start_time = start_time def generate_cam(self, sess, test_cam_si, global_t): cam_side_img = [] for i in range(len(test_cam_si)): # get max action per demo state readout_t = self.local_network.run_policy(sess, test_cam_si[i]) action = np.argmax(readout_t) # convert action to one-hot vector action_onehot = [0.] * self.game_state.env.action_space.n action_onehot[action] = 1. 
# compute grad cam for conv layer 3 activations, gradients = self.local_network.evaluate_grad_cam( sess, test_cam_si[i], action_onehot) cam = grad_cam(activations, gradients) cam_img = visualize_cam(cam) side_by_side = generate_image_for_cam_video( test_cam_si[i], cam_img, global_t, i, self.action_meaning[action]) cam_side_img.append(side_by_side) return cam_side_img def generate_cam_video(self, sess, time_per_step, global_t, folder, demo_memory_cam, demo_cam_human=False): # use one demonstration data to record cam # only need to make movie for demo data once cam_side_img = self.generate_cam(sess, demo_memory_cam, global_t) path = '/frames/demo-cam_side_img' if demo_cam_human: path += '_human' make_movie(cam_side_img, folder + '{}{ep:010d}'.format(path, ep=(global_t)), duration=len(cam_side_img) * time_per_step, true_image=True, salience=False) del cam_side_img def testing_model(self, sess, max_steps, global_t, folder, demo_memory_cam=None, demo_cam_human=False): logger.info("Testing model at global_t={}...".format(global_t)) # copy weights from shared to local sess.run(self.sync) if demo_memory_cam is not None: self.generate_cam_video(sess, 0.03, global_t, folder, demo_memory_cam, demo_cam_human) return else: self.game_state.reset(hard_reset=True) max_steps += 4 test_memory = ReplayMemory( 84, 84, np.random.RandomState(), max_steps=max_steps, phi_length=4, num_actions=self.game_state.env.action_space.n, wrap_memory=False, full_state_size=self.game_state.clone_full_state().shape[0]) for _ in range(4): test_memory.add(self.game_state.x_t, 0, self.game_state.reward, self.game_state.terminal, self.game_state.lives, fullstate=self.game_state.full_state) episode_buffer = [] test_memory_cam = [] total_reward = 0 total_steps = 0 episode_reward = 0 episode_steps = 0 n_episodes = 0 terminal = False while True: #pi_ = self.local_network.run_policy(sess, self.game_state.s_t) test_memory_cam.append(self.game_state.s_t) episode_buffer.append(self.game_state.get_screen_rgb()) pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) #action = self.choose_action(logits_) action = np.argmax(pi_) # take action self.game_state.step(action) terminal = self.game_state.terminal memory_full = episode_steps == max_steps - 5 terminal_ = terminal or memory_full # store the transition to replay memory test_memory.add(self.game_state.x_t1, action, self.game_state.reward, terminal_, self.game_state.lives, fullstate=self.game_state.full_state1) # update the old values episode_reward += self.game_state.reward episode_steps += 1 # s_t = s_t1 self.game_state.update() if terminal_: if get_wrapper_by_name( self.game_state.env, 'EpisodicLifeEnv').was_real_done or memory_full: time_per_step = 0.03 images = np.array(episode_buffer) make_movie(images, folder + '/frames/image{ep:010d}'.format(ep=global_t), duration=len(images) * time_per_step, true_image=True, salience=False) break self.game_state.reset(hard_reset=False) if self.use_lstm: self.local_network.reset_state() total_reward = episode_reward total_steps = episode_steps log_data = (global_t, self.thread_index, total_reward, total_steps) logger.info( "test: global_t={} worker={} final score={} final steps={}".format( *log_data)) self.generate_cam_video(sess, 0.03, global_t, folder, np.array(test_memory_cam)) test_memory.save(name='test_cam', folder=folder, resize=True) if self.use_lstm: self.local_network.reset_state() return def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None): logger.info("Evaluate policy at 
global_t={}...".format(global_t)) # copy weights from shared to local sess.run(self.sync) if demo_memory_cam is not None and global_t % 5000000 == 0: self.generate_cam_video(sess, 0.03, global_t, folder, demo_memory_cam) episode_buffer = [] self.game_state.reset(hard_reset=True) episode_buffer.append(self.game_state.get_screen_rgb()) total_reward = 0 total_steps = 0 episode_reward = 0 episode_steps = 0 n_episodes = 0 while max_steps > 0: #pi_ = self.local_network.run_policy(sess, self.game_state.s_t) pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) if False: action = np.random.choice(range(self.action_size), p=pi_) else: action = self.choose_action(logits_) if self.use_pretrained_model_as_advice: psi = self.psi if self.psi > 0.001 else 0.0 if psi > np.random.rand(): model_pi = self.pretrained_model.run_policy( self.pretrained_model_sess, self.game_state.s_t) model_action, confidence = self.choose_action_with_high_confidence( model_pi, exclude_noop=False) if model_action > self.shaping_actions and confidence >= self.advice_confidence: action = model_action # take action self.game_state.step(action) terminal = self.game_state.terminal if n_episodes == 0 and global_t % 5000000 == 0: episode_buffer.append(self.game_state.get_screen_rgb()) episode_reward += self.game_state.reward episode_steps += 1 max_steps -= 1 # s_t = s_t1 self.game_state.update() if terminal: if get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done: if n_episodes == 0 and global_t % 5000000 == 0: time_per_step = 0.0167 images = np.array(episode_buffer) make_movie( images, folder + '/frames/image{ep:010d}'.format(ep=global_t), duration=len(images) * time_per_step, true_image=True, salience=False) episode_buffer = [] n_episodes += 1 score_str = colored("score={}".format(episode_reward), "magenta") steps_str = colored("steps={}".format(episode_steps), "blue") log_data = (global_t, self.thread_index, n_episodes, score_str, steps_str, total_steps) logger.debug( "test: global_t={} worker={} trial={} {} {} total_steps={}" .format(*log_data)) total_reward += episode_reward total_steps += episode_steps episode_reward = 0 episode_steps = 0 self.game_state.reset(hard_reset=False) if self.use_lstm: self.local_network.reset_state() if n_episodes == 0: total_reward = episode_reward total_steps = episode_steps else: # (timestep, total sum of rewards, total # of steps before terminating) total_reward = total_reward / n_episodes total_steps = total_steps // n_episodes log_data = (global_t, self.thread_index, total_reward, total_steps, n_episodes) logger.info( "test: global_t={} worker={} final score={} final steps={} # trials={}" .format(*log_data)) self.record_summary(score=total_reward, steps=total_steps, episodes=n_episodes, global_t=global_t, mode='Test') # reset variables used in training self.episode_reward = 0 self.episode_steps = 0 self.game_state.reset(hard_reset=True) self.last_rho = 0. 
if self.is_demo_thread: self.replay_mem_reset() if self.use_lstm: self.local_network.reset_state() return total_reward, total_steps, n_episodes def pretrain_init(self, demo_memory): self.demo_memory_size = len(demo_memory) self.demo_memory = demo_memory self.replay_mem_reset() def replay_mem_reset(self, demo_memory_idx=None): if demo_memory_idx is not None: self.demo_memory_idx = demo_memory_idx else: # new random episode self.demo_memory_idx = np.random.randint(0, self.demo_memory_size) self.demo_memory_count = np.random.randint( 0, len(self.demo_memory[self.demo_memory_idx]) - self.local_t_max) # if self.demo_memory_count+self.local_t_max < len(self.demo_memory[self.demo_memory_idx]): # self.demo_memory_max_count = np.random.randint(self.demo_memory_count+self.local_t_max, len(self.demo_memory[self.demo_memory_idx])) # else: # self.demo_memory_max_count = len(self.demo_memory[self.demo_memory_idx]) logger.debug( "worker={} mem_reset demo_memory_idx={} demo_memory_start={}". format(self.thread_index, self.demo_memory_idx, self.demo_memory_count)) s_t, action, reward, terminal = self.demo_memory[self.demo_memory_idx][ self.demo_memory_count] self.demo_memory_action = action self.demo_memory_reward = reward self.demo_memory_terminal = terminal if not self.demo_memory[self.demo_memory_idx].imgs_normalized: self.demo_memory_s_t = s_t * (1.0 / 255.0) else: self.demo_memory_s_t = s_t def replay_mem_process(self): self.demo_memory_count += 1 s_t, action, reward, terminal = self.demo_memory[self.demo_memory_idx][ self.demo_memory_count] self.demo_memory_next_action = action self.demo_memory_reward = reward self.demo_memory_terminal = terminal if not self.demo_memory[self.demo_memory_idx].imgs_normalized: self.demo_memory_s_t1 = s_t * (1.0 / 255.0) else: self.demo_memory_s_t1 = s_t def replay_mem_update(self): self.demo_memory_action = self.demo_memory_next_action self.demo_memory_s_t = self.demo_memory_s_t1 def demo_process(self, sess, global_t, demo_memory_idx=None): states = [] actions = [] rewards = [] values = [] demo_ended = False terminal_end = False # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if self.use_lstm: reset_lstm_state = False start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.demo_t_max): pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.demo_memory_s_t) action = self.demo_memory_action time.sleep(0.0025) states.append(self.demo_memory_s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % self.log_interval == 0): log_msg = "lg={}".format( np.array_str(logits_, precision=4, suppress_small=True)) log_msg += " pi={}".format( np.array_str(pi_, precision=4, suppress_small=True)) log_msg += " V={:.4f}".format(value_) logger.debug(log_msg) # process replay memory self.replay_mem_process() # receive replay memory result reward = self.demo_memory_reward terminal = self.demo_memory_terminal self.episode_reward += reward if self.reward_type == 'LOG': reward = np.sign(reward) * np.log(1 + np.abs(reward)) elif self.reward_type == 'CLIP': # clip reward reward = np.sign(reward) rewards.append(reward) self.local_t += 1 self.episode_steps += 1 # demo_memory_s_t1 -> demo_memory_s_t self.replay_mem_update() s_t = self.demo_memory_s_t if terminal or self.demo_memory_count == len( self.demo_memory[self.demo_memory_idx]): logger.debug("worker={} score={}".format( self.thread_index, self.episode_reward)) demo_ended = True if terminal: terminal_end = True 
if self.use_lstm: self.local_network.reset_state() else: # some demo episodes don't reach a terminal state if self.use_lstm: reset_lstm_state = True self.episode_reward = 0 self.episode_steps = 0 self.replay_mem_reset(demo_memory_idx=demo_memory_idx) break cumulative_reward = 0.0 if not terminal_end: cumulative_reward = self.local_network.run_value(sess, s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_state = [] batch_action = [] batch_adv = [] batch_cumulative_reward = [] # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): cumulative_reward = ri + self.gamma * cumulative_reward advantage = cumulative_reward - vi # convert action to one-hot vector a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumulative_reward.append(cumulative_reward) cur_learning_rate = self._anneal_learning_rate(global_t) #* 0.005 if self.use_lstm: batch_state.reverse() batch_action.reverse() batch_adv.reverse() batch_cumulative_reward.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_action)], self.learning_rate_input: cur_learning_rate }) # some demo episodes don't reach a terminal state if reset_lstm_state: self.local_network.reset_state() reset_lstm_state = False else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.learning_rate_input: cur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= self.performance_log_interval): self.prev_local_t += self.performance_log_interval # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, demo_ended def process(self, sess, global_t, train_rewards): states = [] actions = [] rewards = [] values = [] rho = [] terminal_end = False # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if self.use_lstm: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.local_t_max): pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) action = self.choose_action(logits_) model_pi = None confidence = 0. if self.use_pretrained_model_as_advice: self.psi = 0.9999 * ( 0.9999** global_t) if self.psi > 0.001 else 0.0 # 0.99995 works if self.psi > np.random.rand(): model_pi = self.pretrained_model.run_policy( self.pretrained_model_sess, self.game_state.s_t) model_action, confidence = self.choose_action_with_high_confidence( model_pi, exclude_noop=False) if (model_action > self.shaping_actions and confidence >= self.advice_confidence): action = model_action self.advice_ctr += 1 if self.use_pretrained_model_as_reward_shaping: #if action > 0: if model_pi is None: model_pi = self.pretrained_model.run_policy( self.pretrained_model_sess, self.game_state.s_t) confidence = model_pi[action][0][0] if (action > self.shaping_actions and confidence >= self.advice_confidence): #rho.append(round(confidence, 5)) rho.append(self.shaping_reward) self.shaping_ctr += 1 else: rho.append(0.)
#self.shaping_ctr += 1 states.append(self.game_state.s_t) actions.append(action) values.append(value_) if self.thread_index == 0 and self.local_t % self.log_interval == 0: log_msg1 = "lg={}".format( np.array_str(logits_, precision=4, suppress_small=True)) log_msg2 = "pi={}".format( np.array_str(pi_, precision=4, suppress_small=True)) log_msg3 = "V={:.4f}".format(value_) if self.use_pretrained_model_as_advice: log_msg3 += " psi={:.4f}".format(self.psi) logger.debug(log_msg1) logger.debug(log_msg2) logger.debug(log_msg3) # process game self.game_state.step(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal if self.use_pretrained_model_as_reward_shaping: if reward < 0 or reward > 0: rho[i] = 0. j = i - 1 while j > i - 5: if rewards[j] != 0: break rho[j] = 0. j -= 1 # if self.game_state.loss_life: # if self.game_state.gain_life or reward > 0: # rho[i] = 0. # j = i-1 # k = 1 # while j >= 0: # if rewards[j] != 0: # rho[j] = self.shaping_reward * (self.gamma ** -1) # break # rho[j] = self.shaping_reward / k # j -= 1 # k += 1 self.episode_reward += reward if self.reward_type == 'LOG': reward = np.sign(reward) * np.log(1 + np.abs(reward)) elif self.reward_type == 'CLIP': # clip reward reward = np.sign(reward) rewards.append(reward) self.local_t += 1 self.episode_steps += 1 global_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: if get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done: log_msg = "train: worker={} global_t={}".format( self.thread_index, global_t) if self.use_pretrained_model_as_advice: log_msg += " advice_ctr={}".format(self.advice_ctr) if self.use_pretrained_model_as_reward_shaping: log_msg += " shaping_ctr={}".format(self.shaping_ctr) score_str = colored("score={}".format(self.episode_reward), "magenta") steps_str = colored("steps={}".format(self.episode_steps), "blue") log_msg += " {} {}".format(score_str, steps_str) logger.debug(log_msg) train_rewards['train'][global_t] = (self.episode_reward, self.episode_steps) self.record_summary(score=self.episode_reward, steps=self.episode_steps, episodes=None, global_t=global_t, mode='Train') self.episode_reward = 0 self.episode_steps = 0 terminal_end = True self.last_rho = 0. if self.use_lstm: self.local_network.reset_state() self.game_state.reset(hard_reset=False) break cumulative_reward = 0.0 if not terminal: cumulative_reward = self.local_network.run_value( sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_state = [] batch_action = [] batch_adv = [] batch_cumulative_reward = [] if self.use_pretrained_model_as_reward_shaping: rho.reverse() rho.append(self.last_rho) self.last_rho = rho[0] i = 0 # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): # Wiewiora et al.(2003) Principled Methods for Advising RL agents # Look-Back Advice #F = rho[i] - (self.shaping_gamma**-1) * rho[i+1] #F = rho[i] - self.shaping_gamma * rho[i+1] f = (self.shaping_gamma**-1) * rho[i] - rho[i + 1] if (i == 0 and terminal) or (f != 0 and (ri > 0 or ri < 0)): #logger.warn("averted additional F in absorbing state") f = 0. # if (F < 0. and ri > 0) or (F > 0. and ri < 0): # logger.warn("Negative reward shaping F={} ri={} rho[s]={} rhos[s-1]={}".format(F, ri, rho[i], rho[i+1])) # F = 0.
cumulative_reward = (ri + f * self.shaping_factor ) + self.gamma * cumulative_reward advantage = cumulative_reward - vi a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumulative_reward.append(cumulative_reward) i += 1 else: def h(z, eps=10**-2): return (np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.)) + (eps * z) def h_inv(z, eps=10**-2): return np.sign(z) * (np.square( (np.sqrt(1 + 4 * eps * (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1) def h_log(z, eps=.6): return (np.sign(z) * np.log(1. + np.abs(z)) * eps) def h_inv_log(z, eps=.6): return np.sign(z) * (np.exp(np.abs(z) / eps) - 1) # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): if self.transformed_bellman: cumulative_reward = h(ri + self.gamma * h_inv(cumulative_reward)) else: cumulative_reward = ri + self.gamma * cumulative_reward advantage = cumulative_reward - vi # convert action to one-hot vector a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumulative_reward.append(cumulative_reward) cur_learning_rate = self._anneal_learning_rate(global_t) if self.use_lstm: batch_state.reverse() batch_action.reverse() batch_adv.reverse() batch_cumulative_reward.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_action)], self.learning_rate_input: cur_learning_rate }) else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.learning_rate_input: cur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= self.performance_log_interval): self.prev_local_t += self.performance_log_interval elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time logger.info( "Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end
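# --- Illustrative sketch (not taken from the snippets above) ---
# Quick numerical check that the transformed-Bellman mapping h and its
# inverse h_inv used in the preceding process() really invert each other,
# which is what makes R = h(r + gamma * h_inv(R)) well defined.
import numpy as np

def h(z, eps=1e-2):
    return np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.) + eps * z

def h_inv(z, eps=1e-2):
    return np.sign(z) * (np.square(
        (np.sqrt(1 + 4 * eps * (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1)

zs = np.linspace(-50., 50., 11)
assert np.allclose(h_inv(h(zs)), zs)   # h_inv undoes h
assert np.allclose(h(h_inv(zs)), zs)   # and vice versa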
def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device=None, pretrained_model=None, pretrained_model_sess=None, advice=False, reward_shaping=False): assert self.action_size != -1 self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.use_pretrained_model_as_advice = advice self.use_pretrained_model_as_reward_shaping = reward_shaping logger.info("thread_index: {}".format(self.thread_index)) logger.info("local_t_max: {}".format(self.local_t_max)) logger.info("use_lstm: {}".format( colored(self.use_lstm, "green" if self.use_lstm else "red"))) logger.info("action_size: {}".format(self.action_size)) logger.info("entropy_beta: {}".format(self.entropy_beta)) logger.info("gamma: {}".format(self.gamma)) logger.info("reward_type: {}".format(self.reward_type)) logger.info("finetune_upper_layers_only: {}".format( colored(self.finetune_upper_layers_only, "green" if self.finetune_upper_layers_only else "red"))) logger.info("use_pretrained_model_as_advice: {}".format( colored( self.use_pretrained_model_as_advice, "green" if self.use_pretrained_model_as_advice else "red"))) logger.info("use_pretrained_model_as_reward_shaping: {}".format( colored( self.use_pretrained_model_as_reward_shaping, "green" if self.use_pretrained_model_as_reward_shaping else "red"))) logger.info("transformed_bellman: {}".format( colored(self.transformed_bellman, "green" if self.transformed_bellman else "red"))) logger.info("clip_norm: {}".format(self.clip_norm)) logger.info("use_grad_cam: {}".format( colored(self.use_grad_cam, "green" if self.use_grad_cam else "red"))) if self.use_lstm: GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015 self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device) else: GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015 self.local_network = GameACFFNetwork(self.action_size, thread_index, device) with tf.device(device): self.local_network.prepare_loss(entropy_beta=self.entropy_beta, critic_lr=0.5) local_vars = self.local_network.get_vars if self.finetune_upper_layers_only: local_vars = self.local_network.get_vars_upper var_refs = [v._ref() for v in local_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs) global_vars = global_network.get_vars if self.finetune_upper_layers_only: global_vars = global_network.get_vars_upper with tf.device(device): if self.clip_norm is not None: self.gradients, grad_norm = tf.clip_by_global_norm( self.gradients, self.clip_norm) self.gradients = list(zip(self.gradients, global_vars())) self.apply_gradients = grad_applier.apply_gradients(self.gradients) #self.apply_gradients = grad_applier.apply_gradients( # global_vars(), # self.gradients) self.sync = self.local_network.sync_from( global_network, upper_layers_only=self.finetune_upper_layers_only) self.game_state = GameState(env_id=self.env_id, display=False, no_op_max=30, human_demo=False, episode_life=True) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.episode_steps = 0 # variable controlling log output self.prev_local_t = 0 self.is_demo_thread = False with tf.device(device): if self.use_grad_cam: self.action_meaning = self.game_state.env.unwrapped.get_action_meanings( ) self.local_network.build_grad_cam_grads() self.pretrained_model = pretrained_model self.pretrained_model_sess = pretrained_model_sess self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0 
self.advice_ctr = 0 self.shaping_ctr = 0 self.last_rho = 0. if self.use_pretrained_model_as_advice or self.use_pretrained_model_as_reward_shaping: assert self.pretrained_model is not None
def make_network(): if USE_LSTM: return GameACLSTMNetwork(ACTION_SIZE, -1, device) else: return GameACFFNetwork(ACTION_SIZE, device)
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device, action_size, gamma, local_t_max, entropy_beta, agent_type, performance_log_interval, log_level, random_seed): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.action_size = action_size self.gamma = gamma self.local_t_max = local_t_max self.agent_type = agent_type self.performance_log_interval = performance_log_interval self.log_level = log_level if self.agent_type == 'LSTM': self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device) else: self.local_network = GameACFFNetwork(self.action_size, thread_index, device) self.local_network.prepare_loss(entropy_beta) with tf.device(device): var_refs = [] variables = self.local_network.get_vars() for v in variables: var_refs.append(v) self.gradients = tf.gradients(self.local_network.total_loss, var_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.gradients) self.sync = self.local_network.sync_from(global_network) np.random.seed(random_seed) self.game_state = GameState(random_seed * thread_index, self.action_size) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.learn_rate = self.initial_learning_rate self.reset_counters() self.episode = 0 # variable controlling log output self.prev_local_t = 0 def reset_counters(self): self.total_q_max = 0 self.episode_reward = 0 self.episode_actions = [] self.passed_obst = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 self.learn_rate = learning_rate return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def set_start_time(self, start_time): self.start_time = start_time def process(self, sess, global_t, statistics): states = [] actions = [] rewards = [] values = [] terminal_end = False # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if self.agent_type == 'LSTM': start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.local_t_max): pi_, value_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) # process game try: # Bitblt may raise an error, but we can safely ignore it; otherwise the thread would die self.game_state.process(action) except Exception as e: print(e) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal steps = self.game_state.steps passed = self.game_state.passed_obst self.episode_reward += reward # clip reward rewards.append(np.clip(reward, -1, 1)) self.local_t += 1 # s_t1 -> s_t self.game_state.update() self.total_q_max += np.max(pi_) self.episode_actions.append(action) self.passed_obst = self.game_state.passed_obst if terminal: terminal_end = True self.episode += 1 if self.log_level == 'FULL': reward_steps = format( float(self.episode_reward) / float(steps), '.4f') print("THREAD: {} / EPISODE: {} / TOTAL STEPS: {} / STEPS: {} / PASSED OBST: {} / REWARD: {} / REWARD/STEP: {}".format( self.thread_index, self.episode,
global_t, steps, self.passed_obst, self.episode_reward, reward_steps)) statistics.update(global_t, self.episode_reward, self.total_q_max, steps, self.episode_actions, self.learn_rate, self.passed_obst) self.reset_counters() self.game_state.reset() if self.agent_type == 'LSTM': self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accumulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + self.gamma * R td = R - Vi a = np.zeros([self.action_size]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) cur_learning_rate = self._anneal_learning_rate(global_t) if self.agent_type == 'LSTM': batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_a)], self.learning_rate_input: cur_learning_rate }) else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.learning_rate_input: cur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= self.performance_log_interval) and (self.log_level == 'FULL'): self.prev_local_t += self.performance_log_interval elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
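# --- Illustrative sketch (not taken from the snippets above) ---
# Stand-alone version of the bootstrapped n-step return / advantage batch
# that every process() variant builds from the reversed rollout before the
# gradient update. Names are illustrative; action_size is an assumed value.
import numpy as np

def build_batch(states, actions, rewards, values, bootstrap_value,
                gamma=0.99, action_size=2):
    R = bootstrap_value                 # 0.0 when the episode terminated
    batch_si, batch_a, batch_td, batch_R = [], [], [], []
    for si, ai, ri, Vi in zip(reversed(states), reversed(actions),
                              reversed(rewards), reversed(values)):
        R = ri + gamma * R              # discounted n-step return
        td = R - Vi                     # advantage estimate (R - V(s))
        a = np.zeros(action_size)
        a[ai] = 1                       # one-hot encoded action
        batch_si.append(si)
        batch_a.append(a)
        batch_td.append(td)
        batch_R.append(R)
    return batch_si, batch_a, batch_td, batch_R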
if USE_GPU: device = "/gpu:0" initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) global_t = 0 stop_requested = False global_game = DoomGameState(scenario_path="scenarios/cig.cfg") if USE_LSTM: global_network = GameACLSTMNetwork(global_game.get_action_size(), -1, device) else: global_network = GameACFFNetwork(global_game.get_action_size(), -1, device) del global_game training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) for i in range(PARALLEL_SIZE): game = DoomGameState(scenario_path="scenarios/cig.cfg")
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if USE_LSTM: self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: self.local_network = GameACFFNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize(self.local_network.total_loss, self.local_network.get_vars()) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i #fail safe return len(values) - 1 def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run(self.reset_gradients) # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print("pi=", pi_) print(" V=", value_) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append(np.clip(reward, -1, 1)) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print("score=", self.episode_reward) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() if USE_LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if USE_LSTM: batch_si.reverse() batch_a.reverse() 
batch_td.reverse() batch_R.reverse() sess.run(self.accum_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_a)] }) else: sess.run(self.accum_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R }) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run(self.apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) if (self.thread_index == 0) and (self.local_t % 100) == 0: print("TIMESTEP", self.local_t) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
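# --- Illustrative sketch (not taken from the snippets above) ---
# The _anneal_learning_rate() used by these threads decays the learning rate
# linearly from initial_learning_rate to 0 over max_global_time_step.
def anneal_learning_rate(initial_learning_rate, global_time_step,
                         max_global_time_step):
    learning_rate = initial_learning_rate * (
        max_global_time_step - global_time_step) / max_global_time_step
    return max(learning_rate, 0.0)

# e.g. with an initial rate of 7e-4 (an assumed value) and a 40M-step budget:
# step 0 -> 7e-4, step 20M -> 3.5e-4, step 40M and beyond -> 0.0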
class A3CTrainingThread(object): def __init__(self, thread_index, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_episode, device, arrived_jobs, condition): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_episode = max_global_time_episode # use thread_index (i.e. the machine id) to fetch all operations processed on this machine self.operations = get_data_by_machine(thread_index) self.condition = condition self.is_terminal_counted = False self.last_episode_reward = 0 if USE_LSTM: # the first argument is the action size; here we pass the number of operations to be processed on this machine self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: # the first argument is the action size; here we pass the number of operations to be processed on this machine self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device) self.local_network.prepare_loss(ENTROPY_BETA) with tf.device(device): var_refs = [v._ref() for v in self.local_network.get_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) self.apply_gradients = grad_applier.apply_gradients( self.local_network.get_vars(), self.gradients) # self.sync = self.local_network.sync_from(global_network) # self.game_state = GameState(113 * thread_index) # create the environment for these operations self.env = JspEnv(self.operations, thread_index, arrived_jobs) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # variable controlling log output self.prev_local_t = 0 def _anneal_learning_rate(self, global_time_step): # return self.initial_learning_rate learning_rate = self.initial_learning_rate * ( self.max_global_time_episode - global_time_step) / self.max_global_time_episode if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values, use_max_choice): # if len(self.env.action_space) != 1: # print('\n------------------------------------------------' # 'machine = {}'.format(self.thread_index)) # print('action space = {}'.format(self.env.action_space)) # print('pi = {}'.format(pi_values)) # # for i in range(len(pi_values)): # if i not in self.env.action_space: # pi_values[i] = 0 # sum = np.sum(pi_values) # if sum == 0: # return np.random.choice(self.env.action_space) # else: # for i in range(len(pi_values)): # pi_values[i] = pi_values[i] / sum # if use_max_choice: # if len(self.env.action_space) != 1: # pi_values[self.env.machine_size] = 0 # return np.argmax(pi_values) # else: # return np.random.choice(range(len(pi_values)), p=pi_values) return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def process(self, sess, global_t, summary_writer, summary_op, score_input, use_max_choice): states = [] actions = [] rewards = [] values = [] terminal_end = False # copy weights from shared to local # sess.run( self.sync ) start_local_t = self.local_t if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop # for i in range(LOCAL_T_MAX): while True: # pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) pi_, value_ = self.local_network.run_policy_and_value( sess, self.env.local_state) action = self.choose_action(pi_, use_max_choice) # states.append(self.game_state.s_t) states.append(self.env.local_state) actions.append(action) values.append(value_) # if
(self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): # if (self.thread_index == 0): # print('machine index: ' + str(self.thread_index)) # print('arrived jobs:{}'.format(self.env.arrived_jobs[self.thread_index])) # print('actions:{}'.format(action)) # print('clock:{}'.format(self.env.clock)) # print("action space = {}".format(self.env.action_space)) # # print("pi={}".format(pi_)) # print(" V={}".format(value_)) ''' # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal ''' new_state, reward, terminal, info = self.env.step(action) self.episode_reward += reward # clip reward # rewards.append( np.clip(reward, -1, 1) ) rewards.append(reward) self.local_t += 1 # s_t1 -> s_t # self.game_state.update() if terminal: terminal_end = True # print("score={}".format(self.episode_reward)) # print("complete time={}".format(self.env.clock)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) # print('\n----------------------------------------------------') # print('machine index: ' + str(self.thread_index)) # print('arrived jobs:{}'.format(self.env.arrived_jobs[self.thread_index])) # print('actions:{}'.format(action)) # print('clock:{}'.format(self.env.clock)) # print("jobs size = {}".format(len(self.env.init_operations))) # print("action space = {}".format(self.env.action_space)) # print("pi={}".format(pi_)) # print(" V={}".format(value_)) # print('----------------------------------------------------\n') self.complete_time = self.env.clock self.last_episode_reward = self.episode_reward self.episode_reward = 0 # self.game_state.reset() self.env.reset() if USE_LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: # R = self.local_network.run_value(sess, self.game_state.s_t) R = self.local_network.run_value(sess, self.env.local_state) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi # a = np.zeros([ACTION_SIZE]) a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) cur_learning_rate = self._anneal_learning_rate(global_t) if USE_LSTM: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_a)], self.learning_rate_input: cur_learning_rate }) else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.learning_rate_input: cur_learning_rate }) # if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): # self.prev_local_t += PERFORMANCE_LOG_INTERVAL # elapsed_time = time.time() - self.start_time # steps_per_sec = global_t / elapsed_time # print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( # global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, self.complete_time, self.last_episode_reward
if not settings.mode == 'display' and not settings.mode == 'visualize': device = "/cpu:0" if settings.use_gpu: device = "/gpu:0" initial_learning_rates = log_uniform(settings.initial_alpha_low, settings.initial_alpha_high, settings.parallel_agent_size) global_t = 0 stop_requested = False if settings.agent_type == 'LSTM': global_network = GameACLSTMNetwork(settings.action_size, -1, device) else: global_network = GameACFFNetwork(settings.action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=settings.rmsp_alpha, momentum=0.0, epsilon=settings.rmsp_epsilon, clip_norm=settings.grad_norm_clip, device=device) for i in range(settings.parallel_agent_size): training_thread = A3CTrainingThread( i, global_network, initial_learning_rates[i], learning_rate_input,
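# The setup above draws one initial learning rate per parallel agent with
# log_uniform(initial_alpha_low, initial_alpha_high, parallel_agent_size).  The
# vectorised sampler is not shown in this snippet, so the following is a sketch
# under the assumption that it interpolates uniformly in log-space (consistent with
# the scalar log_uniform defined later in this document):
import math
import random

def log_uniform(lo, hi, size):
    """Draw `size` learning rates spread log-uniformly between lo and hi."""
    rates = []
    for _ in range(size):
        rate = random.random()  # interpolation point in [0, 1)
        rates.append(math.exp(math.log(lo) * (1.0 - rate) + math.log(hi) * rate))
    return rates

# e.g. log_uniform(1e-4, 1e-2, 8) -> eight rates spanning two orders of magnitude.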
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, max_global_time_step, device): self.learn_rate = 0 self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device) self.local_network.prepare_loss(ENTROPY_BETA) with tf.device(device): var_refs = [v._ref() for v in self.local_network.get_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) # # self.apply_gradients = tf.train.RMSPropOptimizer( self.learning_rate_input).apply_gradients( zip(self.gradients, global_network.get_vars())) self.sync = self.local_network.sync_from(global_network) self.game_state = Game() self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.prev_local_t = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def process(self, sess, global_t): states = [] actions = [] rewards = [] values = [] temp_reward = 0 terminal_end = False sess.run(self.sync) start_local_t = self.local_t for i in range(0, LOCAL_T_MAX): # while True: # sleep(100) pi_, value_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) # print(pi_) action = self.choose_action(pi_) # print(action) states.append(self.game_state.s_t) actions.append(action) values.append(value_) temp_action = [0, 0, 0] temp_action[action] = 1 self.game_state.process(temp_action) # receive game result reward = self.game_state.reward # print(self.game_state.terminal) terminal = self.game_state.terminal self.episode_reward += reward temp_reward = self.episode_reward # clip reward rewards.append(reward) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print("score={}".format(self.episode_reward)) print("process:", self.thread_index, " learn_rate:", self.learn_rate) self.episode_reward = 0 self.game_state.reset() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) cur_learning_rate = self._anneal_learning_rate(global_t) self.learn_rate = cur_learning_rate sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.learning_rate_input: cur_learning_rate }) diff_local_t = self.local_t - start_local_t return diff_local_t, temp_reward
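# _anneal_learning_rate above is a linear decay from the initial rate to zero over
# max_global_time_step steps.  A standalone sketch with illustrative numbers (the
# constants below are examples, not values taken from this code):
def anneal_learning_rate(initial_lr, global_step, max_steps):
    """Linearly decay the learning rate and clamp it at 0 past max_steps."""
    lr = initial_lr * (max_steps - global_step) / max_steps
    return max(lr, 0.0)

# anneal_learning_rate(7e-4, 0, 10**7)           -> 0.0007
# anneal_learning_rate(7e-4, 5 * 10**6, 10**7)   -> 0.00035
# anneal_learning_rate(7e-4, 2 * 10**7, 10**7)   -> 0.0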
def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device, options): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.options = options if options.use_lstm: self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device) else: self.local_network = GameACFFNetwork(options.action_size, device) self.local_network.prepare_loss(options.entropy_beta) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.indent = " |" * self.thread_index self.steps = 0 self.no_reward_steps = 0 self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0) if self.options.train_episode_steps > 0: self.max_reward = 0.0 self.max_episode_reward = 0.0 self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = [] self.episode_scores = Episode_scores(options) self.tes = self.options.train_episode_steps if self.options.tes_list is not None: self.tes = self.options.tes_list[thread_index] print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes)) self.initial_lives = self.game_state.initial_lives self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1) if self.options.record_new_record_dir is not None: if self.thread_index == 0: if not os.path.exists(self.options.record_new_record_dir): os.makedirs(self.options.record_new_record_dir) self.episode_screens = [] if self.options.record_new_room_dir is not None: if self.thread_index == 0: if not os.path.exists(self.options.record_new_room_dir): os.makedirs(self.options.record_new_room_dir) self.episode_screens = [] self.greediness = options.greediness self.repeat_action_ratio = options.repeat_action_ratio self.prev_action = 0
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if USE_LSTM: self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device) self.local_network.prepare_loss(ENTROPY_BETA) with tf.device(device): var_refs = [v._ref() for v in self.local_network.get_vars()] self.gradients = tf.gradients( self.local_network.total_loss, var_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.gradients ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # variable controling log output self.prev_local_t = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append( np.clip(reward, -1, 1) ) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print("score={}".format(self.episode_reward)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() if USE_LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) cur_learning_rate = self._anneal_learning_rate(global_t) if USE_LSTM: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() 
sess.run( self.apply_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size : [len(batch_a)], self.learning_rate_input: cur_learning_rate } ) else: sess.run( self.apply_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.learning_rate_input: cur_learning_rate} ) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
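# process() begins with sess.run(self.sync), which copies the shared (global)
# network weights into the thread-local copy before each rollout.  sync_from is not
# shown here; the sketch below assumes the usual implementation as a group of
# tf.assign ops (an assumption, not this repository's actual code):
import tensorflow as tf

def make_sync_op(local_vars, global_vars):
    """Return a single op that assigns every global variable to its local counterpart."""
    assigns = [tf.assign(dst, src) for dst, src in zip(local_vars, global_vars)]
    return tf.group(*assigns)

# usage sketch:
#   sync = make_sync_op(local_network.get_vars(), global_network.get_vars())
#   sess.run(sync)  # once per rollout, before collecting experience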
def run_a3c_test(args): """Run A3C testing.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = args.folder else: folder = 'results/a3c/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 \ or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' if args.padding == 'SAME': end_str += '_same' folder += end_str folder = pathlib.Path(folder) demo_memory_cam = None demo_cam_human = False if args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME) demo_memory_folder = pathlib.Path(demo_memory_folder) if args.demo_cam_id is not None: demo_cam_human = True demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] logger.info("loaded demo {} for testing CAM".format( args.demo_cam_id)) else: demo_cam_folder = pathlib.Path(args.demo_cam_folder) demo_cam = ReplayMemory() demo_cam.load(name='test_cam', folder=demo_cam_folder) logger.info("loaded demo {} for testing CAM".format( str(demo_cam_folder / 'test_cam'))) demo_memory_cam = np.zeros( (len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0, _, _, _, _, _, t1, _ = demo_cam[i] demo_memory_cam[i] = np.copy(s0) del demo_cam device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( 
learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = \ args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" if args.use_lstm: local_network = GameACLSTMNetwork(action_size, 0, device) else: local_network = GameACFFNetwork( action_size, 0, device, padding=args.padding, in_shape=input_shape) testing_thread = A3CTrainingThread( 0, global_network, local_network, initial_learning_rate, learning_rate_input, grad_applier, 0, device=device) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' # TODO: make this an argument transfer_folder += end_str transfer_folder = pathlib.Path(transfer_folder) transfer_folder /= 'transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list, ) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f] if len(not_initialized_vars): 
sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) else: logger.warning("Could not find old checkpoint") def test_function(): nonlocal global_t if args.use_transfer: from_folder = str(transfer_folder).split('/')[-2] else: from_folder = str(folder).split('/')[-1] from_folder = pathlib.Path(from_folder) save_folder = 'results/test_model/a3c' / from_folder prepare_dir(str(save_folder), empty=False) prepare_dir(str(save_folder / 'frames'), empty=False) # Evaluate model before training if not stop_requested: testing_thread.testing_model( sess, args.eval_max_steps, global_t, save_folder, demo_memory_cam=demo_memory_cam, demo_cam_human=demo_cam_human) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) test_thread = threading.Thread(target=test_function, args=()) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) test_thread.start() print('Press Ctrl+C to stop') test_thread.join() sess.close()
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if USE_LSTM: self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: self.local_network = GameACFFNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize(self.local_network.total_loss, self.local_network.get_vars()) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * \ (self.max_global_time_step - global_time_step) / \ self.max_global_time_step assert learning_rate > 0, 'Learning rate {} is not >0'.format( learning_rate) return learning_rate def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] # reset accumulated gradients sess.run(self.reset_gradients) # copy weights from shared to local sess.run(self.sync) if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop start_local_t = self.local_t terminal_end = False for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) # Debug output for progress if (self.thread_index == 0) and (self.local_t % 100) == 0: print(('local_t = {:10} pi = ' + '{:7.5f} ' * len(pi_) + ' V = {:8.4f} (thread {})').format(self.local_t, *pi_, value_, self.thread_index)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward # TODO: Does this make sense? rewards.append(np.clip(reward, -1, 1)) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print ("score=", self.episode_reward) self._record_score( sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() if USE_LSTM: self.local_network.reset_state() break # Compute and accmulate gradients R = 0.0 if terminal_end else self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() # What is the meaning of these values? 
batch_si = [] batch_a = [] batch_td = [] batch_R = [] for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if USE_LSTM: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run(self.accum_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_a)]}) else: sess.run(self.accum_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R}) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run(self.apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) if (self.thread_index == 0) and (self.local_t % 100) == 0: print ("TIMESTEP", self.local_t) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
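# This variant uses an AccumTrainer: gradients are accumulated into buffers over the
# batch (accum_gradients) and applied to the shared network in a separate step
# (apply_gradients), with reset_gradients clearing the buffers between rollouts.
# A minimal sketch of that accumulate-then-apply pattern (names and structure are
# illustrative, not the AccumTrainer API itself):
import tensorflow as tf

def build_accum_ops(loss, variables):
    """Create per-variable gradient buffers plus reset / accumulate ops."""
    grads = tf.gradients(loss, variables)
    buffers = [tf.Variable(tf.zeros_like(v), trainable=False) for v in variables]
    reset_op = tf.group(*[b.assign(tf.zeros_like(b)) for b in buffers])
    accum_op = tf.group(*[b.assign_add(g) for b, g in zip(buffers, grads)])
    return buffers, reset_op, accum_op

# usage sketch:
#   buffers, reset_op, accum_op = build_accum_ops(local_network.total_loss, local_vars)
#   sess.run(reset_op)                        # start of each rollout
#   sess.run(accum_op, feed_dict=batch)       # accumulate over the batch
#   sess.run(apply_op, feed_dict={lr: cur})   # apply_op pairs the buffers with the global vars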
def run_a3c(args): """ python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping """ from game_ac_network import GameACFFNetwork, GameACLSTMNetwork from a3c_training_thread import A3CTrainingThread if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf def log_uniform(lo, hi, rate): log_lo = math.log(lo) log_hi = math.log(hi) v = log_lo * (1 - rate) + log_hi * rate return math.exp(v) if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = 'results/a3c/{}_{}'.format(args.gym_env.replace('-', '_'), args.folder) else: folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' folder += end_str if args.append_experiment_num is not None: folder += '_' + args.append_experiment_num if False: from common.util import LogFormatter fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w') fh.setLevel(logging.DEBUG) formatter = LogFormatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) demo_memory = None num_demos = 0 max_reward = 0. 
if args.load_memory or args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format( args.gym_env.replace('-', '_')) if args.load_memory: # FIXME: use new load_memory function demo_memory, actions_ctr, max_reward = load_memory( args.gym_env, demo_memory_folder, imgs_normalized=True) #, create_symmetry=True) action_freq = [ actions_ctr[a] for a in range(demo_memory[0].num_actions) ] num_demos = len(demo_memory) demo_memory_cam = None if args.load_demo_cam: demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0 = (demo_cam[i])[0] demo_memory_cam[i] = np.copy(s0) del demo_cam logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id)) device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 pretrain_global_t = 0 pretrain_epoch = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state config = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) pretrained_model = None pretrained_model_sess = None if args.load_pretrained_model: if args.onevsall_mtl: from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork elif args.onevsall_mtl_linear: from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork else: from game_class_network import MultiClassNetwork as PretrainedModelNetwork logger.error("Not supported yet!") assert False if args.pretrained_model_folder is not None: pretrained_model_folder = args.pretrained_model_folder else: pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format( args.gym_env.replace('-', '_')) PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015 pretrained_model = PretrainedModelNetwork(action_size, -1, device) pretrained_model_sess = tf.Session(config=config, graph=pretrained_model.graph) pretrained_model.load( pretrained_model_sess, '{}/{}_checkpoint'.format(pretrained_model_folder, args.gym_env.replace('-', '_'))) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork(action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta 
A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" n_shapers = args.parallel_size #int(args.parallel_size * .25) mod = args.parallel_size // n_shapers for i in range(args.parallel_size): is_reward_shape = False is_advice = False if i % mod == 0: is_reward_shape = args.use_pretrained_model_as_reward_shaping is_advice = args.use_pretrained_model_as_advice training_thread = A3CTrainingThread( i, global_network, initial_learning_rate, learning_rate_input, grad_applier, args.max_time_step, device=device, pretrained_model=pretrained_model, pretrained_model_sess=pretrained_model_sess, advice=is_advice, reward_shaping=is_reward_shape) training_threads.append(training_thread) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format( args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' #TODO: make this an argument transfer_folder += end_str transfer_folder += '/transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1 ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2 ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f ] if len(not_initialized_vars): sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # summary writer for tensorboard summary_op = 
tf.summary.merge_all() summary_writer = tf.summary.FileWriter( 'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_')) + folder[12:], sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=6) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(folder) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) # set wall time wall_t_fname = folder + '/' + 'wall_t.' + str(global_t) with open(wall_t_fname, 'r') as f: wall_t = float(f.read()) with open(folder + '/pretrain_global_t', 'r') as f: pretrain_global_t = int(f.read()) with open(folder + '/model_best/best_model_reward', 'r') as f_best_model_reward: best_model_reward = float(f_best_model_reward.read()) rewards = pickle.load( open( folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl', 'rb')) else: logger.warning("Could not find old checkpoint") # set wall time wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder + '/model_checkpoints', empty=True) prepare_dir(folder + '/model_best', empty=True) prepare_dir(folder + '/frames', empty=True) lock = threading.Lock() test_lock = False if global_t == 0: test_lock = True last_temp_global_t = global_t ispretrain_markers = [False] * args.parallel_size num_demo_thread = 0 ctr_demo_thread = 0 def train_function(parallel_index): nonlocal global_t, pretrain_global_t, pretrain_epoch, \ rewards, test_lock, lock, \ last_temp_global_t, ispretrain_markers, num_demo_thread, \ ctr_demo_thread training_thread = training_threads[parallel_index] training_thread.set_summary_writer(summary_writer) # set all threads as demo threads training_thread.is_demo_thread = args.load_memory and args.use_demo_threads if training_thread.is_demo_thread or args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs: training_thread.pretrain_init(demo_memory) if global_t == 0 and ( args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0) and parallel_index < 2: ispretrain_markers[parallel_index] = True training_thread.replay_mem_reset() # Pretraining with demo memory logger.info("t_idx={} pretrain starting".format(parallel_index)) while ispretrain_markers[parallel_index]: if stop_requested: return if pretrain_global_t > args.train_with_demo_num_steps and pretrain_epoch > args.train_with_demo_num_epochs: # At end of pretraining, reset state training_thread.replay_mem_reset() training_thread.episode_reward = 0 training_thread.local_t = 0 if args.use_lstm: training_thread.local_network.reset_state() ispretrain_markers[parallel_index] = False logger.info( "t_idx={} pretrain ended".format(parallel_index)) break diff_pretrain_global_t, _ = training_thread.demo_process( sess, pretrain_global_t) for _ in range(diff_pretrain_global_t): pretrain_global_t += 1 if pretrain_global_t % 10000 == 0: logger.debug( "pretrain_global_t={}".format(pretrain_global_t)) pretrain_epoch += 1 if pretrain_epoch % 1000 == 0: logger.debug("pretrain_epoch={}".format(pretrain_epoch)) # Waits for all threads to finish pretraining while not stop_requested and any(ispretrain_markers): time.sleep(0.01) # Evaluate model before training if not stop_requested and global_t == 0: with lock: if parallel_index == 0: test_reward, 
test_steps, test_episodes = training_threads[ 0].testing(sess, args.eval_max_steps, global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][global_t] = (test_reward, test_steps, test_episodes) saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t) save_best_model(test_reward) test_lock = False # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) # set start_time start_time = time.time() - wall_t training_thread.set_start_time(start_time) episode_end = True use_demo_thread = False while True: if stop_requested: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if args.use_demo_threads and global_t < args.max_steps_threads_as_demo and episode_end and num_demo_thread < 16: #if num_demo_thread < 2: demo_rate = 1.0 * (args.max_steps_threads_as_demo - global_t) / args.max_steps_threads_as_demo if demo_rate < 0.0333: demo_rate = 0.0333 if np.random.random() <= demo_rate and num_demo_thread < 16: ctr_demo_thread += 1 training_thread.replay_mem_reset(D_idx=ctr_demo_thread % num_demos) num_demo_thread += 1 logger.info( "idx={} as demo thread started ({}/16) rate={}".format( parallel_index, num_demo_thread, demo_rate)) use_demo_thread = True if use_demo_thread: diff_global_t, episode_end = training_thread.demo_process( sess, global_t) if episode_end: num_demo_thread -= 1 use_demo_thread = False logger.info("idx={} demo thread concluded ({}/16)".format( parallel_index, num_demo_thread)) else: diff_global_t, episode_end = training_thread.process( sess, global_t, rewards) for _ in range(diff_global_t): global_t += 1 if global_t % args.eval_freq == 0: temp_global_t = global_t lock.acquire() try: # catch multiple threads getting in at the same time if last_temp_global_t == temp_global_t: logger.info("Threading race problem averted!") continue test_lock = True test_reward, test_steps, n_episodes = training_thread.testing( sess, args.eval_max_steps, temp_global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][temp_global_t] = (test_reward, test_steps, n_episodes) if temp_global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save(sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format( args.gym_env.replace('-', '_')), global_step=temp_global_t, write_meta_graph=False) if test_reward > best_model_reward: save_best_model(test_reward) test_lock = False last_temp_global_t = temp_global_t finally: lock.release() if global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t, write_meta_graph=False) # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) def save_best_model(test_reward): nonlocal best_model_reward best_model_reward = test_reward with open(folder + '/model_best/best_model_reward', 'w') as f_best_model_reward: f_best_model_reward.write(str(best_model_reward)) best_saver.save( sess, folder + '/model_best/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_'))) train_threads = [] for i in range(args.parallel_size): train_threads.append( threading.Thread(target=train_function, args=(i, ))) signal.signal(signal.SIGINT, signal_handler) 
signal.signal(signal.SIGTERM, signal_handler) # set start time start_time = time.time() - wall_t for t in train_threads: t.start() print('Press Ctrl+C to stop') for t in train_threads: t.join() logger.info('Now saving data. Please wait') # write wall time wall_t = time.time() - start_time wall_t_fname = folder + '/' + 'wall_t.' + str(global_t) with open(wall_t_fname, 'w') as f: f.write(str(wall_t)) with open(folder + '/pretrain_global_t', 'w') as f: f.write(str(pretrain_global_t)) root_saver.save( sess, folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')), global_step=global_t) pickle.dump( rewards, open( folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) logger.info('Data saved!') sess.close()
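# reward_type above selects how raw environment rewards are scaled before the update:
# "RAW" passes them through, "CLIP" clips to [-1, 1], and "LOG" compresses large
# magnitudes.  A sketch of those three cases; the exact LOG transform used by this
# code is not shown, so the sign-preserving log below is an assumption:
import numpy as np

def transform_reward(reward, reward_type="CLIP"):
    if reward_type == "RAW":
        return reward
    if reward_type == "LOG":
        return np.sign(reward) * np.log(1.0 + np.abs(reward))  # assumed form
    return float(np.clip(reward, -1.0, 1.0))  # "CLIP"

# transform_reward(200, "CLIP") -> 1.0
# transform_reward(200, "LOG")  -> ~5.3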
def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip, agent_type, action_size, rand_seed, checkpoint_dir): # use CPU for weight visualize tool device = "/cpu:0" if agent_type == 'LSTM': global_network = GameACLSTMNetwork(action_size, -1, device) else: global_network = GameACFFNetwork(action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=rmsp_alpha, momentum=0.0, epsilon=rmsp_epsilon, clip_norm=grad_norm_clip, device=device) game = GameState(rand_seed, action_size) game.process(0) x_t = game.x_t plt.imshow(x_t, interpolation="nearest", cmap=plt.cm.gray) sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") W_conv1 = sess.run(global_network.W_conv1) # show graph of W_conv1 fig, axes = plt.subplots(4, 16, figsize=(12, 6), subplot_kw={ 'xticks': [], 'yticks': [] }) fig.subplots_adjust(hspace=0.1, wspace=0.1) for ax, i in zip(axes.flat, range(4 * 16)): inch = i // 16 outch = i % 16 img = W_conv1[:, :, inch, outch] ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest') ax.set_title(str(inch) + "," + str(outch)) plt.show() W_conv2 = sess.run(global_network.W_conv2) # show graph of W_conv2 fig, axes = plt.subplots(2, 32, figsize=(27, 6), subplot_kw={ 'xticks': [], 'yticks': [] }) fig.subplots_adjust(hspace=0.1, wspace=0.1) for ax, i in zip(axes.flat, range(2 * 32)): inch = i // 32 outch = i % 32 img = W_conv2[:, :, inch, outch] ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest') ax.set_title(str(inch) + "," + str(outch)) plt.show() arr = sess.run(global_network.get_vars()) s = tf.placeholder("float", [None, 84, 84, 4]) b_conv1 = sess.run(global_network.b_conv1) b_conv2 = sess.run(global_network.b_conv2) inp_1 = tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID") h_conv1 = tf.nn.relu(inp_1 + b_conv1) inp_2 = tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding="VALID") h_conv2 = tf.nn.relu(inp_2 + b_conv2) s_t = game.s_t getActivations(sess, s, h_conv1, s_t, 16) getActivations(sess, s, h_conv2, s_t, 32)
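# getActivations is called above but not defined in this snippet.  A plausible
# helper, under the assumption that it evaluates the given activation tensor for a
# single state and plots the first `filters` feature maps (hypothetical
# implementation, not the repository's own):
import matplotlib.pyplot as plt
import numpy as np

def getActivations(sess, s, layer, state, filters):
    """Run `layer` on one state fed through placeholder `s` and plot its feature maps."""
    units = sess.run(layer, feed_dict={s: [state]})  # shape: [1, h, w, channels]
    cols = 8
    rows = int(np.ceil(filters / cols))
    fig, axes = plt.subplots(rows, cols, figsize=(cols, rows),
                             subplot_kw={'xticks': [], 'yticks': []})
    for i, ax in enumerate(axes.flat[:filters]):
        ax.imshow(units[0, :, :, i], cmap=plt.cm.gray, interpolation='nearest')
    plt.show()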
from constants import PARALLEL_SIZE from constants import MAX_TIME_STEP from constants import CHECKPOINT_DIR from constants import RMSP_EPSILON from constants import RMSP_ALPHA from constants import GRAD_NORM_CLIP from constants import USE_GPU from constants import USE_LSTM # use CPU for weight visualize tool device = "/cpu:0" if USE_LSTM: global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device) else: global_network = GameACFFNetwork(ACTION_SIZE, -1, device) training_threads = [] learning_rate_input = tf.placeholder(PRECISION) grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = RMSP_ALPHA, momentum = 0.0, epsilon = RMSP_EPSILON, clip_norm = GRAD_NORM_CLIP, device = device) sess = tf.Session() init = tf.global_variables_initializer() sess.run(init)
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, pinitial_learning_rate, plearning_rate_input, pgrad_applier, vinitial_learning_rate, vlearning_rate_input, vgrad_applier, max_global_time_step, device, task_index=""): self.thread_index = thread_index self.plearning_rate_input = plearning_rate_input self.vlearning_rate_input = vlearning_rate_input self.max_global_time_step = max_global_time_step self.game_state = GameState() state = self.game_state.reset() self.game_state.reset_gs(state) self.action_size = self.game_state.action_size self.state_size = self.game_state.state_size self.local_max_iter = self.game_state.local_max_iter if USE_LSTM: self.local_network = GameACLSTMNetwork(self.action_size, self.state_size, self.game_state.action_low, self.game_state.action_high, thread_index, device) else: self.local_network = GameACFFNetwork(self.action_size, self.state_size, self.game_state.action_low, self.game_state.action_high, thread_index, device) self.local_network.prepare_loss(ENTROPY_BETA) with tf.device(device): pvar_refs = [v._ref() for v in self.local_network.get_pvars()] self.policy_gradients = tf.gradients( self.local_network.policy_loss, pvar_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) vvar_refs = [v._ref() for v in self.local_network.get_vvars()] self.value_gradients = tf.gradients( self.local_network.value_loss, vvar_refs, gate_gradients=False, aggregation_method=None, colocate_gradients_with_ops=False) self.apply_policy_gradients = pgrad_applier.apply_gradients( self.local_network.get_pvars(), self.policy_gradients) self.apply_value_gradients = vgrad_applier.apply_gradients( self.local_network.get_vvars(), self.value_gradients) self.local_t = 0 self.pinitial_learning_rate = pinitial_learning_rate self.vinitial_learning_rate = vinitial_learning_rate self.episode_reward = 0 # variable controling log output self.prev_local_t = 0 def _panneal_learning_rate(self, global_time_step): learning_rate = self.pinitial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def _vanneal_learning_rate(self, global_time_step): learning_rate = self.vinitial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def process(self, sess, global_t, summary_writer, summary_op, score_input, score_ph="", score_ops=""): states = [] actions = [] rewards = [] values = [] terminal_end = False start_local_t = self.local_t if USE_LSTM: pstart_lstm_state = self.local_network.plstm_state_out vstart_lstm_state = self.local_network.vlstm_state_out # t_max times loop for i in range(self.local_max_iter): action, value_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) states.append(self.game_state.s_t) actions.append(action) values.append(value_) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward 
#rewards.append( np.clip(reward,-1,1) ) rewards.append(reward) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print("score={}".format(self.episode_reward / self.game_state.r_sc)) score = self.episode_reward / self.game_state.r_sc if summary_writer: self._record_score( sess, summary_writer, summary_op, score_input, self.episode_reward / self.game_state.r_sc, global_t) else: sess.run( score_ops, {score_ph: self.episode_reward / self.game_state.r_sc}) self.episode_reward = 0 state = self.game_state.reset() self.game_state.reset_gs(state) if USE_LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) score = self.episode_reward / self.game_state.r_sc actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accumulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi batch_si.append(si) batch_R.append(R) batch_td.append(td) pcur_learning_rate = self._panneal_learning_rate(global_t) vcur_learning_rate = self._vanneal_learning_rate(global_t) if USE_LSTM: batch_si.reverse() batch_td.reverse() batch_R.reverse() sess.run(self.apply_policy_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.pinitial_lstm_state: pstart_lstm_state, self.local_network.pstep_size: [len(batch_si)], self.local_network.vinitial_lstm_state: vstart_lstm_state, self.local_network.vstep_size: [len(batch_si)], self.plearning_rate_input: pcur_learning_rate }) sess.run(self.apply_value_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.pinitial_lstm_state: pstart_lstm_state, self.local_network.pstep_size: [len(batch_si)], self.local_network.vinitial_lstm_state: vstart_lstm_state, self.local_network.vstep_size: [len(batch_si)], self.vlearning_rate_input: vcur_learning_rate }) else: sess.run(self.apply_policy_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.r: batch_R, self.local_network.td: batch_td, self.plearning_rate_input: pcur_learning_rate }) sess.run(self.apply_value_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.r: batch_R, self.local_network.td: batch_td, self.vlearning_rate_input: vcur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time #print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( # global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
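# Unlike the single-optimizer threads earlier in this document, this variant keeps
# separate learning rates and gradient appliers for the policy and value networks.
# A minimal TF1 sketch of that split (names are illustrative and a plain
# RMSPropOptimizer stands in for the repository's RMSPropApplier wrapper):
import tensorflow as tf

def build_actor_critic_updates(policy_loss, value_loss, policy_vars, value_vars,
                               policy_lr, value_lr):
    """Two independent RMSProp updates over disjoint variable lists."""
    policy_grads = tf.gradients(policy_loss, policy_vars)
    value_grads = tf.gradients(value_loss, value_vars)
    apply_policy = tf.train.RMSPropOptimizer(policy_lr).apply_gradients(
        list(zip(policy_grads, policy_vars)))
    apply_value = tf.train.RMSPropOptimizer(value_lr).apply_gradients(
        list(zip(value_grads, value_vars)))
    return apply_policy, apply_value

# Running one update leaves the other network's variables untouched, which is why the
# code above anneals the policy and value learning rates independently.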