def __init__(self, thread_index, global_network, initial_learning_rate,
             max_global_time_step):
    self.thread_index = thread_index
    self.learning_rate_input = tf.placeholder("float")
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize(
        self.local_network.policy_loss,
        self.local_network.get_policy_vars())
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()

    self.policy_applier = RMSPropApplier(
        learning_rate=self.learning_rate_input,
        decay=0.99,
        momentum=0.0,
        epsilon=RMSP_EPSILON)
    self.policy_apply_gradients = self.policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list())

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize(
        self.local_network.value_loss,
        self.local_network.get_value_vars())
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()

    self.value_applier = RMSPropApplier(
        learning_rate=self.learning_rate_input,
        decay=0.99,
        momentum=0.0,
        epsilon=RMSP_EPSILON)
    self.value_apply_gradients = self.value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
        self.score_input = tf.placeholder(tf.int32)
        tf.scalar_summary("score", self.score_input)
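# Hedged sketch (an assumption, not part of the variant above): one plausible way
# thread 0's score placeholder and tf.scalar_summary op get used at episode end,
# with the TF 0.x summary API the snippet already relies on. The function name and
# the summary_op / summary_writer parameters are hypothetical; summary_op would be
# something like tf.merge_all_summaries() and summary_writer a tf.train.SummaryWriter.
def record_score(sess, summary_writer, summary_op, score_input, score, global_t):
    # Evaluate the scalar summary for this episode's score and write it to
    # TensorBoard at the current global step.
    summary_str = sess.run(summary_op, feed_dict={score_input: score})
    summary_writer.add_summary(summary_str, global_t)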
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
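# Hedged usage sketch (an assumption, not taken from the snippets): how a
# constructor with this signature is typically driven. GameACNetwork,
# RMSPropApplier, ACTION_SIZE and RMSP_EPSILON are the names these snippets
# already reference; A3CTrainingThread, INITIAL_LEARNING_RATE, MAX_TIME_STEP and
# the RMSProp hyperparameters are assumptions mirroring the first variant.
def make_training_threads(num_threads=8, device="/cpu:0"):
    global_network = GameACNetwork(ACTION_SIZE, device)
    learning_rate_input = tf.placeholder("float")
    # One optimizer shared by every worker; each thread feeds its own annealed
    # learning rate through the shared placeholder when it applies gradients.
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=0.99,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON)
    return [
        A3CTrainingThread(i, global_network, INITIAL_LEARNING_RATE,
                          learning_rate_input, grad_applier,
                          MAX_TIME_STEP, device)
        for i in range(num_threads)
    ]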
def __init__(self, thread_id, env_name, global_model, init_lr, lr_ph,
             grad_applier, max_time_steps, model_dim, gamma):
    self.thread_id = thread_id
    self.global_model = global_model
    self.init_lr = init_lr
    self.grad_applier = grad_applier
    self.lr_ph = lr_ph
    self.max_time_steps = max_time_steps
    self.gamma = gamma

    height, width, num_frames, num_actions = model_dim
    self.local_model = ConvNetA3C(height, width, num_frames, num_actions)
    self.num_actions = num_actions

    trainer = AccumTrainer("/cpu:0")
    trainer.prepare_minimize(self.local_model.loss, self.local_model.params)
    self.accum_grads = trainer.accumulate_gradients()
    self.reset_grads = trainer.reset_gradients()
    self.apply_grads = grad_applier.apply_gradients(
        global_model.params,
        trainer.get_accum_grad_list())
    self.sync = self.local_model.sync_from(global_model)

    self.env = AtariAleEnvironment(env_name)
    self.s_t = self.env.reset()

    self.start_time = None
    self.ep_rwd, self.num_ep = 0, 0
    self.avg_rwd = None
    self.t = 0
    self.prev_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device, sess, name="agent"):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    # if USE_LSTM:
    #     self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    # else:
    self.local_network = Network(name=name)
    self.local_network.prepare_loss(FLAGS.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.local_network.vars = self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_train_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_train_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    # if USE_ALE:
    #     self.game_state = GameState(113 * thread_index)
    # else:
    self.game = gym.make('Lis-v2')
    self.game.configure(str(5000 + thread_index))

    # game initialization
    # observation = env.reset()
    self.observation, reward, end_episode, _ = self.game.step(1)
    # self.observation = self.preprocess([self.observation])
    self.history = [self.rgb2gray(self.observation) for _ in range(4)]  # FLAGS.history_frames
    self.observation = np.dstack(self.history)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if NETWORK_TYPE == 'LSTM':
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    elif NETWORK_TYPE == 'DILATED':
        self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
    elif NETWORK_TYPE == 'CONV':
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, optimizer, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM,
                                            device, thread_index)
    else:
        self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, device)
    self.local_network.create_loss(ENTROPY_BETA)

    self.trainer = AccumTrainer(device)
    self.trainer.create_minimize(self.local_network.total_loss,
                                 self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    clip_accum_grads = [
        tf.clip_by_norm(accum_grad, 40.0)
        for accum_grad in self.trainer.get_accum_grad_list()
    ]
    self.apply_gradients = optimizer.apply_gradients(
        zip(clip_accum_grads, global_network.get_vars()))

    self.sync = self.local_network.sync_from(global_network)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    # for log
    self.episode_reward = 0.0
    self.episode_start_time = 0.0
    self.prev_local_t = 0

    # for pull mode, like browser-based games
    self.states = []
    self.actions = []
    self.rewards = []
    self.values = []
    self.start_lstm_state = None
    return
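# Hedged sketch (an assumption, not part of the variant above): tf.clip_by_norm
# in that variant rescales each accumulated gradient tensor to norm <= 40.0
# independently. If clipping by the joint norm of all gradients is preferred,
# tf.clip_by_global_norm can be substituted; the helper name below is hypothetical.
def clip_and_apply_global_norm(optimizer, accum_grad_list, global_vars, clip_norm=40.0):
    # Rescale the whole gradient list so its global L2 norm is <= clip_norm,
    # then pair the clipped gradients with the shared network's variables.
    clipped_grads, _ = tf.clip_by_global_norm(accum_grad_list, clip_norm)
    return optimizer.apply_gradients(zip(clipped_grads, global_vars))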
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(  # watch out: update global_network
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             game_function=ale_game_state, local_network=None):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = local_network()
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = game_function(thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             environment):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    # self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network = global_network.structural_clone(
        network_name="thread-net-%s" % self.thread_index)
    self.local_network.prepare_loss(ENTROPY_BETA)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
    self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    # self.game_state = GameState(113 * thread_index)
    self.game_state = GymGameState(113 * thread_index, env=environment)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    self.lstm_last_output_state = None  # cache last lstm hidden states here
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             options):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
        self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,
        self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(random.randint(0, 2**16), options,
                                thread_index=thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    self.indent = " |" * self.thread_index

    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
        self.max_reward = 0.0
        self.max_episode_reward = 0.0
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        self.episode_values = []
        self.episode_liveses = []
        self.episode_scores = Episode_scores(options)
        self.tes = self.options.train_episode_steps
        if self.options.tes_list is not None:
            self.tes = self.options.tes_list[thread_index]
            print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
        self.initial_lives = self.game_state.initial_lives
        self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
        if self.thread_index == 0:
            if not os.path.exists(self.options.record_new_record_dir):
                os.makedirs(self.options.record_new_record_dir)
        self.episode_screens = []

    if self.options.record_new_room_dir is not None:
        if self.thread_index == 0:
            if not os.path.exists(self.options.record_new_room_dir):
                os.makedirs(self.options.record_new_room_dir)
        self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0