def test_len(self):
    max_size = 10
    size = 5
    replay_memory = ReplayMemory(max_size)
    for i in range(size):
        replay_memory.add(i, i + size, i + size * 2, i + size * 3, i + size * 4)
    self.assertEqual(min(size, max_size), len(replay_memory))

def test_sample(self):
    size = 5
    sample_size = 3
    replay_memory = ReplayMemory(size)
    for i in range(size):
        replay_memory.add(i, i + size, i + size * 2, i + size * 3, i + size * 4)
    sample = replay_memory.sample(sample_size)
    print(sample)

def test_add(self):
    size = 5
    replay_memory = ReplayMemory(size)
    for i in range(size):
        replay_memory.add(i, i + size, i + size * 2, i + size * 3, i + size * 4)
    for i in range(size):
        data = replay_memory[i]
        for j in range(5):
            self.assertEqual(data[j], i + size * j)
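# The three tests above assume a tuple-based ReplayMemory with add(), len(),
# indexing, and sample(). The class below is only a minimal sketch that would
# satisfy them (a deque-backed buffer is an assumption, not the project's
# actual implementation).
import random
from collections import deque


class ReplayMemory:
    """Fixed-capacity buffer of (s, a, r, s', done)-style tuples."""

    def __init__(self, max_size):
        self.memory = deque(maxlen=max_size)

    def add(self, *transition):
        # Oldest entries are evicted automatically once max_size is reached.
        self.memory.append(tuple(transition))

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)

    def __getitem__(self, index):
        return self.memory[index]

    def __len__(self):
        return len(self.memory)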
def __init__(
        self,
        # main_q: Model,
        # target_q: Model,
        # replay_memory: ReplayMemory,
        n_actions: int,
        input_shape: Tuple = (84, 84),
        batch_size: int = 32,
        history_length: int = 4,
        learning_rate: float = 0.00001,
        eps_initial: float = 1.0,
        eps_final: float = 0.1,
        eps_final_frame: float = 0.0,
        eps_evaluation: float = 0.0,
        eps_annealing_frames: int = 1000000,
        replay_buffer_size: int = 1000000,
        replay_buffer_start_size: int = 50000,
        max_frames: int = 25000000,
        use_per: bool = True) -> None:
    self.n_actions = n_actions
    self.input_shape = input_shape
    self.history_length = history_length
    self.learning_rate = learning_rate
    self.replay_buffer_start_size = replay_buffer_start_size
    self.max_frames = max_frames
    self.batch_size = batch_size
    # self.replay_buffer = replay_memory
    self.use_per = use_per

    self.eps_initial = eps_initial
    self.eps_final = eps_final
    self.eps_final_frame = eps_final_frame
    self.eps_evaluation = eps_evaluation
    self.eps_annealing_frames = eps_annealing_frames
    self.replay_buffer_size = replay_buffer_size

    # Piecewise-linear epsilon schedule: first anneal from eps_initial to eps_final
    # over eps_annealing_frames, then from eps_final to eps_final_frame over the
    # remaining frames up to max_frames.
    self.slope = -(self.eps_initial - self.eps_final) / self.eps_annealing_frames
    self.intercept = self.eps_initial - self.slope * self.replay_buffer_start_size
    self.slope_2 = -(self.eps_final - self.eps_final_frame) / (
        self.max_frames - self.eps_annealing_frames - self.replay_buffer_start_size)
    self.intercept_2 = self.eps_final_frame - self.slope_2 * self.max_frames

    self.replay_buffer: ReplayMemory = ReplayMemory(
        size=self.replay_buffer_size,
        input_shape=self.input_shape,
        history_length=self.history_length,
        use_per=self.use_per)

    # self.main_q: Model = DuelingDQN(self.n_actions, self.input_shape, self.history_length)
    # self.target_q: Model = DuelingDQN(self.n_actions, self.input_shape, self.history_length)
    # self.main_q.build((self.input_shape[0], self.input_shape[1], self.history_length))
    # self.target_q.build((self.input_shape[0], self.input_shape[1], self.history_length))
    self.main_q = build_q_network(self.n_actions, self.input_shape, self.history_length)
    self.target_q = build_q_network(self.n_actions, self.input_shape, self.history_length)

    self.main_q.compile(optimizer=Adam(self.learning_rate), loss=Huber())
    self.target_q.compile(optimizer=Adam(self.learning_rate), loss=Huber())
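# The slope/intercept pairs above encode that piecewise-linear schedule:
# constant eps_initial while the replay buffer fills, a first linear segment down
# to eps_final over eps_annealing_frames, then a slower segment down to
# eps_final_frame by max_frames. A sketch of how the agent might evaluate it per
# frame; the method name calc_epsilon is an assumption, not necessarily this
# code base's API:
def calc_epsilon(self, frame_number: int, evaluation: bool = False) -> float:
    if evaluation:
        return self.eps_evaluation
    if frame_number < self.replay_buffer_start_size:
        # Pure exploration while the replay buffer is still being filled.
        return self.eps_initial
    if frame_number < self.replay_buffer_start_size + self.eps_annealing_frames:
        # First segment: eps_initial -> eps_final.
        return self.slope * frame_number + self.intercept
    # Second segment: eps_final -> eps_final_frame.
    return self.slope_2 * frame_number + self.intercept_2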
def learn(self):
    """Main method. Runs the DQN algorithm."""
    # Replay Memory
    replay_mem = ReplayMemory(self.mem_size)
    # Q-Network
    q_func = CNN(self.env.action_space.n, self.history_len, self.width, self.height)
    q_network_weights = q_func.model.trainable_weights  # weights to be trained
    # Target Network
    target_func = CNN(self.env.action_space.n, self.history_len, self.width, self.height)
    target_network_weights = target_func.model.trainable_weights  # list of target weights

    # Op that periodically syncs the target network with the Q-network
    assign_target_network = [
        target_network_weights[i].assign(q_network_weights[i])
        for i in range(len(target_network_weights))
    ]

    # Ops for the loss function and the optimizer
    a, y, loss, grad_update = self.build_training_op(self.env.action_space.n, q_func)

    # Build the session
    sess = tf.InteractiveSession()
    # Initialize variables (initializes the Q-network)
    sess.run(tf.global_variables_initializer())
    # Initialize the target network
    sess.run(assign_target_network)

    # Initialize the agent
    agent = DQNAgent(num_actions=self.env.action_space.n,
                     q_func=q_func,
                     schedule_time_steps=int(self.expl_frac * self.tmax),
                     initial_time_step=self.replay_st_size,
                     final_p=self.fin_expl)
    # Logger
    logger = Logger(sess, self.save_summary_path)

    t = 0
    episode = 0
    # Main loop
    while t < self.tmax:
        # Run one episode
        episode += 1
        duration = 0
        total_reward = 0.0
        total_q_max = 0.0
        total_loss = 0
        done = False
        # Reset the environment
        obs = self.env.reset()
        # Run until the episode ends
        while not done:
            # Keep the previous observation
            pre_obs = obs.copy()
            # Select an action following the epsilon-greedy policy
            action = agent.action(t, obs)
            # Execute the action and observe the reward, next screen, and done flag
            obs, reward, done, info = self.env.step(action)
            # Add (s_t, a_t, r_t, s_{t+1}, done) to the replay memory
            replay_mem.add(pre_obs, action, reward, obs, done)
            if self.render:
                self.env.render()

            if t > self.replay_st_size and t % self.learn_freq == 0:
                # Train the Q-network
                total_loss += self.train(sess, q_func, a, y, loss, grad_update,
                                         replay_mem, target_func)
            if t > self.replay_st_size and t % self.update_freq == 0:
                # Update the target network
                sess.run(assign_target_network)
            if t > self.replay_st_size and t % self.save_network_freq == 0:
                save_sess(sess, self.save_network_path, t)

            total_reward += reward
            total_q_max += np.max(q_func.q_values.eval(feed_dict={q_func.s: [obs]}))
            t += 1
            duration += 1

        if t >= self.replay_st_size:
            logger.write(sess, total_reward, total_q_max / float(duration), duration,
                         total_loss / float(duration), t, episode)
        print('EPISODE: {0:6d} / TIME_STEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} '
              '/ AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f}'.format(
                  episode, t, duration, agent.epsilon.value(t), total_reward,
                  total_q_max / float(duration), total_loss / float(duration)))
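# The self.train(...) call above is not shown in this snippet. Below is a hedged
# sketch of the usual minibatch step it would perform (sample transitions,
# bootstrap targets from the target network, run one gradient update). The
# sample() ordering and the self.batch_size / self.gamma attributes are
# assumptions, not this repository's confirmed API.
def train(self, sess, q_func, a, y, loss, grad_update, replay_mem, target_func):
    obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = \
        replay_mem.sample(self.batch_size)
    # Bootstrap target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states.
    next_q = target_func.q_values.eval(feed_dict={target_func.s: next_obs_batch})
    targets = np.array(reward_batch) + \
        (1.0 - np.array(done_batch, dtype=np.float32)) * self.gamma * np.max(next_q, axis=1)
    # Single gradient step on the Q-network.
    loss_value, _ = sess.run(
        [loss, grad_update],
        feed_dict={q_func.s: obs_batch, a: action_batch, y: targets})
    return loss_value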
class Agent(object):
    def __init__(self, config, action_space):
        self.replay_memory = ReplayMemory(config)
        self.history = History(config)
        self.config = config
        self.action_space = action_space
        self.train_counter = 0
        # self.w_initializer = tf.truncated_normal_initializer(0, 0.02)
        # self.w_initializer = tf.uniform_unit_scaling_initializer(1.0)
        self.w_initializer = tf.contrib.layers.xavier_initializer()
        self.b_initializer = tf.constant_initializer(0.0)

        # Build placeholders
        with tf.name_scope("placeholders"):
            self.current_observation = tf.placeholder(
                tf.float32,
                [None, self.config.screen_height, self.config.screen_width, self.config.history_length])
            self.next_observation = tf.placeholder(
                tf.float32,
                [None, self.config.screen_height, self.config.screen_width, self.config.history_length])
            self.current_action = tf.placeholder(tf.int32, [None])
            self.current_reward = tf.placeholder(tf.float32, [None])
            self.done = tf.placeholder(tf.float32, [None])

        # Build ops
        self.train_op, self.predicted_action, self.target_update_op = self._build(
            self.current_observation,
            self.next_observation,
            self.current_action,
            self.current_reward,
            self.done)
        self.summary_op = tf.summary.merge_all()

    def train(self, observation, reward, done, current_step, sess):
        # Update history
        self.history.add(observation)

        # Predict action via epsilon-greedy policy
        epsilon = (self.config.epsilon_end +
                   max(0.0, ((self.config.epsilon_start - self.config.epsilon_end) *
                             (self.config.epsilon_end_step - max(0., current_step - self.config.learn_start)) /
                             self.config.epsilon_end_step)))
        if random.random() < epsilon:
            action = random.randrange(self.action_space)
        else:
            action = sess.run(self.predicted_action,
                              {self.current_observation: [self.history.get()]})
            action = action[0]

        # Reset history
        if done:
            self.history.reset()

        # Update memory and sample
        self.replay_memory.add(observation, reward, action, done)

        # Update source network
        if current_step > self.config.learn_start:
            if self.train_counter == self.config.update_frequency:
                current_observation, current_action, current_reward, next_observation, current_done = \
                    self.replay_memory.sample()
                _, summary_str = sess.run(
                    [self.train_op, self.summary_op],
                    {self.current_observation: current_observation,
                     self.next_observation: next_observation,
                     self.current_action: current_action,
                     self.current_reward: current_reward,
                     self.done: current_done})
                self.train_counter = 0
            else:
                self.train_counter += 1
                summary_str = None
        else:
            summary_str = None

        # Update target network
        if (current_step + 1) % self.config.target_network_update_step == 0:
            tf.logging.info("Update target network")
            sess.run([self.target_update_op])

        return action, epsilon, summary_str

    def predict(self, observation, sess):
        pass

    def _build(self, current_observation, next_observation, current_action, current_reward, done):
        # Global variables
        self.global_step = tf.get_variable('global_step', [],
                                           initializer=tf.constant_initializer(0),
                                           dtype=tf.int32, trainable=False)

        # Build network
        source_q = self._build_network(current_observation, 'source', True)
        target_q = self._build_network(next_observation, 'target', False)

        # Compute loss
        action_one_hot = tf.one_hot(current_action, self.action_space, 1.0, 0.0, name="action_one_hot")
        q_acted = tf.reduce_sum(source_q * action_one_hot, reduction_indices=1, name="q_acted")
        max_target_q = tf.reduce_max(target_q, axis=1)
        delta = (1 - done) * self.config.gamma * max_target_q + current_reward - q_acted
        loss = tf.reduce_mean(clipped_error(delta))

        # Optimize
        learning_rate_op = tf.maximum(
            self.config.learning_rate_minimum,
            tf.train.exponential_decay(
                self.config.initial_learning_rate,
                self.global_step,
                self.config.learning_rate_decay_step,
                self.config.learning_rate_decay,
                staircase=True))
        train_op = tf.train.RMSPropOptimizer(learning_rate_op, momentum=0.95, epsilon=0.01).minimize(
            loss, global_step=self.global_step)

        # Update target network
        target_update_op = []
        source_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='source')
        target_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target')
        for source_variable, target_variable in zip(source_variables, target_variables):
            target_update_op.append(target_variable.assign(source_variable.value()))
        target_update_op = tf.group(*target_update_op)

        # Logging
        predicted_action = tf.argmax(source_q, dimension=1)
        avg_q = tf.reduce_mean(source_q, 0)
        for idx in range(self.action_space):
            tf.summary.histogram('q/{}'.format(idx), avg_q[idx])
        tf.summary.scalar('learning_rate', learning_rate_op)
        tf.summary.scalar('loss', loss)

        return train_op, predicted_action, target_update_op

    def _build_network(self, observation, name='source', trainable=True):
        with tf.variable_scope(name):
            with arg_scope([layers.conv2d, layers.fully_connected],
                           activation_fn=tf.nn.relu,
                           weights_initializer=self.w_initializer,
                           biases_initializer=self.b_initializer,
                           trainable=trainable):
                with arg_scope([layers.conv2d], padding='VALID'):
                    conv1 = layers.conv2d(observation, num_outputs=32, kernel_size=8, stride=4, scope='conv1')
                    conv2 = layers.conv2d(conv1, num_outputs=64, kernel_size=4, stride=2, scope='conv2')
                    conv3 = layers.conv2d(conv2, num_outputs=64, kernel_size=3, stride=1, scope='conv3')
                    conv3_shape = conv3.get_shape().as_list()
                    conv3_flat = tf.reshape(conv3, [-1, reduce(lambda x, y: x * y, conv3_shape[1:])])
                    fc4 = layers.fully_connected(conv3_flat, 512, scope='fc4')
                    q = layers.fully_connected(fc4, self.action_space, scope='q')
        return q
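# clipped_error() above is referenced but not defined in this snippet. DQN code
# bases conventionally implement it as a Huber-style loss; the definition below
# follows that convention and is offered as an assumption, not necessarily this
# repository's exact helper:
def clipped_error(x):
    # Quadratic for |x| <= 1, linear beyond, which keeps gradients bounded.
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)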
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.params.lr)
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        game = "Breakout-ram-v0"
        env = gym.make(game)
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)

    def run(self):
        state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
        self._update_target_q_net()
        total_reward = 0
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value, train=True, step=step,
                                              params=self.params, device=self.device)
            next_state, reward, done = self.environment.step(action)
            next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                print('Update: {}. Loss: {}. Score: {}'.format(step, loss, total_reward))
            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch
        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]
        expected_q_values = rewards + self.params.discount_factor * next_q_values * (1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
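# get_action() used by the trainers in these snippets is not defined here. Below
# is a hedged epsilon-greedy sketch; the epsilon_* attribute names on params are
# assumptions, and for CarRacing the real helper presumably maps the index to a
# discretized steering/throttle action rather than returning the raw index.
import random

import torch


def get_action(q_value, train, step, params, device):
    epsilon = max(params.epsilon_final,
                  params.epsilon_start - step / params.epsilon_step_decay)
    if train and random.random() < epsilon:
        action_index = random.randrange(q_value.size(1))          # explore
    else:
        action_index = int(torch.argmax(q_value, dim=1).item())   # exploit
    # The trainers stack the returned action tensors for gather(), so return a
    # tensor index plus the plain value handed to the environment wrapper.
    return torch.tensor(action_index, device=device), action_index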
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.lr = self.params.lr  # NEW
        self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.lr)  # CHANGE
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)
        self.loss_log = []  # NEW
        self.score_log = []  # NEW

    def run(self):
        episode_score = 0  # NEW
        episode_score_short_array = np.array([])  # NEW
        loss_short_array = np.array([])  # NEW
        episode = 0  # NEW
        state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
        self._update_target_q_net()
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value, train=True, step=step,
                                              params=self.params, device=self.device)
            next_state, reward, done = self.environment.step(action)
            episode_score += reward  # NEW
            next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                episode += 1  # NEW
                print('***************Episode: {}. Score: {}'.format(episode, episode_score))  # NEW
                episode_score_short_array = np.append(episode_score_short_array, episode_score)  # NEW
                episode_score = 0  # NEW
                state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                loss_short_array = np.append(loss_short_array, loss.cpu().detach().numpy())  # NEW
                print('Update: {}. Loss: {}'.format(step, loss))
            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
            if step % int(self.params.num_of_steps / 50) == 0:  # NEW
                self.lr *= 0.8
                self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.lr)
                torch.save(self.target_q_net.state_dict(), "models/dqn{}.pt".format(step))
                self.score_log.append(np.mean(episode_score_short_array))
                self.loss_log.append(np.mean(loss_short_array))
        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch
        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]
        expected_q_values = rewards + self.params.discount_factor * next_q_values * (1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
def main():
    REPLAY_CAPACITY = 100000
    INITIAL_EPSILON = 1.0
    TARGET_EPSILON = 0.1
    EXPLORATION_FRAMES = 1e6
    BATCH_SIZE = 32
    GAMMA = 0.97
    LR = 0.0005

    training, game, verbose, fps, W, H = parser.get_arguments()
    training = parser.str2bool(training)
    start_time = time.time()
    max_score = 0
    games_played = 0
    frame_iterations = 0
    scores = {}
    print("Training: ", training)

    if game == 'pong':
        env = Pong(W, H)
    elif game == 'snake':
        env = SnakeGame(W, H, training=training, fps=fps)
    else:
        print('Invalid game title')
        return

    nn = NeuralNet(W, H, env.action_space['n'], env.GAME_TITLE, n_channels=1,
                   gamma=GAMMA, learning_rate=LR, verbose=verbose)
    replay_memory = ReplayMemory(capacity=REPLAY_CAPACITY)
    epsilon_greedy = EpsilonGreedy(initial_value=INITIAL_EPSILON, target_value=TARGET_EPSILON,
                                   exploration_frames=EXPLORATION_FRAMES)

    try:
        s = env.reset()
        s = process(s, W, H)
        while True:
            # make 10 moves, then train on a minibatch
            for i in range(10):
                take_random = epsilon_greedy.evaluate()
                if training and take_random:
                    a = env.sample()
                else:
                    a = nn.predict([s])[0]
                s1, r, t, score = env.step(a)
                s1 = process(s1, W, H)
                replay_memory.add((s, a, r, s1, t))
                frame_iterations += 1
                if not t:
                    s = s1
                else:
                    max_score = max(max_score, score)
                    games_played += 1
                    scores[score] = scores.get(score, 0) + 1
                    e_value = 0 if not training else epsilon_greedy.peek()
                    print("\rMax Score: {:3} || Last Score: {:3} || Games Played: {:7} Iterations: {:10} Epsilon: {:.5f} Scores: {}"
                          .format(max_score, score, games_played, frame_iterations, e_value, str(scores)),
                          end="\n" if verbose or games_played % 1000 == 0 else "")
                    s = env.reset()
                    s = process(s, W, H)
            if training and frame_iterations > REPLAY_CAPACITY // 2:
                batch = replay_memory.get_minibatch(batch_size=BATCH_SIZE)
                loss = nn.optimize(batch)
    except KeyboardInterrupt:
        if training:
            nn.save()
            print("\nCheckpoint saved")
        nn.close_session()
        stats_saver.save_to_file(env.GAME_TITLE, max_score, games_played, frame_iterations,
                                 scores, training, start_time)
        print("Session closed")
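# process() is called above but not defined in this snippet. A plausible frame
# preprocessor (grayscale, resize to the network input, scale to [0, 1]); the use
# of OpenCV is an assumption about this project:
import cv2
import numpy as np


def process(frame, width, height):
    if frame.ndim == 3:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
    return frame.astype(np.float32) / 255.0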
def train(sess, config):
    env = GymEnvironment(config)

    log_dir = './log/{}_lookahead_{}_gats_{}/'.format(config.env_name, config.lookahead, config.gats)
    checkpoint_dir = os.path.join(log_dir, 'checkpoints/')
    image_dir = os.path.join(log_dir, 'rollout/')
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)
        print(' [*] Removed log dir: ' + log_dir)

    with tf.variable_scope('step'):
        step_op = tf.Variable(0, trainable=False, name='step')
        step_input = tf.placeholder('int32', None, name='step_input')
        step_assign_op = step_op.assign(step_input)

    with tf.variable_scope('summary'):
        scalar_summary_tags = [
            'average.reward', 'average.loss', 'average.q value',
            'episode.max reward', 'episode.min reward', 'episode.avg reward',
            'episode.num of game', 'training.learning_rate',
            'rp.rp_accuracy', 'rp.rp_plus_accuracy', 'rp.rp_minus_accuracy',
            'rp.nonzero_rp_accuracy'
        ]
        summary_placeholders = {}
        summary_ops = {}
        for tag in scalar_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            summary_ops[tag] = tf.summary.scalar(
                "%s-%s/%s" % (config.env_name, config.env_type, tag), summary_placeholders[tag])
        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            summary_ops[tag] = tf.summary.histogram(tag, summary_placeholders[tag])

    config.num_actions = env.action_size
    # config.num_actions = 3
    exploration = LinearSchedule(config.epsilon_end_t, config.epsilon_end)
    agent = Agent(sess, config, num_actions=config.num_actions)

    if config.gats:
        lookahead = config.lookahead
        rp_train_frequency = 4
        gdm_train_frequency = 4
        gdm = GDM(sess, config, num_actions=config.num_actions)
        rp = RP(sess, config, num_actions=config.num_actions)
        leaves_size = config.num_actions**config.lookahead
        if config.dyna:
            gan_memory = GANReplayMemory(config)
        else:
            gan_memory = None

        def base_generator():
            tree_base = np.zeros((leaves_size, lookahead)).astype('uint8')
            for i in range(leaves_size):
                n = i
                j = 0
                while n:
                    n, r = divmod(n, config.num_actions)
                    tree_base[i, lookahead - 1 - j] = r
                    j = j + 1
            return tree_base

        tree_base = base_generator()

    # memory = ReplayMemory(config)
    memory = ReplayMemory(config, log_dir)
    history = History(config)

    tf.global_variables_initializer().run()
    saver = tf.train.Saver(max_to_keep=30)
    # model load, if a checkpoint exists
    load_model(sess, saver, checkpoint_dir)
    agent.updated_target_q_network()
    writer = tf.summary.FileWriter(log_dir, sess.graph)

    num_game, update_count, ep_reward = 0, 0, 0.
    total_reward, total_loss, total_q_value = 0., 0., 0.
    max_avg_ep_reward = -100
    ep_rewards, actions = [], []
    rp_accuracy = []
    rp_plus_accuracy = []
    rp_minus_accuracy = []
    nonzero_rp_accuracy = []

    screen, reward, action, terminal = env.new_random_game()
    # init state
    for _ in range(config.history_length):
        history.add(screen)

    start_step = step_op.eval()
    # main loop
    for step in tqdm(range(start_step, config.max_step), ncols=70, initial=start_step):
        if step == config.learn_start:
            num_game, update_count, ep_reward = 0, 0, 0.
            total_reward, total_loss, total_q_value = 0., 0., 0.
            ep_rewards, actions = [], []
        if step == config.gan_dqn_learn_start:
            rp_accuracy = []
            rp_plus_accuracy = []
            rp_minus_accuracy = []
            nonzero_rp_accuracy = []

        # ε-greedy
        MCTS_FLAG = False
        epsilon = exploration.value(step)
        if random.random() < epsilon:
            action = random.randrange(config.num_actions)
        else:
            current_state = norm_frame(np.expand_dims(history.get(), axis=0))
            if config.gats and (step >= config.gan_dqn_learn_start):
                action, predicted_reward = MCTS_planning(
                    gdm, rp, agent, current_state, leaves_size, tree_base,
                    config, exploration, step, gan_memory)
                MCTS_FLAG = True
            else:
                action = agent.get_action(norm_frame_Q(unnorm_frame(current_state)))

        # For GATS?
        apply_action = action
        # if int(apply_action != 0):
        #     apply_action += 1

        # Observe
        screen, reward, terminal = env.act(apply_action, is_training=True)
        reward = max(config.min_reward, min(config.max_reward, reward))
        history.add(screen)
        memory.add(screen, reward, action, terminal)

        if MCTS_FLAG:
            rp_accuracy.append(int(predicted_reward == reward))
            if reward != 0:
                nonzero_rp_accuracy.append(int(predicted_reward == reward))
            if reward == 1:
                rp_plus_accuracy.append(int(predicted_reward == reward))
            elif reward == -1:
                rp_minus_accuracy.append(int(predicted_reward == reward))

        # Train
        if step > config.gan_learn_start and config.gats:
            if step % rp_train_frequency == 0 and memory.can_sample(config.rp_batch_size):
                obs, act, rew = memory.reward_sample(config.rp_batch_size)
                # obs, act, rew = memory.reward_sample2(
                #     config.rp_batch_size, config.lookahead)
                reward_obs, reward_act, reward_rew = memory.reward_sample(
                    config.nonzero_batch_size, nonzero=True)
                # reward_obs, reward_act, reward_rew = memory.nonzero_reward_sample(
                #     config.rp_batch_size, config.lookahead)
                obs_batch = norm_frame(np.concatenate((obs, reward_obs), axis=0))
                act_batch = np.concatenate((act, reward_act), axis=0)
                rew_batch = np.concatenate((rew, reward_rew), axis=0)
                reward_label = rew_batch + 1
                trajectories = gdm.get_state(obs_batch, act_batch[:, :-1])
                rp_summary = rp.train(trajectories, act_batch, reward_label)
                writer.add_summary(rp_summary, step)
            if step % gdm_train_frequency == 0 and memory.can_sample(config.gan_batch_size):
                state_batch, action_batch, next_state_batch = memory.GAN_sample()
                # state_batch, act_batch, next_state_batch = memory.GAN_sample2(
                #     config.gan_batch_size, config.lookahead)
                # gdm.summary, disc_summary, merged_summary = gdm.train(
                #     norm_frame(state_batch), act_batch, norm_frame(next_state_batch), warmup_bool)
                gdm.summary, disc_summary = gdm.train(
                    norm_frame(state_batch), action_batch, norm_frame(next_state_batch))

        if step > config.learn_start:
            # if step % config.train_frequency == 0 and memory.can_sample(config.batch_size):
            if step % config.train_frequency == 0:
                # s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                #     config.batch_size, config.lookahead)
                s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample()
                s_t, s_t_plus_1 = norm_frame(s_t), norm_frame(s_t_plus_1)
                if config.gats and config.dyna:
                    if step > config.gan_dqn_learn_start and gan_memory.can_sample(config.batch_size):
                        gan_obs_batch, gan_act_batch, gan_rew_batch, gan_terminal_batch = gan_memory.sample()
                        # gan_obs_batch, gan_act_batch, gan_rew_batch = gan_memory.sample(
                        #     config.batch_size)
                        gan_obs_batch = norm_frame(gan_obs_batch)
                        trajectories = gdm.get_state(gan_obs_batch, np.expand_dims(gan_act_batch, axis=1))
                        gan_next_obs_batch = trajectories[:, -config.history_length:, ...]
                        # gan_obs_batch, gan_next_obs_batch = \
                        #     norm_frame(gan_obs_batch), norm_frame(gan_next_obs_batch)
                        s_t = np.concatenate([s_t, gan_obs_batch], axis=0)
                        act_batch = np.concatenate([act_batch, gan_act_batch], axis=0)
                        rew_batch = np.concatenate([rew_batch, gan_rew_batch], axis=0)
                        s_t_plus_1 = np.concatenate([s_t_plus_1, gan_next_obs_batch], axis=0)
                        terminal_batch = np.concatenate([terminal_batch, gan_terminal_batch], axis=0)
                s_t, s_t_plus_1 = norm_frame_Q(unnorm_frame(s_t)), norm_frame_Q(unnorm_frame(s_t_plus_1))
                q_t, loss, dqn_summary = agent.train(s_t, act_batch, rew_batch,
                                                     s_t_plus_1, terminal_batch, step)
                writer.add_summary(dqn_summary, step)
                total_loss += loss
                total_q_value += q_t.mean()
                update_count += 1
            if step % config.target_q_update_step == config.target_q_update_step - 1:
                agent.updated_target_q_network()

        # reinit when an episode ends
        if terminal:
            screen, reward, action, terminal = env.new_random_game()
            num_game += 1
            ep_rewards.append(ep_reward)
            ep_reward = 0.
        else:
            ep_reward += reward
        total_reward += reward

        # change train frequency over time
        if config.gats:
            if step == 10000 - 1:
                rp_train_frequency = 8
                gdm_train_frequency = 8
            if step == 50000 - 1:
                rp_train_frequency = 16
                gdm_train_frequency = 16
            if step == 100000 - 1:
                rp_train_frequency = 24
                gdm_train_frequency = 24

        # Perform a rollout and save the generated images
        if config.gats and step % config._test_step == config._test_step - 1:
            rollout_image(config, image_dir, gdm, memory, step + 1, 16)

        # Calculate and log statistics
        if step >= config.learn_start:
            if step % config._test_step == config._test_step - 1:
                # plot
                if config.gats:
                    writer.add_summary(gdm.summary, step)
                    writer.add_summary(disc_summary, step)
                avg_reward = total_reward / config._test_step
                avg_loss = total_loss / update_count
                avg_q = total_q_value / update_count
                try:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                except:
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                print(
                    '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d'
                    % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game))

                # require target q network
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    step_assign_op.eval({step_input: step + 1})
                    save_model(sess, saver, checkpoint_dir, step + 1)
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                if step >= config.gan_dqn_learn_start:
                    if len(rp_accuracy) > 0:
                        rp_accuracy = np.mean(rp_accuracy)
                        rp_plus_accuracy = np.mean(rp_plus_accuracy)
                        rp_minus_accuracy = np.mean(rp_minus_accuracy)
                        nonzero_rp_accuracy = np.mean(nonzero_rp_accuracy)
                    else:
                        rp_accuracy = 0
                        rp_plus_accuracy = 0
                        rp_minus_accuracy = 0
                        nonzero_rp_accuracy = 0
                else:
                    rp_accuracy = 0
                    rp_plus_accuracy = 0
                    rp_minus_accuracy = 0
                    nonzero_rp_accuracy = 0

                # summary
                if step > 180:
                    inject_summary(
                        sess, writer, summary_ops, summary_placeholders, {
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q value': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of game': num_game,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'rp.rp_accuracy': rp_accuracy,
                            'rp.rp_plus_accuracy': rp_plus_accuracy,
                            'rp.rp_minus_accuracy': rp_minus_accuracy,
                            'rp.nonzero_rp_accuracy': nonzero_rp_accuracy
                        }, step)

                num_game = 0
                total_reward = 0.
                total_loss = 0.
                total_q_value = 0.
                update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []
                rp_accuracy = []
                rp_plus_accuracy = []
                rp_minus_accuracy = []
                nonzero_rp_accuracy = []
class DDQN():
    @staticmethod
    def load(name, only_model=False):
        model = keras.models.load_model('{}.h5'.format(name))
        if only_model:
            dqn = DDQN(model, train=False)
        else:
            with open('{}.pkl'.format(name), 'rb') as file:
                dqn = pickle.load(file)
                dqn.replay_memory = ReplayMemory.load_by_chunks(file)
            dqn.model = model
            dqn.target_model = keras.models.load_model('{}_target.h5'.format(name))
        return dqn

    def __init__(self, model=None, n_actions=-1, train=True, replay_size=1000000,
                 s_epsilon=1.0, e_epsilon=0.1, f_epsilon=1000000, batch_size=32,
                 gamma=0.99, hard_learn_interval=10000, warmup=50000,
                 priority_epsilon=0.02, priority_alpha=0.6, window_size=4):
        """
        :param model: Keras neural network model.
        :param n_actions: Number of possible actions. Only used if using the default model.
        :param train: Whether to train or not (test).
        :param replay_size: Size of the experience replay memory.
        :param s_epsilon: Start epsilon for Q-learning.
        :param e_epsilon: End epsilon for Q-learning.
        :param f_epsilon: Number of frames before epsilon gradually reaches e_epsilon.
        :param batch_size: Number of sampled experiences per frame.
        :param gamma: Future discount for Q-learning.
        :param hard_learn_interval: How often the target network is updated.
        :param warmup: Only perform random actions without learning for warmup steps.
        :param priority_epsilon: Added to every priority to avoid zero-valued priorities.
        :param priority_alpha: Between 0-1. Strength of priority experience sampling. 0 means uniform.
        :param window_size: Number of last observations to use as a single observation (accounting for transitions).
        """
        if model is None:
            # use the default model
            model = DEEPMIND_MODEL
            model.add(Dense(n_actions, init=weight_init, activation="linear"))
            model.compile(optimizer=Adam(lr=0.00025), loss='mse')  # or RMSProp(lr=)
        self.model = model

        self.n_actions = model.layers[-1].output_shape[1]
        self.replay_memory = ReplayMemory(replay_size, window_size=window_size)
        self.epsilon = s_epsilon
        self.e_epsilon = e_epsilon
        self.d_epsilon = (e_epsilon - s_epsilon) / f_epsilon
        self.batch_size = batch_size
        self.warmup = warmup
        self.gamma = gamma
        self.hard_learn_interval = hard_learn_interval
        self.priority_epsilon = priority_epsilon
        self.priority_alpha = priority_alpha
        self.window_size = window_size
        self.train = train

        if train:
            self.target_model = copy.deepcopy(model)
        else:
            self.target_model = model
            self.warmup = -1
            self.e_epsilon = s_epsilon

        self.step = 1

    def _get_target(self, orig, r, a_n, q_n, d):
        """
        Calculates the double Q-learning target. Clips the difference from the
        original value to [-1, 1].
        """
        t = r
        if not d:
            t += self.gamma * q_n[a_n]
        # clipping
        if t - orig > 1:
            t = orig + 1
        elif t - orig < -1:
            t = orig - 1
        return t

    def _modify_target(self, t, a, r, d, a_n, q_n):
        """Modifies the target vector with the DDQN target."""
        t[a] = self._get_target(t[a], r, a_n, q_n, d)
        return t

    def _get_propotional_priority(self, priority):
        return (priority + self.priority_epsilon) ** self.priority_alpha

    def _get_priority(self, t, a, r, d, a_n, q_n):
        priority = abs(t[a] - self._get_target(t[a], r, a_n, q_n, d))
        return self._get_propotional_priority(priority)

    def save(self, name, only_model=False):
        # It isn't recommended to pickle Keras models. We don't pickle the replay
        # memory either, because of memory issues.
        self.model.save("{}.h5".format(name))
        if not only_model:
            self.target_model.save("{}_target.h5".format(name))
            model_tmp = self.model
            target_model_tmp = self.target_model
            replay_memory_tmp = self.replay_memory
            self.model = None
            self.target_model = None
            self.replay_memory = None
            with open("{}.pkl".format(name), "wb") as file:
                pickle.dump(self, file)
                replay_memory_tmp.save_by_chunks(file)
            self.model = model_tmp
            self.target_model = target_model_tmp
            self.replay_memory = replay_memory_tmp

    def predict(self, observation):
        """
        Predicts the next action with the epsilon policy, given an environment observation.

        :param observation: Numpy array with the same shape as the input Keras layer
            or a utils.ObservationSequenceStore object.
        """
        if random.random() < self.epsilon or self.step <= self.warmup:
            return random.randint(0, self.n_actions - 1), None
        Q = self.model.predict_on_batch(np.expand_dims(observation, axis=0))[0]
        a = np.argmax(Q)
        return a, Q[a]

    def learning_step(self, action, reward, new_observation, done):
        """
        Performs a DDQN learning step.

        :param action: Action performed.
        :param reward: Reward after performing the action.
        :param new_observation: Observation after performing the action.
        :param done: Bool - Is the new state terminal.
        :return:
        """
        if self.step <= self.warmup:
            # we use the reward as the priority during warmup
            priority = self._get_propotional_priority(abs(reward))
            self.replay_memory.add(priority, action, reward, new_observation, done)
        else:
            if self.epsilon > self.e_epsilon:
                self.epsilon += self.d_epsilon
            sample = self.replay_memory.sample(self.batch_size)
            idxs, _, experiences = zip(*sample)
            obs, actions, rewards, obs2, dones = map(np.array, zip(*experiences))
            targets = self.model.predict_on_batch(obs)

            # double Q-learning target
            a_next = np.argmax(self.model.predict_on_batch(obs2), axis=1)
            Q_next = self.target_model.predict_on_batch(obs2)

            # calculate new priorities
            priorities = [self._get_priority(t, actions[i], rewards[i], dones[i], a_next[i], Q_next[i])
                          for i, t in enumerate(targets)]

            # update priorities and add the latest experience to memory
            for idx, priority in zip(idxs, priorities):
                self.replay_memory.update(idx, priority)
            self.replay_memory.add(abs(3 * max(priorities)), action, reward, new_observation, done)
            # self.replay_memory.add(priorities, last_experience)

            # calculate new targets
            targets = np.array(
                [self._modify_target(t, actions[i], rewards[i], dones[i], a_next[i], Q_next[i])
                 for i, t in enumerate(targets)])

            # the latest experience is excluded from training
            self.model.train_on_batch(obs, targets)

            # Update target network - aka hard learning step
            if self.step % self.hard_learn_interval == 0:
                self.target_model.set_weights(self.model.get_weights())

        self.step += 1
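# The DDQN above expects a prioritized ReplayMemory with add(priority, ...),
# sample(batch_size) -> [(idx, priority, (obs, action, reward, obs2, done)), ...]
# and update(idx, priority). The class below is only a simplified stand-in that
# documents that assumed interface; the real implementation presumably uses a sum
# tree and rebuilds windowed observations, which this sketch does not attempt.
import numpy as np


class SimplePrioritizedMemory:
    def __init__(self, max_size, window_size=4):
        self.max_size = max_size
        self.window_size = window_size
        self.priorities = []
        self.experiences = []  # (obs, action, reward, next_obs, done)

    def add(self, priority, action, reward, new_observation, done):
        if len(self.experiences) >= self.max_size:
            self.priorities.pop(0)
            self.experiences.pop(0)
        self.priorities.append(priority)
        # Simplification: store the new observation as both obs and next obs.
        self.experiences.append((new_observation, action, reward, new_observation, done))

    def sample(self, batch_size):
        # Sample proportionally to stored priorities (a sum tree does this in O(log n)).
        probs = np.array(self.priorities) / np.sum(self.priorities)
        idxs = np.random.choice(len(self.experiences), size=batch_size, p=probs)
        return [(int(i), self.priorities[i], self.experiences[i]) for i in idxs]

    def update(self, idx, priority):
        self.priorities[idx] = priority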
class Agent(BaseModel):
    def __init__(self, config, environment, sess):
        super(Agent, self).__init__(config)
        self.sess = sess
        self.weight_dir = "weights"

        self.env = environment
        self.num_actions = self.env.action_size
        self.history = History(self.config)
        self.memory = ReplayMemory(self.config, self.model_dir)
        self.is_knn_dict_annoy_used = config.is_knn_dict_annoy_used

        with tf.variable_scope("step"):
            self.step_op = tf.Variable(0, trainable=False, name="step")
            self.step_input = tf.placeholder("int32", None, name="step_input")
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(config)

    def build_dqn(self, config):
        self.w = {}
        self.t_w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope("embeddings"):
            if self.cnn_format == "NHWC":
                self.s_t = tf.placeholder(
                    "float32",
                    [None, self.screen_height, self.screen_width, self.history_length],
                    name="s_t")
            else:
                self.s_t = tf.placeholder(
                    "float32",
                    [None, self.history_length, self.screen_height, self.screen_width],
                    name="s_t")

            self.l1, self.w["l1_w"], self.w["l1_b"] = conv2d(
                self.s_t, 32, [8, 8], [4, 4], initializer, activation_fn,
                self.cnn_format, name="l1")
            self.l2, self.w["l2_w"], self.w["l2_b"] = conv2d(
                self.l1, 64, [4, 4], [2, 2], initializer, activation_fn,
                self.cnn_format, name="l2")
            self.l3, self.w["l3_w"], self.w["l3_b"] = conv2d(
                self.l2, 64, [3, 3], [1, 1], initializer, activation_fn,
                self.cnn_format, name="l3")

            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])], name="l3_float")
            self.l3_flat_length = reduce(lambda x, y: x * y, shape[1:])
            self.embeddings_flat_length = tf.shape(self.l3_flat)

        with tf.variable_scope("prediction"):
            self.embeddings = tf.placeholder(
                "float32", [None, self.l3_flat_length], name="embeddings")

            if self.dueling:
                self.value_hid, self.w["l4_val_w"], self.w["l4_val_b"] = \
                    linear(self.embeddings, 512, activation_fn=activation_fn, name="value_hid")
                self.adv_hid, self.w["l4_adv_w"], self.w["l4_adv_b"] = \
                    linear(self.embeddings, 512, activation_fn=activation_fn, name="adv_hid")
                self.value, self.w["val_w_out"], self.w["val_w_b"] = \
                    linear(self.value_hid, 1, name="value_out")
                self.advantage, self.w["adv_w_out"], self.w["adv_w_b"] = \
                    linear(self.adv_hid, self.num_actions, name="adv_out")
                # dueling aggregation: Q = V + (A - mean(A))
                self.q = self.value + (self.advantage - tf.reduce_mean(
                    self.advantage, reduction_indices=1, keep_dims=True))
            else:
                self.l4, self.w["l4_w"], self.w["l4_b"] = \
                    linear(self.embeddings, 512, activation_fn=activation_fn, name="l4")
                self.q, self.w["q_w"], self.w["q_b"] = \
                    linear(self.l4, self.num_actions, name="q")

            # self.q_action = tf.argmax(self.q, dimension=1)
            self.q_action = tf.argmax(self.q, axis=1)

            q_summary = []
            avg_q = tf.reduce_mean(self.q, 0)
            for idx in range(self.num_actions):
                q_summary.append(tf.summary.histogram("q/%s" % idx, avg_q[idx]))
            self.q_summary = tf.summary.merge(q_summary, "q_summary")

        # target network
        with tf.variable_scope('target'):
            if self.cnn_format == "NHWC":
                self.target_s_t = tf.placeholder(
                    "float32",
                    [None, self.screen_height, self.screen_width, self.history_length],
                    name="target_s_t")
            else:
                self.target_s_t = tf.placeholder(
                    "float32",
                    [None, self.screen_height, self.screen_width, self.history_length],
                    name="target_s_t")

            self.target_l1, self.t_w["l1_w"], self.t_w["l1_b"] = conv2d(
                self.target_s_t, 32, [8, 8], [4, 4], initializer, activation_fn,
                self.cnn_format, name="target_l1")
            self.target_l2, self.t_w["l2_w"], self.t_w["l2_b"] = conv2d(
                self.target_l1, 64, [4, 4], [2, 2], initializer, activation_fn,
                self.cnn_format, name="target_l2")
            self.target_l3, self.t_w["l3_w"], self.t_w["l3_b"] = conv2d(
                self.target_l2, 64, [3, 3], [1, 1], initializer, activation_fn,
                self.cnn_format, name="target_l3")

            shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(
                self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])
            self.target_l3_flat_length = reduce(lambda x, y: x * y, shape[1:])
            self.target_embeddings = tf.placeholder(
                "float32", [None, self.target_l3_flat_length], name="embeddings")

            if self.dueling:
                self.t_value_hid, self.t_w["l4_val_w"], self.t_w["l4_val_b"] = \
                    linear(self.target_embeddings, 512, activation_fn=activation_fn, name="target_value_hid")
                self.t_adv_hid, self.t_w["l4_adv_w"], self.t_w["l4_adv_b"] = \
                    linear(self.target_embeddings, 512, activation_fn=activation_fn, name="target_adv_hid")
                self.t_value, self.t_w["val_w_out"], self.t_w["val_w_b"] = \
                    linear(self.t_value_hid, 1, name="target_value_out")
                self.t_advantage, self.t_w["adv_w_out"], self.t_w["adv_w_b"] = \
                    linear(self.t_adv_hid, self.num_actions, name="target_adv_out")
                self.target_q = self.t_value + (self.t_advantage - tf.reduce_mean(
                    self.t_advantage, reduction_indices=1, keep_dims=True))
            else:
                self.target_l4, self.t_w["l4_w"], self.t_w["l4_b"] = \
                    linear(self.target_embeddings, 512, activation_fn=activation_fn, name="target_l4")
                self.target_q, self.t_w["q_w"], self.t_w['q_b'] = \
                    linear(self.target_l4, self.num_actions, name='target_q')

            self.target_q_idx = tf.placeholder("int32", [None, None], 'outputs_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx)

        with tf.variable_scope("pred_to_target"):
            self.t_w_input = {}
            self.t_w_assign_op = {}
            for name in self.w.keys():
                self.t_w_input[name] = tf.placeholder(
                    "float32", self.t_w[name].get_shape().as_list(), name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

        with tf.variable_scope("optimizer"):
            self.target_q_t = tf.placeholder('float32', [None], name="target_q_t")
            self.action = tf.placeholder("int64", [None], name="action")
            action_one_hot = tf.one_hot(self.action, self.num_actions, 1.0, 0.0, name="action_one_hot")
            q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name="q_acted")

            self.delta = self.target_q_t - q_acted
            self.global_step = tf.Variable(0, trainable=False)
            self.loss = tf.reduce_sum(clipped_error(self.delta), name="loss")
            self.learning_rate_step = tf.placeholder("int64", None, name="learning_rate_step")
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optim = tf.train.RMSPropOptimizer(
                self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        with tf.variable_scope("summary"):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q',
                                   'episode.max reward', 'episode.min reward',
                                   'episode.avg reward', 'episode.num of game',
                                   'training.learning_rate']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    "float32", None, name=tag.replace(" ", "_"))
                self.summary_ops[tag] = tf.summary.scalar(
                    "%s-%s/%s" % (self.env_name, self.env_type, tag),
                    self.summary_placeholders[tag])
            histogram_summary_tags = ['episode.rewards', "episode.actions"]
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    "float32", None, name=tag.replace(" ", "_"))
                self.summary_ops[tag] = tf.summary.histogram(
                    tag, self.summary_placeholders[tag])

            self.writer = tf.summary.FileWriter("./logs/%s" % self.model_dir, self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep=30)

        self.load_model()
        self.load_weight_from_pkl()
        self.update_target_q_network()

    def update_target_q_network(self):
        for name in self.w.keys():
            self.t_w_assign_op[name].eval({self.t_w_input[name]: self.w[name].eval()})

    def train(self):
        print("..............................agent training start..............................")
        start_step = self.step_op.eval()
        start_time = time.time()

        num_game = 0
        self.update_count = 0
        ep_reward = 0.
        total_reward = 0.
        self.total_loss = 0.
        self.total_q = 0.
        max_avg_ep_reward = -1e5
        ep_rewards = []
        actions = []

        screen, reward, action, terminal = self.env.new_random_game()
        for _ in range(self.history_length):
            self.history.add(screen)

        pre_screen = self.l3_flat.eval({self.s_t: [self.history.get()]})
        embs_length = self.embeddings_flat_length.eval({self.s_t: [self.history.get()]})[1]
        if self.is_knn_dict_annoy_used:
            print("embs_length: ", embs_length)
            self.config.knn_key_dim = embs_length
            self.q_annoy_dict = Q_Annoy_Dict(self.config, self.num_actions)
            print("self.q_annoy_dict.key_dimension: ", self.q_annoy_dict.key_dimension)

        for self.step in tqdm(range(start_step, self.max_step), ncols=50, initial=start_step):
            if self.step == self.learning_start:
                num_game = 0
                self.update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []

            # 1. predict
            self.emb_s_t = self.l3_flat.eval({self.s_t: [self.history.get()]})
            action = self.predict(self.emb_s_t)
            # 2. act
            screen, reward, terminal = self.env.act(action, is_training=True)
            self.observe(screen, reward, action, terminal)

            if self.is_knn_dict_annoy_used:
                post_screen = self.l3_flat.eval({self.s_t: [self.history.get()]})
                self.observe_knn_dict(pre_screen, reward, action, terminal, post_screen)
                pre_screen = post_screen

            if (self.step + 1) % self.scale == 0:
                if self.is_knn_dict_annoy_used:
                    print("self.q_annoy_dict.action_capacity: ", self.q_annoy_dict.action_capacity)

            if terminal:
                screen, reward, action, terminal = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self.learning_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        # print("ep_rewards: ", ep_rewards)
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0., 0., 0.

                    print("\n arg_r %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, "
                          "max_ep_r: %.4f, min_ep_r: %.4f, # game: %d"
                          % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                             max_ep_reward, min_ep_reward, num_game))

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.step_assign_op.eval({self.step_input: self.step + 1})
                        # save checkpoint
                        self.save_model(self.step + 1)
                        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)
                        # self.save_weight_to_pkl()

                    if self.step > 180:
                        self.inject_summary({
                            "average.reward": avg_reward,
                            "average.loss": avg_loss,
                            "average.q": avg_q,
                            "episode.max reward": max_ep_reward,
                            "episode.min reward": min_ep_reward,
                            "episode.avg reward": avg_ep_reward,
                            "episode.num of game": num_game,
                            "episode.rewards": ep_rewards,
                            "episode.actions": actions,
                            "training.learning_rate":
                                self.learning_rate_op.eval({self.learning_rate_step: self.step}),
                        }, self.step)

                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def observe(self, screen, reward, action, terminal):
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.memory.add(screen, reward, action, terminal)

        if self.step > self.learning_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()
            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_q_network()

    def observe_knn_dict(self, pre_screen, reward, action, terminal, post_screen):
        reward = max(self.min_reward, min(self.max_reward, reward))
        self.q_annoy_dict.add(pre_screen, [action], [reward], [terminal], post_screen)

    def q_learning_mini_batch(self):
        # print("q_learning_mini_batch")
        if self.memory.count < self.history_length:
            return
        else:
            s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()

        if self.is_knn_dict_annoy_used:
            s_t_ = self.l3_flat.eval({self.s_t: s_t})
            s_t_plus_1_ = self.l3_flat.eval({self.s_t: s_t_plus_1})
            # s_t_, action_, reward_, terminal_, s_t_plus_1_ = self.q_annoy_dict.query(s_t_, 1)
            # s_t_, action_, reward_, terminal_, s_t_plus_1_ = self.q_annoy_dict.query_(s_t_, 1)
            s_t_dnd, action_dnd, reward_dnd, terminal_dnd, s_t_plus_1_dnd = \
                self.q_annoy_dict.query_actions(s_t_, action, 1)

            s_t = np.concatenate((s_t_, s_t_dnd), axis=0)
            action = np.concatenate((action, action_dnd), axis=0)
            reward = np.concatenate((reward, reward_dnd), axis=0)
            s_t_plus_1 = np.concatenate((s_t_plus_1_, s_t_plus_1_dnd), axis=0)
            terminal = np.concatenate((terminal, terminal_dnd), axis=0)

        t = time.time()
        if self.double_q:
            # Double Q-learning
            if not self.is_knn_dict_annoy_used:
                emb_s_t_plus_1 = self.l3_flat.eval({self.s_t: s_t_plus_1})
                pred_action = self.q_action.eval({self.embeddings: emb_s_t_plus_1})
                target_s_t_plus_1_embeddings = self.target_l3_flat.eval({self.target_s_t: s_t_plus_1})
                q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                    self.target_embeddings: target_s_t_plus_1_embeddings,
                    self.target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]
                })
                target_q_t = (1. - terminal) * self.discount * q_t_plus_1_with_pred_action + reward
            else:
                pred_action = self.q_action.eval({self.embeddings: s_t_plus_1})
                q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                    self.target_embeddings: s_t_plus_1,
                    self.target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]
                })
                # same target as the non-kNN branch
                target_q_t = (1. - terminal) * self.discount * q_t_plus_1_with_pred_action + reward
        else:
            if not self.is_knn_dict_annoy_used:
                target_s_t_plus_1_embeddings = self.target_l3_flat.eval({self.target_s_t: s_t_plus_1})
                q_t_plus_1 = self.target_q.eval({self.target_embeddings: target_s_t_plus_1_embeddings})
            else:
                q_t_plus_1 = self.target_q.eval({self.target_embeddings: s_t_plus_1})
            terminal = np.array(terminal) + 0.
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            target_q_t = (1. - terminal) * self.discount * max_q_t_plus_1 + reward

        if not self.is_knn_dict_annoy_used:
            emb_s_t = self.l3_flat.eval({self.s_t: s_t})
        else:
            emb_s_t = s_t

        _, q_t, loss, summary_str = self.sess.run(
            [self.optim, self.q, self.loss, self.q_summary],
            {
                self.target_q_t: target_q_t,
                self.action: action,
                # self.s_t: s_t,
                self.embeddings: emb_s_t,
                self.learning_rate_step: self.step
            })

        self.writer.add_summary(summary_str, self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def predict(self, emb_s_t, test_ep=None):
        ep = test_ep or (self.ep_end + max(
            0., (self.ep_start - self.ep_end)
            * (self.ep_end_t - max(0., self.step - self.learning_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.num_actions)
        else:
            action = self.q_action.eval({self.embeddings: emb_s_t})[0]
        return action

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()],
            {self.summary_placeholders[tag]: value for tag, value in tag_dict.items()})
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, self.step)

    def play(self, n_step=10000, n_episode=100, test_ep=None, render=False):
        if test_ep == None:
            test_ep = self.ep_end

        test_history = History(self.config)

        if not self.display:
            gym_dir = "/tmp/%s-%s" % (self.env_name, get_time())
            self.env.env.monitor.start(gym_dir)

        best_reward, best_idx = 0., 0
        for idx in range(n_episode):
            screen, reward, action, terminal = self.env.new_random_game()
            current_reward = 0

            for _ in range(self.history_length):
                test_history.add(screen)

            for t in tqdm(range(n_step), ncols=70):
                # 1. predict
                action = self.predict(test_history.get(), test_ep)
                # 2. act
                screen, reward, terminal = self.env.act(action, is_training=False)
                test_history.add(screen)

                current_reward += reward
                if terminal:
                    break

            if current_reward > best_reward:
                best_reward = current_reward
                best_idx = idx

            print("=" * 30)
            print(" [%d] Best reward : %d" % (best_idx, best_reward))
            print("=" * 30)

        if not self.display:
            self.env.env.monitor.close()

    def save_weight_to_pkl(self):
        print("[*] save pred to pkl......")
        if not os.path.exists(self.weight_dir):
            os.makedirs(self.weight_dir)
        for name in self.w.keys():
            save_pkl(self.w[name].eval(), os.path.join(self.weight_dir, "%s.pkl" % name))

    def load_weight_from_pkl(self, cpu_mode=False):
        print("[*] load pred from pkl.......")
        for name in self.w.keys():
            if not os.path.exists(os.path.join(self.weight_dir, "%s.pkl" % name)):
                print("[*FAIL] load pred from pkl")
                return

        with tf.variable_scope("load_pred_from_pkl"):
            self.w_input = {}
            self.w_assign_op = {}
            for name in self.w.keys():
                self.w_input[name] = tf.placeholder(
                    "float32", self.w[name].get_shape().as_list(), name=name)
                self.w_assign_op[name] = self.w[name].assign(self.w_input[name])
        for name in self.w.keys():
            self.w_assign_op[name].eval({
                self.w_input[name]: load_pkl(os.path.join(self.weight_dir, "%s.pkl" % name))
            })
        self.update_target_q_network()
        print("[*SUCCESS] load pred from pkl")
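# The linear() helper used throughout build_dqn() returns the layer output
# together with its weight and bias variables, so they can be collected into
# self.w / self.t_w for the target-network sync. It is not defined in this
# snippet; the version below follows the common DQN-TensorFlow layout and should
# be treated as an assumption rather than this repository's exact helper:
def linear(input_, output_size, stddev=0.02, bias_start=0.0, activation_fn=None, name='linear'):
    shape = input_.get_shape().as_list()
    with tf.variable_scope(name):
        w = tf.get_variable('Matrix', [shape[1], output_size], tf.float32,
                            tf.random_normal_initializer(stddev=stddev))
        b = tf.get_variable('bias', [output_size],
                            initializer=tf.constant_initializer(bias_start))
        out = tf.nn.bias_add(tf.matmul(input_, w), b)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b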
class PongAgent:
    def __init__(self, mode=None):
        self.env = wrap_dqn(gym.make('PongDeterministic-v4'))
        if mode == 'test':
            self.env = Monitor(self.env, './video', force=True,
                               video_callable=lambda episode_id: True)
        self.num_actions = self.env.action_space.n

        self.dqn = DQN(self.num_actions)
        self.target_dqn = DQN(self.num_actions)
        if use_gpu:
            self.dqn.cuda()
            self.target_dqn.cuda()

        self.buffer = ReplayMemory(1000)
        self.gamma = 0.99
        self.mse_loss = nn.MSELoss()
        self.optim = optim.RMSprop(self.dqn.parameters(), lr=0.01)

        self.out_dir = './model'
        self.writer = SummaryWriter()
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def to_var(self, x):
        x_var = Variable(x)
        if use_gpu:
            x_var = x_var.cuda()
        return x_var

    def predict_q_values(self, states):
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.dqn(states)
        return actions

    def predict_q_target_values(self, states):
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.target_dqn(states)
        return actions

    def select_action(self, state, epsilon):
        choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon)))
        if choice == 0:
            return np.random.choice(range(self.num_actions))
        else:
            state = np.expand_dims(state, 0)
            actions = self.predict_q_values(state)
            return np.argmax(actions.data.cpu().numpy())

    def update(self, predicts, targets, actions):
        targets = self.to_var(torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(torch.unsqueeze(torch.from_numpy(actions).long(), -1))

        affected_values = torch.gather(predicts, 1, actions)
        loss = self.mse_loss(affected_values, targets)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start, epsilon_final):
        return max(epsilon_final, epsilon_start - total_steps / max_epsilon_steps)

    def sync_target_network(self):
        primary_params = list(self.dqn.parameters())
        target_params = list(self.target_dqn.parameters())
        for i in range(0, len(primary_params)):
            target_params[i].data[:] = primary_params[i].data[:]

    def calculate_q_targets(self, next_states, rewards, dones):
        dones_mask = (dones == 1)
        predicted_q_target_values = self.predict_q_target_values(next_states)
        next_max_q_values = np.max(predicted_q_target_values.data.cpu().numpy(), axis=1)
        next_max_q_values[dones_mask] = 0  # no next max Q values if the game is over
        q_targets = rewards + self.gamma * next_max_q_values
        return q_targets

    def save_final_model(self):
        filename = '{}/final_model.pth'.format(self.out_dir)
        torch.save(self.dqn.state_dict(), filename)

    def save_model_during_training(self, episode):
        filename = '{}/current_model_{}.pth'.format(self.out_dir, episode)
        torch.save(self.dqn.state_dict(), filename)

    def load_model(self, filename):
        self.dqn.load_state_dict(torch.load(filename))
        self.sync_target_network()

    def play(self, episodes):
        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            while not done:
                action = self.select_action(state, 0)  # force to choose an action from the network
                state, reward, done, _ = self.env.step(action)
                # self.env.render()

    def close_env(self):
        self.env.close()

    def train(self, replay_buffer_fill_len, batch_size, episodes,
              max_epsilon_steps, epsilon_start, epsilon_final, sync_target_net_freq):
        start_time = time.time()
        print('Start training at: ' + time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        # populate replay memory
        print('Populating replay buffer... ')
        print('\n')
        state = self.env.reset()
        for i in range(replay_buffer_fill_len):
            action = self.select_action(state, 1)  # force to choose a random action
            next_state, reward, done, _ = self.env.step(action)
            self.buffer.add(state, action, reward, done, next_state)
            state = next_state
            if done:
                self.env.reset()

        print('replay buffer populated with {} transitions, start training...'.format(self.buffer.count()))
        print('\n')

        # main loop - iterate over episodes
        for i in range(1, episodes + 1):
            # reset the environment
            done = False
            state = self.env.reset()

            # reset episode reward and length
            episode_reward = 0
            episode_length = 0

            # play until it is possible
            while not done:
                # synchronize target network with estimation network at the required frequency
                if (total_steps % sync_target_net_freq) == 0:
                    self.sync_target_network()

                # calculate epsilon and select greedy action
                epsilon = self.get_epsilon(total_steps, max_epsilon_steps, epsilon_start, epsilon_final)
                action = self.select_action(state, epsilon)

                # execute action in the environment
                next_state, reward, done, _ = self.env.step(action)
                # store transition in replay memory
                self.buffer.add(state, action, reward, done, next_state)

                # sample random minibatch of transitions
                s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample(batch_size)

                # predict Q values using the estimation network
                predicted_values = self.predict_q_values(s_batch)
                # estimate Q values using the target network
                q_targets = self.calculate_q_targets(next_s_batch, r_batch, d_batch)
                # update weights in the estimation network
                self.update(predicted_values, q_targets, a_batch)

                # set the state for the next action selection and update counters and reward
                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += reward

                self.writer.add_scalar('data/reward', reward, total_steps)
                self.writer.add_scalar('data/epsilon', epsilon, total_steps)

            running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward
            self.writer.add_scalar('data/episode_reward', episode_reward, i)
            self.writer.add_scalar('data/running_episode_reward', running_episode_reward, i)

            if (i % 30) == 0:
                print('global step: {}'.format(total_steps))
                print('episode: {}'.format(i))
                print('running reward: {}'.format(round(running_episode_reward, 2)))
                print('current epsilon: {}'.format(round(epsilon, 2)))
                print('episode_length: {}'.format(episode_length))
                print('episode reward: {}'.format(episode_reward))
                curr_time = time.time()
                print('current time: ' + time.asctime(time.localtime(curr_time)))
                print('running for: ' + str(datetime.timedelta(seconds=curr_time - start_time)))
                print('saving model after {} episodes...'.format(i))
                print('\n')
                self.save_model_during_training(i)

        print('Finish training at: ' + time.asctime(time.localtime(start_time)))
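# A usage sketch for PongAgent.train(); the hyperparameter values below are
# illustrative assumptions, not the project's tuned settings:
if __name__ == '__main__':
    agent = PongAgent()
    agent.train(replay_buffer_fill_len=100,     # transitions collected before learning starts
                batch_size=32,
                episodes=500,
                max_epsilon_steps=100000,       # linear epsilon decay horizon
                epsilon_start=1.0,
                epsilon_final=0.02,
                sync_target_net_freq=1000)      # hard target-network sync interval
    agent.save_final_model()
    agent.close_env()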