Example #1
    def __init__(self, config, action_space):
        self.replay_memory = ReplayMemory(config)
        self.history = History(config)
        self.config = config
        self.action_space = action_space
        self.train_counter = 0

        #self.w_initializer = tf.truncated_normal_initializer(0, 0.02)
        #self.w_initializer = tf.uniform_unit_scaling_initializer(1.0)
        self.w_initializer = tf.contrib.layers.xavier_initializer()
        self.b_initializer = tf.constant_initializer(0.0)

        # Build placeholders
        with tf.name_scope("placeholders"):
            self.current_observation = tf.placeholder(
                tf.float32,
                [None, self.config.screen_height, self.config.screen_width, self.config.history_length]
            )
            self.next_observation = tf.placeholder(
                tf.float32,
                [None, self.config.screen_height, self.config.screen_width, self.config.history_length]
            )
            self.current_action = tf.placeholder(tf.int32, [None])
            self.current_reward = tf.placeholder(tf.float32, [None])
            self.done = tf.placeholder(tf.float32, [None])

        # Build ops
        self.train_op, self.predicted_action, self.target_update_op = self._build(
            self.current_observation, self.next_observation, self.current_action,
            self.current_reward, self.done
        )
        self.summary_op = tf.summary.merge_all()
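The constructor above only reads config.screen_height, config.screen_width and config.history_length; a minimal stand-in for experimenting with the snippet (an assumption: the real config class is not shown, and 84x84x4 simply mirrors the usual Atari setup) could be the following. Note that ReplayMemory(config) and History(config) will read additional fields not listed here.

from types import SimpleNamespace

# Hypothetical config object with just the fields the snippet above dereferences.
config = SimpleNamespace(screen_height=84, screen_width=84, history_length=4)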
Example #2
    def __init__(self, mode=None):
        self.env = wrap_dqn(gym.make('PongDeterministic-v4'))
        if mode == 'test':
            self.env = Monitor(self.env,
                               './video',
                               force=True,
                               video_callable=lambda episode_id: True)
        self.num_actions = self.env.action_space.n

        self.dqn = DQN(self.num_actions)
        self.target_dqn = DQN(self.num_actions)

        if use_gpu:
            self.dqn.cuda()
            self.target_dqn.cuda()

        self.buffer = ReplayMemory(1000)

        self.gamma = 0.99

        self.mse_loss = nn.MSELoss()
        self.optim = optim.RMSprop(self.dqn.parameters(), lr=0.01)

        self.out_dir = './model'
        self.writer = SummaryWriter()

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
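The snippet builds both dqn and target_dqn but does not show how the target network is kept in sync; a typical PyTorch pattern (an assumption here, and the helper name is hypothetical) is a hard update that copies the online weights:

def sync_target_dqn(self):
    # Hard update: copy the online network's weights into the target network.
    self.target_dqn.load_state_dict(self.dqn.state_dict())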
Example #3
 def test_len(self):
     max_size = 10
     size = 5
     replay_memory = ReplayMemory(max_size)
     for i in range(size):
         replay_memory.add(i, i + size, i + size * 2, i + size * 3,
                           i + size * 4)
     self.assertEqual(min(size, max_size), len(replay_memory))
Example #4
 def test_sample(self):
     size = 5
     sample_size = 3
     replay_memory = ReplayMemory(size)
     for i in range(size):
         replay_memory.add(i, i + size, i + size * 2, i + size * 3,
                           i + size * 4)
     sample = replay_memory.sample(sample_size)
     print(sample)
Example #5
 def test_add(self):
     size = 5
     replay_memory = ReplayMemory(size)
     for i in range(size):
         replay_memory.add(i, i + size, i + size * 2, i + size * 3,
                           i + size * 4)
     for i in range(size):
         data = replay_memory[i]
         for j in range(5):
             self.assertEqual(data[j], i + size * j)
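The three tests above only exercise add(), sample(), __len__() and __getitem__(); a minimal deque-based sketch that satisfies them (a sketch, not the implementation actually under test) is:

import random
from collections import deque

class ReplayMemory:
    def __init__(self, max_size):
        # Oldest transitions are evicted once max_size is reached.
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __getitem__(self, index):
        return self.buffer[index]

    def __len__(self):
        return len(self.buffer)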
Example #6
 def __init__(self, params, model_path):
     self.params = params
     self.model_path = model_path
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
     self.current_q_net.to(self.device)
     self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
     self.target_q_net.to(self.device)
     self.optimizer = RMSprop(self.current_q_net.parameters(),
                              lr=self.params.lr)
     self.replay_memory = ReplayMemory(self.params.memory_capacity)
     env = gym.make('CarRacing-v0')
     self.environment = EnvironmentWrapper(env, self.params.skip_steps)
Example #7
    def __init__(self, model=None, n_actions=-1, train=True, replay_size=1000000, s_epsilon=1.0, e_epsilon=0.1,
                 f_epsilon=1000000, batch_size=32, gamma=0.99, hard_learn_interval=10000, warmup=50000,
                 priority_epsilon=0.02, priority_alpha=0.6, window_size = 4):
        """
        :param model: Keras neural network model.
        :param n_actions: Number of possible actions. Only used if using default model.
        :param train: Whether to train or not (test).
        :param replay_size: Size of experience replay memory.
        :param s_epsilon: Start epsilon for Q-learning.
        :param e_epsilon: End epsilon for Q-learning.
        :param f_epsilon: Number of frames before epsilon gradually reaches e_epsilon.
        :param batch_size: Number of sampled experiences per frame.
        :param gamma: Future discount for Q-learning.
        :param hard_learn_interval: How often the target network is updated.
        :param warmup: Only perform random actions without learning for warmup steps.
        :param priority_epsilon: Added to every priority to avoid zero-valued priorities.
        :param priority_alpha: Between 0-1. Strength of priority experience sampling. 0 means uniform.
        :param window_size: Number of last observations to use as a single observation (accounting for transitions).
        """

        if model is None:
            #use default model
            model = DEEPMIND_MODEL
            model.add(Dense(n_actions, init=weight_init, activation="linear"))
            model.compile(optimizer=Adam(lr=0.00025), loss='mse') #or RMSProp(lr=)

        self.model = model
        self.n_actions = model.layers[-1].output_shape[1]
        self.replay_memory = ReplayMemory(replay_size, window_size=window_size)
        self.epsilon = s_epsilon
        self.e_epsilon = e_epsilon
        self.d_epsilon = (e_epsilon - s_epsilon) / f_epsilon
        self.batch_size = batch_size
        self.warmup = warmup
        self.gamma = gamma
        self.hard_learn_interval = hard_learn_interval
        self.priority_epsilon = priority_epsilon
        self.priority_alpha = priority_alpha
        self.window_size = window_size
        self.train = train

        if train:
            self.target_model = copy.deepcopy(model)
        else:
            self.target_model = model
            self.warmup = -1
            self.e_epsilon = s_epsilon

        self.step = 1
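With the defaults above the exploration schedule is a simple linear decay: d_epsilon = (e_epsilon - s_epsilon) / f_epsilon = (0.1 - 1.0) / 1e6 = -9e-7 per step, so epsilon reaches e_epsilon after f_epsilon learning steps (the per-step update itself appears later, in Example #17). A standalone sketch of that decay:

s_epsilon, e_epsilon, f_epsilon = 1.0, 0.1, 1000000
d_epsilon = (e_epsilon - s_epsilon) / f_epsilon  # -9e-7
epsilon = s_epsilon
for step in range(f_epsilon):
    if epsilon > e_epsilon:
        epsilon += d_epsilon
print(round(epsilon, 6))  # ~0.1 after f_epsilon steps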
Example #8
    def __init__(
        self, 
        # main_q: Model, 
        # target_q: Model,
        # replay_memory: ReplayMemory,
        n_actions: int,
        input_shape: Tuple = (84, 84),
        batch_size: int=32,
        history_length: int=4,
        learning_rate: float=0.00001,
        eps_initial: int=1,
        eps_final: float=0.1,
        eps_final_frame: float=0.0,
        eps_evaluation: float=0.0,
        eps_annealing_frames: int=1000000,
        replay_buffer_size: int = 1000000,
        replay_buffer_start_size: int=50000,
        max_frames: int=25000000,
        use_per: bool=True) -> None:

        self.n_actions = n_actions
        self.input_shape = input_shape
        self.history_length = history_length
        self.learning_rate = learning_rate
        self.replay_buffer_start_size = replay_buffer_start_size
        self.max_frames = max_frames
        self.batch_size = batch_size
        # self.replay_buffer = replay_memory
        self.use_per = use_per
        self.eps_initial = eps_initial
        self.eps_final = eps_final
        self.eps_final_frame = eps_final_frame
        self.eps_evaluation = eps_evaluation
        self.eps_annealing_frames = eps_annealing_frames
        self.replay_buffer_size = replay_buffer_size
        
        self.slope = -(self.eps_initial - self.eps_final) / self.eps_annealing_frames
        self.intercept = self.eps_initial - self.slope*self.replay_buffer_start_size
        self.slope_2 = -(self.eps_final - self.eps_final_frame) / (self.max_frames - self.eps_annealing_frames - self.replay_buffer_start_size)
        self.intercept_2 = self.eps_final_frame - self.slope_2*self.max_frames
        
        self.replay_buffer: ReplayMemory = ReplayMemory(
                                                size=self.replay_buffer_size,
                                                input_shape=self.input_shape,
                                                history_length=self.history_length,
                                                use_per=self.use_per)
        
        # self.main_q: Model = DuelingDQN(self.n_actions, self.input_shape, self.history_length)
        # self.target_q: Model = DuelingDQN(self.n_actions, self.input_shape, self.history_length)
        
        # self.main_q.build((self.input_shape[0], self.input_shape[1], self.history_length))
        # self.target_q.build((self.input_shape[0], self.input_shape[1], self.history_length))
        
        self.main_q = build_q_network(self.n_actions, self.input_shape, self.history_length)
        self.target_q = build_q_network(self.n_actions, self.input_shape, self.history_length)
        
        self.main_q.compile(optimizer=Adam(self.learning_rate), loss=Huber())
        self.target_q.compile(optimizer=Adam(self.learning_rate), loss=Huber())
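The slope/intercept pairs above describe a two-phase linear annealing of epsilon over frame numbers, but the method that consumes them is not part of the snippet. A hypothetical calc_epsilon (an assumption about how these coefficients are usually combined) would look like:

    def calc_epsilon(self, frame_number, evaluation=False):
        # Constant during evaluation, then eps_initial while the buffer fills,
        # then two linear segments down to eps_final and finally eps_final_frame.
        if evaluation:
            return self.eps_evaluation
        if frame_number < self.replay_buffer_start_size:
            return self.eps_initial
        if frame_number < self.replay_buffer_start_size + self.eps_annealing_frames:
            return self.slope * frame_number + self.intercept
        return max(self.eps_final_frame, self.slope_2 * frame_number + self.intercept_2)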
Example #9
    def __init__(self, config, environment, sess):
        super(Agent, self).__init__(config)
        self.sess = sess
        self.weight_dir = "weights"

        self.env = environment
        self.num_actions = self.env.action_size

        self.history = History(self.config)
        self.memory = ReplayMemory(self.config, self.model_dir)

        self.is_knn_dict_annoy_used = config.is_knn_dict_annoy_used

        with tf.variable_scope("step"):
            self.step_op = tf.Variable(0, trainable=False, name="step")
            self.step_input = tf.placeholder("int32", None, name="step_input")
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(config)
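The step variable and its assign op above are typically used to persist the training step alongside checkpoints; a usage sketch (the helper name is hypothetical):

    def save_step(self, sess, step):
        # Write the in-memory step counter into the graph variable before saving.
        sess.run(self.step_assign_op, feed_dict={self.step_input: step})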
Example #10
    def load(name, only_model = False):
        model = keras.models.load_model('{}.h5'.format(name))
        if only_model:
            dqn = DDQN(model, train=False)
        else:
            with open('{}.pkl'.format(name), 'rb') as file:
                dqn = pickle.load(file)
                dqn.replay_memory = ReplayMemory.load_by_chunks(file)

            dqn.model = model
            dqn.target_model = keras.models.load_model('{}_target.h5'.format(name))

        return dqn
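A short usage sketch for the loader above (the checkpoint name is hypothetical): with only_model=True just the Keras model is restored, otherwise the pickled agent, its target model and the chunked replay memory come back as well.

agent = DDQN.load('breakout_ddqn')                        # full agent, ready to resume training
eval_agent = DDQN.load('breakout_ddqn', only_model=True)  # model only, e.g. for evaluation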
Example #11
    def learn(self):
        """
        Main method.
        Runs the DQN algorithm.
        """
        # Replay Memory
        replay_mem = ReplayMemory(self.mem_size)

        # Q-Network
        q_func = CNN(self.env.action_space.n, self.history_len, self.width,
                     self.height)
        q_network_weights = q_func.model.trainable_weights  # weights to be trained
        # TargetNetwork
        target_func = CNN(self.env.action_space.n, self.history_len,
                          self.width, self.height)
        target_network_weights = target_func.model.trainable_weights  # list of weights

        # Op that periodically syncs the target network with the Q-network
        assign_target_network = [
            target_network_weights[i].assign(q_network_weights[i])
            for i in range(len(target_network_weights))
        ]

        # Ops for the loss function and optimization
        a, y, loss, grad_update = self.build_training_op(
            self.env.action_space.n, q_func)

        # Build the session
        sess = tf.InteractiveSession()

        # Initialize variables (initializes the Q-network)
        sess.run(tf.global_variables_initializer())

        # Initialize the target network
        sess.run(assign_target_network)

        # Initialize the agent
        agent = DQNAgent(num_actions=self.env.action_space.n,
                         q_func=q_func,
                         schedule_time_steps=int(self.expl_frac * self.tmax),
                         initial_time_step=self.replay_st_size,
                         final_p=self.fin_expl)

        # Logger
        logger = Logger(sess, self.save_summary_path)

        t = 0
        episode = 0
        # Main loop
        while t < self.tmax:
            # Run an episode
            episode += 1
            duration = 0
            total_reward = 0.0
            total_q_max = 0.0
            total_loss = 0
            done = False
            # Reset the environment
            obs = self.env.reset()
            # Run until the episode ends
            while not done:
                # Save the previous observation
                pre_obs = obs.copy()
                # Select an action following the epsilon-greedy policy
                action = agent.action(t, obs)
                # Execute the action and observe the reward, next screen and done flag
                obs, reward, done, info = self.env.step(action)
                # Add (s_t, a_t, r_t, s_{t+1}, done) to the replay memory
                replay_mem.add(pre_obs, action, reward, obs, done)
                if self.render:
                    self.env.render()
                if t > self.replay_st_size and t % self.learn_freq == 0:
                    # Train the Q-network
                    total_loss += self.train(sess, q_func, a, y, loss,
                                             grad_update, replay_mem,
                                             target_func)
                if t > self.replay_st_size and t % self.update_freq == 0:
                    # Update the target network
                    sess.run(assign_target_network)
                if t > self.replay_st_size and t % self.save_network_freq == 0:
                    save_sess(sess, self.save_network_path, t)
                total_reward += reward
                total_q_max += np.max(
                    q_func.q_values.eval(feed_dict={q_func.s: [obs]}))
                t += 1
                duration += 1
            if t >= self.replay_st_size:
                logger.write(sess, total_reward, total_q_max / float(duration),
                             duration, total_loss / float(duration), t,
                             episode)
            print(
                'EPISODE: {0:6d} / TIME_STEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} '
                '/ AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f}'.format(
                    episode, t, duration, agent.epsilon.value(t), total_reward,
                    total_q_max / float(duration),
                    total_loss / float(duration)))
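build_training_op() is not shown in this example; based on how its outputs (a, y, loss, grad_update) are fed during train(), a plausible sketch (an assumption, using the Huber-style clipped loss common in DQN implementations) is:

    def build_training_op(self, num_actions, q_func):
        a = tf.placeholder(tf.int64, [None])    # actions taken
        y = tf.placeholder(tf.float32, [None])  # TD targets: r + gamma * max_a' Q_target(s', a')
        a_one_hot = tf.one_hot(a, num_actions, 1.0, 0.0)
        q_value = tf.reduce_sum(q_func.q_values * a_one_hot, axis=1)
        # Huber-style loss: quadratic near zero, linear for large errors.
        error = tf.abs(y - q_value)
        clipped = tf.clip_by_value(error, 0.0, 1.0)
        loss = tf.reduce_mean(0.5 * tf.square(clipped) + (error - clipped))
        optimizer = tf.train.RMSPropOptimizer(0.00025, momentum=0.95, epsilon=0.01)
        grad_update = optimizer.minimize(loss, var_list=q_func.model.trainable_weights)
        return a, y, loss, grad_update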
Example #12
class Agent(object):
    def __init__(self, config, action_space):
        self.replay_memory = ReplayMemory(config)
        self.history = History(config)
        self.config = config
        self.action_space = action_space
        self.train_counter = 0

        #self.w_initializer = tf.truncated_normal_initializer(0, 0.02)
        #self.w_initializer = tf.uniform_unit_scaling_initializer(1.0)
        self.w_initializer = tf.contrib.layers.xavier_initializer()
        self.b_initializer = tf.constant_initializer(0.0)

        # Build placeholders
        with tf.name_scope("placeholders"):
            self.current_observation = tf.placeholder(
                tf.float32,
                [None, self.config.screen_height, self.config.screen_width, self.config.history_length]
            )
            self.next_observation = tf.placeholder(
                tf.float32,
                [None, self.config.screen_height, self.config.screen_width, self.config.history_length]
            )
            self.current_action = tf.placeholder(tf.int32, [None])
            self.current_reward = tf.placeholder(tf.float32, [None])
            self.done = tf.placeholder(tf.float32, [None])

        # Build ops
        self.train_op, self.predicted_action, self.target_update_op = self._build(
            self.current_observation, self.next_observation, self.current_action,
            self.current_reward, self.done
        )
        self.summary_op = tf.summary.merge_all()

    def train(self, observation, reward, done, current_step, sess):
        # Update history
        self.history.add(observation)

        # Predict action via epsilon-greedy policy
        epsilon = (self.config.epsilon_end +
                   max(0.0, ((self.config.epsilon_start - self.config.epsilon_end) *
                             (self.config.epsilon_end_step - max(0., current_step - self.config.learn_start)) /
                             self.config.epsilon_end_step)))

        if random.random() < epsilon:
            action = random.randrange(self.action_space)
        else:
            action = sess.run(
                self.predicted_action,
                {self.current_observation: [self.history.get()]}
            )
            action = action[0]

        # Reset history
        if done:
            self.history.reset()

        # Update memory and sample
        self.replay_memory.add(observation, reward, action, done)

        # Update source network
        if current_step > self.config.learn_start:
            if self.train_counter == self.config.update_frequency:
                current_observation, current_action, current_reward, next_observation, current_done = self.replay_memory.sample()
                _, summary_str = sess.run([self.train_op, self.summary_op],
                                          {self.current_observation: current_observation,
                                           self.next_observation: next_observation,
                                           self.current_action: current_action,
                                           self.current_reward: current_reward,
                                           self.done: current_done})
                self.train_counter = 0
            else:
                self.train_counter += 1
                summary_str = None
        else:
            summary_str = None

        # Update target network
        if (current_step + 1) % self.config.target_network_update_step == 0:
            tf.logging.info("Update target network")
            sess.run([self.target_update_op])

        return action, epsilon, summary_str

    def predict(self, observation, sess):
        pass

    def _build(self, current_observation, next_observation, current_action, current_reward, done):
        # Global variables
        self.global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), dtype=tf.int32, trainable=False)

        # Build network
        source_q = self._build_network(current_observation, 'source', True)
        target_q = self._build_network(next_observation, 'target', False)

        # Compute loss
        action_one_hot = tf.one_hot(current_action, self.action_space, 1.0, 0.0, name="action_one_hot")
        q_acted = tf.reduce_sum(source_q * action_one_hot, reduction_indices=1, name="q_acted")
        max_target_q = tf.reduce_max(target_q, axis=1)
        delta = (1 - done) * self.config.gamma * max_target_q + current_reward - q_acted
        loss = tf.reduce_mean(clipped_error(delta))

        # Optimize
        learning_rate_op = tf.maximum(
            self.config.learning_rate_minimum,
            tf.train.exponential_decay(
                self.config.initial_learning_rate,
                self.global_step,
                self.config.learning_rate_decay_step,
                self.config.learning_rate_decay,
                staircase=True
            )
        )
        train_op = tf.train.RMSPropOptimizer(learning_rate_op, momentum=0.95, epsilon=0.01).minimize(loss, global_step=self.global_step)

        # Update target network
        target_update_op = []
        source_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='source')
        target_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target')
        for source_variable, target_variable in zip(source_variables, target_variables):
            target_update_op.append(target_variable.assign(source_variable.value()))
        target_update_op = tf.group(*target_update_op)

        # Logging
        predicted_action = tf.argmax(source_q, dimension=1)
        avg_q = tf.reduce_mean(source_q, 0)
        for idx in range(self.action_space):
            tf.summary.histogram('q/{}'.format(idx), avg_q[idx])
        tf.summary.scalar('learning_rate', learning_rate_op)
        tf.summary.scalar('loss', loss)

        return train_op, predicted_action, target_update_op

    def _build_network(self, observation, name='source', trainable=True):
        with tf.variable_scope(name):
            with arg_scope([layers.conv2d, layers.fully_connected],
                           activation_fn=tf.nn.relu,
                           weights_initializer=self.w_initializer,
                           biases_initializer=self.b_initializer,
                           trainable=trainable):
                with arg_scope([layers.conv2d], padding='VALID'):
                    conv1 = layers.conv2d(observation, num_outputs=32, kernel_size=8, stride=4, scope='conv1')
                    conv2 = layers.conv2d(conv1, num_outputs=64, kernel_size=4, stride=2, scope='conv2')
                    conv3 = layers.conv2d(conv2, num_outputs=64, kernel_size=3, stride=1, scope='conv3')

                conv3_shape = conv3.get_shape().as_list()
                conv3_flat = tf.reshape(conv3, [-1, reduce(lambda x, y: x * y, conv3_shape[1:])])

                fc4 = layers.fully_connected(conv3_flat, 512, scope='fc4')
                q = layers.fully_connected(fc4, self.action_space, scope='q')

        return q
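clipped_error() used in the loss above is not defined in the snippet; in DQN code of this style it is usually a Huber-style function, so a likely stand-in (an assumption) is:

def clipped_error(x):
    # Quadratic for |x| < 1, linear beyond, which bounds the gradient magnitude.
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)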
Example #13
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1,
                                 num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1,
                                num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.optimizer = RMSprop(self.current_q_net.parameters(),
                                 lr=self.params.lr)
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        game = "Breakout-ram-v0"
        env = gym.make(game)
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)

    def run(self):
        state = torch.tensor(self.environment.reset(),
                             device=self.device,
                             dtype=torch.float32)
        self._update_target_q_net()
        total_reward = 0
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value,
                                              train=True,
                                              step=step,
                                              params=self.params,
                                              device=self.device)
            next_state, reward, done = self.environment.step(action)
            next_state = torch.tensor(next_state,
                                      device=self.device,
                                      dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state,
                                   done)
            state = next_state
            total_reward += reward
            if done:
                state = torch.tensor(self.environment.reset(),
                                     device=self.device,
                                     dtype=torch.float32)
            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                print('Update: {}. Loss: {}. Score: {}'.format(
                    step, loss, total_reward))
            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch

        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]

        expected_q_values = rewards + self.params.discount_factor * next_q_values * (
            1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
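A hypothetical way to run the trainer above: the params object is not shown in the source, so the SimpleNamespace below only lists the fields the snippet itself reads (get_action may read further exploration-related fields), and the values and model path are placeholders.

from types import SimpleNamespace

params = SimpleNamespace(lr=1e-4, memory_capacity=100000, skip_steps=4,
                         num_of_steps=100000, batch_size=32,
                         discount_factor=0.99, target_update_freq=1000)
trainer = DQNTrainer(params, model_path='./dqn_breakout.pt')
trainer.run()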
Example #14
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.lr = self.params.lr #NEW
        self.optimizer = RMSprop(self.current_q_net.parameters(),
                                 lr=self.lr) # CHANGE
                                 
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)
        self.loss_log = [] # NEW
        self.score_log = [] # NEW

    def run(self):
        episode_score = 0 # NEW
        episode_score_short_array = np.array([]) # NEW
        loss_short_array = np.array([]) # NEW
        episode = 0 # NEW
        state = torch.tensor(self.environment.reset(),
                             device=self.device,
                             dtype=torch.float32)
        self._update_target_q_net()
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value,
                                              train=True,
                                              step=step,
                                              params=self.params,
                                              device=self.device)
            next_state, reward, done = self.environment.step(action)
            episode_score += reward # NEW
            next_state = torch.tensor(next_state,
                                      device=self.device,
                                      dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                episode += 1 # NEW
                print('***************Episode: {}. Score: {}'.format(episode, episode_score)) # NEW
                episode_score_short_array = np.append(episode_score_short_array, episode_score) # NEW
                episode_score = 0 # NEW
                
                state = torch.tensor(self.environment.reset(),
                                     device=self.device,
                                     dtype=torch.float32)

            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                loss_short_array = np.append(loss_short_array, loss.cpu().detach().numpy()) # NEW
                print('Update: {}. Loss: {}'.format(step, loss))

            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
                
            if step % int(self.params.num_of_steps/50) == 0: ### NEW
                self.lr *= 0.8  
                self.optimizer = RMSprop(self.current_q_net.parameters(),
                                 lr=self.lr) 
                torch.save(self.target_q_net.state_dict(), "models/dqn{}.pt".format(step))

                self.score_log.append(np.mean(episode_score_short_array))
                self.loss_log.append(np.mean(loss_short_array))


        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch

        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]

        expected_q_values = rewards + self.params.discount_factor * next_q_values * (1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
Example #15
def main():

    REPLAY_CAPACITY = 100000
    INITIAL_EPSILON = 1.0
    TARGET_EPSILON = 0.1
    EXPLORATION_FRAMES = 1e6
    BATCH_SIZE = 32
    GAMMA = 0.97
    LR = 0.0005

    training, game, verbose, fps, W, H = parser.get_arguments()
    training = parser.str2bool(training)
    start_time = time.time()

    max_score = 0
    games_played = 0
    frame_iterations = 0
    scores = {}

    print("Training: ", training)

    if game == 'pong':
        env = Pong(W, H)
    elif game == 'snake':
        env = SnakeGame(W, H, training=training, fps=fps)
    else:
        print('Invalid game title')
        return

    nn = NeuralNet(W,
                   H,
                   env.action_space['n'],
                   env.GAME_TITLE,
                   n_channels=1,
                   gamma=GAMMA,
                   learning_rate=LR,
                   verbose=verbose)

    replay_memory = ReplayMemory(capacity=REPLAY_CAPACITY)
    epsilon_greedy = EpsilonGreedy(initial_value=INITIAL_EPSILON,
                                   target_value=TARGET_EPSILON,
                                   exploration_frames=EXPLORATION_FRAMES)
    try:
        s = env.reset()
        s = process(s, W, H)
        while True:
            # make 10 moves, then train on a minibatch
            for i in range(10):
                take_random = epsilon_greedy.evaluate()
                if training and take_random:
                    a = env.sample()
                else:
                    a = nn.predict([s])[0]
                s1, r, t, score = env.step(a)
                s1 = process(s1, W, H)
                replay_memory.add((s, a, r, s1, t))
                frame_iterations += 1
                if not t:
                    s = s1
                else:
                    max_score = max(max_score, score)
                    games_played += 1
                    scores[score] = scores.get(score, 0) + 1
                    e_value = 0 if not training else epsilon_greedy.peek()
                    print("\rMax Score: {:3} || Last Score: {:3} || Games Played: {:7} Iterations: {:10} Epsilon: {:.5f} Scores: {}" \
                        .format(max_score, score, games_played, frame_iterations, e_value, str(scores)),
                        end="\n" if verbose or games_played % 1000 == 0 else "")
                    s = env.reset()
                    s = process(s, W, H)
            if training and frame_iterations > REPLAY_CAPACITY // 2:
                batch = replay_memory.get_minibatch(batch_size=BATCH_SIZE)
                loss = nn.optimize(batch)

    except KeyboardInterrupt:
        if training:
            nn.save()
            print("\nCheckpoint saved")
        nn.close_session()
        stats_saver.save_to_file(env.GAME_TITLE, max_score, games_played,
                                 frame_iterations, scores, training,
                                 start_time)
        print("Session closed")
Example #16
def train(sess, config):

    env = GymEnvironment(config)

    log_dir = './log/{}_lookahead_{}_gats_{}/'.format(config.env_name,
                                                      config.lookahead,
                                                      config.gats)
    checkpoint_dir = os.path.join(log_dir, 'checkpoints/')
    image_dir = os.path.join(log_dir, 'rollout/')
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)
        print(' [*] Removed log dir: ' + log_dir)

    with tf.variable_scope('step'):
        step_op = tf.Variable(0, trainable=False, name='step')
        step_input = tf.placeholder('int32', None, name='step_input')
        step_assign_op = step_op.assign(step_input)

    with tf.variable_scope('summary'):
        scalar_summary_tags = [
            'average.reward', 'average.loss', 'average.q value',
            'episode.max reward', 'episode.min reward', 'episode.avg reward',
            'episode.num of game', 'training.learning_rate', 'rp.rp_accuracy',
            'rp.rp_plus_accuracy', 'rp.rp_minus_accuracy',
            'rp.nonzero_rp_accuracy'
        ]

        summary_placeholders = {}
        summary_ops = {}

        for tag in scalar_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32',
                                                       None,
                                                       name=tag.replace(
                                                           ' ', '_'))
            summary_ops[tag] = tf.summary.scalar(
                "%s-%s/%s" % (config.env_name, config.env_type, tag),
                summary_placeholders[tag])

        histogram_summary_tags = ['episode.rewards', 'episode.actions']

        for tag in histogram_summary_tags:
            summary_placeholders[tag] = tf.placeholder('float32',
                                                       None,
                                                       name=tag.replace(
                                                           ' ', '_'))
            summary_ops[tag] = tf.summary.histogram(tag,
                                                    summary_placeholders[tag])

    config.num_actions = env.action_size
    # config.num_actions = 3

    exploration = LinearSchedule(config.epsilon_end_t, config.epsilon_end)

    agent = Agent(sess, config, num_actions=config.num_actions)

    if config.gats:
        lookahead = config.lookahead
        rp_train_frequency = 4
        gdm_train_frequency = 4
        gdm = GDM(sess, config, num_actions=config.num_actions)
        rp = RP(sess, config, num_actions=config.num_actions)
        leaves_size = config.num_actions**config.lookahead
        if config.dyna:
            gan_memory = GANReplayMemory(config)
        else:
            gan_memory = None

        def base_generator():
            tree_base = np.zeros((leaves_size, lookahead)).astype('uint8')
            for i in range(leaves_size):
                n = i
                j = 0
                while n:
                    n, r = divmod(n, config.num_actions)
                    tree_base[i, lookahead - 1 - j] = r
                    j = j + 1
            return tree_base

        tree_base = base_generator()

    # memory = ReplayMemory(config)
    memory = ReplayMemory(config, log_dir)
    history = History(config)

    tf.global_variables_initializer().run()
    saver = tf.train.Saver(max_to_keep=30)

    # Load the model if a checkpoint exists.
    load_model(sess, saver, checkpoint_dir)

    agent.updated_target_q_network()

    writer = tf.summary.FileWriter(log_dir, sess.graph)

    num_game, update_count, ep_reward = 0, 0, 0.
    total_reward, total_loss, total_q_value = 0., 0., 0.
    max_avg_ep_reward = -100
    ep_rewards, actions = [], []

    rp_accuracy = []
    rp_plus_accuracy = []
    rp_minus_accuracy = []
    nonzero_rp_accuracy = []

    screen, reward, action, terminal = env.new_random_game()

    # init state
    for _ in range(config.history_length):
        history.add(screen)

    start_step = step_op.eval()

    # main
    for step in tqdm(range(start_step, config.max_step),
                     ncols=70,
                     initial=start_step):

        if step == config.learn_start:
            num_game, update_count, ep_reward = 0, 0, 0.
            total_reward, total_loss, total_q_value = 0., 0., 0.
            ep_rewards, actions = [], []

        if step == config.gan_dqn_learn_start:
            rp_accuracy = []
            rp_plus_accuracy = []
            rp_minus_accuracy = []
            nonzero_rp_accuracy = []

        # ε-greedy
        MCTS_FLAG = False
        epsilon = exploration.value(step)
        if random.random() < epsilon:
            action = random.randrange(config.num_actions)
        else:
            current_state = norm_frame(np.expand_dims(history.get(), axis=0))
            if config.gats and (step >= config.gan_dqn_learn_start):
                action, predicted_reward = MCTS_planning(
                    gdm, rp, agent, current_state, leaves_size, tree_base,
                    config, exploration, step, gan_memory)
                MCTS_FLAG = True
            else:
                action = agent.get_action(
                    norm_frame_Q(unnorm_frame(current_state)))

        # For GATS?
        apply_action = action
        # if int(apply_action != 0):
        #     apply_action += 1

        # Observe
        screen, reward, terminal = env.act(apply_action, is_training=True)
        reward = max(config.min_reward, min(config.max_reward, reward))
        history.add(screen)
        memory.add(screen, reward, action, terminal)

        if MCTS_FLAG:
            rp_accuracy.append(int(predicted_reward == reward))
            if reward != 0:
                nonzero_rp_accuracy.append(int(predicted_reward == reward))
                if reward == 1:
                    rp_plus_accuracy.append(int(predicted_reward == reward))
                elif reward == -1:
                    rp_minus_accuracy.append(int(predicted_reward == reward))

        # Train
        if step > config.gan_learn_start and config.gats:
            if step % rp_train_frequency == 0 and memory.can_sample(
                    config.rp_batch_size):
                obs, act, rew = memory.reward_sample(config.rp_batch_size)
                # obs, act, rew = memory.reward_sample2(
                #     config.rp_batch_size, config.lookahead)
                reward_obs, reward_act, reward_rew = memory.reward_sample(
                    config.nonzero_batch_size, nonzero=True)
                # reward_obs, reward_act, reward_rew = memory.nonzero_reward_sample(
                #     config.rp_batch_size, config.lookahead)
                obs_batch = norm_frame(
                    np.concatenate((obs, reward_obs), axis=0))
                act_batch = np.concatenate((act, reward_act), axis=0)
                rew_batch = np.concatenate((rew, reward_rew), axis=0)
                reward_label = rew_batch + 1

                trajectories = gdm.get_state(obs_batch, act_batch[:, :-1])

                rp_summary = rp.train(trajectories, act_batch, reward_label)
                writer.add_summary(rp_summary, step)

            if step % gdm_train_frequency == 0 and memory.can_sample(
                    config.gan_batch_size):
                state_batch, action_batch, next_state_batch = memory.GAN_sample(
                )
                # state_batch, act_batch, next_state_batch = memory.GAN_sample2(
                #     config.gan_batch_size, config.lookahead)

                # gdm.summary, disc_summary, merged_summary = gdm.train(
                #     norm_frame(state_batch), act_batch, norm_frame(next_state_batch), warmup_bool)
                gdm.summary, disc_summary = gdm.train(
                    norm_frame(state_batch), action_batch,
                    norm_frame(next_state_batch))

        if step > config.learn_start:
            # if step % config.train_frequency == 0 and memory.can_sample(config.batch_size):
            if step % config.train_frequency == 0:
                # s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                #     config.batch_size, config.lookahead)
                s_t, act_batch, rew_batch, s_t_plus_1, terminal_batch = memory.sample(
                )
                s_t, s_t_plus_1 = norm_frame(s_t), norm_frame(s_t_plus_1)
                if config.gats and config.dyna:
                    if step > config.gan_dqn_learn_start and gan_memory.can_sample(
                            config.batch_size):
                        gan_obs_batch, gan_act_batch, gan_rew_batch, gan_terminal_batch = gan_memory.sample(
                        )
                        # gan_obs_batch, gan_act_batch, gan_rew_batch = gan_memory.sample(
                        #     config.batch_size)
                        gan_obs_batch = norm_frame(gan_obs_batch)
                        trajectories = gdm.get_state(
                            gan_obs_batch, np.expand_dims(gan_act_batch,
                                                          axis=1))
                        gan_next_obs_batch = trajectories[:, -config.
                                                          history_length:, ...]

                        # gan_obs_batch, gan_next_obs_batch = \
                        #     norm_frame(gan_obs_batch), norm_frame(gan_next_obs_batch)

                        s_t = np.concatenate([s_t, gan_obs_batch], axis=0)
                        act_batch = np.concatenate([act_batch, gan_act_batch],
                                                   axis=0)
                        rew_batch = np.concatenate([rew_batch, gan_rew_batch],
                                                   axis=0)
                        s_t_plus_1 = np.concatenate(
                            [s_t_plus_1, gan_next_obs_batch], axis=0)
                        terminal_batch = np.concatenate(
                            [terminal_batch, gan_terminal_batch], axis=0)

                s_t, s_t_plus_1 = norm_frame_Q(
                    unnorm_frame(s_t)), norm_frame_Q(unnorm_frame(s_t_plus_1))

                q_t, loss, dqn_summary = agent.train(s_t, act_batch, rew_batch,
                                                     s_t_plus_1,
                                                     terminal_batch, step)

                writer.add_summary(dqn_summary, step)
                total_loss += loss
                total_q_value += q_t.mean()
                update_count += 1

            if step % config.target_q_update_step == config.target_q_update_step - 1:
                agent.updated_target_q_network()

        # reinit
        if terminal:
            screen, reward, action, terminal = env.new_random_game()

            num_game += 1
            ep_rewards.append(ep_reward)
            ep_reward = 0.
        else:
            ep_reward += reward

        total_reward += reward

        # change train frequency
        if config.gats:
            if step == 10000 - 1:
                rp_train_frequency = 8
                gdm_train_frequency = 8
            if step == 50000 - 1:
                rp_train_frequency = 16
                gdm_train_frequency = 16
            if step == 100000 - 1:
                rp_train_frequency = 24
                gdm_train_frequency = 24

        # Perform a rollout and save the images
        if config.gats and step % config._test_step == config._test_step - 1:
            rollout_image(config, image_dir, gdm, memory, step + 1, 16)

        # calculate information
        if step >= config.learn_start:
            if step % config._test_step == config._test_step - 1:

                # plot
                if config.gats:
                    writer.add_summary(gdm.summary, step)
                    writer.add_summary(disc_summary, step)

                avg_reward = total_reward / config._test_step
                avg_loss = total_loss / update_count
                avg_q = total_q_value / update_count

                try:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                except:
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                print(
                    '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d'
                    % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                       max_ep_reward, min_ep_reward, num_game))

                # requires target q network
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    step_assign_op.eval({step_input: step + 1})
                    save_model(sess, saver, checkpoint_dir, step + 1)

                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                if step >= config.gan_dqn_learn_start:
                    if len(rp_accuracy) > 0:
                        rp_accuracy = np.mean(rp_accuracy)
                        rp_plus_accuracy = np.mean(rp_plus_accuracy)
                        rp_minus_accuracy = np.mean(rp_minus_accuracy)
                        nonzero_rp_accuracy = np.mean(nonzero_rp_accuracy)
                    else:
                        rp_accuracy = 0
                        rp_plus_accuracy = 0
                        rp_minus_accuracy = 0
                        nonzero_rp_accuracy = 0
                else:
                    rp_accuracy = 0
                    rp_plus_accuracy = 0
                    rp_minus_accuracy = 0
                    nonzero_rp_accuracy = 0

                # summary
                if step > 180:
                    inject_summary(
                        sess, writer, summary_ops, summary_placeholders, {
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q value': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of game': num_game,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'rp.rp_accuracy': rp_accuracy,
                            'rp.rp_plus_accuracy': rp_plus_accuracy,
                            'rp.rp_minus_accuracy': rp_minus_accuracy,
                            'rp.nonzero_rp_accuracy': nonzero_rp_accuracy
                        }, step)

                num_game = 0
                total_reward = 0.
                total_loss = 0.
                total_q_value = 0.
                update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []

                rp_accuracy = []
                rp_plus_accuracy = []
                rp_minus_accuracy = []
                nonzero_rp_accuracy = []
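LinearSchedule(config.epsilon_end_t, config.epsilon_end) is only queried through exploration.value(step); a minimal sketch matching that usage (an assumption, modeled on the schedule commonly found in OpenAI Baselines) is:

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Linearly interpolate from initial_p to final_p, then stay at final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)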
Example #17
class DDQN():
    @staticmethod
    def load(name, only_model = False):
        model = keras.models.load_model('{}.h5'.format(name))
        if only_model:
            dqn = DDQN(model, train=False)
        else:
            with open('{}.pkl'.format(name), 'rb') as file:
                dqn = pickle.load(file)
                dqn.replay_memory = ReplayMemory.load_by_chunks(file)

            dqn.model = model
            dqn.target_model = keras.models.load_model('{}_target.h5'.format(name))

        return dqn

    def __init__(self, model=None, n_actions=-1, train=True, replay_size=1000000, s_epsilon=1.0, e_epsilon=0.1,
                 f_epsilon=1000000, batch_size=32, gamma=0.99, hard_learn_interval=10000, warmup=50000,
                 priority_epsilon=0.02, priority_alpha=0.6, window_size = 4):
        """
        :param model: Keras neural network model.
        :param n_actions: Number of possible actions. Only used if using default model.
        :param train: Whether to train or not (test).
        :param replay_size: Size of experience replay memory.
        :param s_epsilon: Start epsilon for Q-learning.
        :param e_epsilon: End epsilon for Q-learning.
        :param f_epsilon: Number of frames before epsilon gradually reaches e_epsilon.
        :param batch_size: Number of sampled experiences per frame.
        :param gamma: Future discount for Q-learning.
        :param hard_learn_interval: How often the target network is updated.
        :param warmup: Only perform random actions without learning for warmup steps.
        :param priority_epsilon: Added to every priority to avoid zero-valued priorities.
        :param priority_alpha: Between 0-1. Strength of priority experience sampling. 0 means uniform.
        :param window_size: Number of last observations to use as a single observation (accounting for transitions).
        """

        if model is None:
            #use default model
            model = DEEPMIND_MODEL
            model.add(Dense(n_actions, init=weight_init, activation="linear"))
            model.compile(optimizer=Adam(lr=0.00025), loss='mse') #or RMSProp(lr=)

        self.model = model
        self.n_actions = model.layers[-1].output_shape[1]
        self.replay_memory = ReplayMemory(replay_size, window_size=window_size)
        self.epsilon = s_epsilon
        self.e_epsilon = e_epsilon
        self.d_epsilon = (e_epsilon - s_epsilon) / f_epsilon
        self.batch_size = batch_size
        self.warmup = warmup
        self.gamma = gamma
        self.hard_learn_interval = hard_learn_interval
        self.priority_epsilon = priority_epsilon
        self.priority_alpha = priority_alpha
        self.window_size = window_size
        self.train = train

        if train:
            self.target_model = copy.deepcopy(model)
        else:
            self.target_model = model
            self.warmup = -1
            self.e_epsilon = s_epsilon

        self.step = 1

    def _get_target(self, orig, r, a_n, q_n, d):
        """
        Calculates the double Q-learning target. Clips the difference from the original value to [-1, 1].
        """
        t = r
        if not d:
            t += self.gamma * q_n[a_n]

        #clipping
        if t-orig>1:
            t = orig +1
        elif t-orig<-1:
            t = orig -1
        return t

    def _modify_target(self, t, a, r, d, a_n, q_n):
        """
        Modifies target vector with DDQN target.
        """
        t[a] = self._get_target(t[a], r, a_n, q_n, d)
        return t

    def _get_propotional_priority(self, priority):
        return (priority + self.priority_epsilon)**self.priority_alpha

    def _get_priority(self, t, a, r, d, a_n, q_n):
        priority = abs(t[a] - self._get_target(t[a], r, a_n, q_n, d))
        return self._get_propotional_priority(priority)


    def save(self, name, only_model=False):
        #it isn't recommended to pickle keras models. We don't pickle replay memory because of memory issues.
        self.model.save("{}.h5".format(name))

        if not only_model:
            self.target_model.save("{}_target.h5".format(name))

            model_tmp = self.model
            target_model_tmp = self.target_model
            replay_memory_tmp = self.replay_memory

            self.model = None
            self.target_model = None
            self.replay_memory = None
            with open("{}.pkl".format(name), "wb") as file:
                pickle.dump(self, file)
                replay_memory_tmp.save_by_chunks(file)

            self.model = model_tmp
            self.target_model = target_model_tmp
            self.replay_memory = replay_memory_tmp


    def predict(self, observation):
        """
        Predicts next action with epsilon policy, given environment observation.
        :param observation: Numpy array with the same shape as input Keras layer or
                            utils.ObservationSequenceStore object.
        """

        if random.random() < self.epsilon or self.step <= self.warmup:
            return random.randint(0,self.n_actions-1), None

        Q = self.model.predict_on_batch(np.expand_dims(observation, axis=0))[0]
        a = np.argmax(Q)
        return a, Q[a]

    def learning_step(self, action, reward, new_observation, done):
        """
        Performs DDQN learning step
        :param action: Action performed.
        :param reward: Reward after performing the action.
        :param new_observation: Observation after performing the action.
        :param done: Bool - Is new state terminal.
        :return:
        """
        if self.step <= self.warmup:
            #we use reward as priority during warmup
            priority = self._get_propotional_priority(abs(reward))
            self.replay_memory.add(priority, action, reward, new_observation, done)

        else:
            if self.epsilon > self.e_epsilon:
                self.epsilon += self.d_epsilon

            sample = self.replay_memory.sample(self.batch_size)
            idxs, _, experiences = zip(*sample)

            obs, actions, rewards, obs2, dones = map(np.array, zip(*experiences))
            targets = self.model.predict_on_batch(obs)
            #double q learning target
            a_next = np.argmax(self.model.predict_on_batch(obs2), axis=1)
            Q_next = self.target_model.predict_on_batch(obs2)

            #calculate new priorities
            priorities = [self._get_priority(t, actions[i], rewards[i], dones[i], a_next[i], Q_next[i])
                          for i, t in enumerate(targets)]
            #update priorities and add latest experience to memory
            for idx, priority in zip(idxs, priorities):
                self.replay_memory.update(idx, priority)
            self.replay_memory.add(abs(3 * max(priorities)), action, reward, new_observation, done)
            #self.replay_memory.add(priorities, last_experience)

            #calculate new targets
            targets = np.array(
                [self._modify_target(t, actions[i], rewards[i], dones[i], a_next[i], Q_next[i])
                for i, t in enumerate(targets)])
            #latest experience is excluded from training
            self.model.train_on_batch(obs, targets)

            #Update target network - aka hard learning step
            if self.step % self.hard_learn_interval == 0:
                self.target_model.set_weights(self.model.get_weights())

        self.step += 1
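A small standalone check of the clipped double-Q target computed by _get_target above (the numbers are made up):

gamma = 0.99
orig, r, done = 1.0, 0.5, False
q_n = [0.2, 1.8]            # target-network Q-values for the next state
a_n = 1                     # greedy action chosen by the online network
t = r + gamma * q_n[a_n]    # raw target: 0.5 + 0.99 * 1.8 = 2.282
t = min(max(t, orig - 1.0), orig + 1.0)  # clip the change to [-1, 1] around orig
print(t)                    # 2.0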
Example #18
class Agent(BaseModel):
    def __init__(self, config, environment, sess):
        super(Agent, self).__init__(config)
        self.sess = sess
        self.weight_dir = "weights"

        self.env = environment
        self.num_actions = self.env.action_size

        self.history = History(self.config)
        self.memory = ReplayMemory(self.config, self.model_dir)

        self.is_knn_dict_annoy_used = config.is_knn_dict_annoy_used

        with tf.variable_scope("step"):
            self.step_op = tf.Variable(0, trainable=False, name="step")
            self.step_input = tf.placeholder("int32", None, name="step_input")
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(config)

    def build_dqn(self, config):

        self.w = {}
        self.t_w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope("embeddings"):
            if self.cnn_format == "NHWC":
                self.s_t = tf.placeholder("float32", [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                          name="s_t")
            else:
                self.s_t = tf.placeholder("float32", [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                          name="s_t")

            self.l1, self.w["l1_w"], self.w["l1_b"] = conv2d(self.s_t,
                                                             32, [8, 8],
                                                             [4, 4],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name="l1")
            self.l2, self.w["l2_w"], self.w["l2_b"] = conv2d(self.l1,
                                                             64, [4, 4],
                                                             [2, 2],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name="l2")
            self.l3, self.w["l3_w"], self.w["l3_b"] = conv2d(self.l2,
                                                             64, [3, 3],
                                                             [1, 1],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name="l3")
            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])],
                name="l3_flat")
            self.l3_flat_length = reduce(lambda x, y: x * y, shape[1:])
            self.embeddings_flat_length = tf.shape(self.l3_flat)

        with tf.variable_scope("prediction"):
            self.embeddings = tf.placeholder("float32",
                                             [None, self.l3_flat_length],
                                             name="embeddings")
            if self.dueling:
                self.value_hid, self.w["l4_val_w"], self.w["l4_val_b"] = \
                    linear(self.embeddings, 512, activation_fn=activation_fn, name="value_hid")
                self.adv_hid, self.w["l4_adv_w"], self.w["l4_adv_b"] = \
                    linear(self.embeddings, 512, activation_fn=activation_fn, name="adv_hid")
                self.value, self.w["val_w_out"], self.w["val_w_b"] = \
                    linear(self.value_hid, 1, name="value_out")
                self.advantage, self.w["adv_w_out"], self.w["adv_w_b"] = \
                    linear(self.adv_hid, self.num_actions, name="adv_out")
                self.q = self.value + (self.advantage - tf.reduce_mean(
                    self.advantage, reduction_indices=1, keep_dims=True))
            else:
                self.l4, self.w["l4_w"], self.w["l4_b"] = \
                    linear(self.embeddings, 512, activation_fn=activation_fn, name="l4")
                self.q, self.w["q_w"], self.w["q_b"] = \
                    linear(self.l4, self.num_actions, name="q")
            # self.q_action = tf.argmax(self.q, dimension=1)
            self.q_action = tf.argmax(self.q, axis=1)

            q_summary = []
            avg_q = tf.reduce_mean(self.q, 0)
            for idx in range(self.num_actions):
                q_summary.append(tf.summary.histogram("q/%s" % idx,
                                                      avg_q[idx]))
            self.q_summary = tf.summary.merge(q_summary, "q_summary")

        # target network
        with tf.variable_scope('target'):
            if self.cnn_format == "NHWC":
                self.target_s_t = tf.placeholder("float32", [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                                 name="target_s_t")
            else:
                self.target_s_t = tf.placeholder("float32", [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                                 name="target_s_t")
            self.target_l1, self.t_w["l1_w"], self.t_w["l1_b"] = conv2d(
                self.target_s_t,
                32, [8, 8], [4, 4],
                initializer,
                activation_fn,
                self.cnn_format,
                name="target_l1")
            self.target_l2, self.t_w["l2_w"], self.t_w["l2_b"] = conv2d(
                self.target_l1,
                64, [4, 4], [2, 2],
                initializer,
                activation_fn,
                self.cnn_format,
                name="target_l2")
            self.target_l3, self.t_w["l3_w"], self.t_w["l3_b"] = conv2d(
                self.target_l2,
                64, [3, 3], [1, 1],
                initializer,
                activation_fn,
                self.cnn_format,
                name="target_l3")
            shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(
                self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            self.target_l3_flat_length = reduce(lambda x, y: x * y, shape[1:])

            self.target_embeddings = tf.placeholder(
                "float32", [None, self.target_l3_flat_length],
                name="embeddings")

            if self.dueling:
                self.t_value_hid, self.t_w["l4_val_w"], self.t_w["l4_val_b"] = \
                    linear(self.target_embeddings, 512, activation_fn=activation_fn, name="target_value_hid")
                self.t_adv_hid, self.t_w["l4_adv_w"], self.t_w["l4_adv_b"] = \
                    linear(self.target_embeddings, 512, activation_fn=activation_fn, name="target_adv_hid")

                self.t_value, self.t_w["val_w_out"], self.t_w["val_w_b"] = \
                    linear(self.t_value_hid, 1, name="target_value_out")
                self.t_advantage, self.t_w["adv_w_out"], self.t_w["adv_w_b"] = \
                    linear(self.t_adv_hid, self.num_actions, name="target_adv_out")

                self.target_q = self.t_value + (
                    self.t_advantage - tf.reduce_mean(
                        self.t_advantage, reduction_indices=1, keep_dims=True))
            else:
                self.target_l4, self.t_w["l4_w"], self.t_w["l4_b"] = \
                    linear(self.target_embeddings, 512, activation_fn=activation_fn, name="target_l4")
                self.target_q, self.t_w["q_w"], self.t_w['q_b'] = \
                    linear(self.target_l4, self.num_actions, name='target_q')

            self.target_q_idx = tf.placeholder("int32", [None, None],
                                               'outputs_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q,
                                                  self.target_q_idx)

        with tf.variable_scope("pred_to_target"):
            self.t_w_input = {}
            self.t_w_assign_op = {}
            for name in self.w.keys():
                self.t_w_input[name] = tf.placeholder(
                    "float32", self.t_w[name].get_shape().as_list(), name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(
                    self.t_w_input[name])

        with tf.variable_scope("optimizer"):
            self.target_q_t = tf.placeholder('float32', [None],
                                             name="target_q_t")
            self.action = tf.placeholder("int64", [None], name="action")

            action_one_hot = tf.one_hot(self.action,
                                        self.num_actions,
                                        1.0,
                                        0.0,
                                        name="action_one_hot")
            q_acted = tf.reduce_sum(self.q * action_one_hot,
                                    reduction_indices=1,
                                    name="q_acted")

            self.delta = self.target_q_t - q_acted

            self.global_step = tf.Variable(0, trainable=False)
            self.loss = tf.reduce_sum(clipped_error(self.delta), name="loss")
            self.learning_rate_step = tf.placeholder("int64",
                                                     None,
                                                     name="learning_rate_step")
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                   momentum=0.95,
                                                   epsilon=0.01).minimize(
                                                       self.loss)

        with tf.variable_scope("summary"):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                                   'episode.max reward', 'episode.min reward', 'episode.avg reward',
                                   'episode.num of game', 'training.learning_rate']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    "float32", None, name=tag.replace(" ", "_"))
                self.summary_ops[tag] = tf.summary.scalar(
                    "%s-%s/%s" % (self.env_name, self.env_type, tag),
                    self.summary_placeholders[tag])

            histogram_summary_tags = ['episode.rewards', "episode.actions"]
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    "float32", None, name=tag.replace(" ", "_"))
                self.summary_ops[tag] = tf.summary.histogram(
                    tag, self.summary_placeholders[tag])
            self.writer = tf.summary.FileWriter("./logs/%s" % self.model_dir,
                                                self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op],
                                     max_to_keep=30)

        self.load_model()
        self.load_weight_from_pkl()
        self.update_target_q_network()

    def update_target_q_network(self):
        for name in self.w.keys():
            self.t_w_assign_op[name].eval(
                {self.t_w_input[name]: self.w[name].eval()})

    def train(self):
        print(
            "..............................agent training start.............................."
        )
        start_step = self.step_op.eval()
        start_time = time.time()

        num_game = 0
        self.update_count = 0
        ep_reward = 0.
        total_reward = 0.
        self.total_loss = 0.
        self.total_q = 0.
        max_avg_ep_reward = -1e5

        ep_rewards = []
        actions = []

        screen, reward, action, terminal = self.env.new_random_game()
        for _ in range(self.history_length):
            self.history.add(screen)
        pre_screen = self.l3_flat.eval({self.s_t: [self.history.get()]})

        embs_length = self.embeddings_flat_length.eval(
            {self.s_t: [self.history.get()]})[1]
        if self.is_knn_dict_annoy_used:
            print("embs_length: ", embs_length)
            self.config.knn_key_dim = embs_length
            self.q_annoy_dict = Q_Annoy_Dict(self.config, self.num_actions)
            print("self.q_annoy_dict.key_dimension: ",
                  self.q_annoy_dict.key_dimension)

        for self.step in tqdm(range(start_step, self.max_step),
                              ncols=50,
                              initial=start_step):
            if self.step == self.learning_start:
                num_game = 0
                self.update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []

            # 1. predict
            self.emb_s_t = self.l3_flat.eval({self.s_t: [self.history.get()]})
            action = self.predict(self.emb_s_t)
            # 2. act
            screen, reward, terminal = self.env.act(action, is_training=True)

            self.observe(screen, reward, action, terminal)
            if self.is_knn_dict_annoy_used:
                post_screen = self.l3_flat.eval(
                    {self.s_t: [self.history.get()]})
                self.observe_knn_dict(pre_screen, reward, action, terminal,
                                      post_screen)
                pre_screen = post_screen
            if (self.step + 1) % self.scale == 0:
                if self.is_knn_dict_annoy_used:
                    print("self.q_annoy_dict.action_capacity: ",
                          self.q_annoy_dict.action_capacity)

            if terminal:
                screen, reward, action, terminal = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self.learning_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count
                    try:
                        # print("ep_rewards: ", ep_rewards)
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except Exception:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0., 0., 0.

                    print("\n arg_r %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, " \
                          "max_ep_r: %.4f, min_ep_r: %.4f, # game: %d" \
                          % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game))

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.step_assign_op.eval(
                            {self.step_input: self.step + 1})
                        # save checkpoint
                        self.save_model(self.step + 1)
                        max_avg_ep_reward = max(max_avg_ep_reward,
                                                avg_ep_reward)
                        # self.save_weight_to_pkl()

                    if self.step > 180:
                        self.inject_summary(
                            {
                                "average.reward":
                                avg_reward,
                                "average.loss":
                                avg_loss,
                                "average.q":
                                avg_q,
                                "episode.max reward":
                                max_ep_reward,
                                "episode.min reward":
                                min_ep_reward,
                                "episode.avg reward":
                                avg_ep_reward,
                                "episode.num of game":
                                num_game,
                                "episode.rewards":
                                ep_rewards,
                                "episode.actions":
                                actions,
                                "training.learning_rate":
                                self.learning_rate_op.eval(
                                    {self.learning_rate_step: self.step}),
                            }, self.step)

                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def observe(self, screen, reward, action, terminal):
        reward = max(self.min_reward, min(self.max_reward, reward))
        self.history.add(screen)
        self.memory.add(screen, reward, action, terminal)

        if self.step > self.learning_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()
            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_q_network()

    def observe_knn_dict(self, pre_screen, reward, action, terminal,
                         post_screen):
        reward = max(self.min_reward, min(self.max_reward, reward))
        self.q_annoy_dict.add(pre_screen, [action], [reward], [terminal],
                              post_screen)

    def q_learning_mini_batch(self):
        # print("q_learning_mini_batch")
        if self.memory.count < self.history_length:
            return
        else:
            s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()
            # print("shape(s_t): ", np.shape(s_t))
            # print("type(s_t): ", type(s_t))
            # print("action: ", action)
            # print("reward: ", reward)
            # print("s_t_plus_1: ", s_t_plus_1)

            if self.is_knn_dict_annoy_used:
                s_t_ = self.l3_flat.eval({self.s_t: s_t})
                s_t_plus_1_ = self.l3_flat.eval({self.s_t: s_t_plus_1})
                # print("shape(s_t_): ", np.shape(s_t_))

                # s_t_, action_, reward_, terminal_, s_t_plus_1_ = self.q_annoy_dict.query(s_t_, 1)
                # s_t_, action_, reward_, terminal_, s_t_plus_1_ = self.q_annoy_dict.query_(s_t_, 1)
                s_t_dnd, action_dnd, reward_dnd, terminal_dnd, s_t_plus_1_dnd = self.q_annoy_dict.query_actions(
                    s_t_, action, 1)

                # print("np.ndim(s_t_dnd): ", np.ndim(s_t_dnd))
                # print("type(s_t_dnd): ", type(s_t_dnd))
                # print(s_t_dnd)
                # print("shape(s_t_dnd): ", np.shape(s_t_dnd))
                # print("action_dnd: ", action_dnd)
                # print("reward_dnd: ", reward_dnd)
                # print("terminal_dnd: ", terminal_dnd)
                # print("shape(s_t_plus_1_dnd)", np.shape(s_t_plus_1_dnd))
                # exit()
                s_t = np.concatenate((s_t_, s_t_dnd), axis=0)
                action = np.concatenate((action, action_dnd), axis=0)
                reward = np.concatenate((reward, reward_dnd), axis=0)
                s_t_plus_1 = np.concatenate((s_t_plus_1_, s_t_plus_1_dnd),
                                            axis=0)
                terminal = np.concatenate((terminal, terminal_dnd), axis=0)
            # print("shape(s_t): ", np.shape(s_t))
            t = time.time()
            if self.double_q:
                # Double Q_learning
                if not self.is_knn_dict_annoy_used:
                    emb_s_t_plus_1 = self.l3_flat.eval({self.s_t: s_t_plus_1})
                    pred_action = self.q_action.eval(
                        {self.embeddings: emb_s_t_plus_1})
                    target_s_t_plus_1_embeddings = self.target_l3_flat.eval(
                        {self.target_s_t: s_t_plus_1})
                    q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                        self.target_embeddings:
                        target_s_t_plus_1_embeddings,
                        self.target_q_idx:
                        [[idx, pred_a]
                         for idx, pred_a in enumerate(pred_action)]
                    })
                    target_q_t = (
                        1. - terminal
                    ) * self.discount * q_t_plus_1_with_pred_action + reward
                else:
                    pred_action = self.q_action.eval(
                        {self.embeddings: s_t_plus_1})
                    q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                        self.target_embeddings:
                        s_t_plus_1,
                        self.target_q_idx:
                        [[idx, pred_a]
                         for idx, pred_a in enumerate(pred_action)]
                    })
                    target_q_t = (
                        1. - terminal
                    ) * self.discount * q_t_plus_1_with_pred_action + reward
            else:

                if not self.is_knn_dict_annoy_used:
                    target_s_t_plus_1_embeddings = self.target_l3_flat.eval(
                        {self.target_s_t: s_t_plus_1})

                    q_t_plus_1 = self.target_q.eval(
                        {self.target_embeddings: target_s_t_plus_1_embeddings})
                else:
                    q_t_plus_1 = self.target_q.eval(
                        {self.target_embeddings: s_t_plus_1})

                terminal = np.array(terminal) + 0.
                max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
                target_q_t = (
                    1. - terminal) * self.discount * max_q_t_plus_1 + reward
        if not self.is_knn_dict_annoy_used:
            emb_s_t = self.l3_flat.eval({self.s_t: s_t})
        else:
            emb_s_t = s_t
        _, q_t, loss, summary_str = self.sess.run(
            [self.optim, self.q, self.loss, self.q_summary],
            {
                self.target_q_t: target_q_t,
                self.action: action,
                # self.s_t: s_t,
                self.embeddings: emb_s_t,
                self.learning_rate_step: self.step
            })

        self.writer.add_summary(summary_str, self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def predict(self, emb_s_t, test_ep=None):
        ep = test_ep if test_ep is not None else (self.ep_end + max(
            0., (self.ep_start - self.ep_end) *
            (self.ep_end_t - max(0., self.step - self.learning_start)) /
            self.ep_end_t))
        if random.random() < ep:
            action = random.randrange(self.num_actions)
        else:
            action = self.q_action.eval({self.embeddings: emb_s_t})[0]
        return action

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })

        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, self.step)

    def play(self, n_step=10000, n_episode=100, test_ep=None, render=False):
        if test_ep is None:
            test_ep = self.ep_end
        test_history = History(self.config)

        if not self.display:
            gym_dir = "/tmp/%s-%s" % (self.env_name, get_time())
            self.env.env.monitor.start(gym_dir)

        best_reward, best_idx = 0., 0
        for idx in range(n_episode):
            screen, reward, action, terminal = self.env.new_random_game()
            current_reward = 0
            for _ in range(self.history_length):
                test_history.add(screen)

            for t in tqdm(range(n_step), ncols=70):
                # 1. predict (predict() expects the CNN embedding, as in train())
                emb_s_t = self.l3_flat.eval({self.s_t: [test_history.get()]})
                action = self.predict(emb_s_t, test_ep)
                # 2. act
                screen, reward, terminal = self.env.act(action,
                                                        is_training=False)
                test_history.add(screen)
                current_reward += reward
                if terminal:
                    break

            if current_reward > best_reward:
                best_reward = current_reward
                best_idx = idx

            print("=" * 30)
            print(" [%d] Best reward : %d" % (best_idx, best_reward))
            print("=" * 30)

        if not self.display:
            self.env.env.monitor.close()

    def save_weight_to_pkl(self):
        print("[*] save pred to pkl......")
        if not os.path.exists(self.weight_dir):
            os.makedirs(self.weight_dir)

        for name in self.w.keys():
            save_pkl(self.w[name].eval(),
                     os.path.join(self.weight_dir, "s.pkl" % name))

    def load_weight_from_pkl(self, cpu_mode=False):
        print("[*] load pred from pkl.......")
        for name in self.w.keys():
            if not os.path.exists(
                    os.path.join(self.weight_dir, "%s.pkl" % name)):
                print("[*FAIL] load pred from pkl")
                return
        with tf.variable_scope("load_pred_from_pkl"):
            self.w_input = {}
            self.w_assign_op = {}

            for name in self.w.keys():
                self.w_input[name] = tf.placeholder(
                    "float32", self.w[name].get_shape().as_list(), name=name)
                self.w_assign_op[name] = self.w[name].assign(
                    self.w_input[name])
            for name in self.w.keys():
                self.w_assign_op[name].eval({
                    self.w_input[name]:
                    load_pkl(os.path.join(self.weight_dir, "%s.pkl" % name))
                })
            self.update_target_q_network()
        print("[*SUCCESS] load pred from pkl")
Exemple #19
0
class PongAgent:
    def __init__(self, mode=None):
        self.env = wrap_dqn(gym.make('PongDeterministic-v4'))
        if mode == 'test':
            self.env = Monitor(self.env,
                               './video',
                               force=True,
                               video_callable=lambda episode_id: True)
        self.num_actions = self.env.action_space.n

        self.dqn = DQN(self.num_actions)
        self.target_dqn = DQN(self.num_actions)

        if use_gpu:
            self.dqn.cuda()
            self.target_dqn.cuda()

        self.buffer = ReplayMemory(1000)

        self.gamma = 0.99

        self.mse_loss = nn.MSELoss()
        self.optim = optim.RMSprop(self.dqn.parameters(), lr=0.01)

        self.out_dir = './model'
        self.writer = SummaryWriter()

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def to_var(self, x):
        x_var = Variable(x)
        if use_gpu:
            x_var = x_var.cuda()
        return x_var

    def predict_q_values(self, states):
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.dqn(states)
        return actions

    def predict_q_target_values(self, states):
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.target_dqn(states)
        return actions

    def select_action(self, state, epsilon):
        choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon)))

        if choice == 0:
            return np.random.choice(range(self.num_actions))
        else:
            state = np.expand_dims(state, 0)
            actions = self.predict_q_values(state)
            return np.argmax(actions.data.cpu().numpy())

    def update(self, predicts, targets, actions):
        targets = self.to_var(
            torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(
            torch.unsqueeze(torch.from_numpy(actions).long(), -1))

        affected_values = torch.gather(predicts, 1, actions)
        loss = self.mse_loss(affected_values, targets)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start,
                    epsilon_final):
        return max(epsilon_final,
                   epsilon_start - total_steps / max_epsilon_steps)

    def sync_target_network(self):
        primary_params = list(self.dqn.parameters())
        target_params = list(self.target_dqn.parameters())
        for i in range(0, len(primary_params)):
            target_params[i].data[:] = primary_params[i].data[:]

    def calculate_q_targets(self, next_states, rewards, dones):
        dones_mask = (dones == 1)

        predicted_q_target_values = self.predict_q_target_values(next_states)

        next_max_q_values = np.max(
            predicted_q_target_values.data.cpu().numpy(), axis=1)
        next_max_q_values[
            dones_mask] = 0  # no next max Q values if the game is over
        q_targets = rewards + self.gamma * next_max_q_values

        return q_targets

    def save_final_model(self):
        filename = '{}/final_model.pth'.format(self.out_dir)
        torch.save(self.dqn.state_dict(), filename)

    def save_model_during_training(self, episode):
        filename = '{}/current_model_{}.pth'.format(self.out_dir, episode)
        torch.save(self.dqn.state_dict(), filename)

    def load_model(self, filename):
        self.dqn.load_state_dict(torch.load(filename))
        self.sync_target_network()

    def play(self, episodes):
        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            while not done:
                action = self.select_action(
                    state, 0)  # epsilon 0: always take the greedy network action
                state, reward, done, _ = self.env.step(action)
                # self.env.render()

    def close_env(self):
        self.env.close()

    def train(self, replay_buffer_fill_len, batch_size, episodes,
              max_epsilon_steps, epsilon_start, epsilon_final,
              sync_target_net_freq):
        start_time = time.time()
        print('Start training at: ' + time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        # populate replay memory
        print('Populating replay buffer... ')
        print('\n')
        state = self.env.reset()
        for i in range(replay_buffer_fill_len):
            action = self.select_action(state,
                                        1)  # epsilon 1: always take a random action
            next_state, reward, done, _ = self.env.step(action)

            self.buffer.add(state, action, reward, done, next_state)

            state = next_state
            if done:
                state = self.env.reset()

        print('replay buffer populated with {} transitions, start training...'.
              format(self.buffer.count()))
        print('\n')

        # main loop - iterate over episodes
        for i in range(1, episodes + 1):
            # reset the environment
            done = False
            state = self.env.reset()

            # reset episode reward and length
            episode_reward = 0
            episode_length = 0

            # play until the episode ends
            while not done:
                # synchronize the target network with the estimation network at the required frequency
                if (total_steps % sync_target_net_freq) == 0:
                    self.sync_target_network()

                # calculate epsilon and select greedy action
                epsilon = self.get_epsilon(total_steps, max_epsilon_steps,
                                           epsilon_start, epsilon_final)
                action = self.select_action(state, epsilon)

                # execute action in the environment
                next_state, reward, done, _ = self.env.step(action)

                # store transition in replay memory
                self.buffer.add(state, action, reward, done, next_state)

                # sample random minibatch of transitions
                s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample(
                    batch_size)

                # predict Q value using the estimation network
                predicted_values = self.predict_q_values(s_batch)

                # estimate Q value using the target network
                q_targets = self.calculate_q_targets(next_s_batch, r_batch,
                                                     d_batch)

                # update weights in the estimation network
                self.update(predicted_values, q_targets, a_batch)

                # set the state for the next action selection and update counters and reward
                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += reward
                self.writer.add_scalar('data/reward', reward, total_steps)
                self.writer.add_scalar('data/epsilon', epsilon, total_steps)

            running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward
            self.writer.add_scalar('data/episode_reward', episode_reward, i)
            self.writer.add_scalar('data/running_episode_reward',
                                   running_episode_reward, i)

            if (i % 30) == 0:
                print('global step: {}'.format(total_steps))
                print('episode: {}'.format(i))
                print('running reward: {}'.format(
                    round(running_episode_reward, 2)))
                print('current epsilon: {}'.format(round(epsilon, 2)))
                print('episode_length: {}'.format(episode_length))
                print('episode reward: {}'.format(episode_reward))
                curr_time = time.time()
                print('current time: ' +
                      time.asctime(time.localtime(curr_time)))
                print('running for: ' +
                      str(datetime.timedelta(seconds=curr_time - start_time)))
                print('saving model after {} episodes...'.format(i))
                print('\n')
                self.save_model_during_training(i)

        print('Finished training at: ' +
              time.asctime(time.localtime(time.time())))
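
A minimal usage sketch for the PongAgent above: the call matches the train() signature shown in the snippet, but the hyperparameter values are illustrative assumptions rather than values from the original code.

if __name__ == '__main__':
    agent = PongAgent()
    agent.train(replay_buffer_fill_len=100,
                batch_size=32,
                episodes=500,
                max_epsilon_steps=100000,
                epsilon_start=1.0,
                epsilon_final=0.02,
                sync_target_net_freq=10000)
    agent.save_final_model()
    agent.close_env()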