Example 1
    def __init__(self, params, num, global_episodes, tvars, global_network):

        self.params = params
        self.name = "worker_" + str(num)
        self.number = num
        self.model_path = self.params.logdir
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" +
                                                    str(self.number))
        self.global_network = global_network

        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_AC = AC_network(params, num, tvars, name=self.name)
        self.update_local_ops = self.update_target_graph(
            tvars, self.local_AC.local_vars)

        #The code below sets up the environment (CartPole here instead of Doom)
        self.actions = None

        #load cartpole
        self.env = gym.make('CartPole-v0')
        self.myBuffer = ReplayMemory(max_size=self.params.max_ep_length)
Example 2
    def __init__(self, params):

        self.env = gym.make('CartPole-v0')
        self.params = params
        self.graph = tf.Graph()

        with self.graph.as_default():

            self.main_actor = Policy_network(params, "primary")
            tvars = tf.trainable_variables()
            tact_start_index = int(len(tvars))

            self.target_actor = Policy_network(params, "target")
            tvars = tf.trainable_variables()
            mcri_start_index = int(len(tvars))

            self.main_critic = Value_network(params, "primary")
            tvars = tf.trainable_variables()
            tcri_start_index = int(len(tvars))

            self.target_critic = Value_network(params, "target")

            self.tvars = tf.trainable_variables()

            self.main_actor_tvars = self.tvars[:tact_start_index]
            self.target_actor_tvars = self.tvars[
                tact_start_index:mcri_start_index]
            self.main_critic_tvars = self.tvars[
                mcri_start_index:tcri_start_index]
            self.target_critic_tvars = self.tvars[tcri_start_index:]

            self.main_actor.backprop(self.main_actor_tvars)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

        if not os.path.exists(self.params.logdir):
            os.mkdir(self.params.logdir)

        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)
        self.running_reward = None
        self.reward_sum = 0
        self.global_step = 0

        self.actor_targetOps = self.update_TargetGraph(self.main_actor_tvars,
                                                       self.target_actor_tvars,
                                                       self.params.tau)
        self.critic_targetOps = self.update_TargetGraph(
            self.main_critic_tvars, self.target_critic_tvars, self.params.tau)
Example 3
    def __init__(self, input_size, nb_action, gamma):
        self.gamma = gamma

        # append the average of the rewards to reward_window
        self.reward_window = []
        self.model = Network(input_size, nb_action)

        # take 100,000 transition for the model to learn
        self.memory = ReplayMemory(100000)
        """create an object using adam optimizer and connect 
        it to nerual network to make sure learning doesn't happen too fast"""
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        # create a fake batch dimension with unsqueeze; the tensor is later wrapped in a Variable for gradients
        self.prev_state = torch.Tensor(input_size).unsqueeze(0)
        self.prev_action = 0
        self.prev_reward = 0
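
        # For reference (not part of the original code): with input_size = 5,
        # torch.Tensor(5) has shape [5] and .unsqueeze(0) turns it into shape
        # [1, 5]; the extra leading dimension is the batch dimension the
        # network expects.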
Example 4
    def __init__(self, sess, config, environment, evaluation_enviroment):
        # Get the session, config, environment, and create a replay memory
        self.sess = sess
        self.config = config
        self.environment = environment
        self.evaluation_enviroment = evaluation_enviroment

        if config.prm:
            self.memory = PrioritizedExperienceReplay(sess, config)
        else:
            self.memory = ReplayMemory(config.state_shape, config.rep_max_size)

        self.init_dirs()

        self.init_cur_epsiode()
        self.init_global_step()
        self.init_epsilon()
        self.init_summaries()

        # Initialize the DQN graph, which contains two networks: Target and Q
        self.estimator = DQN(sess, config, self.environment.n_actions)

        # To initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please Set proper mode for training or playing")
Example 5
            self.best_action = tf.argmax(self.q_value, axis=1)  # index of the action with the highest Q-value

    def _build_optimizer(self):
        self.target_q = tf.placeholder(shape=[None, 6], dtype=tf.float32)
        self.loss = tf.reduce_mean(tf.square(self.q_value - self.target_q))
        # RMSProp with the hyperparameters from the DQN paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.update = self.optimizer.minimize(self.loss)
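
        # Sketch (an assumption, not part of the original file) of how the
        # target_q placeholder above is typically filled during training:
        # copy the predicted Q-values and overwrite only the entry of the
        # action actually taken with the Bellman target, e.g.
        #
        #     target = pred_q.copy()
        #     target[np.arange(batch_size), actions] = rewards + gamma * next_max_q
        #     sess.run([dqn.loss, dqn.update],
        #              feed_dict={dqn.input: states, dqn.target_q: target})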

if __name__ == "__main__":
    env = GameEnv('PongDeterministic-v4', 4)
    # env = GameEnv('BreakoutDeterministic-v4', 4)
    dqn = DeepQNetwork(n_actions=6, hist_len=4, name="eval_dqn")
    env.reset()
    replay = ReplayMemory()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(100000):
            dummy_q = np.zeros((64, 6))
            # dummy_q = [dummy_q]
            action = env.env.action_space.sample()
            terminal_life_lost, observation, reward, is_done, info = env.step(
                action)
            observation = cv2.resize(observation, (84, 84), interpolation=cv2.INTER_NEAREST)
            replay.add_experience(action, observation,
                                  reward, terminal_life_lost)

            if i > 10000:
                states, actions, rewards, new_states, terminal_flags = replay.get_minibatch()
                loss, _, best_action = sess.run([dqn.loss, dqn.update, dqn.best_action],
Example 6
class Dqn():
    def __init__(self, input_size, nb_action, gamma):
        self.gamma = gamma

        # append the average of the rewards to reward_window
        self.reward_window = []
        self.model = Network(input_size, nb_action)

        # take 100,000 transition for the model to learn
        self.memory = ReplayMemory(100000)
        """create an object using adam optimizer and connect 
        it to nerual network to make sure learning doesn't happen too fast"""
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        # create a fake batch dimension with unsqueeze; the tensor is later wrapped in a Variable for gradients
        self.prev_state = torch.Tensor(input_size).unsqueeze(0)
        self.prev_action = 0
        self.prev_reward = 0

    """Use softmax to select highest probablity of the q-value, then save 
        memory and improve performance by convertign state to gradient """

    def select_action(self, state):
        # the factor 100 acts as an inverse temperature: a larger value sharpens the softmax, making the choice more certain
        probs = F.softmax(self.model(Variable(state, volatile=True)) * 100)

        # draw randomly from probs
        action = probs.multinomial()

        return action.data[0, 0]
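
    # Note (not part of the original): `volatile=True` and the zero-argument
    # `multinomial()` call above are PyTorch 0.3-era APIs; on current PyTorch
    # the equivalent would roughly be
    #
    #     with torch.no_grad():
    #         probs = F.softmax(self.model(state) * 100, dim=1)
    #     action = torch.multinomial(probs, 1)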

    """ Implement Markov Decision Process """

    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        outputs = self.model(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)

        next_outputs = self.model(batch_next_state).detach().max(1)[0]

        # target = reward + gamma * max Q-value of the next state
        target = self.gamma * next_outputs + batch_reward

        # temporal-difference loss (outputs = predictions, target = goal)
        td_loss = F.smooth_l1_loss(outputs, target)

        # reset the gradients accumulated in the optimizer
        self.optimizer.zero_grad()

        # backpropagate the temporal-difference loss (retain_variables keeps the graph for reuse)
        td_loss.backward(retain_variables=True)

        # update the weights according to how much they contribute to the error
        self.optimizer.step()

    """ Update all the elements of our transition and select the action"""

    def update(self, reward, new_signal):

        # signal is the state, 3 signals plus orientation, -orientation
        new_state = torch.Tensor(new_signal).float().unsqueeze(0)

        # update memory and convert the list to tensor
        self.memory.push((self.prev_state, new_state,
                          torch.LongTensor([int(self.prev_action)]),
                          torch.Tensor([self.prev_reward])))

        # play an action
        action = self.select_action(new_state)

        # the ai starts learning after 100 transitions
        if len(self.memory.memory) > 100:
            batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(
                100)
            self.learn(batch_state, batch_next_state, batch_reward,
                       batch_action)

        # update previous action
        self.prev_action = action
        self.prev_state = new_state
        self.prev_reward = reward

        self.reward_window.append(reward)

        if len(self.reward_window) > 1000:
            del self.reward_window[0]

        return action

    """ Take the sume of all rewards in stored reward and divide by the mean"""

    def score(self):
        return sum(self.reward_window) / (len(self.reward_window) + 1.)

    """ Save last model and optimizer"""

    def save(self):
        torch.save(
            {
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict()
            }, 'previous_brain.pth')

    """ Loads the saved file allows us to use that brain"""

    def load(self):

        if os.path.isfile('previous_brain.pth'):
            print("=> loading checkpoint...")

            checkpoint = torch.load('previous_brain.pth')

            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])

            print("Done.")

        else:
            print("File not found...")
Example 7
class Worker():
    def __init__(self, params, num, global_episodes, tvars, global_network):

        self.params = params
        self.name = "worker_" + str(num)
        self.number = num
        self.model_path = self.params.logdir
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" +
                                                    str(self.number))
        self.global_network = global_network

        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_AC = AC_network(params, num, tvars, name=self.name)
        self.update_local_ops = self.update_target_graph(
            tvars, self.local_AC.local_vars)

        #The code below sets up the environment (CartPole here instead of Doom)
        self.actions = None

        #load cartpole
        self.env = gym.make('CartPole-v0')
        self.myBuffer = ReplayMemory(max_size=self.params.max_ep_length)

    def train(self, sess):
        trainBatch = self.myBuffer.sample(self.total_steps)
        batch_state = np.array(trainBatch[0]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_actions = np.array(trainBatch[1]).reshape(
            [self.total_steps, self.params.num_actions])
        batch_rewards = np.array(trainBatch[2])
        batch_next_state = np.array(trainBatch[3]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_done = np.array(trainBatch[4])

        end_multiplier = -(batch_done - 1)

        # Here we take the rewards and values from the buffer, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        #self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        #discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        #self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        #advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        #advantages = discount(advantages,gamma)
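
        # A minimal sketch (not in the original file) of the `discount` helper
        # that the commented-out GAE code above assumes: a discounted
        # cumulative sum over a 1-D array of rewards or advantages.
        def discount(x, gamma):
            out = np.zeros(len(x))
            running = 0.0
            for t in reversed(range(len(x))):
                running = x[t] + gamma * running
                out[t] = running
            return out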

        next_Q = np.max(
            sess.run(self.local_AC.Qout,
                     feed_dict={self.local_AC.input_x: batch_next_state}))
        state_value = np.max(
            sess.run(self.local_AC.Qout,
                     feed_dict={self.local_AC.input_x: batch_state}))

        batch_target_Q = batch_rewards + (self.params.gamma * next_Q *
                                          end_multiplier)
        batch_advantages = batch_target_Q - state_value

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {
            self.local_AC.input_x: batch_state,
            self.local_AC.target_Q: batch_target_Q,
            self.local_AC.actions: batch_actions,
            self.local_AC.advantages:
            batch_advantages.reshape(self.total_steps, 1)
        }

        v_l, p_l, e_l, _ = sess.run([
            self.local_AC.value_loss, self.local_AC.policy_loss,
            self.local_AC.entropy, self.local_AC.apply_grads
        ],
                                    feed_dict=feed_dict)

        #return v_l/self.total_steps , p_l/self.total_steps , e_l/self.total_steps

    def work(self, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        self.total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = []
                episode_step_count = []
                score = 0
                d = False
                state_input = self.env.reset()
                state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                while not d:

                    state_input = state_input.reshape(
                        [1, self.params.input_dim])
                    # Run the policy network and get an action to take.
                    curr_policy = sess.run(
                        self.local_AC.probability,
                        feed_dict={self.local_AC.input_x: state_input})

                    # get the action from predicted policy
                    action = np.random.choice(np.arange(len(curr_policy)),
                                              p=curr_policy)

                    # step the environment and get new measurements
                    next_state, reward, d, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(
                        reward if not d or score == 399 else -200)
                    # reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(d)
                    score += reward
                    self.total_steps += 1

                    state_input = next_state

                self.myBuffer.append(state_buffer, action_buffer,
                                     reward_buffer, next_state_buffer,
                                     done_buffer)

                #state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                episode_reward.append(score)
                #print(score)

                episode_step_count.append(self.total_steps)

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the episode buffer at the end of the episode.
                if self.myBuffer is not None:
                    #v_l,p_l,e_l = self.train(sess)
                    self.train(sess)
                    #     #print(v_l, p_l, e_l)
                    self.update_Target(self.update_local_ops, sess)
                    #print(myBuffer._memory)
                    self.myBuffer.reset()
                    self.total_steps = 0

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 10 == 0 and episode_count != 0:
                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        saver.save(
                            sess, self.model_path + '/model-' +
                            str(episode_count) + '.cptk')
                        print("Saved Model")

                    if self.name == "worker_0":

                        curr_reward = 0

                        for i in range(5):
                            test_done = False
                            state = self.env.reset()
                            while not test_done:
                                state = state.reshape(1, self.params.input_dim)
                                curr_policy = sess.run(
                                    self.global_network.probability,
                                    feed_dict={
                                        self.global_network.input_x: state
                                    })

                                # get the action from predicted policy
                                action = np.random.choice(np.arange(
                                    len(curr_policy)),
                                                          p=curr_policy)

                                # step the environment and get new measurements
                                next_state, reward, test_done, _ = self.env.step(
                                    action)
                                curr_reward += 1
                                state = next_state

                        print("Episode: {}, Current global reward: {:.1f}".
                              format(episode_count, curr_reward / 5))
                        time.sleep(0.5)

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1

                if episode_count > self.params.total_episodes and self.name == "worker_0":
                    coord.request_stop()

    def update_target_graph(self, from_vars, to_vars):
        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    def update_Target(self, op_holder, sess):
        '''run operation defined in updateTargetGraph function'''
        for op in op_holder:
            sess.run(op)
Example 8
class Agent:
    def __init__(self,
                 env,
                 lr=0.00025,
                 batch_size=32,
                 gamma=0.99,
                 n_frames=3000000,
                 start_frame=50000,
                 anneal_frame=10**6,
                 update_freq=5000,
                 hist_len=4,
                 num_reward=200,
                 experience_size=10**6,
                 check_point_path=r'../checkpoints',
                 save_freq=1000,
                 no_ops=10,
                 eval_times=10,
                 restore=False):

        # Environment setup
        self.env = GameEnv(env, hist_len)
        # Training parameters
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.hist_len = hist_len
        self.experience_size = experience_size
        # Parameters that are easy to mix up..... (- & -)
        self.n_frames = n_frames  # total number of frames to train for
        self.start_frame = start_frame  # frame at which epsilon annealing starts
        self.anneal_frame = anneal_frame  # frame by which epsilon reaches its final value
        self.update_freq = update_freq  # update the target Q-network parameters every update_freq frames
        self.num_reward = num_reward  # number of rewards to store
        self.no_ops = no_ops  # number of no-op steps at the start of an episode
        self.eval_times = eval_times  # number of evaluation episodes (10 by default)

        self.sess = tf.Session()
        self.save_freq = save_freq
        self.check_point_path = check_point_path

        n_actions = self.env.get_num_actions()
        self.action_chooser = ChooseAction(n_actions=n_actions,
                                           start_frame=self.start_frame,
                                           annealing_frame=self.anneal_frame)

        self.eval_dqn = DeepQNetwork(n_actions,
                                     batch_size=self.batch_size,
                                     lr=self.lr,
                                     name='eval_dqn')

        self.target_dqn = DeepQNetwork(n_actions,
                                       batch_size=self.batch_size,
                                       name='target_dqn')

        self.replay_memory = ReplayMemory(size=self.experience_size,
                                          batch_size=self.batch_size)

        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(name="model")
        self.restore = restore
        self.step = 0

    # Restore a previous checkpoint
    def _restore(self):
        if self.restore:
            print("Checkpoint Path: ", self.check_point_path)
            print("Checkpoint to be Restored:",
                  tf.train.latest_checkpoint(self.check_point_path))
            self.saver.restore(
                self.sess, tf.train.latest_checkpoint(self.check_point_path))

    def _save(self):
        self.saver.save(self.sess,
                        self.check_point_path + '/model.ckpt',
                        global_step=self.step)

    # Evaluate the training result (as required in the paper)
    def _eval(self):
        print("Evaluating...")
        # save the current emulator state
        internal_state, system_state = self.env.clone_state()
        for eval_episodes in range(self.eval_times):
            self.env.reset()
            start_ep = True
            eval_reward = 0
            eval_rewards = []
            no_op = 0
            no_ops = np.random.randint(0, self.no_ops)
            while True:
                self.env.render()
                if start_ep:
                    no_op += 1
                    action = 0
                else:
                    action = self.action_chooser.choose_action(
                        self.sess, 3000000, self.env.state, self.eval_dqn)
                _, _, reward, is_done, _ = self.env.step(action)
                eval_reward += reward
                if no_op == no_ops:
                    start_ep = False
                if is_done:
                    eval_rewards.append(eval_reward)
                    break
        avg_eval_rewards = np.mean(eval_rewards)
        print("Evaluation average reward: {}".format(avg_eval_rewards))
        # restore the saved state and continue training
        self.env.restore_state(internal_state, system_state)

    def _learn(self):
        # draw a minibatch of experiences from the replay memory
        states, score_lost, actions, rewards, new_states = self.replay_memory.get_minibatch(
        )
        # shapes: states [None, 84, 84, 4], score_lost boolean, actions [64], rewards [64], new_states [None, 84, 84, 4]
        # the best action, i.e. the one that maximizes the eval network's Q-value for the next state
        best_action = self.sess.run(
            self.eval_dqn.best_action,
            feed_dict={self.eval_dqn.input: new_states})

        # target-network Q-values for the next state of each minibatch entry
        target_q_val = self.sess.run(
            self.target_dqn.q_value,
            feed_dict={self.target_dqn.input: new_states})

        # [batch_size, best_action] --- the Q-value of the best action for each state
        target_q_val = target_q_val[range(self.batch_size), best_action]

        # Losing a point resets the whole game, so each lost point is treated as a terminal step.
        # Following the paper's algorithm, the target_q at a terminal step is just the reward.
        target_q = rewards + self.gamma * target_q_val * (1 - score_lost)

        # Build the full target matrix for backpropagation (only the taken action's entry is changed)
        pred_q = self.sess.run(self.eval_dqn.q_value,
                               feed_dict={self.eval_dqn.input: states})
        target_q_transition = pred_q.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = actions
        target_q_transition[batch_index, eval_act_index] = target_q
        target_q = target_q_transition
        # print(target_q - pred_q)
        loss, _ = self.sess.run([self.eval_dqn.loss, self.eval_dqn.update],
                                feed_dict={
                                    self.eval_dqn.input: states,
                                    self.eval_dqn.target_q: target_q
                                })
        return pred_q, target_q, loss

    def _update_target_q_network(self):
        eval_vars = self.eval_dqn.variables()
        target_vars = self.target_dqn.variables()
        update_operation = []
        for eval_var, target_var in zip(eval_vars, target_vars):
            update_operation.append(
                tf.assign(target_var, tf.identity(eval_var)))
        copy_operation = tf.group(*update_operation)  # returns a single op instead of a list
        self.sess.run(copy_operation)
        # sanity check that the parameters were actually copied
        check_before = self.sess.run(eval_vars[0])
        check_after = self.sess.run(target_vars[0])
        assert (check_before == check_after).all(), "Parameters not updated"

    def train(self):
        self._restore()
        self.env.reset()
        start_ep = True
        no_op = 0
        no_ops = np.random.randint(0, self.no_ops)
        train_rewards = []
        train_reward = 0
        num_dones = 0
        # current_reward = 0
        # last_reward = 0

        print("Training for {} frames...".format(self.n_frames))

        # Training loop
        for elapsed_frame in range(0, self.n_frames):
            self.env.render()
            # pick a new action every 4 frames
            if elapsed_frame % 4 == 0:
                # TODO: matrix dimension issue (done)
                action = self.action_chooser.choose_action(
                    self.sess, elapsed_frame, self.env.state, self.eval_dqn)
            # print(action)
            score_lost, observation, reward, is_done, _ = self.env.step(action)
            train_reward += reward

            # has a game finished (one side scored 20 points)?
            if is_done:
                num_dones += 1
                if len(train_rewards) < self.num_reward:
                    train_rewards.append(train_reward)
                else:
                    train_rewards[num_dones % self.num_reward] = train_reward
                last_reward = sum(train_rewards) / len(train_rewards)
                current_reward = train_reward
                train_reward = 0
                print("Training Reward(average):", last_reward)
                print("Training Reward(current):", current_reward)

            self.replay_memory.add_experience(action, observation[:, :, 0],
                                              reward, score_lost)

            # Before start_frame the agent just acts randomly, purely to fill up the replay memory
            if elapsed_frame > self.start_frame:
                _, _, loss = self._learn()
                print('loss:', loss)
                self.step += 1

            if (elapsed_frame % self.update_freq == 0
                    and elapsed_frame > self.start_frame):
                print("Updating target network params", end=', ')
                print("Current number of frames elapsed: {}".format(
                    elapsed_frame))
                self._update_target_q_network()

            if (elapsed_frame % self.save_freq == 0
                    and elapsed_frame > self.start_frame):
                # save the parameters and evaluate the training result
                self._save()
                self._eval()

        print("Training finished")
        self.env.close()
Example 9
    def __init__(self,
                 env,
                 lr=0.00025,
                 batch_size=32,
                 gamma=0.99,
                 n_frames=3000000,
                 start_frame=50000,
                 anneal_frame=10**6,
                 update_freq=5000,
                 hist_len=4,
                 num_reward=200,
                 experience_size=10**6,
                 check_point_path=r'../checkpoints',
                 save_freq=1000,
                 no_ops=10,
                 eval_times=10,
                 restore=False):

        # Environment setup
        self.env = GameEnv(env, hist_len)
        # Training parameters
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.hist_len = hist_len
        self.experience_size = experience_size
        # Parameters that are easy to mix up..... (- & -)
        self.n_frames = n_frames  # total number of frames to train for
        self.start_frame = start_frame  # frame at which epsilon annealing starts
        self.anneal_frame = anneal_frame  # frame by which epsilon reaches its final value
        self.update_freq = update_freq  # update the target Q-network parameters every update_freq frames
        self.num_reward = num_reward  # number of rewards to store
        self.no_ops = no_ops  # number of no-op steps at the start of an episode
        self.eval_times = eval_times  # number of evaluation episodes (10 by default)

        self.sess = tf.Session()
        self.save_freq = save_freq
        self.check_point_path = check_point_path

        n_actions = self.env.get_num_actions()
        self.action_chooser = ChooseAction(n_actions=n_actions,
                                           start_frame=self.start_frame,
                                           annealing_frame=self.anneal_frame)

        self.eval_dqn = DeepQNetwork(n_actions,
                                     batch_size=self.batch_size,
                                     lr=self.lr,
                                     name='eval_dqn')

        self.target_dqn = DeepQNetwork(n_actions,
                                       batch_size=self.batch_size,
                                       name='target_dqn')

        self.replay_memory = ReplayMemory(size=self.experience_size,
                                          batch_size=self.batch_size)

        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(name="model")
        self.restore = restore
        self.step = 0
Example 10
class Ddpg_Agent():
    def __init__(self, params):

        self.env = gym.make('CartPole-v0')
        self.params = params
        self.graph = tf.Graph()

        with self.graph.as_default():

            self.main_actor = Policy_network(params, "primary")
            tvars = tf.trainable_variables()
            tact_start_index = int(len(tvars))

            self.target_actor = Policy_network(params, "target")
            tvars = tf.trainable_variables()
            mcri_start_index = int(len(tvars))

            self.main_critic = Value_network(params, "primary")
            tvars = tf.trainable_variables()
            tcri_start_index = int(len(tvars))

            self.target_critic = Value_network(params, "target")

            self.tvars = tf.trainable_variables()

            self.main_actor_tvars = self.tvars[:tact_start_index]
            self.target_actor_tvars = self.tvars[
                tact_start_index:mcri_start_index]
            self.main_critic_tvars = self.tvars[
                mcri_start_index:tcri_start_index]
            self.target_critic_tvars = self.tvars[tcri_start_index:]

            self.main_actor.backprop(self.main_actor_tvars)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

        if not os.path.exists(self.params.logdir):
            os.mkdir(self.params.logdir)

        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)
        self.running_reward = None
        self.reward_sum = 0
        self.global_step = 0

        self.actor_targetOps = self.update_TargetGraph(self.main_actor_tvars,
                                                       self.target_actor_tvars,
                                                       self.params.tau)
        self.critic_targetOps = self.update_TargetGraph(
            self.main_critic_tvars, self.target_critic_tvars, self.params.tau)

    def update_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Builds the ops that softly assign the main network's parameters to the target network.
        Args:
            main_tfVar, target_tfVar - trainable variables (weights, biases, ...) of the main and target networks
            tau - update rate (a low tau gives slow target updates)
        Return:
            op_holder - list of tf.assign() ops; input for the update_Target function'''

        assert len(main_tfVar) == len(target_tfVar)
        total_vars = len(main_tfVar)
        op_holder = []

        # for each main-network variable (paired with the corresponding target variable)
        for idx, var in enumerate(main_tfVar[0:total_vars]):
            # assigning tau*new_value+(1-tau)*old_values
            op_holder.append(target_tfVar[idx].assign((var.value() * tau) + (
                (1 - tau) * target_tfVar[idx].value())))

        return op_holder
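
    # Usage sketch (not part of the original): build the ops once, as in
    # __init__ above, then run them each training step via update_Target()
    # below; every run performs one Polyak step,
    #     target <- tau * main + (1 - tau) * target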

    def update_Target(self, op_holder, sess):
        '''run operation defined in updateTargetGraph function'''

        for op in op_holder:
            sess.run(op)

    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # initialize global variables
            print('Initialize variables...')
            sess.run(self.init)

    def train(self):

        with tf.Session(graph=self.graph) as sess:

            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes

            # Obtain an initial observation of the environment
            state = self.env.reset()
            state_input = state.reshape([1, self.params.input_dim])

            for episode_number in xrange(self.params.total_episodes):

                done = False
                score = 0

                while not done:

                    if self.global_step > self.params.preTrainStep:

                        # Value network update
                        trainBatch = self.myBuffer.sample(
                            self.params.batch_size)

                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])

                        end_multiplier = -(batch_done - 1)

                        target_action = sess.run(self.target_actor.det_prob,
                                                 feed_dict={
                                                     self.target_actor.input_x:
                                                     batch_next_state
                                                 })
                        target_action = np.array([[1, 0] if i == 0 else [0, 1]
                                                  for i in target_action])
                        targetQ_all = sess.run(self.target_critic.Qout,
                                               feed_dict={
                                                   self.target_critic.input_x:
                                                   batch_next_state,
                                                   self.target_critic.actions:
                                                   target_action
                                               })
                        nextQ = np.sum(np.multiply(targetQ_all, target_action),
                                       axis=-1)
                        targetQ = batch_rewards + (self.params.gamma * nextQ *
                                                   end_multiplier)

                        pred_actions = sess.run(
                            self.main_actor.det_prob,
                            feed_dict={self.main_actor.input_x: batch_state})
                        pred_actions = np.array([[1, 0] if i == 0 else [0, 1]
                                                 for i in pred_actions])

                        # Update the network with our target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={
                                     self.main_critic.input_x: batch_state,
                                     self.main_critic.target_Q: targetQ,
                                     self.main_critic.actions: batch_actions
                                 })
                        self.update_Target(self.critic_targetOps, sess)

                        gradients = sess.run(self.main_critic.action_grads,
                                             feed_dict={
                                                 self.main_critic.input_x:
                                                 batch_state,
                                                 self.main_critic.actions:
                                                 pred_actions
                                             })

                        gradients = np.array(gradients).reshape(
                            self.params.batch_size, self.params.num_actions)
                        sess.run(self.main_actor.optimize,
                                 feed_dict={
                                     self.main_actor.input_x: batch_state,
                                     self.main_actor.action_gradient: gradients
                                 })

                        self.update_Target(self.actor_targetOps, sess)

                    # Make sure the observation is in a shape the network can handle.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                    actor_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(self.params.num_actions))

                    action = sess.run(self.main_actor.logits,
                                      feed_dict={
                                          self.main_actor.input_x: state_input
                                      }) + actor_noise()
                    action = np.argmax(action)

                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(
                        reward if not done or score == 299 else -100)
                    #reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    # move to next state
                    state_input = next_state

                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer,
                                         reward_buffer, next_state_buffer,
                                         done_buffer)

                if episode_number % self.params.update_freq == 0:
                    self.running_reward = self.reward_sum if self.running_reward is None else self.running_reward * 0.99 + self.reward_sum * 0.01
                    print(
                        'Current Episode {} Average reward for episode {:.2f}.  Total average reward {:.2f}.'
                        .format(episode_number,
                                self.reward_sum // self.params.update_freq,
                                self.running_reward //
                                self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)

                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                self.global_step += 1
Example 11
# Hyper - Parameters
MAX_STEP_EP = 50  # Maximum number of timesteps in an episode/episode length
MAX_NUM_EP = 100  # Maximum Number of Episodes
GAMMA = 0.9  # The Discount factor on rewards
PAR_RANGES = np.array(
    [[0, 1], [0, 0.75], [1, 2], [0, 1]]
)  # Parameter ranges in the order: Pole mass, Pole Length, Cart Mass, Friction
BUFFER_SIZE = 128
EPISODES = 10  # Num. of Episodes

agent = RDPGAgent(experiment, GAMMA, PAR_RANGES)

random_env = random_cartpole_env(experiment, PAR_RANGES)

replay_buffer = ReplayMemory(BUFFER_SIZE)

for i in range(MAX_NUM_EP):
    #each episode has different sampled dynamic parameters in the environment
    random_env.sample_env()
    env, env_parameters = random_env.get_sampled_env()
    state_array = env.reset()

    episode = EpisodeMemory(
        env, MAX_STEP_EP
    )  # The parameters passed into this function are yet to be decided

    # First action is a random sample from the action space of the environment, since we need history from the next time step
    # in an episode to implement the policy(action) from the actor network
    episode_reward = 0
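
    # Hypothetical sketch (the rest of the loop body is not shown in this
    # excerpt) of the random first step described in the comment above,
    # assuming the standard gym step() signature:
    #
    #     action = env.action_space.sample()
    #     state_array, reward, done, _ = env.step(action)
    #     episode_reward += reward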
Example 12
class AC_Agent():

    def __init__(self, params):

        self.env = gym.make('CartPole-v0')
        #self.env = gym.make('Pong-v0')

        self.params = params
        self.graph = tf.Graph()

        with self.graph.as_default():

            self.actor = Policy_network(params)

            self.main_critic = Value_network(params, "primary")
            self.target_critic = Value_network(params, "target")

            self.init = tf.global_variables_initializer()

            if not os.path.exists(self.params.logdir):
                os.mkdir(self.params.logdir)

            self.saver = tf.train.Saver()
            self.tvars = tf.trainable_variables()
            main_start_index = int(len(self.tvars)/3)
            target_start_index = int(2*len(self.tvars)/3)
            self.actor_tvars = self.tvars[:main_start_index]
            self.main_critic_tvars = self.tvars[main_start_index:target_start_index]
            self.target_critic_tvars = self.tvars[target_start_index:]
            #self.actor.backprop(tvars=None)

        self.running_reward = None
        self.reward_sum = 0
        self.episode_number = 0
        rendering = False
        self.global_step = 0

        self.critic_targetOps = self.update_critic_TargetGraph(self.main_critic_tvars, self.target_critic_tvars, self.params.tau)

        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)


    def update_critic_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Builds the ops that softly assign the main critic's parameters to the target critic.
        Args:
            main_tfVar, target_tfVar - trainable variables (weights, biases, ...) of the main and target networks
            tau - update rate (a low tau gives slow target updates)
        Return:
            op_holder - list of tf.assign() ops; input for the update_critic_Target function'''
        assert len(main_tfVar) == len(target_tfVar)
        total_vars = len(main_tfVar)
        op_holder = []

        # for each main-network variable (paired with the corresponding target variable)
        for idx, var in enumerate(main_tfVar[0:total_vars]):
            # assigning tau*new_value+(1-tau)*old_values
            op_holder.append(target_tfVar[idx].assign(
                (var.value() * tau) + ((1 - tau) * target_tfVar[idx].value())))
        return op_holder

    def update_critic_Target(self, op_holder, sess):
        '''run operation defined in updateTargetGraph function'''
        for op in op_holder:
            sess.run(op)


    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # initialize global variables
            print('Initialize variables...')
            sess.run(self.init)

    def rendering(self, rendering):
        if self.reward_sum / self.params.update_freq >= 180 or rendering:
            self.env.render()
            rendering = True


    def train(self):

        with tf.Session(graph=self.graph) as sess:

            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes

            # Obtain an initial observation of the environment
            self.state = self.env.reset()
            #state_input = self.prepro(self.state)
            state_input = self.state.reshape([1, self.params.input_dim])

            for self.episode_number in xrange(self.params.total_episodes):

                done = False
                score = 0

                while not done:

                    if self.global_step > self.params.preTrainStep:

                        #print(self.myBuffer)

                        # Value network update
                        trainBatch = self.myBuffer.sample(self.params.batch_size)

                        #print(trainBatch)
                        batch_state = np.array(trainBatch[0]).reshape([self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape([self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape([self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])

                        end_multiplier = -(batch_done - 1)

                        targetQ_all = sess.run(self.target_critic.Qout, feed_dict={self.target_critic.input_x: batch_next_state})
                        targetQ = batch_rewards + (self.params.gamma * np.max(targetQ_all, axis=-1) * end_multiplier)

                        predictedQ_all = sess.run(self.main_critic.Qout, feed_dict={self.main_critic.input_x: batch_state})

                        # Update the network with our target values.
                        sess.run(self.main_critic.update_value_model,
                                                           feed_dict={self.main_critic.input_x : batch_state,
                                                                      self.main_critic.target_Q : targetQ,
                                                                      self.main_critic.actions : batch_actions})
                        self.update_critic_Target(self.critic_targetOps, sess)

                        batch_advantage = batch_rewards + (self.params.gamma * np.max(targetQ_all, axis=-1) * end_multiplier) - np.max(predictedQ_all)
                        # Policy network update
                        batch_advantage = batch_advantage.reshape([self.params.batch_size, 1])
                        sess.run(self.actor.optimize, feed_dict={self.actor.input_x: batch_state,
                                                                 self.actor.input_y: batch_actions,
                                                                 self.actor.advantages: batch_advantage})


                    # Make sure the observation is in a shape the network can handle.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                    #print(state_input.shape)
                    #prev_state = state_input

                    # Run the policy network and get an action to take.
                    curr_policy = sess.run(self.actor.probability, feed_dict={self.actor.input_x: state_input})

                    # get the action from predicted policy
                    action = np.random.choice(np.arange(len(curr_policy)), p=curr_policy)

                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])
                    #next_state = self.prepro(next_state)
                    #next_state = next_state - prev_state

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(reward if not done or score == 299 else -100)
                    #reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    state_input = next_state

                    # move to next state

                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer, reward_buffer, next_state_buffer, done_buffer)

                if self.episode_number % self.params.update_freq == 0:
                    self.running_reward = self.reward_sum if self.running_reward is None else self.running_reward * 0.99 + self.reward_sum * 0.01
                    print('Current Episode {} Average reward for episode {:.2f}.  Total average reward {:.2f}.'
                          .format(self.episode_number,
                                  self.reward_sum // self.params.update_freq,
                                  self.running_reward // self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)


                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                #state_input = self.prepro(self.state)
                self.global_step += 1
Example 13
        '---------------------------- vizDoom training script ---------------------------'
    )
    print('scenario: {}, agent: {}'.format(hp.scenario, hp.agent))

    print('\ntraining parameters:')
    print('n_epoch: {}, steps_per_epoch: {}, play_steps: {}'.format(
        hp.n_epoch, hp.steps_per_epoch, hp.play_steps))
    print('batch_size: {}, time_size: {}, not_update: {}'.format(
        hp.batch_size, hp.time_size, hp.not_update))
    print('tests_per_epoch: {}'.format(hp.tests_per_epoch))

    train_env = DoomEnvironment('scenarios/' + hp.scenario + '.cfg', False,
                                hp.train_skiprate)
    test_env = DoomEnvironment('scenarios/' + hp.scenario + '.cfg', False,
                               hp.test_skiprate)
    er = ReplayMemory(hp.replay_size, hp.screen_size)

    policy_net = agent[hp.agent](hp.scenario, 2**train_env.get_n_buttons())
    target_net = agent[hp.agent](hp.scenario, 2**train_env.get_n_buttons())
    optimizer = torch.optim.RMSprop(policy_net.parameters(), hp.learning_rate)

    trainer = Trainer(scenario=hp.scenario,
                      cuda=hp.cuda,
                      environment=train_env,
                      test_environment=test_env,
                      experience_replay=er,
                      policy_net=policy_net,
                      target_net=target_net,
                      optimizer=optimizer,
                      not_update=hp.not_update,
                      log_folder='logs/' + hp.scenario + '/' + hp.agent)
Example 14
class Agent:
    """Our Wasted Agent :P """
    def __init__(self, sess, config, environment, evaluation_enviroment):
        # Get the session, config, environment, and create a replay memory
        self.sess = sess
        self.config = config
        self.environment = environment
        self.evaluation_enviroment = evaluation_enviroment

        if config.prm:
            self.memory = PrioritizedExperienceReplay(sess, config)
        else:
            self.memory = ReplayMemory(config.state_shape, config.rep_max_size)

        self.init_dirs()

        self.init_cur_epsiode()
        self.init_global_step()
        self.init_epsilon()
        self.init_summaries()

        # Initialize the DQN graph, which contains two networks: Target and Q
        self.estimator = DQN(sess, config, self.environment.n_actions)

        # To initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please Set proper mode for training or playing")

    def load(self):
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

    def save(self):
        self.saver.save(self.sess, self.checkpoint_dir,
                        self.global_step_tensor)

    def init_dirs(self):
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir,
                                        "summaries/")

    def init_cur_epsiode(self):
        """Create cur episode tensor to totally save the process of the training"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1,
                                                  trainable=False,
                                                  name='cur_epsiode')
            self.cur_epsiode_input = tf.placeholder('int32',
                                                    None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_epsiode_input)

    def init_global_step(self):
        """Create a global step variable to be a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0,
                                                  trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32',
                                                    None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_epsilon(self):
        """Create an epsilon variable"""
        with tf.variable_scope('epsilon'):
            self.epsilon_tensor = tf.Variable(self.config.initial_epsilon,
                                              trainable=False,
                                              name='epsilon')
            self.epsilon_input = tf.placeholder('float32',
                                                None,
                                                name='epsilon_input')
            self.epsilon_assign_op = self.epsilon_tensor.assign(
                self.epsilon_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = [
                'episode.total_reward', 'episode.length',
                'evaluation.total_reward', 'evaluation.length', 'epsilon'
            ]
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def init_replay_memory(self):
        # Populate the replay memory with initial experience
        print("initializing replay memory...")

        state = self.environment.reset()
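        # Interact with the environment until the configured initial memory size is
        # reached; the size check only happens at episode boundaries.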
        for i in itertools.count():
            action = self.take_action(state)
            next_state, reward, done = self.observe_and_save(
                state, self.environment.valid_actions[action])
            if done:
                if self.config.prm:
                    if i >= self.config.prm_init_size:
                        break
                else:
                    if i >= self.config.replay_memory_init_size:
                        break
                state = self.environment.reset()
            else:
                state = next_state
        print("finished initializing replay memory")

    def policy_fn(self, fn_type, estimator, n_actions):
        """Function that contain definitions to various number of policy functions and choose between them"""
        def epsilon_greedy(sess, observation, epsilon):
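            # Give every action a base probability of epsilon / n_actions, then add the
            # remaining (1 - epsilon) mass to the greedy action.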
            actions = np.ones(n_actions, dtype=float) * epsilon / n_actions
            q_values = estimator.predict(np.expand_dims(observation, 0))[0]
            best_action = np.argmax(q_values)
            actions[best_action] += (1.0 - epsilon)
            return actions

        def greedy(sess, observation):
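            # Greedy evaluation queries the target network and picks the argmax action.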
            q_values = estimator.predict(np.expand_dims(observation, 0),
                                         type="target")[0]
            best_action = np.argmax(q_values)
            return best_action

        if fn_type == 'epsilon_greedy':
            return epsilon_greedy
        elif fn_type == 'greedy':
            return greedy
        else:
            raise Exception("Please Select a proper policy function")

    def take_action(self, state):
        """Take the action based on the policy function"""
        action_probs = self.policy(self.sess, state,
                                   self.epsilon_tensor.eval(self.sess))
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe_and_save(self, state, action):
        """Function that observe the new state , reward and save it in the memory"""
        next_state, reward, done = self.environment.step(action)
        self.memory.push(state, next_state, action, reward, done)
        return next_state, reward, done

    def update_target_network(self):
        """Update Target network By copying paramter between the two networks in DQN"""
        self.estimator.update_target_network()

    def add_summary(self, summaries_dict, step):
        """Add the summaries to tensorboard"""
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in summaries_dict.items()
            })
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def train_episodic(self):
        """Train the agent in episodic techniques"""

        # Initialize the epsilon step, it's step, the policy function, the replay memory
        self.epsilon_step = (
            self.config.initial_epsilon -
            self.config.final_epsilon) / self.config.exploration_steps
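        # epsilon_step is the per-step decrement, so epsilon reaches final_epsilon
        # after exploration_steps environment steps.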
        self.policy = self.policy_fn(self.config.policy_fn, self.estimator,
                                     self.environment.n_actions)
        self.init_replay_memory()

        for cur_episode in range(
                self.cur_episode_tensor.eval(self.sess) + 1,
                self.config.num_episodes, 1):
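            # Resume from cur_episode_tensor + 1 so a run restarted with cont_training
            # continues from the last saved episode.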

            # Save the current checkpoint
            self.save()

            # Update the Cur Episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={
                    self.cur_epsiode_input:
                    self.cur_episode_tensor.eval(self.sess) + 1
                })

            # Evaluate now to see how the agent behaves
            if cur_episode % self.config.evaluate_every == 0:
                self.evaluate(cur_episode / self.config.evaluate_every)

            state = self.environment.reset()
            total_reward = 0

            # Take steps in the environment until the episode reaches a terminal state
            for t in itertools.count():

                # Update the Global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={
                        self.global_step_input:
                        self.global_step_tensor.eval(self.sess) + 1
                    })

                # time to update the target estimator
                if self.global_step_tensor.eval(
                        self.sess
                ) % self.config.update_target_estimator_every == 0:
                    self.update_target_network()

                # Anneal epsilon for this time step, then take an action, observe the
                # outcome and save the transition
                self.epsilon_assign_op.eval(
                    {
                        self.epsilon_input:
                        max(
                            self.config.final_epsilon,
                            self.epsilon_tensor.eval(self.sess) -
                            self.epsilon_step)
                    }, self.sess)
                action = self.take_action(state)
                next_state, reward, done = self.observe_and_save(
                    state, self.environment.valid_actions[action])

                # Sample a minibatch from the replay memory
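                # self.config.prm presumably toggles prioritized replay, whose sampler
                # also returns the sampled indices and importance-sampling weights.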
                if self.config.prm:
                    indices_batch, weights_batch, state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.sample(
                    )
                else:
                    state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.get_batch(
                        self.config.batch_size)

                # Compute the Q-learning targets:
                # r + gamma * max_a' Q_target(s', a') for non-terminal states
                q_values_next = self.estimator.predict(next_state_batch,
                                                       type="target")
                targets_batch = reward_batch + np.invert(done_batch).astype(
                    np.float32) * self.config.discount_factor * np.amax(
                        q_values_next, axis=1)
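                # np.invert(done_batch) masks out the bootstrap term for terminal
                # transitions, so their target reduces to the immediate reward.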

                if self.config.prm:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch, weights_batch)
                else:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch)

                total_reward += reward

                if done:  # Terminal state reached, so finish the episode
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t,
                        'epsilon': self.epsilon_tensor.eval(self.sess)
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break

                state = next_state

        print("Training Finished")

    def train_continous(self):
        # TODO: implement continuous training driven by the global step only
        pass

    def play(self, n_episode=10):
        """Function that play greedily on the policy learnt"""
        # Play Greedily
        self.policy = self.policy_fn('greedy', self.estimator,
                                     self.environment.n_actions)

        for cur_episode in range(n_episode):

            state = self.environment.reset()
            total_reward = 0

            for t in itertools.count():

                best_action = self.policy(self.sess, state)
                next_state, reward, done = self.environment.step(
                    self.environment.valid_actions[best_action])

                total_reward += reward

                if done:
                    print("Total Reward in Epsiode " + str(cur_episode) +
                          " = " + str(total_reward))
                    print("Total Length in Epsiode " + str(cur_episode) +
                          " = " + str(t))
                    break

                state = next_state

    def evaluate(self, local_step):

        print('evaluation #{0}'.format(local_step))

        policy = self.policy_fn('greedy', self.estimator,
                                self.evaluation_enviroment.n_actions)

        for cur_episode in range(self.config.evaluation_episodes):

            state = self.evaluation_enviroment.reset()
            total_reward = 0

            for t in itertools.count():

                best_action = policy(self.sess, state)
                next_state, reward, done = self.evaluation_enviroment.step(
                    self.evaluation_enviroment.valid_actions[best_action])

                total_reward += reward

                if done:
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'evaluation.total_reward': total_reward,
                        'evaluation.length': t
                    }
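                    # The factor of 5 appears to assume evaluation_episodes == 5, giving
                    # each evaluation episode a distinct summary step.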
                    self.add_summary(summaries_dict,
                                     local_step * 5 + cur_episode)
                    break

                state = next_state

        print('Finished evaluation #{0}'.format(local_step))
Esempio n. 15
0
def train():
    parser = argparse.ArgumentParser(
        description='Train an agent in the ViZDoom environment.')
    parser.add_argument('map_name', help='path to the map config')
    parser.add_argument('--output_path',
                        dest='output_path',
                        help='output path for agent checkpoints')
    parser.add_argument(
        '--save_interval',
        dest='save_interval',
        default=10,
        type=int,
        help='interval, measured in epochs, between each agent checkpoint')
    parser.add_argument('--cuda',
                        dest='cuda',
                        default=False,
                        action='store_true',
                        help='whether to use cuda')
    parser.add_argument('--log_interval',
                        dest='log_interval',
                        default=10,
                        type=int,
                        help='interval between each progress update log')
    parser.add_argument(
        '--score_buffer_size',
        dest='score_buffer_size',
        default=50,
        type=int,
        help='the number of most recent scores that will be kept to compute '
        'statistics')

    parser.add_argument('--n_epochs',
                        dest='n_epochs',
                        default=1000,
                        type=int,
                        help='number of epochs')
    parser.add_argument('--epoch_len',
                        dest='epoch_len',
                        default=1024,
                        type=int,
                        help='the length of an epoch')
    parser.add_argument('--lr',
                        dest='lr',
                        default=2.5e-4,
                        type=float,
                        help='learning rate')
    parser.add_argument('--lr_decay',
                        dest='decay_lr',
                        default=False,
                        action='store_true',
                        help='whether to decay learning rate each epoch')
    parser.add_argument('--gamma',
                        dest='gamma',
                        default=0.99,
                        type=float,
                        help='discount factor')
    parser.add_argument('--batch_size',
                        dest='batch_size',
                        default=32,
                        type=int,
                        help='batch size')
    parser.add_argument('--alg',
                        dest='alg',
                        default='ppo',
                        choices=['ppo', 'dqn', 'a2c'],
                        help='the algorithm the agent will use')
    parser.add_argument(
        '--nn',
        dest='nn',
        default='deepmind_cnn',
        choices=['deepmind_cnn', 'capsnet'],
        help='neural network that the agent will use as its feature network')

    parser.add_argument('--frame_skip',
                        dest='frame_skip',
                        default=4,
                        type=int,
                        help='number of frames to skip each action')
    parser.add_argument('--frames_per_state',
                        dest='frames_per_state',
                        default=4,
                        type=int,
                        help='number of frames to stack every state')
    parser.add_argument('--state_w',
                        dest='state_w',
                        default=108,
                        type=int,
                        help='target state width to resize each frame to')
    parser.add_argument('--state_h',
                        dest='state_h',
                        default=60,
                        type=int,
                        help='target state height to resize each frame to')
    parser.add_argument('--state_rgb',
                        dest='rgb',
                        default=False,
                        action='store_true',
                        help='whether to use rgb or gray frames')
    parser.add_argument(
        '--shape_rewards',
        dest='shape_rewards',
        default=False,
        action='store_true',
        help=
        'whether to use a reward shaping function specified for the selected map'
    )
    parser.add_argument(
        '--use_default_actions_for_map',
        dest='use_default_actions',
        default=False,
        action='store_true',
        help=
        'whether to use a default set of actions specified for the selected map'
    )

    parser.add_argument('--ppo_lambda',
                        dest='lam',
                        default=0.95,
                        type=float,
                        help='lambda value for GAE')
    parser.add_argument('--ppo_eps',
                        dest='eps',
                        default=0.1,
                        type=float,
                        help='clipping parameter for PPO')
    parser.add_argument(
        '--ppo_decay_params',
        dest='ppo_decay',
        default=False,
        action='store_true',
        help=
        'whether to decay PPO learning rate and epsilon each epoch linearly')
    parser.add_argument('--ppo_ent_coeff',
                        dest='ent_coeff',
                        default=0.01,
                        type=float,
                        help='entropy coefficient for PPO')
    parser.add_argument('--ppo_value_coeff',
                        dest='value_coeff',
                        default=1.0,
                        type=float,
                        help='value coefficient for PPO')
    parser.add_argument('--ppo_opt_epochs',
                        dest='opt_epochs',
                        default=4,
                        type=int,
                        help='number of optimization epochs for PPO')

    parser.add_argument('--dqn_use_ddqn',
                        dest='ddqn',
                        default=False,
                        action='store_true',
                        help='whether to use ddqn instead of dqn')
    parser.add_argument('--dqn_dueling',
                        dest='dueling',
                        default=False,
                        action='store_true',
                        help='whether to use a dueling architecture in dqn')
    parser.add_argument('--dqn_min_eps',
                        dest='min_eps',
                        default=0.01,
                        type=float,
                        help='minimum value of epsilon for dqn')
    parser.add_argument('--dqn_mem_size',
                        dest='memory_size',
                        default=100000,
                        type=int,
                        help='replay memory size for dqn')
    parser.add_argument('--dqn_init_size',
                        dest='init_size',
                        default=10000,
                        type=int,
                        help='number of timesteps before dqn starts learning')
    parser.add_argument('--dqn_q_update_interval',
                        dest='q_update_interval',
                        default=1,
                        type=int,
                        help='the interval between updates of the q function')
    parser.add_argument(
        '--dqn_target_update_interval',
        dest='target_update_interval',
        default=1000,
        type=int,
        help='the interval between updates of the target q function')

    args = parser.parse_args()

    game = initialize_vizdoom(args.map_name)

    if args.use_default_actions:
        actions = default_actions_for_map(game, args.map_name)
    else:
        actions = all_actions(game)

    reward_fn = default_reward_shaping(
        args.map_name) if args.shape_rewards else None

    in_channels = args.frames_per_state * (3 if args.rgb else 1)
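    # in_channels: frames_per_state stacked frames, 3 channels each for RGB or 1 for grayscale.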

    if args.nn == 'deepmind_cnn':
        feature_net = CNN(in_channels)
    elif args.nn == 'capsnet':
        feature_net = CapsNet(in_channels)

    if args.alg == 'ppo':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)

        eps_sched = LinearSchedule("eps",
                                   args.eps,
                                   1,
                                   args.n_epochs,
                                   end_val=1.0 if not args.ppo_decay else 0.0)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr",
                           args.lr,
                           1,
                           args.n_epochs,
                           end_val=1.0 if not args.ppo_decay else 0.0))
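        # end_val here appears to act as a fraction of the start value: 1.0 keeps the
        # parameter constant, while 0.0 decays it linearly to zero over n_epochs
        # (an assumption based on how these schedules are configured).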
        schedules = [lr_sched, eps_sched]

        agent = PPOAgent(policy,
                         optimizer,
                         eps_sched,
                         cuda=args.cuda,
                         n_timesteps=args.epoch_len,
                         batch_size=args.batch_size,
                         opt_epochs=args.opt_epochs,
                         gamma=args.gamma,
                         lam=args.lam,
                         entropy_coeff=args.ent_coeff,
                         value_coeff=args.value_coeff)
    elif args.alg == 'a2c':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)

        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr",
                           args.lr,
                           1,
                           args.n_epochs,
                           end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched]

        agent = A2CAgent(policy, optimizer, args.cuda, args.gamma,
                         args.epoch_len)
    elif args.alg == 'dqn':
        q = QNetwork(feature_net, len(actions))
        tq = QNetwork(feature_net, len(actions))
        optimizer = torch.optim.Adam(q.parameters(), lr=args.lr)

        memory = ReplayMemory(args.memory_size)
        eps_sched = LinearSchedule("eps",
                                   1,
                                   1,
                                   args.n_epochs,
                                   end_val=args.min_eps)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr",
                           args.lr,
                           1,
                           args.n_epochs,
                           end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched, eps_sched]

        agent = DQNAgent(q,
                         tq,
                         optimizer,
                         memory,
                         eps_sched,
                         cuda=args.cuda,
                         init_steps=args.init_size,
                         q_update_interval=args.q_update_interval,
                         target_update_interval=args.target_update_interval,
                         ddqn=args.ddqn,
                         gamma=args.gamma,
                         batch_size=args.batch_size)

    progress_monitor = ProgressMonitor(args.score_buffer_size,
                                       monitor_interval=args.log_interval)

    env_params = {
        "env": {
            "frame_skip": args.frame_skip,
            "frames_per_state": args.frames_per_state,
            "state_dim": (3 if args.rgb else 1, args.state_h, args.state_w),
            "actions": actions
        },
        "agent": {
            "alg": args.alg,
            "nn": args.nn
        },
        "save_path": args.output_path,
        "save_interval": args.save_interval,
        "progress_monitor": progress_monitor,
        "map_name": args.map_name
    }

    if args.output_path:
        checkpoint_monitor = CheckpointMonitor(env_params, agent)
        monitors = [checkpoint_monitor, progress_monitor]
    else:
        monitors = [progress_monitor]

    generator = TrajectoryGenerator(game,
                                    args.n_epochs,
                                    args.epoch_len,
                                    agent,
                                    shape_reward_fn=reward_fn,
                                    monitors=monitors,
                                    param_schedules=schedules,
                                    **env_params["env"])

    generator.run()