Example No. 1
  def train(self):
  
    for t in range(self.num_iterations):
      actions, returns, losses = self.sample_model_reward_return(t)

      self.sess.run(self.train_op, feed_dict={ 
                    self.action_placeholder : actions, 
                    self.advantage_placeholder : returns-losses})
      
      #avg_acc = np.mean(returns)
      avg_acc = (np.mean(returns)*data_std + data_mean) / 100000.

      #calculate number of used models:
      used = 0
      for key in self._used_dict.keys():
        used += self._used_dict[key]
      #used = np.sum(self._used_dict)
      self._num_used_models.append(used)


      self.log_acc.append(avg_acc)
      #sigma_reward = np.sqrt(np.var(returns) / len(total_rewards))
      msg = "Average accuracy within a batch: {:04.2f}".format(avg_acc*100)
      self.logger.info(msg)
      #print (actions)

  
    self.logger.info("- Training done.")
    #export_plot(self.log_acc, "Batch_Accuracy", 'NAS-DNN', "./batch_accuracy.png", self._num_used_models, "Sampled Model")
    export_plot(self.log_acc, "Score", 'NAS-DNN', "./batch_accuracy.png")
    export_plot(self._num_used_models, "Models Sampled", 'NAS-DNN', "./used_models.png")

    print('log_acc')
    print(self.log_acc)
    print('_num_used_models')
    print(self._num_used_models)
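
export_plot is called by every example on this page but defined in none of them. Below is a minimal sketch consistent with the four-argument calls in Example No. 1 (values, y-axis label, plot title, output path); the implementation is an assumption, only the call shape comes from the snippets, and several later examples use a three-argument variant without the title.

import matplotlib
matplotlib.use("Agg")  # render to a file without needing a display
import matplotlib.pyplot as plt


def export_plot(ys, ylabel, title, filename):
    """Save a simple line plot of ys to filename (assumed helper)."""
    plt.figure()
    plt.plot(range(len(ys)), ys)
    plt.xlabel("Training iteration")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(filename)
    plt.close()
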
    def train(self):
        """
        Performs training

        You do not have to change or use anything here, but take a look
        to see how all the code you've written fits together!
        """
        last_eval = 0
        last_record = 0
        scores_eval = []

        self.init_averages()
        scores_eval = []  # list of scores computed at iteration time

        for t in range(self.config.num_batches):

            # collect a minibatch of samples
            paths, total_rewards = self.sample_path(self.env)
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            # compute Q-val estimates (discounted future returns) for each time
            # step
            returns = self.get_returns(paths)
            advantages = self.calculate_advantage(returns, observations)

            # run training operations
            if self.config.use_baseline:
                self.update_baseline(returns, observations)
            self.sess.run(self.train_op, feed_dict={
                          self.observation_placeholder: observations,
                          self.action_placeholder: actions,
                          self.advantage_placeholder: advantages})

            # tf stuff
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval)
                self.record_summary(t)

            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

            if self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        export_plot(
            scores_eval,
            "Score",
            config.env_name,
            self.config.plot_output)
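
The loop above calls self.get_returns(paths) for "discounted future returns" and self.calculate_advantage(returns, observations), neither of which is shown. The sketch below is a minimal discounted-return computation consistent with that comment and with the path dictionaries used above; the discount factor and the standalone-function form are assumptions.

import numpy as np


def get_returns(paths, gamma=0.99):
    """Discounted future return G_t = r_t + gamma * G_{t+1} per step, concatenated over paths."""
    all_returns = []
    for path in paths:
        rewards = path["reward"]
        path_returns = np.zeros(len(rewards))
        running = 0.0
        for i in reversed(range(len(rewards))):
            running = rewards[i] + gamma * running
            path_returns[i] = running
        all_returns.append(path_returns)
    return np.concatenate(all_returns)
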
    def train(self, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        self.init_averages()

        t = last_eval = curri_idx = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time

        prog = Progbar(target=self.config.nsteps_train)
        rcopy = RepeatCopy(num_bits=self.config.num_bits,
                           batch_size=self.config.batch_size,
                           min_length=self.config.min_length,
                           max_length=self.config.max_length,
                           min_repeats=self.config.min_repeats,
                           max_repeats=self.config.max_repeats)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            config = self.config
            batch_data = rcopy()

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                                   batch_data)

            # logging stuff
            if ((t % config.log_freq == 0)
                    and (t % config.learning_freq == 0)):
                self.update_averages(scores_eval)
                lr_schedule.update(t)
                prog.update(t + 1,
                            exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                   ("lr", lr_schedule.epsilon)])

            if t >= config.nsteps_train:
                break

            if last_eval >= config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
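
The loop above only touches lr_schedule through its .epsilon attribute and .update(t) method. Here is a linear-decay sketch satisfying that interface; the class name, constructor arguments, and decay rule are assumptions.

class LinearSchedule(object):
    """Value that decays linearly from eps_begin to eps_end over nsteps, then stays at eps_end."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # clamp at eps_end once t passes nsteps
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)
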
Example No. 4
    def train(self):
        self.baseline = -1000.0

        for t in range(self.num_iterations):
            #print ('iterations:', t)
            actions, con_actions, returns, losses = self.sample_model_reward_return(
                t)
            returns = returns * 2
            #self.baseline = (t*self.baseline + np.mean(returns)) / (t+1)
            if self.baseline == -1000.0:
                self.baseline = np.mean(returns)
            else:
                self.baseline = 0.6 * self.baseline + 0.4 * np.mean(returns)

            self.sess.run(self.train_op,
                          feed_dict={
                              self.action_placeholder: actions,
                              self.con_action_placeholder: con_actions,
                              self.advantage_placeholder:
                              returns - self.baseline
                          })

            avg_acc = np.mean(returns)
            used = len(self._used_models)
            self._num_used_models.append(used)

            self.log_acc.append(avg_acc)
            #sigma_reward = np.sqrt(np.var(returns) / len(total_rewards))
            msg = "Average accuracy within a batch: {:04.2f}".format(avg_acc)
            self.logger.info(msg)
            #print (actions)

        self.logger.info("- Training done.")
        export_plot(self.log_acc, "Score", 'NAS-CNN', "./batch_accuracy.png")
        export_plot(self._num_used_models, "Number of distinct models sampled",
                    'NAS-CNN', "./used_models.png")
Example No. 5
    def train(self):

        for t in range(self.num_batches):
            actions, returns = self.sample_model_reward_return()
            #self.baseline = (t*self.baseline + np.mean(returns)) / (t+1)
            if self.baseline == -1000.0:
                self.baseline = np.mean(returns)
            else:
                self.baseline = 0.6 * self.baseline + 0.4 * np.mean(returns)

            self.sess.run(self.train_op,
                          feed_dict={
                              self.action_placeholder: actions,
                              self.advantage_placeholder: returns
                          })  # not using baseline here

            avg_acc = np.mean(returns)

            #calculate number of used models:
            used = 0
            #for key in self._used_dict.keys():
            #used += self._used_dict[key]
            used = np.sum(self._used_dict)
            self._num_used_models.append(used)

            self.log_acc.append(avg_acc)
            #sigma_reward = np.sqrt(np.var(returns) / len(total_rewards))
            msg = "Average accuracy within a batch: {:04.2f}".format(avg_acc)
            self.logger.info(msg)
            print(actions)

        self.logger.info("- Training done.")
        #export_plot(self.log_acc, "Batch_Accuracy", 'NAS-DNN', "./batch_accuracy.png", self._num_used_models, "Sampled Model")
        export_plot(self.log_acc, "Score", 'NAS-DNN', "./batch_accuracy.png")
        export_plot(self._num_used_models, "Models Sampled", 'NAS-DNN',
                    "./used_models.png")
Example No. 6
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        orientation_map = [
            np.array([0, 1]),
            np.array([-1, 0]),
            np.array([0, -1]),
            np.array([1, 0])
        ]

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            flag = True
            while flag:
                state = self.env.reset()  # h x w x c
                agent_location = self.env.state.agent_location
                if self.env.teacher.dist_map[agent_location[1],
                                             agent_location[0]] != np.inf:
                    flag = False
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            h_state = (np.zeros([1, self.config.h_size]),
                       np.zeros([1, self.config.h_size]))
            h_state_a = (np.zeros([1, model_a.config.h_size]),
                         np.zeros([1, model_a.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0
            for i in range(200):
                t += 1
                last_eval += 1
                last_record += 1

                raw_goal_state, goal_state = self.convert_state_to_goal_state(
                    state)
                #### for replay_buffer
                # replay memory stuff
                idx = replay_buffer.store_frame(raw_goal_state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], h_state, slen, [action])
                action = exp_schedule.get_action(best_action)

                # store q values (q_vals avoids clobbering the q_values deque)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                reward = 0
                #### perform action in env ####
                #### update goal obs image ####
                if action == 1:
                    if self.config.render_train:
                        self.env.teacher.update_goal_obs_image(self.env.state)
                if self.config.render_train: self.env.render()
                #### teacher move ####
                agent_location = self.env.state.agent_location
                agent_orientation = self.env.state.agent_orientation
                goal_location = agent_location + agent_orientation
                gt_action = self.env.teacher.action_map[agent_location[1],
                                                        agent_location[0]]
                if np.dot(agent_orientation, orientation_map[gt_action]) == 1:
                    new_state, reward_i, done = self.env.step(0)
                else:
                    tmp = np.cross(agent_orientation,
                                   orientation_map[gt_action])
                    if tmp == 1:
                        new_state, reward_i, done = self.env.step(3)
                    else:
                        new_state, reward_i, done = self.env.step(2)
                #### issue command ####
                if action == 1:
                    model_a.env.teacher.set_goal(goal_state, goal_location)
                    reward_a = model_a.navi_goal(h_state_a, goal_state)
                    if model_a.env.teacher.goal_finish:
                        reward += reward_i
                    reward += reward_a
                    reward += -1
                    self.env.state.teleport(
                        self.env.agent, model_a.env.state.agent_location,
                        model_a.env.state.agent_orientation)
                    new_state = self.env.state.onehot_state

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
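
Example No. 6 decides which way the teacher turns by taking the 2-D cross product of the agent's facing vector and the desired direction, then branching on its sign. That sign convention is shown in isolation below; the mapping from the sign to the environment's action IDs is not shown in the snippet, and the helper is purely illustrative.

import numpy as np


def turn_direction(facing, desired):
    """+1 if desired is 90 degrees counter-clockwise from facing, -1 if clockwise, 0 if aligned or opposite."""
    return int(np.sign(np.cross(facing, desired)))


print(turn_direction(np.array([1, 0]), np.array([0, 1])))   # facing east, want north -> 1
print(turn_direction(np.array([1, 0]), np.array([0, -1])))  # facing east, want south -> -1
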
Example No. 7
    def train(self, exp_schedule, lr_schedule, exp_schedule1, env=None):
        """
        Performs training of Q only on agent 0

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        if env is None:
            env = self.env

        # initialize replay buffer and variables
        rewards = deque(maxlen=self.config.num_episodes_test)
        rewardsB = deque(maxlen=self.config.num_episodes_test)
        self.model_0.rewards = rewards
        self.model_1.rewards = rewardsB
        # self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)
        self.model_0.train_init()
        self.model_1.train_init()

        # next_fire_B = False

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            # need_new_ball = False
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: env.render()

                action_0 = self.model_0.train_step_pre(state, exp_schedule)
                # if exp_schedule.epsilon == 1:
                #     action_1 = exp_schedule.get_action(0,3)  # agent altogether
                # else:
                action_1 = self.model_1.train_step_pre(state[:, ::-1],
                                                       exp_schedule1)
                cur_action = actions.trans(action_0, action_1)

                # perform action in env
                new_state, reward, done, info = env.step(cur_action)

                # print("Reward", reward)

                # Problem
                loss_e0, grad_e0 = self.model_0.train_step_post(
                    reward, done, t, lr_schedule, True)
                self.model_1.train_step_post(-reward, done, t, lr_schedule,
                                             False)
                state = new_state

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    # self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[
                                        ("Loss", loss_e0),
                                        ("Avg R", np.mean(rewards)),
                                        ("Max R", np.max(rewards)),
                                        ("Min R", np.min(rewards)),
                                        ("eps", exp_schedule.epsilon),
                                        ("Grads", grad_e0),
                                        ("Max Q",
                                         np.mean(self.model_0.max_q_values)),
                                        ("lr", lr_schedule.epsilon)
                                    ])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)
            rewardsB.append(-total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record(exp_schedule)
                self.model_0.save(t)  # save the models
                self.model_1.save(t)  # save the models

        # last words
        self.logger.info("- Training done.")
        self.model_0.save()  # save the models
        self.model_1.save()  # save the models
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
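
In these DQN-style loops exp_schedule is driven through .get_action(best_action), .update(t), and .epsilon. Below is a sketch of an object satisfying that interface with epsilon-greedy exploration; the constructor arguments and the uniform random fallback are assumptions.

import numpy as np


class LinearExploration(object):
    """Epsilon-greedy action selection with a linearly decaying epsilon."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        self.num_actions = num_actions
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

    def get_action(self, best_action):
        # explore with probability epsilon, otherwise act greedily
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return best_action
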
Example No. 8
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        last_frames = deque(maxlen=4)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += []

        extractor = PongExtractor()

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            last_frame = state
            last_frames.append(state)
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                feats = extractor.extract(np.squeeze(state))
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)
                embedding = self.sess.run(self.hidden,
                                          feed_dict={self.s: [q_input]})[0]
                action = exp_schedule.get_action(best_action)

                # store q values (q_vals avoids clobbering the q_values deque)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                if t % 100 == 0:
                    # print state.shape
                    # frame = np.zeros(np.squeeze(state).shape)
                    # for f in last_frames:
                    #     frame = frame + np.squeeze(f)
                    # frame = frame / len(last_frames)
                    frame = np.squeeze(state)
                    last_frame = np.squeeze(last_frame)
                    # pickle needs a binary file handle; close it when done
                    with open('frames/embedding/atari{}.p'.format(t),
                              'wb') as pf:
                        pickle.dump(last_frames, pf)
                    for i in range(4):
                        f = np.squeeze(last_frames[i])
                        scipy.misc.imsave(
                            'frames/embedding/atari{}.png'.format(t - 3 + i),
                            f)

                    # scipy.misc.imsave('frames/atari{}.png'.format(t-1),last_frame)
                    # posfile = open('frames/atari{}.txt'.format(t),'w')
                    # posfile.write('Opp Paddle:\t{}\n'.format(oppY))
                    # posfile.write('Player Paddle:\t{}\n'.format(playerY))
                    # posfile.write('ball x:\t{}\n'.format(ballX))
                    # posfile.write('ball y:\t{}\n'.format(ballY))
                    # posfile.close()
                    np.savetxt('frames/embedding/pong{}.txt'.format(t),
                               feats,
                               fmt='%.2f')

                # perform action in env
                new_state, reward, done, info = self.env.step(action)
                # print "state shape:",state.shape()

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                last_frame = state
                state = new_state
                last_frames.append(state)

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
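
In these loops the replay buffer is constructed with a size and a state-history length, then touched via store_frame, encode_recent_observation, and store_effect (train_step presumably samples from it). The sketch below covers just that interface; the real utility's sampling is omitted, and every detail (class name, frame stacking, eviction policy) is an assumption.

import numpy as np


class SimpleReplayBuffer(object):
    """Bounded buffer exposing the three calls used in the loops above."""

    def __init__(self, size, state_history):
        self.size = size
        self.state_history = state_history
        self.frames, self.actions, self.rewards, self.dones = [], [], [], []

    def store_frame(self, frame):
        if len(self.frames) >= self.size:  # evict the oldest transition
            for buf in (self.frames, self.actions, self.rewards, self.dones):
                del buf[0]
        self.frames.append(np.asarray(frame))
        self.actions.append(0)
        self.rewards.append(0.0)
        self.dones.append(False)
        return len(self.frames) - 1  # index handed back to store_effect

    def encode_recent_observation(self):
        # stack the most recent `state_history` frames along the last axis,
        # repeating the oldest stored frame if fewer are available
        recent = self.frames[-self.state_history:]
        while len(recent) < self.state_history:
            recent = [recent[0]] + recent
        return np.concatenate(recent, axis=-1)

    def store_effect(self, idx, action, reward, done):
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.dones[idx] = done
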
    def train(self, model_i, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        self.init_averages()

        t = last_eval = curri_idx = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            encoding_batch = []
            predflag_batch = []
            target_action_batch = []
            slen_batch = []
            max_len = 0
            for i in range(self.config.batch_size):
                #config = self.config
                #config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
                #self.env.reset(config) # h x w x c
                encoding, target_action, predflag = model_i.gen_sample_seq(self.config.ndigits, self.config.nway)
                encoding_batch.append(encoding[None])
                predflag_batch.append(predflag[None])
                target_action_batch.append(target_action[None])
                slen_batch.append(encoding.shape[0])
                if encoding.shape[0]>max_len:
                    max_len = encoding.shape[0]

            batch_data = DatasetTensors(
                np.concatenate([
                    np.concatenate(
                        [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])],
                        axis=1) for x in encoding_batch
                ], axis=0),
                np.concatenate([
                    np.concatenate(
                        [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])],
                        axis=1) for x in target_action_batch
                ], axis=0),
                np.concatenate([
                    np.concatenate(
                        [x, np.zeros([1, max_len - x.shape[1]])],
                        axis=1) for x in predflag_batch
                ], axis=0),
                np.array(slen_batch).astype('int32'))

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

            # logging stuff
            if ((t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)):
                self.update_averages(scores_eval)
                lr_schedule.update(t)
                prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval), ("lr", lr_schedule.epsilon)])

            if t >= self.config.nsteps_train:
                break

            if last_eval >= self.config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_i)]
                '''
                if scores_eval[-1]>0.8:
                    curri_idx += 1
                    msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                    self.logger.info(msg)
                    self.logger.info("----------Start Computing Final Score----------")
                    scores_eval += [self.evaluate(model_i)]
                    self.logger.info("----------Finish Computing Final Score----------")
                '''

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_i)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
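
The DatasetTensors assembly above zero-pads every sampled sequence along the time axis up to the longest sequence in the batch before concatenating along the batch axis. The 3-D case is written out as a helper below (the 2-D predflag arrays are padded the same way, just without the feature axis; the helper name is made up).

import numpy as np


def pad_and_stack(seqs, max_len):
    """Zero-pad each [1, t, f] array along the time axis to max_len, then stack on axis 0."""
    padded = []
    for x in seqs:
        pad = np.zeros([1, max_len - x.shape[1], x.shape[2]])
        padded.append(np.concatenate([x, pad], axis=1))
    return np.concatenate(padded, axis=0)
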
Example No. 10
    def export_score(self, scores_eval):
        export_plot(
            scores_eval, "Scores",
            self.config.plot_output + "scores_" + str(self.index) + ".png")
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        model_a.env.state.is_render_image = model_a.config.render_train
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]

        npath = self.config.npath # paths to generate in each environment
        nquery = self.config.nquery # query to generate in each environment
        max_plan_len = self.config.max_plan_len
        ndigits = self.config.ndigits
        nway = self.config.nway

        num_classes = len(self.env.state.xmap.item_class_id)

        # three steps:
        #   1. sample paths from the teacher environment and pass to dnc
        #   2. get immediate reward from whether agent could reach the subgoal
        #   3. sample query paths and ask agent to follow the plan, get the final big reward
        #   -- train one step after each teacher's move
        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            self.env.reset()
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            dnc_state = DNC.zero_state(self.config, batch_size=1)
            h_state = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0

            # sample paths
            for i in range(npath):
                state_seq, path_loc, path_ori = self.env.teacher.gen_sample_seq(self.env.state)
                state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
                goal_state_seq = np.reshape(state_seq, [state_seq.shape[0], 4, 3, 3, num_classes+2]).astype('bool')
                #### missing could be everything ####
                goal_state_seq = np.tile(goal_state_seq[:,:,:,:,[num_classes]], [1,1,1,1,num_classes+2])+goal_state_seq
                #### treat missing observation as correct observation ####
                goal_state_seq[:,:,:,:,num_classes] = True
                #### transpose
                goal_state_seq = np.transpose(goal_state_seq, [0,2,3,4,1])
                path_len = state_seq.shape[0]
                mask_seq = np.logical_not(state_seq[:,:3,:,num_classes])
                flag_seq = np.zeros([path_len])
                flag_seq[-1] = 1
                model_a.env.state.teleport(model_a.env.agent, path_loc[0], orientation_map[path_ori[0]])

                for j in range(path_len):
                    # get agate from dnc
                    cur_dnc_in = np.concatenate([state_seq_encoding[j].reshape(-1),mask_seq[j].reshape(-1), np.array([0, flag_seq[j]])], axis=0)
                    agate_dnc_val = self.sess.run(self.agate_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    agate_dnc_val = agate_dnc_val[0,0]
                    # get q value and sample action
                    idx = replay_buffer.store_frame(state_seq[j])
                    q_input = replay_buffer.encode_recent_observation()
                    best_action, q_vals, h_state = self.get_best_action([q_input], h_state, slen, [action], [agate_dnc_val])
                    action = exp_schedule.get_action(best_action)
                    # store q values (q_vals avoids clobbering the q_values deque)
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)
                    # take action and update dnc
                    cur_dnc_in[-2] = action
                    dnc_state = self.sess.run(self.hs_out_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    # acquire reward
                    reward = 0
                    done = False
                    if action==1:
                        h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                        model_a.env.teacher.set_goal(goal_state_seq[j], path_loc[j])
                        reward_a = model_a.navi_goal(h_state_a, goal_state_seq[j])
                        if not model_a.env.teacher.goal_finish:
                            reward += -0.05
                        reward += -0.05
                        model_a.env.state.teleport(model_a.env.agent, path_loc[j], orientation_map[path_ori[j]])
                    # acquire final reward
                    if i==npath-1 and j==path_len-1:
                        done = True
                        reward_list = list()
                        for k in range(nquery):
                            reward_list.append(0)
                            src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = self.env.teacher.gen_sample_query(self.env.state)
                            src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                            tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                            path_dnc_val, target_ldm_dnc_val = self.sess.run([self.path_dnc, self.target_ldm_dnc], feed_dict={self.hs_dnc: dnc_state, self.src_inputs_dnc: src_inputs[None], 
                                self.tgt_inputs_dnc: tgt_inputs[None], self.max_len_dnc: max_plan_len})
                            path_dnc_val = DRQN_planner.decode_state(np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]), ndigits, nway, num_classes+2)
                            target_ldm_dnc_val = DRQN_planner.decode_state(np.reshape(target_ldm_dnc_val[0], [3, 3, -1]), ndigits, nway, num_classes+2)
                            path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                            target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                            cur_len = max_plan_len
                            for l in range(max_plan_len):
                                if (path_dnc_val_inner[l]==target_ldm_dnc_val_inner).all():
                                    cur_len = l+1
                                    break
                            path_dnc_val = path_dnc_val[:cur_len]
                            path_dnc_val = np.concatenate([path_dnc_val, goal_obs_onehot_state[None]], 0)
                            #### modify goal state ####
                            #### missing could be everything ####
                            path_dnc_val = np.tile(path_dnc_val[:,:,:,[num_classes]], [1,1,1,num_classes+2])+path_dnc_val
                            #### treat missing observation as correct observation ####
                            path_dnc_val[:,:,:,num_classes] = True
                            model_a.env.state.teleport(model_a.env.agent, src_loc, np.array([0,1]))
                            h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                            for l in range(path_dnc_val.shape[0]):
                                cur_goal_state = path_dnc_val[l]
                                cur_goal_state = np.expand_dims(cur_goal_state, 3)
                                cur_goal_state = np.concatenate([np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                                    np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                                model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                                reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                            if model_a.env.teacher.goal_finish:
                                reward_list[-1] += 10
                        reward += sum(reward_list)/len(reward_list)
                    # store everything into replay buffer
                    replay_buffer.store_effect(idx, action, agate_dnc_val, reward, done)

                    t += 1
                    last_eval += 1
                    last_record += 1

                    # perform a training step
                    loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                    # logging stuff
                    if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                       (t % self.config.learning_freq == 0)):
                        self.update_averages(rewards, max_q_values, q_values, scores_eval)
                        exp_schedule.update(t)
                        lr_schedule.update(t)
                        if len(rewards) > 0:
                            prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                            ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                            ("Grads", grad_eval), ("Max Q", self.max_q), 
                                            ("lr", lr_schedule.epsilon)])

                    elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                        sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                            self.config.learning_start))
                        sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
Example No. 12
    def train(self):
        """
		Performs training

		You do not have to change or use anything here, but take a look
		to see how all the code you've written fits together!
		"""
        last_eval = 0
        last_record = 0
        scores_eval = []

        self.init_averages()
        scores_eval = []  # list of scores computed at iteration time

        previous_avg_reward = -np.Inf
        for t in range(self.config.num_batches):

            # collect a minibatch of samples
            paths, total_rewards = self.sample_path(self.env)
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            # compute Q-val estimates (discounted future returns) for each time step
            returns = self.get_returns(paths)
            advantages = self.calculate_advantage(returns, observations)

            # run training operations
            if self.config.use_baseline:
                self.update_baseline(returns, observations)

            if self.config.use_sgd is True:
                print("USE SGD")
                feed_dict = {
                    self.observation_placeholder: observations,
                    self.action_placeholder: actions,
                    self.sgd_lr_placeholder: self.sgd_lr,
                    self.advantage_placeholder: advantages
                }

                old_params = self.sess.run(self.get_pi_params)

                # 1 time is enough even if policy is stochastic: checks ok ... results are very consistent
                old_actions = self.sess.run(
                    self.sampled_action,
                    feed_dict={self.observation_placeholder: observations})

                start = timer()
                old_penalty = env.penalty(observations, old_actions)
                end = timer()  # Takes 17 seconds on my setup

                print("...Time to compute penalty: {}".format(end - start))
                print("...len: {} {} {}".format(len(old_params),
                                                len(observations),
                                                len(old_actions)))
                print("...old_penalty {}".format(old_penalty))

                sgd_gradient = self.sess.run(
                    self.gradient, feed_dict)  # !!! DO NOT SWAP LINES !!!
                self.sess.run(self.sgd_train_op, feed_dict)

                new_actions = self.sess.run(
                    self.sampled_action,
                    feed_dict={self.observation_placeholder: observations})
                sgd_penalty = env.penalty(observations, new_actions)
                print("...sgd_penalty {}".format(sgd_penalty))

                sgd_params = self.sess.run(self.get_pi_params)
                xxx_params = old_params - self.sgd_lr * sgd_gradient
                print(np.linalg.norm(xxx_params - sgd_params, ord=2))
                #assert 2==1 # checks ongoing

                if sgd_penalty > old_penalty:
                    start = timer()
                    backtrack_lr = copy.copy(self.sgd_lr)
                    for i in range(self.config.backtrack_iters):
                        backtrack_lr *= self.config.backtrack_decay
                        self.sess.run(self.set_pi_params,
                                      feed_dict={
                                          self.v_ph:
                                          old_params -
                                          backtrack_lr * sgd_gradient
                                      })
                        new_actions = self.sess.run(
                            self.sampled_action,
                            feed_dict={
                                self.observation_placeholder: observations
                            })
                        bt_penalty = env.penalty(observations, new_actions)
                        print(
                            "...BACKTRACKING bt_penalty {}".format(bt_penalty))

                        if bt_penalty < sgd_penalty:
                            print(
                                "BACKTRACKING: improvement at iter {} bt_penalty={} sgd_penalty={}"
                                .format(i, bt_penalty, sgd_penalty))
                            break

                        if i == self.config.backtrack_iters - 1:
                            # Nothing better found during backtracking, restore sgd_params
                            self.sess.run(self.set_pi_params,
                                          feed_dict={self.v_ph: sgd_params})
                    end = timer()
                    print("...Backtracking Time: {}".format(end - start))

            else:
                print("USE ADAM")
                self.sess.run(self.train_op,
                              feed_dict={
                                  self.observation_placeholder: observations,
                                  self.action_placeholder: actions,
                                  self.advantage_placeholder: advantages
                              })

            # tf stuff
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval)
                self.record_summary(t)

            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

            #if self.config.use_sgd is True and avg_reward < previous_avg_reward:
            #	self.sgd_lr = self.sgd_lr * 0.9
            #	print("Decay SGD LR to {}".format(self.sgd_lr))
            #previous_avg_reward = avg_reward

            if self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        export_plot(scores_eval, "Score", config.env_name,
                    self.config.plot_output)
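
When use_sgd is set, Example No. 12 compares a constraint penalty before and after the plain SGD step and, if it got worse, backtracks: it repeatedly shrinks the learning rate, re-applies the update from the saved parameters, and restores the original SGD step if nothing improves. The same control flow is sketched in isolation below; the function name and arguments are illustrative, and the snippet's env.penalty plays the role of penalty_fn.

def backtracking_step(params, gradient, penalty_fn, lr, decay, iters):
    """Plain gradient step; if the penalty worsens, shrink the step until it improves."""
    old_penalty = penalty_fn(params)
    sgd_params = params - lr * gradient
    sgd_penalty = penalty_fn(sgd_params)
    if sgd_penalty <= old_penalty:
        return sgd_params
    bt_lr = lr
    for _ in range(iters):
        bt_lr *= decay
        candidate = params - bt_lr * gradient
        if penalty_fn(candidate) < sgd_penalty:
            return candidate  # first backtracked step that beats the SGD step
    return sgd_params  # nothing better found; keep the original SGD step
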
    def train(self, beta_schedule, lr_schedule, cr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        self.init_averages()

        t = last_eval = curri_idx = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        curriculum_batch_size = np.ceil(
            self.config.nsteps_train /
            cr_schedule.n_curriculum).astype('int32')

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            config = self.config
            config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit = cr_schedule[
                curri_idx]
            self.env.reset(config)  # h x w x c
            h_state = DNC.zero_state(config, batch_size=1)
            encoding, predflag, target_action = self.env.prepare_seq()
            slen = np.array(encoding.shape[0]).astype('int32')
            # describe graph, query and planning
            h_state = self.sess.run(self.hs_out,
                                    feed_dict={
                                        self.s: encoding[None],
                                        self.hs: h_state,
                                        self.slen: slen
                                    })
            past_state = -1
            past_action_onehot = -1
            encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
            predflag_a = np.zeros(config.max_step_len)
            target_action_a = np.zeros(
                [config.max_step_len, target_action.shape[1]])
            for i in range(config.max_step_len):
                current_encoding = GraphWorld.convert_triplets_to_encoding(
                    np.array([[
                        past_state, self.env.current_state, past_action_onehot
                    ]]).astype('int32'), config.ndigits, config.nway)
                current_encoding = np.concatenate(
                    [current_encoding, np.array([[1, 0]])], axis=1)
                pred_action, h_state = self.sess.run(
                    [self.q, self.hs_out],
                    feed_dict={
                        self.s: current_encoding[None],
                        self.hs: h_state,
                        self.slen: np.ones(1).astype('int32')
                    })
                gt_action = self.env.get_gt_action()
                action = self.get_action(pred_action[0], gt_action,
                                         beta_schedule.epsilon)
                past_state = self.env.current_state
                _, done, past_action_onehot = self.env.step(action)
                encoding_a[i, :] = current_encoding[0]
                predflag_a[i] = 1
                target_action_a[i] = gt_action
                slen += 1
                if done:
                    break
            batch_data = (np.concatenate([encoding, encoding_a], axis=0)[None],
                          np.concatenate([predflag, predflag_a], axis=0),
                          np.concatenate([target_action, target_action_a],
                                         axis=0), slen)

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                                   batch_data)

            # logging stuff
            if ((t % config.log_freq == 0)
                    and (t % config.learning_freq == 0)):
                self.update_averages(scores_eval)
                beta_schedule.update(t)
                lr_schedule.update(t)
                prog.update(t + 1,
                            exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                   ("lr", lr_schedule.epsilon)])

            if t >= config.nsteps_train:
                break

            if last_eval >= config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate(cr_schedule, curri_idx)]
                if scores_eval[-1] > 0.8:
                    curri_idx += 1
                    msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                    self.logger.info(msg)
                    self.logger.info(
                        "----------Start Computing Final Score----------")
                    scores_eval += [self.evaluate(cr_schedule)]
                    self.logger.info(
                        "----------Finish Computing Final Score----------")

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(cr_schedule)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
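
The example above picks each step's action via self.get_action(pred_action[0], gt_action, beta_schedule.epsilon), mixing the network's prediction with the teacher's ground-truth action. One plausible reading of that helper, as a scheduled-sampling style coin flip, is sketched below; the actual selection rule is not shown in the snippet, so everything here is an assumption.

import numpy as np


def mix_action(pred_action_scores, gt_action_onehot, beta):
    """With probability beta follow the teacher's action, otherwise take the model's argmax."""
    if np.random.random() < beta:
        return int(np.argmax(gt_action_onehot))
    return int(np.argmax(pred_action_scores))
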
Example No. 14
    def train(self, exp_schedule, lr_schedule):
        # Initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.FLAGS.buffer_size,
                                     self.FLAGS.state_hist)
        rewards = deque(maxlen=self.FLAGS.num_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = 0  # time control of nb of steps
        loss_eval = grad_eval = 0
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

        self.prog = Progbar(target=self.FLAGS.train_steps)

        # Train for # of train steps
        continual_crash = 0  # consecutive-crash counter must persist across retries
        while t < self.FLAGS.train_steps:
            try:
                total_reward = 0
                ep_len = 0
                state = self.env.reset()

                # Run for 1 episode and update the buffer
                while True:
                    ep_len += 1

                    # replay memory stuff
                    idx = replay_buffer.store_frame(state)
                    q_input = replay_buffer.encode_recent_observation()

                    # choose action according to current Q and exploration
                    best_action, q_vals = self.network.get_best_action(
                        q_input)
                    action = exp_schedule.get_action(best_action)

                    # store q values (q_vals avoids clobbering the q_values deque)
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)

                    # perform action in env
                    new_state, reward, done, info = self.env.step(action)

                    # store the transition
                    replay_buffer.store_effect(idx, action, reward, done)
                    state = new_state

                    # Count reward
                    total_reward += reward

                    # Stop at end of episode
                    if done: break

                #Store episodic rewards
                if ep_len > 1: rewards.append(total_reward)

                # Learn using replay
                while True:
                    t += 1
                    ep_len -= 1

                    # Make train step if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.learn_every == 0)):
                        loss_eval, grad_eval = self.network.update_step(
                            t, replay_buffer, lr_schedule.epsilon,
                            self.summary)
                        exp_schedule.update(t)
                        lr_schedule.update(t)

                    if (t % self.FLAGS.target_every == 0):
                        self.network.update_target_params()

                    # Update logs if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.log_every == 0)
                            and (len(rewards) > 0)):
                        self.update_averages(rewards, max_q_values, q_values,
                                             scores_eval)
                        self.update_logs(t, loss_eval, rewards,
                                         exp_schedule.epsilon, grad_eval,
                                         lr_schedule.epsilon)

                    # Update logs if necessary
                    elif (t < self.FLAGS.learn_start) and (
                            t % self.FLAGS.log_every == 0):
                        sys.stdout.write(
                            "\rPopulating the memory {}/{}...".format(
                                t, self.FLAGS.learn_start))
                        sys.stdout.flush()

                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.check_every == 0)):
                        # Evaluate current model
                        scores_eval += [
                            self.evaluate(self.env, self.FLAGS.num_test)
                        ]

                        # Save current Model
                        self.network.save()

                        # Record video of current model
                        if self.FLAGS.record:
                            self.record()

                    if ep_len <= 0 or t >= self.FLAGS.train_steps: break
                continual_crash = 0

            except Exception as e:
                continual_crash += 1
                self.logger.info(e)
                if continual_crash >= 10:
                    self.logger.info("Crashed 10 times -- stopping u suck")
                    raise e
                else:
                    t -= 1
                    self.logger.info("Env crash, making new env")
                    time.sleep(60)
                    self.env = create_slither_env(self.FLAGS.state_type)
                    self.env = Unvectorize(self.env)
                    self.env.configure(fps=self.FLAGS.fps,
                                       remotes=self.FLAGS.remotes,
                                       start_timeout=15 * 60,
                                       vnc_driver='go',
                                       vnc_kwargs={
                                           'encoding': 'tight',
                                           'compress_level': 0,
                                           'fine_quality_level': 50
                                       })
                    time.sleep(60)

        # End of training
        self.logger.info("- Training done.")
        self.network.save()
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
        export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
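
The loop above relies on a replay buffer with a store_frame / encode_recent_observation / store_effect cycle: a frame is stored and indexed, the recent history is encoded as the network input, and the resulting transition is attached to the same index. The project's actual ReplayBuffer class is not part of this listing; the following is only a minimal sketch of that interface for a single-frame history, with illustrative names, to make the call pattern concrete.

import numpy as np

class MinimalReplayBuffer(object):
    """Toy ring buffer exposing the interface used above (1-frame history)."""

    def __init__(self, size):
        self.size = size
        self.next_idx = 0
        self.num_in_buffer = 0
        self.obs = self.action = self.reward = self.done = None

    def store_frame(self, frame):
        frame = np.asarray(frame)
        if self.obs is None:  # allocate lazily once the frame shape is known
            self.obs = np.empty((self.size,) + frame.shape, dtype=frame.dtype)
            self.action = np.empty(self.size, dtype=np.int32)
            self.reward = np.empty(self.size, dtype=np.float32)
            self.done = np.empty(self.size, dtype=np.bool_)
        idx = self.next_idx
        self.obs[idx] = frame
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.num_in_buffer + 1, self.size)
        return idx

    def encode_recent_observation(self):
        # with a history of one frame this is simply the latest stored frame
        return self.obs[(self.next_idx - 1) % self.size]

    def store_effect(self, idx, action, reward, done):
        self.action[idx] = action
        self.reward[idx] = reward
        self.done[idx] = done

    def can_sample(self, batch_size):
        return self.num_in_buffer >= batch_size
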
Example No. 15
0
    def train(self):
        last_record = 0

        self.init_averages()
        scores_eval = []
        self.plot = {
            'room' + str(i): {j: []
                              for j in range(config.num_sub_policies)}
            for i in range(4)
        }

        for t in range(self.config.num_batches):
            # print(t, self.get_epsilon(t))
            paths, total_rewards = self.sample_path(env=self.env)

            scores_eval += total_rewards

            if str(config.env_name).startswith("Fourrooms"):
                observations = np.expand_dims(
                    np.concatenate([path["observation"] for path in paths]),
                    axis=1)
            else:
                observations = np.concatenate(
                    [path["observation"] for path in paths])

            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            returns = self.get_returns(paths)
            advantages = self.calculate_advantage(returns, observations)

            if self.config.use_baseline:
                self.update_baseline(returns, observations)
            self.sess.run(
                self.train_op,
                feed_dict={
                    self.observation_placeholder: observations,
                    self.action_placeholder: actions,
                    self.advantage_placeholder: advantages
                })

            if t % self.config.summary_freq == 0:
                self.update_averages(total_rewards, scores_eval)
                self.record_summary(self.batch_counter)
            self.batch_counter = self.batch_counter + 1

            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

            last_record += 1
            if self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

            if t % config.record_freq == 0:
                self.save_model_checkpoint(self.sess, self.saver,
                                           os.path.join(
                                               self.config.output_path,
                                               'model.ckpt'), t)

        self.logger.info("- Training done.")
        export_plot(scores_eval, "Score", config.env_name,
                    self.config.plot_output)

        if str(config.env_name).startswith(
                "Fourrooms") and config.examine_master:
            import matplotlib.pyplot as plt
            plt.rcParams["figure.figsize"] = [12, 12]
            f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(
                2, 2, sharex='col', sharey='row')
            axes = {'room0': ax1, 'room1': ax2, 'room2': ax3, 'room3': ax4}
            for room in self.plot:
                axes[room].set_title(room, size=20)
                for sub in range(config.num_sub_policies):
                    prob_list = self.plot[room][sub]
                    axes[room].plot(
                        range(len(prob_list)), prob_list, linewidth=5)
                axes[room].legend(
                    [
                        'subpolicy' + str(sub)
                        for sub in range(config.num_sub_policies)
                    ],
                    loc='upper left',
                    prop={
                        'size': 20
                    })
            plt.tight_layout()
            plt.savefig('Rooms and Subs', dpi=300)
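
The "Average reward: ... +/- ..." line logged in the batch loop above reports the mean episode reward together with its standard error (the square root of the variance divided by the number of episodes). A small self-contained check of that statistic:

import numpy as np

def reward_stats(episode_rewards):
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    avg_reward = rewards.mean()
    sigma_reward = np.sqrt(rewards.var() / len(rewards))  # standard error of the mean
    return avg_reward, sigma_reward

avg, sigma = reward_stats([1.0, 3.0, 2.0, 4.0])
print("Average reward: {:04.2f} +/- {:04.2f}".format(avg, sigma))  # 2.50 +/- 0.56
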
Example No. 16
0
    def train(self, exp_schedule, lr_schedule, choose_teacher_strategy=None):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        allsteps = []
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                if self.config.state_subspace is not None:
                    out_of_bounds = False
                    if self.config.state_subspace in [
                            'ball_top_half', 'ball_bottom_half'
                    ]:
                        image = self.env.unwrapped._get_obs()
                        ball_position = ball_half_screen_position(image)
                        # check if ball is in top half but we're restricted to bottom half
                        if ball_position == 1 and self.config.state_subspace == 'ball_bottom_half':
                            out_of_bounds = True
                        # check if ball is in bottom half but we're restricted to top half
                        elif ball_position == 0 and self.config.state_subspace == 'ball_top_half':
                            out_of_bounds = True
                    else:
                        raise NotImplementedError
                    if out_of_bounds:  # current state is outside of this agent's state subspace
                        # no action has been chosen yet at this point, so take a
                        # random step while outside the subspace (assumption)
                        state, reward, done, info = self.env.step(
                            self.env.action_space.sample())

                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()
                # self.q_inputs.append(q_input)

                # choose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)
                action = exp_schedule.get_action(best_action)

                # store q values (use q_vals so the q_values deque is not shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                if choose_teacher_strategy is not None:
                    # store the reward with the teacher choice strategy
                    choose_teacher_strategy.store_reward(reward, q_input)

                # perform a training step
                loss_eval, grad_eval = self.train_step(
                    t, replay_buffer, lr_schedule.epsilon,
                    choose_teacher_strategy)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if choose_teacher_strategy is not None:
                        choose_teacher_strategy.update_schedule(t)
                    if len(rewards) > 0:
                        exact = [("Loss", loss_eval),
                                 ("Avg R", self.avg_reward),
                                 ("Max R", np.max(rewards)),
                                 ("eps", exp_schedule.epsilon),
                                 ("Grads", grad_eval), ("Max Q", self.max_q),
                                 ("lr", lr_schedule.epsilon)]
                        if choose_teacher_strategy is not None and hasattr(
                                choose_teacher_strategy, 'eps_schedule'):
                            exact.append(
                                ("Choose teacher eps",
                                 choose_teacher_strategy.eps_schedule.epsilon))
                        prog.update(t + 1, exact=exact)

                elif ((t > self.config.learning_start)
                      and (t % self.config.save_teacher_choice_freq == 0)
                      and (choose_teacher_strategy is not None)):
                    choose_teacher_strategy.save(
                        self.config.teacher_choice_output_path)

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
        if choose_teacher_strategy is not None:
            choose_teacher_strategy.save(
                self.config.teacher_choice_output_path)
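
Several of the trainers in this listing take an exp_schedule whose get_action(best_action) implements epsilon-greedy exploration and an lr_schedule that exposes a decaying .epsilon value, both advanced with .update(t). The concrete schedule classes are not shown here; the sketch below is a linearly annealed version consistent with that interface, with illustrative names.

import random

class LinearDecay(object):
    """Linearly anneal a scalar from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps

    def update(self, t):
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

class EpsGreedyExploration(LinearDecay):
    """Epsilon-greedy wrapper matching exp_schedule.get_action(best_action)."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        super(EpsGreedyExploration, self).__init__(eps_begin, eps_end, nsteps)
        self.num_actions = num_actions

    def get_action(self, best_action):
        if random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        return best_action

exp_schedule = EpsGreedyExploration(num_actions=6, eps_begin=1.0, eps_end=0.1, nsteps=100000)
lr_schedule = LinearDecay(0.00025, 0.00005, 500000)
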
Example No. 17
0
    def train(self):
        """
        Performs training

        You do not have to change or use anything here, but take a look
        to see how all the code you've written fits together!
        """
        last_eval = 0
        last_record = 0

        self.init_averages()
        scores_eval = [] # list of scores computed at iteration time

        episode = 0
        episode_reward = 0
        step = 0
        episode_rewards = []
        paths = []

        observation = self.env.reset()

        observation = np.tile(observation, (self.config.batch_size, 1))

        for t in range(self.config.num_batches*self.config.batch_size):
            if self.discrete:
                actions = np.arange(self.action_dim).astype(float)[:, None]
                actions = np.reshape(actions, (1, self.config.batch_size))
                obs = np.tile(observation, (self.action_dim, 1))

                self.sess.run(self.train_op, feed_dict={
                            self.observation_placeholder : obs,
                            self.action_placeholder : actions})

            action = self.sess.run(self.sampled_actions, feed_dict={self.observation_placeholder : observation[None]})[0]
            next_observation, reward, done, info = self.env.step(action)
            next_action = self.sess.run(self.target_sampled_actions, feed_dict={self.next_observation_placeholder : next_observation[None]})[0]
            episode_reward += reward
            step += 1

            action = np.array([action])[None]
            next_action = np.array([next_action])[None]
            reward = np.array([reward])[None]
            done = np.array([done])[None]

            self.update_critic(action, next_action, observation[None], next_observation[None], reward, done)

            if (t > 0 and t % self.config.update_critic_freq == 0):
                self.sess.run(self.update_target_op, feed_dict={})

            if (done or step == self.config.max_ep_len-1):
                episode_rewards.append(episode_reward)
                observation = self.env.reset()
                observation = np.tile(observation, (self.config.batch_size, 1))
                episode_reward = 0
                episode += 1
                step = 0
            else:
                observation = next_observation

            # tf stuff
            if (t % (self.config.summary_freq*self.config.batch_size) == 0 and t > 0):
                self.update_averages(episode_rewards, scores_eval)
                self.record_summary(t)

            if (t % self.config.batch_size == 0 and t > 0):
                # compute reward statistics for this batch and log
                avg_reward = np.mean(episode_rewards)
                sigma_reward = np.sqrt(np.var(episode_rewards) / len(episode_rewards))
                msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
                self.logger.info(msg)
                scores_eval = scores_eval + episode_rewards
                episode_rewards = []

            if self.config.record and (last_record > (self.config.record_freq*self.config.batch_size)):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        export_plot(scores_eval, "Score", config.env_name, self.config.plot_output)
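
The critic update above periodically runs self.update_target_op, which presumably refreshes the target network behind target_sampled_actions. The TensorFlow op itself is defined elsewhere; as a framework-free illustration of what such an op typically does, here is a sketch of hard and soft (Polyak) target updates over plain NumPy parameter dictionaries.

import numpy as np

def hard_update(target_params, online_params):
    """Copy the online parameters into the target network."""
    for name, value in online_params.items():
        target_params[name] = value.copy()

def polyak_update(target_params, online_params, tau=0.005):
    """Soft update: target <- (1 - tau) * target + tau * online."""
    for name, value in online_params.items():
        target_params[name] = (1.0 - tau) * target_params[name] + tau * value

online = {"w": np.ones((2, 2)), "b": np.full(2, 0.5)}
target = {"w": np.zeros((2, 2)), "b": np.zeros(2)}
polyak_update(target, online)   # target drifts slowly toward online
hard_update(target, online)     # target becomes an exact copy
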
Example No. 18
0
  def train(self, exp_schedule, lr_schedule):
    # Initialize replay buffer and variables
    replay_buffer = ReplayBufferAC(self.FLAGS.buffer_size, self.FLAGS.state_hist)
    rewards = deque(maxlen=self.FLAGS.num_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = 0 # time control of nb of steps
    loss_eval = grad_eval = 0
    scores_eval = [] # list of scores computed at iteration time
    #scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

    self.prog = Progbar(target=self.FLAGS.train_steps)
    self.prog2 = Progbar(target=self.FLAGS.train_steps)

    # Train for # of train steps
    while t < self.FLAGS.train_steps:
      total_reward = 0
      ep_len = 0
      state = self.env.reset()
      reward = 0
      first = 1
      q_input = None
      # Run for 1 episode and update the buffer
      while True:
        ep_len += 1
        # replay memory stuff
        if first == 1:
          first = 0
          idx     = replay_buffer.store_frame(state)
          q_input = replay_buffer.encode_recent_observation()
        # choose action according to current Q and exploration
        best_action, q_vals = self.network.get_best_action(q_input)
        action = exp_schedule.get_action(best_action)
        orig_val = self.network.calcState(q_input)

        # store q values (use q_vals so the q_values deque is not shadowed)
        max_q_values.append(max(q_vals))
        q_values += list(q_vals)

        # perform action in env
        new_state, new_reward, done, info = self.env.step(action)
        idx = replay_buffer.store_frame(state)
        q_input = replay_buffer.encode_recent_observation()
        new_val = self.network.calcState(q_input)
        orig_val = orig_val[0][0]
        new_val = new_val[0][0]
        print (orig_val, new_reward, done, new_val, ep_len)


        if not done: # Non-terminal state.
          target = reward + ( self.FLAGS.gamma * new_val)
        else:
          target = reward + ( self.FLAGS.gamma * new_reward )

        best_val = max((orig_val), target)

        actor_delta = new_val - orig_val

        replay_buffer.store_effect(idx-1, action, new_reward, done, best_val, actor_delta)
        state = new_state

        if done:
          replay_buffer.store_effect(idx, action, 0, done, 0, 0)

        # Count reward
        total_reward += new_reward

        reward=new_reward

        # Stop at end of episode
        if done: break

      old_t = t
      temp_ep_len = ep_len
      while True:
        t += 1
        temp_ep_len -= 1

        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
          if replay_buffer.can_sample(self.FLAGS.batch_size):
            loss_eval, grad_eval = self.network.update_critic_step(t, replay_buffer, lr_schedule.epsilon, self.summary)


        # Update logs if necessary
        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)):
          self.update_logs2(t, loss_eval, rewards, exp_schedule.epsilon, grad_eval, lr_schedule.epsilon)

        if temp_ep_len <= 0 or t >= self.FLAGS.train_steps: break


      rewards.append(total_reward)

      # Learn using replay
      while True:

        t += 1
        ep_len -= 1

        # Make train step if necessary
        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
          if replay_buffer.can_sample(self.FLAGS.batch_size):
            loss_eval, grad_eval = self.network.update_actor_step(t, replay_buffer, lr_schedule.epsilon, self.summary)
            exp_schedule.update(t)
            lr_schedule.update(t)

        # Update logs if necessary
        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)):
          self.update_averages(rewards, max_q_values, q_values, scores_eval)
          self.update_logs(t, loss_eval, rewards, exp_schedule.epsilon, grad_eval, lr_schedule.epsilon)

        # Update logs if necessary
        elif (t < self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
          sys.stdout.write("\rPopulating the memory {}/{}...".format(t, self.FLAGS.learn_start))
          sys.stdout.flush()

        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.check_every == 0)):
          # Evaluate current model
          scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

          # Save current Model
          self.network.save()

          # Record video of current model
          if self.FLAGS.record:
            self.record()

        if ep_len <= 0 or t >= self.FLAGS.train_steps: break

      # Update episodic rewards

    # End of training
    self.logger.info("- Training done.")
    self.network.save()
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
    export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
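
The critic targets above follow a one-step temporal-difference pattern: bootstrap from the value of the next state unless the episode has ended, and use new_val - orig_val as the actor's learning signal. For reference, the textbook form of that target and advantage is sketched below; note that the snippet itself bootstraps with the previous step's reward variable, so its bookkeeping is not exactly this form.

def td_target(reward, gamma, next_value, done):
    """One-step TD target for the critic: r + gamma * V(s'), cut off at terminals."""
    return reward + (0.0 if done else gamma * next_value)

def actor_delta(value, next_value):
    """TD-style advantage signal for the actor: how much better the new state looks."""
    return next_value - value

# e.g. r = 1.0, gamma = 0.99, V(s') = 0.5, V(s) = 0.25
print(td_target(1.0, 0.99, 0.5, False))   # 1.495
print(actor_delta(0.25, 0.5))             # 0.25
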
Example No. 19
0
    def export_score(self, scores_eval):
        export_plot(scores_eval, "Scores", self.config.plot_output)

    def train(self):
        """
        Performs training
        """

        last_eval = 0
        last_record = 0
        self.init_averages()
        scores_eval = []  # list of scores computed at iteration time

        # Update learning rate
        if self.max_roll_distance > 400.0:
            self.learning_rate = pow(self.learning_rate, 0.9)

        for t in range(self.config.num_batches):

            # collect a minibatch of samples
            paths, total_rewards, rollout_distances = self.sample_path()
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            # compute Q-val estimates (discounted future returns) for each time step
            returns = self.get_returns(paths)
            advantages = self.calculate_advantage(returns, observations)

            #Check if current model is best:
            if max(rollout_distances) > self.max_max_roll_distance:
                print('New best model found! Saving under: ',
                      self.config.best_model_output)
                self.saver.save(self.sess, self.config.best_model_output)

            # run training operations
            if self.config.use_baseline:
                self.update_baseline(returns, observations)
            self.sess.run(self.train_op,
                          feed_dict={
                              self.observation_placeholder: observations,
                              self.action_placeholder: actions,
                              self.advantage_placeholder: advantages,
                              self.lr: self.learning_rate
                          })

            # tf stuff
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval,
                                     rollout_distances)
                self.record_summary(t)

            print("Learning rate:", self.learning_rate)
            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)
            self.saver.save(self.sess, self.config.model_output)

        self.logger.info("- Training done.")
        export_plot(scores_eval, "Score", config.env_name,
                    self.config.plot_output)
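
One detail worth flagging in the snippet above: for a learning rate between 0 and 1, pow(self.learning_rate, 0.9) is larger than the rate itself, so the max_roll_distance > 400 branch nudges the learning rate upward toward 1 rather than decaying it. A quick numeric check:

lr = 0.01
for _ in range(3):
    lr = pow(lr, 0.9)
    print(lr)
# prints roughly 0.0158, 0.0240, 0.0348 -- the rate grows toward 1
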
Example No. 21
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        if not self.config.batch:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size, self.config.state_history
            )
        else:
            self.logger.info(
                'Loading replay buffer from {}'.format(self.config.buffer_path)
            )
            replay_buffer = ReplayBuffer.load(self.config.buffer_path)
            self.logger.info(
                'Loaded buffer with {} observations and {} in buffer'.format(
                    len(replay_buffer.obs), replay_buffer.num_in_buffer
                )
            )

        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        episode_lengths = deque(maxlen=1000)
        max_episode_length = 0
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0

            if not self.config.batch:
                state = self.env.reset()

            episode_step = 0
            avg_episode_length = (
                np.nan if len(episode_lengths) == 0 else np.mean(episode_lengths)
            )

            while True:
                t += 1
                episode_step += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train:
                    self.env.render()

                if not self.config.batch:
                    get_action = functools.partial(
                        exp_schedule.get_action,
                        episode_num=len(episode_lengths),
                        episode_step=episode_step,
                        avg_episode_length=avg_episode_length
                    )
                    state, reward, done, _q_values = self.interact(
                        replay_buffer, state, get_action
                    )
                else:
                    reward = 0
                    done = True
                    _q_values = [0]

                # store q values
                max_q_values.append(max(_q_values))
                q_values.extend(list(_q_values))

                # perform a training step
                loss_eval, grad_eval = self.train_step(
                    t, replay_buffer, lr_schedule.epsilon
                )

                # logging stuff
                learning = (t > self.config.learning_start)
                learning_and_logging = (
                    learning and
                    (t % self.config.log_freq == 0) and
                    (t % self.config.learning_freq == 0)
                )
                if learning_and_logging:
                    self.update_averages(
                        rewards, max_q_values, q_values,
                        scores_eval, episode_lengths, max_episode_length
                    )
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        if self.config.batch:
                            exact = [
                                ("Loss", loss_eval),
                                ("Grads", grad_eval),
                                ("lr", lr_schedule.epsilon),
                            ]
                        else:
                            exact = [
                                ("Loss", loss_eval),
                                ("Avg_R", self.avg_reward),
                                ("Max_R", np.max(rewards)),
                                ("eps", exp_schedule.epsilon),
                                ("Grads", grad_eval),
                                ("Max_Q", self.max_q),
                                ("lr", lr_schedule.epsilon),
                                ("avg_ep_len", avg_episode_length)
                            ]

                        prog.update(t + 1, exact=exact)

                elif not learning and (t % self.config.log_freq == 0):
                    sys.stdout.write(
                        "\rPopulating the memory {}/{}...".format(
                            t, self.config.learning_start
                        )
                    )
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    episode_lengths.append(episode_step)
                    if episode_step > max_episode_length:
                        max_episode_length = episode_step

                        # retrain the clusters every time the max episode
                        # length changes
                        if hasattr(self, 'reset_counts'):
                            self.reset_counts(
                                n_clusters=max_episode_length,
                                states=replay_buffer.get_encoded_states(),
                                actions=replay_buffer.get_actions()
                            )

                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            should_evaluate = (
                (t > self.config.learning_start) and
                (last_eval > self.config.eval_freq)
            )
            if should_evaluate:
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval.append(self.evaluate())

            should_record = (
                (t > self.config.learning_start) and
                self.config.record and
                (last_record > self.config.record_freq)
            )
            if should_record:
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval.append(self.evaluate())
        export_plot(scores_eval, "Scores", self.config.plot_output)

        if not self.config.batch:
            # save replay buffer
            self.logger.info(
                'Saving buffer to {}'.format(self.config.buffer_path)
            )
            replay_buffer.save(self.config.buffer_path)
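
In the non-batch branch above, functools.partial binds the current episode statistics onto exp_schedule.get_action before handing it to self.interact. A tiny standalone illustration of that binding; the stand-in function below is hypothetical and only mirrors the keyword interface used above.

import functools

def get_action(best_action, episode_num=0, episode_step=0, avg_episode_length=None):
    # a real schedule would trade off exploration using these statistics
    return best_action

bound_get_action = functools.partial(
    get_action, episode_num=3, episode_step=17, avg_episode_length=21.5)

print(bound_get_action(2))  # the episode statistics are already filled in -> 2
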
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        model_a.env.state.is_render_image = model_a.config.render_train
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            flag = True
            while flag:
                state = self.env.reset() # h x w x c
                agent_location = self.env.state.agent_location
                if self.env.teacher.dist_map[agent_location[1],agent_location[0]]!=np.inf:
                    flag = False
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            h_state_fw = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            h_state_bw = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            state_batch = list()
            goal_state_batch = list()
            goal_obs_image_batch = list()
            path_loc = list()
            path_ori = list()
            done_batch = list()
            width, height = self.env.state.xmap.dim['width'], self.env.state.xmap.dim['height']
            side_radius = min(self.config.visible_radius_unit_side, max(width - 1, height - 1))
            block_size = self.env.state.image_block_size
            for i in range(200):
                #### teacher rotate ####
                agent_location = self.env.state.agent_location
                agent_orientation = self.env.state.agent_orientation
                goal_location = agent_location+agent_orientation
                gt_action = self.env.teacher.action_map[agent_location[1], agent_location[0]]
                if np.dot(agent_orientation, orientation_map[gt_action])!=1:
                    tmp = np.cross(agent_orientation, orientation_map[gt_action])
                    if tmp==1:
                        state, reward_i, done = self.env.step(3)
                    else:
                        state, reward_i, done = self.env.step(2)
                    continue
                path_loc.append(copy.deepcopy(goal_location))
                path_ori.append(copy.deepcopy(agent_orientation))
                raw_goal_state, goal_state = self.convert_state_to_goal_state(state)
                state_batch.append(raw_goal_state[None][None])
                goal_state_batch.append(goal_state)
                if self.config.render_train:
                    goal_obs_image_batch.append(self.env.state.image[:3*block_size, (side_radius-1)*block_size:(side_radius+2)*block_size, :])
                state, reward_i, done = self.env.step(0)
                done_batch.append(done)
                if done:
                    break

            slen = np.array([len(state_batch)]).astype('int32')
            state_batch = np.concatenate(state_batch, axis=1)
            best_action_batch, q_values_batch, h_state_fw, h_state_bw = self.get_best_action_batch(state_batch, h_state_fw, h_state_bw, slen)
            action_batch = exp_schedule.get_action_batch(best_action_batch)
            for i in range(q_values_batch.shape[0]):
                max_q_values.append(max(q_values_batch[i]))
                q_values += list(q_values_batch[i])

            reward_batch = list()
            for i, action in enumerate(action_batch):
                if action==0:
                    reward_batch.append(0)
                else:
                    if self.config.render_train:
                        model_a.env.teacher.goal_obs_image = goal_obs_image_batch[i]
                    h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                    model_a.env.teacher.set_goal(goal_state_batch[i], path_loc[i])
                    reward_a = model_a.navi_goal(h_state_a, goal_state_batch[i])
                    if model_a.env.teacher.goal_finish:
                        reward_batch.append(-0.05)
                    else:
                        reward_batch.append(-0.1)
                    #model_a.env.state.teleport(model_a.env.agent, path_loc[i], path_ori[i])
            if action_batch[-1]==1 and model_a.env.teacher.goal_finish:
                reward_batch[-1] += 1
            else:
                if self.config.render_train:
                    model_a.env.teacher.goal_obs_image = goal_obs_image_batch[-1]
                h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                model_a.env.teacher.set_goal(goal_state_batch[-1], path_loc[-1])
                reward_a = model_a.navi_goal(h_state_a, goal_state_batch[-1])
                if model_a.env.teacher.goal_finish:
                    reward_batch[-1] += 1

            for i in range(action_batch.shape[0]):
                idx = replay_buffer.store_frame(state_batch[0][i])
                replay_buffer.store_effect(idx, action_batch[i], reward_batch[i], done_batch[i])

            for i in range(action_batch.shape[0]):
                t += 1
                last_eval += 1
                last_record += 1
                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                   (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                        ("Grads", grad_eval), ("Max Q", self.max_q), 
                                        ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                        self.config.learning_start))
                    sys.stdout.flush()

            # count reward
            total_reward = sum(reward_batch)
            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
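
The teacher-rotation block above checks np.dot(agent_orientation, orientation_map[gt_action]) to see whether the agent already faces the desired direction, and then uses the sign of the 2-D cross product to pick between the two turning actions: for unit grid vectors, a positive cross product means the target direction lies a counter-clockwise turn away. A small check of that rule (the mapping of indices to compass directions is assumed here):

import numpy as np

orientation_map = [np.array([0, 1]), np.array([-1, 0]),
                   np.array([0, -1]), np.array([1, 0])]

current = orientation_map[0]   # facing along +y
desired = orientation_map[1]   # want to face along -x

if np.dot(current, desired) != 1:          # not already aligned
    turn = np.cross(current, desired)      # scalar z-component for 2-D inputs
    print("counter-clockwise" if turn == 1 else "clockwise")  # -> counter-clockwise
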
Example No. 23
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables

        if self.config.use_memory:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size,
                self.config.state_history,
                memory_size=self.config.memory_unit_size)
        else:
            replay_buffer = ReplayBuffer(self.config.buffer_size,
                                         self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()[0]]

        prog = Progbar(target=self.config.nsteps_train)

        evaluation_result_list = []
        oos_evaluation_result_list = []

        # interact with environment
        prev_time = time.time()
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                if self.config.use_memory:
                    prev_memory = replay_buffer.encode_recent_memory()
                    best_action, q_vals, _, next_memory = self.get_best_action_with_memory(
                        q_input, prev_memory)
                    next_memory = np.squeeze(next_memory)
                else:
                    best_action, q_vals = self.get_best_action(q_input)
                # choose action according to current Q and exploration
                action = exp_schedule.get_action(best_action)

                # store q values (use q_vals so the q_values deque is not shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                if self.config.use_memory:
                    replay_buffer.store_memory(idx, next_memory)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                time_log_freq = 1000
                if t % time_log_freq == 0:
                    with open(self.config.output_path + 'time_log.txt',
                              'a') as of:
                        of.write('{}\n'.format(time.time() - prev_time))
                        of.write('\n')
                    prev_time = time.time()

                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg_R", self.avg_reward),
                                           ("Max_R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max_Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                score, complete, length = self.evaluate()
                if complete > 0:
                    evaluation_result_list += [(score, complete, length)]
                if score > self.config.extended_eval_threshold:
                    self.logger.info('Extended in-sample evaluation...')
                    self.evaluate(num_episodes=1000)
                    for _ in range(10):
                        self.logger.info(
                            'Extended out-of-sample evaluation...')
                        oos_result = self.evaluate(
                            EnvMaze(n=self.config.maze_size), num_episodes=100)
                        oos_evaluation_result_list += [oos_result]
                scores_eval += [score]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()[0]]
        export_plot(scores_eval, "Scores", self.config.plot_output)

        return evaluation_result_list, oos_evaluation_result_list
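
The time-logging block above appends the wall-clock seconds elapsed between every 1000 training steps to time_log.txt. Turning those intervals into a throughput figure is straightforward; a small sketch that reads the log back (same file name as above, assuming one float per non-empty line) and reports steps per second:

time_log_freq = 1000  # matches the logging interval used above

def steps_per_second(log_path="time_log.txt"):
    """Convert each logged interval into a steps-per-second estimate."""
    with open(log_path) as f:
        intervals = [float(line) for line in f if line.strip()]
    return [time_log_freq / dt for dt in intervals]
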
Example No. 24
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history, self.config)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        scores_eval += [self.evaluate()]
        
        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx      = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)
                action, explore = exp_schedule.get_action(best_action)

                # store q values (use q_vals so the q_values deque is not shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done, explore)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon, exp_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                   (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                        ("Grads", grad_eval), ("Max Q", self.max_q), 
                                        ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                        self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
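
Unlike the other variants in this listing, this one expects exp_schedule.get_action(best_action) to return both the chosen action and a flag saying whether it was an exploratory pick, and the flag is stored alongside the transition. A minimal epsilon-greedy sketch with that two-value return (illustrative, not the project's actual schedule class):

import random

def get_action(best_action, epsilon, num_actions):
    """Return (action, explore): explore is True when the action was random."""
    if random.random() < epsilon:
        return random.randrange(num_actions), True
    return best_action, False

action, explore = get_action(best_action=2, epsilon=0.1, num_actions=6)
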
Example No. 25
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()  # h x w x c
            goal_state = self.env.teacher.goal_obs_onehot_state  # h x w x c
            h_state = (np.zeros([1, self.config.h_size]),
                       np.zeros([1, self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0
            for i in range(200):
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                #### for replay_buffer
                # replay memory stuff
                idx = replay_buffer.store_frame(state, goal_state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                curr_attention = np.equal(
                    np.sum(np.equal(q_input, goal_state[None][None][None]), 3),
                    q_input.shape[3])
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], curr_attention[None], h_state, slen, [action])
                #best_action, q_vals, h_state = self.get_best_action([q_input], goal_state[None][None], h_state, slen, [action])
                action = exp_schedule.get_action(best_action)

                # store q values (use q_vals so the q_values deque is not shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                # perform action in env
                new_state, reward, done = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
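
The curr_attention tensor computed above appears to flag the locations where the one-hot observation matches the goal observation exactly: the inner np.equal compares channel by channel, the sum counts the matches, and comparing the count against the channel dimension turns it into a boolean mask. The exact shapes depend on the replay buffer's encoding, which is not shown, so here is a toy version with simplified H x W x C shapes (no time or batch axes) to show what such a mask contains.

import numpy as np

H, W, C = 2, 2, 3
obs = np.zeros((H, W, C));  obs[0, 0, 1] = 1;  obs[1, 1, 2] = 1
goal = np.zeros((H, W, C)); goal[0, 0, 1] = 1

# True wherever the full channel vector of a cell matches the goal's
attention = np.equal(np.equal(obs, goal).sum(axis=2), C)
print(attention)
# [[ True  True]
#  [ True False]]
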