Example 1
    def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10,
                 learning_start=1000, render=False, verbose=1, record_ep_inter=None):
        '''
        :param max_step: total number of environment steps to run
        :param max_ep_cycle: max steps per episode
        :param train_rollout: number of update passes per collected rollout
        :param learning_start: steps to collect before training begins
        :param render: whether to render the environment
        .........................show parameters..................................
        :param verbose:
            if verbose == 1   log every training update within a rollout
            if verbose == 2   log a training summary per rollout
        :param record_ep_inter: interval (in episodes) at which interaction data is recorded
        :return: None
        '''
        self.render = render

        # ----------------------- initialize recording ----------------------- #
        rollout = 0
        now_best_reward = -np.inf

        self.dist = make_pdtype(self.env.action_space, self.policy)
        sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle, record_ep_inter, lstm_enable=self.lstm_enable)
        while self.step < max_step:
            sample = next(sample_generate)
            logger.record_tabular("01.step", self.step)
            logger.record_tabular("02.episode",self.episode)
            logger.record_tabular("03.rollout", rollout)
            logger.record_tabular("04.rollout/ep", sample["ep_used"])
            logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"]))
            logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"]))
            logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"]))
            logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"]))
            logger.dump_tabular()
            csv_record(sample["ep_reward"], self.path)
            record_sample = sample["buffer"]

            rollout += 1

            if self.step > learning_start and self.learning:
                ep_show = {}
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] = 0
                rollout_loss = 0
                for time in range(train_rollout):
                    loss, other_info = self.update(record_sample)
                    if verbose == 1:
                        logger.record_tabular("06.train_rollout", time)
                        logger.record_tabular("07.loss", loss)
                        flag = 10
                        if self.backward_step_show_list:
                            for key in self.backward_step_show_list:
                                logger.record_tabular(str(flag) + "." + key, other_info[key])
                                flag += 1
                        logger.dump_tabular()
                    rollout_loss += loss
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            ep_show[key] += other_info[key]
                if verbose == 2:
                    logger.record_tabular("06.rollouts/loss", rollout_loss)
                    logger.record_tabular("07.rollouts/episode_Q_value", torch.mean(
                        torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy())
                    # logger.record_tabular("05.episode_loss_per_step", rollout_loss / samole["step_used"])
                    # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
                    # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))

                    flag = 10
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, ep_show[key])
                            flag += 1
                    logger.dump_tabular()
            if np.mean(sample["ep_reward"])>now_best_reward:
                self.save_weights(self.path)
                print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved")
                now_best_reward = np.mean(sample["ep_reward"])
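The first example is a rollout-based loop: a runner generator yields a batch of whole episodes, the batch is logged, and `update` is then run `train_rollout` times on that buffer before the next batch is collected, with weights saved whenever the mean episode reward improves. Below is a minimal, self-contained sketch of that collect-then-update flow; `DummyEnv`, `collect_rollout`, and the fixed action are illustrative stand-ins, not part of the library the example comes from.

import numpy as np

class DummyEnv:
    """Toy stand-in for the agent's self.env: random rewards, episodes end after 20 steps."""
    def reset(self):
        self.t = 0
        return np.zeros(4)
    def step(self, action):
        self.t += 1
        return np.zeros(4), float(np.random.rand()), self.t >= 20, {}

def collect_rollout(env, n_episodes=2, max_ep_cycle=2000):
    # Mimics the runner generator: gather whole episodes into a single buffer.
    buffer, ep_reward = {"r": []}, []
    for _ in range(n_episodes):
        s, done, ep_r, t = env.reset(), False, 0.0, 0
        while not done and t < max_ep_cycle:
            s, r, done, _ = env.step(0)   # a fixed action keeps the sketch policy-free
            buffer["r"].append(r)
            ep_r += r
            t += 1
        ep_reward.append(ep_r)
    return {"buffer": buffer, "ep_reward": ep_reward}

env, now_best_reward = DummyEnv(), -np.inf
for rollout in range(3):                  # stands in for the outer max_step loop
    sample = collect_rollout(env)
    for _ in range(10):                   # train_rollout update passes on the same buffer
        pass                              # an agent's update(sample["buffer"]) would run here
    if np.mean(sample["ep_reward"]) > now_best_reward:   # checkpoint only on a new best mean reward
        now_best_reward = np.mean(sample["ep_reward"])   # save_weights(...) would be called here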
Example 2
    def interact(self,
                 max_step=50000,
                 max_ep_cycle=2000,
                 render=False,
                 verbose=1,
                 record_ep_inter=None):
        '''
        :param max_step: total number of environment steps to run
        :param max_ep_cycle: max steps per episode
        :param render: whether to render the environment
        .........................show parameters..................................
        :param verbose:
            if verbose == 1   log every step
            if verbose == 2   log a summary at the end of every episode
        :param record_ep_inter: interval (in episodes) at which interaction data is recorded
        :return: None
        '''

        # ----------------------- initialize recording ----------------------- #
        ep_reward = []
        ep_Q_value = []
        ep_loss = []
        now_best_reward = -np.inf
        while self.step < max_step:
            s = self.env.reset()
            # reset the per-episode records
            ep_r, ep_q, ep_l = 0, 0, 0
            # reset the episode termination flags
            ep_cycle, done = 0, 0
            ep_show = {}
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] = 0
            self.episode += 1
            while done == 0 and ep_cycle < max_ep_cycle:
                self.step += 1
                ep_cycle += 1
                # the interaction part
                a, Q, info_forward = self.forward(s)
                s_, r, done, info = self.env.step(a)
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                s = deepcopy(s_)
                loss, info_backward = self.backward(sample)
                if render:
                    self.env.render()
                # the logging / recording part

                if verbose == 1 and self.step > self.learning_starts:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("episodes", self.episode)
                    logger.record_tabular("loss", loss)
                    logger.record_tabular("reward", r)
                    logger.record_tabular("Q", Q)
                    if self.forward_step_show_list:
                        for key in self.forward_step_show_list:
                            logger.record_tabular(key, info_forward[key])
                    if self.backward_step_show_list:
                        for key in self.backward_step_show_list:
                            logger.record_tabular(key, info_backward[key])
                    logger.dump_tabular()
                if record_ep_inter is not None:
                    if self.episode % record_ep_inter == 0:
                        kvs = {
                            "s": s,
                            "a": a,
                            "s_": s_,
                            "r": r,
                            "tr": done,
                            "ep": self.episode,
                            "step": self.step,
                            "ep_step": ep_cycle
                        }
                        self.csvwritter.writekvs(kvs)
                ep_r += r
                ep_q += Q
                ep_l += loss
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] += info_backward[key]
                if done:
                    ep_reward.append(ep_r)
                    ep_Q_value.append(ep_q)
                    ep_loss.append(ep_l)
                    mean_100ep_reward = round(np.mean(ep_reward[-100:]), 1)
                    if verbose == 2 and self.step > self.learning_starts:
                        logger.record_tabular("01.steps", self.step)
                        logger.record_tabular("02.episodes", self.episode)
                        logger.record_tabular("03.episode_reward",
                                              ep_reward[-1])
                        # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle)
                        logger.record_tabular("05.episode_loss", ep_l)
                        # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle)
                        # logger.record_tabular("07.episode_Q_value", ep_q)
                        logger.record_tabular("08.episode_Q_value_per_step",
                                              ep_q / ep_cycle)
                        # logger.record_tabular("09.mean 100 episode reward", mean_100ep_reward)
                        # logger.record_tabular("10.step_used", ep_cycle)
                        flag = 11
                        if self.forward_ep_show_list:
                            for key in self.forward_ep_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, info_forward[key])
                                flag += 1
                        if self.backward_ep_show_list:
                            for key in self.backward_ep_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, ep_show[key])
                                flag += 1
                        logger.dump_tabular()
            if ep_r > now_best_reward:
                self.save_weights(self.path)
                print("new best episode reward:", ep_r, "- weights saved")
                now_best_reward = ep_r
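Unlike the first example, this loop interleaves learning with interaction: `forward` picks an action and `backward` receives the transition on every single environment step, which suits per-step, off-policy learners rather than rollout-batch updates. The sketch below reproduces only that step-wise control flow; `DummyEnv` and the fixed action are illustrative stand-ins, not part of the library the example comes from.

import numpy as np

class DummyEnv:
    """Toy stand-in for the agent's self.env, used only in this sketch."""
    def reset(self):
        self.t = 0
        return np.zeros(4)
    def step(self, action):
        self.t += 1
        return np.zeros(4), float(np.random.rand()), self.t >= 20, {}

env, step, now_best_reward = DummyEnv(), 0, -np.inf
while step < 200:                          # stands in for the max_step budget
    s, done, ep_r, ep_cycle = env.reset(), False, 0.0, 0
    while not done and ep_cycle < 2000:    # max_ep_cycle guards against endless episodes
        step += 1
        ep_cycle += 1
        a = 0                              # an agent's forward(s) would choose the action
        s_, r, done, _ = env.step(a)
        # an agent's backward({"s": s, "a": a, "s_": s_, "r": r, "tr": done}) would train here
        s = s_
        ep_r += r
    if ep_r > now_best_reward:             # checkpoint whenever an episode sets a new best
        now_best_reward = ep_r             # save_weights(...) would be called here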