Code example #1
def __init__(self, path):
    self.step = 0
    self.episode = 0
    # configure the log file outputs
    configlist = ["stdout", "log", "tensorboard", "csv"]
    if path is None:
        path = "./"
    logger.configure(path, configlist)
    self.csvwritter = CSVOutputFormat(path + "record_trajectory.csv")
    loggerCEN = logger.get_current().output_formats[configlist.index("tensorboard")]
    self.writer = loggerCEN.writer
    self.path = path
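A rough usage sketch for the logging objects configured above. It assumes the project's `logger` follows the baselines-style `record_tabular`/`dump_tabular` API used in the later examples, and that `self.writer` is a TensorBoard `SummaryWriter` (an assumption); `SomeAgent` is a hypothetical subclass used only for illustration.

agent = SomeAgent(path="./runs/")                            # hypothetical concrete agent
logger.record_tabular("01.step", agent.step)                 # tabular row -> stdout / log / csv outputs
logger.dump_tabular()
agent.writer.add_scalar("reward/episode", 0.0, agent.step)   # TensorBoard scalar (assumed SummaryWriter API)
agent.csvwritter.writekvs({"step": agent.step, "r": 0.0})    # row appended to record_trajectory.csv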
Code example #2
class Agent_policy_based(ABC):
    """
    所有算法的父类
    Abstract base class for all implemented agents.

    其中包含了
    - `runner`    根据policy 产生 sample
    - `learning`  根据Sample 训练网络
    - `load_weights`  加载权重
    - `save_weights`  存储权重
    - `layers`        网络层
    - 'forward'       前向传播
    - 'backward'       前向传播
    定义 episode 完成一次为一个episode
    定义 step    为交互一次
    """
    def __init__(self, path):
        self.step = 0
        self.episode = 0
        # configure the log file outputs
        configlist = ["stdout", "log", "tensorboard", "csv"]
        if path is None:
            path = "./"
        logger.configure(path, configlist)
        self.csvwritter = CSVOutputFormat(path + "record_trajectory.csv")
        loggerCEN = logger.get_current().output_formats[configlist.index("tensorboard")]
        self.writer = loggerCEN.writer
        self.path = path

    def imitation_learning(self):
        pass

    def train(self, max_step=None, max_ep_cycle=2000, verbose=2, learning_start=1000, render=False, record_ep_inter=None):
        self.learning = True
        print("the train phase ........")
        self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, learning_start=learning_start, render=render,
                 verbose=verbose, record_ep_inter=record_ep_inter)

    def test(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None):
        self.learning = False
        self.learning_starts = 0
        self.step = 0
        self.episode = 0
        print("the test phase ........")
        self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render,
                 verbose=verbose, record_ep_inter=record_ep_inter)

    def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10, learning_start=1000,
                 render=False, verbose=1, record_ep_inter=None):
        '''
        :param max_step:        total number of environment steps to run
        :param max_ep_cycle:    maximum number of steps per episode
        :param train_rollout:   number of training passes over each collected rollout
        :param learning_start:  number of environment steps collected before training starts
        :param render:          render the environment if True
        .........................show parameters..................................
        :param verbose:
            if verbose == 1   additionally log every training iteration within a rollout
            if verbose == 2   additionally log a training summary for each rollout
        :param record_ep_inter:
            interval (in episodes) at which raw interaction data is written to the trajectory CSV
        :return: None
        '''
        # if IL_time is not None:
        self.render = render

        # ..................... initialize the records ...........................#
        rollout = 0
        now_best_reward = -np.inf

        self.dist = make_pdtype(self.env.action_space, self.policy)
        sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle, record_ep_inter, lstm_enable=self.lstm_enable)
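        # main loop: draw one rollout from the sample generator, log its statistics,
        # run `train_rollout` update passes on it, and checkpoint whenever the mean
        # episode reward improves on the best seen so far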
        while self.step < max_step:
            sample = next(sample_generate)
            logger.record_tabular("01.step", self.step)
            logger.record_tabular("02.episode",self.episode)
            logger.record_tabular("03.rollout", rollout)
            logger.record_tabular("04.rollout/ep", sample["ep_used"])
            logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"]))
            logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"]))
            logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"]))
            logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"]))
            logger.dump_tabular()
            csv_record(sample["ep_reward"], self.path)
            record_sample = sample["buffer"]

            rollout += 1

            if self.step > learning_start and self.learning:
                ep_show = {}
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] = 0
                rollout_loss = 0
                for time in range(train_rollout):
                    loss, other_infor = self.update(record_sample)
                    if verbose == 1:
                        logger.record_tabular("06.train_rollout", time)
                        logger.record_tabular("07.loss", loss)
                        flag = 10
                        if self.backward_step_show_list:
                            for key in self.backward_step_show_list:
                                logger.record_tabular(str(flag) +"."+ key, other_infor[key])
                                flag += 1
                        logger.dump_tabular()
                    rollout_loss += loss
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            ep_show[key] += other_infor[key]
                if verbose == 2:
                    logger.record_tabular("06.rollouts/loss", rollout_loss)
                    logger.record_tabular("07.rollouts/episode_Q_value", torch.mean(
                        torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy())
                    # logger.record_tabular("05.episode_loss_per_step", rollout_loss / samole["step_used"])
                    # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
                    # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))

                    flag = 10
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, ep_show[key])
                            flag += 1
                    logger.dump_tabular()
            if np.mean(sample["ep_reward"])>now_best_reward:
                self.save_weights(self.path)
                print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved")
                now_best_reward = np.mean(sample["ep_reward"])

    def runner(self, sample_step=None, sample_ep=None, max_ep_step=2000, record_ep_inter=None, lstm_enable=False):
        if sample_step is not None:
            buffer = ReplayMemory(sample_step, ["value", "logp","info"])
        else:
            buffer = ReplayMemory(sample_ep*max_ep_step, ["value", "logp","info"])
        s = self.env.reset()
        ep_reward, ep_Q_value, ep_step_used = [], [], []
        ep_r, ep_q, ep_cycle = 0, 0, 0
        while True:
            s = torch.from_numpy(s.astype(np.float32))
            with torch.no_grad():
                outcome = self.policy.forward(s.unsqueeze(0))
                Q = self.value.forward(s.unsqueeze(0))
            pd = self.dist(outcome)
            a = pd.sample()
            s_, r, done, info = self.env.step(a.cpu().squeeze(0).numpy())
            if self.render:
                self.env.render()
            ep_r += r
            ep_q += Q
            ep_cycle +=1
            self.step += 1
            logp = pd.log_prob(a)
            sample_ = {
                "s": s,
                "a": a.squeeze(0),
                "r": torch.tensor(np.array([r]).astype(np.float32)),
                "tr": torch.tensor([int(done)]),
                "s_":torch.from_numpy(s_),
                "logp": logp.squeeze(0),
                "value": Q.squeeze(0),
                "info": info}
            buffer.push(sample_)
            s = deepcopy(s_)

            if record_ep_inter is not None:
                if self.episode % record_ep_inter == 0:
                    kvs = {"s": s, "a": a, "s_": s_, "r": r,
                           "tr": done, "ep": self.episode, "step": self.step, "ep_step": ep_cycle}
                    self.csvwritter.writekvs(kvs)

            if done:
                s = self.env.reset()
                self.episode += 1
                ep_reward.append(ep_r)
                ep_Q_value.append(ep_q)
                ep_step_used.append(ep_cycle)
                ep_r, ep_q, ep_cycle = 0, 0, 0
                if lstm_enable:
                    self.policy.reset_h()

            if sample_step is not None:
                if self.step > 0 and self.step % sample_step == 0:
                    s_ = torch.from_numpy(s_[np.newaxis,:].astype(np.float32))
                    with torch.no_grad():
                        last_Q = self.value.forward(s_).squeeze()
                    #print("now is we have sampled for :", self.step , "and" , self.episode,"\n",
                    #      "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode",
                    #      "and the mean reward per step is",  np.mean(buffer.memory["r"]),
                    #      "the mean ep reward is ", np.mean(ep_reward))
                    yield {"buffer": buffer.memory,
                           "ep_reward": ep_reward,
                           "ep_Q_value": ep_Q_value,
                           "ep_step_used": ep_step_used,
                           "ep_used": len(ep_reward),
                           "step_used": sample_step,
                           "last_Q" : last_Q
                           }
                    ep_reward, ep_Q_value, ep_step_used = [], [], []
                    if sample_step is not None:
                        buffer = ReplayMemory(sample_step, ["value", "logp","info"])
                    else:
                        buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp","info"])

            else:
                if done and self.episode > 0 and self.episode % sample_ep == 0:
                    s_ = torch.from_numpy(s_.astype(np.float32))
                    with torch.no_grad():
                        last_Q = self.value.forward(s_)
                    #print("now is we have sampled for :", self.step , "and" , self.episode,"\n",
                    #      "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode",
                    #      "and the mean reward per step is",  np.mean(buffer.memory["r"]),
                     #     "the mean ep reward is ", np.mean(ep_reward))
                    yield {"buffer": buffer.memory,
                           "ep_reward": ep_reward,
                           "ep_Q_value": ep_Q_value,
                           "ep_step_used": ep_step_used,
                           "ep_used": sample_ep,
                           "step_used": len(buffer.memory["tr"]),
                           "last_Q": last_Q
                           }
                    ep_reward, ep_Q_value, ep_step_used = [], [], []
                    if sample_step is not None:
                        buffer = ReplayMemory(sample_step, ["value", "logp","info"])
                    else:
                        buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp","info"])


    def update(self, sample):
        """Updates the agent from a rollout of collected samples.
        If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.

        # Argument
            sample (dict): The buffer of transitions collected by `runner`.

        # Returns
            The loss and a dict of additional training metrics.
        """
        raise NotImplementedError()

    def load_weights(self, filepath):
        """Loads the weights of an agent from an HDF5 file.

        # Arguments
            filepath (str): The path to the HDF5 file.
        """
        raise NotImplementedError()

    def save_weights(self, filepath, overwrite=False):
        """Saves the weights of an agent as an HDF5 file.

        # Arguments
            filepath (str): The path to where the weights should be saved.
            overwrite (boolean): If `False` and `filepath` already exists, raises an error.
        """
        raise NotImplementedError()

    def cuda(self):
        """
        Move the agent's networks to the GPU (CUDA).
        """
        raise NotImplementedError()


    def Imitation_Learning(self, step_time, data=None, policy=None, learning_start=1000,
                           buffer_size=5000, value_training_round=10, value_training_fre=2500,
                           verbose=2, render=False):
        '''
        :param step_time:   number of imitation-learning steps to run
        :param data:        a list of transitions; each element is a dict with the 5 keys s, a, r, s_, tr,
                            e.g. sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy:      a callable that takes the environment and returns an action (used when no data is given)
        :return: None
        '''
        if data is not None and policy is not None:
            raise Exception("Imitation learning needs exactly one guidance source: provide either data or a policy, not both.")

        if data is not None:
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            buffer = ReplayMemory(buffer_size)
            s = self.env.reset()
            loss_BC = 0
            ep_step,ep_reward = 0, 0
            for _ in range(step_time):
                self.step += 1
                ep_step += 1
                a = policy(self.env)
                s_, r, done, info = self.env.step(a)
                #print(r,info)
                ep_reward += r
                if render:
                    self.env.render()
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                buffer.push(sample)
                s = s_[:]
                if self.step > learning_start:
                    sample_ = buffer.sample(self.batch_size)
                    loss = self.policy_behavior_clone(sample_)
                    if self.step % value_training_fre == 0:
                        record_sample = {}
                        for key in buffer.memory.keys():
                            record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:]
                        record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"]))
                        returns, advants = get_gae(record_sample["r"], record_sample["tr"], record_sample["value"],
                                                   self.gamma, self.lam)
                        record_sample["advs"] = advants
                        record_sample["return"] = returns
                        for round_ in range(value_training_round):
                            loss_value = self.value_pretrain(record_sample, value_training_fre)
                            print(round_, loss_value)

                    if verbose == 1:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("loss", loss)
                        logger.record_tabular("rewrad",r)
                        logger.dumpkvs()
                    loss_BC += loss
                if done:
                    if verbose == 2:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("step_used", ep_step)
                        logger.record_tabular("loss", loss_BC/ep_step)
                        logger.record_tabular("ep_reward",ep_reward )
                        logger.dumpkvs()

                    s = self.env.reset()
                    loss_BC = 0
                    ep_step,ep_reward = 0, 0

    def policy_behavior_clone(self, sample_):
        raise NotImplementedError()

    def value_pretrain(self, sample_, training_step):
        raise NotImplementedError()
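To make the contract used by `interact` and `runner` above explicit, here is a minimal hypothetical subclass sketch. The class name, networks, hyper-parameter values, and checkpoint file name are illustrative assumptions; the only parts taken from the code above are the attributes that `interact`/`runner` read (`env`, `policy`, `value`, `sample_rollout`, `sample_ep`, `lstm_enable`, `backward_step_show_list`, `backward_ep_show_list`) and the `update`/`save_weights`/`load_weights` interface.

import torch

class MyPolicyAgent(Agent_policy_based):        # hypothetical subclass
    def __init__(self, env, policy_net, value_net, path=None):
        super().__init__(path)
        # attributes read by interact() / runner() above
        self.env = env                      # gym-style env (reset/step/render)
        self.policy = policy_net            # state -> distribution parameters (used by make_pdtype)
        self.value = value_net              # state -> state value
        self.sample_rollout = 2048          # collect this many steps per rollout
        self.sample_ep = None               # unused when sampling by steps
        self.lstm_enable = False
        self.backward_step_show_list = []   # extra per-update metrics to log
        self.backward_ep_show_list = []     # extra per-rollout metrics to log

    def update(self, record_sample):
        # one optimization pass over the rollout buffer; interact() expects
        # (scalar loss, dict of extra metrics keyed by the show-lists above)
        loss = 0.0
        return loss, {}

    def save_weights(self, filepath, overwrite=False):
        torch.save({"policy": self.policy.state_dict(),
                    "value": self.value.state_dict()}, filepath + "checkpoint.pth")

    def load_weights(self, filepath):
        state = torch.load(filepath + "checkpoint.pth")
        self.policy.load_state_dict(state["policy"])
        self.value.load_state_dict(state["value"])

# usage (env / policy_net / value_net assumed to exist):
# agent = MyPolicyAgent(env, policy_net, value_net, path="./runs/")
# agent.train(max_step=100000, learning_start=1000, verbose=2)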
Code example #3
File: core_value.py  Project: lelegan/Torch-rl
class Agent_value_based(ABC):
    """
    所有算法的父类
    Abstract base class for all implemented agents.

    其中包含了
    - `forward`   前向传播、计算action
    - `backward`  后向传播、更新网络
    - `load_weights`  加载权重
    - `save_weights`  存储权重
    - `layers`        网络层
    - 'forward'       前向传播
    - 'backward'       前向传播
    定义 episode 完成一次为一个episode
    定义 step    为交互一次
    """
    def __init__(self, path):
        self.step = 0
        self.episode = 0
        # configure the log file outputs
        configlist = ["stdout", "log", "tensorboard", "csv"]
        if path is None:
            path = "./"
        logger.configure(path, configlist)
        self.csvwritter = CSVOutputFormat(path + "record_trajectory.csv")
        loggerCEN = logger.get_current().output_formats[configlist.index("tensorboard")]
        self.writer = loggerCEN.writer
        self.path = path

    def imitation_learning(self):
        pass

    def train(self,
              max_step=None,
              max_ep_cycle=2000,
              verbose=2,
              render=False,
              record_ep_inter=None):
        self.learning = True
        print("the train phase ........")
        self.interact(max_step=max_step,
                      max_ep_cycle=max_ep_cycle,
                      render=render,
                      verbose=verbose,
                      record_ep_inter=record_ep_inter)

    def test(self,
             max_step=None,
             max_ep_cycle=2000,
             verbose=2,
             render=False,
             record_ep_inter=None):
        self.learning = False
        self.learning_starts = 0
        self.step = 0
        self.episode = 0
        print("the test phase ........")
        self.interact(max_step=max_step,
                      max_ep_cycle=max_ep_cycle,
                      render=render,
                      verbose=verbose,
                      record_ep_inter=record_ep_inter)

    def interact(self,
                 max_step=50000,
                 max_ep_cycle=2000,
                 render=False,
                 verbose=1,
                 record_ep_inter=None):
        '''
        :param max_step:       total number of environment steps to run
        :param max_ep_cycle:   maximum number of steps per episode
        :param render:         render the environment if True
        .........................show parameters..................................
        :param verbose:
            if verbose == 1   log every step (once learning has started)
            if verbose == 2   log every episode
        :param record_ep_inter:
            interval (in episodes) at which raw interaction data is written to the trajectory CSV
        :return: None
        '''
        # if IL_time is not None:

        # ..................... initialize the records ...........................#
        ep_reward = []
        ep_Q_value = []
        ep_loss = []
        now_best_reward = -np.inf
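        # main loop: each outer iteration runs one episode; every environment step
        # triggers a backward() update, and the weights are checkpointed whenever
        # the episode reward improves on the best seen so far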
        while self.step < max_step:
            s = self.env.reset()
            # reset the per-episode record
            ep_r, ep_q, ep_l = 0, 0, 0
            # reset the RL flags
            ep_cycle, done = 0, 0
            ep_show = {}
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] = 0
            self.episode += 1
            while done == 0 and ep_cycle < max_ep_cycle:
                self.step += 1
                ep_cycle += 1
                # the interaction part
                a, Q, info_forward = self.forward(s)
                # print(a)
                s_, r, done, info = self.env.step(a)
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                s = deepcopy(s_)
                loss, info_backward = self.backward(sample)
                if render:
                    self.env.render()
                # the record part

                if verbose == 1 and self.step > self.learning_starts:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("episodes", self.episode)
                    logger.record_tabular("loss", loss)
                    logger.record_tabular("reward", r)
                    logger.record_tabular("Q", Q)
                    if self.forward_step_show_list:
                        for key in self.forward_step_show_list:
                            logger.record_tabular(key, info_forward[key])
                    if self.backward_step_show_list:
                        for key in self.backward_step_show_list:
                            logger.record_tabular(key, info_backward[key])
                    logger.dump_tabular()
                if record_ep_inter is not None:
                    if self.episode % record_ep_inter == 0:
                        kvs = {
                            "s": s,
                            "a": a,
                            "s_": s_,
                            "r": r,
                            "tr": done,
                            "ep": self.episode,
                            "step": self.step,
                            "ep_step": ep_cycle
                        }
                        self.csvwritter.writekvs(kvs)
                ep_r += r
                ep_q += Q
                ep_l += loss
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] += info_backward[key]
                if done:
                    ep_reward.append(ep_r)
                    ep_Q_value.append(ep_q)
                    ep_loss.append(ep_l)
                    mean_100ep_reward = round(np.mean(ep_reward[-101:-1]), 1)
                    if verbose == 2 and self.step > self.learning_starts:
                        logger.record_tabular("01.steps", self.step)
                        logger.record_tabular("02.episodes", self.episode)
                        logger.record_tabular("03.episode_reward",
                                              ep_reward[-1])
                        # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle)
                        logger.record_tabular("05.episode_loss", ep_l)
                        # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle)
                        # logger.record_tabular("07.episode_Q_value", ep_q)
                        logger.record_tabular("08.episode_Q_value_per_step",
                                              ep_q / ep_cycle)
                        # logger.record_tabular("09.mean 100 episode reward", mean_100ep_reward)
                        # logger.record_tabular("10.step_used", ep_cycle)
                        flag = 11
                        if self.forward_ep_show_list:
                            for key in self.forward_ep_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, info_forward[key])
                                flag += 1
                        if self.backward_ep_show_list:
                            for key in self.backward_ep_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, ep_show[key])
                                flag += 1
                        logger.dump_tabular()
            if np.mean(ep_r) > now_best_reward:
                self.save_weights(self.path)
                print("the best mean ep reward is ", np.mean(ep_r),
                      "the weight is saved")
                now_best_reward = np.mean(ep_r)

    def forward(self, observation):
        """Takes the an observation from the environment and returns the action to be taken next.
        If the policy is implemented by a neural network, this corresponds to a forward (inference) pass.

        # Argument
            observation (object): The current observation from the environment.

        # Returns
            The next action to be executed in the environment, its estimated Q value, and a dict of extra values to log.
        """
        raise NotImplementedError()

    def backward(self, sample):
        """Updates the agent after having executed the action returned by `forward`.
        If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.

        # Argument
            sample (dict): The latest transition, a dict with keys `s`, `a`, `s_`, `r`, `tr`.

        # Returns
            The loss and a dict of extra values to log.
        """
        raise NotImplementedError()

    def load_weights(self, filepath):
        """Loads the weights of an agent from an HDF5 file.

        # Arguments
            filepath (str): The path to the HDF5 file.
        """
        raise NotImplementedError()

    def save_weights(self, filepath, overwrite=False):
        """Saves the weights of an agent as an HDF5 file.

        # Arguments
            filepath (str): The path to where the weights should be saved.
            overwrite (boolean): If `False` and `filepath` already exists, raises an error.
        """
        raise NotImplementedError()

    def cuda(self):
        """
        Move the agent's networks to the GPU (CUDA).
        """
        raise NotImplementedError()

    def Imitation_Learning(self, step_time, data=None, policy=None, verbose=2):
        '''
        :param step_time:   number of imitation-learning steps to run
        :param data:        a list of transitions; each element is a dict with the 5 keys s, a, r, s_, tr,
                            e.g. sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy:      a callable that maps an observation to an action (used when no data is given)
        :return: None
        '''
        if data is not None and policy is not None:
            raise Exception(
                "Imitation learning needs exactly one guidance source: provide either data or a policy, not both."
            )

        if data is not None:
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            s = self.env.reset()
            for time in range(step_time):
                self.step += 1
                a = policy(s)
                s_, r, done, info = self.env.step(a)
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                loss = self.backward(sample)
                s = s_
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()
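Similarly, a minimal hypothetical value-based subclass, showing the return shapes that `interact` above expects from `forward` and `backward`. The class, network, attribute values, and checkpoint file name are illustrative assumptions; only the attribute names and the `(action, Q, info)` / `(loss, info)` contract come from the code above.

import numpy as np
import torch

class MyValueAgent(Agent_value_based):          # hypothetical subclass
    def __init__(self, env, q_net, path=None):
        super().__init__(path)
        # attributes read by interact() above
        self.env = env
        self.q_net = q_net
        self.learning_starts = 1000             # steps before logging/learning kicks in
        self.forward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_step_show_list = []
        self.backward_ep_show_list = []

    def forward(self, observation):
        obs = torch.from_numpy(np.asarray(observation, dtype=np.float32)).unsqueeze(0)
        with torch.no_grad():
            q_values = self.q_net(obs)
        action = int(q_values.argmax(dim=1).item())
        # interact() unpacks this as: a, Q, info_forward
        return action, float(q_values.max().item()), {}

    def backward(self, sample):
        # one update from the transition dict {"s", "a", "s_", "r", "tr"};
        # interact() unpacks the result as: loss, info_backward
        loss = 0.0
        return loss, {}

    def save_weights(self, filepath, overwrite=False):
        torch.save(self.q_net.state_dict(), filepath + "q_net.pth")

# usage (env / q_net assumed to exist):
# agent = MyValueAgent(env, q_net, path="./runs/")
# agent.train(max_step=50000, verbose=2)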