Example #1
    def Imitation_Learning(self, step_time, data=None, policy=None, verbose=2):
        '''
        Run imitation learning from either a recorded dataset or a guide policy.

        :param step_time: number of training steps to run
        :param data: a list in which each element is a transition dict with the
                     five keys s, a, r, s_, tr, i.e.
                     sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy: a callable mapping a state to an action, used to generate
                       transitions online when no dataset is given
        :param verbose: logging level; 1 logs the step count and loss after every update
        :return: None
        '''
        if data is not None and policy is not None:
            raise ValueError(
                "Imitation learning needs exactly one guidance source: "
                "pass either data or policy, not both."
            )

        if data is not None:
            # Supervised phase: fit the policy to the recorded transitions.
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            # Online phase: roll out the guide policy and learn from its actions.
            s = self.env.reset()
            for _ in range(step_time):
                self.step += 1
                a = policy(s)
                s_, r, done, info = self.env.step(a)
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                loss = self.backward(sample)
                s = s_
                if done:
                    # Start a fresh episode once the current one ends.
                    s = self.env.reset()
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()
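
For context, a minimal usage sketch of this method. The Agent class, the gym environment, expert_policy, and recorded_transitions below are hypothetical stand-ins, not part of the snippet above:

import gym

env = gym.make("CartPole-v1")   # assumed environment (old gym step API)
agent = Agent(env)              # hypothetical agent exposing Imitation_Learning

# Guide with a recorded dataset: one transition dict per step.
dataset = [{"s": s, "a": a, "s_": s_, "r": r, "tr": done}
           for (s, a, s_, r, done) in recorded_transitions]  # assumed source
agent.Imitation_Learning(step_time=len(dataset), data=dataset, verbose=1)

# Or guide online with an expert policy that maps a state to an action.
agent.Imitation_Learning(step_time=10000, policy=expert_policy, verbose=1)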
Example #2
    def Imitation_Learning(self, step_time, data=None, policy=None, learning_start=1000,
                           buffer_size=5000, value_training_round=10, value_training_fre=2500,
                           verbose=2, render=False):
        '''
        Run imitation learning from either a recorded dataset or a guide policy,
        combining behavior cloning with periodic value-function pretraining.

        :param step_time: number of training steps to run
        :param data: a list in which each element is a transition dict with the
                     five keys s, a, r, s_, tr, i.e.
                     sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy: a callable mapping the environment to an action, used to
                       generate transitions online when no dataset is given
        :param learning_start: number of warm-up steps collected before updates begin
        :param buffer_size: capacity of the replay buffer
        :param value_training_round: value-pretraining epochs per training phase
        :param value_training_fre: number of steps between value-pretraining phases
        :param verbose: logging level; 1 logs every update, 2 logs once per episode
        :param render: whether to render the environment while collecting samples
        :return: None
        '''
        if data is not None and policy is not None:
            raise ValueError("Imitation learning needs exactly one guidance source: "
                             "pass either data or policy, not both.")

        if data is not None:
            # Supervised phase: fit the policy to the recorded transitions.
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            # Online phase: fill a replay buffer from the guide policy and learn
            # by behavior cloning, with periodic value-function pretraining.
            buffer = ReplayMemory(buffer_size)
            s = self.env.reset()
            loss_BC = 0
            ep_step, ep_reward = 0, 0
            for _ in range(step_time):
                self.step += 1
                ep_step += 1
                a = policy(self.env)
                s_, r, done, info = self.env.step(a)
                ep_reward += r
                if render:
                    self.env.render()
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                buffer.push(sample)
                s = s_[:]
                if self.step > learning_start:
                    # Behavior-cloning update on a minibatch from the buffer.
                    sample_ = buffer.sample(self.batch_size)
                    loss = self.policy_behavior_clone(sample_)
                    if self.step % value_training_fre == 0:
                        # Periodically pretrain the value function on the most
                        # recent transitions, using GAE returns and advantages.
                        record_sample = {}
                        for key in buffer.memory.keys():
                            record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:]
                        record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"]))
                        returns, advants = get_gae(record_sample["r"], record_sample["tr"], record_sample["value"],
                                                   self.gamma, self.lam)
                        record_sample["advs"] = advants
                        record_sample["return"] = returns
                        for round_ in range(value_training_round):
                            loss_value = self.value_pretrain(record_sample, value_training_fre)
                            print(round_, loss_value)

                    if verbose == 1:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("loss", loss)
                        logger.record_tabular("rewrad",r)
                        logger.dumpkvs()
                    loss_BC += loss
                if done:
                    if verbose == 2:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("step_used", ep_step)
                        logger.record_tabular("loss", loss_BC/ep_step)
                        logger.record_tabular("ep_reward",ep_reward )
                        logger.dumpkvs()

                    s = self.env.reset()
                    loss_BC = 0
                    ep_step, ep_reward = 0, 0
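
For reference, get_gae is called above but not defined in the snippet. Below is a minimal sketch of the standard GAE computation it plausibly implements; the terminal-flag convention of "tr" and the tensor shapes are assumptions, not confirmed by the source:

import torch

def get_gae(rewards, dones, values, gamma, lam):
    # Generalized Advantage Estimation over one flat batch of transitions.
    # rewards, dones: 1-D float arrays; values: tensor of V(s_t) estimates.
    # Assumes dones[t] == 1 marks the last step of an episode (the "tr" flag).
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)
    values = values.detach().flatten()
    T = rewards.shape[0]
    advants = torch.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        next_value = values[t + 1] if t + 1 < T else 0.0
        mask = 1.0 - dones[t]              # stop bootstrapping across episode ends
        delta = rewards[t] + gamma * next_value * mask - values[t]
        gae = delta + gamma * lam * mask * gae
        advants[t] = gae
    returns = advants + values             # GAE return estimate: A_t + V(s_t)
    return returns, advants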