Example #1
    def Imitation_Learning(self,
                           step_time,
                           data=None,
                           policy=None,
                           learning_start=1000,
                           buffer_size=5000,
                           value_training_round=10,
                           value_training_fre=2500,
                           verbose=2,
                           render=False):
        '''
        :param step_time: number of imitation steps to run
        :param data: a list of demonstrations; each element is a dict with the
                     5 keys s, a, r, s_, tr, e.g.
                     sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy: an expert policy callable; given the environment it
                       returns the action to imitate
        :return: None
        '''
        if data is not None and policy is not None:
            raise Exception(
                "Imitation learning needs only one source of guidance: "
                "pass either data or policy, not both.")

        if data is not None:
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            buffer = ReplayMemory(buffer_size)
            s = self.env.reset()
            loss_BC = 0
            ep_step, ep_reward = 0, 0
            for _ in range(step_time):
                self.step += 1
                ep_step += 1
                a = policy(self.env)
                s_, r, done, info = self.env.step(a)
                ep_reward += r
                if render:
                    self.env.render()
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                buffer.push(sample)
                s = s_[:]
                if self.step > learning_start:
                    sample_ = buffer.sample(self.batch_size)
                    loss = self.policy_behavior_clone(sample_)
                    if self.step % value_training_fre == 0:
                        record_sample = {}
                        for key in buffer.memory.keys():
                            record_sample[key] = np.array(
                                buffer.memory[key]).astype(
                                    np.float32)[-value_training_fre:]
                        record_sample["value"] = self.value.forward(
                            torch.from_numpy(record_sample["s"]))
                        returns, advants = get_gae(record_sample["r"],
                                                   record_sample["tr"],
                                                   record_sample["value"],
                                                   self.gamma, self.lam)
                        record_sample["advs"] = advants
                        record_sample["return"] = returns
                        for round_ in range(value_training_round):
                            loss_value = self.value_pretrain(
                                record_sample, value_training_fre)
                            print(round_, loss_value)

                    if verbose == 1:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("loss", loss)
                        logger.record_tabular("rewrad", r)
                        logger.dumpkvs()
                    loss_BC += loss
                if done:
                    if verbose == 2:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("step_used", ep_step)
                        logger.record_tabular("loss", loss_BC / ep_step)
                        logger.record_tabular("ep_reward", ep_reward)
                        logger.dumpkvs()

                    s = self.env.reset()
                    loss_BC = 0
                    ep_step, ep_reward = 0, 0
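A minimal usage sketch for Example #1 (not part of the library): it assumes a gym-style env with the old 4-tuple step API, as used above, and a hypothetical make_agent() that returns an object exposing Imitation_Learning(). The random actions merely stand in for a real expert; the sketch shows both ways of guiding the agent, recorded demonstrations in the documented dict format or a scripted expert policy.

import gym

env = gym.make("CartPole-v1")          # hypothetical environment choice
agent = make_agent(env)                # hypothetical constructor for an agent
                                       # exposing Imitation_Learning()

# Option 1: guide with recorded demonstrations in the documented dict format.
data, s = [], env.reset()
for _ in range(200):
    a = env.action_space.sample()      # stand-in for an expert action
    s_, r, done, info = env.step(a)    # old gym API, matching the example above
    data.append({"s": s, "a": a, "s_": s_, "r": r, "tr": done})
    s = env.reset() if done else s_
agent.Imitation_Learning(step_time=len(data), data=data, verbose=1)

# Option 2: guide with a scripted expert policy instead of data.
def scripted_expert(env):
    return env.action_space.sample()   # replace with a real expert
agent.Imitation_Learning(step_time=5000, policy=scripted_expert, verbose=2)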
Example #2
    def interact(self,
                 max_step=50000,
                 max_ep_cycle=2000,
                 train_rollout=10,
                 learning_start=1000,
                 render=False,
                 verbose=1,
                 record_ep_inter=None):
        '''
        :param max_step: total number of environment steps to interact for
        :param max_ep_cycle: maximum number of steps per episode
        :param train_rollout: number of training updates per collected rollout
        :param learning_start: number of steps to collect before training starts
        :param render: whether to render the environment
        :param verbose: logging verbosity
                        verbose == 1  log every training update
                        verbose == 2  log a summary of every rollout
        :param record_ep_inter: controls recording of episode interaction data
        :return: None
        '''
        # if IL_time is not None:
        self.render = render

        # ..................... initial records .....................#
        rollout = 0
        now_best_reward = -np.inf

        self.dist = make_pdtype(self.env.action_space, self.policy)
        sample_generate = self.runner(self.sample_rollout, self.sample_ep,
                                      max_ep_cycle, record_ep_inter)
        while self.step < max_step:
            sample = next(sample_generate)

            returns, advants = get_gae(sample["buffer"]["r"],
                                       sample["buffer"]["tr"],
                                       sample["buffer"]["value"], self.gamma,
                                       self.lam)
            # record_sample = gae(sample["buffer"], sample["last_Q"], self.gamma, self.lam)
            record_sample = sample["buffer"]
            record_sample["advs"] = advants.unsqueeze(1)
            record_sample["return"] = returns.unsqueeze(1)
            rollout += 1

            if self.step > learning_start and self.learning:
                ep_show = {}
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] = 0
                rollout_loss = 0
                for time in range(train_rollout):
                    loss, other_infor = self.update(record_sample)
                    if verbose == 1:
                        logger.record_tabular("1.train_rollout", time)
                        logger.record_tabular("2.loss", loss)
                        flag = 3
                        if self.backward_step_show_list:
                            for key in self.backward_step_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, other_infor[key])
                                flag += 1
                        logger.dump_tabular()
                    rollout_loss += loss
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            ep_show[key] += other_infor[key]
                if verbose == 2:
                    logger.record_tabular("01.steps", self.step)
                    logger.record_tabular("02.episodes", self.episode)
                    logger.record_tabular("03.rollouts/num", rollout)
                    logger.record_tabular("04.rollouts/mean_episode_reward",
                                          np.mean(sample["ep_reward"]))
                    logger.record_tabular(
                        "05.rollouts/mean_step_reward",
                        torch.mean(
                            sample["buffer"]["r"]).cpu().detach().numpy())
                    logger.record_tabular("06.rollouts/loss", rollout_loss)
                    logger.record_tabular(
                        "07.rollouts/episode_Q_value",
                        torch.mean(torch.tensor(
                            sample["ep_Q_value"])).cpu().detach().numpy())
                    # logger.record_tabular("05.episode_loss_per_step", rollout_loss / samole["step_used"])
                    # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
                    # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))
                    logger.record_tabular("08.mean_ep_step_used",
                                          np.mean(sample["ep_step_used"]))
                    flag = 10
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            logger.record_tabular(
                                str(flag) + "." + key, ep_show[key])
                            flag += 1
                    logger.dump_tabular()
            mean_ep_reward = np.mean(sample["ep_reward"])
            if mean_ep_reward > now_best_reward:
                self.save_weights(self.path)
                print("new best mean episode reward:", mean_ep_reward,
                      "- weights saved")
                now_best_reward = mean_ep_reward
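Both interact() and Imitation_Learning() rely on get_gae(r, tr, value, gamma, lam) to turn a rollout into returns and advantages. Its implementation is not shown in these examples; the sketch below is one standard way such a helper can be written (generalized advantage estimation over a flat rollout with terminal flags, bootstrapping zero after the last step), not necessarily the library's own.

import torch

def gae_sketch(r, tr, value, gamma, lam):
    # r: rewards, tr: terminal flags (1.0 at episode ends), value: V(s_t),
    # all tensors of length T (extra trailing dimensions are flattened away)
    r, tr, value = r.float().flatten(), tr.float().flatten(), value.float().flatten()
    T = r.shape[0]
    advants = torch.zeros(T)
    last_gae = torch.tensor(0.0)
    for t in reversed(range(T)):
        next_nonterminal = 1.0 - tr[t]
        next_value = value[t + 1] if t + 1 < T else torch.tensor(0.0)
        # TD residual, then the exponentially weighted GAE recursion
        delta = r[t] + gamma * next_value * next_nonterminal - value[t]
        last_gae = delta + gamma * lam * next_nonterminal * last_gae
        advants[t] = last_gae
    returns = advants + value
    return returns, advants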
Example #3
    def update(self, sample):

        returns, advants = get_gae(sample["r"], sample["tr"], sample["value"],
                                   self.gamma, self.lam)
        sample["advs"] = advants.unsqueeze(1)
        sample["return"] = returns.unsqueeze(1)

        sample["cost"] = []
        for info in sample["info"]:
            sample["cost"].append(info["cost"])

        sample["cost_value"] = self.cost_value.forward(sample["s"])

        returns, advants = get_gae(sample["cost"], sample["tr"],
                                   sample["cost_value"], self.gamma, self.lam)
        sample["cost_advs"] = advants.unsqueeze(1)
        sample["cost_return"] = returns.unsqueeze(1)

        step_len = len(sample["s"])
        if self.lstm_enable:
            flagin = [
                time for time in range(step_len) if sample["tr"][time] == 1
            ]
            time_round = len(flagin)
            array_index = []
            for train_time in range(int(time_round) - 1):
                array_index.append(
                    range(flagin[train_time], flagin[train_time + 1]))
        else:
            time_round = np.ceil(step_len / self.batch_size)
            time_left = time_round * self.batch_size - step_len
            array = list(range(step_len)) + list(range(int(time_left)))
            array_index = []
            for train_time in range(int(time_round)):
                array_index.append(
                    array[train_time * self.batch_size:(train_time + 1) *
                          self.batch_size])
        loss_re, pgloss_re, enloss_re, vfloss_re = [], [], [], []

        # move every numeric entry of the sample onto the training device
        for key in sample.keys():
            entries = list(sample[key])
            if torch.is_tensor(entries[0]):
                temp = torch.stack(entries, 0)
            elif isinstance(entries[0], (int, float, np.number)):
                temp = torch.tensor(entries, dtype=torch.float32)
            else:
                continue  # leave non-numeric entries (e.g. the raw info dicts) as-is
            sample[key] = temp.cuda() if self.gpu else temp
        for train_time in range(len(array_index)):
            index = array_index[train_time]
            # for index in range(step_len):
            training_s = sample["s"][index].detach()
            training_a = sample["a"][index].detach()
            training_r = sample["r"][index].detach()
            R = sample["return"][index].detach()
            old_value = sample["value"][index].detach()
            old_logp = sample["logp"][index].detach()
            advs = sample["advs"][index].detach()
            c_advs = sample["cost_advs"][index].detach()
            c_value = sample["cost_value"][index].detach()
            cost = sample["cost"][index].detach()

            " CALCULATE THE LOSS"
            " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"

            " the value loss"
            value_now = self.value.forward(training_s)
            # value loss
            value_clip = old_value + torch.clamp(
                old_value - value_now, min=-self.cliprange,
                max=self.cliprange)  # Clipped value
            vf_loss1 = self.loss_cal(value_now, R)  # Unclipped loss
            vf_loss2 = self.loss_cal(value_clip, R)  # clipped loss
            vf_loss = .5 * torch.max(vf_loss1, vf_loss2)

            # policy gradient loss: PPO clipped surrogate on the reward advantages
            outcome = self.policy.forward(training_s)
            new_policy = self.dist(outcome)
            new_logp = new_policy.log_prob(training_a)
            ratio = torch.exp(new_logp - old_logp)
            pg_loss1 = -advs * ratio
            pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange,
                                           1.0 + self.cliprange)
            pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()

            # the same clipped surrogate on the cost advantages (constraint term)
            c_pg_loss1 = -c_advs * ratio
            c_pg_loss2 = -c_advs * torch.clamp(ratio, 1.0 - self.cliprange,
                                               1.0 + self.cliprange)
            c_pg_loss = .5 * torch.max(c_pg_loss1, c_pg_loss2).mean()

            # entropy
            entropy = new_policy.entropy().mean()
            loss = pg_loss - self.ui * c_pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

            self.policy_model_optim.zero_grad()
            pg_loss.backward()
            self.policy_model_optim.step()
            for _ in range(self.value_train_step):
                value_now = self.value.forward(training_s)
                # value loss with the same clipping as above
                value_clip = old_value + torch.clamp(
                    value_now - old_value,
                    min=-self.cliprange,
                    max=self.cliprange)
                vf_loss1 = self.loss_cal(value_now, R)  # unclipped loss
                vf_loss2 = self.loss_cal(value_clip, R)  # clipped loss
                vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
                self.value_model_optim.zero_grad()
                vf_loss.backward()
                self.value_model_optim.step()

                cost_now = self.cost_value.forward(training_s)
                cost_vloss = self.loss_cal(cost_now, cost)

                self.cost_value_model_optim.zero_grad()
                cost_vloss.backward()
                self.cost_value_model_optim.step()

            loss_re.append(loss.cpu().detach().numpy())
            pgloss_re.append(pg_loss.cpu().detach().numpy())
            enloss_re.append(entropy.cpu().detach().numpy())
            vfloss_re.append(vf_loss1.cpu().detach().numpy())
        "training the weights ui"
        for i in sample["cost"]:
            cost = self.ui * sample["cost"]
            self.ui_optim.zero_grad()
            cost.backward()
            self.ui_optim.step()

        return np.sum(loss_re), {
            "pg_loss": np.sum(pgloss_re),
            "entropy": np.sum(enloss_re),
            "vf_loss": np.sum(vfloss_re)
        }
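At the heart of update() in Example #3 are three clipped objectives: the PPO surrogate on the reward advantages, the same surrogate on the cost advantages weighted by the Lagrange multiplier ui, and a clipped value loss. The standalone sketch below reproduces just that loss arithmetic on plain tensors so it can be checked in isolation; the function name is illustrative, and squared error stands in for self.loss_cal.

import torch

def ppo_lagrangian_loss(new_logp, old_logp, advs, c_advs, value_now, old_value,
                        returns, entropy, cliprange, ui, ent_coef, vf_coef):
    # probability ratio between the updated policy and the behaviour policy
    ratio = torch.exp(new_logp - old_logp)

    # PPO clipped surrogate on the reward advantages
    pg_loss = .5 * torch.max(
        -advs * ratio,
        -advs * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)).mean()

    # the same clipped surrogate on the cost advantages (constraint term)
    c_pg_loss = .5 * torch.max(
        -c_advs * ratio,
        -c_advs * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)).mean()

    # clipped value loss: the new prediction may move at most cliprange
    value_clip = old_value + torch.clamp(value_now - old_value,
                                         -cliprange, cliprange)
    vf_loss = .5 * torch.max((value_now - returns) ** 2,
                             (value_clip - returns) ** 2).mean()

    # combined exactly as in update(): reward surrogate, cost term weighted by
    # the multiplier ui, entropy bonus, and the value loss
    return pg_loss - ui * c_pg_loss - entropy * ent_coef + vf_loss * vf_coef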