def Imitation_Learning(self,
                       step_time,
                       data=None,
                       policy=None,
                       learning_start=1000,
                       buffer_size=5000,
                       value_training_round=10,
                       value_training_fre=2500,
                       verbose=2,
                       render=False):
    '''
    :param step_time: number of imitation-learning steps to run
    :param data: a list whose elements are dicts with the 5 keys s, a, r, s_, tr
                 sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
    :param policy: a callable expert policy that takes the env and returns an action
    :return: None
    '''
    if data is not None and policy is not None:
        raise Exception(
            "Imitation learning needs exactly one source of guidance; "
            "please pass either data or policy, not both.")

    if data is not None:
        for time in range(step_time):
            self.step += 1
            loss = self.backward(data[time])
            if verbose == 1:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("loss", loss)
                logger.dumpkvs()

    if policy is not None:
        buffer = ReplayMemory(buffer_size)
        s = self.env.reset()
        loss_BC = 0
        ep_step, ep_reward = 0, 0
        for _ in range(step_time):
            self.step += 1
            ep_step += 1
            a = policy(self.env)
            s_, r, done, info = self.env.step(a)
            ep_reward += r
            if render:
                self.env.render()
            sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
            buffer.push(sample)
            s = s_[:]
            if self.step > learning_start:
                sample_ = buffer.sample(self.batch_size)
                loss = self.policy_behavior_clone(sample_)
                if self.step % value_training_fre == 0:
                    # Pre-train the value function on the most recent transitions.
                    record_sample = {}
                    for key in buffer.memory.keys():
                        record_sample[key] = np.array(
                            buffer.memory[key]).astype(
                                np.float32)[-value_training_fre:]
                    record_sample["value"] = self.value.forward(
                        torch.from_numpy(record_sample["s"]))
                    returns, advants = get_gae(record_sample["r"],
                                               record_sample["tr"],
                                               record_sample["value"],
                                               self.gamma, self.lam)
                    record_sample["advs"] = advants
                    record_sample["return"] = returns
                    for round_ in range(value_training_round):
                        loss_value = self.value_pretrain(
                            record_sample, value_training_fre)
                        print(round_, loss_value)
                if verbose == 1:
                    logger.record_tabular("learning_steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.record_tabular("reward", r)
                    logger.dumpkvs()
                loss_BC += loss
            if done:
                if verbose == 2:
                    logger.record_tabular("learning_steps", self.step)
                    logger.record_tabular("step_used", ep_step)
                    logger.record_tabular("loss", loss_BC / ep_step)
                    logger.record_tabular("ep_reward", ep_reward)
                    logger.dumpkvs()
                s = self.env.reset()
                loss_BC = 0
                ep_step, ep_reward = 0, 0
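# A minimal sketch (not part of the original file) of how a `data` list for
# Imitation_Learning can be assembled. It relies only on the gym-style env API
# used above and on the 5-key sample layout described in the docstring;
# `collect_demo_data`, the `agent` in the usage comment, and the random-action
# "expert" are hypothetical stand-ins.
def collect_demo_data(env, expert_policy, n_steps=1000):
    """Roll out `expert_policy` and return a list of {"s","a","s_","r","tr"} dicts."""
    data = []
    s = env.reset()
    for _ in range(n_steps):
        a = expert_policy(env)
        s_, r, done, info = env.step(a)
        data.append({"s": s, "a": a, "s_": s_, "r": r, "tr": done})
        s = env.reset() if done else s_
    return data

# Usage sketch (hypothetical agent/env):
#   data = collect_demo_data(env, lambda e: e.action_space.sample())
#   agent.Imitation_Learning(step_time=len(data), data=data)          # offline mode
#   agent.Imitation_Learning(step_time=50000, policy=expert_policy)   # online mode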
def interact(self,
             max_step=50000,
             max_ep_cycle=2000,
             train_rollout=10,
             learning_start=1000,
             render=False,
             verbose=1,
             record_ep_inter=None):
    '''
    :param max_step: maximum number of environment steps for the whole run
    :param max_ep_cycle: maximum number of steps per episode
    :param train_rollout: number of training passes per collected rollout
    :param learning_start: steps to collect before updates begin
    :param render: whether to render the environment
    :param verbose: 1 logs every training rollout, 2 logs a summary per collected rollout
    :param record_ep_inter: interval (in episodes) at which interaction data is recorded
    :return: None
    '''
    self.render = render

    # ..................... initial records .....................
    rollout = 0
    now_best_reward = -np.inf

    self.dist = make_pdtype(self.env.action_space, self.policy)

    sample_generate = self.runner(self.sample_rollout, self.sample_ep,
                                  max_ep_cycle, record_ep_inter)
    while self.step < max_step:
        sample = next(sample_generate)
        returns, advants = get_gae(sample["buffer"]["r"],
                                   sample["buffer"]["tr"],
                                   sample["buffer"]["value"], self.gamma,
                                   self.lam)
        record_sample = sample["buffer"]
        record_sample["advs"] = advants.unsqueeze(1)
        record_sample["return"] = returns.unsqueeze(1)

        rollout += 1
        if self.step > learning_start and self.learning:
            ep_show = {}
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] = 0
            rollout_loss = 0
            for time in range(train_rollout):
                loss, other_infor = self.update(record_sample)
                if verbose == 1:
                    logger.record_tabular("1.train_rollout", time)
                    logger.record_tabular("2.loss", loss)
                    flag = 3
                    if self.backward_step_show_list:
                        for key in self.backward_step_show_list:
                            logger.record_tabular(
                                str(flag) + "." + key, other_infor[key])
                            flag += 1
                    logger.dump_tabular()
                rollout_loss += loss
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] += other_infor[key]
            if verbose == 2:
                logger.record_tabular("01.steps", self.step)
                logger.record_tabular("02.episodes", self.episode)
                logger.record_tabular("03.rollouts/num", rollout)
                logger.record_tabular("04.rollouts/mean_episode_reward",
                                      np.mean(sample["ep_reward"]))
                logger.record_tabular(
                    "05.rollouts/mean_step_reward",
                    torch.mean(
                        sample["buffer"]["r"]).cpu().detach().numpy())
                logger.record_tabular("06.rollouts/loss", rollout_loss)
                logger.record_tabular(
                    "07.rollouts/episode_Q_value",
                    torch.mean(torch.tensor(
                        sample["ep_Q_value"])).cpu().detach().numpy())
                logger.record_tabular("08.mean_ep_step_used",
                                      np.mean(sample["ep_step_used"]))
                flag = 10
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        logger.record_tabular(
                            str(flag) + "." + key, ep_show[key])
                        flag += 1
                logger.dump_tabular()
        if np.mean(sample["ep_reward"]) > now_best_reward:
            self.save_weights(self.path)
            print("the best mean ep reward is",
                  np.mean(sample["ep_reward"]), "the weight is saved")
            now_best_reward = np.mean(sample["ep_reward"])
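# A minimal, self-contained sketch (not part of the original file) of the
# generalized advantage estimation that `get_gae` is assumed to perform in
# `interact` and `update`: returns and advantages from 1-D tensors of rewards,
# terminal flags, and value estimates. The real helper may differ in
# bootstrapping details and tensor shapes; `torch` is assumed imported at
# module level, as it is for the code above.
def gae_sketch(rewards, dones, values, gamma=0.99, lam=0.95):
    T = rewards.shape[0]
    advants = torch.zeros(T)
    last_adv = 0.0
    for t in reversed(range(T)):
        # Bootstrap with the next value unless the episode ended at step t.
        next_value = values[t + 1] if t + 1 < T else 0.0
        next_nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        last_adv = delta + gamma * lam * next_nonterminal * last_adv
        advants[t] = last_adv
    returns = advants + values
    return returns, advants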
def update(self, sample):
    returns, advants = get_gae(sample["r"], sample["tr"], sample["value"],
                               self.gamma, self.lam)
    sample["advs"] = advants.unsqueeze(1)
    sample["return"] = returns.unsqueeze(1)

    # Build the cost signal from the env info dicts and compute its advantages.
    sample["cost"] = []
    for info in sample["info"]:
        sample["cost"].append(info["cost"])
    sample["cost_value"] = self.cost_value.forward(sample["s"])
    returns, advants = get_gae(sample["cost"], sample["tr"],
                               sample["cost_value"], self.gamma, self.lam)
    sample["cost_advs"] = advants.unsqueeze(1)
    sample["cost_return"] = returns.unsqueeze(1)

    step_len = len(sample["s"])
    if self.lstm_enable:
        # Split the batch at episode boundaries so hidden states stay consistent.
        flagin = [
            time for time in range(step_len) if sample["tr"][time] == 1
        ]
        time_round = len(flagin)
        array_index = []
        for train_time in range(int(time_round) - 1):
            array_index.append(
                range(flagin[train_time], flagin[train_time + 1]))
    else:
        # Split the batch into mini-batches, padding the last one from the front.
        time_round = np.ceil(step_len / self.batch_size)
        time_left = time_round * self.batch_size - step_len
        array = list(range(step_len)) + list(range(int(time_left)))
        array_index = []
        for train_time in range(int(time_round)):
            array_index.append(
                array[train_time * self.batch_size:(train_time + 1) *
                      self.batch_size])

    loss_re, pgloss_re, enloss_re, vfloss_re = [], [], [], []

    for key in sample.keys():
        temp = torch.stack(list(sample[key]), 0)
        if self.gpu:
            sample[key] = temp.cuda()
        else:
            sample[key] = temp

    for train_time in range(int(time_round)):
        index = array_index[train_time]
        training_s = sample["s"][index].detach()
        training_a = sample["a"][index].detach()
        training_r = sample["r"][index].detach()
        R = sample["return"][index].detach()
        old_value = sample["value"][index].detach()
        old_logp = sample["logp"][index].detach()
        advs = sample["advs"][index].detach()
        c_advs = sample["cost_advs"][index].detach()
        c_value = sample["cost_value"][index].detach()
        cost = sample["cost"][index].detach()

        # Calculate the loss:
        # total loss = policy gradient loss - entropy * entropy coefficient
        #              + value coefficient * value loss

        # The value loss (PPO-style clipped value objective).
        value_now = self.value.forward(training_s)
        value_clip = old_value + torch.clamp(
            value_now - old_value, min=-self.cliprange,
            max=self.cliprange)  # clipped value
        vf_loss1 = self.loss_cal(value_now, R)  # unclipped loss
        vf_loss2 = self.loss_cal(value_clip, R)  # clipped loss
        vf_loss = .5 * torch.max(vf_loss1, vf_loss2)

        # The clipped-surrogate policy gradient loss.
        outcome = self.policy.forward(training_s)
        new_policy = self.dist(outcome)
        new_logp = new_policy.log_prob(training_a)
        ratio = torch.exp(new_logp - old_logp)
        pg_loss1 = -advs * ratio
        pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange,
                                       1.0 + self.cliprange)
        pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()

        # The clipped-surrogate cost policy gradient loss.
        c_pg_loss1 = -c_advs * ratio
        c_pg_loss2 = -c_advs * torch.clamp(ratio, 1.0 - self.cliprange,
                                           1.0 + self.cliprange)
        c_pg_loss = .5 * torch.max(c_pg_loss1, c_pg_loss2).mean()

        # The entropy bonus and the combined (reported) loss.
        entropy = new_policy.entropy().mean()
        loss = pg_loss - self.ui * c_pg_loss - entropy * self.ent_coef \
            + vf_loss * self.vf_coef

        self.policy_model_optim.zero_grad()
        pg_loss.backward()
        self.policy_model_optim.step()

        # Extra value-function training steps on the same mini-batch.
        for _ in range(self.value_train_step):
            value_now = self.value.forward(training_s)
            value_clip = old_value + torch.clamp(
                value_now - old_value, min=-self.cliprange,
                max=self.cliprange)  # clipped value
            vf_loss1 = self.loss_cal(value_now, R)  # unclipped loss
            vf_loss2 = self.loss_cal(value_clip, R)  # clipped loss
            vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
            self.value_model_optim.zero_grad()
            vf_loss1.backward()
            self.value_model_optim.step()
        # Train the cost value function on the observed costs.
        cost_now = self.cost_value.forward(training_s)
        cost_vloss = self.loss_cal(cost_now, cost)
        self.cost_value_model_optim.zero_grad()
        cost_vloss.backward()
        self.cost_value_model_optim.step()

        loss_re.append(loss.cpu().detach().numpy())
        pgloss_re.append(pg_loss.cpu().detach().numpy())
        enloss_re.append(entropy.cpu().detach().numpy())
        vfloss_re.append(vf_loss1.cpu().detach().numpy())

    # Train the Lagrange multiplier ui on the observed costs.
    for c in sample["cost"]:
        ui_loss = self.ui * c
        self.ui_optim.zero_grad()
        ui_loss.backward()
        self.ui_optim.step()

    return np.sum(loss_re), {
        "pg_loss": np.sum(pgloss_re),
        "entropy": np.sum(enloss_re),
        "vf_loss": np.sum(vfloss_re)
    }
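# A compact, standalone sketch (not part of the original file) of the loss
# composition used in `update` above: a PPO clipped surrogate on the reward
# advantages, a second clipped surrogate on the cost advantages weighted by the
# Lagrange multiplier `ui`, an entropy bonus, and a clipped value loss.
# All arguments are plain tensors; coefficient names mirror the attributes used
# above but are passed explicitly, and constant scaling factors follow the
# standard PPO form, so they may differ slightly from the code above.
def lagrangian_ppo_loss(new_logp, old_logp, advs, cost_advs, value_now,
                        old_value, returns, entropy, ui,
                        cliprange=0.2, ent_coef=0.01, vf_coef=0.5):
    ratio = torch.exp(new_logp - old_logp)

    # Clipped surrogate on the reward advantages.
    pg_loss = torch.max(-advs * ratio,
                        -advs * torch.clamp(ratio, 1.0 - cliprange,
                                            1.0 + cliprange)).mean()

    # Clipped surrogate on the cost advantages (scaled by the multiplier ui).
    c_pg_loss = torch.max(-cost_advs * ratio,
                          -cost_advs * torch.clamp(ratio, 1.0 - cliprange,
                                                   1.0 + cliprange)).mean()

    # Clipped value loss, as in PPO2.
    value_clip = old_value + torch.clamp(value_now - old_value,
                                         -cliprange, cliprange)
    vf_loss = 0.5 * torch.max((value_now - returns) ** 2,
                              (value_clip - returns) ** 2).mean()

    return pg_loss - ui * c_pg_loss - ent_coef * entropy + vf_coef * vf_loss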