def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10, learning_start=1000,
             render=False, verbose=1, record_ep_inter=None):
    '''
    :param max_step: total number of environment steps to run
    :param max_ep_cycle: max steps per episode
    :param train_rollout: number of update passes per collected rollout
    :param learning_start: steps to collect before training updates begin
    :param render: render the environment if True
    :param verbose: 1 -> log each training update, 2 -> log a per-rollout summary
    :param record_ep_inter: record interaction data every `record_ep_inter` episodes
    :return: None
    '''
    self.render = render
    # ..................... initial records .....................#
    rollout = 0
    now_best_reward = -np.inf
    self.dist = make_pdtype(self.env.action_space, self.policy)
    sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle,
                                  record_ep_inter, lstm_enable=self.lstm_enable)
    while self.step < max_step:
        # collect one rollout of experience from the runner generator
        sample = next(sample_generate)
        logger.record_tabular("01.step", self.step)
        logger.record_tabular("02.episode", self.episode)
        logger.record_tabular("03.rollout", rollout)
        logger.record_tabular("04.rollout/ep", sample["ep_used"])
        logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"]))
        logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"]))
        logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"]))
        logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"]))
        logger.dump_tabular()
        csv_record(sample["ep_reward"], self.path)
        record_sample = sample["buffer"]
        rollout += 1
        if self.step > learning_start and self.learning:
            # accumulators for per-rollout diagnostics
            ep_show = {}
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] = 0
            rollout_loss = 0
            for time in range(train_rollout):
                loss, other_infor = self.update(record_sample)
                if verbose == 1:
                    logger.record_tabular("06.train_rollout", time)
                    logger.record_tabular("07.loss", loss)
                    flag = 10
                    if self.backward_step_show_list:
                        for key in self.backward_step_show_list:
                            logger.record_tabular(str(flag) + "." + key, other_infor[key])
                            flag += 1
                    logger.dump_tabular()
                rollout_loss += loss
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] += other_infor[key]
            if verbose == 2:
                logger.record_tabular("06.rollouts/loss", rollout_loss)
                logger.record_tabular("07.rollouts/episode_Q_value",
                                      torch.mean(torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy())
                # logger.record_tabular("05.episode_loss_per_step", rollout_loss / sample["step_used"])
                # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
                # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))
                flag = 10
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        logger.record_tabular(str(flag) + "." + key, ep_show[key])
                        flag += 1
                logger.dump_tabular()
        # keep the weights that achieved the best mean episode reward so far
        if np.mean(sample["ep_reward"]) > now_best_reward:
            self.save_weights(self.path)
            print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved")
            now_best_reward = np.mean(sample["ep_reward"])
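# A minimal, illustrative sketch (not part of the library) of the rollout dict that the
# `interact` loop above expects `self.runner(...)` to yield on each iteration. The keys
# are inferred from the accesses above ("buffer", "ep_used", "ep_step_used", "ep_reward",
# "ep_Q_value"); the `dummy_runner` name and the placeholder values are assumptions made
# purely for illustration.
import numpy as np

def dummy_runner():
    while True:
        yield {
            "buffer": {"r": np.zeros(128, dtype=np.float32)},   # per-step data passed to update()
            "ep_used": 4,                                        # episodes finished in this rollout
            "ep_step_used": [32, 32, 32, 32],                    # steps used by each finished episode
            "ep_reward": [0.0, 0.0, 0.0, 0.0],                   # total reward of each finished episode
            "ep_Q_value": [0.0, 0.0, 0.0, 0.0],                  # per-episode value estimates
        }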
def interact(self, max_step=50000, max_ep_cycle=2000, render=False, verbose=1, record_ep_inter=None):
    '''
    :param max_step: total number of environment steps to run
    :param max_ep_cycle: max steps per episode
    :param render: render the environment if True
    :param verbose: 1 -> log every step, 2 -> log every episode
    :param record_ep_inter: record interaction data every `record_ep_inter` episodes
    :return: None
    '''
    # ..................... initial records .....................#
    ep_reward = []
    ep_Q_value = []
    ep_loss = []
    now_best_reward = -np.inf
    while self.step < max_step:
        s = self.env.reset()
        # reset the episode records
        ep_r, ep_q, ep_l = 0, 0, 0
        # reset the RL flags
        ep_cycle, done = 0, 0
        ep_show = {}
        if self.backward_ep_show_list:
            for key in self.backward_ep_show_list:
                ep_show[key] = 0
        self.episode += 1
        while done == 0 and ep_cycle < max_ep_cycle:
            self.step += 1
            ep_cycle += 1
            # the interaction part: act, step the env, and store the transition
            a, Q, info_forward = self.forward(s)
            s_, r, done, info = self.env.step(a)
            sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
            s = deepcopy(s_)
            loss, info_backward = self.backward(sample)
            if render:
                self.env.render()
            # the record part
            if verbose == 1 and self.step > self.learning_starts:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("episodes", self.episode)
                logger.record_tabular("loss", loss)
                logger.record_tabular("reward", r)
                logger.record_tabular("Q", Q)
                if self.forward_step_show_list:
                    for key in self.forward_step_show_list:
                        logger.record_tabular(key, info_forward[key])
                if self.backward_step_show_list:
                    for key in self.backward_step_show_list:
                        logger.record_tabular(key, info_backward[key])
                logger.dump_tabular()
            if record_ep_inter is not None:
                if self.episode % record_ep_inter == 0:
                    kvs = {"s": s, "a": a, "s_": s_, "r": r, "tr": done,
                           "ep": self.episode, "step": self.step, "ep_step": ep_cycle}
                    self.csvwritter.writekvs(kvs)
            ep_r += r
            ep_q += Q
            ep_l += loss
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] += info_backward[key]
            if done:
                ep_reward.append(ep_r)
                ep_Q_value.append(ep_q)
                ep_loss.append(ep_l)
                mean_100ep_reward = round(np.mean(ep_reward[-101:-1]), 1)
                if verbose == 2 and self.step > self.learning_starts:
                    logger.record_tabular("01.steps", self.step)
                    logger.record_tabular("02.episodes", self.episode)
                    logger.record_tabular("03.episode_reward", ep_reward[-1])
                    # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle)
                    logger.record_tabular("05.episode_loss", ep_l)
                    # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle)
                    # logger.record_tabular("07.episode_Q_value", ep_q)
                    logger.record_tabular("08.episode_Q_value_per_step", ep_q / ep_cycle)
                    # logger.record_tabular("09.mean_100_episode_reward", mean_100ep_reward)
                    # logger.record_tabular("10.step_used", ep_cycle)
                    flag = 11
                    if self.forward_ep_show_list:
                        for key in self.forward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, info_forward[key])
                            flag += 1
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, ep_show[key])
                            flag += 1
                    logger.dump_tabular()
                # keep the weights that achieved the best episode reward so far
                if np.mean(ep_r) > now_best_reward:
                    self.save_weights(self.path)
                    print("the best mean ep reward is ", np.mean(ep_r), "the weight is saved")
                    now_best_reward = np.mean(ep_r)
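# A minimal, illustrative sketch (not part of the library) of the per-step contract the
# loop above assumes: forward(s) returns (action, Q_estimate, info_forward) and
# backward(sample) consumes the transition dict {"s", "a", "s_", "r", "tr"} and returns
# (loss, info_backward). The `RandomAgentSketch` name and its random behaviour are
# assumptions made purely for illustration.
import numpy as np

class RandomAgentSketch:
    forward_step_show_list = []
    backward_step_show_list = []

    def forward(self, s):
        a = np.random.randint(2)   # pick an arbitrary discrete action
        Q = 0.0                    # value estimate reported for logging
        info_forward = {}          # extra per-step diagnostics to log
        return a, Q, info_forward

    def backward(self, sample):
        # sample == {"s": ..., "a": ..., "s_": ..., "r": ..., "tr": ...}
        loss = 0.0                 # training loss reported for logging
        info_backward = {}         # extra per-step diagnostics to log
        return loss, info_backward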