def create_model(self):
    pi = policy(model_path=self.model_path,
                have_model=False,
                need_log=False,
                action_space=self.act_dim,
                state_space=self.obs_dim)
    pi.save_model()
def train(batch):
    pi = policy(have_model=True,
                need_log=True,
                action_space=act_dim,
                state_space=obs_dim)
    pi.train(batch)
    pi.save_model()
def train(self, batch):
    pi = policy(model_path=self.model_path,
                have_model=True,
                need_log=True,
                action_space=self.act_dim,
                state_space=self.obs_dim)
    pi.train(batch)
    pi.save_model()
def evaluateWorker(perform_type: str, eva_num, share_lock):
    """
    Evaluate a policy for up to 200 episodes (counted across processes).

    :param perform_type: "expert" to evaluate the trained policy, anything else
                         to load a previously saved policy
    :param eva_num: shared counter of completed evaluation episodes
    :param share_lock: lock guarding updates to eva_num
    :return: batch containing the summed reward of each episode
    """
    if perform_type == "expert":
        pi = policy(have_model=True,
                    need_log=False,
                    action_space=act_dim,
                    state_space=obs_dim)
    else:
        pi = existing_pi(model_path='./Documents/success/%s/policy' % env_name.lower())
    batch = {"sum_reward": []}
    game_num = 0
    while True:
        game_num += 1
        s = env.reset()
        sum_reward = 0
        while True:
            if eva_num.value > 200:
                return batch
            a = pi.get_means(s)
            s_, r, done, _ = env.step(a * high[0])
            sum_reward += r
            s = s_
            if done:
                batch["sum_reward"].append(sum_reward)
                share_lock.acquire()
                eva_num.value += 1
                share_lock.release()
                break
def see_performance_worker(self):
    pi = policy(model_path=self.model_path,
                have_model=True,
                need_log=False,
                action_space=self.act_dim,
                state_space=self.obs_dim)
    batch = {}
    for key in self.rl_keys:
        batch[key] = []
    # Roll out three evaluation trajectories using the deterministic (mean) action.
    traj = 0
    while traj < 3:
        s = self.env.reset()
        traj_batch = {"reward": []}
        step = 0
        while True:
            a = pi.get_means(s)
            s_, r, done, info = self.env.step(a * self.high[0])
            traj_batch["reward"].append(r)
            s = s_
            step += 1
            if done:
                batch["sum_reward"].append(sum(traj_batch["reward"]))
                traj += 1
                break
    return batch
def worker(points_num, share_lock):
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    high = env.action_space.high
    pi = policy(have_model=True,
                need_log=False,
                action_space=act_dim,
                state_space=obs_dim)
    batch = {}
    for key in rl_keys:
        batch[key] = []
    point = 0
    # Collect trajectories until the shared transition counter reaches the target.
    while points_num.value <= args.points_num:
        s = env.reset()
        traj_batch = {
            "state": [],
            "actions": [],
            "reward": [],
            "gae": [],
            "value": []
        }
        step = 0
        while True:
            if points_num.value > args.points_num:
                break
            ret = pi.get_action(s)
            a = ret["actions"]
            s_, r, done, info = env.step(a * high[0])
            r *= (1 - pi.gamma)
            traj_batch["state"].append(s)
            traj_batch["reward"].append(r)
            traj_batch["actions"].append(ret["actions"])
            traj_batch["value"].append(ret["value"])
            s = s_
            step += 1
            point += 1
            if done:
                # Bootstrap the value of the final state, then compute returns and GAE.
                v = pi.get_value(s_)
                real_next = traj_batch["value"][1:] + [np.array(v)]
                ret = pi.get_return(traj_batch["reward"])
                gae = pi.get_gaes(traj_batch["reward"], traj_batch["value"], real_next)
                batch["state"].append(traj_batch["state"])
                batch["reward"].append(traj_batch["reward"])
                batch["action"].append(traj_batch["actions"])
                batch["gae"].append(gae)
                batch["return"].append(ret)
                batch["trajectory_len"].append(len(traj_batch["state"]))
                # batch["sum_reward"].append(sum(traj_batch["reward"]))
                share_lock.acquire()
                points_num.value += len(traj_batch["state"])
                share_lock.release()
                break
    # Additionally run three deterministic evaluation trajectories.
    traj = 0
    while traj < 3:
        s = env.reset()
        traj_batch = {"reward": []}
        step = 0
        while True:
            a = pi.get_means(s)
            s_, r, done, info = env.step(a * high[0])
            traj_batch["reward"].append(r)
            s = s_
            step += 1
            if done:
                batch["sum_reward"].append(sum(traj_batch["reward"]))
                traj += 1
                break
    return batch
def create_model():
    pi = policy(have_model=False,
                need_log=False,
                action_space=act_dim,
                state_space=obs_dim)
    pi.save_model()
def worker(self, points_num, share_lock):
    pi = policy(model_path=self.model_path,
                have_model=True,
                need_log=False,
                action_space=self.act_dim,
                state_space=self.obs_dim)
    batch = {}
    for key in self.rl_keys:
        batch[key] = []
    point = 0
    while True:
        s = self.env.reset()
        traj_batch = {
            "state": [],
            "actions": [],
            "reward": [],
            "gae": [],
            "value": []
        }
        step = 0
        while True:
            # Stop once the shared transition counter reaches the target.
            if points_num.value > args.points_num:
                return batch
            ret = pi.get_action(s)
            a = ret["actions"]
            s_, r, done, info = self.env.step(a * self.high[0])
            r *= (1 - pi.gamma)
            traj_batch["state"].append(s)
            traj_batch["reward"].append(r)
            traj_batch["actions"].append(ret["actions"])
            traj_batch["value"].append(ret["value"])
            s = s_
            step += 1
            point += 1
            if done:
                # Bootstrap the value of the final state, then compute returns and GAE.
                v = pi.get_value(s_)
                real_next = traj_batch["value"][1:] + [np.array(v)]
                ret = pi.get_return(traj_batch["reward"])
                gae = pi.get_gaes(traj_batch["reward"], traj_batch["value"], real_next)
                batch["state"].append(traj_batch["state"])
                batch["reward"].append(traj_batch["reward"])
                batch["action"].append(traj_batch["actions"])
                batch["gae"].append(gae)
                batch["return"].append(ret)
                batch["trajectory_len"].append(len(traj_batch["state"]))
                batch["sum_reward"].append(sum(traj_batch["reward"]))
                share_lock.acquire()
                points_num.value += len(traj_batch["state"])
                share_lock.release()
                break
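# A minimal launch sketch for the worker method above: it is an assumption, not
# part of this repo, showing how the shared counter (points_num) and lock
# (share_lock) might be created and handed to worker processes.  `trainer`
# stands for an instance of the surrounding class and is assumed to be
# picklable; in practice the returned batches need a Queue to reach the parent.
import multiprocessing as mp


def run_worker(trainer, points_num, share_lock, out_queue):
    # Wrap the worker so its returned batch can be sent back to the parent.
    out_queue.put(trainer.worker(points_num, share_lock))


def launch_workers(trainer, process_num=4):
    points_num = mp.Value("i", 0)   # shared count of collected transitions
    share_lock = mp.Lock()          # guards updates to points_num
    out_queue = mp.Queue()
    procs = [
        mp.Process(target=run_worker,
                   args=(trainer, points_num, share_lock, out_queue))
        for _ in range(process_num)
    ]
    for p in procs:
        p.start()
    # Drain the queue before joining to avoid blocking on large batches.
    batches = [out_queue.get() for _ in procs]
    for p in procs:
        p.join()
    return batches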
def worker():
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    high = env.action_space.high
    pi = policy(have_model=True,
                need_log=False,
                action_space=act_dim,
                state_space=obs_dim)
    batch = {}
    for key in rl_keys:
        batch[key] = []
    point = 0
    # Each process collects an equal share of the requested transitions.
    points_per_process = args.points_num / args.process_num
    while point < points_per_process:
        s = env.reset()
        traj_batch = {
            "state": [],
            "actions": [],
            "reward": [],
            "gae": [],
            "value": [],
        }
        step = 0
        while True:
            ret = pi.get_action(s)
            a = ret["actions"]
            s_, r, done, info = env.step(a * high[0])
            r *= (1 - pi.gamma)
            traj_batch["state"].append(s)
            traj_batch["reward"].append(r)
            traj_batch["actions"].append(ret["actions"])
            traj_batch["value"].append(ret["value"])
            s = s_
            step += 1
            point += 1
            if done:
                v = pi.get_value(s_)
                real_next = traj_batch["value"][1:] + [np.array(v)]
                ret = pi.get_return(traj_batch["reward"])
                gae = pi.get_gaes(traj_batch["reward"], traj_batch["value"], real_next)
                batch["state"].append(traj_batch["state"])
                batch["reward"].append(traj_batch["reward"])
                batch["action"].append(traj_batch["actions"])
                batch["gae"].append(gae)
                batch["return"].append(ret)
                batch["sum_reward"].append(sum(traj_batch["reward"]))
                batch["trajectory_len"].append(len(traj_batch["state"]))
                break
    return batch
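# A standalone sketch of what pi.get_gaes / pi.get_return appear to compute,
# matching the call pattern get_gaes(rewards, values, next_values) used in the
# workers above.  This is an assumption (the policy class is not shown here):
# it implements the standard GAE(lambda) recursion and discounted return-to-go,
# with gamma and lam defaults chosen for illustration.
import numpy as np


def get_gaes(rewards, values, next_values, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = [r + gamma * nv - v for r, v, nv in zip(rewards, values, next_values)]
    gaes = np.zeros(len(deltas), dtype=np.float64)
    running = 0.0
    # Accumulate backwards: A_t = delta_t + gamma * lambda * A_{t+1}
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        gaes[t] = running
    return gaes


def get_return(rewards, gamma=0.99):
    # Discounted return-to-go for each step of the trajectory.
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns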