class MDPTrainer(object):

    def __init__(self, args):
        # Make the MDP environment
        self.env = MDP(10, 5, 'test.npy', 'r_file.npy')
        # self.env = gym.make(args.env)
        # self.env.seed(args.seed)
        self.save_model = args.save_model

        # Number of iterations
        self.train_iter = args.train_iter
        self.test_iter = args.test_iter

        # The MDP is a fixed 10-state, 5-action environment
        args.discrete = True
        args.ac_dim = 5
        args.ob_dim = 10
        self.ob_dim = args.ob_dim
        self.ac_dim = args.ac_dim

        # Make agent
        self.agent = QLAgent(args)

        # Make expert
        # self.expert = QLAgent(args)
        # expert_path = os.path.join('.', 'Experts', 'QLearning', 'expert.npy')
        # self.expert.load(expert_path)

        # Offline buffer
        # self.buffer = OfflineBuffer(args.buffer_size)
        # self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.concat_rewards = args.concat_rewards

        # Replay buffer
        self.buffer = ReplayBuffer(args.buffer_size)

        # Training trajectory collect policy
        if args.collect_policy == 'random':
            self.collect_policy = RandomPolicy(self.env.na)
        else:
            self.collect_policy = self.agent.actor

        # Logger
        self.logger = Logger(args.logdir)

    def train(self):
        self.agent.training()
        for itr in range(self.train_iter):
            print('************ Iteration {} ************'.format(itr))
            paths, _ = utils.sample_trajectories(self.env, self.collect_policy,
                                                 self.batch_size, 50, True,
                                                 render_mode=())
            # Cast to plain int (np.int was removed in NumPy 1.24)
            for p in paths:
                p['observation'] = p['observation'].astype(int)
                p['action'] = p['action'].astype(int)
                p['next_observation'] = p['next_observation'].astype(int)
            self.buffer.add_trajectory(paths)
            observations, actions, unconcatenated_rews, next_observations, terminals = \
                self.buffer.sample_recent_data(self.batch_size, concat_rew=True)
            log = self.agent.train(observations, actions, unconcatenated_rews,
                                   next_observations, terminals)
            self.logging(itr, paths, log)

    def test(self):
        self.agent.testing()
        ep_rewards = []
        for itr in range(self.test_iter):
            path = utils.sample_trajectory(self.env, self.agent.actor, 50, True,
                                           render_mode=())
            rewards = path["reward"]
            ep_rewards.append(np.sum(rewards))

        if self.save_model:
            expert_dir = os.path.join('.', 'Experts', 'MDP')
            if not os.path.exists(expert_dir):
                os.makedirs(expert_dir)
            self.agent.save(expert_dir)

    def logging(self, itr, train_paths, agent_log):
        # testing() switches the agent to greedy evaluation and returns the
        # current epsilon so it can be restored after logging
        epsilon = self.agent.testing()
        eval_paths, _ = utils.sample_trajectories(self.env, self.collect_policy,
                                                  self.batch_size, 50, False,
                                                  render_mode=())
        # if itr % 20 == 0:
        #     _ = utils.sample_n_trajectories(self.env, self.collect_policy, 5, 200, False, render_mode=())

        train_returns = [path["reward"].sum() for path in train_paths]
        eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]
        train_ep_lens = [len(path["reward"]) for path in train_paths]
        eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

        logs = OrderedDict()
        logs["Eval_AverageReturn"] = np.mean(eval_returns)
        logs["Eval_StdReturn"] = np.std(eval_returns)
        logs["Eval_MaxReturn"] = np.max(eval_returns)
        logs["Eval_MinReturn"] = np.min(eval_returns)
        logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

        logs["Train_AverageReturn"] = np.mean(train_returns)
        logs["Train_StdReturn"] = np.std(train_returns)
        logs["Train_MaxReturn"] = np.max(train_returns)
        logs["Train_MinReturn"] = np.min(train_returns)
        logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

        if agent_log:
            logs.update(agent_log)
        # logs["Train_EnvstepsSoFar"] = self.total_envsteps
        # logs["TimeSinceStart"] = time.time() - self.start_time

        # Perform the logging
        for key, value in logs.items():
            print('{} : {}'.format(key, value))
            self.logger.log_scalar(value, key, itr)
        print('Done logging...\n\n')

        self.agent.training(epsilon)
        self.logger.flush()
class MDPOfflineTrainer(object):
    """Offline variant of MDPTrainer: builds a fixed dataset of transitions
    with generate_buffer() and trains from it, instead of sampling fresh
    trajectories into a replay buffer each iteration."""

    def __init__(self, args):
        # Make the MDP environment
        self.env = MDP(10, 5, 'test.npy', 'r_file.npy')
        # self.env = gym.make(args.env)
        # self.env.seed(args.seed)
        self.save_model = args.save_model

        # Number of iterations
        self.train_iter = args.train_iter
        self.test_iter = args.test_iter

        args.discrete = True
        args.ac_dim = 5
        args.ob_dim = 10
        self.ob_dim = args.ob_dim
        self.ac_dim = args.ac_dim

        # Make agent
        self.agent = QLAgent(args)

        # Make expert
        self.expert = QLAgent(args)
        expert_path = os.path.join('.', 'Experts', 'MDP', 'QLearning', 'expert.npy')
        self.expert.load(expert_path)

        # Offline buffer
        self.buffer = OfflineBuffer(args.buffer_size)
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.concat_rewards = args.concat_rewards

        # Replay buffer
        # self.buffer = ReplayBuffer(args.buffer_size)

        # Training trajectory collect policy
        if args.collect_policy == 'random':
            self.collect_policy = RandomPolicy(self.env.na)
        else:
            self.collect_policy = self.agent.actor

        # Logger
        self.logger = Logger(args.logdir)

    def generate_buffer(self):
        print("Generating offline dataset...")
        counter = 0
        # data[state][action] -> list of (reward, next_obs, terminal) transitions
        data = defaultdict(lambda: defaultdict(list))
        while counter < self.buffer_size:
            path = utils.sample_trajectory(self.env, self.collect_policy, 50,
                                           True, render_mode=())
            obs = path["observation"].astype(int)
            acs = path["action"].astype(int)
            rewards = path["reward"]
            next_obs = path["next_observation"].astype(int)
            terminals = path["terminal"]
            # assert len(obs) == len(acs) == len(rewards) == len(terminals) == len(next_obs)
            print(len(obs), len(acs), len(rewards), len(terminals), len(next_obs))
            for i in range(len(obs)):
                s = obs[i]
                a = acs[i]
                data[s][a].append((rewards[i], next_obs[i], terminals[i]))
            counter += len(obs)
        print("Offline dataset generated")
        self.data = data

    def train(self):
        self.agent.training()
        for itr in range(self.train_iter):
            print('************ Iteration {} ************'.format(itr))
            for s, v in self.data.items():
                for a, data in v.items():
                    # Sample a mini-batch of transitions for this (s, a) pair
                    rand_indices = np.random.permutation(len(data))[:self.batch_size]
                    rewards = []
                    next_obs = []
                    terminals = []
                    for i in rand_indices:
                        rewards.append(data[i][0])
                        next_obs.append(data[i][1])
                        terminals.append(data[i][2])
                    obs = np.full(len(rewards), s, dtype=int)
                    acs = np.full(len(rewards), a, dtype=int)
                    loss = self.agent.train(obs, acs,
                                            np.array(rewards, dtype=int),
                                            np.array(next_obs, dtype=int),
                                            terminals)
            self.logging(itr, rewards)

    def test(self):
        self.agent.testing()
        ep_rewards = []
        for itr in range(self.test_iter):
            path = utils.sample_trajectory(self.env, self.collect_policy, 50,
                                           True, render_mode=())
            rewards = path["reward"]
            ep_rewards.append(np.sum(rewards))
        print("Average Total Rewards: {}".format(np.mean(ep_rewards)))

        if self.save_model:
            expert_dir = os.path.join('.', 'Experts', 'Offline')
            self.agent.save(expert_dir)

    def logging(self, itr, rewards):
        print('Rewards: {}'.format(np.sum(rewards)))
        print('EpLen: {}'.format(len(rewards)))
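# Minimal sketch of the offline-dataset layout that generate_buffer() builds,
# on toy transitions (all values illustrative, not from the source):
def _offline_data_demo():
    from collections import defaultdict
    import numpy as np

    # data[state][action] -> list of (reward, next_obs, terminal) tuples
    data = defaultdict(lambda: defaultdict(list))
    data[0][1].append((0.0, 3, False))
    data[0][1].append((1.0, 9, True))
    data[3][2].append((0.0, 0, False))

    # Sample a mini-batch for one (s, a) cell, the way train() does
    s, a, batch_size = 0, 1, 2
    idx = np.random.permutation(len(data[s][a]))[:batch_size]
    rewards, next_obs, terminals = zip(*[data[s][a][i] for i in idx])
    print(rewards, next_obs, terminals)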
class OfflineTrainer(object):

    def __init__(self, args):
        # Make the gym environment
        self.env = gym.make(args.env)
        self.env.seed(args.seed)
        self.save_model = args.save_model

        # Number of iterations
        self.train_iter = args.train_iter
        self.test_iter = args.test_iter

        args.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        args.ac_dim = utils.get_space_dim(self.env.action_space)
        args.ob_dim = utils.get_space_dim(self.env.observation_space)
        self.ob_dim = args.ob_dim
        self.ac_dim = args.ac_dim

        # Make agent
        self.agent = QLAgent(args)

        # Make expert
        self.expert = QLAgent(args)
        expert_path = os.path.join('.', 'Experts', 'QLearning', 'expert.npy')
        self.expert.load(expert_path)

        # Offline buffer
        self.buffer = OfflineBuffer(args.buffer_size)
        self.batch_size = args.batch_size

        # Training trajectory collect policy
        if args.collect_policy == 'random':
            self.collect_policy = RandomPolicy(self.env.action_space)
        else:
            self.collect_policy = self.expert.actor

    def generate_buffer(self):
        print("Generating offline dataset...")
        while not self.buffer.full():
            obs, acs, rewards, next_obs, terminals, image_obs = utils.sample_trajectory(
                self.env, self.collect_policy, 200, True, render_mode=())
            self.buffer.add_trajectory(
                utils.Path(obs, image_obs, acs, rewards, next_obs, terminals))
        print("Offline dataset generated")

        # Plot how often each state appears in the buffer (8x8 grid layout)
        state_freq = np.zeros(self.ob_dim)
        for i in range(len(self.buffer.obs)):
            state_freq[self.buffer.obs[i]] += 1
            if self.buffer.terminals[i]:
                state_freq[self.buffer.next_obs[i]] += 1
        state_freq = state_freq.reshape(8, 8)

        fig, ax = plt.subplots()
        im, cbar = heatmap(state_freq, np.arange(8), np.arange(8), ax=ax, cmap="YlGn")
        for i in range(8):
            for j in range(8):
                ax.text(j, i, "{:.0f}".format(state_freq[i, j]),
                        ha="center", va="center", color="black")
        # plt.show()
        ax.set_title("Offline Buffer State Frequency")
        plt.savefig('./state_freq_buffer_{}.png'.format(1))
        plt.close()

    def train(self):
        self.agent.training()
        for itr in range(self.train_iter):
            print('************ Iteration {} ************'.format(itr))
            obs, acs, rewards, next_obs, terminals = self.buffer.sample_random_data(self.batch_size)
            loss = self.agent.train(obs, acs, rewards, next_obs, terminals)
            self.logging(itr, rewards)

    def test(self):
        self.agent.testing()
        ep_rewards = []
        state_freq = np.zeros(self.ob_dim)
        for itr in range(self.test_iter):
            obs, acs, rewards, next_obs, terminals, image_obs = utils.sample_trajectory(
                self.env, self.agent.actor, 200, True, render_mode=())
            ep_rewards.append(np.sum(rewards))
            for i in range(len(obs)):
                state_freq[obs[i]] += 1
                if terminals[i]:
                    state_freq[next_obs[i]] += 1
        plt.bar(list(range(self.ob_dim)), state_freq, log=True)
        plt.savefig('./img2.png')
        plt.close()
        print("Average Total Rewards: {}".format(np.mean(ep_rewards)))

        if self.save_model:
            expert_dir = os.path.join('.', 'Experts', 'Offline')
            self.agent.save(expert_dir)

    def logging(self, itr, rewards):
        print('Rewards: {}'.format(np.sum(rewards)))
        print('EpLen: {}'.format(len(rewards)))
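# `heatmap` is called in generate_buffer() above but not defined in this
# section. A minimal stand-in compatible with the call
# heatmap(data, row_labels, col_labels, ax=ax, cmap="YlGn"), modeled on the
# annotated-heatmap helper from the Matplotlib gallery; the project's real
# helper may differ, so this sketch only applies if none is defined elsewhere:
def heatmap(data, row_labels, col_labels, ax=None, **kwargs):
    if ax is None:
        ax = plt.gca()
    im = ax.imshow(data, **kwargs)         # draw the grid of values
    cbar = ax.figure.colorbar(im, ax=ax)   # attach a colorbar
    ax.set_xticks(np.arange(data.shape[1]))
    ax.set_xticklabels(col_labels)
    ax.set_yticks(np.arange(data.shape[0]))
    ax.set_yticklabels(row_labels)
    return im, cbar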
class EDPTrainer(object):

    def __init__(self, args):
        # Make the gym environment
        self.env = gym.make(args.env)
        self.env.seed(args.seed)
        self.save_model = args.save_model

        # Number of iterations
        self.train_iter = args.train_iter
        self.test_iter = args.test_iter

        args.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        args.ac_dim = utils.get_space_dim(self.env.action_space)
        args.ob_dim = utils.get_space_dim(self.env.observation_space)
        self.ob_dim = args.ob_dim
        self.ac_dim = args.ac_dim

        # Make agent with a randomly initialised Q-table
        self.agent = QLAgent(args)
        self.agent.actor.q_table = np.random.rand(self.ob_dim, self.ac_dim).astype(np.float32)
        print(self.agent.actor.q_table)

        # Make expert
        self.expert = QLAgent(args)
        expert_path = os.path.join('.', 'Experts', 'QLearning', 'expert.npy')
        self.expert.load(expert_path)
        # self.agent.load(expert_path)

        # Offline buffer
        self.buffer = OfflineBuffer(args.buffer_size)
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size

        # Training trajectory collect policy
        if args.collect_policy == 'random':
            self.collect_policy = RandomPolicy(self.env.action_space)
        else:
            self.collect_policy = self.expert.actor

    def generate_buffer(self):
        print("Generating offline dataset...")
        counter = 0
        # data[state][action] -> list of (reward, next_obs, terminal) transitions
        data = defaultdict(lambda: defaultdict(list))
        while counter < self.buffer_size:
            path = utils.sample_trajectory(self.env, self.collect_policy, 200,
                                           True, render_mode=())
            obs = path["observation"]
            acs = path["action"]
            rewards = path["reward"]
            next_obs = path["next_observation"]
            terminals = path["terminal"]
            # assert len(obs) == len(acs) == len(rewards) == len(terminals) == len(next_obs)
            for i in range(len(obs)):
                s = obs[i]
                a = acs[i]
                data[s][a].append((rewards[i], next_obs[i], terminals[i]))
            counter += len(obs)
        print("Offline dataset generated")
        self.data = data

    def train(self):
        self.agent.training()
        for itr in range(self.train_iter):
            print('************ Iteration {} ************'.format(itr))

            # Monte-Carlo estimate of each state's value: roll out the current
            # policy inside the offline dataset and accumulate returns.
            values = defaultdict(lambda: 0)
            for s, v in self.data.items():
                for i in range(10000):
                    obs = []
                    rewards = []
                    ob = s
                    step = 0
                    total_rewards = 0
                    while True:
                        a = np.float32(self.agent.get_action(ob.astype(int)))
                        # Hard-coded action override for these states
                        if ob in [31.0, 39.0, 47.0, 55.0]:
                            a = 2.0
                        obs.append(ob)
                        if len(self.data[ob][a]) != 0:
                            # Step by sampling a recorded transition for (ob, a)
                            a_data = self.data[ob][a]
                            index = np.random.randint(len(a_data))
                            r, ob, done = a_data[index]
                            rewards.append(r)
                            total_rewards += r
                            if done or step > 300:
                                break
                            step += 1
                        else:
                            # No recorded transition for (ob, a): end the rollout
                            break
                    values[s] += total_rewards
            print(values)

            # Rebuild the Q-table from sampled one-step returns bootstrapped
            # with the rollout state values.
            q_values = np.zeros((self.ob_dim, self.ac_dim), dtype=np.float32)
            for s, v_data in self.data.items():
                for action, action_data in v_data.items():
                    rand_indices = np.random.permutation(len(action_data))[:self.batch_size]
                    r = []
                    for i in rand_indices:
                        val = 0 if action_data[i][2] else values[action_data[i][1]]
                        r.append(action_data[i][0] + val)
                    si, ai = int(s), int(action)
                    q_values[si][ai] = self.agent.actor.q_table[si][ai] if len(r) == 0 else np.mean(r)
            self.agent.actor.q_table = q_values
            self.agent.actor.epsilon = max(
                self.agent.actor.epsilon - self.agent.actor.e_decay_rate, 0.0)
            print(self.agent.actor.q_table)
            self.logging(itr, [])

    def test(self):
        self.agent.testing()
        ep_rewards = []
        for itr in range(self.test_iter):
            path = utils.sample_trajectory(self.env, self.agent.actor, 200, True,
                                           render_mode=())
            rewards = path["reward"]
            ep_rewards.append(np.sum(rewards))
        print("Average Total Rewards: {}".format(np.mean(ep_rewards)))

        if self.save_model:
            expert_dir = os.path.join('.', 'Experts', 'Offline')
            self.agent.save(expert_dir)

    def logging(self, itr, rewards):
        print('Rewards: {}'.format(np.sum(rewards)))
        print('EpLen: {}'.format(len(rewards)))
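# Sketch of the update EDPTrainer.train() applies, on toy numbers (all values
# illustrative, not from the source). For each (s, a) it averages sampled
# one-step returns bootstrapped with the rollout value estimates:
#
#     Q(s, a) = mean_i[ r_i + (0 if terminal_i else V(next_obs_i)) ]
#
def _edp_update_demo():
    import numpy as np
    values = {3: 5.0, 9: 0.0}                        # rollout estimates V(s')
    transitions = [(1.0, 3, False), (0.0, 9, True)]  # (reward, next_obs, terminal)
    returns = [r + (0 if done else values[s2]) for r, s2, done in transitions]
    q_sa = np.mean(returns)  # (6.0 + 0.0) / 2 = 3.0
    print(q_sa)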