def collect_samples(pid, queue, env, policy, render, running_state,
                    custom_reward, min_batch_size):
    torch.randn(pid)  # perturb the global RNG state so each worker process draws different samples
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()
            state_tensor = FLOAT(state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)
            if custom_reward:
                reward = custom_reward(state, action)
            episode_reward += reward
            if running_state:
                next_state = running_state(next_state)
            mask = 0 if done else 1

            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1

            if done or num_steps >= min_batch_size:
                break

            state = next_state

        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
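# A minimal sketch of how collect_samples above might be fanned out across worker
# processes via its (pid, queue) arguments. Everything below is illustrative:
# the helper name collect_samples_parallel, the use of torch.multiprocessing, the
# sharing of a single env/policy across processes, and the hypothetical
# Memory.append merge are assumptions, not taken from this file.
import torch.multiprocessing as multiprocessing


def collect_samples_parallel(env, policy, running_state, min_batch_size, num_process=4):
    queue = multiprocessing.Queue()
    worker_batch_size = min_batch_size // num_process  # each worker collects a share of the batch

    # workers 1..num_process-1 push their (pid, memory, log) onto the queue
    workers = [
        multiprocessing.Process(target=collect_samples,
                                args=(pid, queue, env, policy, False, running_state,
                                      None, worker_batch_size))
        for pid in range(1, num_process)
    ]
    for worker in workers:
        worker.start()

    # pid 0 collects in the main process; queue=None makes collect_samples return directly
    memory, log = collect_samples(0, None, env, policy, False, running_state,
                                  None, worker_batch_size)

    for _ in workers:
        _, worker_memory, worker_log = queue.get()
        memory.append(worker_memory)              # hypothetical merge of two Memory buffers
        log['num_steps'] += worker_log['num_steps']

    for worker in workers:
        worker.join()

    return memory, log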
def collect_samples(pid, queue, env, policy, render, running_state, min_batch_size):
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float("inf")
    max_episode_reward = float("-inf")
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()
            state_tensor = tf.expand_dims(tf.convert_to_tensor(state, dtype=TDOUBLE), axis=0)
            action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0]
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            if running_state:
                next_state = running_state(next_state)
            mask = 0 if done else 1

            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1

            if done or num_steps >= min_batch_size:
                break

            state = next_state

        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log["num_steps"] = num_steps
    log["num_episodes"] = num_episodes
    log["total_reward"] = total_reward
    log["avg_reward"] = total_reward / num_episodes
    log["max_episode_reward"] = max_episode_reward
    log["min_episode_reward"] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
class SAC_Alpha:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_a=3e-4,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "SAC is only applicable to continuous environment !!!!"

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]
        self.target_entropy = -np.prod(self.env.action_space.shape)

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states,
                                 self.num_actions,
                                 max_action=self.action_high,
                                 use_sac=True).to(device)
        self.q_net_1 = QValue(num_states, self.num_actions).to(device)
        self.q_net_target_1 = QValue(num_states, self.num_actions).to(device)
        self.q_net_2 = QValue(num_states, self.num_actions).to(device)
        self.q_net_target_2 = QValue(num_states, self.num_actions).to(device)

        # self.alpha init
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_sac_alpha.p".format(self.env_id))
            self.policy_net, self.q_net_1, self.q_net_2, self.running_state \
                = pickle.load(open('{}/{}_sac_alpha.p'.format(self.model_path, self.env_id), "rb"))

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.rsample(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter + 1
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):
                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # action
                    action, _ = self.choose_action(state)

                next_state, reward, done, _ = self.env.step(action)
                next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for k in range(1, self.update_step + 1):
                        batch = self.memory.sample(self.batch_size)  # random sample batch
                        self.update(batch, k)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        alg_step_stats = sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha,
                                        self.q_net_target_1, self.q_net_target_2, self.optimizer_p,
                                        self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a,
                                        batch_state, batch_action, batch_reward, batch_next_state,
                                        batch_mask, self.gamma, self.polyak, self.target_entropy,
                                        k_iter % self.target_update_delay == 0)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.q_net_1, self.q_net_2, self.running_state),
                    open('{}/{}_sac_alpha.p'.format(save_path, self.env_id), 'wb'))
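# A minimal, self-contained sketch of the automatic temperature update that a
# function like sac_alpha_step is commonly expected to perform, following the
# standard SAC objective J(alpha) = E[-alpha * (log_pi(a|s) + target_entropy)].
# The function name and exact argument handling are illustrative assumptions,
# not the repository's actual sac_alpha_step.
import torch


def update_temperature(alpha, optimizer_a, log_prob, target_entropy):
    """One gradient step on the entropy temperature alpha.

    alpha:          scalar tensor with requires_grad=True (as created in _init_model)
    log_prob:       log pi(a|s) for actions freshly sampled from the current policy
    target_entropy: typically -prod(action_space.shape), as set in _init_model
    """
    alpha_loss = -(alpha * (log_prob + target_entropy).detach()).mean()
    optimizer_a.zero_grad()
    alpha_loss.backward()
    optimizer_a.step()
    return alpha_loss.item()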
class DQN:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 explore_size=10000,
                 step_per_iter=3000,
                 lr_q=1e-3,
                 gamma=0.99,
                 batch_size=128,
                 min_update_step=1000,
                 epsilon=0.90,  # probability of acting greedily in choose_action
                 update_target_gap=50,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.render = render
        self.num_process = num_process
        self.memory = Memory(size=memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.lr_q = lr_q
        self.gamma = gamma
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon
        self.seed = seed
        self.model_path = model_path
        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert not env_continuous, "DQN is only applicable to discrete action environments!"

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        # initialize networks
        self.value_net = QNet_dqn(num_states, self.num_actions).to(device)
        self.value_net_target = QNet_dqn(num_states, self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        # load model if necessary
        if self.model_path:
            print("Loading Saved Model {}_dqn.p".format(self.env_id))
            self.value_net, self.running_state = pickle.load(
                open('{}/{}_dqn.p'.format(self.model_path, self.env_id), "rb"))

        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer = optim.Adam(self.value_net.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """epsilon-greedy selection: greedy with probability epsilon, random otherwise"""
        state = FLOAT(state).unsqueeze(0).to(device)
        if np.random.uniform() <= self.epsilon:  # choose action greedily from the Q network
            with torch.no_grad():
                action = self.value_net.get_action(state)
            action = action.cpu().numpy()[0]
        else:  # choose a random action for exploration
            action = np.random.randint(0, self.num_actions)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):
                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # act epsilon-greedily from the value net
                    action = self.choose_action(state)

                next_state, reward, done, _ = self.env.step(action)
                next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step:
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch)

                if global_steps % self.update_target_gap == 0:
                    self.value_net_target.load_state_dict(self.value_net.state_dict())

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

    def update(self, batch):
        batch_state = FLOAT(batch.state).to(device)
        batch_action = LONG(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        alg_step_stats = dqn_step(self.value_net, self.optimizer, self.value_net_target, batch_state,
                                  batch_action, batch_reward, batch_next_state, batch_mask, self.gamma)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.value_net, self.running_state),
                    open('{}/{}_dqn.p'.format(save_path, self.env_id), 'wb'))
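# A minimal sketch of the temporal-difference update that a function like
# dqn_step is commonly expected to perform: regress Q(s, a) toward the target
# y = r + gamma * mask * max_a' Q_target(s', a'). Illustrative only; the actual
# dqn_step called above may differ. It assumes value_net(states) returns a
# (batch_size, num_actions) tensor and that rewards/masks are 1-D of length batch_size.
import torch
import torch.nn.functional as F


def dqn_td_update(value_net, value_net_target, optimizer, states, actions,
                  rewards, next_states, masks, gamma):
    with torch.no_grad():
        # bootstrap from the frozen target network; masks zero out terminal transitions
        next_q = value_net_target(next_states).max(dim=1)[0]
        target_q = rewards + gamma * masks * next_q

    # Q-values of the actions actually taken (actions is a LongTensor of indices)
    current_q = value_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    loss = F.mse_loss(current_q, target_q)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()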
class DDPG:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 action_noise=0.1,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "DDPG is only applicable to continuous environment !!!!"

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states, self.num_actions, self.action_high).to(device)
        self.policy_net_target = Policy(num_states, self.num_actions, self.action_high).to(device)

        self.value_net = Value(num_states, self.num_actions).to(device)
        self.value_net_target = Value(num_states, self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ddpg.p".format(self.env_id))
            self.policy_net, self.value_net, self.running_state = pickle.load(
                open('{}/{}_ddpg.p'.format(self.model_path, self.env_id), "rb"))

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            # state = self.running_state(state)
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            # state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):
                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # action with noise
                    action = self.choose_action(state, self.action_noise)

                next_state, reward, done, _ = self.env.step(action)
                # next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for _ in range(self.update_step):
                        batch = self.memory.sample(self.batch_size)  # random sample batch
                        self.update(batch)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

    def update(self, batch):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        alg_step_stats = ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
                                   self.value_net_target, self.optimizer_p, self.optimizer_v,
                                   batch_state, batch_action, batch_reward, batch_next_state,
                                   batch_mask, self.gamma, self.polyak)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ddpg.p'.format(save_path, self.env_id), 'wb'))
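# A minimal sketch of what a function like ddpg_step is commonly expected to do:
# a critic update toward the bootstrapped target r + gamma * mask * Q'(s', mu'(s')),
# a deterministic policy-gradient actor update, and polyak averaging of the target
# networks. Illustrative only; it assumes policy_net(s) returns the deterministic
# action, value_net(s, a) returns Q(s, a), and that rewards/masks share the shape
# of the Q outputs, which may differ from the interfaces used above.
import torch
import torch.nn.functional as F


def ddpg_update(policy_net, policy_net_target, value_net, value_net_target,
                optimizer_p, optimizer_v, states, actions, rewards,
                next_states, masks, gamma, polyak):
    # critic: regress Q(s, a) toward the bootstrapped target value
    with torch.no_grad():
        next_actions = policy_net_target(next_states)
        target_q = rewards + gamma * masks * value_net_target(next_states, next_actions)
    value_loss = F.mse_loss(value_net(states, actions), target_q)
    optimizer_v.zero_grad()
    value_loss.backward()
    optimizer_v.step()

    # actor: ascend Q(s, mu(s)) by minimizing its negation
    policy_loss = -value_net(states, policy_net(states)).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # polyak averaging of both target networks
    with torch.no_grad():
        for target, source in ((policy_net_target, policy_net), (value_net_target, value_net)):
            for p_target, p in zip(target.parameters(), source.parameters()):
                p_target.data.mul_(polyak).add_((1 - polyak) * p.data)

    return value_loss.item(), policy_loss.item()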