def gen_multi_envs(n_envs, policy):
    def make_env():
        def _thunk():
            env = gen_env(policy)
            return env
        return _thunk

    envs = [make_env() for i in range(n_envs)]
    envs = SubprocVecEnv(envs)
    return envs
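# A minimal sketch of the thunk pattern SubprocVecEnv relies on: the constructor takes a
# list of zero-argument callables, and each worker process calls its callable to build its
# own env. Assumes gym and the same multiprocessing_env.SubprocVecEnv used throughout this
# collection; the env id and worker count are just examples.
import gym
from multiprocessing_env import SubprocVecEnv

def make_env_factory(env_id):
    def _thunk():
        return gym.make(env_id)
    return _thunk

if __name__ == "__main__":
    vec_env = SubprocVecEnv([make_env_factory("CartPole-v0") for _ in range(4)])
    states = vec_env.reset()                                   # shape: (4, obs_dim)
    actions = [vec_env.action_space.sample() for _ in range(4)]  # one action per worker
    next_states, rewards, dones, infos = vec_env.step(actions)
    vec_env.close()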
def main(): pixels = ( (0.0, 1.0, 1.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 0.0), (0.0, 0.0, 0.0), (1.0, 0.0, 0.0), ) pixel_to_categorical = {pix: i for i, pix in enumerate(pixels)} num_pixels = len(pixels) #For each mode in MiniPacman there are different rewards mode_rewards = { "regular": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "avoid": [0.1, -0.1, -5, -10, -20], "hunt": [0, 1, 10, -20], "ambush": [0, -0.1, 10, -20], "rush": [0, -0.1, 9.9] } reward_to_categorical = { mode: {reward: i for i, reward in enumerate(mode_rewards[mode])} for mode in mode_rewards.keys() } mode = "regular" num_envs = 16 def make_env(): def _thunk(): env = MiniPacman(mode, 1000) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) state_shape = envs.observation_space.shape num_actions = envs.action_space.n env_model = EnvModel(envs.observation_space.shape, num_pixels, len(mode_rewards["regular"])) actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(env_model.parameters())
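# A hedged sketch of how the pixel/reward lookup tables above are typically used when the
# environment model is trained with nn.CrossEntropyLoss: every pixel of a target frame and
# every reward are mapped to categorical class indices. Function and variable names here
# are illustrative, not taken from a specific file.
import numpy as np

def frame_to_pixel_targets(frame, pixel_to_categorical):
    # frame: float array of shape (3, H, W) whose values come from the palette above
    c, h, w = frame.shape
    flat = frame.reshape(c, h * w).T   # (H*W, 3) rows of RGB triples
    return np.array([pixel_to_categorical[tuple(float(v) for v in px)] for px in flat],
                    dtype=np.int64)

def reward_to_target(mode, reward, reward_to_categorical):
    return reward_to_categorical[mode][reward]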
def main():
    num_envs = 16
    env_name = "CartPole-v0"

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    STATE_SIZE = env.observation_space.shape[0]
    N_ACTIONS = env.action_space.n

    agent = Agent(STATE_SIZE, N_ACTIONS)
    trainer = Trainer(envs, agent, lr=3e-4)
    trainer.train(epochs=10000, max_steps=5, test_every=50)
def main():
    num_envs = 16
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    # Hyper params:
    hidden_size = 256
    lr = 3e-4
    num_steps = 5

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 20000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # each parallel worker runs num_steps steps (n-step sampling)
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            # record the per-worker quantities for these num_steps steps
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % 100 == 0:
                test_rewards.append(np.mean([test_env(model, env) for _ in range(10)]))
                plot(frame_idx, test_rewards)

        # bootstrap from the last state and update the shared network
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        # concatenate the values collected over the num_steps steps
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        # compute the mean losses
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
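# compute_returns is called above but not defined in this snippet. A minimal sketch of the
# usual n-step discounted return with done-masking (the default gamma is an assumption):
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        # zero out the bootstrap term whenever an episode ended at this step
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns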
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    distil_policy = ActorCritic(envs.observation_space.shape, envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1, state_shape, num_actions, num_rewards,
                                  env_model, distil_policy, full_rollout=full_rollout)

    actor_critic = I2A(state_shape, num_actions, num_rewards, 256, imagination,
                       full_rollout=full_rollout)

    # rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    # if USE_CUDA:
    #     env_model = env_model.cuda()
    #     distil_policy = distil_policy.cuda()
    #     actor_critic = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    # rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))
    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):
        for step in range(num_steps):
            # if USE_CUDA:
            #     current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            # if USE_CUDA:
            #     masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_loss = 0.01 * (F.softmax(logit, dim=1).detach() *
                              F.log_softmax(distil_logit, dim=1)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        # the distillation loss updates the distilled policy with its own optimizer
        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            # clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
actor_loss = - torch.min(surr1, surr2).mean() critic_loss = (return_ - value).pow(2).mean() loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy optimizer.zero_grad() loss.backward() optimizer.step() if __name__ == '__main__': logger = Logger('./log') env = ProstheticsEnv(False) envs = [make_env() for i in range(NUM_ENVS)] envs = SubprocVecEnv(envs) model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device) optimizer = optim.Adam(model.parameters(), lr=lr) frame_idx = 0 test_rewards = [] state = envs.reset() while frame_idx < max_frames: log_probs = [] values = [] states = [] actions = [] rewards = []
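# The fragment above is the tail of a PPO update. For context, a self-contained sketch of
# the clipped-surrogate objective it implements; the 0.5 and 0.001 coefficients mirror the
# snippet, while the clip_eps default is an assumption.
import torch

def ppo_loss(new_log_probs, old_log_probs, advantage, value, return_, entropy, clip_eps=0.2):
    ratio = (new_log_probs - old_log_probs).exp()
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    actor_loss = -torch.min(surr1, surr2).mean()
    critic_loss = (return_ - value).pow(2).mean()
    return 0.5 * critic_loss + actor_loss - 0.001 * entropy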
class PPO(object):
    """Main PPO class"""

    def __init__(self, args):
        """Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()

        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Initialize the actor critic class
        self.actor_critic = ActorCritic(self.args.nb_states,
                                        self.args.nb_actions,
                                        self.args.hidden_layer_size).to(self.device)

        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training, multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)

        # To validate the intermediate learning process, one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for Tensorboard to visualize the learning process during training
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []
        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)
        # self.delta = (self.args.lr - self.args.lr_end) / 1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()

                # Make sure the action is loaded to CPU (not GPU)
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))
                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1
                # self.scheduler()

                # Evaluate the training process and write data to tensorboard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    if self.args.play is False:
                        print("Mean reward: ",
                              np.round(np.mean(self.test_rewards[-101:-1]), 0))
                        if mean_100_reward < np.round(
                                np.mean(self.test_rewards[-101:-1]), 0):
                            mean_100_reward = np.round(
                                np.mean(self.test_rewards[-101:-1]), 0)
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss',
                                np.mean(self.loss[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-11:-1]),
                                frame_idx * self.args.num_envs)
                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() takes a tensor out of the graph, i.e. these operations are
            # ignored for gradient calculations
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values

            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private thunk function for feeding the SubprocVecEnv class
        def _trunk():
            env = self.args.env  # in this simple case the class TestEnv() is called (see openAI for more envs)
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Plain functions except that one can call them from an instance or the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield (states[rand_ids, :], actions[rand_ids, :],
                   log_probs[rand_ids, :], returns[rand_ids, :],
                   advantage[rand_ids, :])

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions,
                   log_probs, returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()
                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())

                # Important step:
                self.optimizer.zero_grad()
                # pdb.set_trace()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        pickle.dump(self.actor_critic.state_dict(), open(network_path, "wb"))

    def load_network(self, path):
        network_new = pickle.load(open(path, "rb"))
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path + '/mylog.log',
                                       mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
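# A hedged sketch of how the PPO class above is wired up: it pulls everything from a single
# args object, so any attribute container exposing these fields works. The values and the
# env placeholder are illustrative, not the project's real configuration.
from argparse import Namespace
import torch.optim as optim

args = Namespace(
    env=my_env,                     # placeholder: an env exposing seed()/set_scaling()/step()
    nb_states=4, nb_actions=1, hidden_layer_size=64,
    optimizer=optim.Adam, lr=3e-4, lr_end=1e-5,
    num_envs=8, seed=1, output_scaling=1.0,
    max_frames=1000000, nb_steps=128, entropy=0.0,
    gamma=0.99, tau=0.95, ppo_epochs=4, mini_batch_size=64, clip=0.2,
    grad_norm=0.5, vis=False, play=False)

agent = PPO(args)
agent.train()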
# https://github.com/openai/baselines/blob/f2729693253c0ef4d4086231d36e0a4307ec1cb3/baselines/acktr/utils.py num = (q_mu - p_mu)**2 + q_sigma**2 - p_sigma**2 den = 2 * (p_sigma**2) + 1e-8 kl = torch.mean(num/den + torch.log(p_sigma) - torch.log(q_sigma)) return kl def make_env(): def _thunk(): env = ActiveVisionDatasetEnv() return env return _thunk if __name__ == "__main__": num_envs =6 envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) state_shape = envs.observation_space.shape print("!!!state_shape:",state_shape) #a2c hyperparams: gamma = 0.99 entropy_coef = 0.01 value_loss_coef = 0.5 max_grad_norm = 0.5 num_steps = 10 num_frames = int(10e7) #rmsprop hyperparams: lr = 1e-4 eps = 1e-5 alpha = 0.99
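# For reference, the closed-form KL divergence between two univariate Gaussians is
# KL(p || q) = log(sigma_q / sigma_p) + (sigma_p^2 + (mu_p - mu_q)^2) / (2 sigma_q^2) - 1/2,
# which is what approximations like the one above build on. A small sanity-check sketch
# (function name is illustrative):
import torch

def gaussian_kl(p_mu, p_sigma, q_mu, q_sigma):
    return (torch.log(q_sigma) - torch.log(p_sigma)
            + (p_sigma ** 2 + (p_mu - q_mu) ** 2) / (2.0 * q_sigma ** 2)
            - 0.5).mean()

# KL of a distribution with itself should be ~0:
mu = torch.zeros(3)
sigma = torch.ones(3)
assert gaussian_kl(mu, sigma, mu, sigma).abs() < 1e-6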
def main():
    current_time = time.ctime().replace(":", "_")
    log_dir = "logs/PPO/{}".format(current_time)

    # tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # csv
    logfile_name = "{}/train_log.csv".format(log_dir)
    with open(logfile_name, 'w+', newline='') as f:
        csv_writer = csv.writer(f, delimiter=";")
        csv_writer.writerow([
            'update', 'running_loss', 'Reward', 'loss', 'actor_loss',
            'critic_loss', 'entropy_loss', 'time'
        ])

    ############## Hyperparameters ##############
    # env_name = "CartPole-v0"
    # creating environment
    envs = SubprocVecEnv([
        lambda: rpg.Environment('gym', "Neo"),
        lambda: rpg.Environment('gym', "Morpheus"),
        lambda: rpg.Environment('gym', "Trinity"),
        lambda: rpg.Environment('gym', "Oracle"),
        lambda: rpg.Environment('gym', "Cypher"),
        lambda: rpg.Environment('gym', "Tank"),
        lambda: rpg.Environment('gym', "Agent_Smith"),
        lambda: rpg.Environment('gym', "Dozer")
    ])
    env = VecPyTorch(envs, device)

    state_dim = (3, 64, 64)
    action_dim = env.action_space.n
    save_freq = 10000
    print_freq = 10
    max_episodes = 500001   # max training episodes
    max_timesteps = 5       # max timesteps in one episode
    n_latent_var = 256      # number of variables in hidden layer
    update_timestep = 15    # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99            # discount factor
    K_epochs = 4            # update policy for K epochs
    eps_clip = 0.2          # clip parameter for PPO
    random_seed = 11
    actor_loss = 0
    critic_loss = 0
    entropy_loss = 0
    loss = 0
    #############################################

    if random_seed:
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        random.seed(random_seed)
        numpy.random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    state, minimap = env.reset()

    # training loop
    for i_episode in range(1, max_episodes + 1):
        # state, minimap = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            dist, _ = ppo.policy_old(state, minimap)
            action = dist.sample()
            state, minimap, reward, done, _ = env.step(action.unsqueeze(1))

            memory.states.append(state)
            memory.maps.append(minimap)
            memory.actions.append(action)
            memory.logprobs.append(dist.log_prob(action))

            # Saving reward and is_terminal:
            memory.rewards.append(reward.to(device).squeeze())
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                loss, actor_loss, critic_loss, entropy_loss = ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward.mean().item()
        # avg_length += t

        # logging
        if i_episode % print_freq == 0:
            print("********************************************************")
            print("episode: {0}".format(i_episode))
            print("mean/median reward: {:.1f}/{:.1f}".format(
                reward.mean(), reward.median()))
            print("min/max reward: {:.1f}/{:.1f}".format(
                reward.min(), reward.max()))
            print("actor loss: {:.5f}, critic loss: {:.5f}, entropy: {:.5f}".format(
                actor_loss, critic_loss, entropy_loss))
            print("Loss: {0}".format(loss))
            print("********************************************************")

            # show data in tensorboard
            writer.add_scalar('Loss/Loss', loss, i_episode)
            writer.add_scalar('Loss/Actor Loss', actor_loss, i_episode)
            writer.add_scalar('Loss/Critic Loss', critic_loss, i_episode)
            writer.add_scalar('Loss/Entropy', entropy_loss, i_episode)
            writer.add_scalar('Reward/Running Reward', running_reward, i_episode)
            writer.add_scalar('Reward/Min', reward.min(), i_episode)
            writer.add_scalar('Reward/Max', reward.max(), i_episode)
            writer.add_scalar('Reward/Mean', reward.mean(), i_episode)
            writer.add_scalar('Reward/Median', reward.median(), i_episode)
            writer.add_scalar('Reward/Sum', reward.sum(), i_episode)

            with open(logfile_name, 'a+', newline='') as f:
                csv_writer = csv.writer(f, delimiter=";")
                csv_writer.writerow([
                    i_episode, running_reward, reward.mean(), loss, actor_loss,
                    critic_loss, entropy_loss, time.ctime()
                ])

        if save_freq > 0 and i_episode % save_freq == 0:
            torch.save(ppo.policy.state_dict(), '{}/model.pth'.format(log_dir))
            torch.save(ppo.policy_old.state_dict(),
                       '{}/model_old.pth'.format(log_dir))
            print("saved")
env = gym.make(enviorment_name)
env.render(mode='human', close=False)  # to visualize the 3D render


# function that creates factory functions for making environments, for the multiprocessing lib
def make_env_list():
    def env_multiprocessing():
        env = gym.make(enviorment_name)
        return env
    return env_multiprocessing


# create the container that holds the different parallel environments
envs = [make_env_list() for i in range(num_envs)]
envs = SubprocVecEnv(envs)


# Generalized Advantage Estimator
def GAE(next_critic_value, rewards, masks, values, gamma, lambda_):
    gae = 0
    values_ = values + [next_critic_value]
    returns = []
    for k in reversed(range(len(rewards))):
        re = torch.transpose(rewards[k].unsqueeze(1), 0, 1)
        gv = gamma * torch.transpose(values_[k + 1], 0, 1) * masks[k]
        vv = torch.transpose(values_[k], 0, 1)
        delta = re + gv - vv                              # "exponential decay"
        gae = delta + gamma * lambda_ * masks[k] * gae    # "smoothing"
        returns.append(gae + vv)
    return list(reversed(returns))
class env_cover(): def __init__(self, config, dev): self.dev = dev self.num_env = config['num_envs'] self.get_img_from_render = config['get_img_from_render'] self.obs_shape = (self.num_env, ) + config['obs_space'][1:] # print(self.obs_shape) self.reward_shape = (self.num_env, ) + config['reward_space'][1:] self.gamma_shape = (self.num_env, ) + config['gamma_space'][1:] if self.num_env == 1: self.env = gym.make(config['game_name']) else: def make_env(): def _thunk(): env = gym.make(config['game_name']) return env return _thunk envs = [make_env() for i in range(self.num_env)] self.env = SubprocVecEnv(envs) # #def obs_preproc(x): # if IMG_GET_RENDER ==False: # return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0) # x = np.dot(x, np.array([[0.299, 0.587, 0.114]]).T) # x = np.reshape(x, (1,x.shape[1], x.shape[0])) # return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0)/255 # def reset(self): st = self.env.reset() if self.get_img_from_render: st = self.env.render(mode='rgb_array') st = np.resize(st, self.obs_shape) / 255. return torch.FloatTensor(st).reshape(self.obs_shape).to( self.dev), torch.zeros(self.reward_shape).to( self.dev), torch.zeros(self.gamma_shape).to(self.dev) #return st, 0,False # def get_obs(self,obs): # return torch.from_numpy(obs).detach().float().view(1,config['obs_space']) def step(self, action): st, rt, dt, _ = self.env.step(action) if self.get_img_from_render: st = self.env.render(mode='rgb_array') st = np.resize(st, self.obs_shape) / 255. # print(st) st = torch.FloatTensor(st).reshape(self.obs_shape).to(self.dev) rt = torch.FloatTensor([rt]).reshape(self.reward_shape).to(self.dev) if self.num_env == 1: dt = torch.FloatTensor([dt]).reshape(self.gamma_shape).to(self.dev) else: dt = torch.FloatTensor(dt.astype(int)).reshape( self.gamma_shape).to(self.dev) return st, rt, dt def end_dummy(self): return torch.zeros(self.obs_shape).to(self.dev), torch.zeros( self.reward_shape).to(self.dev), torch.zeros(self.gamma_shape).to( self.dev) def render(self): self.env.render() def close(self): self.env.close()
for i in range(num_envs_possible): if (rospy.has_param("/GETjag" + str(i) + "/worker_ready")): if (rospy.get_param("/GETjag" + str(i) + "/worker_ready")): num_envs += 1 print("worker_", num_envs) def make_env(i): def _thunk(): env = robotEnv(i) return env return _thunk envs = [make_env(i+1) for i in range(num_envs)] envs = SubprocVecEnv(envs) state_size_map = envs.observation_space[0].shape[0] * envs.observation_space[1].shape[1] state_size_depth = envs.observation_space[1].shape[0] * envs.observation_space[1].shape[1] state_size_goal = envs.observation_space[2].shape[0] num_outputs = envs.action_space.shape[0] stack_size = 1 class image_stacker(): def __init__(self, state_size, stack_size): self.stacked_frames = deque([np.zeros((state_size_map), dtype=np.float32) for i in range(stack_size)], maxlen=stack_size) def return_stacked_frame(self): return self.stacked_frames
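# image_stacker above only stores frames; a hedged sketch of the usual "append newest,
# concatenate along the channel axis" behaviour such a helper builds toward (class name,
# reset convention, and axis choice are assumptions):
import numpy as np
from collections import deque

class FrameStacker:
    def __init__(self, frame_shape, stack_size=4):
        self.stack_size = stack_size
        self.frames = deque([np.zeros(frame_shape, dtype=np.float32)
                             for _ in range(stack_size)], maxlen=stack_size)

    def reset(self, first_frame):
        # fill the whole stack with the first observation of an episode
        for _ in range(self.stack_size):
            self.frames.append(first_frame)
        return np.concatenate(list(self.frames), axis=0)

    def step(self, frame):
        self.frames.append(frame)
        return np.concatenate(list(self.frames), axis=0)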
class RolloutCollector:
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, state_shape, action_shape, stats):
        '''
        - one agent is assigned to a collector.
        - a collector runs a bunch of envs in parallel to feed to that agent
        - you could run a bunch of collectors simultaneously,
          and then use weight mixing on the agents separately
        '''
        # self.storage_device = torch.device("cpu")
        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False
        self.gamma = 0.8
        self.tau = 0.8
        self.rollout_indices = np.zeros(batch_size)

        self.states = torch.zeros(
            (batch_size, rollout_length + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, rollout_length + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, rollout_length + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, rollout_length + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, rollout_length + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, rollout_length + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, rollout_length + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, rollout_length + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()

    def collect_samples(self):
        if self.buffer_full:
            raise Exception(
                "tried to collect more samples when buffer already full")
        num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers)
        with torch.no_grad():
            for collection_run in range(num_runs_to_full):
                start_index = collection_run * self.num_env_workers
                end_index_exclusive = min(start_index + self.num_env_workers,
                                          self.batch_size)
                run_indices = torch.arange(start_index, end_index_exclusive,
                                           dtype=torch.long)
                worker_indices = run_indices % self.num_env_workers

                for rollout_idx in range(self.rollout_length + 1):
                    state = torch.Tensor(self.state).float().to(self.agent.device)
                    policy_dist = self.agent.actor(state)
                    action = policy_dist.sample()
                    if self.agent.tanh_action_clamping:
                        action = torch.tanh(action)
                    else:
                        action = action.clamp(-1, 1)  # depends on env

                    cpu_actions = action.cpu().numpy()
                    state_, reward, done, info = self.envs.step(cpu_actions)

                    value = self.agent.critic(state)
                    log_prob = policy_dist.log_prob(action)

                    reward = torch.Tensor(reward).float().unsqueeze(1).to(
                        self.agent.device)
                    done_masks = torch.Tensor(1.0 - done).float().unsqueeze(1).to(
                        self.agent.device)

                    self.states[run_indices, rollout_idx] = state[worker_indices]
                    self.actions[run_indices, rollout_idx] = action[worker_indices]
                    self.log_probs[run_indices, rollout_idx] = log_prob[worker_indices]
                    self.values[run_indices, rollout_idx] = value[worker_indices]
                    self.rewards[run_indices, rollout_idx] = reward[worker_indices]
                    self.done_masks[run_indices, rollout_idx] = done_masks[worker_indices]

                    self.state = state_

        self.buffer_full = True
        self.stats.update_collection_stats(
            num_samples_collected_inc=self.batch_size * self.rollout_length)

    def compute_gae(self):
        if not self.buffer_full:
            raise Exception(
                "buffer is not full of new samples yet (so not ready for GAE)")
        gae = torch.zeros((self.batch_size, 1)).to(self.agent.device)
        for i in reversed(range(self.rollout_length)):
            delta = self.rewards[:, i] \
                + self.gamma * self.values[:, i + 1] * self.done_masks[:, i] \
                - self.values[:, i]
            gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae
            self.returns[:, i] = gae + self.values[:, i]
            self.advantages[:, i] = gae
        self.GAE_calculated = True

    def random_batch_iter(self):
        if not self.buffer_full and not self.GAE_calculated:
            raise Exception(
                "buffer is not ready for sampling yet. (not full/no GAE)")
        '''- there's no way all the workers are aligned, especially after an episode
        or so, so we might just be able to use a vertical index'''
        batch_indices = torch.randperm(self.rollout_length)
        for i in range(self.rollout_length):
            index = batch_indices[i]
            state = self.states[:, index]
            action = self.actions[:, index]
            log_prob = self.log_probs[:, index]
            advantage = self.advantages[:, index]
            return_ = self.returns[:, index]
            yield state, action, log_prob, advantage, return_

    def reset(self):
        self.buffer_full = False
        self.GAE_calculated = False
##################################################################### from multiprocessing_env import SubprocVecEnv num_envs = 16 env_name = "Pendulum-v0" def make_env(): def _thunk(): env = gym.make(env_name) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) env = gym.make(env_name) ##################################################################### class ActorCritic: def __init__(self, sess, obs, acs, hidden_size, name, trainable, init_std=1.0): self.sess = sess self.obs = obs self.acs = acs self.hidden_size = hidden_size self.name = name self.trainable = trainable self.init_std = init_std
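# None of the snippets show what SubprocVecEnv does internally. A condensed, hedged sketch
# of the baselines-style implementation the multiprocessing_env module presumably follows:
# each worker process owns one env, receives ('step', action) / ('reset', None) commands
# over a Pipe, and auto-resets when an episode ends. Simplified for illustration only.
import multiprocessing as mp

def worker(remote, env_fn):
    env = env_fn()                      # build the env inside the worker process
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()        # auto-reset so the vector env never stalls
            remote.send((ob, reward, done, info))
        elif cmd == 'reset':
            remote.send(env.reset())
        elif cmd == 'close':
            remote.close()
            break

def spawn_workers(env_fns):
    remotes, work_remotes = zip(*[mp.Pipe() for _ in env_fns])
    procs = [mp.Process(target=worker, args=(wr, fn), daemon=True)
             for wr, fn in zip(work_remotes, env_fns)]
    for p in procs:
        p.start()
    return remotes, procs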
num_envs = 8 env_name = "CartPole-v0" def make_env(): def _thunk(): env = gym.make(env_name) return env return _thunk plt.ion() envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) # 8 env env = gym.make(env_name) # a single env class ActorCritic(nn.Module): def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0): super(ActorCritic, self).__init__() self.critic = nn.Sequential( # network that outputs value nn.Linear(num_inputs, hidden_size), nn.ReLU(), nn.Linear(hidden_size, 1)) self.actor = nn.Sequential( # network that outputs prob of action nn.Linear(num_inputs, hidden_size), nn.ReLU(),
def create_envs(p, args, N):
    # creates multiple environments for training
    urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
    envs = [make_env(p, urdf_path, args=args) for i in range(N)]
    envs = SubprocVecEnv(envs)
    return envs
def main(): mode = "regular" num_envs = 16 def make_env(): def _thunk(): env = MiniPacman(mode, 1000) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) state_shape = envs.observation_space.shape #a2c hyperparams: gamma = 0.99 entropy_coef = 0.01 value_loss_coef = 0.5 max_grad_norm = 0.5 num_steps = 5 num_frames = int(10e3) #rmsprop hyperparams: lr = 7e-4 eps = 1e-5 alpha = 0.99 #Init a2c and rmsprop actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n) optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha) #if USE_CUDA: # actor_critic = actor_critic.cuda() rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape) #rollout.cuda() all_rewards = [] all_losses = [] state = envs.reset() state = torch.FloatTensor(np.float32(state)) rollout.states[0].copy_(state) episode_rewards = torch.zeros(num_envs, 1) final_rewards = torch.zeros(num_envs, 1) for i_update in tqdm(range(num_frames)): for step in range(num_steps): action = actor_critic.act(autograd.Variable(state)) next_state, reward, done, _ = envs.step( action.squeeze(1).cpu().data.numpy()) reward = torch.FloatTensor(reward).unsqueeze(1) episode_rewards += reward masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks #if USE_CUDA: # masks = masks.cuda() state = torch.FloatTensor(np.float32(next_state)) rollout.insert(step, state, action.data, reward, masks) _, next_value = actor_critic( autograd.Variable(rollout.states[-1], volatile=True)) next_value = next_value.data returns = rollout.compute_returns(next_value, gamma) logit, action_log_probs, values, entropy = actor_critic.evaluate_actions( autograd.Variable(rollout.states[:-1]).view(-1, *state_shape), autograd.Variable(rollout.actions).view(-1, 1)) values = values.view(num_steps, num_envs, 1) action_log_probs = action_log_probs.view(num_steps, num_envs, 1) advantages = autograd.Variable(returns) - values value_loss = advantages.pow(2).mean() action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef loss.backward() nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm) optimizer.step() if i_update % num_frames == 0: all_rewards.append(final_rewards.mean()) all_losses.append(loss.item()) #clear_output(True) plt.figure(figsize=(20, 5)) plt.subplot(131) plt.title('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:]))) plt.plot(all_rewards) plt.subplot(132) plt.title('loss %s' % all_losses[-1]) plt.plot(all_losses) plt.show() rollout.after_update() torch.save(actor_critic.state_dict(), "actor_critic_" + mode) import time def displayImage(image, step, reward): #clear_output(True) s = "step: " + str(step) + " reward: " + str(reward) plt.figure(figsize=(10, 3)) plt.title(s) plt.imshow(image) plt.show() time.sleep(0.1) env = MiniPacman(mode, 1000) done = False state = env.reset() total_reward = 0 step = 1 while not done: current_state = torch.FloatTensor(state).unsqueeze(0) #if USE_CUDA: # current_state = current_state.cuda() action = actor_critic.act(autograd.Variable(current_state)) next_state, reward, done, _ = env.step(action.data[0, 0]) total_reward += reward state = next_state image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy() displayImage(image, step, total_reward) step += 1
class RolloutCollector:
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, num_recurrence_steps, state_shape,
                 action_shape, stats):
        '''
        - one agent is assigned to a collector.
        - a collector runs a bunch of envs in parallel to feed to that agent
        - you could run a bunch of collectors simultaneously,
          and then use weight mixing on the agents separately
        '''
        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.num_recurrence_steps = num_recurrence_steps
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False
        self.gamma = 0.8
        self.tau = 0.8
        self.rollout_indices = np.zeros(batch_size)
        self.buffer_width = self.rollout_length + self.num_recurrence_steps - 1

        self.states = torch.zeros(
            (batch_size, self.buffer_width + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()
        self.hidden_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
        self.cell_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)

    def collect_samples(self):
        if self.buffer_full:
            raise Exception(
                "tried to collect more samples when buffer already full")
        num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers)
        with torch.no_grad():
            self.hidden_state = torch.zeros(
                (1, self.num_env_workers,
                 self.agent.hidden_state_size)).to(self.agent.device)
            self.cell_state = torch.zeros(
                (1, self.num_env_workers,
                 self.agent.hidden_state_size)).to(self.agent.device)

            for collection_run in range(num_runs_to_full):
                start_index = collection_run * self.num_env_workers
                end_index_exclusive = min(start_index + self.num_env_workers,
                                          self.batch_size)
                run_indices = torch.arange(start_index, end_index_exclusive,
                                           dtype=torch.long)
                worker_indices = run_indices % self.num_env_workers

                for rollout_idx in range(self.buffer_width + 1):
                    state = torch.Tensor(self.state).float().to(self.agent.device)

                    # for recurrence
                    lstm_input = state.view(-1, 1, *self.state_shape)
                    output, (hidden, cell) = self.agent.lstm(
                        lstm_input, (self.hidden_state, self.cell_state))
                    output = output.reshape(self.num_env_workers,
                                            self.agent.hidden_state_size)

                    policy_dist = self.agent.actor(output)
                    action = policy_dist.sample()
                    action = action.clamp(-1, 1)  # depends on env

                    state_, reward, done, info = self.envs.step(
                        action.cpu().numpy())

                    value = self.agent.critic(output)
                    log_prob = policy_dist.log_prob(action)

                    reward = torch.Tensor(reward).float().unsqueeze(1).to(
                        self.agent.device)
                    done_masks = torch.Tensor(1.0 - done).float().unsqueeze(1).to(
                        self.agent.device)

                    self.states[run_indices, rollout_idx] = state[worker_indices]
                    self.actions[run_indices, rollout_idx] = action[worker_indices]
                    self.log_probs[run_indices, rollout_idx] = log_prob[worker_indices]
                    self.values[run_indices, rollout_idx] = value[worker_indices]
                    self.rewards[run_indices, rollout_idx] = reward[worker_indices]
                    self.done_masks[run_indices, rollout_idx] = done_masks[worker_indices]

                    # zero out the recurrent state of workers whose episode just ended
                    self.hidden_state[0, worker_indices] *= self.done_masks[
                        run_indices, rollout_idx].expand(-1, self.agent.hidden_state_size)
                    self.cell_state[0, worker_indices] *= self.done_masks[
                        run_indices, rollout_idx].expand(-1, self.agent.hidden_state_size)

                    self.state = state_

        self.buffer_full = True
        self.stats.update_collection_stats(
            num_samples_collected_inc=self.batch_size * self.rollout_length)

    def compute_gae(self):
        if not self.buffer_full:
            raise Exception(
                "buffer is not full of new samples yet (so not ready for GAE)")
        gae = torch.zeros((self.batch_size, 1)).to(self.agent.device)
        for i in reversed(range(self.buffer_width)):
            delta = self.rewards[:, i] \
                + self.gamma * self.values[:, i + 1] * self.done_masks[:, i] \
                - self.values[:, i]
            gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae
            self.returns[:, i] = gae + self.values[:, i]
            self.advantages[:, i] = gae
        self.GAE_calculated = True

    def get_leading_states(self, index):
        indices_with_leading_states = torch.arange(
            self.num_recurrence_steps) - self.num_recurrence_steps + 1 + index
        leading_states = self.states[:, indices_with_leading_states]
        # some of the leading states might be from previous episodes,
        # in which case we don't want to consider those at all.
        leading_state_indices = indices_with_leading_states[:-1]
        leading_dones = 1 - self.done_masks[:, leading_state_indices]
        last_leading_dones = leading_dones.nonzero()[:, :2]
        for batch_index, last_done in last_leading_dones:
            previous_episode_indices = torch.arange(last_done + 1)
            leading_states[batch_index, previous_episode_indices] = 0
        return leading_states

    def random_batch_iter(self):
        if not self.buffer_full and not self.GAE_calculated:
            raise Exception(
                "buffer is not ready for sampling yet. (not full/no GAE)")
        '''- there's no way all the workers are aligned, especially after an episode
        or so, so we might just be able to use a vertical index'''
        batch_indices = torch.randperm(self.rollout_length)

        # recurrence stuff
        if self.num_recurrence_steps > 0:
            batch_indices = torch.randperm(
                self.rollout_length) + self.num_recurrence_steps - 1
            self.hidden_state = torch.zeros(
                (1, self.batch_size,
                 self.agent.hidden_state_size)).to(self.agent.device)
            self.cell_state = torch.zeros(
                (1, self.batch_size,
                 self.agent.hidden_state_size)).to(self.agent.device)

        for i in range(self.rollout_length):
            index = batch_indices[i]
            leading_states = self.get_leading_states(index)
            output, (hidden, cell) = self.agent.lstm(
                leading_states, (self.hidden_state, self.cell_state))
            state = output[:, -1, :]
            action = self.actions[:, index]
            log_prob = self.log_probs[:, index]
            advantage = self.advantages[:, index]
            return_ = self.returns[:, index]
            yield state, action, log_prob, advantage, return_

    def reset(self):
        self.buffer_full = False
        self.GAE_calculated = False
def train(env, agent, flags): """""" # set random seeds (for reproducibility) torch.manual_seed(flags['seed']) torch.cuda.manual_seed_all(flags['seed']) envs = [make_env(flags['env'], flags['seed'], i) for i in range(flags['num_envs'])] envs = SubprocVecEnv(envs) # instantiate the policy and optimiser num_inputs = envs.observation_space.shape[0] num_outputs = envs.action_space.n optimizer = optim.Adam(model.parameters(), lr=learning_rate) current_step_number = 0 test_rewards = [] state = envs.reset() while current_step_number < flags['max_steps']: log_probs = [] values = [] rewards = [] masks = [] entropy = 0 for _ in range(flags['num_step_td_update']): # sample an action from the distribution action = agent.act(state) # take a step in the environment next_state, reward, done, _ = envs.step(action.cpu().numpy()) # compute the log probability log_prob = dist.log_prob(action) # compute the entropy entropy += dist.entropy().mean() # save the log probability, value and reward log_probs.append(log_prob) values.append(value) rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) # if done, save episode rewards state = next_state current_step_number += 1 if current_step_number % 1000 and flags['plot_test'] == 0: test_rewards.append(np.mean([test_env(model) for _ in range(10)])) plot(current_step_number, test_rewards) next_state = torch.FloatTensor(next_state).to(device) _, next_value = model(next_state) # calculate the discounted return of the episode returns = compute_returns(next_value, rewards, masks) log_probs = torch.cat(log_probs) returns = torch.cat(returns).detach() values = torch.cat(values) advantage = returns - values actor_loss = -(log_probs * advantage.detach()).mean() critic_loss = advantage.pow(2).mean() # loss function loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy optimizer.zero_grad() loss.backward() optimizer.step() return rewards
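# test_env and plot are called above but not shown; plot is just a matplotlib line chart of
# test_rewards. A hedged sketch of the usual evaluation helper: roll out the current policy
# on a single non-vectorized env and return the episode reward. The signature and the
# assumption that the model returns (distribution, value) are illustrative.
import torch

def test_env(model, env, device="cpu"):
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state_t)
        action = dist.sample().cpu().numpy()[0]
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward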
parser = argparse.ArgumentParser() parser.add_argument("--epoch", default=int(5e5), type=int) args = parser.parse_args() mode = "regular" num_envs = 16 def make_env(): def _thunk(): env = MiniPacman(mode, 1000) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) state_shape = envs.observation_space.shape #a2c hyperparams: gamma = 0.99 entropy_coef = 0.01 value_loss_coef = 0.5 max_grad_norm = 0.5 num_steps = 10 num_frames = args.epoch #rmsprop hyperparams: lr = 7e-4 eps = 1e-5 alpha = 0.99
num_boids = np.random.randint(MIN_NUM_BOIDS, MAX_NUM_BOIDS + 1) num_spheres = np.random.randint(MIN_NUM_SPHERES, MAX_NUM_SPHERES + 1) env_num_boids.append(num_boids) envs.append(make_env(num_boids, num_spheres)) edges = utils.system_edges(NUM_GOALS, num_spheres, num_boids) edge_types = one_hot(edges, EDGE_TYPES) padded_edge_types.append( utils.pad_data(edge_types, MAX_NUM_NODES, dims=[0, 1])) mask = utils.get_mask(num_boids, MAX_NUM_NODES) masks.append(mask) envs = SubprocVecEnv(envs) padded_edge_types = np.array(padded_edge_types) masks = np.array(masks) swarmnet_params = load_model_params('config/il_rl.json') actorcritic = get_swarmnet_actorcritic(swarmnet_params, '../../Logs/swarmnet_rl_test') swarmnet_agent = PPOAgent(actorcritic, NDIM, action_bound=None, rollout_steps=ROLLOUT_STEPS, memory_capacity=4096, summary_writer=None, mode=0)
num_envs = 16 env_name = "Pendulum-v0" # TODO : rajouter env reset def make_env(): def _thunk(): env = gym.make(env_name) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) env = gym.make(env_name) def plot(frame_idx, rewards): clear_output(True) plt.figure(figsize=(20, 5)) plt.subplot(131) plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) plt.plot(rewards) plt.show() def test_env(vis=False): state = env.reset()
if __name__ == "__main__": mkdir('.', 'checkpoints') parser = argparse.ArgumentParser() parser.add_argument("-n", "--name", default=ENV_ID, help="Name of the run") args = parser.parse_args() writer = SummaryWriter(comment="ppo_" + args.name) # Autodetect CUDA use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") print('Device:', device) # Prepare environments envs = [make_env() for i in range(NUM_ENVS)] envs = SubprocVecEnv(envs) env = gym.make(ENV_ID) env.n_foods = 10 obs_ = env.reset() num_inputs = obs_.shape num_outputs = env.action_space.shape[0] model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE, std=0.1).to(device) print(model) optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) frame_idx = 0 train_epoch = 0 best_reward = None state = envs.reset()
default=False, help="Use multi process") args = parser.parse_args() writer = SummaryWriter(comment="ppo_connectx") # Autodetect CUDA use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") print('Device:', device) # Prepare environments envs = [make_env() for i in range(args.envs)] envs = MultiEnv(envs) if args.mp: envs = SubprocVecEnv(envs) env = OhlcvEnv(WINDOW_SIZE, './data/test/') obs_ = env.reset() num_inputs = env.observation_space.shape num_outputs = env.action_space.n model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE, std=0.0).to(device) print(model) optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) frame_idx = 0 train_epoch = 0 best_reward = None state = envs.reset()
num_envs = 8 env_name = "CartPole-v0" def make_env(): def _thunk(): env = gym.make(env_name) return env return _thunk plt.ion() envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) # 8 env env = gym.make(env_name) # a single env class ActorCritic(nn.Module): def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0): super(ActorCritic, self).__init__() self.critic = nn.Sequential(nn.Linear(num_inputs, hidden_size), nn.ReLU(), nn.Linear(hidden_size, 1)) self.actor = nn.Sequential( nn.Linear(num_inputs, hidden_size), nn.ReLU(), nn.Linear(hidden_size, num_outputs),
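# The ActorCritic definition above is cut off inside self.actor. For reference, a hedged
# sketch of how such a discrete actor-critic typically finishes: a softmax head wrapped in
# a Categorical distribution plus the critic's value. Layer sizes follow the snippet; the
# rest is an assumption, not the original file.
import torch
import torch.nn as nn
from torch.distributions import Categorical

class ActorCriticSketch(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size):
        super().__init__()
        self.critic = nn.Sequential(nn.Linear(num_inputs, hidden_size), nn.ReLU(),
                                    nn.Linear(hidden_size, 1))
        self.actor = nn.Sequential(nn.Linear(num_inputs, hidden_size), nn.ReLU(),
                                   nn.Linear(hidden_size, num_outputs),
                                   nn.Softmax(dim=1))

    def forward(self, x):
        value = self.critic(x)
        probs = self.actor(x)
        dist = Categorical(probs)
        return dist, value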
def dqn_algorithm(ENV_NAME,
                  NUM_ENV=8,
                  SEED=1,
                  TOTAL_TIMESTEPS=100000,
                  GAMMA=0.95,
                  MEMORY_SIZE=1000,
                  BATCH_SIZE=32,
                  EXPLORATION_MAX=1.0,
                  EXPLORATION_MIN=0.02,
                  EXPLORATION_FRACTION=0.7,
                  TRAINING_FREQUENCY=1000,
                  FILE_PATH='results/',
                  SAVE_MODEL=False,
                  MODEL_FILE_NAME='model',
                  LOG_FILE_NAME='log',
                  TIME_FILE_NAME='time',
                  PRINT_FREQ=100,
                  N_EP_AVG=100,
                  VERBOSE='False',
                  MLP_LAYERS=[64, 64],
                  MLP_ACTIVATIONS=['relu', 'relu'],
                  LEARNING_RATE=1e-3,
                  EPOCHS=1,
                  GRAD_CLIP=False,
                  DOUBLE_DQN=False,
                  USE_TARGET_NETWORK=True,
                  TARGET_UPDATE_FREQUENCY=5000,
                  LOAD_WEIGHTS=False,
                  LOAD_WEIGHTS_MODEL_PATH='results/model0.h5'):
    '''
    DQN algorithm execution

    env_name : string for a gym environment
    num_env : number of environments for vectorization (multiprocessing env)
    total_timesteps : total number of timesteps
    training_frequency : frequency of training (experience replay)
    gamma : discount factor
    buffer_size : replay buffer size
    batch_size : batch size for experience replay
    exploration_max : maximum exploration at the beginning
    exploration_min : minimum exploration at the end
    exploration_fraction : fraction of total timesteps over which the exploration decay takes place
    output_folder : output filepath
    save_model : boolean to specify whether the model is to be saved
    model_file_name : name of the file to save the model to at the end of learning
    log_file_name : name of the file to store DQN results
    time_file_name : name of the file to store computation time
    print_frequency : episodic frequency for printing results
    n_ep_avg : number of episodes to consider while computing the average reward
    verbose : print episodic results
    mlp_layers : list of neurons in each hidden layer of the DQN network
    mlp_activations : list of activation functions in each hidden layer of the DQN network
    learning_rate : learning rate for the neural network
    epochs : number of epochs in every experience replay
    grad_clip : boolean to specify whether to use gradient clipping in the optimizer (gradclip value 10.0)
    double_dqn : boolean to specify whether to employ double DQN
    use_target_network : boolean to use a target neural network in DQN
    target_update_frequency : timestep frequency at which to copy weights from the online network to the target network
    load_weights : boolean to specify whether to use a prespecified model to initialize the weights of the neural network
    load_weights_model_path : path of the model to use for weight initialization
    '''
    before = time.time()
    num_envs = NUM_ENV
    env_name = ENV_NAME

    if TOTAL_TIMESTEPS % NUM_ENV:
        print('Error: total timesteps is not divisible by no. of envs')
        return

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            env.seed(SEED)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    # for reproducibility
    set_seed(SEED)

    observation_space = envs.observation_space.shape[0]
    action_space = envs.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space, MLP_LAYERS,
                           MLP_ACTIVATIONS, LEARNING_RATE, EPOCHS,
                           USE_TARGET_NETWORK, GRAD_CLIP, DOUBLE_DQN,
                           LOAD_WEIGHTS, LOAD_WEIGHTS_MODEL_PATH,
                           TOTAL_TIMESTEPS, MEMORY_SIZE, BATCH_SIZE, GAMMA,
                           EXPLORATION_MAX, EXPLORATION_MIN,
                           EXPLORATION_FRACTION)
    envs = ParallelEnvWrapper(envs)

    t = 0
    episode_rewards = [0.0] * num_envs
    explore_percent, episodes, mean100_rew, steps, NN_tr_loss = [], [], [], [], []

    while True:
        state = envs.reset()
        # state = np.reshape(state, [1, observation_space])
        while True:
            t += num_envs
            dqn_solver.eps_timestep_decay(t)
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = envs.step(action)
            # print(terminal)
            # reward = reward if not terminal else -reward
            # state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)

            if t % TRAINING_FREQUENCY == 0:
                dqn_solver.experience_replay()

            state = state_next
            episode_rewards[-num_envs:] = [
                i + j for (i, j) in zip(episode_rewards[-num_envs:], reward)
            ]
            # num_episodes = len(episode_rewards)
            # print(terminal)

            if (t % PRINT_FREQ == 0):
                explore_percent.append(dqn_solver.exploration_rate * 100)
                episodes.append(len(episode_rewards))
                mean100_rew.append(
                    round(np.mean(episode_rewards[(-1 - N_EP_AVG):-1]), 1))
                steps.append(t)
                NN_tr_loss.append(dqn_solver.loss)
                if VERBOSE:
                    print('Exploration %: ' + str(int(explore_percent[-1])) +
                          ' ,Episodes: ' + str(episodes[-1]) +
                          ' ,Mean_reward: ' + str(mean100_rew[-1]) +
                          ' ,timestep: ' + str(t) + ' , tr_loss: ' +
                          str(round(NN_tr_loss[-1], 4)))

            if t > TOTAL_TIMESTEPS:
                output_table = np.stack((steps, mean100_rew, episodes,
                                         explore_percent, NN_tr_loss))
                if not os.path.exists(FILE_PATH):
                    os.makedirs(FILE_PATH)
                file_name = str(FILE_PATH) + LOG_FILE_NAME + '.csv'
                np.savetxt(
                    file_name,
                    np.transpose(output_table),
                    delimiter=',',
                    header='Timestep,Rewards,Episodes,Exploration %,Training Score')
                after = time.time()
                time_taken = after - before
                np.save(str(FILE_PATH) + TIME_FILE_NAME, time_taken)
                if SAVE_MODEL:
                    file_name = str(FILE_PATH) + MODEL_FILE_NAME + '.h5'
                    dqn_solver.model.save(file_name)
                return dqn_solver.model

            if USE_TARGET_NETWORK and t % TARGET_UPDATE_FREQUENCY == 0:
                dqn_solver.update_target_network()

            # print(t)
            if terminal.all():
                episode_rewards += [0.0] * num_envs
                break
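# eps_timestep_decay(t) is called on the solver above but not defined in this excerpt.
# A hedged sketch of the linear schedule implied by EXPLORATION_MAX / EXPLORATION_MIN /
# EXPLORATION_FRACTION (function and parameter names are illustrative):
def linear_epsilon(t, total_timesteps, eps_max=1.0, eps_min=0.02, fraction=0.7):
    # decay linearly from eps_max to eps_min over the first `fraction` of training,
    # then hold at eps_min
    decay_steps = fraction * total_timesteps
    frac = min(float(t) / decay_steps, 1.0)
    return eps_max + frac * (eps_min - eps_max)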