def sample_from_env(self, env: SubprocVecEnv, policy: MlpPolicy, timestep_limit=None, render=False):
    """
    return: dimension is Size(timesteps, n_envs, feature_size)
    """
    # todo: use a defaultdict for this data collection. Much cleaner.
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
    true_reward = []
    dones = [False] * env.num_envs
    if render:
        env.render()
    for _ in range(timestep_limit or G.batch_timesteps):
        # lazily initialize self.obs on the first call
        try:
            obs = self.obs
        except AttributeError:
            obs = self.obs = env.reset()
        actions, values, neglogpacs = policy.step(obs)
        mb_obs.append(self.obs.copy())
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(dones)
        self.obs[:], rewards, dones, info = env.step(actions)
        if render:
            env.render()
        mb_rewards.append(rewards)
        if 'avg_reward' in info:
            true_reward.append(info['avg_reward'])
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
    mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is deprecated; use the builtin
    last_values = policy.value(self.obs)
    # discount/bootstrap off value fn (GAE backward pass)
    mb_advs = np.zeros_like(mb_rewards)
    last_gae_lam = 0
    n_rollouts = len(mb_obs)
    for t in reversed(range(n_rollouts)):
        if t == n_rollouts - 1:
            next_non_terminal = 1.0 - np.asarray(dones, dtype=np.float32)
            next_values = last_values
        else:
            next_non_terminal = 1.0 - mb_dones[t + 1]
            next_values = mb_values[t + 1]
        delta = mb_rewards[t] + G.gamma * next_values * next_non_terminal - mb_values[t]
        mb_advs[t] = last_gae_lam = delta + G.gamma * G.lam * next_non_terminal * last_gae_lam
    mb_returns = mb_advs + mb_values
    # return dimension is Size(timesteps, n_envs, feature_size)
    return dict(obs=mb_obs, rewards=mb_rewards, returns=mb_returns, dones=mb_dones,
                actions=mb_actions, values=mb_values, neglogpacs=mb_neglogpacs,
                ep_info=dict(reward=np.mean(true_reward) if true_reward else float('nan')))
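For reference, the GAE backward pass above can be isolated into a small standalone function. This is a minimal sketch, assuming plain numpy arrays of shape (timesteps, n_envs) and hypothetical gamma/lam arguments in place of the G config object:

import numpy as np

def compute_gae(rewards, values, dones, last_values, last_dones, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation over a (timesteps, n_envs) rollout.

    rewards, values, dones: arrays of shape (T, n_envs); dones[t] marks whether
    the state at step t began a new episode, matching mb_dones above.
    last_values, last_dones: shape (n_envs,), for the state after the rollout.
    """
    T = len(rewards)
    advs = np.zeros_like(rewards)
    last_gae_lam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            next_non_terminal = 1.0 - last_dones
            next_values = last_values
        else:
            next_non_terminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        # One-step TD error; episode boundaries zero out the bootstrap term.
        delta = rewards[t] + gamma * next_values * next_non_terminal - values[t]
        advs[t] = last_gae_lam = delta + gamma * lam * next_non_terminal * last_gae_lam
    return advs, advs + values  # advantages and returns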
class Env:
    def __init__(self, env_name, actors=1):
        self.env = SubprocVecEnv([make_env(env_name) for _ in range(actors)])
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.actors = actors
        # Discrete action spaces have no low/high bounds.
        try:
            self.action_space_low = torch.FloatTensor(self.env.action_space.low)
            self.action_space_high = torch.FloatTensor(self.env.action_space.high)
        except AttributeError:
            self.action_space_low = None
            self.action_space_high = None

    def reset(self):
        s = self.env.reset()
        if len(np.array(s).shape) == 0:
            s = np.expand_dims(s, axis=0)
        return s

    def explore_step(self, a):
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def step(self, a):
        # Accept torch tensors; SubprocVecEnv expects numpy actions.
        if isinstance(a, torch.Tensor):
            a = a.cpu().numpy()
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def random_action(self):
        return np.stack([self.env.action_space.sample() for _ in range(self.actors)])

    def render(self):
        return self.env.render()

    def close(self):
        return self.env.close()
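A minimal usage sketch for the wrapper above, assuming a continuous-control gym id such as 'Pendulum-v0' and that make_env returns the thunks SubprocVecEnv expects:

env = Env('Pendulum-v0', actors=4)   # hypothetical env id
s = env.reset()                      # (4, obs_dim)
a = env.random_action()              # (4, act_dim), one sample per actor
s2, r, done, info = env.step(a)      # step() also accepts torch tensors
env.close()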
def test(config):
    base_dir = os.path.join('./results/', args.algo, model_architecture, config.env_id)
    log_dir = os.path.join(base_dir, 'logs/')
    model_dir = os.path.join(base_dir, 'saved_model/')

    seed = np.random.randint(0, int(1e6))
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    env = [
        make_env_a2c_smb(config.env_id, seed, config.num_agents + 1, log_dir,
                         dim=args.dim, stack_frames=config.stack_frames,
                         adaptive_repeat=config.adaptive_repeat,
                         reward_type=config.reward_type,
                         sticky=args.sticky_actions, vid=args.render,
                         base_dir=base_dir)
    ]
    env = SubprocVecEnv(env)

    model = Model(env=env, config=config, log_dir=base_dir, static_policy=args.inference)
    model.load_w()

    obs = env.reset()
    if args.render:
        env.render()
    obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

    # Recurrent policy state and episode mask for a single env.
    state = model.config.rollouts.states[0, 0].view(1, -1)
    mask = model.config.rollouts.masks[0, 0].view(1, -1)

    episode_rewards = np.zeros(1, dtype=np.float64)  # np.float is deprecated
    final_rewards = np.zeros(1, dtype=np.float64)

    start = timer()
    print_threshold = args.print_threshold
    max_dist = np.zeros(1, dtype=np.float64)
    done = False
    tstep = 0

    while not done:
        tstep += 1
        with torch.no_grad():
            value, action, action_log_prob, state = model.get_action(obs, state, mask)
        cpu_action = action.view(-1).cpu().numpy()
        obs, reward, done, info = env.step(cpu_action)
        if args.render:
            env.render()
        obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

        episode_rewards += reward
        mask = 1. - done.astype(np.float32)
        final_rewards += (1. - mask) * episode_rewards

        for index, inf in enumerate(info):
            # There's a simulator glitch for x_pos >= 60000? Ignore those values.
            if inf['x_pos'] < 60000:
                max_dist[index] = np.max((max_dist[index], inf['x_pos']))
        mask = torch.from_numpy(mask).to(config.device).view(-1, 1)

    end = timer()
    total_num_steps = tstep
    print("Num timesteps {}, FPS {}, Distance {:.1f}, Reward {:.1f}".format(
        total_num_steps, int(total_num_steps / (end - start)),
        np.mean(max_dist), np.mean(final_rewards)))
    env.close()
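The mask above follows the usual recurrent-policy convention: 0.0 right after an episode ends, 1.0 otherwise, so the policy can zero its hidden state at episode boundaries. A sketch of how such a mask is typically applied inside a get_action-style forward pass, with hypothetical names:

import torch

hidden = torch.randn(1, 64)       # recurrent state carried between steps
mask = torch.tensor([[0.0]])      # 0.0 => previous step ended an episode
hidden = hidden * mask            # reset the state before the next forward pass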
import csv
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

# ActorCritic is the project's actor-critic network, defined elsewhere.


class A2C:
    def __init__(self, parameters):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.parameters = parameters
        self.envname = self.parameters['ENVIRONMENT']
        self.env = [
            self.make_env(self.envname, seed)
            for seed in range(self.parameters['N_PROC'])
        ]
        self.env = SubprocVecEnv(self.env)
        self.test_env = gym.make(self.envname)
        self.model = ActorCritic(self.env.observation_space.shape[0],
                                 self.env.action_space.shape[0],
                                 N_HIDDEN=self.parameters['N_HIDDEN']).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), self.parameters['LR'])
        self.data = {"loss": []}
        self.start_time = None
        self.end_time = None

    def make_env(self, env_id, seed):
        def _f():
            env = gym.make(env_id)
            env.seed(seed)
            return env
        return _f

    def select_action(self, state):
        """
        :param state: numpy array (N_PROC x observation_space)
        :return:
            action: numpy array (N_PROC x action_space)
                action selected by the model for each environment
            log_prob: torch tensor (N_PROC x 1)
                log probability of each action selected
            value: torch tensor (N_PROC x 1)
                value assigned to each state by the model
            entropy: torch scalar ()
                average entropy over all samples
        """
        state = state[:, np.newaxis, :]  # allows for batch processing with the NN
        mu, var, value = self.model(torch.tensor(state).float().to(self.device))
        value = torch.squeeze(value, dim=1)
        distribution = torch.distributions.Normal(mu, var.sqrt())
        action = distribution.sample()
        action = torch.clamp(action,
                             min=self.env.action_space.low[0],
                             max=self.env.action_space.high[0])
        log_prob = distribution.log_prob(action).mean(-1)
        entropy = distribution.entropy().mean().unsqueeze(0)
        # Actions must be numpy arrays to be passed to the OpenAI environments.
        action = torch.squeeze(action, 1)
        action = action.detach().cpu().numpy()
        return action, log_prob, value, entropy

    def update_a2c(self, rewards, log_probs, values, isdone, state, entropies):
        """
        :param rewards: list of tensors [N_PROC x FINITE_HORIZON]
            rewards at each timepoint and environment
        :param log_probs: torch tensor (N_PROC x FINITE_HORIZON)
            log probability of each action taken at each time and environment
        :param values: torch tensor (N_PROC x FINITE_HORIZON)
            value of each state at each timepoint and environment
        :param isdone: list of tensors [N_PROC x FINITE_HORIZON]
            boolean values representing whether each episode is complete
        :param state: numpy array (N_PROC x observation_space)
        :param entropies: torch tensor (N_PROC,)
        :return: loss: numpy scalar
            loss used for backpropagation
        """
        # Estimate the value of the final state of the finite horizon,
        # then bootstrap n-step TD targets backward through the rollout.
        state = state[:, np.newaxis, :]  # allows for batch processing with the NN
        _, _, td_target = self.model(torch.tensor(state).float().to(self.device))
        td_target = torch.squeeze(td_target, dim=2)
        td_targets = []
        for reward, done in zip(rewards[::-1], isdone[::-1]):
            td_target = reward + done * self.parameters['GAMMA'] * td_target
            td_targets.append(td_target)
        td_targets = td_targets[::-1]
        td_targets = torch.cat(td_targets, dim=1)

        advantage = td_targets - values
        actor_loss = -(log_probs * advantage).mean()
        critic_loss = F.mse_loss(td_targets, values)
        entropy_loss = self.parameters['ENTROPY_C'] * entropies.mean()

        print("actor loss:", actor_loss.clone().detach().cpu().numpy())
        print("critic loss:", critic_loss.clone().detach().cpu().numpy())
        print("entropy loss:", entropy_loss.clone().detach().cpu().numpy())

        loss = actor_loss + critic_loss - entropy_loss
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), 5)
        self.optimizer.step()

        return loss.clone().detach().cpu().numpy()

    # Main training loop.
    def train(self):
        print("Going to be training for a total of {} training steps".format(
            self.parameters['MAX_TRAINING_STEPS']))
        self.start_time = time.time()
        state = self.env.reset()
        loss_list = []
        test_list = []
        for step_num in tqdm(range(self.parameters['MAX_TRAINING_STEPS'])):
            rewards = []
            log_probs = []
            values = []
            isdone = []
            entropies = []
            for _ in range(self.parameters['FINITE_HORIZON']):
                action, log_prob, value, entropy = self.select_action(state)
                state, reward, done, _ = self.env.step(action)
                reward = torch.unsqueeze(torch.tensor(reward), 1).to(self.device)
                done = torch.unsqueeze(torch.tensor(1 - done), 1).to(self.device)
                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)
                isdone.append(done)
                entropies.append(entropy)

            # Format lists into torch tensors.
            log_probs = torch.cat(log_probs, dim=1).to(self.device)
            values = torch.cat(values, dim=1).to(self.device)
            entropies = torch.cat(entropies).to(self.device)

            # Update actor-critic.
            loss = self.update_a2c(rewards, log_probs, values, isdone, state, entropies)
            loss_list.append(loss)

            if (step_num % self.parameters['PRINT_DATA']) == 0 and step_num != 0:
                y = np.array(loss_list)
                kernel = (1 / self.parameters['PRINT_DATA']) * np.ones(
                    self.parameters['PRINT_DATA'])
                ma_y = np.convolve(y, kernel, mode='same')
                plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
                plt.plot(y, '-b')
                plt.plot(ma_y, '-r')
                plt.axhline(color='k')
                plt.xlabel("Number of Training Steps")
                plt.ylabel("Loss")
                plt.title("Training Loss")
                plt.legend([
                    'Loss',
                    'Moving Average (n={})'.format(self.parameters['PRINT_DATA'])
                ])
                plt.savefig("train_loss.png")
                plt.close()

            if (step_num % self.parameters['TEST_FREQUENCY']) == 0 and step_num != 0:
                test_mean, test_std = self.test()
                test_list.append([test_mean, test_std])
                x = np.arange(1, step_num, self.parameters['TEST_FREQUENCY'])
                y = np.array(test_list)
                plt.errorbar(x, y[:, 0], yerr=y[:, 1], fmt='.k')
                plt.axhline(color='k')
                plt.xlabel("Number of Training Steps")
                plt.ylabel("Mean Episode Cumulative Reward (n={})".format(
                    self.parameters['TEST_EPISODES']))
                plt.title("Test Episode Cumulative Reward Progression")
                plt.savefig("test_reward.png")
                plt.close()
        self.env.close()

    def test(self):
        testing_rewards = []
        for _ in range(self.parameters['TEST_EPISODES']):
            state = self.test_env.reset()
            temp_reward = 0
            for _ in range(self.parameters['MAX_STEPS_PER_EP']):
                action, _, _, _ = self.select_action(state[None, :])
                # The single test env expects an unbatched action.
                state, reward, done, _ = self.test_env.step(action[0])
                temp_reward += reward
                if done:
                    break
            testing_rewards.append(temp_reward)
        return np.mean(testing_rewards), np.std(testing_rewards)

    def demonstrate(self, save_snapshots=None):
        env = gym.make(self.envname)
        state = env.reset()
        done = False
        while not done:
            env.render()
            action, _, _, _ = self.select_action(state[None, :])
            state, reward, done, _ = env.step(action[0])

    def save_experiment(self, environment, exp_name):
        path = "experiments/" + environment + "_a2c_" + exp_name
        torch.save(self.model.state_dict(), path)
        # To load the model later, use something like:
        #   model = ActorCritic(...)
        #   model.load_state_dict(torch.load(path))
        parameters_path = "experiments/" + environment + "_a2c_" + exp_name + ".csv"
        with open(parameters_path, "w") as file:
            w = csv.writer(file)
            for key, val in self.parameters.items():
                w.writerow([key, val])
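A hypothetical driver for the class above, with a parameter dictionary wired to the keys A2C actually reads; the values are illustrative placeholders, not tuned settings:

parameters = {
    'ENVIRONMENT': 'Pendulum-v0',  # any continuous-action gym id
    'N_PROC': 8,
    'N_HIDDEN': 128,
    'LR': 3e-4,
    'GAMMA': 0.99,
    'ENTROPY_C': 0.01,
    'MAX_TRAINING_STEPS': 10000,
    'FINITE_HORIZON': 5,
    'PRINT_DATA': 100,
    'TEST_FREQUENCY': 500,
    'TEST_EPISODES': 10,
    'MAX_STEPS_PER_EP': 200,
}
agent = A2C(parameters)
agent.train()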
def sample_from_env(self, env: SubprocVecEnv, policy: MlpPolicy, timestep_limit=None, render=False):
    # todo: use a defaultdict for this data collection. Much cleaner.
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
    dones = [False] * env.num_envs
    if render:
        env.render()
    for _ in range(timestep_limit or G.batch_timesteps):
        # lazily initialize self.obs on the first call
        try:
            obs = self.obs
        except AttributeError:
            obs = self.obs = env.reset()
        actions, values, neglogpacs = policy.step(obs)
        mb_obs.append(self.obs.copy())
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(dones)
        self.obs[:], rewards, dones, infos = env.step(actions)
        if render:
            env.render()
        mb_rewards.append(rewards)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
    mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is deprecated; use the builtin
    last_values = policy.value(self.obs)
    # discount/bootstrap off value fn (GAE backward pass)
    mb_advs = np.zeros_like(mb_rewards)
    last_gae_lam = 0
    n_rollouts = len(mb_obs)
    for t in reversed(range(n_rollouts)):
        if t == n_rollouts - 1:
            next_non_terminal = 1.0 - np.asarray(dones, dtype=np.float32)
            next_values = last_values
        else:
            next_non_terminal = 1.0 - mb_dones[t + 1]
            next_values = mb_values[t + 1]
        delta = mb_rewards[t] + G.gamma * next_values * next_non_terminal - mb_values[t]
        mb_advs[t] = last_gae_lam = delta + G.gamma * G.lam * next_non_terminal * last_gae_lam
    mb_returns = mb_advs + mb_values

    def sf01(arr):
        """swap and then flatten axes 0 and 1"""
        s = arr.shape
        return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

    mb_obs, mb_rewards, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs = \
        map(sf01, (mb_obs, mb_rewards, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs))
    return dict(obs=mb_obs, rewards=mb_rewards, returns=mb_returns, dones=mb_dones,
                actions=mb_actions, values=mb_values, neglogpacs=mb_neglogpacs)
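sf01 is the usual baselines-style flattening from (timesteps, n_envs, ...) rollouts to one row per transition; a quick shape check with assumed toy dimensions:

import numpy as np

rollout = np.zeros((128, 8, 4))                    # (timesteps, n_envs, feature_size)
flat = rollout.swapaxes(0, 1).reshape(128 * 8, 4)  # what sf01 does
assert flat.shape == (1024, 4)                     # env-major: each env's steps stay contiguous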
def train():
    logger.configure()
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    ##### POMMERMAN
    def make_env(seed):
        def f():
            config = ffa_competition_env()
            env = Wrapped_Env(**config["env_kwargs"])
            env.observation_space = spaces.Box(0, 20, shape=(11, 11, 18), dtype=np.float32)

            # Add 3 scripted opponents, plus the training agent in the last slot.
            agents = []
            for agent_id in range(3):
                agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
            agents.append(TrainingAgent(config["agent"](3, config["game_type"])))
            env.set_agents(agents)
            env.set_training_agent(agents[-1].agent_id)
            env.set_init_game_state(None)

            if logger.get_dir():
                env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env
        return f
    #########

    envs = [make_env(seed) for seed in range(8)]
    env = SubprocVecEnv(envs)

    num_timesteps = 10000
    policy = CnnPolicy
    model = ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
                       lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
                       ent_coef=.01,
                       lr=lambda f: f * 2.5e-4,
                       cliprange=lambda f: f * 0.1,
                       total_timesteps=int(num_timesteps * 1.1))

    logger.log("Running trained model")
    env = make_env(0)()
    obs = env.reset()
    obs = np.expand_dims(obs, 0)  # the policy expects a batch dimension
    while True:
        actions = model.step(obs)[0]
        obs[:], reward, done, info = env.step(actions)
        if done:
            obs = env.reset()
            obs = np.expand_dims(obs, 0)
        env.render()
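In baselines' ppo2.learn, lr and cliprange may be given as callables of the fraction of training remaining, so the lambdas above anneal both linearly toward zero over the course of training:

lr = lambda frac: frac * 2.5e-4      # frac ~ 1.0 at the start, ~0.0 at the end
cliprange = lambda frac: frac * 0.1

print(lr(1.0), lr(0.5))              # 2.5e-4 initially, 1.25e-4 halfway through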