def main():
    # Remote env
    env = grc.RemoteEnv('tmp/sock')
    # FIXME: DEBUG
    # import retro
    # env = retro.make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')

    # Load the policy
    name = 'learner_global'
    state = process_state(env.reset())
    test_policy = Policy(state.shape,
                         env.action_space.n,
                         name,
                         act_int=False,
                         recover=True,
                         sess=tf.Session(),
                         pull_scope=name)

    # Run the env
    lstm_state = test_policy.lstm_init_state
    while True:
        action, _, _, lstm_state = test_policy.act(state, lstm_state, explore=False)
        state, reward, done, _ = env.step(action)
        state = process_state(state)
        if done:
            state = process_state(env.reset())
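# Hypothetical entry point for the script above; a minimal sketch assuming the
# file is executed directly inside the gym-remote (grc) evaluation container.
if __name__ == '__main__':
    main()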
class PytorchAgent(pommerman.agents.BaseAgent):
    def __init__(self, character=pommerman.characters.Bomber):
        super(PytorchAgent, self).__init__(character)
        # FIXME: Very ugly magic numbers from around the pommerman code. FIX PLEASE
        self.nn_kwargs = {
            'batch_norm': True,
            'recurrent': False,
            'hidden_size': 512,
        }  # Found in main.py
        self.config = {
            'recode_agents': True,
            'compact_powerups': True,
            'compact_structure': True,
            'rescale': True,
        }  # Found in pommerman.py
        self.num_channels = 15  # Found in pommerman.py
        if self.config['recode_agents']:
            self.num_channels -= 2
        if self.config['compact_powerups']:
            self.num_channels -= 2
        if self.config['compact_structure']:
            self.num_channels -= 2
        obs_unflat = get_unflat_obs_space(
            self.num_channels, 11, self.config['rescale'])  # 11 is the board size and is constant
        min_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].low.flatten(), obs_unflat.spaces[1].low])
        max_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].high.flatten(), obs_unflat.spaces[1].high])
        self.observation_space = spaces.Box(min_flat_obs, max_flat_obs)
        self.masks = torch.zeros(1, 1)  # Constant, because recurrent == False
        path = os.path.join('../../', 'PommeFFACompetitionFast-v0.pt')
        state_list = torch.load(path)  # Needed for loading in simple_ffa_run
        self.policy = Policy(
            PommNet(obs_shape=self.observation_space.shape, **self.nn_kwargs),
            action_space=spaces.Discrete(6)
        )  # Observation space is apparently 9*11*11 + 3; action_space is from v0
        self.policy.load_state_dict(state_list[0])  # Load saved model into weights
        self.recurrent_hidden_state = 1  # Is one, because recurrent == False

    def act(self, obs, action_space):
        new_obs = featurize(obs, self.config)
        _, action, _, self.recurrent_hidden_state = self.policy.act(
            new_obs, self.recurrent_hidden_state, self.masks)
        return action.numpy()
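# A minimal sketch of dropping PytorchAgent into a standard pommerman FFA match.
# The env id and the SimpleAgent opponents are assumptions, not taken from the
# snippet above.
agent_list = [
    PytorchAgent(),
    pommerman.agents.SimpleAgent(),
    pommerman.agents.SimpleAgent(),
    pommerman.agents.SimpleAgent(),
]
env = pommerman.make('PommeFFACompetition-v0', agent_list)
state = env.reset()
done = False
while not done:
    actions = env.act(state)  # queries every agent, including PytorchAgent.act
    state, reward, done, info = env.step(actions)
env.close()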
class PPO():
    def __init__(self,
                 print_output=False,
                 file_name=None,
                 eval=False,
                 eval_cycle=16,
                 save_interval=1e6,
                 dist_mode='easy',
                 use_background=False,
                 model_path=MODEL_PATH,
                 data_path=DATA_PATH):
        # Save parameters from hyperparameters module
        self.total_steps = h.total_steps
        self.num_envs = h.num_envs
        self.num_levels = h.num_levels
        self.num_steps = h.num_steps
        self.num_epochs = h.num_epochs
        self.batch_size = h.batch_size
        self.eps = h.eps
        self.grad_eps = h.grad_eps
        self.value_coef = h.value_coef
        self.entropy_coef = h.entropy_coef
        self.lr = h.lr
        self.gamma = h.gamma
        self.lmbda = h.lmbda
        self.version = h.version
        self.time_limit = (60 * 60 * h.time_limit_hours
                           + 60 * h.time_limit_minutes
                           + h.time_limit_seconds)
        self.value_clipping = h.value_clipping
        self.death_penalty = h.death_penalty
        self.penalty = h.penalty
        self.save_interval = save_interval
        self.step_start = 0
        self.dist_mode = dist_mode
        self.use_background = use_background
        self.model_path = model_path
        self.data_path = data_path

        # Create file_name
        self.file_name = self.create_file_name(file_name)
        self.eval = eval
        self.eval_cycle = eval_cycle
        self.print_output = print_output

        # Create model
        if h.encoder == "Nature":
            self.encoder = NatureEncoder(in_channels=h.in_channels,
                                         feature_dim=h.feature_dim)
        elif h.encoder == "Impala":
            self.encoder = ImpalaEncoder(in_channels=h.in_channels,
                                         feature_dim=h.feature_dim)  # TODO
        else:
            raise ValueError('Only valid encoders are "Nature" and "Impala"')
        self.policy = Policy(encoder=self.encoder,
                             feature_dim=h.feature_dim,
                             num_actions=15)
        self.policy.cuda()
        self.optimizer = h.optimizer(self.policy.parameters(),
                                     lr=self.lr,
                                     eps=h.opt_extra)
        self.env = make_env(self.num_envs,
                            num_levels=self.num_levels,
                            dist_mode=self.dist_mode,
                            use_backgrounds=self.use_background)

        if print_output:
            print('Observation space:', self.env.observation_space)
            print('Action space:', self.env.action_space.n)

        # Define temporary storage
        self.storage = self.create_storage()

    def create_storage(self):
        return Storage(self.env.observation_space.shape,
                       self.num_steps,
                       self.num_envs,
                       gamma=self.gamma,
                       lmbda=self.lmbda)

    def create_file_name(self, file_name):
        if file_name is not None:
            return file_name
        else:
            now = datetime.now(timezone('Europe/Copenhagen'))
            return self.version + '_Run_' + now.strftime("%d%b_%Hh%Mm%Ss")

    def init_log_files(self):
        create_data_file(self.file_name + '.csv', data_path=self.data_path)
        add_to_data_file("Step, Mean reward\n",
                         self.file_name + '.csv',
                         data_path=self.data_path)
        create_data_file(self.file_name + '.txt', data_path=self.data_path)
        add_to_data_file("Parameter name, Value\n",
                         self.file_name + '.txt',
                         data_path=self.data_path)
        if self.eval:
            create_data_file(self.file_name + '_EVAL' + '.csv',
                             data_path=self.data_path)
            # Add header
            header = "step,"
            for i in range(self.num_envs):
                header += "env_{}(mean),env_{}(var),".format(i, i)
            header += "avg\n"
            add_to_data_file(header,
                             self.file_name + '_EVAL' + '.csv',
                             data_path=self.data_path)

        # Run through hyperparameters and log them
        hyperpar_string = ""
        for key, val in vars(self).items():
            if key in ["encoder", "print_output", "policy", "optimizer", "storage", "env"]:
                continue
            hyperpar_string += "{}, {}\n".format(key, val)
        add_to_data_file(hyperpar_string,
                         self.file_name + '.txt',
                         data_path=self.data_path)
    #endregion

    #region training
    def train(self):
        """ Run training """
        # Init logs
        self.init_log_files()
        self.start_time = time.time()
        obs = self.env.reset()
        step = self.step_start
        m_counter = 1
        while step < self.total_steps:
            # If time limit exceeded, stop early
            if self.is_time_spent():
                self.end_training(step)
                return self.policy

            # Use policy to collect data for num_steps steps
            self.run_policy(obs)

            # Optimize policy
            self.optimize_policy()

            # TODO: put in method
            # Save model every now and then
            if step > self.step_start + m_counter * self.save_interval:
                self.save_policy(self.file_name + "_{}steps".format(step))
                m_counter += 1

            # Update stats
            step += self.num_envs * self.num_steps
            if self.print_output:
                print(f'Step: {step}\tMean reward: {self.storage.get_reward()}')
            add_to_data_file("{}, {}\n".format(step, self.storage.get_reward()),
                             self.file_name + '.csv',
                             data_path=self.data_path)
            if int((step / (self.num_envs * self.num_steps)) % self.eval_cycle) == 0:
                total_reward, all_episode_rewards = self.evaluate_policy(
                    min(50, self.num_levels),
                    eval_dist_mode=self.dist_mode,
                    eval_use_background=self.use_background)
                if self.print_output:
                    print("Evaluation done with avg score of {:10f}".format(total_reward))
                add_to_data_file("{},".format(step),
                                 self.file_name + '_EVAL' + '.csv',
                                 data_path=self.data_path)
                for key in sorted(all_episode_rewards.keys()):
                    add_to_data_file("{:10f}, {:10f},".format(
                        np.mean(all_episode_rewards[key]),
                        np.var(all_episode_rewards[key])),
                        self.file_name + '_EVAL' + '.csv',
                        data_path=self.data_path)
                add_to_data_file("{:10f}\n".format(total_reward),
                                 self.file_name + '_EVAL' + '.csv',
                                 data_path=self.data_path)
        # end while loop

        if self.print_output:
            print('Completed training!')
        self.end_training(step)
        return self.policy

    def end_training(self, last_step):
        # Add to log file
        add_to_data_file('Time spent (in seconds), {:.2f}\n'.format(time.time() - self.start_time)
                         + "Steps taken, {}\n".format(last_step)
                         + "Done, False\n",
                         self.file_name + '.txt',
                         data_path=self.data_path)
        self.save_policy(self.file_name + "_{}steps".format(last_step))

    def save_policy(self, file_name, model_path=None):
        if model_path is None:
            model_path = self.model_path
        if self.print_output:
            print("Saved current model in models folder with name {}.pt".format(file_name))
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, model_path + file_name + '.pt')

    def load_policy(self, file_name, model_path=MODEL_PATH, data_path=None):
        if data_path is None:
            data_path = self.data_path
        checkpoint = torch.load(model_path + file_name + '.pt')
        self.policy.load_state_dict(checkpoint["policy_state_dict"])
        self.policy.cuda()
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if self.print_output:
            print("Loaded current model from models folder with name {}.pt".format(file_name))

        # Save old step count
        if "steps" in file_name:
            self.step_start = int(file_name.split("_")[-1].replace("steps", ""))
        # Otherwise manually read the last step from the log file
        else:
            f = open(data_path + file_name + '.csv', "r")
            for last_line in f:
                pass
            f.close()
            last_line = last_line.rstrip()  # Remove a trailing newline
            steps, reward = last_line.split(",")
            self.step_start = int(steps)

        # Update file_name
        if "steps" in file_name or "loaded" in file_name:
            new_name = ""
            for sub_str in file_name.split("_"):
                if "steps" in sub_str or "loaded" in sub_str:
                    break
                new_name += sub_str + "_"
            file_name = new_name[:-1]
        now = datetime.now(timezone('Europe/Copenhagen'))
        self.file_name = file_name + "_loaded_" + now.strftime("%d%b_%Hh%Mm%Ss")
        self.total_steps += self.step_start
        return self.policy

    def is_time_spent(self):
        time_spent = time.time() - self.start_time
        return time_spent > self.time_limit

    def run_policy(self, obs):
        self.policy.eval()
        for _ in range(self.num_steps):
            # Use policy
            action, log_prob, value = self.policy.act(obs)

            # Take step in environment
            next_obs, reward, done, info = self.env.step(action)
            if self.death_penalty:
                reward = reward - self.penalty * done

            # Store data
            self.storage.store(obs, action, reward, done, info, log_prob, value)

            # Update current observation
            obs = next_obs

        # Add the last observation to collected data
        _, _, value = self.policy.act(obs)
        self.storage.store_last(obs, value)

        # Compute return and advantage
        self.storage.compute_return_advantage()

    def optimize_policy(self):
        # Optimize policy
        self.policy.train()
        for _ in range(self.num_epochs):
            # Iterate over batches of transitions
            generator = self.storage.get_generator(self.batch_size)
            for batch in generator:
                # Results from using the old policy on the environment
                b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

                # Get current policy outputs
                new_dist, new_value = self.policy(b_obs)
                new_log_prob = new_dist.log_prob(b_action)

                # Clipped policy objective
                pi_loss = ClippedPPOLoss(advantage=b_advantage,
                                         log_pi=new_log_prob,
                                         log_old_pi=b_log_prob,
                                         eps=self.eps)

                # Clipped value function objective
                if self.value_clipping:
                    value_loss = ClippedValueFunctionLoss(value=new_value,
                                                          sampled_value=b_value,
                                                          sampled_return=b_returns,
                                                          clip=self.eps)
                else:
                    value_loss = ValueFunctionLoss(new_value=new_value,
                                                   old_value=b_value)

                # Entropy loss
                entropy_loss = new_dist.entropy().mean()

                # Backpropagate losses
                loss = -(pi_loss - self.value_coef * value_loss + self.entropy_coef * entropy_loss)
                loss.backward()

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(), self.grad_eps)

                # Update policy
                self.optimizer.step()
                self.optimizer.zero_grad()
    #endregion

    #region evaluation
    def evaluate_policy(self,
                        nr_of_levels,
                        print_output=False,
                        normalize_reward=True,
                        eval_dist_mode='easy',
                        eval_use_background=False):
        """ TODO: Add video generation """
        model = self
        policy = model.policy

        # Pick levels we did not train on
        eval_env = make_env(model.num_envs,
                            start_level=model.num_levels,
                            num_levels=nr_of_levels,
                            normalize_reward=normalize_reward,
                            dist_mode=eval_dist_mode,
                            use_backgrounds=eval_use_background)
        obs = eval_env.reset()

        # Book-keeping
        completed_envs = []
        counter_compl_envs = np.zeros(model.num_envs)
        episode_rewards = np.zeros(model.num_envs)  # Current episode rewards
        rewards = {}
        for i in range(model.num_envs):
            rewards[i] = []
        step_counter = 0

        policy.eval()
        while True:
            # Use policy
            action, log_prob, value = policy.act(obs)

            # Take step in environment
            obs, reward, done, info = eval_env.step(action)

            # If any reward, update envs that are still not done
            for i in range(len(reward)):
                if reward[i] != 0 and i not in completed_envs:
                    episode_rewards[i] += reward[i]

            # If a new environment is done, complete it
            for i in [index for index in range(len(done)) if done[index]]:
                if i not in completed_envs:
                    counter_compl_envs[i] += 1
                    if print_output:
                        print("Environment {:2d} completed its {:4d}th level at timestep {:6d} with a reward of {:10f}"
                              .format(i, int(counter_compl_envs[i]), step_counter, episode_rewards[i]))
                    rewards[i].append(episode_rewards[i])
                    episode_rewards[i] = 0
                    if counter_compl_envs[i] == nr_of_levels:
                        completed_envs.append(i)

            # If all environments are done, break
            if len(completed_envs) == model.num_envs:
                break
            step_counter += 1
        # end while

        # Calculate average return
        total_reward = []
        for key, value in rewards.items():
            total_reward.append(sum(value))
        total_reward = np.mean(total_reward) / nr_of_levels
        if print_output:
            print('Average return:', total_reward)
        policy.train()
        return total_reward, rewards
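# A minimal usage sketch for the PPO class above, assuming the hyperparameters
# module h, the encoders, Storage, and make_env are importable from this package.
if __name__ == '__main__':
    ppo = PPO(print_output=True, eval=True, eval_cycle=16)
    trained_policy = ppo.train()  # runs until h.total_steps or the time limit
    ppo.save_policy(ppo.file_name + "_final")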
class Model:
    def __init__(self, transfer=False):
        if transfer:
            self.nn_kwargs = {
                'batch_norm': True,
                'recurrent': False,
                'hidden_size': 512,
            }  # Found in main.py
        else:
            self.nn_kwargs = {
                'batch_norm': True,
                'recurrent': True,
                'hidden_size': 512,
            }  # Found in main.py
        self.config = {
            'recode_agents': True,
            'compact_powerups': True,
            'compact_structure': True,
            'rescale': True,
        }  # Found in pommerman.py
        self.num_channels = 15  # Found in pommerman.py
        self.transfer = transfer
        if self.config['recode_agents']:
            self.num_channels -= 2
        if self.config['compact_powerups']:
            self.num_channels -= 2
        if self.config['compact_structure']:
            self.num_channels -= 2
        obs_unflat = get_unflat_obs_space(
            self.num_channels, 11, self.config['rescale'])  # 11 is the board size and is constant
        min_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].low.flatten(), obs_unflat.spaces[1].low])
        max_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].high.flatten(), obs_unflat.spaces[1].high])
        self.observation_space = spaces.Box(min_flat_obs, max_flat_obs)
        self.masks = torch.zeros(1, 1)  # Constant when recurrent == False
        self.policy = Policy(PommNet(obs_shape=self.observation_space.shape,
                                     **self.nn_kwargs),
                             action_space=spaces.Discrete(6))
        if not transfer:
            self.params = self.policy.state_dict()
        else:
            self.params = torch.load('../PommeFFACompetitionFast-v0.pt')[0]
        self.policy.load_state_dict(self.params)
        self.recurrent_hidden_state = torch.zeros(
            1, self.policy.recurrent_hidden_state_size)

    def copy(self):
        copy = Model(transfer=self.transfer)
        copy.params = self.params
        copy.policy.load_state_dict(copy.params)
        return copy

    def update_params(self, epsilon, rewards, learning_rate):
        for idx, reward in enumerate(rewards):
            for key, weights in epsilon[idx].items():
                self.params[key] += learning_rate * 1 / len(rewards) * reward * weights
        self.policy.load_state_dict(self.params)

    def shape(self):
        shape_dict = {}
        for key, weights in self.params.items():
            shape_dict[key] = weights.shape
        return shape_dict

    def act(self, state):
        new_obs = state
        _, action, _, self.recurrent_hidden_state = self.policy.act(
            new_obs, self.recurrent_hidden_state, self.masks)
        return action.numpy()
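# A hypothetical evolution-strategies loop around the Model class above: sample
# Gaussian perturbations shaped like the policy parameters, score each perturbed
# copy, then apply the reward-weighted update via update_params. rollout() is an
# assumed evaluation helper; sigma and the population size are illustrative.
model = Model(transfer=True)
sigma, learning_rate, population = 0.05, 0.01, 8
epsilon, rewards = [], []
for _ in range(population):
    noise = {key: sigma * torch.randn(shape) for key, shape in model.shape().items()}
    perturbed = model.copy()
    perturbed.params = {key: model.params[key] + noise[key] for key in noise}
    perturbed.policy.load_state_dict(perturbed.params)
    epsilon.append(noise)
    rewards.append(rollout(perturbed))  # assumed helper returning an episode reward
model.update_params(epsilon, rewards, learning_rate)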
import gym
import torch

from policy import Policy

DEVICE = 'cpu'

env = gym.make('CartPole-v0')
env._max_episode_steps = 10000
env = gym.wrappers.Monitor(env, "./monitor_output", force=True)

policy = Policy()
policy.load_state_dict(torch.load('trained_policy_20201105-135133.pth'))

state = env.reset()
for _ in range(10000):
    action, _ = policy.act(state, DEVICE)
    state, reward, done, _ = env.step(action)
    if done:
        break
env.close()
    'visdom': False,
    'seed': 41,
    'max_step_length': 10000,
    'observation_space': OBSERVATION_SPACE,
    'action_space': ACTION_SPACE,
    'reward_function': reward,
    'observation_function': observation,
    'action_function': action,
})

agent_policy = Policy()
agent_policy.setup()

observation = env.reset()
total_reward = 0.
for i in range(1000):
    action = agent_policy.act(observation)
    observation, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        print("simulation ended")
        break

env.close()
agent_policy.teardown()
print("Accumulated reward:", total_reward)
class MultiAgentDDPG:
    def __init__(self, env: UnityMlFacade, device, seed, verbose=1, gamma=0.99,
                 actor_learning_rate=0.001, critic_learning_rate=0.001,
                 buffer_size=100000, batch_size=100, snapshot_window=5,
                 hidden_layers_comma_sep='400,30'):
        self.env = env
        self.device = device
        self.seed = seed
        self.verbose = verbose
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.snapshot_window = snapshot_window
        self.policy_snapshots = deque(maxlen=self.snapshot_window)
        self.current_policy_snapshot = -1
        self.last_save = 0
        self.last_swap = 0
        self.action_size = self.env.action_space.shape[0] * self.env.num_agents
        self.state_size = self.env.observation_space.shape[0] * self.env.num_agents  # this should be 48
        hidden_layers = [int(layer_width) for layer_width in hidden_layers_comma_sep.split(',')]

        # create agent 1
        self.player_policy = Policy(0, state_size=self.state_size, action_size=self.action_size,
                                    hidden_dims=hidden_layers, device=self.device,
                                    actor_learning_rate=actor_learning_rate,
                                    critic_learning_rate=critic_learning_rate,
                                    random_seed=seed)
        # create agent 2
        self.opponent_policy = Policy(1, state_size=self.state_size, action_size=self.action_size,
                                      hidden_dims=hidden_layers, device=self.device,
                                      actor_learning_rate=actor_learning_rate,
                                      critic_learning_rate=critic_learning_rate,
                                      random_seed=seed)
        self.t_step = 0

    def learn_random(self, total_timesteps, callback=None):
        # start with random actions, just to test the loop
        action_size = self.env.action_space.shape[0]
        for i in range(1, 6):
            scores = np.zeros(self.env.num_agents)
            states, rewards, dones = self.env.reset()
            while True:
                actions = np.random.randn(self.env.num_agents, action_size)
                actions = np.clip(actions, -1, 1)
                next_states, rewards, dones, info = self.env.step(actions)
                scores += rewards
                states = next_states
                if np.any(dones):
                    break
            print('Score (max over agents) from episode {}: {} in steps: {}'.format(
                i, np.max(scores), self.env.episode_step))

    def learn(self, total_timesteps, callback):
        ou_scale = 1.0     # initial scaling factor; slowly decreases to 0
        ou_decay = 0.9995  # decay of the scaling factor ou_scale
        ou_mu = 0.0        # asymptotic mean of the noise
        ou_theta = 0.15    # magnitude of the drift term
        ou_sigma = 0.20    # magnitude of the diffusion term

        # create the noise process
        noise_process = OUNoise(self.action_size, ou_mu, ou_theta, ou_sigma)

        # create the replay buffer
        buffer = ReplayBuffer(seed=self.seed, action_size=self.action_size,
                              buffer_size=self.buffer_size, batch_size=self.batch_size,
                              device=self.device)

        self.t_step = 0
        episode = 0
        while self.t_step < total_timesteps:
            callback.on_start_episode(episode)
            episode_scores = np.zeros(self.env.num_agents)
            states, _, _ = self.env.reset()
            scores = np.zeros(2)
            while True:
                # reshape so we can feed both agents' states to each agent
                states = np.reshape(states, (1, 48))

                # split the states into the parts observed by each agent
                states_0 = states[0, :24].reshape((1, 24))
                states_1 = states[0, 24:].reshape((1, 24))

                # generate noise
                noise = ou_scale * noise_process.get_noise().reshape((1, 4))
                # split the noise into the parts for each agent
                noise_0 = noise[0, :2].reshape((1, 2))
                noise_1 = noise[0, 2:].reshape((1, 2))

                # determine actions for the unity agents from the current state,
                # using noise for exploration
                actions_0 = self.player_policy.act(states_0, use_target=False, add_noise=True,
                                                   noise_value=noise_0).detach().cpu().numpy()
                actions_1 = self.opponent_policy.act(states_1, use_target=False, add_noise=True,
                                                     noise_value=noise_1).detach().cpu().numpy()
                actions = np.vstack((actions_0.flatten(), actions_1.flatten()))

                # take the action in the environment
                next_states, rewards, dones, info = self.env.step(actions)

                # store (S, A, R, S') info in the replay buffer (memory)
                buffer.add(states.flatten(), actions.flatten(), rewards, next_states.flatten(), dones)

                episode_scores += rewards
                states = next_states
                self.t_step += 1

                """ Policy learning """
                # train the agents if we have enough replays in the buffer
                if len(buffer) >= self.batch_size:
                    self.player_policy.learn(buffer.sample(), self.opponent_policy)
                    self.opponent_policy.learn(buffer.sample(), self.player_policy)

                if np.any(dones):
                    break

            if not callback.on_step(np.max(episode_scores), self.t_step):
                break

            # decrease the scaling factor of the noise
            ou_scale *= ou_decay
            episode += 1

    def save(self, model_folder):
        # Save trained Actor and Critic network weights for agent 1
        an_filename = os.path.join(model_folder, "ddpg_player_actor.pth")
        torch.save(self.player_policy.actor.state_dict(), an_filename)
        cn_filename = os.path.join(model_folder, "ddpg_player_critic.pth")
        torch.save(self.player_policy.critic.state_dict(), cn_filename)

        # Save trained Actor and Critic network weights for agent 2
        an_filename = os.path.join(model_folder, "ddpg_opponent_actor.pth")
        torch.save(self.opponent_policy.actor.state_dict(), an_filename)
        cn_filename = os.path.join(model_folder, "ddpg_opponent_critic.pth")
        torch.save(self.opponent_policy.critic.state_dict(), cn_filename)

    def _save_snapshot(self, policy: Policy) -> None:
        """save a snapshot of the provided Policy weights"""
        weights = policy.get_weights()
        self.policy_snapshots.append(weights)
        self.current_policy_snapshot = weights

    def _swap_snapshots(self) -> None:
        # note: play_against_current_self_ratio is assumed to be set elsewhere on this class
        if np.random.uniform() < (1 - self.play_against_current_self_ratio):
            x = np.random.randint(len(self.policy_snapshots))
            snapshot = self.policy_snapshots[x]
            self.current_opponent = x
        else:
            snapshot = self.current_policy_snapshot
            self.current_opponent = -1
        self.opponent_policy.load_weights(snapshot)

    def _step(self, states, actions, rewards, next_states, dones, info):
        """This method is called each training step with our (s,a,r,s',done)
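# A minimal training-harness sketch for MultiAgentDDPG. The callback protocol is
# inferred from the calls in learn() above (on_start_episode / on_step returning
# True to continue); the environment construction is an assumption.
class PrintCallback:
    def on_start_episode(self, episode):
        self.episode = episode

    def on_step(self, episode_score, t_step):
        print('episode {}: score {:.3f} at step {}'.format(self.episode, episode_score, t_step))
        return True  # keep training

env = UnityMlFacade(...)  # assumed constructor, matching the annotation above
maddpg = MultiAgentDDPG(env, device='cpu', seed=0)
maddpg.learn(total_timesteps=100000, callback=PrintCallback())
maddpg.save('models')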
device = torch.device("cuda" if args.cuda else "cpu")

assert args.vae, "You need to provide a VAE file."
assert args.policy, "You need to provide a policy file."

env = gym.make(args.env)
env = CropCarRacing(env)
env = ResizeObservation(env, (32, 32, 3))
env = Scolorized(env, weights=[0.0, 1.0, 0.0])
env = NormalizeRGB(env)
env = VAEObservation(env, args.vae, arch=args.arch)

policy = Policy(env)
policy.load_state_dict(torch.load(args.policy))
policy.eval()

env.seed(args.seed)
for i in trange(args.episodes):
    obs = env.reset()
    done = False
    step = 0
    rtotal = 0
    while not done and step < args.horizon:
        action, action_proba = policy.act(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        rtotal += reward
        step += 1
    print(rtotal)
env.close()
policy = Policy(policy_env)
policy.load_state_dict(torch.load(args.policy))
policy.eval()

VAE_class = VAEbyArch(args.arch)
vae = VAE_class(latent_size=args.latent_size)
vae.load_state_dict(torch.load(args.vae))
vae.eval()

# Data generation
dataset = []
obs = env.reset()
step = 0
for i in trange(args.size):
    if args.policy:
        obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
        mu, _ = vae.encode(obs_torch)
        action, action_proba = policy.act(mu.detach().numpy())
        action = action[0]
    else:
        action = env.action_space.sample()
        action = [action[0], 0.3, 0.0]
    obs, reward, done, info = env.step(action)
    step += 1
    # env.render()
    dataset.append(obs)
    if done or step >= args.horizon:
        env.seed(args.seed + i)
        obs = env.reset()
        step = 0
env.close()

np.random.shuffle(dataset)
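# Persisting the shuffled dataset is an assumed final step, not shown in the
# snippet above; np.save writes the stacked observations for later training.
np.save('dataset.npy', np.array(dataset))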
policy = Policy()
policy.load_state_dict(parameters['policy_state_dic'])  # load saved parameters into the policy network
policy = policy.to(device)

N = 2000
ac = Acrobot(m1, l1, m2, l2)  # create acrobot object with saved parameters
ac.reset()

torque, t = np.zeros((N, 1), dtype=int), np.zeros((N, 1), dtype=float)
r = np.zeros((N, 1), dtype=float)
s = np.zeros((N, 4), dtype=float)

# generate a trajectory with the optimized policy network
for i in range(N):
    a, _ = policy.act(ac.state)
    s[i, :], r[i] = ac.step(a)
    torque[i], t[i] = ac.torque, i * ac.dt

height = -l1 * cos(s[:, 0]) - l2 * cos(s[:, 0] + s[:, 1])

plt.figure(1)
plt.plot(t, height, linewidth=3, label="height")
plt.legend(fontsize=20, loc='best')
plt.grid()
plt.savefig('acrobot_height.png', dpi=300)

plt.figure(2)
plt.plot(t, torque, 'k', linewidth=3, label="Motor torque")
plt.legend(fontsize=20, loc='best')
plt.grid()
plt.savefig('acrobot_torque.png', dpi=300)
    'visdom': False,
    'seed': seed,
    'max_step_length': 10000,
    'observation_space': OBSERVATION_SPACE,
    'action_space': ACTION_SPACE,
    'reward_function': reward,
    'observation_function': observation,
    'action_function': action,
})

accumulated_reward = 0
for i in range(10):
    env_obs = env.reset()
    total_reward = 0.
    for _ in range(1000):
        pred_action = agent_policy.act(env_obs)
        env_obs, env_reward, done, _ = env.step(pred_action)
        total_reward += env_reward
        if done:
            # print("simulation ended")
            break
    accumulated_reward += total_reward
    print("Iteration {} on track {} with {} vehicles: {}".format(str(i), track, str(nvehicle), total_reward))

evaluation_reward += accumulated_reward
print("Total on track {} with {} vehicles: {}".format(track, str(nvehicle), accumulated_reward))
print("##########")

env.close()
agent_policy.teardown()
# Evaluate policy
in_channels = eval_env.observation_space.shape[0]
feature_dim = 512
num_actions = eval_env.action_space.n

encoder = Impala(in_channels, feature_dim)
policy = Policy(encoder.cuda(), feature_dim, num_actions)
policy.load_state_dict(torch.load('checkpoint.pt'))
policy.cuda()
policy.eval()

obs = eval_env.reset()
frames = []
total_reward = []

for _ in range(512):
    # Use policy
    action, log_prob, value = policy.act(obs)

    # Take step in environment
    obs, reward, done, info = eval_env.step(action)
    total_reward.append(torch.Tensor(reward))

    # Render environment and store frame
    frame = (torch.Tensor(eval_env.render(mode='rgb_array')) * 255.).byte()
    frames.append(frame)

# Calculate average return
total_reward = torch.stack(total_reward).sum(0).mean(0)
print('Average return:', total_reward)

# Save frames as video
frames = torch.stack(frames)
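# One common way to finish the video export above (assuming the imageio package
# with an ffmpeg backend is installed): write the stacked HWC uint8 frames to an mp4.
import imageio
imageio.mimsave('vid.mp4', frames.numpy(), fps=25)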