def test_label_buffer():
    import matplotlib.pyplot as plt
    import random
    import cv2  # needed for the interpolation flags below; `resize` is assumed to be the module-level resize helper
    from doom_rdqn.arguments import parse_game_args

    params = parse_game_args()
    params.decimate = False
    env = DoomEnvironment(params)

    # Take a few random actions so the game state contains something interesting.
    for i in range(10):
        env.make_action(random.choice(list(range(8))))

    state = env.game.get_state()
    labels_buffer = state.labels_buffer
    labels = state.labels

    # Side-by-side: RGB observation vs. labels buffer
    plt.subplot(1, 2, 1)
    plt.imshow(env.get_observation().transpose(1, 2, 0))
    plt.subplot(1, 2, 2)
    plt.imshow(labels_buffer)

    plt.figure()
    plt.imshow(resize(labels_buffer, (56, 32), cv2.INTER_AREA))
    plt.figure()
    plt.imshow(resize(env.get_observation().transpose(1, 2, 0), (112, 64), cv2.INTER_AREA))

    data = env.get_observation()

    def resize_test(image):
        return resize(image.transpose(1, 2, 0), (112, 64)).transpose(2, 0, 1)
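# Hedged note: state.labels_buffer above is only populated when the labels buffer has been
# enabled on the underlying vizdoom.DoomGame before init (either in the scenario .cfg or in
# code). A minimal sketch, assuming DoomEnvironment exposes the game as env.game the way
# test_label_buffer() uses it:
def enable_labels_buffer_sketch(game):
    # Must be called before game.init(); otherwise get_state().labels_buffer is None.
    game.set_labels_buffer_enabled(True)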
def evaluate_saved_model():
    params = parse_game_args()
    env = DoomEnvironment(params, is_train=True)
    print(env.num_actions)
    obs_shape = (3, params.screen_height, params.screen_width)

    actor_critic = CNNPolicy(obs_shape[0], obs_shape, params)

    assert params.model_checkpoint, 'No model checkpoint found'
    assert os.path.isfile(params.model_checkpoint), 'The model could not be loaded'
    # The map_location lambda is required, otherwise torch will try to load onto the GPU.
    checkpoint = torch.load(params.model_checkpoint,
                            map_location=lambda storage, loc: storage)
    actor_critic.load_state_dict(checkpoint['model'])
    base_filename = params.model_checkpoint.split('.')[0].split('/')[1]

    agent = BaseAgent(actor_critic, params)

    for i in range(params.num_mazes_test):
        env = DoomEnvironment(params, idx=i, is_train=True)
        movie_name = 'videos/{}_rollout_{:04d}.mp4'.format(base_filename, i)
        print('Creating movie {}'.format(movie_name))
        make_movie(agent, env, movie_name, params)
    logger.write('Step: {:04d}, Game rewards: {}, Game times: {}'.format(
        step, reward_list, time_list))


def write_movie(params, logger, observations, step, score):
    # Frames are assumed to be CHW floats in [0, 1]; convert to HWC 8-bit images for moviepy.
    observations = [(o.transpose(1, 2, 0) * 255.0).astype('uint8') for o in observations]
    clip = ImageSequenceClip(observations, fps=int(30 / params.frame_skip))
    output_dir = logger.get_eval_output()
    clip.write_videofile('{}eval{:04d}_{:05.0f}.mp4'.format(output_dir, step, score * 100))


if __name__ == '__main__':
    # Test to improve the movie with action probs, values, etc.
    params = parse_game_args()
    params.norm_obs = False
    params.recurrent_policy = True
    envs = MultiEnvs(params.simulator, 1, 1, params)
    obs_shape = envs.obs_shape
    obs_shape = (obs_shape[0] * params.num_stack, *obs_shape[1:])

    model = CNNPolicy(obs_shape[0], envs.num_actions, params.recurrent_policy, obs_shape)
    env = DoomEnvironment(params)
    agent = BaseAgent(model, params)

    env.reset()
    agent.reset()
    rewards = []
    obss = []
def gen_classic(selh, file, scenario=False, model="model_final"):
    params = parse_game_args()

    # Load the scenario
    if not scenario:
        params.scenario = "custom_scenario003.cfg"
    else:
        params.scenario = scenario
    env = DoomEnvironment(params)
    device = torch.device("cuda" if False else "cpu")  # forced onto CPU
    num_actions = env.num_actions

    # Load the base model
    network = CNNPolicy(3, num_actions, True, (3, 64, 112)).to(device)
    checkpoint = torch.load('models/' + model + '.pth.tar',
                            map_location=lambda storage, loc: storage)

    # Rename the state_dict keys that no longer match the current model definition
    checkpoint['model']["dist.linear.weight"] = checkpoint['model']["dist_linear.weight"]
    del checkpoint['model']["dist_linear.weight"]
    checkpoint['model']["dist.linear.bias"] = checkpoint['model']["dist_linear.bias"]
    del checkpoint['model']["dist_linear.bias"]

    network.load_state_dict(checkpoint['model'])
    agent = BaseAgent(network, params)
    ERU = {'env': env, 'agent': agent}

    # Load the checkpoints
    num_checkpoints = [98, 98, 159]
    checkpoints = [1] * sum(num_checkpoints)
    networks = [1] * sum(num_checkpoints)
    agents = [1] * sum(num_checkpoints)
    ERUs = [1] * sum(num_checkpoints)
    for i in range(len(num_checkpoints)):
        for j in range(num_checkpoints[i]):
            idx = sum(num_checkpoints[:i]) + j
            # if i == 0:
            #     checkpoint_filename = '/home/adam/Bureau/Transfer Learning/5 - 28-03-21/checkpoint_{}_{}.pth.tar'.format(str(i + 1), str(j + 88))
            # else:
            checkpoint_filename = '/home/adam/Bureau/Transfer Learning/5 - 28-03-21/checkpoint_{}_{}.pth.tar'.format(str(i + 1), str(j + 1))
            checkpoints[idx] = torch.load(checkpoint_filename,
                                          map_location=lambda storage, loc: storage)

            # Rename the state_dict keys that no longer match the current model definition
            checkpoints[idx]['model']["dist.linear.weight"] = checkpoints[idx]['model']["dist_linear.weight"]
            del checkpoints[idx]['model']["dist_linear.weight"]
            checkpoints[idx]['model']["dist.linear.bias"] = checkpoints[idx]['model']["dist_linear.bias"]
            del checkpoints[idx]['model']["dist_linear.bias"]

            networks[idx] = CNNPolicy(3, num_actions, True, (3, 64, 112)).to(device)
            networks[idx].load_state_dict(checkpoints[idx]['model'])
            agents[idx] = BaseAgent(networks[idx], params)
            ERUs[idx] = {'env': env, 'agent': agents[idx]}
            ERUs[idx]['env'].reset()

    selhs = []
    for i in range(sum(num_checkpoints)):
        selh = tsne_1d_projection(127)
        selh = torch.from_numpy(selh).type(torch.FloatTensor)
        selh = Variable(selh, volatile=True)
        selhs.append(selh)

    scores = []
    hiddens = []
    inputs = []
    actions = []

    # Loop to collect the observations (and actions) of the base model
    obss = []
    actions = []
    for i in range(50):
        obs = ERU['env'].get_observation()
        action, value, action_probs, grads = ERU['agent'].get_action_value_and_probs_zeroes(
            obs, selh, epsilon=0.0)
        ERU['env'].make_action(int(action))
        obss.append(obs)
        actions.append(action)

    # Loop to evaluate each checkpoint on the situations seen by the base model
    for i in range(sum(num_checkpoints)):
        for obs2 in obss:
            action, value, action_probs, grads = ERUs[i]['agent'].get_action_value_and_probs_zeroes(
                obs2, selhs[i], epsilon=0.0)
            hidden = ERUs[i]['agent'].model.get_gru_h()
            h = ''
            for elem in hidden[0][0]:
                h += str(elem) + ","
            h = h[:-1]
            h = h.split(',')
            hiddens.append(h)
            ERU['env'].make_action(int(action))

    # Timeline image: one column per entry in hiddens, one row per GRU hidden unit.
    # NB: hiddens gains len(obss) entries per checkpoint while the image is only
    # sum(num_checkpoints) pixels wide, so putpixel can go out of range unless a single
    # hidden state is kept per checkpoint.
    im = Image.new('RGBA', (sum(num_checkpoints), 128))
    for i in range(len(hiddens)):
        for j in range(len(hiddens[i])):
            value = int((float(hiddens[i][j]) + 1) * 255 / 2)
            im.putpixel((i, j), (value, value, value, 255))
    im.show()
im.save("timeline.png") im = Image.new('P', (sum(num_checkpoints)-1, 128)) for i in range(len(hiddens)-1): for j in range(len(hiddens[i])): value = int((abs(float(hiddens[i][j])-float(hiddens[i+1][j])))*255*1.5) if value>255: value=255 im.putpixel((i, j), (value, value, value, 255)) im.show() im.save("variation.png")
def train():
    # define params
    params = parse_game_args()
    logger = Logger(params)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(params.num_frames) // params.num_steps // params.num_environments

    # environments
    envs = MultiEnvsMPPipes(params.simulator, params.num_environments, 1, params)

    obs_shape = envs.obs_shape
    obs_shape = (obs_shape[0] * params.num_stack, *obs_shape[1:])
    evaluator = Evaluator(params)

    print('creating model')
    actor_critic = CNNPolicy(obs_shape[0], obs_shape, params).to(device)
    print('model created')

    start_j = 0
    if params.reload_model:
        checkpoint_idx = params.reload_model.split(',')[1]
        checkpoint_filename = '{}models/checkpoint_{}.pth.tar'.format(
            params.output_dir, checkpoint_idx)
        assert os.path.isfile(checkpoint_filename), \
            'The model could not be found {}'.format(checkpoint_filename)
        logger.write('Loading model {}'.format(checkpoint_filename))

        # Without map_location the checkpoint is restored onto the GPU storage it was saved from.
        if device.type == 'cuda':
            checkpoint = torch.load(checkpoint_filename)
        else:
            checkpoint = torch.load(checkpoint_filename,
                                    map_location=lambda storage, loc: storage)
        actor_critic.load_state_dict(checkpoint['model'])
        start_j = (int(checkpoint_idx) // params.num_steps // params.num_environments) + 1

    print('creating optimizer')
    optimizer = optim.RMSprop(
        [p for p in actor_critic.parameters() if p.requires_grad],
        params.learning_rate,
        eps=params.eps,
        alpha=params.alpha,
        momentum=params.momentum)

    if params.reload_model:
        optimizer.load_state_dict(checkpoint['optimizer'])

    rollouts = RolloutStorage(params.num_steps, params.num_environments, obs_shape,
                              actor_critic.state_size, params)
    current_obs = torch.zeros(params.num_environments, *obs_shape)

    # For frame stacking: shift the stack down and write the newest frame into the last slot.
    def update_current_obs(obs):
        shape_dim0 = envs.obs_shape[0]
        obs = torch.from_numpy(obs).float()
        if params.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    print('getting first obs')
    obs = envs.reset()
    print('update current obs')
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([params.num_environments, 1])
    final_rewards = torch.zeros([params.num_environments, 1])

    current_obs = current_obs.to(device)
    rollouts.set_device(device)

    print('Starting training loop')
    start = time.time()
    print(num_updates)

    for j in range(start_j, num_updates):
        # STARTING no grad scope
        with torch.no_grad():
            if j % params.eval_freq == 0 and not params.skip_eval:
                print('Evaluating model')
                if params.simulator == 'doom':
                    actor_critic.eval()
                total_num_steps = (j + 1) * params.num_environments * params.num_steps
                # eval_model(actor_critic, params, logger, j, total_num_steps, params.eval_games)
                evaluator.evaluate(actor_critic, params, logger, j, total_num_steps,
                                   params.eval_games)
                actor_critic.train()

            # =================================================================
            # Take steps in the environment
            # =================================================================
            for step in range(params.num_steps):
                # Sample actions
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step],
                    rollouts.states[step],
                    rollouts.masks[step])
                cpu_actions = action.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward, done, info = envs.step(cpu_actions)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
                episode_rewards += reward

                # If done then create masks to clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                masks = masks.to(device)

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                rollouts.insert(step, current_obs, states, action, action_log_prob,
                                value, reward, masks)

            # =================================================================
            # Compute discounted returns, re-step through the environment
            # =================================================================
            next_value = actor_critic(rollouts.observations[-1],
                                      rollouts.states[-1],
                                      rollouts.masks[-1])[0]

            rollouts.compute_returns(next_value, params.use_gae, params.gamma, params.tau)
            # FINISHED no grad scope

        model_output = actor_critic.evaluate_actions(
            rollouts.observations[:-1].view(-1, *obs_shape),
            rollouts.states[0].view(-1, actor_critic.state_size),
            rollouts.masks[:-1].view(-1, 1),
            rollouts.actions.view(-1, 1))
        values, action_log_probs, dist_entropy, states = model_output

        values = values.view(params.num_steps, params.num_environments, 1)
        action_log_probs = action_log_probs.view(params.num_steps, params.num_environments, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * params.value_loss_coef + action_loss - dist_entropy * params.entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), params.max_grad_norm)
        optimizer.step()
        rollouts.after_update()

        if j % params.model_save_rate == 0:
            total_num_steps = (j + 1) * params.num_environments * params.num_steps
            checkpoint = {
                'step': step,
                'params': params,
                'model': actor_critic.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            filepath = logger.output_dir + 'models/'
            torch.save(checkpoint,
                       '{}checkpoint_{:010d}.pth.tar'.format(filepath, total_num_steps))

        if j % params.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * params.num_environments * params.num_steps
            save_num_steps = start_j * params.num_environments * params.num_steps
            logger.write(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int((total_num_steps - save_num_steps) / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.item(), value_loss.item(), action_loss.item()))

    evaluator.cancel()
    envs.cancel()
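# For reference, a minimal sketch of the plain discounted-return recursion that
# rollouts.compute_returns(next_value, use_gae, gamma, tau) is assumed to perform when
# use_gae is False (RolloutStorage's actual implementation, and the GAE branch, may differ,
# including how the masks are aligned with the rewards):
def compute_returns_sketch(rewards, masks, next_value, gamma):
    # rewards, masks: [num_steps, num_envs, 1]; next_value: [num_envs, 1];
    # masks[t] is 0.0 where the episode ended at step t, so the return does not leak
    # across episode boundaries.
    returns = [None] * (len(rewards) + 1)
    returns[-1] = next_value
    for step in reversed(range(len(rewards))):
        returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step]
    return returns[:-1]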
def test():
    def simulate_rollout(env):
        from random import choice
        buffer = []
        env.reset()
        k = 0
        while not env.is_episode_finished():
            k += 1
            obs = env.get_observation()
            buffer.append(obs)
            # Make a random action and save the reward.
            reward = env.make_action(choice(list(range(env.num_actions))))
        print('Game finished in {} steps'.format(k))
        print('Total rewards = {}'.format(env.get_total_reward()))
        return k, buffer

    # =========================================================================
    # Test the environment
    # =========================================================================
    from arguments import parse_game_args
    import matplotlib.pyplot as plt
    from skimage.transform import rescale, resize, downscale_local_mean
    from vizdoom import GameVariable  # assumed import for GameVariable.HEALTH below

    params = parse_game_args()
    env = DoomEnvironment(params)
    print(env.num_actions)
    print(env.game.get_available_buttons())
    print(len(env.action_map))
    print(env.game.get_screen_height(), env.game.get_screen_width())
    print(env.get_observation().shape)

    plt.imshow(env.get_observation().transpose(1, 2, 0))
    plt.figure()
    plt.imshow(env.get_observation().transpose(1, 2, 0))

    env.decimate = False

    def resize_obs(observation):
        observation = observation.transpose(1, 2, 0)
        observation = resize(observation,
                             (observation.shape[0] // 2, observation.shape[1] // 2))
        observation = observation.transpose(2, 0, 1)
        return observation

    data = env.get_observation().transpose(1, 2, 0)
    data_resized = resize(data, (data.shape[0] // 2, data.shape[1] // 2))
    plt.figure()
    plt.imshow(data_resized)

    obs = env.get_observation()
    obs_rs = resize_obs(obs)

    assert 0  # intentional early stop when only inspecting the observations

    for action in env.action_map.keys():
        reward = env.make_action(action)
        print(reward, env.is_episode_finished())

    for i in range(100):
        k, b = simulate_rollout(env)

    print(env.game.get_available_game_variables())
    print(env.game.get_game_variable(GameVariable.HEALTH))
def gen_classic(selh, file):
    params = parse_game_args()
    params.scenario = "health_gathering_supreme.cfg"
    env = DoomEnvironment(params)
    device = torch.device("cuda" if False else "cpu")  # forced onto CPU
    num_actions = env.num_actions

    network = CNNPolicy(3, num_actions, True, (3, 64, 112)).to(device)
    checkpoint = torch.load('models/' + "health_gathering_supreme" + '.pth.tar',
                            map_location=lambda storage, loc: storage)
    network.load_state_dict(checkpoint['model'])
    agent = BaseAgent(network, params)
    ERU = {'env': env, 'agent': agent}

    selh = torch.from_numpy(selh).type(torch.FloatTensor)
    selh = Variable(selh, volatile=True)

    ERU['env'].set_seed(randint(0, 999999999))
    ERU['env'].reset()

    scores = []
    hiddens = []
    inputs = []
    saliencies = []
    actions = []
    probabilities = []
    health = []
    positions = []
    orientations = []
    velocities = []
    items = []
    fov = []

    w = 0
    while not ERU['env'].is_episode_finished():
        # Encode the raw screen buffer as a JPEG for the trace file
        observation = io.BytesIO()
        obs = ERU['env'].get_observation()
        temp = ERU['env'].state.screen_buffer
        Image.fromarray(temp.transpose(1, 2, 0)).save(observation, format="JPEG")

        action, value, action_probs, grads = ERU['agent'].get_action_value_and_probs_zeroes(
            obs, selh, epsilon=0.0)

        # Serialise the GRU hidden state and the action probabilities as lists of strings
        hidden = ERU['agent'].model.get_gru_h()
        h = ''
        for elem in hidden[0][0]:
            h += str(elem) + ","
        h = h[:-1]
        h = h.split(',')

        probs = ""
        for elem in action_probs[0]:
            probs += str(elem) + ","
        probs = probs[:-1]
        probs = probs.split(',')

        # Saliency map returned by the agent, encoded as a grayscale JPEG
        sa = io.BytesIO()
        t = Image.fromarray(grads, 'L')
        t.save(sa, format="JPEG")

        scores.append(str(round(ERU['env'].game.get_total_reward(), 2)))
        hiddens.append(h)
        inputs.append(base64.b64encode(observation.getvalue()).decode('ascii'))
        saliencies.append(base64.b64encode(sa.getvalue()).decode('ascii'))
        actions.append(str(action))
        probabilities.append(probs)
        health.append(ERU['env'].get_health())
        positions.append(ERU['env'].get_pos())
        orientations.append(ERU['env'].get_ori())
        velocities.append(ERU['env'].get_velo())
        items.append(ERU['env'].get_item())
        fov.append(ERU['env'].get_fov())

        ERU['env'].make_action(int(action))
        print('Iteration', w, '/525')
        w += 1

    result = {
        'episode0': {
            'inputs': inputs,
            'actions': actions,
            'probabilities': probabilities,
            'saliencies': saliencies,
            'scores': scores,
            'positions': positions,
            'health': health,
            'hiddens': hiddens,
            'orientations': orientations,
            'velocities': velocities,
            'items': items,
            'fov': fov
        }
    }

    with open(file, 'w') as f:
        ujson.dump(result, f, indent=4, sort_keys=True)

    return result
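# Design note: the comma-join / split round trip in gen_classic() converts every hidden unit
# and action probability to a string one element at a time. A shorter sketch of the same
# serialisation (hypothetical helper, assuming get_gru_h() returns a tensor of shape
# [1, 1, hidden_size] and action_probs a tensor of shape [1, num_actions]):
def tensor_row_to_strings(row):
    return [str(float(v)) for v in row]

# Usage sketch:
#   hiddens.append(tensor_row_to_strings(hidden[0][0]))
#   probabilities.append(tensor_row_to_strings(action_probs[0]))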