def main():
    args = command_line_args()
    set_global_seeds(args.seed)

    model_dir = '{}/{}_{:%Y-%m-%d_%H:%M:%S}'.format(
        args.model_dir, args.exp_name, datetime.datetime.now())
    logger.configure(model_dir)

    num_env = args.num_env if not args.evaluate else 1
    train_envs, eval_env = make_atari_env(
        env_id=args.env_id, num_env=num_env, seed=args.seed)
    train_envs = VecFrameStack(train_envs, 4)
    eval_env = VecFrameStack(eval_env, 4)

    cnn = not args.use_mlp

    agent = A2CAgent(
        train_envs,
        eval_env,
        model_dir=model_dir,
        n_steps=args.n_steps,
        num_learning_steps=args.num_learning_steps,
        debug=args.debug,
        summary_every=args.summary_every,
        gamma=args.gamma,
        tensorboard_summaries=args.tensorboard_summaries,
        cnn=cnn,
        seed=args.seed,
        save_every=args.save_every,
        load_checkpoint=args.load_checkpoint,
        checkpoint_prefix=args.checkpoint_prefix)

    if args.evaluate:
        agent.evaluate()
    else:
        agent.learn()
def evaluate_saved_model():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(args.num_frames) // args.num_steps // args.num_environments

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    train_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator, args.num_environments, args2, is_train=False)

    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator, args.num_environments, args2, is_train=False)

    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator, args.num_environments, args2, is_train=False)

    obs_shape = train_envs.obs_shape

    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    obs = little_combs_test_envs.reset()

    num_checkpoints = 355
    for j in range(num_checkpoints):
        if j % 8 == 0:
            checkpoint_filename = '/home/adam/Bureau/Transfer Learning/FINAL/checkpoint_{}.pth.tar'.format(str(j + 1))
            agent.load_model(checkpoint_filename)

            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(medium_combs_test_envs, j, total_num_steps)

            writer.add_scalar("Reward classic levels", mean_rewards_classic, (j + 1) * 100)
            writer.add_scalar("Reward little combs levels", mean_rewards_little, (j + 1) * 100)
            writer.add_scalar("Reward medium combs levels", mean_rewards_medium, (j + 1) * 100)

            print(j)
import sys

# mkdir('log')
Path('log').mkdir(parents=True, exist_ok=True)
Path('data').mkdir(parents=True, exist_ok=True)

# set_one_thread()
# os.environ['OMP_NUM_THREADS'] = '1'
# os.environ['MKL_NUM_THREADS'] = '1'
# torch.set_num_threads(1)

# seed
np.random.seed(333)
torch.manual_seed(np.random.randint(int(1e6)))

config = Config()
agent = A2CAgent(config, Env('reacher.app', is_mock=config.is_mock))
agent_name = agent.__class__.__name__

t0 = time.time()
episode = 1
avg_scores = []
eval_scores = []
num_eval_episodes = 100
target_avg_score = 30.0
agent_last_steps = 0

# Todo: importable log for plotting
try:
    while True:
entropy_coef = 0.01
value_loss_coef = 0.5
num_frames_per_proc = 5  # num_frames_per_proc * num_procs = batch_size
train_epochs = 300000
test_episode = 10
log_interval = 100
test_interval = 1000
save_interval = 1000

env = make_env('BreakoutNoFrameskip-v4', seed, num_procs)
in_ch = env.observation_space.shape[-1]
n_action = env.action_space.n

model = CNNModel(in_ch, n_action)
obs_preproc = ObsPreproc(device=device)
agent = A2CAgent(model, env, obs_preproc, device, lr, gamma, entropy_coef, value_loss_coef)

test_env = make_env('BreakoutNoFrameskip-v4', seed, 1, clip_reward=False)
test_agent = TestAgent(model, test_env, obs_preproc, device, test_episode)

for i in range(train_epochs):
    batch, log = agent.collect_batch(num_frames_per_proc)
    info = agent.update_parameters(batch)
    if i % log_interval == 0:
        print_dict({'step': i}, info, log)
    if i % test_interval == 0:
        print('=' * 20 + 'Test Agent' + '=' * 20)
        info = test_agent.evaluate()
        print_dict(info)
    if i % save_interval == 0:
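# --- Hypothetical helper (not part of the original snippet) ---
# The training loop above assumes a print_dict() utility that merges one or more
# dicts and prints them as a single "key: value" line. A minimal sketch of such a
# helper could look like this:
def print_dict(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    print(' | '.join('{}: {}'.format(k, v) for k, v in merged.items()))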
import os

import numpy as np
import gym
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import mean_squared_error, SparseCategoricalCrossentropy, CategoricalCrossentropy

from a2c_model import a2c_Model
from a2c_agent import A2CAgent

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.render()

    model = a2c_Model(env.observation_space.shape[0], env.action_space.n)
    agent = A2CAgent(model)
    agent.test_model(env, 'models/model840.h5', 100)
def stack_frames(frames, state, is_new=False):
    frame = preprocess_frame(state, (1, -1, -1, 1), 84)
    frames = stack_frame(frames, frame, is_new)
    return frames


INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = len(possible_actions)
SEED = 0
GAMMA = 0.99            # discount factor
ALPHA = 0.0001          # Actor learning rate
BETA = 0.0005           # Critic learning rate
UPDATE_EVERY = 100      # how often to update the network

agent = A2CAgent(INPUT_SHAPE, ACTION_SIZE, SEED, device, GAMMA, ALPHA, BETA, UPDATE_EVERY, ActorCnn, CriticCnn)

'''
env.viewer = None

# watch an untrained agent
state = stack_frames(None, env.reset(), True)
for j in range(200):
    env.render(close=False)
    action, _, _ = agent.act(state)
    next_state, reward, done, _ = env.step(possible_actions[action])
    state = stack_frames(state, next_state, False)
    if done:
        env.reset()
        break
env.render(close=True)
'''
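# --- Hypothetical helpers (not part of the original snippet) ---
# stack_frames() above assumes a preprocess_frame() that grayscales, crops and
# resizes a raw frame, and a stack_frame() that maintains a rolling stack of the
# four most recent frames. A minimal sketch, assuming cv2 and NumPy, with the
# exclude tuple read as (top, right, bottom, left) crop offsets:
import cv2
import numpy as np

def preprocess_frame(state, exclude, output_size):
    # Grayscale, crop away borders/score bar, resize, and normalise to [0, 1].
    frame = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
    top, right, bottom, left = exclude
    frame = frame[top:bottom, left:right]
    frame = cv2.resize(frame, (output_size, output_size)) / 255.0
    return frame.astype(np.float32)

def stack_frame(frames, frame, is_new):
    if is_new or frames is None:
        # Start a fresh stack by repeating the first frame four times.
        frames = np.stack([frame] * 4)
    else:
        # Drop the oldest frame and append the newest one.
        frames = np.concatenate([frames[1:], frame[None]])
    return frames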
def train():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(args.num_frames) // args.num_steps // args.num_environments

    # Create the train and test environments with multiple processes
    train_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator, args.num_environments, args2, is_train=False)

    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator, args.num_environments, args2, is_train=False)

    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator, args.num_environments, args2, is_train=False)

    test_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=False)

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    obs_shape = train_envs.obs_shape

    # The agent's policy network and the A2C training algorithm
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/base_line.pth.tar'.format(output_dir)
        agent.load_model(checkpoint_filename)
        start_j = 0  # (int(checkpoint_idx) // args.num_steps // args.num_environments) + 1

    obs = train_envs.reset()
    start = time.time()
    nb_of_saves = 0

    for j in range(start_j, num_updates):
        print("------", j / num_updates * 100, "-------")

        # Evaluate the model's performance
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(medium_combs_test_envs, j, total_num_steps)

            # succes_classic = sum([1 if i != 525 else 0 for i in game_times_classic]) / 16
            # succes_little = sum([1 if i != 525 else 0 for i in game_times_little]) / 16
            # succes_medium = sum([1 if i != 525 else 0 for i in game_times_medium]) / 16

            writer.add_scalar("Reward classic levels", mean_rewards_classic, j)
            writer.add_scalar("Reward little combs levels", mean_rewards_little, j)
            writer.add_scalar("Reward medium combs levels", mean_rewards_medium, j)
            # writer.add_scalar("Success rate classic levels", succes_classic, j)
            # writer.add_scalar("Success rate little combs levels", succes_little, j)
            # writer.add_scalar("Success rate medium combs levels", succes_medium, j)

        for step in range(args.num_steps):
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)

        report = agent.update(obs)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = start_j * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))
            logging.info(report.format(j, total_num_steps, FPS))

        if j % args.model_save_rate == 0:
            nb_of_saves += 1
            agent.save_policy2(nb_of_saves, args, output_dir)

    # cancel the env processes
    train_envs.cancel()
    test_envs.cancel()
def main():
    es = [make_env(i) for i in range(num_processes)]
    envs = VecEnv([es[i] for i in range(num_processes)])

    spatial_obs_space = es[0].observation_space.spaces['board'].shape
    board_dim = (spatial_obs_space[1], spatial_obs_space[2])
    board_squares = spatial_obs_space[1] * spatial_obs_space[2]

    non_spatial_obs_space = es[0].observation_space.spaces['state'].shape[0] \
        + es[0].observation_space.spaces['procedures'].shape[0] \
        + es[0].observation_space.spaces['available-action-types'].shape[0]
    non_spatial_action_types = FFAIEnv.simple_action_types \
        + FFAIEnv.defensive_formation_action_types \
        + FFAIEnv.offensive_formation_action_types
    num_non_spatial_action_types = len(non_spatial_action_types)
    spatial_action_types = FFAIEnv.positional_action_types
    num_spatial_action_types = len(spatial_action_types)
    num_spatial_actions = num_spatial_action_types * spatial_obs_space[1] * spatial_obs_space[2]
    action_space = num_non_spatial_action_types + num_spatial_actions

    def compute_action_masks(observations):
        masks = []
        m = False  # debug flag: set to True to print each mask
        for ob in observations:
            mask = np.zeros(action_space)
            i = 0
            for action_type in non_spatial_action_types:
                mask[i] = ob['available-action-types'][action_type.name]
                i += 1
            for action_type in spatial_action_types:
                if ob['available-action-types'][action_type.name] == 0:
                    mask[i:i + board_squares] = 0
                elif ob['available-action-types'][action_type.name] == 1:
                    position_mask = ob['board'][f"{action_type.name.replace('_', ' ').lower()} positions"]
                    position_mask_flatten = np.reshape(position_mask, (1, board_squares))
                    for j in range(board_squares):
                        mask[i + j] = position_mask_flatten[0][j]
                i += board_squares
            assert 1 in mask
            if m:
                print(mask)
            masks.append(mask)
        return masks

    def compute_action(action_idx):
        if action_idx < len(non_spatial_action_types):
            return non_spatial_action_types[action_idx], 0, 0
        spatial_idx = action_idx - num_non_spatial_action_types
        spatial_pos_idx = spatial_idx % board_squares
        spatial_y = int(spatial_pos_idx / board_dim[1])
        spatial_x = int(spatial_pos_idx % board_dim[1])
        spatial_action_type_idx = int(spatial_idx / board_squares)
        spatial_action_type = spatial_action_types[spatial_action_type_idx]
        return spatial_action_type, spatial_x, spatial_y

    # MODEL
    ac_agent = CNNPolicy(spatial_obs_space, non_spatial_obs_space,
                         hidden_nodes=num_hidden_nodes, kernels=num_cnn_kernels,
                         actions=action_space)

    # OPTIMIZER
    optimizer = optim.RMSprop(ac_agent.parameters(), learning_rate)

    # MEMORY STORE
    memory = Memory(steps_per_update, num_processes, spatial_obs_space, (1, non_spatial_obs_space), action_space)

    # PPCG
    difficulty = 0.0 if ppcg else 1.0
    dif_delta = 0.01

    # Reset environments
    obs = envs.reset(difficulty)
    spatial_obs, non_spatial_obs = update_obs(obs)

    # Add obs to memory
    memory.spatial_obs[0].copy_(spatial_obs)
    memory.non_spatial_obs[0].copy_(non_spatial_obs)

    # Variables for storing stats
    all_updates = 0
    all_episodes = 0
    all_steps = 0
    episodes = 0
    proc_rewards = np.zeros(num_processes)
    proc_tds = np.zeros(num_processes)
    proc_tds_opp = np.zeros(num_processes)
    episode_rewards = []
    episode_tds = []
    episode_tds_opp = []
    wins = []
    value_losses = []
    policy_losses = []
    log_updates = []
    log_episode = []
    log_steps = []
    log_win_rate = []
    log_td_rate = []
    log_td_rate_opp = []
    log_mean_reward = []
    log_difficulty = []

    # self-play
    selfplay_next_save = selfplay_save_steps
    selfplay_next_swap = selfplay_swap_steps
    selfplay_models = 0
    if selfplay:
        model_name = f"{exp_id}_selfplay_0.nn"
        model_path = os.path.join(model_dir, model_name)
        torch.save(ac_agent, model_path)
        envs.swap(A2CAgent(name=model_name, env_name=env_name, filename=model_path))
        selfplay_models += 1

    renderer = ffai.Renderer()

    while all_steps < num_steps:

        for step in range(steps_per_update):
            action_masks = compute_action_masks(obs)
            action_masks = torch.tensor(action_masks, dtype=torch.bool)

            values, actions = ac_agent.act(
                Variable(memory.spatial_obs[step]),
                Variable(memory.non_spatial_obs[step]),
                Variable(action_masks))

            action_objects = []
            for action in actions:
                action_type, x, y = compute_action(action.numpy()[0])
                action_object = {
                    'action-type': action_type,
                    'x': x,
                    'y': y
                }
                action_objects.append(action_object)

            obs, env_reward, shaped_reward, tds_scored, tds_opp_scored, done, info = envs.step(action_objects, difficulty=difficulty)

            # envs.render()
            '''
            for j in range(len(obs)):
                ob = obs[j]
                renderer.render(ob, j)
            '''

            reward = torch.from_numpy(np.expand_dims(np.stack(env_reward), 1)).float()
            shaped_reward = torch.from_numpy(np.expand_dims(np.stack(shaped_reward), 1)).float()
            r = reward.numpy()
            sr = shaped_reward.numpy()
            for i in range(num_processes):
                proc_rewards[i] += sr[i]
                proc_tds[i] += tds_scored[i]
                proc_tds_opp[i] += tds_opp_scored[i]

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            dones = masks.squeeze()
            episodes += num_processes - int(dones.sum().item())
            for i in range(num_processes):
                if done[i]:
                    if r[i] > 0:
                        wins.append(1)
                        difficulty += dif_delta
                    elif r[i] < 0:
                        wins.append(0)
                        difficulty -= dif_delta
                    else:
                        wins.append(0.5)
                        difficulty -= dif_delta
                    if ppcg:
                        difficulty = min(1.0, max(0, difficulty))
                    else:
                        difficulty = 1
                    episode_rewards.append(proc_rewards[i])
                    episode_tds.append(proc_tds[i])
                    episode_tds_opp.append(proc_tds_opp[i])
                    proc_rewards[i] = 0
                    proc_tds[i] = 0
                    proc_tds_opp[i] = 0

            # Update the observations returned by the environment
            spatial_obs, non_spatial_obs = update_obs(obs)

            # insert the step taken into memory
            memory.insert(step, spatial_obs, non_spatial_obs, actions.data, values.data, shaped_reward, masks, action_masks)

        next_value = ac_agent(Variable(memory.spatial_obs[-1], requires_grad=False),
                              Variable(memory.non_spatial_obs[-1], requires_grad=False))[0].data

        # Compute returns
        memory.compute_returns(next_value, gamma)

        spatial = Variable(memory.spatial_obs[:-1])
        spatial = spatial.view(-1, *spatial_obs_space)
        non_spatial = Variable(memory.non_spatial_obs[:-1])
        non_spatial = non_spatial.view(-1, non_spatial.shape[-1])

        actions = Variable(torch.LongTensor(memory.actions.view(-1, 1)))
        actions_mask = Variable(memory.action_masks[:-1])

        # Evaluate the actions taken
        action_log_probs, values, dist_entropy = ac_agent.evaluate_actions(spatial, non_spatial, actions, actions_mask)

        values = values.view(steps_per_update, num_processes, 1)
        action_log_probs = action_log_probs.view(steps_per_update, num_processes, 1)

        advantages = Variable(memory.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # value_losses.append(value_loss)

        # Compute loss
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        # policy_losses.append(action_loss)

        optimizer.zero_grad()
        total_loss = (value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef)
        total_loss.backward()
        nn.utils.clip_grad_norm_(ac_agent.parameters(), max_grad_norm)
        optimizer.step()

        memory.non_spatial_obs[0].copy_(memory.non_spatial_obs[-1])
        memory.spatial_obs[0].copy_(memory.spatial_obs[-1])

        # Updates
        all_updates += 1
        # Episodes
        all_episodes += episodes
        episodes = 0
        # Steps
        all_steps += num_processes * steps_per_update

        # Self-play save
        if selfplay and all_steps >= selfplay_next_save:
            selfplay_next_save = max(all_steps + 1, selfplay_next_save + selfplay_save_steps)
            model_name = f"{exp_id}_selfplay_{selfplay_models}.nn"
            model_path = os.path.join(model_dir, model_name)
            print(f"Saving {model_path}")
            torch.save(ac_agent, model_path)
            selfplay_models += 1

        # Self-play swap
        if selfplay and all_steps >= selfplay_next_swap:
            selfplay_next_swap = max(all_steps + 1, selfplay_next_swap + selfplay_swap_steps)
            lower = max(0, selfplay_models - 1 - (selfplay_window - 1))
            i = random.randint(lower, selfplay_models - 1)
            model_name = f"{exp_id}_selfplay_{i}.nn"
            model_path = os.path.join(model_dir, model_name)
            print(f"Swapping opponent to {model_path}")
            envs.swap(A2CAgent(name=model_name, env_name=env_name, filename=model_path))

        # Logging
        if all_updates % log_interval == 0 and len(episode_rewards) >= num_processes:
            td_rate = np.mean(episode_tds)
            td_rate_opp = np.mean(episode_tds_opp)
            episode_tds.clear()
            episode_tds_opp.clear()
            mean_reward = np.mean(episode_rewards)
            episode_rewards.clear()
            win_rate = np.mean(wins)
            wins.clear()
            # mean_value_loss = np.mean(value_losses)
            # mean_policy_loss = np.mean(policy_losses)

            log_updates.append(all_updates)
            log_episode.append(all_episodes)
            log_steps.append(all_steps)
            log_win_rate.append(win_rate)
            log_td_rate.append(td_rate)
            log_td_rate_opp.append(td_rate_opp)
            log_mean_reward.append(mean_reward)
            log_difficulty.append(difficulty)

            log = "Updates: {}, Episodes: {}, Timesteps: {}, Win rate: {:.2f}, TD rate: {:.2f}, TD rate opp: {:.2f}, Mean reward: {:.3f}, Difficulty: {:.2f}" \
                .format(all_updates, all_episodes, all_steps, win_rate, td_rate, td_rate_opp, mean_reward, difficulty)

            log_to_file = "{}, {}, {}, {}, {}, {}, {}, {}\n" \
                .format(all_updates, all_episodes, all_steps, win_rate, td_rate, td_rate_opp, mean_reward, difficulty)

            # Save to files
            log_path = os.path.join(log_dir, f"{exp_id}.dat")
            print(f"Save log to {log_path}")
            with open(log_path, "a") as myfile:
                myfile.write(log_to_file)

            print(log)

            episodes = 0
            value_losses.clear()
            policy_losses.clear()

            # Save model
            model_name = f"{exp_id}.nn"
            model_path = os.path.join(model_dir, model_name)
            torch.save(ac_agent, model_path)

            # plot
            n = 3
            if ppcg:
                n += 1
            fig, axs = plt.subplots(1, n, figsize=(4 * n, 5))
            axs[0].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
            axs[0].plot(log_steps, log_mean_reward)
            axs[0].set_title('Reward')
            # axs[0].set_ylim(bottom=0.0)
            axs[0].set_xlim(left=0)
            axs[1].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
            axs[1].plot(log_steps, log_td_rate, label="Learner")
            axs[1].set_title('TD/Episode')
            axs[1].set_ylim(bottom=0.0)
            axs[1].set_xlim(left=0)
            if selfplay:
                axs[1].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
                axs[1].plot(log_steps, log_td_rate_opp, color="red", label="Opponent")
            axs[2].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
            axs[2].plot(log_steps, log_win_rate)
            axs[2].set_title('Win rate')
            axs[2].set_yticks(np.arange(0, 1.001, step=0.1))
            axs[2].set_xlim(left=0)
            if ppcg:
                axs[3].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
                axs[3].plot(log_steps, log_difficulty)
                axs[3].set_title('Difficulty')
                axs[3].set_yticks(np.arange(0, 1.001, step=0.1))
                axs[3].set_xlim(left=0)
            fig.tight_layout()
            plot_name = f"{exp_id}_{'_selfplay' if selfplay else ''}.png"
            plot_path = os.path.join(plot_dir, plot_name)
            fig.savefig(plot_path)
            plt.close('all')

    model_name = f"{exp_id}.nn"
    model_path = os.path.join(model_dir, model_name)
    torch.save(ac_agent, model_path)
    envs.close()
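# --- Hypothetical helper (not part of the original snippet) ---
# The training loop above assumes an update_obs() that splits the FFAI dict
# observations into a spatial tensor (the stacked 'board' layers) and a flat
# non-spatial tensor ('state' + 'procedures' + 'available-action-types').
# A minimal sketch, assuming each observation is a nested dict keyed by layer
# and feature names (as the masking code above suggests) and that dict order
# matches the observation space:
import numpy as np
import torch

def update_obs(observations):
    spatial = []
    non_spatial = []
    for ob in observations:
        # Stack the named 2D board layers into a (layers, h, w) array.
        spatial.append(np.stack([np.array(layer) for layer in ob['board'].values()]))
        # Concatenate the scalar features into one flat vector.
        non_spatial.append(np.concatenate([
            np.array(list(ob['state'].values()), dtype=np.float32),
            np.array(list(ob['procedures'].values()), dtype=np.float32),
            np.array(list(ob['available-action-types'].values()), dtype=np.float32),
        ]))
    spatial_obs = torch.from_numpy(np.stack(spatial)).float()
    # Keep the (1, features) layout expected by the memory buffer above.
    non_spatial_obs = torch.from_numpy(np.stack(non_spatial)).float().unsqueeze(1)
    return spatial_obs, non_spatial_obs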
def run_env(env):
    env.step(n_steps)


if __name__ == "__main__":
    n_env = multiprocessing.cpu_count()
    envs = [EnvWrapper(frame_size, skip_frames, stack_size) for i in range(n_env)]
    action_size = envs[0].get_action_size()

    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    train_model = A2CAgent("train_model", True, sess, input_shape, action_size, lr,
                           GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef, clip_range, load_model)
    old_model = A2CAgent("old_model", False, sess, input_shape, action_size, lr,
                         GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef, clip_range, False)

    sync_ops = old_model.create_sync_ops(train_model)
    sess.run(sync_ops)

    summary_writer = tf.summary.FileWriter("./log/sum", sess.graph)

    # envs[0].set_render(True)
    for env in envs:
        env.set_agent(old_model)

    p = ThreadPool(n_env)
def train():
    args = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(args.num_frames) // args.num_steps // args.num_environments

    # Create the train and test environments with multiple processes
    train_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=True)
    test_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=False)

    obs_shape = train_envs.obs_shape

    # The agent's policy network and the A2C training algorithm
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/checkpoint_{}.pth.tar'.format(output_dir, checkpoint_idx)
        agent.load_model(checkpoint_filename)
        start_j = (int(checkpoint_idx) // args.num_steps // args.num_environments) + 1

    obs = train_envs.reset()
    start = time.time()

    for j in range(start_j, num_updates):
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards, game_times = agent.evaluate(test_envs, j, total_num_steps)
            logging.info(mean_rewards)
            logging.info(game_times)

        for step in range(args.num_steps):
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)

        report = agent.update(obs)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = start_j * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))
            logging.info(report.format(j, total_num_steps, FPS))

        if j % args.model_save_rate == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            agent.save_policy(total_num_steps, args, output_dir)

    # cancel the env processes
    train_envs.cancel()
    test_envs.cancel()