def main():
  args = command_line_args()

  set_global_seeds(args.seed)
  model_dir = '{}/{}_{:%Y-%m-%d_%H:%M:%S}'.format(
      args.model_dir, args.exp_name, datetime.datetime.now())
  logger.configure(model_dir)

  num_env = args.num_env if not args.evaluate else 1

  train_envs, eval_env = make_atari_env(
      env_id=args.env_id, num_env=num_env, seed=args.seed) 
  train_envs = VecFrameStack(train_envs, 4)
  eval_env = VecFrameStack(eval_env, 4)

  cnn = not args.use_mlp  # use a CNN policy unless an MLP is requested

  agent = A2CAgent(
      train_envs,
      eval_env,
      model_dir=model_dir,
      n_steps=args.n_steps,
      num_learning_steps=args.num_learning_steps,
      debug=args.debug,
      summary_every=args.summary_every,
      gamma=args.gamma,
      tensorboard_summaries=args.tensorboard_summaries,
      cnn=cnn,
      seed=args.seed,
      save_every=args.save_every,
      load_checkpoint=args.load_checkpoint,
      checkpoint_prefix=args.checkpoint_prefix)

  if args.evaluate:
    agent.evaluate()
  else:
    agent.learn()
Example #2
def evaluate_saved_model():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    num_updates = int(
        args.num_frames) // args.num_steps // args.num_environments

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    train_envs = MultiEnv(args.simulator,
                          args.num_environments,
                          args,
                          is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator,
                                 args.num_environments,
                                 args2,
                                 is_train=False)
    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator,
                                      args.num_environments,
                                      args2,
                                      is_train=False)
    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator,
                                      args.num_environments,
                                      args2,
                                      is_train=False)

    obs_shape = train_envs.obs_shape

    policy = CNNPolicy(obs_shape, args).to(device)

    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    obs = little_combs_test_envs.reset()

    num_checkpoints = 355

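    # Step through the saved checkpoints, evaluating every 8th one on the three test sets.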
    for j in range(num_checkpoints):
        if j % 8 == 0:
            checkpoint_filename = '/home/adam/Bureau/Transfer Learning/FINAL/checkpoint_{}.pth.tar'.format(
                str(j + 1))
            agent.load_model(checkpoint_filename)

            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(
                classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(
                little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(
                medium_combs_test_envs, j, total_num_steps)

            writer.add_scalar("Reward classic levels", mean_rewards_classic,
                              (j + 1) * 100)
            writer.add_scalar("Reward little combs levels",
                              mean_rewards_little, (j + 1) * 100)
            writer.add_scalar("Reward medium combs levels",
                              mean_rewards_medium, (j + 1) * 100)
            print(j)
import sys
import time
from pathlib import Path

import numpy as np
import torch
# Config, Env, and A2CAgent are provided by the surrounding project.
# mkdir('log')
Path('log').mkdir(parents=True, exist_ok=True)
Path('data').mkdir(parents=True, exist_ok=True)

# set_one_thread()
# os.environ['OMP_NUM_THREADS'] = '1'
# os.environ['MKL_NUM_THREADS'] = '1'
# torch.set_num_threads(1)

# seed
np.random.seed(333)
torch.manual_seed(np.random.randint(int(1e6)))

config = Config()
agent = A2CAgent(config, Env('reacher.app', is_mock=config.is_mock))
agent_name = agent.__class__.__name__
t0 = time.time()
episode = 1

avg_scores = []
eval_scores = []
num_eval_episodes = 100
target_avg_score = 30.0
agent_last_steps = 0

# TODO: make the log importable for plotting

try:
    while True:
entropy_coef = 0.01
value_loss_coef = 0.5
num_frames_per_proc = 5  # num_frames_per_proc * num_procs = batch_size
train_epochs = 300000
test_episode = 10
log_interval = 100
test_interval = 1000
save_interval = 1000

env = make_env('BreakoutNoFrameskip-v4', seed, num_procs)
in_ch = env.observation_space.shape[-1]
n_action = env.action_space.n
# import ipdb; ipdb.set_trace()
model = CNNModel(in_ch, n_action)
obs_preproc = ObsPreproc(device=device)
agent = A2CAgent(model, env, obs_preproc, device, lr, gamma, entropy_coef, value_loss_coef)

test_env = make_env('BreakoutNoFrameskip-v4', seed, 1, clip_reward=False)
test_agent = TestAgent(model, test_env, obs_preproc, device, test_episode)


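# Main training loop: collect a short rollout from each process, update the model,
# and periodically log, evaluate with the test agent, and save.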
for i in range(train_epochs):
    batch, log = agent.collect_batch(num_frames_per_proc)
    info = agent.update_parameters(batch)
    if i % log_interval == 0:
        print_dict({'step': i}, info, log)
    if i % test_interval == 0:
        print('=' * 20 + 'Test Agent' + '=' * 20)
        info = test_agent.evaluate()
        print_dict(info)
    if i % save_interval == 0:
import os
import numpy as np
import gym
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import mean_squared_error, SparseCategoricalCrossentropy, CategoricalCrossentropy
from a2c_model import a2c_Model
from a2c_agent import A2CAgent

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.render()
    model = a2c_Model(env.observation_space.shape[0], env.action_space.n)
    agent = A2CAgent(model)
    # Load the saved weights and evaluate the agent on CartPole.
    agent.test_model(env, 'models/model840.h5', 100)
def stack_frames(frames, state, is_new=False):
    # Preprocess the raw frame and add it to the frame stack (resetting the stack if is_new).
    frame = preprocess_frame(state, (1, -1, -1, 1), 84)
    frames = stack_frame(frames, frame, is_new)

    return frames


INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = len(possible_actions)
SEED = 0
GAMMA = 0.99  # discount factor
ALPHA = 0.0001  # Actor learning rate
BETA = 0.0005  # Critic learning rate
UPDATE_EVERY = 100  # how often to update the network

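# Build the A2C agent from the CNN-based actor and critic networks.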
agent = A2CAgent(INPUT_SHAPE, ACTION_SIZE, SEED, device, GAMMA, ALPHA, BETA,
                 UPDATE_EVERY, ActorCnn, CriticCnn)
'''
env.viewer = None
# watch an untrained agent
state = stack_frames(None, env.reset(), True)
for j in range(200):
    env.render(close=False)
    action, _, _ = agent.act(state)
    next_state, reward, done, _ = env.step(possible_actions[action])
    state = stack_frames(state, next_state, False)
    if done:
        env.reset()
        break
env.render(close=True)
'''
def train():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    num_updates = int(
        args.num_frames) // args.num_steps // args.num_environments
    # Create the train and test environments with Multiple processes
    train_envs = MultiEnv(args.simulator,
                          args.num_environments,
                          args,
                          is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator,
                                 args.num_environments,
                                 args2,
                                 is_train=False)
    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator,
                                      args.num_environments,
                                      args2,
                                      is_train=False)
    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator,
                                      args.num_environments,
                                      args2,
                                      is_train=False)

    test_envs = MultiEnv(args.simulator,
                         args.num_environments,
                         args,
                         is_train=False)

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    obs_shape = train_envs.obs_shape

    # The agent's policy network and training algorithm A2C
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/base_line.pth.tar'.format(output_dir)
        agent.load_model(checkpoint_filename)
        start_j = 0  #(int(checkpoint_idx) // args.num_steps // args.num_environments) + 1

    obs = train_envs.reset()
    start = time.time()
    nb_of_saves = 0

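    # Main loop: periodically evaluate on the three test sets, roll out num_steps
    # transitions per environment, update the agent, log throughput, and save checkpoints.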
    for j in range(start_j, num_updates):
        print("------", j / num_updates * 100, "-------")

        # Evaluate the model's performance
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(
                classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(
                little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(
                medium_combs_test_envs, j, total_num_steps)

            # succes_classic = sum([1 if i!=525 else 0 for i in game_times_classic])/16
            #  succes_little = sum([1 if i!=525 else 0 for i in game_times_little])/16
            # succes_medium = sum([1 if i!=525 else 0 for i in game_times_medium])/16

            writer.add_scalar("Reward classic levels", mean_rewards_classic, j)
            writer.add_scalar("Reward little combs levels",
                              mean_rewards_little, j)
            writer.add_scalar("Reward medium combs levels",
                              mean_rewards_medium, j)
        # writer.add_scalar("Success rate classic levels", succes_classic, j)
        # writer.add_scalar("Success rate little combs levels", succes_little, j)
        # writer.add_scalar("Success rate medium combs levels", succes_medium, j)

        for step in range(args.num_steps):
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)

        report = agent.update(obs)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = (start_j) * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))

            logging.info(report.format(j, total_num_steps, FPS))

        if j % args.model_save_rate == 0:
            nb_of_saves += 1
            agent.save_policy2(nb_of_saves, args, output_dir)

    # cancel the env processes
    train_envs.cancel()
    test_envs.cancel()
Example #8
def main():
    es = [make_env(i) for i in range(num_processes)]
    envs = VecEnv([es[i] for i in range(num_processes)])

    spatial_obs_space = es[0].observation_space.spaces['board'].shape
    board_dim = (spatial_obs_space[1], spatial_obs_space[2])
    board_squares = spatial_obs_space[1] * spatial_obs_space[2]

    non_spatial_obs_space = es[0].observation_space.spaces['state'].shape[0] \
        + es[0].observation_space.spaces['procedures'].shape[0] \
        + es[0].observation_space.spaces['available-action-types'].shape[0]
    non_spatial_action_types = FFAIEnv.simple_action_types \
        + FFAIEnv.defensive_formation_action_types \
        + FFAIEnv.offensive_formation_action_types
    num_non_spatial_action_types = len(non_spatial_action_types)
    spatial_action_types = FFAIEnv.positional_action_types
    num_spatial_action_types = len(spatial_action_types)
    num_spatial_actions = num_spatial_action_types * spatial_obs_space[1] * spatial_obs_space[2]
    action_space = num_non_spatial_action_types + num_spatial_actions

    # Build one flat action mask per observation: non-spatial action types first,
    # followed by one board-sized block per spatial action type.
    def compute_action_masks(observations):
        masks = []
        m = False
        for ob in observations:
            mask = np.zeros(action_space)
            i = 0
            for action_type in non_spatial_action_types:
                mask[i] = ob['available-action-types'][action_type.name]
                i += 1
            for action_type in spatial_action_types:
                if ob['available-action-types'][action_type.name] == 0:
                    mask[i:i+board_squares] = 0
                elif ob['available-action-types'][action_type.name] == 1:
                    position_mask = ob['board'][f"{action_type.name.replace('_', ' ').lower()} positions"]
                    position_mask_flatten = np.reshape(position_mask, (1, board_squares))
                    for j in range(board_squares):
                        mask[i + j] = position_mask_flatten[0][j]
                i += board_squares
            assert 1 in mask
            if m:
                print(mask)
            masks.append(mask)
        return masks

    # Map a flat action index back to an (action type, x, y) triple.
    def compute_action(action_idx):
        if action_idx < len(non_spatial_action_types):
            return non_spatial_action_types[action_idx], 0, 0
        spatial_idx = action_idx - num_non_spatial_action_types
        spatial_pos_idx = spatial_idx % board_squares
        spatial_y = int(spatial_pos_idx / board_dim[1])
        spatial_x = int(spatial_pos_idx % board_dim[1])
        spatial_action_type_idx = int(spatial_idx / board_squares)
        spatial_action_type = spatial_action_types[spatial_action_type_idx]
        return spatial_action_type, spatial_x, spatial_y

    # MODEL
    ac_agent = CNNPolicy(spatial_obs_space, non_spatial_obs_space, hidden_nodes=num_hidden_nodes, kernels=num_cnn_kernels, actions=action_space)

    # OPTIMIZER
    optimizer = optim.RMSprop(ac_agent.parameters(), learning_rate)

    # MEMORY STORE
    memory = Memory(steps_per_update, num_processes, spatial_obs_space, (1, non_spatial_obs_space), action_space)

    # PPCG
    difficulty = 0.0 if ppcg else 1.0
    dif_delta = 0.01

    # Reset environments
    obs = envs.reset(difficulty)
    spatial_obs, non_spatial_obs = update_obs(obs)

    # Add obs to memory
    memory.spatial_obs[0].copy_(spatial_obs)
    memory.non_spatial_obs[0].copy_(non_spatial_obs)

    # Variables for storing stats
    all_updates = 0
    all_episodes = 0
    all_steps = 0
    episodes = 0
    proc_rewards = np.zeros(num_processes)
    proc_tds = np.zeros(num_processes)
    proc_tds_opp = np.zeros(num_processes)
    episode_rewards = []
    episode_tds = []
    episode_tds_opp = []
    wins = []
    value_losses = []
    policy_losses = []
    log_updates = []
    log_episode = []
    log_steps = []
    log_win_rate = []
    log_td_rate = []
    log_td_rate_opp = []
    log_mean_reward = []
    log_difficulty = []

    # self-play
    selfplay_next_save = selfplay_save_steps
    selfplay_next_swap = selfplay_swap_steps
    selfplay_models = 0
    if selfplay:
        model_name = f"{exp_id}_selfplay_0.nn"
        model_path = os.path.join(model_dir, model_name)
        torch.save(ac_agent, model_path)
        envs.swap(A2CAgent(name=model_name, env_name=env_name, filename=model_path))
        selfplay_models += 1

    renderer = ffai.Renderer()

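    # Main training loop: collect steps_per_update transitions from every process,
    # then perform a single A2C update on the gathered batch.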
    while all_steps < num_steps:

        for step in range(steps_per_update):

            action_masks = compute_action_masks(obs)
            action_masks = torch.tensor(action_masks, dtype=torch.bool)

            values, actions = ac_agent.act(
                Variable(memory.spatial_obs[step]),
                Variable(memory.non_spatial_obs[step]),
                Variable(action_masks))

            action_objects = []

            for action in actions:
                action_type, x, y = compute_action(action.numpy()[0])
                action_object = {
                    'action-type': action_type,
                    'x': x,
                    'y': y
                }
                action_objects.append(action_object)

            obs, env_reward, shaped_reward, tds_scored, tds_opp_scored, done, info = envs.step(action_objects, difficulty=difficulty)
            # envs.render()

            '''
            for j in range(len(obs)):
                ob = obs[j]
                renderer.render(ob, j)
            '''

            reward = torch.from_numpy(np.expand_dims(np.stack(env_reward), 1)).float()
            shaped_reward = torch.from_numpy(np.expand_dims(np.stack(shaped_reward), 1)).float()
            r = reward.numpy()
            sr = shaped_reward.numpy()
            for i in range(num_processes):
                proc_rewards[i] += sr[i]
                proc_tds[i] += tds_scored[i]
                proc_tds_opp[i] += tds_opp_scored[i]

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            dones = masks.squeeze()
            episodes += num_processes - int(dones.sum().item())
            for i in range(num_processes):
                if done[i]:
                    if r[i] > 0:
                        wins.append(1)
                        difficulty += dif_delta
                    elif r[i] < 0:
                        wins.append(0)
                        difficulty -= dif_delta
                    else:
                        wins.append(0.5)
                        difficulty -= dif_delta
                    if ppcg:
                        difficulty = min(1.0, max(0, difficulty))
                    else:
                        difficulty = 1
                    episode_rewards.append(proc_rewards[i])
                    episode_tds.append(proc_tds[i])
                    episode_tds_opp.append(proc_tds_opp[i])
                    proc_rewards[i] = 0
                    proc_tds[i] = 0
                    proc_tds_opp[i] = 0

            # Update the observations returned by the environment
            spatial_obs, non_spatial_obs = update_obs(obs)

            # insert the step taken into memory
            memory.insert(step, spatial_obs, non_spatial_obs,
                          actions.data, values.data, shaped_reward, masks, action_masks)

        next_value = ac_agent(Variable(memory.spatial_obs[-1], requires_grad=False), Variable(memory.non_spatial_obs[-1], requires_grad=False))[0].data

        # Compute returns
        memory.compute_returns(next_value, gamma)

        spatial = Variable(memory.spatial_obs[:-1])
        spatial = spatial.view(-1, *spatial_obs_space)
        non_spatial = Variable(memory.non_spatial_obs[:-1])
        non_spatial = non_spatial.view(-1, non_spatial.shape[-1])

        actions = Variable(torch.LongTensor(memory.actions.view(-1, 1)))
        actions_mask = Variable(memory.action_masks[:-1])

        # Evaluate the actions taken
        action_log_probs, values, dist_entropy = ac_agent.evaluate_actions(spatial, non_spatial, actions, actions_mask)

        values = values.view(steps_per_update, num_processes, 1)
        action_log_probs = action_log_probs.view(steps_per_update, num_processes, 1)

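        # Advantage = return minus value estimate; its squared mean is the value loss.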
        advantages = Variable(memory.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        #value_losses.append(value_loss)

        # Compute loss
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        #policy_losses.append(action_loss)

        optimizer.zero_grad()

        total_loss = (value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef)
        total_loss.backward()

        nn.utils.clip_grad_norm_(ac_agent.parameters(), max_grad_norm)

        optimizer.step()

        memory.non_spatial_obs[0].copy_(memory.non_spatial_obs[-1])
        memory.spatial_obs[0].copy_(memory.spatial_obs[-1])

        # Updates
        all_updates += 1
        # Episodes
        all_episodes += episodes
        episodes = 0
        # Steps
        all_steps += num_processes * steps_per_update

        # Self-play save
        if selfplay and all_steps >= selfplay_next_save:
            selfplay_next_save = max(all_steps+1, selfplay_next_save+selfplay_save_steps)
            model_name = f"{exp_id}_selfplay_{selfplay_models}.nn"
            model_path = os.path.join(model_dir, model_name)
            print(f"Saving {model_path}")
            torch.save(ac_agent, model_path)
            selfplay_models += 1

        # Self-play swap
        if selfplay and all_steps >= selfplay_next_swap:
            selfplay_next_swap = max(all_steps + 1, selfplay_next_swap+selfplay_swap_steps)
            lower = max(0, selfplay_models-1-(selfplay_window-1))
            i = random.randint(lower, selfplay_models-1)
            model_name = f"{exp_id}_selfplay_{i}.nn"
            model_path = os.path.join(model_dir, model_name)
            print(f"Swapping opponent to {model_path}")
            envs.swap(A2CAgent(name=model_name, env_name=env_name, filename=model_path))

        # Logging
        if all_updates % log_interval == 0 and len(episode_rewards) >= num_processes:
            td_rate = np.mean(episode_tds)
            td_rate_opp = np.mean(episode_tds_opp)
            episode_tds.clear()
            episode_tds_opp.clear()
            mean_reward = np.mean(episode_rewards)
            episode_rewards.clear()
            win_rate = np.mean(wins)
            wins.clear()
            #mean_value_loss = np.mean(value_losses)
            #mean_policy_loss = np.mean(policy_losses)    
            
            log_updates.append(all_updates)
            log_episode.append(all_episodes)
            log_steps.append(all_steps)
            log_win_rate.append(win_rate)
            log_td_rate.append(td_rate)
            log_td_rate_opp.append(td_rate_opp)
            log_mean_reward.append(mean_reward)
            log_difficulty.append(difficulty)

            log = "Updates: {}, Episodes: {}, Timesteps: {}, Win rate: {:.2f}, TD rate: {:.2f}, TD rate opp: {:.2f}, Mean reward: {:.3f}, Difficulty: {:.2f}" \
                .format(all_updates, all_episodes, all_steps, win_rate, td_rate, td_rate_opp, mean_reward, difficulty)

            log_to_file = "{}, {}, {}, {}, {}, {}, {}, {}\n" \
                .format(all_updates, all_episodes, all_steps, win_rate, td_rate, td_rate_opp, mean_reward, difficulty)

            # Save to files
            log_path = os.path.join(log_dir, f"{exp_id}.dat")
            print(f"Save log to {log_path}")
            with open(log_path, "a") as myfile:
                myfile.write(log_to_file)

            print(log)

            episodes = 0
            value_losses.clear()
            policy_losses.clear()

            # Save model
            model_name = f"{exp_id}.nn"
            model_path = os.path.join(model_dir, model_name)
            torch.save(ac_agent, model_path)
            
            # plot
            n = 3
            if ppcg:
                n += 1
            fig, axs = plt.subplots(1, n, figsize=(4*n, 5))
            axs[0].ticklabel_format(axis="x", style="sci", scilimits=(0,0))
            axs[0].plot(log_steps, log_mean_reward)
            axs[0].set_title('Reward')
            #axs[0].set_ylim(bottom=0.0)
            axs[0].set_xlim(left=0)
            axs[1].ticklabel_format(axis="x", style="sci", scilimits=(0,0))
            axs[1].plot(log_steps, log_td_rate, label="Learner")
            axs[1].set_title('TD/Episode')
            axs[1].set_ylim(bottom=0.0)
            axs[1].set_xlim(left=0)
            if selfplay:
                axs[1].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
                axs[1].plot(log_steps, log_td_rate_opp, color="red", label="Opponent")
            axs[2].ticklabel_format(axis="x", style="sci", scilimits=(0,0))
            axs[2].plot(log_steps, log_win_rate)
            axs[2].set_title('Win rate')            
            axs[2].set_yticks(np.arange(0, 1.001, step=0.1))
            axs[2].set_xlim(left=0)
            if ppcg:
                axs[3].ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
                axs[3].plot(log_steps, log_difficulty)
                axs[3].set_title('Difficulty')
                axs[3].set_yticks(np.arange(0, 1.001, step=0.1))
                axs[3].set_xlim(left=0)
            fig.tight_layout()
            plot_name = f"{exp_id}{'_selfplay' if selfplay else ''}.png"
            plot_path = os.path.join(plot_dir, plot_name)
            fig.savefig(plot_path)
            plt.close('all')

    model_name = f"{exp_id}.nn"
    model_path = os.path.join(model_dir, model_name)
    torch.save(ac_agent, model_path)
    envs.close()
Example #9
def run_env(env):
    env.step(n_steps)


if __name__ == "__main__":
    n_env = multiprocessing.cpu_count()
    envs = [
        EnvWrapper(frame_size, skip_frames, stack_size) for i in range(n_env)
    ]
    action_size = envs[0].get_action_size()

    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    train_model = A2CAgent("train_model", True, sess, input_shape, action_size,
                           lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef,
                           clip_range, load_model)

    old_model = A2CAgent("old_model", False, sess, input_shape, action_size,
                         lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef,
                         clip_range, False)

    # Sync old_model's weights with train_model before the workers start acting.
    sync_ops = old_model.create_sync_ops(train_model)
    sess.run(sync_ops)
    summary_writer = tf.summary.FileWriter("./log/sum", sess.graph)

    # envs[0].set_render(True)

    for env in envs:
        env.set_agent(old_model)
    p = ThreadPool(n_env)
def train():
    args = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    num_updates = int(args.num_frames) // args.num_steps // args.num_environments
    # Create the train and test environments with Multiple processes
    train_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=True)
    test_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=False)
    
    obs_shape = train_envs.obs_shape
    
    # The agent's policy network and training algorithm A2C
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy, 
                     args.hidden_size,
                     value_weight=args.value_loss_coef, 
                     entropy_weight=args.entropy_coef, 
                     num_steps=args.num_steps, 
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)
    
    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/checkpoint_{}.pth.tar'.format(output_dir, checkpoint_idx)        
        agent.load_model(checkpoint_filename)
        start_j = (int(checkpoint_idx) // args.num_steps // args.num_environments) + 1
        
    obs = train_envs.reset()
    start = time.time()
    
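    # Main loop: periodically evaluate, collect num_steps transitions per environment,
    # update the agent, log throughput, and checkpoint the policy.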
    for j in range(start_j, num_updates):
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards, game_times = agent.evaluate(test_envs, j, total_num_steps)
            logging.info(mean_rewards)
            logging.info(game_times)
            
        for step in range(args.num_steps): 
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)
            
        report = agent.update(obs)
        
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = (start_j) * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))
            
            logging.info(report.format(j, total_num_steps, FPS))  
        
        if j % args.model_save_rate == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            agent.save_policy(total_num_steps, args, output_dir)
        
    # cancel the env processes    
    train_envs.cancel()
    test_envs.cancel()