Example 1
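A training entry point for a multi-agent reinforcement-learning experiment: it prepares the directory tree, logger, and progress bar, assembles per-scenario world and environment parameters, builds parallel rollout environments and a replay buffer, and then runs the episode loop with periodic learning updates, evaluation, and checkpointing.
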
def train(config, dir_manager=None, logger=None, pbar="default_pbar"):
    # A few safety checks

    check_training_args(config)

    # Creates a directory manager that encapsulates our directory-tree structure

    if dir_manager is None:
        dir_manager = DirectoryManager(agent_alg=config.agent_alg,
                                       env_name=config.env_name,
                                       desc=config.desc,
                                       seed=config.seed)
        dir_manager.create_directories()

    # Creates logger and prints config

    if logger is None:
        logger = create_logger('MASTER', config.log_level,
                               dir_manager.seed_dir / 'logger.out')
    logger.debug(config_to_str(config))

    # Creates a progress bar

    if isinstance(pbar, str):
        # Only the sentinel "default_pbar" is recognized; any other string
        # falls back to no progress bar instead of crashing on the attribute
        # accesses below.
        pbar = tqdm() if pbar == "default_pbar" else None

    if pbar is not None:
        pbar.n = 0
        pbar.desc += f'{dir_manager.storage_dir.name}/{dir_manager.experiment_dir.name}/{dir_manager.seed_dir.name}'
        pbar.total = config.n_episodes

    # Encapsulates in a dict all user-defined params that concern the world (scenario.make_world())

    world_params = {}
    world_params['use_dense_rewards'] = config.use_dense_rewards
    if config.env_name == 'chase':
        if config.n_preys is not None:
            world_params['n_preys'] = config.n_preys
        if config.n_preds is not None:
            world_params['n_preds'] = config.n_preds
        if config.prey_variance is not None:
            world_params['prey_variance'] = config.prey_variance
        if config.individual_reward is not None:
            world_params['individual_reward'] = config.individual_reward

    elif config.env_name == 'gather':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents

    elif config.env_name == 'intersection':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.by_stander is not None:
            world_params['by_stander'] = config.by_stander

    elif config.env_name == 'bounce':
        world_params['episode_length'] = config.episode_length
        if config.line_length is not None:
            world_params['line_length'] = config.line_length

    elif config.env_name == 'compromise':
        if config.line_length is not None:
            world_params['line_length'] = config.line_length
        if config.show_all_landmarks is not None:
            world_params['show_all_landmarks'] = config.show_all_landmarks

    elif config.env_name == 'imitation':
        if config.staged is not None:
            world_params['staged'] = config.staged
        if config.set_trap is not None:
            world_params['set_trap'] = config.set_trap

    elif config.env_name == 'spread':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.shuffle_landmarks is not None:
            world_params['shuffle_landmarks'] = config.shuffle_landmarks
        if config.color_objects is not None:
            world_params['color_objects'] = config.color_objects
        if config.small_agents is not None:
            world_params['small_agents'] = config.small_agents

    save_dict_to_json(world_params,
                      str(dir_manager.seed_dir / 'world_params.json'))

    # Encapsulates in a dict all user-defined params that concern the environment (multiagent.environment.MultiAgentEnv)

    env_params = {}
    env_params['env_name'] = config.env_name
    if 'football' not in config.env_name:
        env_params['use_max_speed'] = config.use_max_speed

    save_dict_to_json(env_params,
                      str(dir_manager.seed_dir / 'env_params.json'))

    # Sets the random seeds (for reproducibility)

    set_seeds(config.seed)

    # Initializes environments

    # TODO: Check reproducibility and that different envs are seeded differently
    if config.env_name == '3v2football':

        obs_rep = config.representation

        if config.feature_extractor == 'identity':
            assert obs_rep in ['simple115', 'simple37']
        elif config.feature_extractor == 'convNet':
            assert obs_rep == 'extracted'
        else:
            raise NotImplementedError(
                f"config.feature_extractor={config.feature_extractor} not recognized."
            )

        env = make_parallel_football_env(
            seed_dir=dir_manager.seed_dir,
            seed=config.seed,
            dump_freq=config.dump_freq,
            representation=obs_rep,
            render=False,
            n_rollout_threads=config.n_rollout_threads
        )  # no rendering during training
    else:
        env = make_parallel_particle_env(
            scenario_name=config.env_name,
            n_rollout_threads=config.n_rollout_threads,
            seed=config.seed,
            use_discrete_action=config.use_discrete_action,
            use_max_speed=config.use_max_speed,
            world_params=world_params)

    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # Initializes the algorithm

    algorithm = init_from_config(env, config, logger)

    # Creates recorders and stores basic info regarding agent types

    os.makedirs(dir_manager.recorders_dir, exist_ok=True)
    train_recorder = algorithm.create_train_recorder()
    train_recorder.tape['agent_colors'] = env.agent_colors

    if 'football' in config.env_name:

        if config.feature_extractor == "convNet":
            n_stack = 4
        elif config.feature_extractor == "identity":
            n_stack = 1
        else:
            raise NotImplemented

        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads,
                                          n_stack=n_stack)
        replay_buffer = StackingReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[
                acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space
            ],
            n_stack=n_stack)

    else:
        # defines observation buffer for multi-step
        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads,
                                          n_stack=1)

        replay_buffer = ReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[
                acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space
            ])

    # Saves initial models

    current_model = "model_ep0.pt"

    best_eval_reward_exploit = -float('inf')
    best_model_exploit = "model_ep0_exploit_best.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    algorithm.save(dir_manager.seed_dir / best_model_exploit)

    best_eval_reward_explore = -float('inf')
    best_model_explore = "model_ep0_explore_best.pt"
    algorithm.save(dir_manager.seed_dir / best_model_explore)

    # Initializes step and episode counters

    step_i = 0
    ep_steps = np.zeros(shape=(config.n_rollout_threads, ), dtype=np.int64)
    ep_dones = 0
    ep_recorders = [
        EpisodeRecorder(stuff_to_record=['reward'])
        for _ in range(config.n_rollout_threads)
    ]
    obs = env.reset()
    obs_buffers.fill(obs)

    algorithm.set_exploration(
        begin_decay_proportion=config.begin_exploration_decay,
        n_episodes=config.n_episodes,
        end_decay_proportion=config.end_exploration_decay,
        initial_scale=config.init_noise_scale,
        final_scale=config.final_noise_scale,
        current_episode=ep_dones)

    # EPISODES LOOP

    while ep_dones < config.n_episodes:

        start_time = time.time()

        # ENVIRONMENT STEP

        # convert observations to torch tensors (no gradients needed for rollouts)

        torch_obs = [
            torch.tensor(obs_buffers.read()[:, i], dtype=torch.float32)
            for i in range(algorithm.nagents)
        ]

        # get actions as torch tensors

        torch_agent_actions = algorithm.select_action(torch_obs,
                                                      is_exploring=True)

        # convert actions to numpy arrays

        agent_actions = [ac.detach().numpy() for ac in torch_agent_actions]

        # rearrange actions to be per environment

        actions = [[ac[i] for ac in agent_actions]
                   for i in range(config.n_rollout_threads)]

        # makes one step in each parallel environment

        next_obs, rewards, dones, infos = env.step(actions)

        # put transitions in the memory buffer

        replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

        # saves relevant info in episode recorders

        for i in range(config.n_rollout_threads):
            ep_recorders[i].add_step(obs[i], actions[i], rewards[i],
                                     next_obs[i])

        # ending step

        obs = next_obs
        obs_buffers.append(obs)

        step_i += config.n_rollout_threads
        step_time = time.time() - start_time

        ep_steps += 1

        # LEARNING STEP

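        # step_i advances by n_rollout_threads per iteration, so the modulo
        # test below checks a window of that width rather than exact equality.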
        if (len(replay_buffer) >= config.batch_size * config.warmup) \
                and (step_i % config.steps_per_update) < config.n_rollout_threads:

            # Prepares models for training

            if config.use_cuda:
                algorithm.prep_training(device='gpu')
            else:
                algorithm.prep_training(device='cpu')

            # Performs one algorithm update

            sample = replay_buffer.sample(config.batch_size,
                                          to_gpu=config.use_cuda,
                                          normalize_rewards=False)
            algorithm.update(sample, train_recorder)

            # Update target networks

            algorithm.update_all_targets()

            # Prepares models to go back to the rollout phase

            algorithm.prep_rollouts(device='cpu')

        # EPISODE ENDINGS

        episodes_over = dones | (ep_steps >= config.episode_length)

        if any(episodes_over):

            if pbar is not None:
                pbar.update(sum(episodes_over))

            for env_i, is_over in enumerate(episodes_over):
                if is_over:
                    ep_dones += 1
                    ep_steps[env_i] = 0

                    # Reset environments

                    obs[env_i] = env.reset(env_i=env_i)

                    obs_buffers[env_i].flush()
                    obs_buffers[env_i].fill(obs[env_i])

                    # Summarizes episode metrics

                    train_recorder.append(
                        'total_reward', ep_recorders[env_i].get_total_reward())

                    # Reinitializes the episode recorder

                    ep_recorders[env_i] = EpisodeRecorder(
                        stuff_to_record=['reward'])

                    # Logs progress at the start and after each completed third of training

                    if (ep_dones - 1) % (config.n_episodes // 3) == 0 \
                            and ep_dones != config.n_episodes:
                        step_time = time.time() - start_time
                        logger.info(
                            f"Episode {ep_dones}/{config.n_episodes}, "
                            f"speed={round_to_two(float(config.n_rollout_threads) / step_time)} steps/s"
                        )

            # Sets exploration noise

            current_noise_scale = algorithm.set_exploration(
                begin_decay_proportion=config.begin_exploration_decay,
                n_episodes=config.n_episodes,
                end_decay_proportion=config.end_exploration_decay,
                initial_scale=config.init_noise_scale,
                final_scale=config.final_noise_scale,
                current_episode=ep_dones)

            # BOOK-KEEPING

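            # Same windowing trick as the learning step: up to n_rollout_threads
            # episodes can finish per iteration, so exact equality could be missed.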
            if ep_dones % config.episodes_per_save < config.n_rollout_threads:

                # Model checkpoints

                if config.save_incrementals:
                    os.makedirs(dir_manager.incrementals_dir, exist_ok=True)
                    algorithm.save(dir_manager.incrementals_dir /
                                   f'model_ep{ep_dones + 1}.pt')
                os.remove(dir_manager.seed_dir / current_model)
                current_model = f"model_ep{ep_dones}.pt"
                algorithm.save(dir_manager.seed_dir / current_model)
                logger.debug('Saving model checkpoint')

                # Current model evaluation (run episodes without exploration)

                if config.n_evaluation_episodes > 0:
                    logger.debug(
                        f'Evaluating model for {config.n_evaluation_episodes} episodes'
                    )
                    set_seeds(
                        config.evaluation_seed)  # fixed seed for evaluation
                    env.seed(config.evaluation_seed)

                    eval_config = get_evaluation_args(overwritten_args="")
                    eval_config.storage_name = dir_manager.storage_dir.name
                    # str.strip removes a character set, not a prefix; this
                    # works here only because the stems end in digits.
                    eval_config.experiment_num = int(
                        dir_manager.experiment_dir.stem.strip('experiment'))
                    eval_config.seed_num = int(
                        dir_manager.seed_dir.stem.strip('seed'))
                    eval_config.render = False
                    eval_config.n_episodes = config.n_evaluation_episodes
                    eval_config.last_model = True
                    eval_config.noise_scale = None
                    eval_config.episode_length = config.episode_length
                    eval_config.representation = config.representation

                    # Evaluate in exploit mode (without exploration)
                    eval_reward_exploit = np.vstack(evaluate(eval_config))

                    train_recorder.append('eval_episodes', ep_dones)
                    train_recorder.append('eval_total_reward_exploit',
                                          eval_reward_exploit)
                    if eval_reward_exploit.mean() > best_eval_reward_exploit:
                        logger.debug("New best exploit model")
                        os.remove(dir_manager.seed_dir / best_model_exploit)
                        best_model_exploit = f"model_ep{ep_dones}_exploit_best.pt"
                        algorithm.save(dir_manager.seed_dir /
                                       best_model_exploit)
                        best_eval_reward_exploit = eval_reward_exploit.mean()

                    # Evaluate with exploration
                    eval_config.noise_scale = current_noise_scale

                    eval_reward_explore = np.vstack(evaluate(eval_config))

                    train_recorder.append('eval_total_reward_explore',
                                          eval_reward_explore)
                    if eval_reward_explore.mean() > best_eval_reward_explore:
                        logger.debug("New best explore model")
                        os.remove(dir_manager.seed_dir / best_model_explore)
                        best_model_explore = f"model_ep{ep_dones}_explore_best.pt"
                        algorithm.save(dir_manager.seed_dir /
                                       best_model_explore)
                        best_eval_reward_explore = eval_reward_explore.mean()

                set_seeds(config.seed + ep_dones)
                env.seed(config.seed + ep_dones)

                # Graphs checkpoints

                logger.debug('Saving recorder checkpoints and graphs')
                train_recorder.save(dir_manager.recorders_dir /
                                    'train_recorder.pkl')

                # Saving graphs

                if len(train_recorder.tape['actor_loss']) > 0:
                    algorithm.save_training_graphs(
                        train_recorder=train_recorder,
                        save_dir=dir_manager.seed_dir)

    # Saves the model one last time and closes the environment

    os.remove(dir_manager.seed_dir / current_model)
    current_model = f"model_ep{ep_dones}.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    env.close()
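
A minimal sketch of how this entry point might be invoked; the get_training_args helper and the module layout are assumptions for illustration, not part of the example:

if __name__ == '__main__':
    # Hypothetical argparse wrapper returning a config namespace with the
    # fields accessed above (env_name, seed, n_episodes, ...).
    config = get_training_args()
    # dir_manager, logger, and pbar fall back to the defaults created in train().
    train(config)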
Example 2
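A two-phase Keras fine-tuning script for a NASNet-based classifier: it first fits the top layers with early stopping and TensorBoard logging, then unfreezes the whole network, lowers the learning rate, continues training, and saves both the model and the merged training history.
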
def main():
    args = parse_args()
    config = get_random_config(args)
    # config = get_config(args)
    print_setup(config)

    model = build_model(optimizer=config['optimizer'],
                        lr=config['learning_rate'],
                        decay=config['decay'],
                        momentum=config['momentum'],
                        loss=config['loss'],
                        classes=8)
    # model.summary()

    train_generator = get_train_generator(args.dataset_dir,
                                          config['batch_size'])
    validation_generator = get_validation_generator(args.dataset_dir,
                                                    config['batch_size'])

    tb_callback = callbacks.TensorBoard(
        log_dir=os.path.join(args.log_dir, config_to_str(config)))
    es_callback = callbacks.EarlyStopping(monitor='val_loss',
                                          min_delta=0,
                                          patience=args.patience,
                                          verbose=1,
                                          mode='auto',
                                          baseline=None,
                                          restore_best_weights=True)

    history = None
    last_epoch = 0

    if not args.train_full:
        history = model.fit_generator(
            train_generator,
            steps_per_epoch=train_generator.samples //
            train_generator.batch_size,
            epochs=config['epochs'],
            verbose=2,
            callbacks=[tb_callback, es_callback],
            validation_data=validation_generator,
            validation_steps=validation_generator.samples //
            validation_generator.batch_size,
            workers=4)

        # this is a small hack: https://github.com/keras-team/keras/issues/1766
        last_epoch = len(history.history['loss'])

        print('\nFine-tuning top layers done. Training full network now...\n')

    # Unfreezes every layer and recompiles so the new trainable flags take
    # effect, then continues with a reduced learning rate.
    for layer in model.layers:
        layer.trainable = True
    K.set_value(model.optimizer.lr,
                config['learning_rate'] * config['second_fit_lr_fraction'])
    model.compile(model.optimizer, model.loss, model.metrics)

    history2 = model.fit_generator(
        train_generator,
        steps_per_epoch=train_generator.samples // train_generator.batch_size,
        epochs=config['epochs'],
        verbose=2,
        callbacks=[tb_callback, es_callback],
        validation_data=validation_generator,
        validation_steps=validation_generator.samples //
        validation_generator.batch_size,
        workers=4,
        initial_epoch=last_epoch)

    model_file = os.path.join(args.output_dir,
                              'nasnet__{}.h5'.format(config_to_str(config)))
    model.save(model_file)
    print('Model saved to {}'.format(model_file))

    # Merges the histories of the two training phases (when args.train_full
    # is set, the first fit is skipped and only the second history exists).
    if history:
        for k in history.history.keys():
            history.history[k].extend(history2.history[k])
    else:
        history = history2
    history_file = os.path.join(
        args.output_dir, 'history__{}.pkl'.format(config_to_str(config)))
    with open(history_file, 'wb') as f:
        pickle.dump(history.history, f)
    print('History saved to {}'.format(history_file))

    print('Best accuracy:', max(history.history['val_acc']))
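
For reference, a random-search helper compatible with the config keys consumed above might look like this sketch; the value ranges are illustrative assumptions, and the real get_random_config is not shown here:

import random

def get_random_config(args):
    # Hypothetical sketch: draws one hyperparameter combination per run.
    # The keys mirror those used by build_model and the fit calls above.
    return {
        'optimizer': random.choice(['adam', 'sgd', 'rmsprop']),
        'learning_rate': 10 ** random.uniform(-4, -2),
        'decay': random.choice([0.0, 1e-6]),
        'momentum': random.uniform(0.8, 0.99),
        'loss': 'categorical_crossentropy',
        'batch_size': random.choice([16, 32, 64]),
        'epochs': random.choice([30, 60]),
        'second_fit_lr_fraction': random.choice([0.1, 0.25, 0.5]),
    }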