import os
import time

import numpy as np
import torch
from gym.spaces import Box
from torch.autograd import Variable
from tqdm import tqdm

# Project-local helpers (check_training_args, DirectoryManager, create_logger, config_to_str,
# save_dict_to_json, set_seeds, make_parallel_football_env, make_parallel_particle_env,
# init_from_config, ObsBufferCollection, StackingReplayBuffer, ReplayBuffer, EpisodeRecorder,
# round_to_two, get_evaluation_args, evaluate) are assumed to be imported from the project's
# own modules.


def train(config, dir_manager=None, logger=None, pbar="default_pbar"):
    # A few safety checks
    check_training_args(config)

    # Creates a directory manager that encapsulates our directory-tree structure
    if dir_manager is None:
        dir_manager = DirectoryManager(agent_alg=config.agent_alg,
                                       env_name=config.env_name,
                                       desc=config.desc,
                                       seed=config.seed)
        dir_manager.create_directories()

    # Creates the logger and prints the config
    if logger is None:
        logger = create_logger('MASTER', config.log_level, dir_manager.seed_dir / 'logger.out')
    logger.debug(config_to_str(config))

    # Creates a progress-bar
    if isinstance(pbar, str):
        if pbar == "default_pbar":
            pbar = tqdm()

    if pbar is not None:
        pbar.n = 0
        pbar.desc += f'{dir_manager.storage_dir.name}/{dir_manager.experiment_dir.name}/{dir_manager.seed_dir.name}'
        pbar.total = config.n_episodes

    # Encapsulates in a dict all user-defined params that concern the world (scenario.make_world())
    world_params = {}
    world_params['use_dense_rewards'] = config.use_dense_rewards

    if config.env_name == 'chase':
        if config.n_preys is not None:
            world_params['n_preys'] = config.n_preys
        if config.n_preds is not None:
            world_params['n_preds'] = config.n_preds
        if config.prey_variance is not None:
            world_params['prey_variance'] = config.prey_variance
        if config.individual_reward is not None:
            world_params['individual_reward'] = config.individual_reward

    elif config.env_name == 'gather':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents

    elif config.env_name == 'intersection':
        # Merged the two (previously duplicated) 'intersection' branches; the second was unreachable
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.by_stander is not None:
            world_params['by_stander'] = config.by_stander

    elif config.env_name == 'bounce':
        world_params['episode_length'] = config.episode_length
        if config.line_length is not None:
            world_params['line_length'] = config.line_length

    elif config.env_name == 'compromise':
        if config.line_length is not None:
            world_params['line_length'] = config.line_length
        if config.show_all_landmarks is not None:
            world_params['show_all_landmarks'] = config.show_all_landmarks

    elif config.env_name == 'imitation':
        if config.staged is not None:
            world_params['staged'] = config.staged
        if config.set_trap is not None:
            world_params['set_trap'] = config.set_trap

    elif config.env_name == 'spread':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.shuffle_landmarks is not None:
            world_params['shuffle_landmarks'] = config.shuffle_landmarks
        if config.color_objects is not None:
            world_params['color_objects'] = config.color_objects
        if config.small_agents is not None:
            world_params['small_agents'] = config.small_agents

    save_dict_to_json(world_params, str(dir_manager.seed_dir / 'world_params.json'))

    # Encapsulates in a dict all user-defined params that concern the environment
    # (multiagent.environment.MultiAgentEnv)
    env_params = {}
    env_params['env_name'] = config.env_name
    if 'football' not in config.env_name:
        env_params['use_max_speed'] = config.use_max_speed
    save_dict_to_json(env_params, str(dir_manager.seed_dir / 'env_params.json'))

    # Sets the random seeds (for reproducibility)
    set_seeds(config.seed)

    # Initializes environments
    # TODO: Check reproducibility and that different envs are seeded differently
    if config.env_name == '3v2football':
        obs_rep = config.representation
        if config.feature_extractor == 'identity':
            assert obs_rep in ['simple115', 'simple37']
        elif config.feature_extractor == 'convNet':
            assert obs_rep == 'extracted'
        else:
            raise NotImplementedError(
                f"config.feature_extractor={config.feature_extractor} not recognized.")

        env = make_parallel_football_env(
            seed_dir=dir_manager.seed_dir,
            seed=config.seed,
            dump_freq=config.dump_freq,
            representation=obs_rep,
            render=False,  # no rendering during training
            n_rollout_threads=config.n_rollout_threads)
    else:
        env = make_parallel_particle_env(
            scenario_name=config.env_name,
            n_rollout_threads=config.n_rollout_threads,
            seed=config.seed,
            use_discrete_action=config.use_discrete_action,
            use_max_speed=config.use_max_speed,
            world_params=world_params)

    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # Initializes the algorithm
    algorithm = init_from_config(env, config, logger)

    # Creates recorders and stores basic info regarding agent types
    os.makedirs(dir_manager.recorders_dir, exist_ok=True)
    train_recorder = algorithm.create_train_recorder()
    train_recorder.tape['agent_colors'] = env.agent_colors

    if 'football' in config.env_name:
        if config.feature_extractor == "convNet":
            n_stack = 4
        elif config.feature_extractor == "identity":
            n_stack = 1
        else:
            raise NotImplementedError

        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads, n_stack=n_stack)
        replay_buffer = StackingReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                     for acsp in env.action_space],
            n_stack=n_stack)
    else:
        # Defines the observation buffer for multi-step observations
        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads, n_stack=1)
        replay_buffer = ReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                     for acsp in env.action_space])

    # Saves initial models
    current_model = "model_ep0.pt"
    best_eval_reward_exploit = -100000.
    best_model_exploit = "model_ep0_exploit_best.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    algorithm.save(dir_manager.seed_dir / best_model_exploit)

    best_eval_reward_explore = -100000.
    best_model_explore = "model_ep0_explore_best.pt"
    algorithm.save(dir_manager.seed_dir / best_model_explore)

    # Initializes step and episode counters
    step_i = 0
    ep_steps = np.zeros(shape=(config.n_rollout_threads,), dtype=int)
    ep_dones = 0
    ep_recorders = [EpisodeRecorder(stuff_to_record=['reward'])
                    for _ in range(config.n_rollout_threads)]

    obs = env.reset()
    obs_buffers.fill(obs)

    algorithm.set_exploration(
        begin_decay_proportion=config.begin_exploration_decay,
        n_episodes=config.n_episodes,
        end_decay_proportion=config.end_exploration_decay,
        initial_scale=config.init_noise_scale,
        final_scale=config.final_noise_scale,
        current_episode=ep_dones)

    # EPISODES LOOP
    while ep_dones < config.n_episodes:
        start_time = time.time()

        # ENVIRONMENT STEP

        # Converts observations to torch Variables
        torch_obs = [Variable(torch.Tensor(obs_buffers.read()[:, i]), requires_grad=False)
                     for i in range(algorithm.nagents)]

        # Gets actions as torch Variables
        torch_agent_actions = algorithm.select_action(torch_obs, is_exploring=True)

        # Converts actions to numpy arrays
        agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

        # Rearranges actions to be per environment
        actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]

        # Makes one step in the environment
        next_obs, rewards, dones, infos = env.step(actions)

        # Puts transitions in the memory buffer
        replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

        # Saves relevant info in the episode recorders
        for i in range(config.n_rollout_threads):
            ep_recorders[i].add_step(obs[i], actions[i], rewards[i], next_obs[i])

        # Ends the step
        obs = next_obs
        obs_buffers.append(obs)
        step_i += config.n_rollout_threads
        step_time = time.time() - start_time
        ep_steps += 1

        # LEARNING STEP
        if (len(replay_buffer) >= config.batch_size * config.warmup) \
                and (step_i % config.steps_per_update) < config.n_rollout_threads:

            # Prepares models for training
            if config.use_cuda:
                algorithm.prep_training(device='gpu')
            else:
                algorithm.prep_training(device='cpu')

            # Performs one algorithm update
            sample = replay_buffer.sample(config.batch_size,
                                          to_gpu=config.use_cuda,
                                          normalize_rewards=False)
            algorithm.update(sample, train_recorder)

            # Updates target networks
            algorithm.update_all_targets()

            # Prepares models to go back to the rollout phase
            algorithm.prep_rollouts(device='cpu')

        # EPISODE ENDINGS
        episodes_over = dones | (ep_steps >= config.episode_length)
        if any(episodes_over):
            if pbar is not None:
                pbar.update(sum(episodes_over))

            for env_i, is_over in enumerate(episodes_over):
                if is_over:
                    ep_dones += 1
                    ep_steps[env_i] = 0

                    # Resets the environment
                    obs[env_i] = env.reset(env_i=env_i)
                    obs_buffers[env_i].flush()
                    obs_buffers[env_i].fill(obs[env_i])

                    # Summarizes episode metrics
                    train_recorder.append('total_reward',
                                          ep_recorders[env_i].get_total_reward())

                    # Reinitializes the episode recorder
                    ep_recorders[env_i] = EpisodeRecorder(stuff_to_record=['reward'])

                    # Prints progress once every third of training
                    if (ep_dones - 1) % (config.n_episodes // 3) == 0 and ep_dones != config.n_episodes:
                        step_time = time.time() - start_time
                        logger.info(
                            f"Episode {ep_dones}/{config.n_episodes}, "
                            f"speed={round_to_two(float(config.n_rollout_threads) / step_time)}steps/s")

            # Sets the exploration noise
            current_noise_scale = algorithm.set_exploration(
                begin_decay_proportion=config.begin_exploration_decay,
                n_episodes=config.n_episodes,
                end_decay_proportion=config.end_exploration_decay,
                initial_scale=config.init_noise_scale,
                final_scale=config.final_noise_scale,
                current_episode=ep_dones)

            # BOOK-KEEPING
            if ep_dones % config.episodes_per_save < config.n_rollout_threads:

                # Model checkpoints
                if config.save_incrementals:
                    os.makedirs(dir_manager.incrementals_dir, exist_ok=True)
                    algorithm.save(dir_manager.incrementals_dir / ('model_ep%i.pt' % (ep_dones + 1)))

                os.remove(dir_manager.seed_dir / current_model)
                current_model = f"model_ep{ep_dones}.pt"
                algorithm.save(dir_manager.seed_dir / current_model)
                logger.debug('Saving model checkpoint')

                # Current model evaluation (runs episodes without exploration)
                if config.n_evaluation_episodes > 0:
                    logger.debug(f'Evaluating model for {config.n_evaluation_episodes} episodes')
                    set_seeds(config.evaluation_seed)  # fixed seed for evaluation
                    env.seed(config.evaluation_seed)

                    eval_config = get_evaluation_args(overwritten_args="")
                    eval_config.storage_name = dir_manager.storage_dir.name
                    eval_config.experiment_num = int(dir_manager.experiment_dir.stem.strip('experiment'))
                    eval_config.seed_num = int(dir_manager.seed_dir.stem.strip('seed'))
                    eval_config.render = False
                    eval_config.n_episodes = config.n_evaluation_episodes
                    eval_config.last_model = True
                    eval_config.noise_scale = None
                    eval_config.episode_length = config.episode_length
                    eval_config.representation = config.representation

                    # Evaluates in exploit mode (without exploration)
                    eval_reward_exploit = np.vstack(evaluate(eval_config))
                    train_recorder.append('eval_episodes', ep_dones)
                    train_recorder.append('eval_total_reward_exploit', eval_reward_exploit)
                    if eval_reward_exploit.mean() > best_eval_reward_exploit:
                        logger.debug("New best exploit model")
                        os.remove(dir_manager.seed_dir / best_model_exploit)
                        best_model_exploit = f"model_ep{ep_dones}_exploit_best.pt"
                        algorithm.save(dir_manager.seed_dir / best_model_exploit)
                        best_eval_reward_exploit = eval_reward_exploit.mean()

                    # Evaluates with exploration
                    eval_config.noise_scale = current_noise_scale
                    eval_reward_explore = np.vstack(evaluate(eval_config))
                    train_recorder.append('eval_total_reward_explore', eval_reward_explore)
                    if eval_reward_explore.mean() > best_eval_reward_explore:
                        logger.debug("New best explore model")
                        os.remove(dir_manager.seed_dir / best_model_explore)
                        best_model_explore = f"model_ep{ep_dones}_explore_best.pt"
                        algorithm.save(dir_manager.seed_dir / best_model_explore)
                        best_eval_reward_explore = eval_reward_explore.mean()

                    # Restores the training seeds
                    set_seeds(config.seed + ep_dones)
                    env.seed(config.seed + ep_dones)

                # Recorder checkpoints
                logger.debug('Saving recorder checkpoints and graphs')
                train_recorder.save(dir_manager.recorders_dir / 'train_recorder.pkl')

                # Saving graphs
                if len(train_recorder.tape['actor_loss']) > 0:
                    algorithm.save_training_graphs(train_recorder=train_recorder,
                                                   save_dir=dir_manager.seed_dir)

    # Saves the model one last time and closes the environment
    os.remove(dir_manager.seed_dir / current_model)
    current_model = f"model_ep{ep_dones}.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    env.close()
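
# The exploration schedule above is driven by algorithm.set_exploration(), whose
# implementation is not part of this excerpt. The helper below is a hypothetical
# sketch of one plausible interpretation of its parameters: the noise scale stays
# at initial_scale until begin_decay_proportion of the episodes have elapsed,
# decays linearly until end_decay_proportion, and stays at final_scale afterwards.
def linear_exploration_scale(current_episode, n_episodes,
                             begin_decay_proportion, end_decay_proportion,
                             initial_scale, final_scale):
    """Returns an exploration-noise scale for the current episode (sketch only)."""
    begin = begin_decay_proportion * n_episodes
    end = end_decay_proportion * n_episodes
    if current_episode <= begin:
        return initial_scale
    if current_episode >= end:
        return final_scale
    progress = (current_episode - begin) / (end - begin)
    return initial_scale + progress * (final_scale - initial_scale)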
import os
import pickle

from keras import backend as K
from keras import callbacks

# Project-local helpers (parse_args, get_random_config, print_setup, build_model,
# get_train_generator, get_validation_generator, config_to_str) are assumed to be
# imported from the project's own modules.


def main():
    args = parse_args()
    config = get_random_config(args)
    # config = get_config(args)
    print_setup(config)

    model = build_model(optimizer=config['optimizer'],
                        lr=config['learning_rate'],
                        decay=config['decay'],
                        momentum=config['momentum'],
                        loss=config['loss'],
                        classes=8)
    # model.summary()

    train_generator = get_train_generator(args.dataset_dir, config['batch_size'])
    validation_generator = get_validation_generator(args.dataset_dir, config['batch_size'])

    tb_callback = callbacks.TensorBoard(
        log_dir=os.path.join(args.log_dir, config_to_str(config)))
    es_callback = callbacks.EarlyStopping(monitor='val_loss',
                                          min_delta=0,
                                          patience=args.patience,
                                          verbose=1,
                                          mode='auto',
                                          baseline=None,
                                          restore_best_weights=True)

    history = None
    last_epoch = 0
    if not args.train_full:
        # First phase: fine-tune only the top layers
        history = model.fit_generator(
            train_generator,
            steps_per_epoch=train_generator.samples // train_generator.batch_size,
            epochs=config['epochs'],
            verbose=2,
            callbacks=[tb_callback, es_callback],
            validation_data=validation_generator,
            validation_steps=validation_generator.samples // validation_generator.batch_size,
            workers=4)

        # This is a small hack: https://github.com/keras-team/keras/issues/1766
        last_epoch = len(history.history['loss'])

        print('\nFine-tuning top layers done. Training full network now...\n')

    # Second phase: unfreeze all layers and train the full network with a reduced learning rate
    for layer in model.layers:
        layer.trainable = True
    K.set_value(model.optimizer.lr, config['learning_rate'] * config['second_fit_lr_fraction'])
    model.compile(model.optimizer, model.loss, model.metrics)

    history2 = model.fit_generator(
        train_generator,
        steps_per_epoch=train_generator.samples // train_generator.batch_size,
        epochs=config['epochs'],
        verbose=2,
        callbacks=[tb_callback, es_callback],
        validation_data=validation_generator,
        validation_steps=validation_generator.samples // validation_generator.batch_size,
        workers=4,
        initial_epoch=last_epoch)

    model_file = os.path.join(args.output_dir, 'nasnet__{}.h5'.format(config_to_str(config)))
    model.save(model_file)
    print('Model saved to {}'.format(model_file))

    # Merges the two training histories before saving
    if history:
        for k in history.history.keys():
            history.history[k].extend(history2.history[k])
    else:
        history = history2

    history_file = os.path.join(args.output_dir, 'history__{}.pkl'.format(config_to_str(config)))
    with open(history_file, 'wb') as f:
        pickle.dump(history.history, f)
    print('History saved to {}'.format(history_file))

    print('Best accuracy:', max(history.history['val_acc']))
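
import random

# Hypothetical illustration (not part of the original script): the dict returned by
# get_random_config() must provide at least the keys consumed by main() above. The
# values below are placeholder guesses, not the project's actual search space.
def example_random_config():
    return {
        'optimizer': random.choice(['sgd', 'adam']),
        'learning_rate': random.choice([1e-3, 1e-4]),
        'decay': 0.0,
        'momentum': 0.9,
        'loss': 'categorical_crossentropy',
        'batch_size': random.choice([16, 32]),
        'epochs': 50,
        'second_fit_lr_fraction': 0.1,
    }


if __name__ == '__main__':
    # Standard script entry point (assumed; not shown in the original excerpt)
    main()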