# --- Environment-derived quantities ---
num_non_primary_programs = env_tmp.get_num_non_primary_programs()
observation_dim = env_tmp.get_observation_dim()
programs_library = env_tmp.programs_library

# --- AlphaNPI policy: observation encoder + program-conditioned policy net ---
encoder = DrawEnvEncoder(env_tmp.get_observation_dim(), conf.encoding_dim)
# Indices of every non-primary (level > 0) program in the library.
indices_non_primary_programs = [
    p['index'] for p in programs_library.values() if p['level'] > 0
]
policy = Policy(
    encoder,
    conf.hidden_size,
    num_programs,
    num_non_primary_programs,
    conf.program_embedding_dim,
    conf.encoding_dim,
    indices_non_primary_programs,
    conf.learning_rate,
)

# --- Prioritized replay buffer, keyed by non-primary task index ---
idx_tasks = [
    prog['index']
    for prog in env_tmp.programs_library.values()
    if prog['level'] > 0
]
buffer = PrioritizedReplayBuffer(
    conf.buffer_max_length, idx_tasks, p1=conf.proba_replay_buffer
)

# --- Curriculum sequencer over the non-primary programs ---
curriculum_scheduler = CurriculumScheduler(
    conf.reward_threshold,
    num_non_primary_programs,
    programs_library,
    moving_average=0.99,
)

# --- MCTS hyper-parameters ---
length = 5
# Maximum search depth per program level (hard-coded for the draw env).
max_depth_dict = {1: 100, 2: 20, 3: 20}
# Training search: stochastic (exploration via Dirichlet noise at the root).
mcts_train_params = {
    'number_of_simulations': conf.number_of_simulations,
    'max_depth_dict': max_depth_dict,
    'temperature': conf.temperature,
    'c_puct': conf.c_puct,
    'exploit': False,
    'level_closeness_coeff': conf.level_closeness_coeff,
    'gamma': conf.gamma,
    'use_dirichlet_noise': True,
}
# Validation search: deterministic (exploit=True, no Dirichlet noise).
mcts_test_params = {
    'number_of_simulations': conf.number_of_simulations_for_validation,
    'max_depth_dict': max_depth_dict,
    'temperature': conf.temperature,
    'c_puct': conf.c_puct,
    'exploit': True,
    'level_closeness_coeff': conf.level_closeness_coeff,
    'gamma': conf.gamma,
}

# Instantiate trainer
# --- Environment-derived quantities ---
num_non_primary_programs = env_tmp.get_num_non_primary_programs()
observation_dim = env_tmp.get_observation_dim()
programs_library = env_tmp.programs_library

# --- AlphaNPI policy: observation encoder + program-conditioned policy net ---
encoder = ListEnvEncoder(env_tmp.get_observation_dim(), conf.encoding_dim)
# Indices of every non-primary (level > 0) program in the library.
indices_non_primary_programs = [
    p['index'] for p in programs_library.values() if p['level'] > 0
]
policy = Policy(
    encoder,
    conf.hidden_size,
    num_programs,
    num_non_primary_programs,
    conf.program_embedding_dim,
    conf.encoding_dim,
    indices_non_primary_programs,
    conf.learning_rate,
)

# --- Prioritized replay buffer, keyed by non-primary task index ---
idx_tasks = [
    prog['index']
    for prog in env_tmp.programs_library.values()
    if prog['level'] > 0
]
buffer = PrioritizedReplayBuffer(
    conf.buffer_max_length, idx_tasks, p1=conf.proba_replay_buffer
)

# --- Curriculum sequencer over the non-primary programs ---
curriculum_scheduler = CurriculumScheduler(
    conf.reward_threshold,
    num_non_primary_programs,
    programs_library,
    moving_average=0.99,
)

# --- MCTS hyper-parameters ---
length = 5
# Maximum search depth per program level, scaled with the list length.
max_depth_dict = {1: 5, 2: 2 * length + 3, 3: 2 * length + 3}
# Training search: stochastic (exploration via Dirichlet noise at the root).
mcts_train_params = {
    'number_of_simulations': conf.number_of_simulations,
    'max_depth_dict': max_depth_dict,
    'temperature': conf.temperature,
    'c_puct': conf.c_puct,
    'exploit': False,
    'level_closeness_coeff': conf.level_closeness_coeff,
    'gamma': conf.gamma,
    'use_dirichlet_noise': True,
}
# Validation search: deterministic (exploit=True, no Dirichlet noise).
mcts_test_params = {
    'number_of_simulations': conf.number_of_simulations_for_validation,
    'max_depth_dict': max_depth_dict,
    'temperature': conf.temperature,
    'c_puct': conf.c_puct,
    'exploit': True,
    'level_closeness_coeff': conf.level_closeness_coeff,
    'gamma': conf.gamma,
}

# Instantiate trainer