Example #1
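# This example wires together the AlphaNPI-style training components for a
# drawing environment. It assumes `env_tmp`, `num_programs`, and a `conf`
# namespace of hyperparameters already exist; the sketch below shows one
# plausible preamble. The module paths and the DrawEnv constructor are
# illustrative assumptions, not the project's confirmed layout.
#
# from environments.draw_env import DrawEnv, DrawEnvEncoder  # assumed paths
# from core.policy import Policy
# from core.prioritized_replay_buffer import PrioritizedReplayBuffer
# from core.curriculum import CurriculumScheduler
#
# env_tmp = DrawEnv()                        # hypothetical constructor
# num_programs = env_tmp.get_num_programs()  # assumed accessor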
num_non_primary_programs = env_tmp.get_num_non_primary_programs()
observation_dim = env_tmp.get_observation_dim()
programs_library = env_tmp.programs_library

# Load alphanpi policy
encoder = DrawEnvEncoder(observation_dim, conf.encoding_dim)
indices_non_primary_programs = [p['index'] for _, p in programs_library.items() if p['level'] > 0]
policy = Policy(encoder, conf.hidden_size, num_programs, num_non_primary_programs, conf.program_embedding_dim,
                conf.encoding_dim, indices_non_primary_programs, conf.learning_rate)

# Load replay buffer
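# Note: idx_tasks recomputes the same list as indices_non_primary_programs
# above. p1 (conf.proba_replay_buffer) presumably biases sampling within the
# buffer; the exact semantics depend on the PrioritizedReplayBuffer
# implementation.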
idx_tasks = [prog['index'] for key, prog in env_tmp.programs_library.items() if prog['level'] > 0]
buffer = PrioritizedReplayBuffer(conf.buffer_max_length, idx_tasks, p1=conf.proba_replay_buffer)

# Load curriculum sequencer
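# The scheduler presumably keeps a moving average (decay 0.99) of each task's
# validation reward and advances to harder programs once conf.reward_threshold
# is reached.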
curriculum_scheduler = CurriculumScheduler(conf.reward_threshold, num_non_primary_programs, programs_library,
                                           moving_average=0.99)

# Prepare mcts params
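# max_depth_dict presumably caps the MCTS tree depth per program level
# (level -> maximum depth). Note that `length` is unused in this variant;
# Example #2 below derives its depth budget from it instead.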
length = 5
max_depth_dict = {1: 100, 2: 20, 3: 20}
mcts_train_params = {'number_of_simulations': conf.number_of_simulations, 'max_depth_dict': max_depth_dict,
                     'temperature': conf.temperature, 'c_puct': conf.c_puct, 'exploit': False,
                     'level_closeness_coeff': conf.level_closeness_coeff, 'gamma': conf.gamma,
                     'use_dirichlet_noise': True}

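# Validation-time search: exploit=True selects actions greedily, and
# 'use_dirichlet_noise' is omitted (assumed to default to False).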
mcts_test_params = {'number_of_simulations': conf.number_of_simulations_for_validation,
                    'max_depth_dict': max_depth_dict, 'temperature': conf.temperature,
                    'c_puct': conf.c_puct, 'exploit': True, 'level_closeness_coeff': conf.level_closeness_coeff,
                    'gamma': conf.gamma}

# Instantiate trainer
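# One plausible continuation, sketched under assumptions: the Trainer name and
# its argument list below are patterned on the objects built above, not a
# confirmed signature.
#
# trainer = Trainer(env_tmp, policy, buffer, curriculum_scheduler,
#                   mcts_train_params, mcts_test_params,
#                   conf.batch_size, conf.num_episodes_per_task,
#                   conf.num_updates_per_episode)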

Example #2
# Same pipeline as Example #1, for a list environment: only the encoder and
# the length-derived MCTS depth budget differ.
num_non_primary_programs = env_tmp.get_num_non_primary_programs()
observation_dim = env_tmp.get_observation_dim()
programs_library = env_tmp.programs_library

# Load alphanpi policy
encoder = ListEnvEncoder(observation_dim, conf.encoding_dim)
indices_non_primary_programs = [p['index'] for _, p in programs_library.items() if p['level'] > 0]
policy = Policy(encoder, conf.hidden_size, num_programs, num_non_primary_programs, conf.program_embedding_dim,
                conf.encoding_dim, indices_non_primary_programs, conf.learning_rate)

# Load replay buffer
idx_tasks = [prog['index'] for key, prog in env_tmp.programs_library.items() if prog['level'] > 0]
buffer = PrioritizedReplayBuffer(conf.buffer_max_length, idx_tasks, p1=conf.proba_replay_buffer)

# Load curriculum sequencer
curriculum_scheduler = CurriculumScheduler(conf.reward_threshold, num_non_primary_programs, programs_library,
                                           moving_average=0.99)

# Prepare mcts params
length = 5
max_depth_dict = {1: 5, 2: 2 * length + 3, 3: 2 * length + 3}  # depth budget scales with list length
mcts_train_params = {'number_of_simulations': conf.number_of_simulations, 'max_depth_dict': max_depth_dict,
                     'temperature': conf.temperature, 'c_puct': conf.c_puct, 'exploit': False,
                     'level_closeness_coeff': conf.level_closeness_coeff, 'gamma': conf.gamma,
                     'use_dirichlet_noise': True}

mcts_test_params = {'number_of_simulations': conf.number_of_simulations_for_validation,
                    'max_depth_dict': max_depth_dict, 'temperature': conf.temperature,
                    'c_puct': conf.c_puct, 'exploit': True, 'level_closeness_coeff': conf.level_closeness_coeff,
                    'gamma': conf.gamma}

# Instantiate trainer
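# A minimal sketch of the curriculum-driven loop that typically follows the
# trainer instantiation. Every method name here (get_next_task_index,
# play_iteration, validation_step, update_statistics) and conf.num_iterations
# are hypothetical placeholders, not a confirmed API:
#
# trainer = Trainer(...)  # as sketched in Example #1
# for iteration in range(conf.num_iterations):
#     task_index = curriculum_scheduler.get_next_task_index()
#     trainer.play_iteration(task_index)             # MCTS episodes + updates
#     rewards = trainer.validation_step(task_index)  # uses mcts_test_params
#     curriculum_scheduler.update_statistics(task_index, rewards)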