def prepare_experiment(env, args):
    """Build the shared objects used by one learner and several explorers.

    Returns:
        Tuple of (global_rb, queues, is_training_done, lock, trained_steps)
        that is shared across the learner/explorer processes.
    """
    # Manager so the prioritized replay buffer can be shared between a
    # learner and explorers.
    SyncManager.register('PrioritizedReplayBuffer', PrioritizedReplayBuffer)
    manager = SyncManager()
    manager.start()

    rb_kwargs = get_default_rb_dict(args.replay_buffer_size, env)
    rb_kwargs["check_for_update"] = True
    global_rb = manager.PrioritizedReplayBuffer(**rb_kwargs)

    # Queues that carry network parameters from the learner to the explorers.
    # A single vectorized env (n_env > 1) needs only one queue; otherwise one
    # per explorer. One extra queue is reserved for evaluation.
    n_queue = (1 if args.n_env > 1 else args.n_explorer) + 1
    queues = [manager.Queue() for _ in range(n_queue)]

    # Event shared across processes: once set, every explorer stops
    # sampling transitions.
    is_training_done = Event()

    lock = manager.Lock()

    # Shared-memory counter of applied gradient updates.
    trained_steps = Value('i', 0)

    return global_rb, queues, is_training_done, lock, trained_steps
def main(args_):
    """Entry point: spawn explorer processes and one learner process that
    share a prioritized replay buffer.

    Args:
        args_: parsed CLI arguments; reads n_explorer, replay_buffer_size,
            local_buffer_size, max_batch, param_update_freq.
    """
    if args_.n_explorer is None:
        # Leave one core free for the learner.
        n_explorer = multiprocessing.cpu_count() - 1
    else:
        n_explorer = args_.n_explorer
    # `assert` is stripped under `python -O`; validate explicitly instead.
    if n_explorer <= 0:
        raise ValueError("[error] number of explorers must be positive integer")

    env = env_fn()

    # Manager to share PER between a learner and explorers.
    SyncManager.register('PrioritizedReplayBuffer', PrioritizedReplayBuffer)
    manager = SyncManager()
    manager.start()
    global_rb = manager.PrioritizedReplayBuffer(
        obs_shape=env.observation_space.shape,
        act_dim=env.action_space.low.size,
        size=args_.replay_buffer_size)

    # Queues to share network parameters between a learner and explorers
    # (one queue per explorer).
    queues = [Queue() for _ in range(n_explorer)]

    # Event shared across processes: once set, every explorer stops
    # sampling transitions.
    is_training_done = Event()

    lock = manager.Lock()

    # Shared-memory counters for applied gradients and sampled transitions.
    trained_steps = Value('i', 0)
    n_transition = Value('i', 0)

    tasks = []
    # Add explorers.
    for i in range(n_explorer):
        tasks.append(Process(
            target=explorer,
            args=[global_rb, queues[i], trained_steps, n_transition,
                  is_training_done, lock, env_fn, policy_fn,
                  args_.local_buffer_size]))
    # Add learner.
    tasks.append(Process(
        target=learner,
        args=[global_rb, trained_steps, is_training_done, lock, env_fn,
              policy_fn, args_.max_batch, args_.param_update_freq, *queues]))

    for task in tasks:
        task.start()
    for task in tasks:
        task.join()
PER_a = 0.6  # priority exponent: P(i) = p(i) ** a / total_priority ** a

env_dict = {
    "obs": {"shape": state_size},
    "act": {},
    "rew": {},
    "next_obs": {"shape": state_size},
    "done": {},
}
# Shared prioritized replay buffer (fp16 storage to halve memory use).
global_rb = manager.PrioritizedReplayBuffer(memory_size,
                                            env_dict=env_dict,
                                            alpha=PER_a,
                                            default_dtype=np.float16,
                                            check_for_update=True)

n_explorer = multiprocessing.cpu_count() - 1
# Per-explorer exploration rates from the Ape-X paper:
#   eps_i = 0.4 ** (1 + 7 * i / (N - 1))
# Guard the denominator so a single explorer (N == 1, e.g. on a dual-core
# machine) does not raise ZeroDivisionError.
epsilons = [
    pow(0.4, 1 + (i / max(n_explorer - 1, 1)) * 7)
    for i in range(n_explorer)
]

n_queue = n_explorer
n_queue += 1  # for evaluation
# n_queue += 1  # for prefetch
queues = [manager.Queue() for _ in range(n_queue)]

# Event shared across processes: once set, every explorer stops sampling
# transitions.
is_training_done = Event()
# PER hyper-parameters: priority p_i = |delta| + PER_e, so transitions with
# zero TD-error still have a chance to be selected.
PER_e = 0.01
PER_a = 0.6  # priority exponent: P(i) = p(i) ** a / total_priority ** a

env_dict = {
    "obs": {"shape": (state_size, )},
    "act": {},
    "rew": {},
    "next_obs": {"shape": (state_size, )},
    "done": {},
}
# Shared prioritized replay buffer.
global_rb = manager.PrioritizedReplayBuffer(memory_size,
                                            env_dict=env_dict,
                                            alpha=PER_a,
                                            eps=PER_e)

n_explorer = multiprocessing.cpu_count() - 1

# One parameter queue per explorer, plus one for evaluation.
n_queue = n_explorer + 1
queues = [manager.Queue() for _ in range(n_queue)]

# Event shared across processes: once set, every explorer stops sampling
# transitions.
is_training_done = Event()

lock = manager.Lock()

# Shared memory objects to count number of samples and applied gradients