def actor(args):
    device = args["device"]
    discount_factor = args["discount_factor"]

    # Scratch buffers for n-step returns.
    n_step = args["n_step"]
    size_local_memory_buffer = args["size_local_memory_buffer"] + n_step
    local_buffer_T = [None] * size_local_memory_buffer  # Transitions
    local_buffer_Q = [None] * size_local_memory_buffer  # Q values
    buffer_idx = 0
    n_step_S = [None] * n_step  # States
    n_step_A = [None] * n_step  # Actions
    n_step_Q = [None] * n_step  # Q values
    n_step_R = [0] * n_step     # Rewards
    n_step_idx = 0              # index

    # Set up the network and put it in eval mode.
    NN = args["model"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"], args["device"])
    else:
        model = NN()
    model.to(device)
    model.eval()

    # Env and env params.
    no_envs = args["no_envs"]
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)
    size = env.system_size
    no_actions = int(env.action_space.high[-1])
    grid_shift = int(env.system_size / 2)
    epsilon = args["epsilon"]
    transition_type = np.dtype([('perspective', (np.int, (2, size, size))),
                                ('action', action_type),
                                ('reward', np.float),
                                ('next_perspective', (np.int, (2, size, size))),
                                ('terminal', np.bool)])

    # Startup.
    state = envs.resetAll()
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    # One extra column is kept so the rolled Q-values below provide a
    # bootstrap target for every transition that is actually sent.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer), dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4), dtype=np.int)        # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer), dtype=(np.float, 3))    # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))                         # Rewards
    buffer_idx = 0

    # Get initial network params from the learner via MPI broadcast.
    base_comm = args["mpi_base_comm"]
    learner_rank = args["mpi_learner_rank"]
    msg = None
    msg = base_comm.bcast(msg, root=learner_rank)
    msg, weights = msg
    if msg != "weights":
        weights = None

    # Load weights.
    if weights is not None:
        vector_to_parameters(weights.to(device), model.parameters())

    # Init counters.
    steps_counter = 0
    update_counter = 1
    local_memory_index = 0

    # Main loop over training steps.
    print("Actor start loop")
    while True:
        steps_per_episode += 1

        # Select action using an epsilon-greedy policy.
        action, q_values = selectActionParallel(number_of_actions=no_actions,
                                                epsilon=epsilon,
                                                grid_shift=grid_shift,
                                                toric_size=size,
                                                state=state,
                                                model=model,
                                                device=device)

        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state, next_state,
                                                terminal_state, grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If the buffer is full, sync with the learner and send transitions.
        if buffer_idx >= size_local_memory_buffer:
            print("Actor Sync")

            # Receive new weights (or a terminate signal).
            msg = base_comm.bcast(msg, root=learner_rank)
            msg, weights = msg
            if msg == "weights":
                vector_to_parameters(weights.to(device), model.parameters())
            elif msg == "terminate":
                break

            # Compute initial priorities locally before sending.
            priorities = computePrioritiesParallel(local_buffer_A[:, :-1],
                                                   local_buffer_R[:, :-1],
                                                   local_buffer_Q[:, :-1],
                                                   np.roll(local_buffer_Q, -1, axis=1)[:, :-1],
                                                   discount_factor)

            to_send = [*zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())]

            # Send buffer to the learner.
            base_comm.gather(to_send, root=learner_rank)
            buffer_idx = 0

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):
            # Reset terminal envs.
            idx = np.argwhere(np.logical_or(terminal_state, too_many_steps)).flatten()
            reset_states = envs.resetTerminalEnvs(idx)
            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state
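The MPI variant above expects a learner process on the same communicator that broadcasts ("weights", weight_vector) or ("terminate", None) tuples and gathers the actors' (transition, priority) lists. The following is a minimal sketch of that counterpart, inferred from the actor's call ordering; the function name learner_sync_round and the replay_memory.save() helper are hypothetical, while base_comm and learner_rank are assumed to be an mpi4py communicator and rank as in the actor.

from torch.nn.utils import parameters_to_vector

def learner_sync_round(base_comm, learner_rank, model, replay_memory, terminate=False):
    # Hypothetical learner-side counterpart; called once per actor sync round,
    # after an initial ("weights", ...) broadcast at startup.
    if terminate:
        base_comm.bcast(("terminate", None), root=learner_rank)
        return

    # Broadcast the current policy weights as a flat CPU tensor.
    weights = parameters_to_vector(model.parameters()).detach().cpu()
    base_comm.bcast(("weights", weights), root=learner_rank)

    # Gather one list of (transition, priority) pairs per rank;
    # the learner contributes an empty list for its own slot.
    for actor_batch in base_comm.gather([], root=learner_rank):
        for transition, priority in actor_batch:
            replay_memory.save(transition, priority)  # hypothetical replay-buffer API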
def actor(args):
    no_envs = args["no_envs"]
    device = args["device"]
    discount_factor = args["discount_factor"]
    epsilon_final = np.array(args["epsilon_final"])
    epsilon = np.ones(len(epsilon_final))
    epsilon_delta = args["epsilon_delta"]
    actor_id = args["id"]
    should_log = args["log_actor"]

    # Env and env params.
    env_p_error_start = args["env_p_error_start"]
    env_p_error_final = args["env_p_error_final"]
    env_p_error_delta = args["env_p_error_delta"]
    env_p_error_strategy = args['env_p_error_strategy']
    env_p_errors = np.ones(no_envs) * env_p_error_start
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)
    size = env.system_size
    transition_type = np.dtype([('perspective', (np.int, (2, size, size))),
                                ('action', action_type),
                                ('reward', np.float),
                                ('next_perspective', (np.int, (2, size, size))),
                                ('terminal', np.bool)])
    no_actions = int(env.action_space.high[-1])
    grid_shift = int(size / 2)

    # Startup.
    state = envs.resetAll(p_errors=env_p_errors)
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer), dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4), dtype=np.int)        # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer), dtype=(np.float, 3))    # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))                         # Rewards
    buffer_idx = 0

    # Comms.
    actor_io_queue = args["actor_io_queue"]            # Transition queue
    shared_mem_weights = args["shared_mem_weights"]
    shared_mem_weight_id = args["shared_mem_weight_id"]
    current_weight_id = 0
    new_weights = False

    # Init networks.
    NN = args["model"]
    model_no_params = args["model_no_params"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"], args["device"])
    else:
        model = NN()

    # Load initial network weights from shared memory.
    weights = np.empty(model_no_params)
    with shared_mem_weights.get_lock():
        reader = np.frombuffer(shared_mem_weights.get_obj())
        np.copyto(weights, reader)
    vector_to_parameters(from_numpy(weights).type(torch.FloatTensor).to(device), model.parameters())

    model.to(device)
    model.eval()

    performance_start = time.time()
    performance_stop = None

    print("Actor ", actor_id, ": starting loop device: ", device)

    # Main loop over training steps.
    while True:
        steps_per_episode += 1

        # Select action using an epsilon-greedy policy.
        action, q_values = selectActionBatch(number_of_actions=no_actions,
                                             epsilon=epsilon,
                                             grid_shift=grid_shift,
                                             toric_size=size,
                                             state=state,
                                             model=model,
                                             device=device)

        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state, next_state,
                                                terminal_state, grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If buffer full, send transitions.
        if buffer_idx >= size_local_memory_buffer:
            # Read new weights from shared memory if the learner has published any.
            with shared_mem_weights.get_lock():
                if current_weight_id < shared_mem_weight_id.value:
                    print("Actor updated network params.")
                    reader = np.frombuffer(shared_mem_weights.get_obj())
                    np.copyto(weights, reader)
                    current_weight_id = shared_mem_weight_id.value
                    new_weights = True

            # Load new weights into model.
            if new_weights:
                new_weights = False
                vector_to_parameters(from_numpy(weights).type(torch.FloatTensor).to(device), model.parameters())

            epsilon = np.maximum(epsilon - epsilon_delta, epsilon_final)

            # Compute priorities.
            priorities = computePrioritiesParallel(local_buffer_A[:, :-1],
                                                   local_buffer_R[:, :-1],
                                                   local_buffer_Q[:, :-1],
                                                   np.roll(local_buffer_Q, -1, axis=1)[:, :-1],
                                                   discount_factor)

            to_send = [*zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())]

            performance_stop = time.time()
            performance_elapsed = performance_stop - performance_start
            performance_transitions = len(to_send)
            # print("Actor ", actor_id, " generating ", performance_transitions / performance_elapsed, " transitions/s")
            performance_start = time.time()

            # Send buffer to the learner.
            actor_io_queue.put(to_send)
            buffer_idx = 0

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):
            # Find terminal envs.
            idx = np.argwhere(np.logical_or(terminal_state, too_many_steps)).flatten()
            # Raise the p_error ceiling for the envs that just finished.
            env_p_errors[idx] = np.minimum(env_p_error_final, env_p_errors[idx] + env_p_error_delta)
            if env_p_error_strategy == 'random':
                # Randomly select a new p_error below the ceiling.
                p_errors = np.random.uniform(env_p_error_start, env_p_errors[idx])
            else:  # linear
                # Use the ceiling itself as the new p_error.
                p_errors = env_p_errors[idx]
            reset_states = envs.resetTerminalEnvs(idx, p_errors=p_errors)  # reset using the new p_error
            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state
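The shared-memory variant assumes the learner periodically writes a flat weight vector into shared_mem_weights and bumps shared_mem_weight_id so the actors know a newer version is available. A minimal sketch of that publishing step, assuming multiprocessing Array/Value objects whose size matches model_no_params (the function name publish_weights is hypothetical):

import numpy as np
import torch.multiprocessing as mp
from torch.nn.utils import parameters_to_vector

def publish_weights(model, shared_mem_weights, shared_mem_weight_id):
    # Flatten the current policy parameters to a CPU numpy vector.
    flat = parameters_to_vector(model.parameters()).detach().cpu().numpy().astype(np.float64)
    with shared_mem_weights.get_lock():
        writer = np.frombuffer(shared_mem_weights.get_obj())
        np.copyto(writer, flat)          # overwrite the shared buffer in place
        shared_mem_weight_id.value += 1  # signal actors that new weights exist

# Hypothetical setup; sizes must match model_no_params used by the actor:
# shared_mem_weights   = mp.Array('d', model_no_params)
# shared_mem_weight_id = mp.Value('i', 0)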
env = gym.make('toric-code-v0', config=env_config)
envs = EnvSet(env, NO_ENVS)

transition_type = np.dtype([('perspective', (np.int, (2, SIZE, SIZE))),
                            ('action', action_type),
                            ('reward', np.float),
                            ('next_perspective', (np.int, (2, SIZE, SIZE))),
                            ('terminal', np.bool)])

model = ResNet18()
states = envs.resetAll()

# Warm-up call so the Numba kernels are JIT-compiled before timing.
a, q = numba_sap(3, 0, int(SIZE / 2), SIZE, states, model, 'cpu')
next_state, reward, terminal, _ = envs.step(a)
transitions = numba_gtp(a, reward, states, next_state, terminal, int(SIZE / 2), transition_type)

for i in range(100):
    states = envs.resetAll()
    a, q = numba_sap(3, 0, int(SIZE / 2), SIZE, states, model, 'cpu')
    next_state, reward, terminal, _ = envs.step(a)

    start = time.time()
    mid = time.time()
    transitions = numba_gtp(a, reward, states, next_state, terminal, int(SIZE / 2), transition_type)
    end = time.time()
def actor(args):
    no_envs = args["no_envs"]
    device = args["device"]
    discount_factor = args["discount_factor"]
    epsilon = np.array(args["epsilon"])

    # Env and env params.
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)
    size = env.system_size
    transition_type = np.dtype([('perspective', (np.int, (2, size, size))),
                                ('action', action_type),
                                ('reward', np.float),
                                ('next_perspective', (np.int, (2, size, size))),
                                ('terminal', np.bool)])
    no_actions = int(env.action_space.high[-1])
    grid_shift = int(size / 2)

    # Startup.
    state = envs.resetAll()
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer), dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4), dtype=np.int)        # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer), dtype=(np.float, 3))    # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))                         # Rewards
    buffer_idx = 0

    # Set network to eval mode.
    NN = args["model"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"], args["device"])
    else:
        model = NN()
    model.to(device)
    model.eval()

    count = 0

    # Main loop over training steps.
    while True:
        steps_per_episode += 1

        # Select action using an epsilon-greedy policy.
        action, q_values = selectActionBatch(number_of_actions=no_actions,
                                             epsilon=epsilon,
                                             grid_shift=grid_shift,
                                             toric_size=size,
                                             state=state,
                                             model=model,
                                             device=device)

        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state, next_state,
                                                terminal_state, grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If the buffer is full, compute priorities and return the batch:
        # this variant hands the first full buffer back to the caller
        # instead of sending it to a learner.
        if buffer_idx >= size_local_memory_buffer:
            priorities = computePrioritiesParallel(local_buffer_A[:, :-1],
                                                   local_buffer_R[:, :-1],
                                                   local_buffer_Q[:, :-1],
                                                   np.roll(local_buffer_Q, -1, axis=1)[:, :-1],
                                                   discount_factor)

            to_send = [*zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())]
            return to_send

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):
            # Reset terminal envs.
            idx = np.argwhere(np.logical_or(terminal_state, too_many_steps)).flatten()
            reset_states = envs.resetTerminalEnvs(idx)
            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state
        count += 1
def actor(rank, world_size, args):
    """ An actor that performs actions in an environment.

    Params
    ======
    rank:       (int) rank of the actor in a multiprocessing setting.
    world_size: (int) total number of actors and learners.
    args:       (dict) training-specific parameters
        {
        train_steps:                (int) number of training steps
        , max_actions_per_episode:  (int) number of actions before the episode is cut short
        , update_policy:            (int) (deprecated) number of steps until updating policy
        , size_local_memory_buffer: (int) size of the local replay buffer
        , min_qubit_errors:         (int) minimum number of qubit errors on the toric code
        , model:                    (Class torch.nn) model to make predictions
        , model_config:             (dict)
            {
            system_size:        (int) size of the toric grid.
            , number_of_actions (int)
            }
        , env:                      (String) environment to act in
        , env_config                (dict)
            {
            size:             (int)
            , min_qbit_errors (int)
            , p_error         (float)
            }
        , device:                   (String) {"cpu", "cuda"} device to operate on whenever possible
        , epsilon:                  (float) probability of selecting a random action
        , n_step:                   (int) n-step learning
        , con_learner:              (multiprocessing.Connection) connection on which new weights
                                    and termination messages are received
        , transition_queue:         (multiprocessing.Queue) SimpleQueue where transitions are
                                    sent to the replay buffer
        }
    """
    # Queues.
    con_learner = args["con_learner"]
    transition_queue_to_memory = args["transition_queue_to_memory"]

    no_envs = args["no_envs"]
    device = args["device"]
    discount_factor = args["discount_factor"]
    epsilon = np.array(args["epsilon"])

    # Env and env params.
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)
    size = env.system_size
    transition_type = np.dtype([('perspective', (np.int, (2, size, size))),
                                ('action', action_type),
                                ('reward', np.float),
                                ('next_perspective', (np.int, (2, size, size))),
                                ('terminal', np.bool)])
    no_actions = int(env.action_space.high[-1])
    grid_shift = int(size / 2)

    # Startup.
    state = envs.resetAll()
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer), dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4), dtype=np.int)        # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer), dtype=(np.float, 3))    # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))                         # Rewards
    buffer_idx = 0

    # Set network to eval mode.
    NN = args["model"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"], args["device"])
    else:
        model = NN()

    # Get initial network params (block until the learner sends them).
    weights = None
    while weights is None:
        msg, weights = con_learner.recv()  # blocking op
        if msg != "weights":
            weights = None

    # Load weights.
    vector_to_parameters(weights, model.parameters())
    model.to(device)
    model.eval()

    # Main loop over training steps.
    while True:
        # Check for new weights or a termination request from the learner.
        if con_learner.poll():
            msg, weights = con_learner.recv()
            if msg == "weights":
                vector_to_parameters(weights, model.parameters())
            elif msg == "prep_terminate":
                con_learner.send("ok")
                break

        steps_per_episode += 1

        # Select action using an epsilon-greedy policy.
        action, q_values = selectActionBatch(number_of_actions=no_actions,
                                             epsilon=epsilon,
                                             grid_shift=grid_shift,
                                             toric_size=size,
                                             state=state,
                                             model=model,
                                             device=device)

        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state, next_state,
                                                terminal_state, grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If buffer full, send transitions.
        if buffer_idx >= size_local_memory_buffer:
            priorities = computePrioritiesParallel(local_buffer_A[:, :-1],
                                                   local_buffer_R[:, :-1],
                                                   local_buffer_Q[:, :-1],
                                                   np.roll(local_buffer_Q, -1, axis=1)[:, :-1],
                                                   discount_factor)

            to_send = [*zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())]

            # Send buffer to the learner.
            transition_queue_to_memory.put(to_send)
            buffer_idx = 0

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):
            # Reset terminal envs.
            idx = np.argwhere(np.logical_or(terminal_state, too_many_steps)).flatten()
            reset_states = envs.resetTerminalEnvs(idx)
            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state

    # Ready to terminate: wait for the final terminate message.
    while True:
        msg, _ = con_learner.recv()
        if msg == "terminate":
            transition_queue_to_memory.close()
            con_learner.send("ok")
            break
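This pipe-based variant expects the parent (learner) process to create the connection and queue before spawning the actor, to send an initial ("weights", flat_weight_vector) message, and later to drive the two-phase shutdown ("prep_terminate" followed by "terminate"). A minimal launch sketch under those assumptions; the helper name spawn_actor is hypothetical, everything else mirrors the messages the actor above expects.

import torch.multiprocessing as mp
from torch.nn.utils import parameters_to_vector

def spawn_actor(rank, world_size, args, model):
    # Hypothetical helper: wire up the comms the actor reads from args.
    learner_end, actor_end = mp.Pipe(duplex=True)
    transition_queue_to_memory = mp.Queue()

    actor_args = dict(args,
                      con_learner=actor_end,
                      transition_queue_to_memory=transition_queue_to_memory)
    p = mp.Process(target=actor, args=(rank, world_size, actor_args))
    p.start()

    # Initial weight hand-off the actor blocks on before its main loop.
    learner_end.send(("weights", parameters_to_vector(model.parameters()).detach().cpu()))
    return p, learner_end, transition_queue_to_memory

# Shutdown, matching the two-phase handshake the actor implements:
# learner_end.send(("prep_terminate", None)); learner_end.recv()
# learner_end.send(("terminate", None));      learner_end.recv()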