Example #1
0
def actor(args):

    device = args["device"]

    discount_factor = args["discount_factor"]

    # local buffer of fixed size to store transitions before sending
    # (note: this list-based n-step buffer is unused in this version; it is
    # superseded by the numpy buffers allocated after the envs are created)
    n_step = args["n_step"]
    size_local_memory_buffer = args["size_local_memory_buffer"] + n_step
    local_buffer_T = [None] * size_local_memory_buffer  # Transitions
    local_buffer_Q = [None] * size_local_memory_buffer  # Q values
    buffer_idx = 0
    n_step_S = [None] * n_step  # States
    n_step_A = [None] * n_step  # Actions
    n_step_Q = [None] * n_step  # Q values
    n_step_R = [0] * n_step  # Rewards
    n_step_idx = 0  # index

    # instantiate the network (moved to the device and set to eval mode below)
    NN = args["model"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"],
                   args["device"])
    else:
        model = NN()

    model.to(device)
    model.eval()

    # env and env params
    no_envs = args["no_envs"]
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)
    size = env.system_size

    no_actions = int(env.action_space.high[-1])
    grid_shift = int(env.system_size / 2)

    epsilon = args["epsilon"]

    transition_type = np.dtype([('perspective', (int, (2, size, size))),
                                ('action', action_type), ('reward', float),
                                ('next_perspective', (int, (2, size, size))),
                                ('terminal', bool)])

    # startup
    state = envs.resetAll()
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer),
                              dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4),
                              dtype=int)  # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer),
                              dtype=(float, 3))  # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))  # Rewards
    buffer_idx = 0

    # Get initial network params from the learner broadcast
    base_comm = args["mpi_base_comm"]
    learner_rank = args["mpi_learner_rank"]
    msg = base_comm.bcast(None, root=learner_rank)

    # load weights only if the learner actually sent them; otherwise keep the
    # randomly initialised parameters instead of crashing on a None vector
    msg, weights = msg
    if msg == "weights":
        vector_to_parameters(weights.to(device), model.parameters())

    # init counters
    steps_counter = 0
    update_counter = 1
    local_memory_index = 0
    # main loop over training steps

    print("Actor start loop")
    while True:

        steps_per_episode += 1
        # select action using epsilon greedy policy

        action, q_values = selectActionParallel(number_of_actions=no_actions,
                                                epsilon=epsilon,
                                                grid_shift=grid_shift,
                                                toric_size=size,
                                                state=state,
                                                model=model,
                                                device=device)
        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state,
                                                next_state, terminal_state,
                                                grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If buffer full, send transitions
        if buffer_idx >= size_local_memory_buffer:
            print("Actor Sync")
            # receive new weights
            msg = base_comm.bcast(msg, root=learner_rank)
            msg, weights = msg
            if msg == "weights":
                vector_to_parameters(weights.to(device), model.parameters())
            elif msg == "terminate":
                break

            priorities = computePrioritiesParallel(
                local_buffer_A[:, :-1], local_buffer_R[:, :-1],
                local_buffer_Q[:, :-1],
                np.roll(local_buffer_Q, -1, axis=1)[:, :-1], discount_factor)

            to_send = [
                *zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())
            ]

            # send buffer to learner
            base_comm.gather(to_send, root=learner_rank)
            buffer_idx = 0

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):

            # Reset terminal envs
            idx = np.argwhere(np.logical_or(terminal_state,
                                            too_many_steps)).flatten()
            reset_states = envs.resetTerminalEnvs(idx)

            # Reset n_step buffers
            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state
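
The weight and transition exchange in Example #1 rests on two MPI collectives: the learner broadcasts a ("weights", vector) message to every actor, and each actor gathers its batch of (transition, priority) pairs back to the learner rank. The following is a minimal, self-contained mpi4py sketch of just that pattern; the payloads, the loop length, and the rank roles are stand-ins for the real learner and actor objects, not the project's code.

# Minimal sketch of the learner/actor MPI pattern used above.
# Assumption: run with e.g. `mpirun -n 3 python mpi_handshake_sketch.py`;
# rank 0 plays the learner, every other rank plays an actor.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
learner_rank = 0

for step in range(3):
    # learner broadcasts a ("weights", vector) message; actors pass None
    if rank == learner_rank:
        msg = ("weights", np.random.rand(8))  # stand-in for the flat parameter vector
    else:
        msg = None
    msg = comm.bcast(msg, root=learner_rank)

    tag, weights = msg
    if tag == "weights" and rank != learner_rank:
        pass  # a real actor would call vector_to_parameters(weights, model.parameters())

    # every rank contributes its (transition, priority) pairs to the gather
    local_batch = [(f"transition-from-rank-{rank}", float(rank))]  # stand-in payload
    gathered = comm.gather(local_batch, root=learner_rank)
    if rank == learner_rank:
        # gathered holds one list per rank, including the learner's own entry
        print("learner received", sum(len(b) for b in gathered), "items at step", step)
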
Example #2
0
def actor(args):

    no_envs = args["no_envs"]
    device = args["device"]
    discount_factor = args["discount_factor"]
    epsilon = np.array(args["epsilon"])

    # env and env params
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)

    size = env.system_size

    transition_type = np.dtype([('perspective', (int, (2, size, size))),
                                ('action', action_type), ('reward', float),
                                ('next_perspective', (int, (2, size, size))),
                                ('terminal', bool)])

    no_actions = int(env.action_space.high[-1])
    grid_shift = int(size / 2)

    # startup
    state = envs.resetAll()
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer),
                              dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4),
                              dtype=int)  # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer),
                              dtype=(float, 3))  # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))  # Rewards
    buffer_idx = 0

    # instantiate the network (moved to the device and set to eval mode below)
    NN = args["model"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"],
                   args["device"])
    else:
        model = NN()

    model.to(device)
    model.eval()
    count = 0
    # main loop over training steps
    while True:

        steps_per_episode += 1

        # select action using epsilon greedy policy
        action, q_values = selectActionBatch(number_of_actions=no_actions,
                                             epsilon=epsilon,
                                             grid_shift=grid_shift,
                                             toric_size=size,
                                             state=state,
                                             model=model,
                                             device=device)
        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state,
                                                next_state, terminal_state,
                                                grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If buffer full, send transitions
        if buffer_idx >= size_local_memory_buffer:
            priorities = computePrioritiesParallel(
                local_buffer_A[:, :-1], local_buffer_R[:, :-1],
                local_buffer_Q[:, :-1],
                np.roll(local_buffer_Q, -1, axis=1)[:, :-1], discount_factor)

            to_send = [
                *zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())
            ]

            # this variant returns the first full batch to the caller
            # instead of streaming batches to the learner
            return to_send

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):

            # Reset terminal envs
            idx = np.argwhere(np.logical_or(terminal_state,
                                            too_many_steps)).flatten()
            reset_states = envs.resetTerminalEnvs(idx)

            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state
        count += 1
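
Each variant hands computePrioritiesParallel four slices of the local buffers: the actions, the rewards, the Q-values of the step itself, and the Q-values of the following step obtained with np.roll(local_buffer_Q, -1, axis=1)[:, :-1] (which is why the buffers carry one extra column that is dropped before sending). The helper's body is not shown in this section, so the sketch below is only an assumption of what such a priority computation could look like: a one-step TD error |r + discount_factor * max Q(s', .) - Q(s, a)| per transition, under a guessed action-index convention and with the hypothetical name compute_priorities_sketch.

# Hypothetical sketch of a computePrioritiesParallel-style helper.
# Assumption: priority = |r + discount_factor * max_a' Q(s', a') - Q(s, a)|.
import numpy as np

def compute_priorities_sketch(actions, rewards, q_values, q_values_next, discount_factor):
    """actions:       (no_envs, steps, 4)  last entry assumed to be a 1-indexed action id
       rewards:       (no_envs, steps)
       q_values:      (no_envs, steps, 3)  Q-values at the step acted on
       q_values_next: (no_envs, steps, 3)  Q-values one step later (np.roll'ed)"""
    action_idx = actions[:, :, -1] - 1  # guessed convention: actions are 1-indexed
    q_taken = np.take_along_axis(q_values, action_idx[..., None], axis=2).squeeze(-1)
    td_target = rewards + discount_factor * q_values_next.max(axis=2)
    return np.abs(td_target - q_taken)  # priority = |TD error|

# usage on buffers shaped like the ones above (dropping the extra last column)
no_envs, steps = 2, 5
A = np.random.randint(1, 4, size=(no_envs, steps + 1, 4))
R = np.random.rand(no_envs, steps + 1)
Q = np.random.rand(no_envs, steps + 1, 3)
priorities = compute_priorities_sketch(A[:, :-1], R[:, :-1], Q[:, :-1],
                                        np.roll(Q, -1, axis=1)[:, :-1], 0.95)
print(priorities.shape)  # (2, 5)
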
Example #3
0
def actor(args):
     
    device          = args["device"]
    discount_factor = args["discount_factor"]
    no_envs         = args["no_envs"]
    epsilon_final   = np.array(args["epsilon_final"])
    epsilon         = np.ones(len(epsilon_final))
    epsilon_delta   = args["epsilon_delta"]
    actor_id        = args["id"]

    should_log = args["log_actor"]

    # env and env params
    env_p_error_start    = args["env_p_error_start"]
    env_p_error_final    = args["env_p_error_final"]
    env_p_error_delta    = args["env_p_error_delta"]
    env_p_error_strategy = args['env_p_error_strategy'] 
    env_p_errors         = np.ones(no_envs)*env_p_error_start
    env  = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)
    size = env.system_size
    
    transition_type = np.dtype([('perspective', (int, (2,size,size))),
                                ('action', action_type),
                                ('reward', float),
                                ('next_perspective', (int, (2,size,size))),
                                ('terminal', bool)])

    no_actions = int(env.action_space.high[-1])
    grid_shift = int(size/2)

    # startup
    state = envs.resetAll(p_errors=env_p_errors)
    steps_per_episode = np.zeros(no_envs)


    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer    = args["size_local_memory_buffer"] + 1
    local_buffer_T              = np.empty((no_envs, size_local_memory_buffer), dtype=transition_type)  # Transitions
    local_buffer_A              = np.empty((no_envs, size_local_memory_buffer, 4), dtype=int)           # Actions
    local_buffer_Q              = np.empty((no_envs, size_local_memory_buffer), dtype=(float, 3))       # Q values
    local_buffer_R              = np.empty((no_envs, size_local_memory_buffer))                         # Rewards
    buffer_idx                  = 0

    # Comms
    actor_io_queue         = args["actor_io_queue"] # Transition queue
    shared_mem_weights     = args["shared_mem_weights"]
    shared_mem_weight_id   = args["shared_mem_weight_id"]
    current_weight_id      = 0
    new_weights            = False

    # Init network
    NN              = args["model"]
    model_no_params = args["model_no_params"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"], args["device"])
    else:
        model = NN()
    
    # load initial network weights
    weights = np.empty(model_no_params)
    with shared_mem_weights.get_lock():
        reader = np.frombuffer(shared_mem_weights.get_obj())
        np.copyto(weights, reader)
    vector_to_parameters(from_numpy(weights).to(device).type(torch.FloatTensor), model.parameters())
    
    model.to(device)
    model.eval()
    
    performance_start = time.time()
    performance_stop = None
    print("Actor ",actor_id,": starting loop device: ",device)
    # main loop over training steps
    while True:
        
        steps_per_episode += 1    
        
        # select action using epsilon greedy policy
        action, q_values = selectActionBatch(number_of_actions=no_actions,
                                             epsilon=epsilon,
                                             grid_shift=grid_shift,
                                             toric_size = size,
                                             state = state,
                                             model = model,
                                             device = device)
        next_state, reward, terminal_state, _ = envs.step(action)
         
        transition = generateTransitionParallel(action,
                                                reward, 
                                                state,
                                                next_state, 
                                                terminal_state,
                                                grid_shift,
                                                transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If buffer full, send transitions
        if buffer_idx >= size_local_memory_buffer:

            # read new weights from shared memory
            with shared_mem_weights.get_lock():
                if current_weight_id < shared_mem_weight_id.value:
                    print("Actor updated network params.")
                    reader = np.frombuffer(shared_mem_weights.get_obj())
                    np.copyto(weights, reader)
                    current_weight_id = shared_mem_weight_id.value
                    new_weights = True
            # load new weights into model
            if new_weights:
                new_weights = False
                vector_to_parameters(from_numpy(weights).type(torch.FloatTensor).to(device), model.parameters())
                epsilon = np.maximum(epsilon - epsilon_delta, epsilon_final)

            # compute priorities
            priorities = computePrioritiesParallel(local_buffer_A[:,:-1],
                                                   local_buffer_R[:,:-1],
                                                   local_buffer_Q[:,:-1],
                                                   np.roll(local_buffer_Q, -1, axis=1)[:,:-1],
                                                   discount_factor)

            to_send = [*zip(local_buffer_T[:,:-1].flatten(), priorities.flatten())]
            
            
            performance_stop = time.time()
            performance_elapsed = performance_stop - performance_start
            performance_transitions = len(to_send)
            if should_log:
                print("Actor ", actor_id, " generating ",
                      performance_transitions / performance_elapsed, " transitions/s")
            performance_start = time.time()

            # send buffer to learner
            actor_io_queue.put(to_send)
            buffer_idx = 0

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):
            
            # Reset terminal envs
            idx                    = np.argwhere(np.logical_or(terminal_state, too_many_steps)).flatten() # find terminal envs
            env_p_errors[idx]      = np.minimum(env_p_error_final, env_p_errors[idx] + env_p_error_delta) # calculate new p_error roof interval
            if env_p_error_strategy == 'random':
                p_errors = np.random.uniform(env_p_error_start, env_p_errors[idx])                        # randomly select new p_error
            else: # linear
                p_errors = env_p_errors[idx]                                                              # linearly select new p_error
            reset_states           = envs.resetTerminalEnvs(idx, p_errors=p_errors)                       # reset using new p_error
            next_state[idx]        = reset_states
            steps_per_episode[idx] = 0

        state = next_state
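
Example #3 replaces explicit message passing with a shared weight buffer, shared_mem_weights, guarded by its own lock, plus a version counter, shared_mem_weight_id, that tells the actor when the learner has published new parameters. The sketch below shows that handoff in isolation, under the assumption that the buffer is a multiprocessing.Array('d', n_params) and the counter a multiprocessing.Value('i', 0); a small nn.Linear stands in for NN_11/NN_17.

# Sketch of the shared-memory weight handoff assumed by Example #3:
# a multiprocessing.Array of doubles plus an integer version counter.
import multiprocessing as mp
import numpy as np
from torch import nn, from_numpy
from torch.nn.utils import parameters_to_vector, vector_to_parameters

model = nn.Linear(4, 3)  # stand-in for NN_11 / NN_17
n_params = sum(p.numel() for p in model.parameters())

shared_mem_weights = mp.Array('d', n_params)  # float64 buffer shared between processes
shared_mem_weight_id = mp.Value('i', 0)       # bumped by the learner on every publish

# learner side: publish the current parameters as a flat vector
with shared_mem_weights.get_lock():
    writer = np.frombuffer(shared_mem_weights.get_obj())
    np.copyto(writer, parameters_to_vector(model.parameters()).detach().numpy())
    shared_mem_weight_id.value += 1

# actor side: copy the vector out under the lock, then load it outside the lock
current_weight_id = 0
weights = np.empty(n_params)
with shared_mem_weights.get_lock():
    if current_weight_id < shared_mem_weight_id.value:
        reader = np.frombuffer(shared_mem_weights.get_obj())
        np.copyto(weights, reader)
        current_weight_id = shared_mem_weight_id.value

vector_to_parameters(from_numpy(weights).float(), model.parameters())
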
Example #4
0
def actor(rank, world_size, args):
    """ An actor that performs actions in an environment.

    Params
    ======
    rank:       (int) rank of the actor in a multiprocessing setting.
    world_size: (int) total number of actors and learners.
    args:       (dict) training specific parameters 
    {
        train_steps:                (int) number of training steps
        , max_actions_per_episode:  (int) number of actions before
                                    the episode is cut short
        , update_policy:            (int) (deprecated) number of
                                    steps until updating policy
        , size_local_memory_buffer: (int) size of the local replay buffer
        , min_qubit_errors:         (int) minimum number of qubit
                                    errors on the toric code
        , model:                    (Class torch.nn) model to make predictions
        , model_config:             (dict)
        {
            system_size:        (int) size of the toric grid.
            , number_of_actions (int)
        }
        , env:                      (String) environment to act in
        , env_config                (dict)
        {
            size:               (int)
            , min_qbit_errors   (int)
            , p_error           (float)
        }
        , device:                   (String) {"cpu", "cuda"} device to
                                    operate whenever possible
        , epsilon:                  (float) probability of selecting a
                                    random action
        , n_step:                   (int) n-step learning
        , con_learner:              (multiprocessing.Connection) connection
                                    over which new weights and termination
                                    messages are received
        , transition_queue_to_memory: (multiprocessing.Queue) SimpleQueue where
                                    transitions are sent to the replay buffer
    }

    """
    # queues
    con_learner = args["con_learner"]
    transition_queue_to_memory = args["transition_queue_to_memory"]

    no_envs = args["no_envs"]
    device = args["device"]
    discount_factor = args["discount_factor"]
    epsilon = np.array(args["epsilon"])

    # env and env params
    env = gym.make(args["env"], config=args["env_config"])
    envs = EnvSet(env, no_envs)

    size = env.system_size

    transition_type = np.dtype([('perspective', (int, (2, size, size))),
                                ('action', action_type), ('reward', float),
                                ('next_perspective', (int, (2, size, size))),
                                ('terminal', bool)])

    no_actions = int(env.action_space.high[-1])
    grid_shift = int(size / 2)

    # startup
    state = envs.resetAll()
    steps_per_episode = np.zeros(no_envs)

    # Local buffer of fixed size to store transitions before sending.
    size_local_memory_buffer = args["size_local_memory_buffer"] + 1
    local_buffer_T = np.empty((no_envs, size_local_memory_buffer),
                              dtype=transition_type)  # Transitions
    local_buffer_A = np.empty((no_envs, size_local_memory_buffer, 4),
                              dtype=int)  # Actions
    local_buffer_Q = np.empty((no_envs, size_local_memory_buffer),
                              dtype=(float, 3))  # Q values
    local_buffer_R = np.empty((no_envs, size_local_memory_buffer))  # Rewards
    buffer_idx = 0

    # instantiate the network (weights are loaded and eval mode set below)
    NN = args["model"]
    if NN == NN_11 or NN == NN_17:
        NN_config = args["model_config"]
        model = NN(NN_config["system_size"], NN_config["number_of_actions"],
                   args["device"])
    else:
        model = NN()

    # Get initial network params
    weights = None
    while weights is None:
        msg, weights = con_learner.recv()  # blocking op
        if msg != "weights":
            weights = None

    # load weights
    vector_to_parameters(weights, model.parameters())

    model.to(device)
    model.eval()

    # main loop over training steps
    while True:

        if con_learner.poll():
            msg, weights = con_learner.recv()

            if msg == "weights":
                vector_to_parameters(weights, model.parameters())

            elif msg == "prep_terminate":
                con_learner.send("ok")
                break

        steps_per_episode += 1

        # select action using epsilon greedy policy
        action, q_values = selectActionBatch(number_of_actions=no_actions,
                                             epsilon=epsilon,
                                             grid_shift=grid_shift,
                                             toric_size=size,
                                             state=state,
                                             model=model,
                                             device=device)
        next_state, reward, terminal_state, _ = envs.step(action)

        transition = generateTransitionParallel(action, reward, state,
                                                next_state, terminal_state,
                                                grid_shift, transition_type)

        local_buffer_T[:, buffer_idx] = transition
        local_buffer_A[:, buffer_idx] = action
        local_buffer_Q[:, buffer_idx] = q_values
        local_buffer_R[:, buffer_idx] = reward
        buffer_idx += 1

        # If buffer full, send transitions
        if buffer_idx >= size_local_memory_buffer:
            priorities = computePrioritiesParallel(
                local_buffer_A[:, :-1], local_buffer_R[:, :-1],
                local_buffer_Q[:, :-1],
                np.roll(local_buffer_Q, -1, axis=1)[:, :-1], discount_factor)

            to_send = [
                *zip(local_buffer_T[:, :-1].flatten(), priorities.flatten())
            ]

            # send buffer to learner
            transition_queue_to_memory.put(to_send)
            buffer_idx = 0

        too_many_steps = steps_per_episode > args["max_actions_per_episode"]
        if np.any(terminal_state) or np.any(too_many_steps):

            # Reset terminal envs
            idx = np.argwhere(np.logical_or(terminal_state,
                                            too_many_steps)).flatten()
            reset_states = envs.resetTerminalEnvs(idx)

            # Reset n_step buffers
            next_state[idx] = reset_states
            steps_per_episode[idx] = 0

        state = next_state

    # ready to terminate
    while True:
        msg, _ = con_learner.recv()
        if msg == "terminate":
            transition_queue_to_memory.close()
            con_learner.send("ok")
            break
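
The docstring of Example #4 names the two endpoints the actor depends on: con_learner, a multiprocessing Connection over which weight and termination messages arrive (and over which the actor acknowledges prep_terminate and terminate with "ok"), and transition_queue_to_memory, the queue feeding the replay buffer. The sketch below wires a dummy worker into the same handshake so the protocol can be exercised end to end; the worker body and the payloads are placeholders, not the actor above, and it uses blocking recv calls instead of the poll-based loop in the real actor.

# Sketch of the con_learner / transition_queue_to_memory wiring described in
# the docstring above; dummy_actor is a stand-in for actor(), not the real thing.
import multiprocessing as mp

def dummy_actor(args):
    con_learner = args["con_learner"]
    transition_queue_to_memory = args["transition_queue_to_memory"]

    while True:
        msg, payload = con_learner.recv()  # the real actor polls instead of blocking
        if msg == "weights":
            # a real actor would load `payload` into its model, act for a while,
            # and push (transition, priority) pairs; here we push one fake batch
            transition_queue_to_memory.put([("fake-transition", 1.0)])
        elif msg == "prep_terminate":
            con_learner.send("ok")
            break

    # drain phase: wait for the final terminate and acknowledge it
    while True:
        msg, _ = con_learner.recv()
        if msg == "terminate":
            con_learner.send("ok")
            break

if __name__ == "__main__":
    learner_end, actor_end = mp.Pipe()
    queue = mp.SimpleQueue()
    p = mp.Process(target=dummy_actor,
                   args=({"con_learner": actor_end,
                          "transition_queue_to_memory": queue},))
    p.start()
    learner_end.send(("weights", None))   # placeholder for the weight vector
    print(queue.get())                    # one fake batch of (transition, priority) pairs
    learner_end.send(("prep_terminate", None))
    assert learner_end.recv() == "ok"
    learner_end.send(("terminate", None))
    assert learner_end.recv() == "ok"
    p.join()
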