Example #1
def execute():
    """Execute the learning algorithm"""
    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    agent.execute_action(a)
    time.sleep(lp.step_time)

    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)

    ap = agent.select_action(sp)

    # update Q
    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error (SARSA)
    q[s, a] = q[s, a] + alpha * delta  # update rule

    # Update V and Policy
    v[s] = np.max(q[s])
    policy[s] = np.argmax(q[s])

    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
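
What makes this update on-policy is the TD target: it bootstraps from q[sp, ap], the action actually selected in the next state. A minimal sketch of the same rule next to the off-policy (Q-learning) target, assuming a NumPy Q-table and scalar indices as in the code above:

import numpy as np

def sarsa_update(q, s, a, r, sp, ap, alpha, gamma):
    # On-policy target: bootstrap from the action actually chosen in sp
    delta = r + gamma * q[sp, ap] - q[s, a]
    q[s, a] += alpha * delta
    return delta

def q_learning_update(q, s, a, r, sp, alpha, gamma):
    # Off-policy variant shown for comparison: bootstrap from the greedy action
    delta = r + gamma * np.max(q[sp]) - q[s, a]
    q[s, a] += alpha * delta
    return delta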
Example #2
def execute():
    """ Execute Learning Algorithm """

    global eligibility, q_old

    assert initiated, "TOSL not initiated! setup() must be called first"

    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    # Specific Learning Algorithm
    agent.execute_action(a)
    time.sleep(lp.step_time)
    # robot.stop()
    # time.sleep(TASK.STEP_TIME/2)
    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)

    ap = agent.select_action(sp)  # Exploration strategy

    diff_q = q[s, a] - q_old
    q_old = q[sp, ap]

    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error

    eligibility[s, a] = (1.0 - alpha) * eligibility[s, a] + 1

    if eli_queue.count(s) > 0:
        eli_queue.remove(s)
    assert eli_queue.count(s) == 0, f"duplicated states found in ET: {s}"
    eli_queue.appendleft(s)

    for i in eli_queue:  # not all states are updated, just those in eli_queue
        # replace eli_queue with range(task.n_states) for a non-reduced ET
        for j in range(task.n_actions):
            if eligibility[i, j] > 0.01:
                q[i, j] = q[i, j] + alpha * (delta + diff_q) * eligibility[i, j]
                eligibility[i, j] *= exp.GAMMA * exp.LAMBDA
            else:
                eligibility[i, j] = 0

            if i == s and j == a:
                q[i, j] = q[i, j] - alpha * diff_q

        # update v and policy
        v[i] = np.max(q[i])
        policy[i] = np.argmax(q[i])

    lp.s, lp.a = s, a
    lp.sp, lp.ap = sp, ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
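
The (1.0 - alpha) trace update together with the q_old / diff_q terms is what makes this the true online variant of SARSA(lambda) (TOSL). A compact sketch of the same step applied to the whole table, i.e. the non-reduced ET that the inline comment mentions, assuming NumPy arrays q and e of shape (n_states, n_actions); the snippet above additionally prunes traces below 0.01 and limits the sweep to the states in eli_queue:

import numpy as np

def true_online_sarsa_step(q, e, s, a, r, sp, ap, q_old, alpha, gamma, lam):
    diff_q = q[s, a] - q_old                   # correction carried over from the previous step
    next_q_old = q[sp, ap]                     # snapshot before this update, as in the code above
    delta = r + gamma * q[sp, ap] - q[s, a]    # TD error
    e[s, a] = (1.0 - alpha) * e[s, a] + 1.0    # dutch-style eligibility trace
    q += alpha * (delta + diff_q) * e          # trace-weighted update of every (state, action)
    q[s, a] -= alpha * diff_q                  # extra correction on the visited pair
    e *= gamma * lam                           # decay all traces
    return next_q_old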
Example #3
def test_agent(agent, x_start, y_start, epsilon, goal, pit, labyrinth, plots):
    # initialize and plot the environment
    state = [x_start, y_start]
    env = environment.Environment(x, y, state, goal, pit, labyrinth)
    if plots: plot_map(x, y, state, goal, pit, labyrinth, walls, 0)
    reward = 0
    # run a single episode of at most 29 steps
    for step in range(1, 30):

        # find state index
        state_index = state[0] * y + state[1]
        # choose an action
        action = agent.select_action(state_index, epsilon)

        # the agent moves in the environment
        result = env.move(action)

        # update state
        state = result[0]
        reward += result[1]

        # plot the environment in the current state
        if plots: plot_map(x, y, state, goal, pit, labyrinth, walls, step)

        if (state[0] == goal[0]) and (state[1] == goal[1]):  # goal cell reached
            print('The agent reached the goal starting from x:', x_start,
                  ' y:', y_start, 'in ', step, ' steps')
            break
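
The flat index used for the Q-table, state_index = state[0] * y + state[1], encodes the coordinate pair for a grid whose second dimension has size y. A tiny sketch of the encoding and its inverse (this is also the convention behind the i % 10 / i // 10 wall decoding in the later examples, with y = 10):

def to_index(state, y):
    # flatten an [x, y] coordinate pair, assuming the second dimension has size y
    return state[0] * y + state[1]

def to_coords(index, y):
    # inverse mapping back to [x, y]
    return [index // y, index % y]

assert to_coords(to_index([3, 4], y=10), y=10) == [3, 4]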
Example #4
def execute():
    """ Execute the learning algorithm """

    global eligibility

    assert initiated, "SL not initiated! setup() must be called first"

    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    # Specific Learning Algorithm
    agent.execute_action(a)
    time.sleep(lp.step_time)
    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)

    ap = agent.select_action(sp)  # Exploration strategy

    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error

    eligibility[s, a] = 1.0  # replace trace

    if eli_queue.count(s) > 0:
        eli_queue.remove(s)
    assert eli_queue.count(s) == 0, f"duplicated states found in ET: {s}"
    eli_queue.appendleft(s)

    # only the states in eli_queue are updated;
    # replace eli_queue with range(task.n_states) for a non-reduced ET
    for i in eli_queue:
        for j in range(task.n_actions):
            if eligibility[i, j] > 0.01:
                q[i, j] = q[i, j] + alpha * delta * eligibility[i, j]
                eligibility[i, j] *= exp.GAMMA * exp.LAMBDA
            else:
                eligibility[i, j] = 0

        # update v and policy
        v[i] = np.max(q[i])
        policy[i] = np.argmax(q[i])

    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
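
The only differences from Example #2 above are the trace assignment and the missing q_old correction: here eligibility[s, a] is simply set to 1.0 (a replacing trace), which makes this plain SARSA(lambda). A one-line sketch of the two common trace styles, assuming a NumPy eligibility table e:

def replace_trace(e, s, a):
    e[s, a] = 1.0   # replacing trace, as used above
    return e

def accumulate_trace(e, s, a):
    e[s, a] += 1.0  # accumulating trace, the classic alternative
    return e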
Example #5
def train(env, n_episodes, render=False):
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    for episode in range(n_episodes):
        score = 0  # reset each episode so the reported average is a per-episode score
        action_info = env.reset(arenas_configurations_input=arena_config_in)
        obs = action_info[brain_name].visual_observations[0][0]
        state = get_state(obs)
        #total_reward = 0.0
        for t in range(100):
            action = agent.select_action(state)
            conv_action = convert_action(action)

            action_info = env.step(conv_action)
            obs = action_info[brain_name].visual_observations[0][0]
            reward = action_info[brain_name].rewards[0]
            score += reward
            done = action_info[brain_name].local_done[0]

            #total_reward += reward

            if not done:
                next_state = get_state(obs)
            else:
                next_state = None

            reward = torch.tensor([reward], device=device)

            agent.memory.push(state, action.to('cpu'), next_state,
                              reward.to('cpu'))
            state = next_state

            if agent.steps_done > INITIAL_MEMORY:
                agent.optimize_model()

                if agent.steps_done % TARGET_UPDATE == 0:
                    agent.target_net.load_state_dict(
                        agent.policy_net.state_dict())

            if done:
                break
        scores_window.append(score)
        scores.append(score)
        if episode % 20 == 0:
            #print('Total steps: {} \t Episode: {}\t Total reward: {}'.format(agent.steps_done, episode, total_reward))
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                episode, np.mean(scores_window)))

    env.close()
    return scores
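
get_state and convert_action are not shown in this excerpt; get_state presumably turns the ML-Agents visual observation (an H x W x C float array) into the tensor the DQN expects. A hypothetical sketch of such a conversion, not the author's actual implementation:

import numpy as np
import torch

def get_state(obs, device="cpu"):
    # Hypothetical: convert an HxWxC float observation to a 1xCxHxW tensor
    frame = np.ascontiguousarray(obs, dtype=np.float32)
    frame = torch.from_numpy(frame).permute(2, 0, 1)  # HWC -> CHW
    return frame.unsqueeze(0).to(device)              # add a batch dimension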
Example #6
def validation_agent(agent, epsilon, episode_length, goal, pit, labyrinth,
                     walls, x, y):
    avg_val = []

    for index in range(0, 100):

        # if the labyrinth is used, start from a random cell that is not on a wall
        if labyrinth:
            initial = [np.random.randint(0, x), np.random.randint(0, y)]
            for i in walls:
                yy = i % 10
                xx = i // 10
                if initial[0] == xx and initial[1] == yy:
                    initial = start_walls(initial, walls, x, y)
        else:
            # start from a random state
            initial = [np.random.randint(0, x), np.random.randint(0, y)]

        # initialize environment
        state = initial
        env = environment.Environment(x, y, state, goal, pit, labyrinth)
        val_reward = 0

        # run episode
        for step in range(0, episode_length):
            # find state index
            state_index = state[0] * y + state[1]

            # choose an action
            action = agent.select_action(state_index, epsilon)

            # the agent moves in the environment
            result = env.move(action)

            # update state and reward
            val_reward += result[1]
            state = result[0]

        val_reward /= episode_length
        avg_val.append(val_reward)

        if (index + 1) % 50 == 0:
            print('Episode ', index + 1,
                  ': the agent has obtained an average reward of ', val_reward,
                  ' starting from position ', initial)
    return np.mean(avg_val)
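
start_walls is not defined in this excerpt; judging from the call site, it resamples the start cell until it no longer lies on a wall. A hypothetical stand-in, assuming walls are flat indices on a grid whose second dimension has size 10 (as in the i % 10 / i // 10 decoding above):

import numpy as np

def start_walls(initial, walls, x, y, rng=np.random.default_rng()):
    # Hypothetical: resample until the start cell is not a wall cell
    wall_cells = {(i // 10, i % 10) for i in walls}
    while tuple(initial) in wall_cells:
        initial = [int(rng.integers(0, x)), int(rng.integers(0, y))]
    return initial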
Example #7
    @staticmethod
    def get_next(target_net, next_states):
        # a next_state stored as an all-zero tensor marks a terminal state
        final_state_locations = next_states.flatten(start_dim=1).max(
            dim=1)[0].eq(0).type(torch.bool)
        non_final_state_locations = ~final_state_locations
        non_final_states = next_states[non_final_state_locations]
        batch_size = next_states.shape[0]
        values = torch.zeros(batch_size).to(QValues.device)
        values[non_final_state_locations] = target_net(non_final_states).max(
            dim=1)[0].detach()
        return values


for episode in range(NUM_EPISODES):
    env_manage.reset()
    state = env_manage.get_state()

    for step in count():
        action = agent.select_action(state, policy_net)
        reward = env_manage.take_action(action)
        next_state = env_manage.get_state()
        memory.push(tools.Experience(state, action, next_state, reward))
        state = next_state

        experiences = memory.sample(BATCH_SIZE)
        if experiences:
            states, actions, rewards, next_states = extract_tensors(
                experiences)

            q_values = QValues.get_current(policy_net, states, actions)
            next_q_values = QValues.get_next(target_net, next_states)
            optimal_q_values = (next_q_values * GAMMA) + rewards

            loss = F.mse_loss(q_values, optimal_q_values.unsqueeze(1))
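
The excerpt stops right after the loss; the usual continuation of such a PyTorch training step is a gradient update on policy_net, assuming an optimizer constructed over policy_net.parameters() (not shown above):

def apply_gradient_step(optimizer, loss):
    # Typical continuation: backpropagate the TD loss and update policy_net
    optimizer.zero_grad()  # clear gradients accumulated from the previous step
    loss.backward()        # compute gradients of the MSE TD loss
    optimizer.step()       # apply the update to the policy network weights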
Example #8
        agent = dill.load(agent_file)

    episodes = 5000
    episode_length = 50
    epsilon = np.linspace(0.1, 0.001, 5000)

    # Perform actions
    actions = []
    reward = 0
    # run episode
    for step in range(0, episode_length):
        # find state index
        state_index = state[0] * 10 + state[1]

        #print('State index:', state_index)
        # choose an action
        action = agent.select_action(state_index, epsilon[-1])
        #print('Action:', action)
        # the agent moves in the environment
        result = env.move(action, verbose=False)
        #print('Result:', result)
        # index of the next state (the loaded agent is only evaluated here; no Q update)
        next_index = result[0][0] * 10 + result[0][1]
        reward += result[1]
        actions.append(env.matrix.copy())
        #print('----------')
        state = result[0]
    # Show final status
    im = matpl.plot(env.matrix, env, figsize=(6, 6), reduct=True)
    matpl.add_patches(im, env)
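
The excerpt above opens inside a file context that was cut off before agent = dill.load(agent_file); loading a dill-serialized agent typically looks like the sketch below (the file name is illustrative, not taken from the source):

import dill

with open("trained_agent.pkl", "rb") as agent_file:  # illustrative path
    agent = dill.load(agent_file)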