def execute():
    """Execute the learning algorithm"""
    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    agent.execute_action(a)
    time.sleep(lp.step_time)

    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)
    ap = agent.select_action(sp)

    # update Q
    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error (SARSA)
    q[s, a] = q[s, a] + alpha * delta            # update rule

    # Update V and Policy
    v[s] = np.max(q[s])
    policy[s] = np.argmax(q[s])

    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
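# For reference, the update above is one-step SARSA:
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a)).
# A minimal standalone sketch of that rule, with toy shapes and values assumed
# for illustration (the framework's lp/exp/agent objects are not reproduced):
import numpy as np

def sarsa_update(q, s, a, r, sp, ap, alpha=0.1, gamma=0.9):
    """Apply one SARSA step to the Q-table in place and return the TD error."""
    delta = r + gamma * q[sp, ap] - q[s, a]  # TD error
    q[s, a] += alpha * delta                 # update rule
    return delta

q_demo = np.zeros((4, 2))                    # 4 states, 2 actions (toy shape)
sarsa_update(q_demo, s=0, a=1, r=1.0, sp=2, ap=0)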
def execute():
    """Execute the learning algorithm (true online SARSA(lambda))"""
    global eligibility, q_old
    assert initiated, "TOSL not initiated! setup() must be previously called"

    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    # Specific learning algorithm
    agent.execute_action(a)
    time.sleep(lp.step_time)
    # robot.stop()
    # time.sleep(TASK.STEP_TIME / 2)

    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)
    ap = agent.select_action(sp)  # exploration strategy

    diff_q = q[s, a] - q_old
    q_old = q[sp, ap]
    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error
    eligibility[s, a] = (1.0 - alpha) * eligibility[s, a] + 1

    if eli_queue.count(s) > 0:
        eli_queue.remove(s)
    assert eli_queue.count(s) == 0, "duplicated states found in ET: " + str(s)
    eli_queue.appendleft(s)

    # Not all states are updated, only those in eli_queue
    # (replace eli_queue with range(task.n_states) for a non-reduced ET):
    for i in eli_queue:
        for j in range(task.n_actions):
            if eligibility[i, j] > 0.01:
                q[i, j] = q[i, j] + alpha * (delta + diff_q) * eligibility[i, j]
                eligibility[i, j] *= exp.GAMMA * exp.LAMBDA
            else:
                eligibility[i, j] = 0
            if i == s and j == a:
                q[i, j] = q[i, j] - alpha * diff_q

        # update v and policy
        v[i] = np.max(q[i])
        policy[i] = np.argmax(q[i])

    lp.s, lp.a = s, a
    lp.sp, lp.ap = sp, ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
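# The loop above is tabular true online SARSA(lambda) (van Seijen & Sutton),
# restricted to the states stored in eli_queue. A minimal full-sweep sketch of
# the same update, mirroring the ordering used above (toy shapes assumed):
import numpy as np

def true_online_sarsa_step(q, e, q_old, s, a, r, sp, ap,
                           alpha=0.1, gamma=0.9, lam=0.9):
    """One tabular true online SARSA(lambda) step over the full table."""
    delta = r + gamma * q[sp, ap] - q[s, a]      # TD error
    diff_q = q[s, a] - q_old                     # correction term
    next_q_old = q[sp, ap]                       # saved before the table changes
    e[s, a] = (1.0 - alpha) * e[s, a] + 1.0      # dutch-style trace increment
    q += alpha * (delta + diff_q) * e            # trace-weighted update
    q[s, a] -= alpha * diff_q                    # extra correction at (s, a)
    e *= gamma * lam                             # decay all traces
    return next_q_old, delta

q_demo = np.zeros((4, 2))
e_demo = np.zeros((4, 2))
q_old_demo, _ = true_online_sarsa_step(q_demo, e_demo, 0.0, s=0, a=1, r=1.0, sp=2, ap=0)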
def test_agent(agent, x_start, y_start, epsilon, goal, pit, labyrinth, plots):
    # initialize and plot the environment (x, y and walls are module-level globals)
    state = [x_start, y_start]
    env = environment.Environment(x, y, state, goal, pit, labyrinth)
    if plots:
        plot_map(x, y, state, goal, pit, labyrinth, walls, 0)
    reward = 0

    # run the episode
    for step in range(1, 30):
        # find the state index
        state_index = state[0] * y + state[1]
        # choose an action
        action = agent.select_action(state_index, epsilon)
        # the agent moves in the environment
        result = env.move(action)
        # update state and cumulative reward
        state = result[0]
        reward += result[1]
        # plot the environment in the current state
        if plots:
            plot_map(x, y, state, goal, pit, labyrinth, walls, step)
        # stop once the goal coordinates are reached
        if (state[0] == goal[0]) and (state[1] == goal[1]):
            print('The agent reached the goal starting from x:', x_start,
                  ' y:', y_start, 'in ', step, ' steps')
            break
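# The state index above flattens a 2-D grid position into a single integer via
# row * y + col; note the goal test compares coordinates, not indices. A tiny
# illustration with an assumed 10x10 grid:
grid_x, grid_y = 10, 10                              # grid dimensions (assumed)
demo_state = [3, 7]                                  # row 3, column 7
demo_index = demo_state[0] * grid_y + demo_state[1]  # -> 37
assert divmod(demo_index, grid_y) == (3, 7)          # the mapping is invertible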
def execute():
    """Execute the learning algorithm (SARSA(lambda) with replacing traces)"""
    global eligibility
    assert initiated, "SL not initiated! setup() must be previously called"

    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    # Specific learning algorithm
    agent.execute_action(a)
    time.sleep(lp.step_time)

    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)
    ap = agent.select_action(sp)  # exploration strategy

    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error
    eligibility[s, a] = 1.0  # replacing trace

    if eli_queue.count(s) > 0:
        eli_queue.remove(s)
    assert eli_queue.count(s) == 0, "duplicated states found in ET: " + str(s)
    eli_queue.appendleft(s)

    # Only the states in eli_queue are updated
    # (replace eli_queue with range(task.n_states) for a non-reduced ET):
    for i in eli_queue:
        for j in range(task.n_actions):
            if eligibility[i, j] > 0.01:
                q[i, j] = q[i, j] + alpha * delta * eligibility[i, j]
                eligibility[i, j] *= exp.GAMMA * exp.LAMBDA
            else:
                eligibility[i, j] = 0

        # update v and policy
        v[i] = np.max(q[i])
        policy[i] = np.argmax(q[i])

    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
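# Both trace variants above sweep only the states held in eli_queue, a bounded
# deque presumably created in setup(). A minimal sketch of that bookkeeping,
# with an assumed maximum length and a demo queue name:
from collections import deque

eli_queue_demo = deque(maxlen=20)      # assumed length; most recently visited states

def remember_state(s):
    """Move s to the front of the queue without creating duplicates."""
    if eli_queue_demo.count(s) > 0:
        eli_queue_demo.remove(s)
    eli_queue_demo.appendleft(s)       # the oldest state drops off once maxlen is hit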
def train(env, n_episodes, render=False):
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    score = 0
    for episode in range(n_episodes):
        action_info = env.reset(arenas_configurations_input=arena_config_in)
        obs = action_info[brain_name].visual_observations[0][0]
        state = get_state(obs)
        # total_reward = 0.0
        for t in range(100):
            action = agent.select_action(state)
            conv_action = convert_action(action)
            action_info = env.step(conv_action)
            obs = action_info[brain_name].visual_observations[0][0]
            reward = action_info[brain_name].rewards[0]
            score += reward
            done = action_info[brain_name].local_done[0]
            # total_reward += reward

            if not done:
                next_state = get_state(obs)
            else:
                next_state = None

            reward = torch.tensor([reward], device=device)
            agent.memory.push(state, action.to('cpu'), next_state, reward.to('cpu'))
            state = next_state

            if agent.steps_done > INITIAL_MEMORY:
                agent.optimize_model()
                if agent.steps_done % TARGET_UPDATE == 0:
                    agent.target_net.load_state_dict(agent.policy_net.state_dict())

            if done:
                break

        scores_window.append(score)
        scores.append(score)
        if episode % 20 == 0:
            # print('Total steps: {} \t Episode: {}\t Total reward: {}'.format(
            #     agent.steps_done, episode, total_reward))
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                episode, np.mean(scores_window)))
    env.close()
    return
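# scores_window above relies on deque(maxlen=100) to give a rolling average of
# the most recent 100 scores without any manual trimming. A tiny illustration
# with dummy values (for demonstration only):
from collections import deque
import numpy as np

window = deque(maxlen=100)            # only the most recent 100 entries survive
for episode_score in range(250):      # dummy scores, for illustration only
    window.append(episode_score)
print(np.mean(window))                # 199.5: the mean over scores 150..249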
def validation_agent(agent, epsilon, episode_length, goal, pit, labyrinth, walls, x, y):
    avg_val = []
    for index in range(0, 100):
        # start from a random state (not on a wall when inside the labyrinth)
        if labyrinth:
            initial = [np.random.randint(0, x), np.random.randint(0, y)]
            for i in walls:
                # wall cells are stored as flat indices with a fixed width of 10
                yy = i % 10
                xx = i // 10
                if initial[0] == xx and initial[1] == yy:
                    initial = start_walls(initial, walls, x, y)
        else:
            initial = [np.random.randint(0, x), np.random.randint(0, y)]

        # initialize the environment
        state = initial
        env = environment.Environment(x, y, state, goal, pit, labyrinth)
        val_reward = 0

        # run the episode
        for step in range(0, episode_length):
            # find the state index
            state_index = state[0] * y + state[1]
            # choose an action
            action = agent.select_action(state_index, epsilon)
            # the agent moves in the environment
            result = env.move(action)
            # update state and reward
            val_reward += result[1]
            state = result[0]

        val_reward /= episode_length
        avg_val.append(val_reward)
        if (index + 1) % 50 == 0:
            print('Episode ', index + 1, ': the agent has obtained an average reward of ',
                  val_reward, ' starting from position ', initial)
    return np.mean(avg_val)
    @staticmethod
    def get_next(target_net, next_states):
        # states encoded as all zeros are treated as final and keep a value of 0
        final_state_locations = next_states.flatten(start_dim=1) \
            .max(dim=1)[0].eq(0).type(torch.bool)
        non_final_state_locations = (final_state_locations == False)
        non_final_states = next_states[non_final_state_locations]
        batch_size = next_states.shape[0]
        values = torch.zeros(batch_size).to(QValues.device)
        values[non_final_state_locations] = \
            target_net(non_final_states).max(dim=1)[0].detach()
        return values


for episode in range(NUM_EPISODES):
    env_manage.reset()
    state = env_manage.get_state()
    for step in count():
        action = agent.select_action(state, policy_net)
        reward = env_manage.take_action(action)
        next_state = env_manage.get_state()
        memory.push(tools.Experience(state, action, next_state, reward))
        state = next_state

        experiences = memory.sample(BATCH_SIZE)
        if experiences:
            states, actions, rewards, next_states = extract_tensors(experiences)
            q_values = QValues.get_current(policy_net, states, actions)
            next_q_values = QValues.get_next(target_net, next_states)
            optimal_q_values = (next_q_values * GAMMA) + rewards
            loss = F.mse_loss(q_values, optimal_q_values.unsqueeze(1))
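# The snippet above stops at the loss computation. A typical continuation is a
# gradient step on the policy network; a minimal sketch, assuming an Adam
# optimizer over policy_net.parameters() (the optimizer and learning rate are
# assumptions, not shown in the snippet above):
import torch.optim as optim

optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)  # assumed optimizer/lr

# inside the training loop, right after `loss` is computed:
optimizer.zero_grad()   # clear gradients from the previous step
loss.backward()         # backpropagate the TD loss
optimizer.step()        # update the policy network weights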
agent = dill.load(agent_file)

episodes = 5000
episode_length = 50
epsilon = np.linspace(0.1, 0.001, 5000)

# Perform actions
actions = []
reward = 0

# run the episode; state and env come from earlier in the script (not shown)
for step in range(0, episode_length):
    # find the state index
    state_index = state[0] * 10 + state[1]
    # print('State index:', state_index)
    # choose an action with the final (smallest) epsilon, i.e. almost greedily
    action = agent.select_action(state_index, epsilon[-1])
    # print('Action:', action)
    # the agent moves in the environment
    result = env.move(action, verbose=False)
    # print('Result:', result)
    # index of the next state (computed but not used here)
    next_index = result[0][0] * 10 + result[0][1]
    reward += result[1]
    actions.append(env.matrix.copy())
    # print('----------')
    state = result[0]

# Show the final status
im = matpl.plot(env.matrix, env, figsize=(6, 6), reduct=True)
matpl.add_patches(im, env)