def __init__(self, starting_state, state_space, action_space, alpha=0.5, gamma=0.95, exploration_strategy=None):
    """Tabular Q-learning agent.

    Args:
        starting_state: initial (encoded) state; becomes the first key of the Q-table.
        state_space: observation space, forwarded to the superclass.
        action_space: discrete action space; ``action_space.n`` sizes each Q-table row.
        alpha: learning rate.
        gamma: discount factor.
        exploration_strategy: action-selection strategy; defaults to a fresh
            ``EpsilonGreedy()`` per agent (see bug-fix note below).
    """
    super(QLAgent, self).__init__(state_space, action_space)
    self.state = starting_state
    self.action_space = action_space
    self.action = None
    self.alpha = alpha
    self.gamma = gamma
    # One Q-value per discrete action, initialised to 0 for the starting state;
    # rows for other states are presumably added lazily elsewhere.
    self.q_table = {self.state: [0 for _ in range(action_space.n)]}
    # BUG FIX: the original signature used `exploration_strategy=EpsilonGreedy()`,
    # a stateful default argument evaluated once at `def` time. Every agent built
    # with the default then shared a single EpsilonGreedy instance, so its decaying
    # epsilon was mutated jointly by all agents. Use the None-sentinel idiom to
    # create a fresh strategy per instance instead.
    self.exploration = exploration_strategy if exploration_strategy is not None else EpsilonGreedy()
    self.acc_reward = 0
])  # NOTE(review): closes an env/phase-list constructor that begins before this chunk.
# Select the reward signal by monkey-patching the env's internal reward hook.
# NOTE(review): relies on SumoEnvironment private attributes — verify against the env class.
if args.reward == 'queue':
    env._compute_rewards = env._queue_average_reward
else:
    env._compute_rewards = env._waiting_time_reward
# One independent training run per iteration; agents are rebuilt from scratch each run.
for run in range(1, args.runs + 1):
    initial_states = env.reset()
    # One Q-learning agent per traffic signal, keyed by its id, each with its own
    # epsilon-greedy exploration schedule taken from the CLI args.
    ql_agents = {
        ts: QLAgent(starting_state=env.encode(initial_states[ts]),
                    state_space=env.observation_space,
                    action_space=env.action_space,
                    alpha=args.alpha,
                    gamma=args.gamma,
                    exploration_strategy=EpsilonGreedy(initial_epsilon=args.epsilon, min_epsilon=args.min_epsilon, decay=args.decay))
        for ts in env.ts_ids
    }
    done = {'__all__': False}
    infos = []
    if args.fixed:
        # Fixed-time baseline: step with no actions until the episode ends.
        while not done['__all__']:
            _, _, done, _ = env.step({})
    else:
        # RL control: every agent picks an action each step.
        # NOTE(review): chunk is truncated here — the learning/update calls that
        # presumably follow env.step are outside this view.
        while not done['__all__']:
            actions = {ts: ql_agents[ts].act() for ts in ql_agents.keys()}
            s, r, done, _ = env.step(actions=actions)
traci.trafficlight.Phase(35000, 35000, 35000, "rrrGGG"), # west-east traci.trafficlight.Phase(2000, 2000, 2000, "rrryyy") ]) for run in range(1, runs + 1): initial_states = env.reset() ql_agents = { ts: QLAgent(starting_state=env.encode(initial_states[ts]), state_space=env.observation_space, action_space=env.action_space, alpha=alpha, gamma=gamma, exploration_strategy=EpsilonGreedy(initial_epsilon=0.05, min_epsilon=0.005, decay=decay)) for ts in env.ts_ids } infos = [] done = {'__all__': False} while not done['__all__']: actions = {ts: ql_agents[ts].act() for ts in ql_agents.keys()} s, r, done, info = env.step(actions=actions) infos.append(info) for agent_id in ql_agents.keys(): ql_agents[agent_id].learn(new_state=env.encode(s[agent_id]), reward=r[agent_id])
# Tail of the traffic-light phase programme (the list opens before this chunk).
traci.trafficlight.Phase(32000, 32000, 32000, "rrrrrGrrrrrG"),
traci.trafficlight.Phase(2000, 2000, 2000, "rrrrryrrrrry")
])
# Select the reward signal by monkey-patching the env's internal reward hook.
# NOTE(review): relies on SumoEnvironment private attributes — verify against the env class.
if args.reward == 'queue':
    env._compute_rewards = env._queue_average_reward
else:
    env._compute_rewards = env._waiting_time_reward
# One independent training run per iteration; agents are rebuilt from scratch each run.
for run in range(1, args.runs+1):
    initial_states = env.reset()
    # One Q-learning agent per traffic signal, keyed by its id, with an
    # epsilon-greedy schedule taken from the CLI args.
    ql_agents = {ts: QLAgent(starting_state=env.encode(initial_states[ts]),
                             state_space=env.observation_space,
                             action_space=env.action_space,
                             alpha=args.alpha,
                             gamma=args.gamma,
                             exploration_strategy=EpsilonGreedy(initial_epsilon=args.epsilon, min_epsilon=args.min_epsilon, decay=args.decay)) for ts in env.ts_ids}
    done = {'__all__': False}
    infos = []
    if args.fixed:
        # Fixed-time baseline: step with no actions until the episode ends.
        while not done['__all__']:
            _, _, done, _ = env.step({})
    else:
        # RL control: every agent picks an action each step.
        # NOTE(review): chunk is truncated after the verbose print — the learn()
        # calls presumably follow outside this view.
        while not done['__all__']:
            actions = {ts: ql_agents[ts].act() for ts in ql_agents.keys()}
            s, r, done, _ = env.step(actions=actions)
            if args.v:
                # Verbose trace for the single signal id 't'.
                # NOTE(review): decodes the old state but radix_ENcodes the new one —
                # asymmetry looks suspicious; confirm radix_encode is intended here.
                print('s=', env.radix_decode(ql_agents['t'].state), 'a=', actions['t'], 's\'=', env.radix_encode(s['t']), 'r=', r['t'])
# Tail of the env constructor's keyword arguments (the call opens before this chunk).
max_depart_delay=0,
phases=[
    # NOTE(review): 1-arg Phase(duration, state) signature here vs the 3-arg form
    # used elsewhere in this project — likely a different TraCI/SUMO version.
    traci.trafficlight.Phase(35, "GGGrrr"),  # north-south
    traci.trafficlight.Phase(2, "yyyrrr"),
    traci.trafficlight.Phase(35, "rrrGGG"),  # west-east
    traci.trafficlight.Phase(2, "rrryyy")
])
# One independent training run per iteration; agents are rebuilt from scratch each run.
for run in range(1, runs+1):
    initial_states = env.reset()
    # One Q-learning agent per traffic signal, keyed by its id.
    ql_agents = {ts: QLAgent(starting_state=env.encode(initial_states[ts]),
                             state_space=env.observation_space,
                             action_space=env.action_space,
                             alpha=alpha,
                             gamma=gamma,
                             exploration_strategy=EpsilonGreedy(initial_epsilon=0.05, min_epsilon=0.005, decay=decay)) for ts in env.ts_ids}
    infos = []
    done = {'__all__': False}
    # Standard act -> step -> learn loop until the env reports the episode done.
    while not done['__all__']:
        actions = {ts: ql_agents[ts].act() for ts in ql_agents.keys()}
        s, r, done, info = env.step(actions=actions)
        infos.append(info)
        for agent_id in ql_agents.keys():
            ql_agents[agent_id].learn(new_state=env.encode(s[agent_id]), reward=r[agent_id])
    # Close the SUMO connection for this run, then dump the per-step info records
    # to a run-specific CSV named after the hyperparameters.
    env.close()
    df = pd.DataFrame(infos)
    df.to_csv('outputs/4x4grid/c2_alpha{}_gamma{}_decay{}_run{}.csv'.format(alpha, gamma, decay, run), index=False)