def test_training():
    config = Config()
    n = 1
    env = make_parallel_env(n, 100000)
    update_config(env, config)
    model_path = "/home/liub/Desktop/mount/teamstrategy/oldmodels/mpe/aqmix+coach+vi2+ctr8+l10.0001+l20.0001/run0"
    #model_path = "/home/liub/Desktop/mount/teamstrategy/models/mpe/aqmix+ctr8+l10.0001+l20.0001/run0"

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []
    #orders = tt_orders = 0
    orders = 0
    tt_orders = 1e-12
    for it in tqdm(range(100)):
        o, e, c, m, ms = reset_wrapper(env)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        episode_reward = 0
        prev_z = None
        for t in range(config.max_steps):
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                _, z_team, logvar = qlearner.coach(o_, e_, c_, ms_)
                if prev_z is None:
                    mac.set_team_strategy(z_team)
                    prev_z = z_team
                else:
                    bs, n = z_team.shape[:2]
                    mask = ms_.sum(-1).gt(0).float()
                    #normal = D.Normal(z_team, (0.5*logvar).exp())
                    #logprob = normal.log_prob(prev_z).sum(-1)
                    #prob = logprob.exp()
                    #broadcast = (prob > 0.001).float()
                    l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()
                    broadcast = (l2 > 5).float()
                    mac.set_part_team_strategy(z_team, broadcast)
                    orders += (broadcast * mask).sum()
                    tt_orders += mask.sum()
                    prev_z = mac.z_team.clone()
            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, epsilon=0.)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()
        all_rewards.append(episode_reward)

    all_rewards = np.array(all_rewards)
    print(f"broadcast rate {orders/tt_orders}")
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()

def render_episodes():
    from PIL import Image
    config = Config()
    n = 1
    env = make_parallel_env(n, 9999)
    update_config(env, config)
    model_path = "/home/liub/Desktop/mount/teamstrategy/coach1/mpe/aqmix+coach+vi2+ctr4+l20.001/run0"
    #save_path = f"imgs/{config.method}/"

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []
    for it in range(20):
        save_path = f"imgs/{config.method}/it{it}/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        #fourcc = VideoWriter_fourcc(*'MP4V')
        #video = VideoWriter(f"{save_path}/epi{it+1}.mp4", fourcc, float(12), (700,700))

        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        prev_z = torch.zeros(o.shape[0], o.shape[1], config.coach_hidden_dim).to(config.device)
        print(c[0, :4])

        episode_reward = 0
        for t in range(config.max_steps):
            if "full" in config.method:
                m = ms
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                mac.set_team_strategy(z_team)

            frame = env.envs[0].render(mode="rgb_array")[0]
            #video.write(np.uint8(frame))
            #if t == 10:
            #    print(o[0,:4])
            im = Image.fromarray(frame)
            im.save(f"{save_path}t{t}.jpg")

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon=0.)
            prev_a = torch.LongTensor(actions).to(config.device)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()
            #if (t+1) % config.centralized_every == 0 and config.has_coach:
            #    prev_z = z

        all_rewards.append(episode_reward)
        #video.release()

    all_rewards = np.array(all_rewards)
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()

def train():
    env = make_atari(conf.env_name)
    env = bench.Monitor(env, os.path.join(conf.path_game_scan, conf.env_name))
    env = wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=True)
    env = WrapPyTorch(env)

    agent = Agent(conf=conf, env=env, test=False)

    episode_reward = 0
    losses = []
    all_rewards = []
    state = env.reset()  # (1, 84, 84)

    for frame_idx in range(1, conf.max_train_steps + 1):
        epsilon = conf.epsilon_by_frame(frame_idx)
        action = agent.act(state, epsilon, test=False)
        # agent.save_action(action, frame_idx)

        next_state, reward, done, _ = env.step(action)
        next_state = None if done else next_state

        loss = agent.update(state, action, reward, next_state, done, test=False, frame=frame_idx)
        state = next_state  # advance to the next observation
        episode_reward += reward

        if done:
            agent.finish_nstep()
            state = env.reset()
            agent.save_reward(episode_reward)
            episode_reward = 0

        if loss is not None:
            losses.append(loss.item())

        if frame_idx % conf.log_freq == 0 and loss:
            print("frame: {}, loss: {}, reward: {}.".format(frame_idx, loss.item(), episode_reward))
            if conf.save_curve:
                curve_plot(conf.path_plot, frame_idx, agent.all_rewards, losses)

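# A minimal sketch of the kind of schedule `conf.epsilon_by_frame` above is assumed to
# provide: an exponential decay from a starting epsilon toward a floor value. The actual
# schedule and its parameter names live in the project's config; the names and constants
# below (epsilon_by_frame_example, eps_start, eps_final, eps_decay) are illustrative only.
import math

def epsilon_by_frame_example(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=30000):
    # Smoothly decays exploration as training progresses.
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)

# e.g. epsilon_by_frame_example(1) is ~1.0, epsilon_by_frame_example(30000) is ~0.37
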
def run():
    # uncomment these if you want
    #memory_fix()
    #memory_hard_fix()

    # setup data feed
    dm = DataManager()
    # for _ in range(10):
    #     print(dm.renderer_stream.next())

    # setup exchange. Needs raw data
    binance_exchange = BinanceExchange(data=dm.data)

    # setup portfolio
    binance_portfolio = BinancePortfolio(exchange=binance_exchange)

    # setup environment. Needs data feed stream
    env = Environment(portfolio=binance_portfolio,
                      data_stream=dm.stream,
                      renderer_stream=dm.renderer_stream)
    # for _ in range(10):
    #     print(env.observer.feed.next())

    # setup agent
    agent = Agent(environment=env)

    # train agent
    print(agent.train(steps=100, episodes=4, render_interval=10))

    # show plots of performance
    a = binance_portfolio.performance.plot()
    plt.show()
    b = binance_portfolio.performance.net_worth.plot()
    plt.show()

def play():
    client = carla.Client(settings.CONNECTION_IP, settings.CONNECTION_PORT)
    client.set_timeout(20.0)

    # Create controllers
    trafic_control = TraficControlThread(client)
    weather_control = WeatherControlThread(client)
    trafic_control.start()
    weather_control.start()
    logger.info("Controllers started")

    predicter = ModelHandler(settings.MODEL_NAME, target_weights_path=MODEL_WEIGHTS, train=False)
    agent = Agent(999999, client, False)

    try:
        while True:
            step = 1
            state = agent.spawn()
            while True:
                start_step_time = time.time()

                action = int(np.argmax(predicter.get_qs(state)))
                new_state, _, done = agent.step(action)
                state = new_state

                if done:
                    agent.clear_agent()
                    break

                time_diff1 = agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
                time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
                if time_diff1 > 0:
                    time.sleep(min(0.125, time_diff1))
                elif time_diff2 > 0:
                    time.sleep(min(0.125, time_diff2))

                step += 1  # advance the step counter used for frame pacing
    except KeyboardInterrupt:
        logger.info("Exiting playing - Keyboard interrupt")
    except:
        logger.error("Playing failed")
    finally:
        trafic_control.terminate = True
        weather_control.terminate = True

def test_exp(config, fn, exp, threshold=0.):
    env = make_parallel_env(1, 9999, fn)
    update_config(env, config)
    config.method = exp
    k = exp.find("ctr")
    config.centralized_every = int(exp[k + 3:k + 4])
    if "165" in exp:
        config.agent_hidden_dim = 165
    else:
        config.agent_hidden_dim = 128
    if "coach" in exp:
        config.has_coach = True

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)

    R = []
    OR = []
    for run_num in tqdm([0, 1, 2, 3, 4]):
        model_path = f"/home/liub/Desktop/mount/teamstrategy/coach1/mpe/{exp}/run{run_num}"
        qlearner.load_models(model_path)
        qlearner.cuda()

        reward = 0
        n_orders = 0
        n_total_orders = 1e-12
        for n_ep in range(n_eval):
            o, e, c, m, ms = reset_wrapper(env)
            prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
            rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
            prev_z = None
            for t in range(145):
                if "full" in exp:
                    m = ms
                if "interval" in exp and t % config.centralized_every == 0:
                    m = ms
                o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

                if config.has_coach and t % config.centralized_every == 0:
                    ma = ms_.sum(-1).gt(0).float()
                    with torch.no_grad():
                        _, z_team, _ = qlearner.coach(o_, e_, c_, ms_)
                    if prev_z is None:
                        mac.set_team_strategy(z_team * ma.unsqueeze(-1))
                        prev_z = z_team
                        n_orders += ma.sum().item()
                        n_total_orders += ma.sum().item()
                    else:
                        bs, n = z_team.shape[:2]
                        #normal = D.Normal(z_team, (0.5*logvar).exp())
                        #logprob = normal.log_prob(prev_z).sum(-1)
                        #prob = logprob.exp()
                        #broadcast = (prob > 0.001).float()
                        l2 = (z_team * ma.unsqueeze(-1) - prev_z * ma.unsqueeze(-1)).pow(2).sum(-1).sqrt()
                        broadcast = (l2 > threshold).float()
                        mac.set_part_team_strategy(z_team, broadcast)
                        n_orders += broadcast.sum().item()
                        n_total_orders += ma.sum().item()
                        prev_z = mac.z_team.clone()

                actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, 0)
                prev_a = torch.LongTensor(actions).to(config.device)
                o, e, m, ms, r, d = step_wrapper(env, actions)
                reward += r.sum()

        reward = reward / n_eval
        rate = n_orders / n_total_orders
        R.append(reward)
        OR.append(rate)

    R = np.array(R)
    OR = np.array(OR)
    print(
        f"{exp:30s}[{threshold:3d}] | muR: {R.mean():.4f} stdR: {R.std()/np.sqrt(5):.4f} | muC: {OR.mean():.4f} stdC: {OR.std()/np.sqrt(5):.4f}"
    )
    return R.mean(), R.std(), OR.mean(), OR.std()

def run():
    config = Config()
    run_dir, log_dir = prerun(config)
    env = make_parallel_env(config.n_rollout_threads, config.seed)
    update_config(env, config)
    config.pprint()

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)
    if config.device == "cuda":
        qlearner.cuda()

    train_stats = {
        "reward": [],
    }

    step = 0
    reward_buffer = collections.deque(maxlen=100)
    use_tqdm = True
    n_iters = config.total_steps // config.max_steps // config.n_rollout_threads
    if use_tqdm:
        pbar = tqdm(total=n_iters)

    prev_update_step = 0
    start_epsilon = 1.0
    end_epsilon = 0.05
    delta = -np.log(end_epsilon) / n_iters
    logger = SummaryWriter(log_dir)

    for it in range(n_iters):
        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
        temporal_buffer = collections.deque(maxlen=config.centralized_every + 1)  # record t=0,1,...T
        episode_reward = 0.
        epsilon = min(start_epsilon, max(end_epsilon, np.exp(-it * delta)))
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

        for t in range(config.max_steps):
            step += config.n_rollout_threads
            if "full" in config.method:
                m = ms
            if "interval" in config.method and t % config.centralized_every == 0:
                m = ms
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

            if config.has_coach and t % config.centralized_every == 0:
                with torch.no_grad():
                    z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                mac.set_team_strategy(z_team)

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon)  # [n_agents,]
            prev_a = torch.LongTensor(actions).to(config.device)
            no, ne, nm, nms, r, d = step_wrapper(env, actions)
            temporal_buffer.append((o, e, c, m, ms, actions, r))
            episode_reward += r

            if t % config.centralized_every == 0 and t > 0:
                O, E, C, M, MS, A, R = map(np.stack, zip(*temporal_buffer))
                for j in range(config.n_rollout_threads):
                    qlearner.buffer.push(O[:, j], E[:, j], C[:, j], M[:, j], MS[:, j], A[:, j], R[:, j])

            if (step - prev_update_step) >= config.update_every:
                prev_update_step = step
                qlearner.update(logger, step)

            o = no; e = ne; m = nm; ms = nms

        reward_buffer.extend(episode_reward)
        pbar.update(1)
        running_reward_mean = np.array(reward_buffer).mean()
        train_stats["reward"].append((step, running_reward_mean))
        logger.add_scalar("reward", running_reward_mean, step)
        pbar.set_description(f"ep {it:10d} | {running_reward_mean:8.4f} |")

        if (it + 1) % 100 == 0 or (it + 1 == n_iters):
            with open(f"{log_dir}/stats.npy", 'wb') as f:
                np.save(f, train_stats)
            qlearner.save_models(f"{run_dir}")

    if use_tqdm:
        pbar.close()
    env.close()

def qLearning(learning_rate, discount_factor, epsilon, reward_map, state_grid, max_steps, epochs):
    agent = Agent(learning_rate, discount_factor, reward_map, state_grid, max_steps)
    stateDic = {}
    # all_epochs = []
    # epoch_rewards = []
    # every_5 = []
    # mean_every_5 = []
    # epochs_mean = []

    for e in range(epochs):
        current_state = Agent.choose_start(reward_map, state_grid, max_steps)
        # epoch_reward = 0
        # all_epochs.append(e)
        # current_epoch_rewards = []

        for _ in range(0, len(reward_map) * len(reward_map[0]) * 2):
            action = Agent.epsilon_greedy_policy(epsilon, current_state)
            next_state, reward = agent.take_action(current_state, action, stateDic)

            # update
            current_state.update_qvalue(learning_rate, reward, discount_factor, next_state, action)
            stateDic[(current_state.posX, current_state.posY, current_state.steps)] = current_state
            # epoch_reward += reward
            # current_epoch_rewards.append(reward)

            if next_state.is_terminal:
                # epoch_rewards.append(epoch_reward)
                # every_5.append(epoch_reward)
                break
            current_state = next_state

        # epochs_mean.append(sum(current_epoch_rewards) / len(current_epoch_rewards))
        # if (e % 5 == 0):
        #     mean_every_5.append(sum(every_5) / len(every_5))
        #     every_5 = []

    # plt.style.use(['dark_background'])
    # plt.figure(figsize=(18, 12))
    # plt.plot(epochs_mean)
    # plt.xlabel("Episodes")
    # plt.ylabel("Mean reward per episode", size=10)
    # plt.title("Mean reward per episode, lambda 0.9")
    # plt.savefig('epochs_mean.png')
    # plt.show()

    # plt.style.use(['dark_background'])
    # plt.figure(figsize=(20, 10))
    # plt.plot(epoch_rewards)
    # plt.xlabel("Episodes")
    # plt.ylabel("Cumulative reward", size=10)
    # plt.title("Cumulative reward per episode")
    # plt.savefig('reward_cumulative.png')
    # plt.show()

    # plt.figure(figsize=(20, 10))
    # plt.plot(mean_every_5)
    # plt.xlabel("Episodes")
    # plt.ylabel("Moving average over 5 episodes", size=10)
    # plt.title("Moving average")
    # plt.savefig('mean_every5.png')
    # plt.show()

    # plt.figure(figsize=(20, 10))
    # plt.plot(epochs_mean)
    # plt.xlabel("Episodes")
    # plt.ylabel("Mean reward per episode", size=10)
    # plt.title("Mean reward per episode")
    # plt.savefig('epochs_mean.png')
    # plt.show()

    return stateDic

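# For reference, a minimal sketch of the update that `update_qvalue` above is assumed to
# perform: the standard tabular Q-learning rule
#     Q(s, a) <- Q(s, a) + lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))
# The dictionary-based q_table used here is illustrative, not the project's State class.
def q_learning_update(q_table, s, a, reward, s_next, actions, lr=0.1, gamma=0.9):
    # q_table maps (state, action) -> value; missing entries default to 0.0
    best_next = max(q_table.get((s_next, a2), 0.0) for a2 in actions)
    td_error = reward + gamma * best_next - q_table.get((s, a), 0.0)
    q_table[(s, a)] = q_table.get((s, a), 0.0) + lr * td_error
    return q_table[(s, a)]
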
class ATC(core.Entity):
    ''' Example new entity object for BlueSky. '''

    def __init__(self):
        super().__init__()

        self.super_start = time.perf_counter()
        self.initilized = False
        self.epoch_counter = 0

        # [Success, Fail]
        self.results = np.zeros(2)
        self.all_success = []
        self.all_fail = []
        self.mean_success = 0
        self.all_mean_success, self.best = 0, 0
        self.mean_rewards = []

        self.epoch_actions = np.zeros(ACTION_SHAPE)

        self.start = None
        self.stop = None

        self.dist = [0, -1]
        self.spd = [0, -1]
        self.trk = [0, 360]
        self.vs = [0, -1]

        self.last_observation = {}
        self.last_reward_observation = {}
        self.previous_action = {}
        self.observation = {}

    def on_load(self):
        self.sector_manager = Sector_Manager(SECTORS)
        self.route_manager = Route_Manager(ROUTES,
                                           test_routes=VISUALIZE,
                                           draw_paths=VISUALIZE)
        self.traffic_manager = Traffic_Manager(max_ac=MAX_AC,
                                               times=TIME_SEP,
                                               max_spd=CONSTRAINTS["cas"]["max"],
                                               min_spd=CONSTRAINTS["cas"]["min"],
                                               max_alt=32000,
                                               min_alt=32000,
                                               network=self.route_manager)

        self.memory = Memory()
        self.agent = Agent(state_size=STATE_SHAPE,
                           action_size=ACTION_SHAPE,
                           value_size=VALUE_SHAPE)

        try:
            self.agent.load(path=FILE + "best.h5")
        except:
            try:
                self.agent.load(path=FILE + ".h5")
            except:
                pass

        self.initilized = True
        print("ATC: READY")

        string = "=================================\n UPDATE: RUNNING EPOCH {}\n=================================\n".format(
            self.format_epoch())
        self.print_all(string)

    # Functions that need to be called periodically can be indicated to BlueSky
    # with the timed_function decorator
    @core.timed_function(name='example', dt=12)
    def update(self):
        # Initialise the system
        if not self.initilized:
            self.on_load()

        # Start epoch timer
        if not self.start:
            self.start = time.perf_counter()

        # Create aircraft
        self.traffic_manager.spawn()

        # Update aircraft active sectors
        self.traffic_manager.update_active(self.sector_manager.system_sectors)

        # Generate a full distance matrix between each aircraft
        full_dist_matrix = self.get_dist_martix()

        # Get nearest ac in a matrix
        nearest_ac = self.get_nearest_ac(dist_matrix=full_dist_matrix)

        # Get goal distances for each aircraft
        g_distance = self.get_goal_distances()

        # Get an array of terminal aircraft
        terminal_ac, terminal_id = self.get_terminal(nearest_ac, g_distance)

        self.handle_terminal(terminal_id)

        if self.traffic_manager.check_done():
            self.epoch_reset()
            return

        if not TRAIN and (self.traffic_manager.total % 50 == 0):
            string = "Success: {} | Fail: {} | Mean Success: {:.3f}%".format(
                int(self.results[0]), int(self.results[1]),
                (self.results[0] / MAX_AC) * 100)
            self.print_all(string)

        if len(traf.id) <= 0:
            return

        if not len(traf.id) == 0:
            policy, normal_state, normal_context = self.get_actions(
                terminal_ac, g_distance, full_dist_matrix)

            if len(policy) > 0:
                idx = 0
                new_actions = {}
                for i in range(len(traf.id)):
                    if terminal_ac[i] == 0 and len(
                            self.traffic_manager.active_sectors[i]) > 0:
                        if not np.any(np.isnan(policy[idx])):
                            _id = traf.id[i]

                            if not _id in self.last_observation.keys():
                                self.last_observation[_id] = [
                                    normal_state[idx], normal_context[idx]
                                ]

                            action = np.random.choice(
                                ACTION_SHAPE, 1, p=policy[idx].flatten())[0]
                            # print(policy[idx], action)
                            self.epoch_actions[action] += 1

                            if not _id in self.observation.keys(
                            ) and _id in self.previous_action.keys():
                                self.observation[_id] = [
                                    normal_state[idx], normal_context[idx]
                                ]

                                self.memory.store(_id,
                                                  self.last_observation[_id],
                                                  self.previous_action[_id],
                                                  nearest_ac[idx])

                                self.last_observation[_id] = self.observation[_id]
                                del self.observation[_id]

                            self.perform_action(i, action)
                            new_actions[_id] = action

                        self.previous_action = new_actions
                        idx += 1

    # Act
    def get_actions(self, terminal_ac, g_dists, dist_matrix):
        ids = []
        new_actions = {}

        state = self.get_state()
        normal_state, normal_context = self.normalise_all(
            state, terminal_ac, g_dists, dist_matrix)

        policy = []
        if not len(normal_state) == 0:
            policy = self.agent.act(normal_state, normal_context)

        return policy, normal_state, normal_context

    # For an aircraft perform an action
    def perform_action(self, i, action):
        if action < 3:
            traf_alt = int(traf.alt[i] / ft)
            new_alt = int(round((traf_alt + ACTIONS[action])))
            alt = max(CONSTRAINTS["alt"]["min"],
                      min(CONSTRAINTS["alt"]["max"], new_alt))
            # print(traf_alt, alt)
            stack.stack("{} alt {}".format(traf.id[i], alt))
        elif action == 4:
            traf_alt = traf.alt[i] / ft
            new_alt = int(round((traf_alt)))

    # Get the current state
    def get_state(self):
        state = np.zeros((len(traf.id), 6))
        start_ids, end_ids = self.get_all_nodes()

        state[:, 0] = traf.lat
        state[:, 1] = traf.lon
        state[:, 2] = traf.trk
        state[:, 3] = traf.alt
        state[:, 4] = traf.tas
        state[:, 5] = traf.vs

        return state

    # Get all nodes for each aircraft
    def get_all_nodes(self):
        start_ids = np.zeros(len(traf.id), dtype=int)
        end_ids = np.zeros(len(traf.id), dtype=int)

        for i in range(len(traf.id)):
            _id = traf.id[i]
            route = self.traffic_manager.routes[_id]
            start_ids[i] = np.argwhere(self.route_manager.idx_array == route[0])
            end_ids[i] = np.argwhere(self.route_manager.idx_array == route[-1])

        return start_ids, end_ids

    # Normalise the state and context
    def normalise_all(self, state, terminal_ac, g_dists, dist_matrix):
        normal_states = self.normalise_state(state, terminal_ac, g_dists)
        normal_context = []

        start_ids, end_ids = self.get_all_nodes()

        max_agents = 0
        for _id in traf.id:
            if terminal_ac[traf.id2idx(_id)] > 0 or len(
                    self.traffic_manager.active_sectors[traf.id2idx(_id)]) <= 0:
                continue

            new_context = self.normalise_context(_id, terminal_ac, dist_matrix,
                                                 start_ids, end_ids)
            max_agents = max(max_agents, len(new_context))

            if len(normal_context) == 0:
                normal_context = new_context
            else:
                normal_context = np.append(
                    keras.preprocessing.sequence.pad_sequences(
                        normal_context, max_agents, dtype='float32'),
                    keras.preprocessing.sequence.pad_sequences(
                        new_context, max_agents, dtype='float32'),
                    axis=0)

        if len(normal_context) == 0:
            normal_context = np.array([0, 0, 0, 0, 0, 0, 0]).reshape(1, 1, 7)

        # print(normal_states.shape, normal_context.shape)
        return normal_states, normal_context

    # Normalise the agent state only
    def normalise_state(self, state, terminal_ac, g_dists):
        total_active = 0
        for i in range(len(terminal_ac)):
            if terminal_ac[i] == 0 and len(
                    self.traffic_manager.active_sectors[i]) > 0:
                total_active += 1

        normalised_state = np.zeros((total_active, STATE_SHAPE))

        count = 0
        for i in range(len(traf.id)):
            if terminal_ac[i] > 0 or len(
                    self.traffic_manager.active_sectors[i]) <= 0:
                continue

            normalised_state[count, :] = self.normalise(state[i],
                                                        'state',
                                                        traf.id[i],
                                                        g_dist=g_dists[i])
            count += 1

        return normalised_state

    # Get and normalise context
    def normalise_context(self, _id, terminal_ac, dist_matrix, start_ids, end_ids):
        context = []

        idx = traf.id2idx(_id)
        distances = dist_matrix[:, idx]

        this_sectors = self.traffic_manager.active_sectors[idx]
        this_lat, this_lon = traf.lat[idx], traf.lon[idx]

        for i in range(len(distances)):
            # Ignore current aircraft
            if i == idx:
                continue

            if terminal_ac[i] > 0 or len(
                    self.traffic_manager.active_sectors[i]) <= 0:
                continue

            sectors = self.traffic_manager.active_sectors[i]

            # Only care if the ac is in a matching sector
            flag = False
            for x in sectors:
                if x in this_sectors:
                    flag = True

            if not flag:
                continue

            dist = get_dist([this_lat, this_lon], [traf.lat[i], traf.lon[i]])

            # Only care about aircraft within visible distance
            if dist > 40:
                continue

            spd = traf.tas[i]
            alt = traf.alt[i]
            trk = traf.trk[i]
            vs = traf.vs[i]
            start_id = start_ids[i]
            end_id = end_ids[i]

            self.dist[1] = max(self.dist[1], dist)
            self.spd[1] = max(self.spd[1], spd)
            self.vs[1] = max(self.vs[1], vs)

            dist = dist / self.dist[1]
            spd = spd / self.spd[1]
            trk = trk / self.trk[1]
            alt = ((alt/ft)-CONSTRAINTS["alt"]["min"]) / \
                (CONSTRAINTS["alt"]["max"]-CONSTRAINTS["alt"]["min"])

            vs = 0
            if not traf.vs[i] == 0:
                vs = traf.vs[i] / self.vs[1]

            n_nodes, dist2next = get_n_nodes(traf.id[i], self.traffic_manager,
                                             self.route_manager)

            self.dist[1] = max(self.dist[1], dist2next)
            dist2next = dist2next / self.dist[1]

            if len(context) == 0:
                context = np.array([
                    spd, alt, trk, vs, dist, dist2next, n_nodes[0], n_nodes[1],
                    n_nodes[2]
                ]).reshape(1, 1, 9)
            else:
                context = np.append(context,
                                    np.array([
                                        spd, alt, trk, vs, dist, dist2next,
                                        n_nodes[0], n_nodes[1], n_nodes[2]
                                    ]).reshape(1, 1, 9),
                                    axis=1)

        if len(context) == 0:
            context = np.zeros(9).reshape(1, 1, 9)

        return context

    # Perform normalisation
    def normalise(self, state, what, _id, g_dist=None):
        # Normalise the entire state
        if what == 'state':
            if not g_dist:
                raise Exception(
                    "For normalising a state please pass the distance to the goal."
                )

            self.dist[1] = max(self.dist[1], g_dist)
            self.spd[1] = max(self.spd[1], state[4])
            self.vs[1] = max(self.vs[1], state[5])

            dist = g_dist / self.dist[1]
            spd = state[4] / self.spd[1]
            trk = state[2] / self.trk[1]
            alt = ((state[3]/ft)-CONSTRAINTS["alt"]["min"]) / \
                (CONSTRAINTS["alt"]["max"]-CONSTRAINTS["alt"]["min"])

            vs = 0
            if not state[5] == 0:
                vs = state[5] / self.vs[1]

            n_nodes, dist2next = get_n_nodes(_id, self.traffic_manager,
                                             self.route_manager)

            self.dist[1] = max(self.dist[1], dist2next)
            dist2next = dist2next / self.dist[1]

            return np.array([
                spd, alt, trk, vs, dist, dist2next, n_nodes[0], n_nodes[1],
                n_nodes[2]
            ])

    # Get the terminal aircraft
    def get_terminal(self, nearest_ac, g_dists):
        terminal_ac = np.zeros(len(traf.id), dtype=int)
        terminal_id = []

        # Loop through all aircraft
        for i in range(len(traf.id)):
            # Terminal state 0 = not terminal, 1 = collision, 2 = success
            T = 0

            # Only care about aircraft in a sector
            if len(self.traffic_manager.active_sectors[i]) > 0:
                close_ac = nearest_ac[i]
                n_ac_data = (close_ac[0], close_ac[1])

                # Get the terminal state
                T = self.agent.terminal(i, n_ac_data, g_dists[i])

                # Only care about terminal aircraft
                if not T == 0:
                    # Update collision aircraft
                    if T == 1:
                        terminal_ac[i] = 1
                        terminal_ac[traf.id2idx(close_ac[2])] = 1
                    elif not terminal_ac[i] == 1:
                        terminal_ac[i] = 2

                    _id = traf.id[i]
                    self.memory.store(_id, self.last_observation[_id],
                                      self.previous_action[_id],
                                      nearest_ac[i], T)

        for i in range(len(terminal_ac)):
            if terminal_ac[i] > 0:
                terminal_id.append([traf.id[i], terminal_ac[i]])

        return terminal_ac, terminal_id

    # Handle terminal aircraft
    def handle_terminal(self, terminal_id):
        for ac in terminal_id:
            stack.stack('DEL {}'.format(ac[0]))

            self.traffic_manager.active -= 1

            if ac[1] == 1:
                self.results[1] += 1
            elif ac[1] == 2:
                self.results[0] += 1

    # Generates a distance matrix of all aircraft in the system
    def get_dist_martix(self):
        size = traf.lat.shape[0]
        return geo.latlondist_matrix(np.repeat(traf.lat, size),
                                     np.repeat(traf.lon, size),
                                     np.tile(traf.lat, size),
                                     np.tile(traf.lon, size)).reshape(size, size)

    # Get the nearest aircraft to agents
    def get_nearest_ac(self, dist_matrix):
        nearest = []

        # Loop through all aircraft
        for i in range(len(traf.id)):
            a_alt = traf.alt[i] / ft
            ac_dists = dist_matrix[:, i]

            close = 10e+25
            alt_sep = 10e+25
            nearest_id = None

            # Loop through the row on the dist matrix
            for x in range(len(ac_dists)):
                # Ensure the aircraft is in controlled airspace and not the current aircraft
                if not x == i and len(
                        self.traffic_manager.active_sectors[x]) > 0:
                    # See if it is closest and update
                    if ac_dists[x] < close:
                        close = float(ac_dists[x])
                        i_alt = traf.alt[x] / ft
                        alt_sep = abs(a_alt - i_alt)
                        nearest_id = traf.id[x]

            nearest.append([close, alt_sep, nearest_id])

        return np.array(nearest)

    # Returns a matrix of distances to the goal
    def get_goal_distances(self):
        goal_ds = np.zeros(len(traf.id), dtype=float)

        for i in range(len(traf.id)):
            goal_ds[i] = get_goal_dist(traf.id[i], self.traffic_manager,
                                       self.route_manager)

        return goal_ds

    # Reset the environment for the next epoch
    def epoch_reset(self):
        # Reset the traffic creation
        self.traffic_manager.reset()

        # Keep track of all successes and failures
        self.all_success.append(self.results[0])
        self.all_fail.append(self.results[1])

        # Calculate total mean success
        self.all_mean_success = np.mean(self.all_success)

        # Calculate rolling mean success
        if (self.epoch_counter + 1) >= 50:
            self.mean_success = np.mean(self.all_success[-50:])

        if (self.epoch_counter + 1) % 5 == 0:
            if self.mean_success > self.best:
                if TRAIN:
                    print('::::::: Saving Best ::::::')
                    self.agent.save(path=NEW_FILE + "best.h5")
                self.best = self.mean_success

        if TRAIN:
            print(':::::: Saving Model ::::::')
            self.agent.save(path=NEW_FILE + ".h5")
            print(":::::::: Training ::::::::")
            self.agent.train(self.memory)
            print(":::::::: Complete ::::::::")

        temp = np.array([np.array(self.all_success), np.array(self.all_fail)])
        np.savetxt("Files/" + NEW_FILE + "_numpy.csv", temp, delimiter=',')

        # Stop the timer
        self.stop = time.perf_counter()

        # -------- Printing Outputs --------
        string = "Epoch run in {:.2f} seconds".format(self.stop - self.start)
        self.print_all(string)

        string = "Success: {} | Fail: {} | Mean Success: {:.3f}% | (50) Mean Success Rolling {:.3f}% | Best {:.3f}%".format(
            int(self.results[0]), int(self.results[1]),
            (self.all_mean_success / MAX_AC) * 100,
            (self.mean_success / MAX_AC) * 100, (self.best / MAX_AC) * 100)
        self.print_all(string)

        string = "Actions -> Descend: {}, Hold Current: {}, Climb: {}, Maintain Climb: {}".format(
            self.epoch_actions[0], self.epoch_actions[1],
            self.epoch_actions[2], self.epoch_actions[3])
        # string = "Actions -> Descend: {}, Climb: {}".format(
        #     self.epoch_actions[1], self.epoch_actions[0])
        self.print_all(string)

        if self.epoch_counter + 1 >= EPOCHS:
            super_stop = time.perf_counter()
            stack.stack("STOP")
            string = "::END:: Training {} episodes took {:.2f} hours".format(
                EPOCHS, ((super_stop - self.super_start) / 60) / 60)
            self.print_all(string)
            return

        self.epoch_counter += 1

        string = "=================================\n UPDATE: RUNNING EPOCH {}\n=================================\n".format(
            self.format_epoch())
        self.print_all(string)

        # Reset values
        self.results = np.zeros(2)
        self.stop = None
        self.start = None
        self.mean_rewards = []
        self.epoch_actions = []
        self.epoch_actions = np.zeros(ACTION_SHAPE)
        self.previous_action = {}
        self.last_observation = {}
        self.observation = {}

    # Scripts for printing values
    def print_all(self, string):
        stack.stack(f'ECHO {string}')
        print(string)

    def format_epoch(self):
        epoch_string = ""

        if self.epoch_counter + 1 < 10:
            epoch_string += "0"
        if self.epoch_counter + 1 < 100:
            epoch_string += "0"
        if self.epoch_counter + 1 < 1000:
            epoch_string += "0"
        if self.epoch_counter + 1 < 10000:
            epoch_string += "0"

        epoch_string += str(self.epoch_counter + 1)

        return epoch_string

class GridWorld:
    def __init__(self, world):
        self.world = world.split('\n ')[1:-1]
        self.action_map = {0: 'right', 1: 'down', 2: 'left', 3: 'up'}
        self.action_space = [0, 1, 2, 3]
        self.slip = 0.2  # 20% chance of taking wrong action
        self.col = len(self.world[0])  # 10 - num of columns in the above string
        self.row = len(self.world)  # 5 - num of rows in the above string
        self.state_color = (50, 100, 10)
        self.renderfirst = True
        self.policy = {}
        self.episode_step = 0
        self._max_epi_step = 1000
        self.wall_group = pg.sprite.Group()
        self.state_group = pg.sprite.Group()
        self.state_dict = defaultdict(lambda: 0)

        i = 0
        for y, et_row in enumerate(self.world):
            for x, block_type in enumerate(et_row):
                if block_type == 'w':
                    self.wall_group.add(Wall(col=x, row=y))
                elif block_type == 'a':
                    self.agent = Agent(col=x, row=y)
                    self.state_group.add(State(col=x, row=y))
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': -1,
                        'done': False
                    }
                    i += 1
                elif block_type == 'g':
                    self.goal = Goal(col=x, row=y)
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': 10,
                        'done': True
                    }
                    i += 1
                elif block_type == ' ':
                    self.state_group.add(State(col=x, row=y))
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': -1,
                        'done': False
                    }
                    i += 1

        self.state_dict = dict(self.state_dict)
        self.state_count = len(self.state_dict)

    def reset(self):
        self.episode_step = 0
        self.agent.reInitilizeAgent()
        return self.state_dict[(self.agent.initial_position.x,
                                self.agent.initial_position.y)]['state']

    def get_action_with_probof_slip(self, action):
        # slip property of env
        individual_slip = self.slip / 3
        prob = [individual_slip for a in self.action_space]
        prob[action] = 1 - self.slip
        act = np.random.choice(self.action_space, p=prob)
        return act

    def step(self, action, testing=False):
        if not testing:
            action = self.get_action_with_probof_slip(action)
        action = self.action_map[action]
        response = self.agent.move(action, self.wall_group, self.state_dict)
        self.episode_step += 1
        if self.episode_step <= self._max_epi_step:
            return response['state'], response['reward'], response['done'], {}  # info
        else:
            return response['state'], response['reward'], True, {'TimeLimit': True}

    def render(self):
        if self.renderfirst:
            pg.init()
            self.screen = pg.display.set_mode((self.col * 50, self.row * 50))
        self.screen.fill(self.state_color)
        self.wall_group.draw(self.screen)
        self.goal.draw(self.screen)
        self.agent.draw(self.screen)
        pg.display.update()
        pg.display.flip()

    def close(self):
        self.renderfirst = True
        pg.quit()

    def setPolicy(self, policy):
        for i, act in enumerate(policy):
            self.policy[i] = self.action_map[act]
        for s in self.state_group:
            s.change_with_policy(self.state_dict, self.policy)

    def play_as_human(self, show_policy=False):
        # policy = {state_no: action('left')}
        if show_policy and len(self.policy) == 0:
            raise Exception(
                "Sorry, no policy found setPolicy first...use world.setPolicy([list of action for states])"
            )
        pg.init()
        screen = pg.display.set_mode((self.col * 50, self.row * 50))
        clock = pg.time.Clock()
        done = False
        while not done:
            for event in pg.event.get():
                if event.type == pg.QUIT:
                    done = True
                elif event.type == pg.KEYDOWN:
                    if event.key == pg.K_LEFT:
                        response = self.agent.move('left', self.wall_group, self.state_dict)
                        #print(response)
                    elif event.key == pg.K_RIGHT:
                        response = self.agent.move('right', self.wall_group, self.state_dict)
                        #print(response)
                    elif event.key == pg.K_UP:
                        response = self.agent.move('up', self.wall_group, self.state_dict)
                        #print(response)
                    elif event.key == pg.K_DOWN:
                        response = self.agent.move('down', self.wall_group, self.state_dict)
                        #print(response)

            screen.fill(self.state_color)
            self.wall_group.draw(screen)
            if show_policy:
                self.state_group.draw(screen)
            self.goal.draw(screen)
            self.agent.draw(screen)
            pg.display.update()
            pg.display.flip()
            clock.tick(60)
        pg.quit()

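# A minimal usage sketch for GridWorld, assuming a layout string in the format the parser
# above expects ('w' = wall, 'a' = agent start, 'g' = goal, ' ' = free cell; rows separated
# by "\n " as consumed by split('\n ')). The world_string below and the random policy are
# illustrative only; the project's own world definitions and sprite classes are assumed to
# be importable in this module.
if __name__ == "__main__":
    world_string = "\n wwwwwwwwww\n wa       w\n w   ww   w\n w      g w\n wwwwwwwwww\n "
    world = GridWorld(world_string)
    state = world.reset()
    for _ in range(20):
        action = np.random.choice(world.action_space)  # random policy
        state, reward, done, info = world.step(action)
        world.render()
        if done:
            break
    world.close()
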
class Trainer(Thread):
    def __init__(self, client, identifier, epsilon, get_qs_callbatch, update_replay_memory_callback):
        super().__init__()
        self.daemon = True
        self.client = client
        self.terminate = False
        self.fail_flag = False
        self.halt = False
        self.get_qs = get_qs_callbatch
        self.update_replay_memory = update_replay_memory_callback
        self.identifier = identifier
        self.agent = Agent(identifier, self.client, True)
        self.action = None
        self.episode = 0
        self.epsilon = epsilon
        self.scores_history = deque(maxlen=settings.LOG_EVERY)
        self.score_record = None
        self.steps_per_second = deque(maxlen=settings.LOG_EVERY)
        self.actions_statistic = deque(
            maxlen=int(settings.LOG_EVERY * settings.SECONDS_PER_EXPISODE * settings.FPS_COMPENSATION))

    def get_action(self, action: int):
        num_of_logged_actions = len(self.actions_statistic)
        if num_of_logged_actions <= 0:
            return 0
        return self.actions_statistic.count(action) / num_of_logged_actions

    def get_steps_per_second(self):
        if len(self.steps_per_second) > 0:
            return sum(self.steps_per_second) / len(self.steps_per_second)
        return 0

    def get_preview_data(self):
        if self.agent.prev_camera is not None and self.agent.initialized:
            return cv2.cvtColor(self.agent.prev_camera, cv2.COLOR_RGB2BGR)
        return np.zeros((settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[1],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[0],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[2]))

    def get_mean_score(self):
        if len(self.scores_history) > 0:
            return sum(self.scores_history) / len(self.scores_history)
        return 0

    def get_episode(self):
        return self.episode

    def run(self) -> None:
        logger.info(f"Trainer {self.identifier} started")
        while not self.terminate:
            if self.halt:
                time.sleep(0.1)
                continue

            reward = None
            episode_reward = 0
            step = 1

            try:
                state = self.agent.spawn()
                self.fail_flag = False
            except:
                self.fail_flag = True
                break

            episode_data_memory = deque()
            while not self.fail_flag:
                start_step_time = time.time()

                if self.epsilon is None or np.random.random() > self.epsilon:
                    self.action = int(np.argmax(self.get_qs(state)))
                    self.actions_statistic.append(self.action)
                else:
                    self.action = random.choice(list(settings.ACTIONS.keys()))

                try:
                    new_state, reward, done = self.agent.step(self.action)
                except:
                    logger.error(f"Trainer {self.identifier} - Failed to make step")
                    self.fail_flag = True
                    break

                episode_data_memory.append((state, self.action, reward, new_state, done))
                state = new_state
                episode_reward += reward

                if done:
                    self.agent.clear_agent()
                    self.action = None
                    break

                time_diff1 = self.agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
                time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
                if time_diff1 > 0:
                    time.sleep(min(0.125, time_diff1))
                elif time_diff2 > 0:
                    time.sleep(min(0.125, time_diff2))

                step += 1

            if not reward or not self.agent.episode_start:
                continue

            episode_time = time.time() - self.agent.episode_start
            if episode_time == 0:
                episode_time = 1e-9
            average_steps_per_second = step / episode_time
            self.steps_per_second.append(average_steps_per_second)

            reward_factor = settings.FPS_COMPENSATION / average_steps_per_second
            episode_reward_weighted = (
                (episode_reward - reward) * reward_factor + reward) * settings.EPISODE_REWARD_MULTIPLIER

            if episode_time > settings.MINIMUM_EPISODE_LENGTH:
                self.update_replay_memory(episode_data_memory)
                self.scores_history.append(episode_reward_weighted)
                self.episode += 1

            del episode_data_memory

        logger.info(f"Trainer {self.identifier} stopped")