def render_episodes():
    """Roll out 20 greedy evaluation episodes with a trained QMIX+coach model,
    saving a JPEG rendering of the first environment at every step under
    imgs/<method>/it<episode>/.

    Returns:
        The mean total episode reward over the 20 episodes.
    """
    from PIL import Image
    config = Config()
    n = 1  # single parallel environment
    env = make_parallel_env(n, 9999)  # 9999 presumably the env seed — TODO confirm
    update_config(env, config)
    # Hard-coded checkpoint directory of the model to visualize.
    model_path = "/home/liub/Desktop/mount/teamstrategy/coach1/mpe/aqmix+coach+vi2+ctr4+l20.001/run0"
    #save_path = f"imgs/{config.method}/"
    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()
    all_rewards = []
    for it in range(20):
        save_path = f"imgs/{config.method}/it{it}/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        #fourcc = VideoWriter_fourcc(*'MP4V')
        #video = VideoWriter(f"{save_path}/epi{it+1}.mp4", fourcc, float(12), (700,700))
        # o/e/c/m/ms: observation, entity, coach-input and mask arrays from the
        # wrapper — assumed leading dims (n_envs, n_agents); verify against
        # reset_wrapper.
        o, e, c, m, ms = reset_wrapper(env)
        # Previous actions start as zeros (one per env/agent).
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        # NOTE(review): prev_z is initialized but never read below — the update
        # at the bottom of the step loop is commented out.
        prev_z = torch.zeros(o.shape[0], o.shape[1], config.coach_hidden_dim).to(config.device)
        print(c[0,:4])
        episode_reward = 0
        for t in range(config.max_steps):
            # "full" methods observe the full mask at every step.
            if "full" in config.method:
                m = ms
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            # Coach broadcasts a fresh team strategy every `centralized_every` steps.
            if config.has_coach and t % config.centralized_every == 0:
                z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                mac.set_team_strategy(z_team)
            # Render the first environment and save this step's frame.
            frame = env.envs[0].render(mode="rgb_array")[0]
            #video.write(np.uint8(frame))
            #if t == 10:
            #print(o[0,:4])
            im = Image.fromarray(frame)
            im.save(f"{save_path}t{t}.jpg")
            # Greedy action selection (epsilon = 0).
            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon=0.)
            prev_a = torch.LongTensor(actions).to(config.device)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()
            #if (t+1) % config.centralized_every == 0 and config.has_coach:
            #    prev_z = z
        all_rewards.append(episode_reward)
        #video.release()
    all_rewards = np.array(all_rewards)
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
def test_training():
    """Evaluate a trained QMIX+coach checkpoint over 100 greedy episodes while
    measuring how often the coach actually broadcasts a new strategy.

    A strategy is re-broadcast to an agent only when the L2 distance between
    the newly computed team strategy and the previously broadcast one exceeds
    a fixed threshold (5).

    Returns:
        The mean total episode reward over the 100 episodes.
    """
    config = Config()
    n = 1  # single parallel environment
    env = make_parallel_env(n, 100000)  # 100000 presumably the env seed — TODO confirm
    update_config(env, config)
    # Hard-coded checkpoint directory to evaluate.
    model_path = "/home/liub/Desktop/mount/teamstrategy/oldmodels/mpe/aqmix+coach+vi2+ctr8+l10.0001+l20.0001/run0"
    #model_path = "/home/liub/Desktop/mount/teamstrategy/models/mpe/aqmix+ctr8+l10.0001+l20.0001/run0"
    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()
    all_rewards = []
    #orders = tt_orders = 0
    orders = 0
    # Tiny epsilon denominator so the broadcast rate never divides by zero.
    tt_orders = 1e-12
    for it in tqdm(range(100)):
        o, e, c, m, ms = reset_wrapper(env)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        episode_reward = 0
        prev_z = None  # last broadcast team strategy (None until first coach step)
        for t in range(config.max_steps):
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                _, z_team, logvar = qlearner.coach(o_, e_, c_, ms_)
                if prev_z is None:
                    # First coach step: broadcast to everyone unconditionally.
                    mac.set_team_strategy(z_team)
                    prev_z = z_team
                else:
                    # NOTE(review): `n` here shadows the outer `n`; `bs` is unused.
                    bs, n = z_team.shape[:2]
                    # Active-agent mask: 1 where the agent's mask row is non-empty.
                    mask = ms_.sum(-1).gt(0).float()
                    #normal = D.Normal(z_team, (0.5*logvar).exp())
                    #logprob = normal.log_prob(prev_z).sum(-1)
                    #prob = logprob.exp()
                    #broadcast = (prob > 0.001).float()
                    #import pdb; pdb.set_trace()
                    # Broadcast only where the new strategy drifted far (L2 > 5)
                    # from the previously broadcast one.
                    l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()
                    broadcast = (l2 > 5).float()
                    mac.set_part_team_strategy(z_team, broadcast)
                    #import pdb; pdb.set_trace()
                    orders += (broadcast * mask).sum()
                    tt_orders += mask.sum()
                    # Track what the agents actually hold after the partial update.
                    prev_z = mac.z_team.clone()
            # Greedy action selection (epsilon = 0).
            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, epsilon=0.)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()
        all_rewards.append(episode_reward)
    all_rewards = np.array(all_rewards)
    print(f"broadcast rate {orders/tt_orders}")
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
def play():
    """Run a trained CARLA driving agent in an endless greedy play loop.

    Connects to the CARLA server, starts background traffic/weather controller
    threads, then repeatedly spawns the agent and drives it with argmax actions
    from the trained model until interrupted. On exit (including Ctrl-C), the
    controller threads are signalled to terminate.
    """
    client = carla.Client(settings.CONNECTION_IP, settings.CONNECTION_PORT)
    client.set_timeout(20.0)
    # Create controllers
    trafic_control = TraficControlThread(client)
    weather_control = WeatherControlThread(client)
    trafic_control.start()
    weather_control.start()
    logger.info("Controllers started")
    predicter = ModelHandler(settings.MODEL_NAME, target_weights_path=MODEL_WEIGHTS, train=False)
    agent = Agent(999999, client, False)
    try:
        while True:
            step = 1
            state = agent.spawn()
            while True:
                start_step_time = time.time()
                # Greedy policy: highest-Q action from the trained model.
                action = int(np.argmax(predicter.get_qs(state)))
                new_state, _, done = agent.step(action)
                state = new_state
                if done:
                    agent.clear_agent()
                    break
                # Pace the loop to the configured FPS: first against the
                # episode start time, then against this step's start time.
                time_diff1 = agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
                time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
                if time_diff1 > 0:
                    time.sleep(min(0.125, time_diff1))
                elif time_diff2 > 0:
                    time.sleep(min(0.125, time_diff2))
                # BUG FIX: step was never incremented, so the episode-relative
                # pacing term (time_diff1) was computed with step == 1 forever.
                # Trainer.run increments step the same way.
                step += 1
    except KeyboardInterrupt:
        logger.info("Exiting playing - Keyboard interrupt")
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/GeneratorExit are not
        # swallowed; still best-effort — log and fall through to cleanup.
        logger.error("Playing failed")
    finally:
        trafic_control.terminate = True
        weather_control.terminate = True
def test_exp(config, fn, exp, threshold=0.):
    """Evaluate one experiment's five training runs and report reward and
    coach-broadcast rate.

    Args:
        config: mutable experiment Config; method/interval fields are
            overwritten from `exp` before evaluation.
        fn: scenario argument forwarded to make_parallel_env.
        exp: experiment name, e.g. "aqmix+coach+vi2+ctr4+l20.001"; the digit
            after "ctr" sets `centralized_every`, "165" selects the larger
            agent hidden size, "coach" enables the coach.
        threshold: L2 drift above which a new team strategy is re-broadcast.

    Returns:
        (mean reward, std reward, mean broadcast rate, std broadcast rate)
        across the five runs. Relies on a module-level `n_eval`.
    """
    env = make_parallel_env(1, 9999, fn)
    update_config(env, config)
    config.method = exp
    # Parse the single digit following "ctr" as the coach interval.
    k = exp.find("ctr")
    config.centralized_every = int(exp[k + 3:k + 4])
    if "165" in exp:
        config.agent_hidden_dim = 165
    else:
        config.agent_hidden_dim = 128
    if "coach" in exp:
        config.has_coach = True
    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    R = []
    OR = []
    for run_num in tqdm([0, 1, 2, 3, 4]):
        model_path = f"/home/liub/Desktop/mount/teamstrategy/coach1/mpe/{exp}/run{run_num}"
        qlearner.load_models(model_path)
        qlearner.cuda()
        reward = 0
        n_orders = 0
        n_total_orders = 1e-12  # epsilon so the rate never divides by zero
        for n_ep in range(n_eval):
            o, e, c, m, ms = reset_wrapper(env)
            prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
            rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
            prev_z = None  # last broadcast team strategy
            for t in range(145):
                # "full": always see the full mask; "interval": only on coach steps.
                if "full" in exp:
                    m = ms
                if "interval" in exp and t % config.centralized_every == 0:
                    m = ms
                o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
                if config.has_coach and t % config.centralized_every == 0:
                    # Active-agent mask: 1 where the agent's mask row is non-empty.
                    ma = ms_.sum(-1).gt(0).float()
                    with torch.no_grad():
                        _, z_team, _ = qlearner.coach(o_, e_, c_, ms_)
                    if prev_z is None:
                        # First coach step: broadcast to all active agents.
                        mac.set_team_strategy(z_team * ma.unsqueeze(-1))
                        prev_z = z_team
                        n_orders += ma.sum().item()
                        n_total_orders += ma.sum().item()
                    else:
                        bs, n = z_team.shape[:2]
                        #normal = D.Normal(z_team, (0.5*logvar).exp())
                        #logprob = normal.log_prob(prev_z).sum(-1)
                        #prob = logprob.exp()
                        #broadcast = (prob > 0.001).float()
                        #import pdb; pdb.set_trace()
                        # Re-broadcast only where the strategy drifted beyond threshold.
                        l2 = (z_team * ma.unsqueeze(-1) - prev_z * ma.unsqueeze(-1)).pow(2).sum(-1).sqrt()
                        broadcast = (l2 > threshold).float()
                        mac.set_part_team_strategy(z_team, broadcast)
                        #import pdb; pdb.set_trace()
                        n_orders += broadcast.sum().item()
                        n_total_orders += ma.sum().item()
                        # Track what the agents actually hold after the partial update.
                        prev_z = mac.z_team.clone()
                actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, 0)
                prev_a = torch.LongTensor(actions).to(config.device)
                o, e, m, ms, r, d = step_wrapper(env, actions)
                reward += r.sum()
        reward = reward / n_eval
        rate = n_orders / n_total_orders
        R.append(reward)
        OR.append(rate)
    R = np.array(R)
    OR = np.array(OR)
    # BUG FIX: `{threshold:3d}` raised ValueError for the float default
    # threshold=0. — the 'd' format code is integer-only; ':3g' handles both.
    print(
        f"{exp:30s}[{threshold:3g}] | muR: {R.mean():.4f} stdR: {R.std()/np.sqrt(5):.4f} | muC: {OR.mean():.4f} stdC: {OR.std()/np.sqrt(5):.4f}"
    )
    return R.mean(), R.std(), OR.mean(), OR.std()
def run():
    """Main QMIX(+coach) training loop.

    Collects rollouts from parallel environments, pushes fixed-length temporal
    segments into the replay buffer, periodically updates the learner, logs a
    running mean reward to TensorBoard, and checkpoints models every 100
    iterations (and at the end).
    """
    config = Config()
    run_dir, log_dir = prerun(config)
    env = make_parallel_env(config.n_rollout_threads, config.seed)
    update_config(env, config)
    config.pprint()
    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    if config.device == "cuda":
        qlearner.cuda()
    train_stats = {
        "reward": [],
    }
    step = 0  # total environment steps across all rollout threads
    reward_buffer = collections.deque(maxlen=100)  # running-mean window
    # NOTE(review): pbar is used unconditionally below, so use_tqdm must stay True.
    use_tqdm = True
    n_iters = config.total_steps // config.max_steps // config.n_rollout_threads
    if use_tqdm:
        pbar = tqdm(total=n_iters)
    prev_update_step = 0
    # Exponential epsilon decay from 1.0 down to 0.05 over n_iters iterations.
    start_epsilon = 1.0
    end_epsilon = 0.05
    delta = -np.log(end_epsilon) / n_iters
    logger = SummaryWriter(log_dir)
    for it in range(n_iters):
        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
        temporal_buffer = collections.deque(maxlen=config.centralized_every+1) # record t=0,1,...T
        episode_reward = 0.
        epsilon = min(start_epsilon, max(end_epsilon, np.exp(-it * delta)))
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        for t in range(config.max_steps):
            step += config.n_rollout_threads
            # "full": always see the full mask; "interval": only on coach steps.
            if "full" in config.method:
                m = ms
            if "interval" in config.method and t % config.centralized_every == 0:
                m = ms
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            # Coach broadcasts a fresh team strategy every `centralized_every` steps.
            if config.has_coach and t % config.centralized_every == 0:
                with torch.no_grad():
                    z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                mac.set_team_strategy(z_team)
            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon) # [n_agents,]
            prev_a = torch.LongTensor(actions).to(config.device)
            no, ne, nm, nms, r, d = step_wrapper(env, actions)
            temporal_buffer.append((o, e, c, m, ms, actions, r))
            episode_reward += r
            # Once a full temporal segment is collected, push one trajectory
            # per rollout thread into the replay buffer.
            if t % config.centralized_every == 0 and t > 0:
                O, E, C, M, MS, A, R = map(np.stack, zip(*temporal_buffer))
                for j in range(config.n_rollout_threads):
                    qlearner.buffer.push(O[:,j], E[:,j], C[:,j], M[:,j], MS[:,j], A[:,j], R[:,j])
                # Learner update every `update_every` environment steps.
                if (step - prev_update_step) >= config.update_every:
                    prev_update_step = step
                    qlearner.update(logger, step)
            o = no; e = ne; m = nm; ms = nms
        # episode_reward is per-thread; extend feeds each thread's total into
        # the running window.
        reward_buffer.extend(episode_reward)
        pbar.update(1)
        running_reward_mean = np.array(reward_buffer).mean()
        train_stats["reward"].append((step, running_reward_mean))
        logger.add_scalar("reward", running_reward_mean, step)
        pbar.set_description(f"ep {it:10d} | {running_reward_mean:8.4f} |")
        # Periodic checkpoint of stats and model weights.
        if (it+1) % 100 == 0 or (it+1 == n_iters):
            with open(f"{log_dir}/stats.npy", 'wb') as f:
                np.save(f, train_stats)
                f.close()  # NOTE(review): redundant inside the `with` block
            qlearner.save_models(f"{run_dir}")
    if use_tqdm:
        pbar.close()
    env.close()
class Trainer(Thread):
    """Background worker thread that drives one CARLA agent, collects
    (state, action, reward, next_state, done) transitions, and feeds finished
    episodes into a shared replay memory via callback.

    Also maintains rolling statistics (scores, steps/second, action counts)
    for logging/monitoring.
    """

    def __init__(self, client, identifier, epsilon, get_qs_callbatch, update_replay_memory_callback):
        """
        Args:
            client: connected CARLA client.
            identifier: numeric id of this trainer (used for logging and the Agent).
            epsilon: exploration rate, or None to always act greedily.
            get_qs_callbatch: callback returning Q-values for a state.
            update_replay_memory_callback: callback receiving a finished
                episode's transition deque.
        """
        super().__init__()
        self.daemon = True
        self.client = client
        self.terminate = False   # set externally to stop the run loop
        self.fail_flag = False   # set when spawn/step fails; aborts the episode
        self.halt = False        # set externally to pause without terminating
        self.get_qs = get_qs_callbatch
        self.update_replay_memory = update_replay_memory_callback
        self.identifier = identifier
        self.agent = Agent(identifier, self.client, True)
        self.action = None
        self.episode = 0
        self.epsilon = epsilon
        self.scores_history = deque(maxlen=settings.LOG_EVERY)
        self.score_record = None
        self.steps_per_second = deque(maxlen=settings.LOG_EVERY)
        self.actions_statistic = deque(
            maxlen=int(settings.LOG_EVERY * settings.SECONDS_PER_EXPISODE * settings.FPS_COMPENSATION))

    def get_action(self, action: int):
        """Return the fraction of recently logged actions equal to `action`."""
        num_of_logged_actions = len(self.actions_statistic)
        if num_of_logged_actions <= 0:
            return 0
        return self.actions_statistic.count(action) / num_of_logged_actions

    def get_steps_per_second(self):
        """Return the mean of the recent steps-per-second window, or 0."""
        if len(self.steps_per_second) > 0:
            return sum(self.steps_per_second) / len(self.steps_per_second)
        return 0

    def get_preview_data(self):
        """Return the agent's last camera frame as BGR, or a zero image
        when no frame is available yet."""
        if self.agent.prev_camera is not None and self.agent.initialized:
            return cv2.cvtColor(self.agent.prev_camera, cv2.COLOR_RGB2BGR)
        return np.zeros((settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[1],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[0],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[2]))

    def get_mean_score(self):
        """Return the mean of the recent weighted episode scores, or 0."""
        if len(self.scores_history) > 0:
            return sum(self.scores_history) / len(self.scores_history)
        return 0

    def get_episode(self):
        """Return the number of completed episodes."""
        return self.episode

    def run(self) -> None:
        """Episode loop: spawn agent, act (epsilon-greedy), record transitions,
        pace to the target FPS, then weight the episode score and push the
        transitions to replay memory."""
        logger.info(f"Trainer {self.identifier} started")
        while not self.terminate:
            if self.halt:
                time.sleep(0.1)
                continue
            reward = None
            episode_reward = 0
            step = 1
            try:
                state = self.agent.spawn()
                self.fail_flag = False
            except Exception:
                # Narrowed from bare `except:`; spawn failure is fatal for
                # this trainer thread.
                self.fail_flag = True
                break
            episode_data_memory = deque()
            while not self.fail_flag:
                start_step_time = time.time()
                # Epsilon-greedy: exploit the model or take a random action.
                if self.epsilon is None or np.random.random() > self.epsilon:
                    self.action = int(np.argmax(self.get_qs(state)))
                    self.actions_statistic.append(self.action)
                else:
                    self.action = random.choice(list(settings.ACTIONS.keys()))
                try:
                    new_state, reward, done = self.agent.step(self.action)
                except Exception:
                    # Narrowed from bare `except:`; abort the episode on a
                    # failed simulator step.
                    logger.error(
                        f"Trainer {self.identifier} - Failed to make step")
                    self.fail_flag = True
                    break
                episode_data_memory.append(
                    (state, self.action, reward, new_state, done))
                state = new_state
                episode_reward += reward
                if done:
                    self.agent.clear_agent()
                    self.action = None
                    break
                # Pace the loop to the configured FPS: first against the
                # episode start time, then against this step's start time.
                time_diff1 = self.agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
                time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
                if time_diff1 > 0:
                    time.sleep(min(0.125, time_diff1))
                elif time_diff2 > 0:
                    time.sleep(min(0.125, time_diff2))
                step += 1
            # Episode produced nothing usable (no step succeeded / no start time).
            if not reward or not self.agent.episode_start:
                continue
            episode_time = time.time() - self.agent.episode_start
            if episode_time == 0:
                # BUG FIX: was `10 ^ -9`, which is bitwise XOR (== -3) and made
                # average_steps_per_second negative; intended a tiny epsilon.
                episode_time = 1e-9
            average_steps_per_second = step / episode_time
            self.steps_per_second.append(average_steps_per_second)
            # Scale all but the terminal reward by how far the achieved step
            # rate deviates from the target FPS.
            reward_factor = settings.FPS_COMPENSATION / average_steps_per_second
            episode_reward_weighted = (
                (episode_reward - reward) * reward_factor + reward) * settings.EPISODE_REWARD_MULTIPLIER
            # Discard episodes that ended too quickly.
            if episode_time > settings.MINIMUM_EPISODE_LENGTH:
                self.update_replay_memory(episode_data_memory)
                self.scores_history.append(episode_reward_weighted)
            self.episode += 1
            del episode_data_memory
        logger.info(f"Trainer {self.identifier} stopped")