Example 1
# NOTE: Config, make_parallel_env, update_config, Agent, QLearner, reset_wrapper
# and step_wrapper are assumed to come from the surrounding project code.
import os

import numpy as np
import torch


def render_episodes():
    from PIL import Image
    config = Config()
    n = 1
    env = make_parallel_env(n, 9999)
    update_config(env, config)

    model_path = "/home/liub/Desktop/mount/teamstrategy/coach1/mpe/aqmix+coach+vi2+ctr4+l20.001/run0"
    #save_path = f"imgs/{config.method}/"

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []

    for it in range(20):
        save_path = f"imgs/{config.method}/it{it}/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        #fourcc = VideoWriter_fourcc(*'MP4V')
        #video = VideoWriter(f"{save_path}/epi{it+1}.mp4", fourcc, float(12), (700,700))
        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        prev_z = torch.zeros(o.shape[0], o.shape[1], config.coach_hidden_dim).to(config.device)
        print(c[0,:4])

        episode_reward = 0
        for t in range(config.max_steps):
            if "full" in config.method:
                m = ms
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
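            # Coach broadcast: every `centralized_every` steps the coach emits a team
            # strategy z_team that is pushed to all agents.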
            if config.has_coach and t % config.centralized_every == 0:
                z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                mac.set_team_strategy(z_team)

            frame = env.envs[0].render(mode="rgb_array")[0]
            #video.write(np.uint8(frame))
            #if t == 10:
                #print(o[0,:4])
            im = Image.fromarray(frame)
            im.save(f"{save_path}t{t}.jpg")

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon=0.)
            prev_a = torch.LongTensor(actions).to(config.device)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()

            #if (t+1) % config.centralized_every == 0 and config.has_coach:
            #    prev_z = z

        all_rewards.append(episode_reward)
        #video.release()
    all_rewards = np.array(all_rewards)
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
Example 2
# NOTE: the same project helpers (Config, Agent, QLearner, env wrappers) as above
# are assumed to be importable.
import numpy as np
from tqdm import tqdm


def test_training():
    config = Config()
    n = 1
    env = make_parallel_env(n, 100000)
    update_config(env, config)

    model_path = "/home/liub/Desktop/mount/teamstrategy/oldmodels/mpe/aqmix+coach+vi2+ctr8+l10.0001+l20.0001/run0"
    #model_path = "/home/liub/Desktop/mount/teamstrategy/models/mpe/aqmix+ctr8+l10.0001+l20.0001/run0"

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []

    orders = 0
    tt_orders = 1e-12  # tiny denominator guard so the broadcast rate is defined even with no opportunities
    for it in tqdm(range(100)):
        o, e, c, m, ms = reset_wrapper(env)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

        episode_reward = 0
        prev_z = None
        for t in range(config.max_steps):
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                _, z_team, logvar = qlearner.coach(o_, e_, c_, ms_)
                if prev_z is None:
                    mac.set_team_strategy(z_team)
                    prev_z = z_team
                else:
                    mask = ms_.sum(-1).gt(0).float()  # 1 for live agents, 0 for padded slots
                    #normal = D.Normal(z_team, (0.5*logvar).exp())
                    #logprob = normal.log_prob(prev_z).sum(-1)
                    #prob = logprob.exp()
                    #broadcast = (prob > 0.001).float()
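                    # Re-broadcast only to agents whose new strategy moved more than an
                    # L2 distance of 5 from the previously broadcast one.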
                    l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()
                    broadcast = (l2 > 5).float()
                    mac.set_part_team_strategy(z_team, broadcast)
                    orders += (broadcast * mask).sum()
                    tt_orders += mask.sum()
                    prev_z = mac.z_team.clone()

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, epsilon=0.)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()

        all_rewards.append(episode_reward)
    all_rewards = np.array(all_rewards)
    print(f"broadcast rate {orders/tt_orders}")
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
Example 3
# NOTE: settings, MODEL_WEIGHTS, ModelHandler, Agent, TraficControlThread,
# WeatherControlThread and logger are assumed to come from the surrounding project.
import time

import carla
import numpy as np


def play():
  client = carla.Client(settings.CONNECTION_IP, settings.CONNECTION_PORT)
  client.set_timeout(20.0)

  # Create controllers
  trafic_control = TraficControlThread(client)
  weather_control = WeatherControlThread(client)
  trafic_control.start()
  weather_control.start()
  logger.info("Controllers started")

  predicter = ModelHandler(settings.MODEL_NAME, target_weights_path=MODEL_WEIGHTS, train=False)
  agent = Agent(999999, client, False)

  try:
    while True:
      step = 1

      state = agent.spawn()

      while True:
        start_step_time = time.time()

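        # Greedy action: index of the largest predicted Q-value for the current state.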
        action = int(np.argmax(predicter.get_qs(state)))
        new_state, _, done = agent.step(action)
        state = new_state

        if done:
          agent.clear_agent()
          break

        # Pace the loop to settings.FPS_COMPENSATION frames per second.
        time_diff1 = agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
        time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
        if time_diff1 > 0:
          time.sleep(min(0.125, time_diff1))
        elif time_diff2 > 0:
          time.sleep(min(0.125, time_diff2))

        step += 1  # advance the per-episode step counter used for pacing
  except KeyboardInterrupt:
    logger.info("Exiting playing - Keyboard interrupt")
  except Exception:
    logger.error("Playing failed")
  finally:
    trafic_control.terminate = True
    weather_control.terminate = True
Example 4
# NOTE: n_eval is assumed to be defined at module level in the original file;
# the project helpers (Agent, QLearner, env wrappers) are assumed importable as above.
import numpy as np
import torch
from tqdm import tqdm


def test_exp(config, fn, exp, threshold=0.):
    env = make_parallel_env(1, 9999, fn)
    update_config(env, config)
    config.method = exp
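    # Parse the broadcast interval from the experiment name, e.g. "ctr4" -> 4
    # (a single-digit interval is assumed).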
    k = exp.find("ctr")
    config.centralized_every = int(exp[k + 3:k + 4])
    if "165" in exp:
        config.agent_hidden_dim = 165
    else:
        config.agent_hidden_dim = 128

    if "coach" in exp:
        config.has_coach = True

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)

    R = []
    OR = []

    for run_num in tqdm([0, 1, 2, 3, 4]):
        model_path = f"/home/liub/Desktop/mount/teamstrategy/coach1/mpe/{exp}/run{run_num}"

        qlearner.load_models(model_path)
        qlearner.cuda()

        reward = 0
        n_orders = 0
        n_total_orders = 1e-12

        for n_ep in range(n_eval):
            o, e, c, m, ms = reset_wrapper(env)
            prev_a = torch.zeros(o.shape[0],
                                 o.shape[1]).long().to(config.device)
            rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

            prev_z = None

            for t in range(145):
                if "full" in exp:
                    m = ms
                if "interval" in exp and t % config.centralized_every == 0:
                    m = ms
                o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

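                # Coach step: sample a team strategy and decide which agents receive it.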
                if config.has_coach and t % config.centralized_every == 0:
                    ma = ms_.sum(-1).gt(0).float()
                    with torch.no_grad():
                        _, z_team, _ = qlearner.coach(o_, e_, c_, ms_)
                    if prev_z is None:
                        mac.set_team_strategy(z_team * ma.unsqueeze(-1))
                        prev_z = z_team
                        n_orders += ma.sum().item()
                        n_total_orders += ma.sum().item()
                    else:
                        #normal = D.Normal(z_team, (0.5*logvar).exp())
                        #logprob = normal.log_prob(prev_z).sum(-1)
                        #prob = logprob.exp()
                        #broadcast = (prob > 0.001).float()
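                        # Re-broadcast only where the masked strategy moved more than
                        # `threshold` in L2 norm since the last broadcast.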
                        l2 = (z_team * ma.unsqueeze(-1) -
                              prev_z * ma.unsqueeze(-1)).pow(2).sum(-1).sqrt()
                        broadcast = (l2 > threshold).float()
                        mac.set_part_team_strategy(z_team, broadcast)
                        n_orders += broadcast.sum().item()
                        n_total_orders += ma.sum().item()
                        prev_z = mac.z_team.clone()

                actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden,
                                               prev_a, 0)
                prev_a = torch.LongTensor(actions).to(config.device)
                o, e, m, ms, r, d = step_wrapper(env, actions)
                reward += r.sum()

        reward = reward / n_eval
        rate = n_orders / n_total_orders

        R.append(reward)
        OR.append(rate)

    R = np.array(R)
    OR = np.array(OR)
    print(
        f"{exp:30s}[{threshold:5.1f}] | muR: {R.mean():.4f} stdR: {R.std()/np.sqrt(5):.4f} | muC: {OR.mean():.4f} stdC: {OR.std()/np.sqrt(5):.4f}"
    )
    return R.mean(), R.std(), OR.mean(), OR.std()
Example 5
# NOTE: Config, prerun, make_parallel_env, update_config, Agent, QLearner and the
# env wrappers are assumed to come from the surrounding project; SummaryWriter is
# assumed to be the TensorBoard writer (torch.utils.tensorboard or tensorboardX).
import collections

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def run():
    config = Config()
    run_dir, log_dir = prerun(config)

    env = make_parallel_env(config.n_rollout_threads, config.seed)
    update_config(env, config)

    config.pprint()

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    if config.device == "cuda":
        qlearner.cuda()

    train_stats = {
        "reward": [],
    }

    step = 0
    reward_buffer = collections.deque(maxlen=100)

    use_tqdm = True
    n_iters = config.total_steps // config.max_steps // config.n_rollout_threads

    if use_tqdm:
        pbar = tqdm(total=n_iters)

    prev_update_step = 0

    start_epsilon = 1.0
    end_epsilon = 0.05

    delta = -np.log(end_epsilon) / n_iters
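    # Exponential epsilon decay: exp(-it * delta) falls from 1.0 to end_epsilon over n_iters iterations.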

    logger = SummaryWriter(log_dir)

    for it in range(n_iters):
        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)

        temporal_buffer = collections.deque(maxlen=config.centralized_every+1) # record t=0,1,...T

        episode_reward = 0.
        epsilon = min(start_epsilon, max(end_epsilon, np.exp(-it * delta)))

        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

        for t in range(config.max_steps):
            step += config.n_rollout_threads

            if "full" in config.method:
                m = ms
            if "interval" in config.method and t % config.centralized_every == 0:
                m = ms

            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

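            # Periodic coach broadcast: refresh the shared team strategy every `centralized_every` steps.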
            if config.has_coach and t % config.centralized_every == 0:
                with torch.no_grad():
                    z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                    mac.set_team_strategy(z_team)

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon) # [n_agents,]
            prev_a = torch.LongTensor(actions).to(config.device)

            no, ne, nm, nms, r, d = step_wrapper(env, actions)

            temporal_buffer.append((o, e, c, m, ms, actions, r))
            episode_reward += r

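            # Flush the (centralized_every + 1)-step window into the replay buffer, one trajectory per rollout thread.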
            if t % config.centralized_every == 0 and t > 0:
                O, E, C, M, MS, A, R = map(np.stack, zip(*temporal_buffer))
                for j in range(config.n_rollout_threads):
                    qlearner.buffer.push(O[:,j], E[:,j], C[:,j],
                                         M[:,j], MS[:,j], A[:,j], R[:,j])

            if (step - prev_update_step) >= config.update_every:
                prev_update_step = step
                qlearner.update(logger, step)

            o = no; e = ne; m = nm; ms = nms

        reward_buffer.extend(episode_reward)  # one return per rollout thread
        running_reward_mean = np.array(reward_buffer).mean()
        train_stats["reward"].append((step, running_reward_mean))
        logger.add_scalar("reward", running_reward_mean, step)
        if use_tqdm:
            pbar.update(1)
            pbar.set_description(f"ep {it:10d} | {running_reward_mean:8.4f} |")

        if (it+1) % 100 == 0 or (it+1 == n_iters):
            with open(f"{log_dir}/stats.npy", 'wb') as f:
                np.save(f, train_stats)
            qlearner.save_models(f"{run_dir}")

    if use_tqdm:
        pbar.close()
    env.close()
Example 6
# NOTE: settings, Agent and logger are assumed to come from the surrounding project code.
import random
import time
from collections import deque
from threading import Thread

import cv2
import numpy as np


class Trainer(Thread):
    def __init__(self, client, identifier, epsilon, get_qs_callback,
                 update_replay_memory_callback):
        super().__init__()
        self.daemon = True
        self.client = client

        self.terminate = False
        self.fail_flag = False
        self.halt = False

        self.get_qs = get_qs_callback
        self.update_replay_memory = update_replay_memory_callback
        self.identifier = identifier

        self.agent = Agent(identifier, self.client, True)

        self.action = None
        self.episode = 0
        self.epsilon = epsilon
        self.scores_history = deque(maxlen=settings.LOG_EVERY)
        self.score_record = None
        self.steps_per_second = deque(maxlen=settings.LOG_EVERY)

        self.actions_statistic = deque(
            maxlen=int(settings.LOG_EVERY * settings.SECONDS_PER_EXPISODE *
                       settings.FPS_COMPENSATION))

    def get_action(self, action: int):
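        # Fraction of recently logged actions equal to `action` (0 if nothing logged yet).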
        num_of_logged_actions = len(self.actions_statistic)
        if num_of_logged_actions <= 0: return 0
        return self.actions_statistic.count(action) / num_of_logged_actions

    def get_steps_per_second(self):
        if len(self.steps_per_second) > 0:
            return sum(self.steps_per_second) / len(self.steps_per_second)
        return 0

    def get_preview_data(self):
        if self.agent.prev_camera is not None and self.agent.initialized:
            return cv2.cvtColor(self.agent.prev_camera, cv2.COLOR_RGB2BGR)
        return np.zeros((settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[1],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[0],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[2]))

    def get_mean_score(self):
        if len(self.scores_history) > 0:
            return sum(self.scores_history) / len(self.scores_history)
        return 0

    def get_episode(self):
        return self.episode

    def run(self) -> None:
        logger.info(f"Trainer {self.identifier} started")

        while not self.terminate:
            if self.halt:
                time.sleep(0.1)
                continue

            reward = None
            episode_reward = 0
            step = 1

            try:
                state = self.agent.spawn()
                self.fail_flag = False
            except Exception:
                self.fail_flag = True
                break

            episode_data_memory = deque()

            while not self.fail_flag:
                start_step_time = time.time()

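                # Epsilon-greedy: exploit the Q-network, otherwise pick a random action from settings.ACTIONS.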
                if self.epsilon is None or np.random.random() > self.epsilon:
                    self.action = int(np.argmax(self.get_qs(state)))
                    self.actions_statistic.append(self.action)
                else:
                    self.action = random.choice(list(settings.ACTIONS.keys()))

                try:
                    new_state, reward, done = self.agent.step(self.action)
                except Exception:
                    logger.error(
                        f"Trainer {self.identifier} - Failed to make step")
                    self.fail_flag = True
                    break

                episode_data_memory.append(
                    (state, self.action, reward, new_state, done))
                state = new_state

                episode_reward += reward

                if done:
                    self.agent.clear_agent()
                    self.action = None
                    break

                time_diff1 = (self.agent.episode_start +
                              step / settings.FPS_COMPENSATION - time.time())
                time_diff2 = (start_step_time +
                              1 / settings.FPS_COMPENSATION - time.time())
                if time_diff1 > 0:
                    time.sleep(min(0.125, time_diff1))
                elif time_diff2 > 0:
                    time.sleep(min(0.125, time_diff2))

                step += 1

            if reward is None or not self.agent.episode_start: continue

            episode_time = time.time() - self.agent.episode_start
            if episode_time == 0: episode_time = 1e-9
            average_steps_per_second = step / episode_time

            self.steps_per_second.append(average_steps_per_second)

            reward_factor = settings.FPS_COMPENSATION / average_steps_per_second
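            # Scale all but the final (terminal) reward by the FPS compensation factor,
            # then apply the global episode-reward multiplier.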
            episode_reward_weighted = (
                (episode_reward - reward) * reward_factor +
                reward) * settings.EPISODE_REWARD_MULTIPLIER

            if episode_time > settings.MINIMUM_EPISODE_LENGTH:
                self.update_replay_memory(episode_data_memory)
                self.scores_history.append(episode_reward_weighted)
                self.episode += 1

            del episode_data_memory

        logger.info(f"Trainer {self.identifier} stopped")