Code example #1
    def on_load(self):
        self.sector_manager = Sector_Manager(SECTORS)
        self.route_manager = Route_Manager(ROUTES,
                                           test_routes=VISUALIZE,
                                           draw_paths=VISUALIZE)
        self.traffic_manager = Traffic_Manager(
            max_ac=MAX_AC,
            times=TIME_SEP,
            max_spd=CONSTRAINTS["cas"]["max"],
            min_spd=CONSTRAINTS["cas"]["min"],
            max_alt=32000,
            min_alt=32000,
            network=self.route_manager)

        self.memory = Memory()

        self.agent = Agent(state_size=STATE_SHAPE,
                           action_size=ACTION_SHAPE,
                           value_size=VALUE_SHAPE)

        try:
            self.agent.load(path=FILE + "best.h5")
        except Exception:
            # Fall back to the regular checkpoint; start from scratch if neither exists
            try:
                self.agent.load(path=FILE + ".h5")
            except Exception:
                pass

        self.initilized = True

        print("ATC: READY")
        string = "=================================\n   UPDATE: RUNNING EPOCH {}\n=================================\n".format(
            self.format_epoch())
        self.print_all(string)
Code example #2
    def __init__(self, client, identifier, epsilon, get_qs_callbatch,
                 update_replay_memory_callback):
        super().__init__()
        self.daemon = True
        self.client = client

        self.terminate = False
        self.fail_flag = False
        self.halt = False

        self.get_qs = get_qs_callbatch
        self.update_replay_memory = update_replay_memory_callback
        self.identifier = identifier

        self.agent = Agent(identifier, self.client, True)

        self.action = None
        self.episode = 0
        self.epsilon = epsilon
        self.scores_history = deque(maxlen=settings.LOG_EVERY)
        self.score_record = None
        self.steps_per_second = deque(maxlen=settings.LOG_EVERY)

        self.actions_statistic = deque(
            maxlen=int(settings.LOG_EVERY * settings.SECONDS_PER_EXPISODE *
                       settings.FPS_COMPENSATION))
Code example #3
    def __init__(self, world):

        self.world = world.split('\n    ')[1:-1]
        self.action_map = {0: 'right', 1: 'down', 2: 'left', 3: 'up'}
        self.action_space = [0, 1, 2, 3]
        self.slip = 0.2  # 20% chance of taking the wrong action

        self.col = len(self.world[0])  # number of columns in the world string (e.g. 10)
        self.row = len(self.world)  # number of rows in the world string (e.g. 5)
        self.state_color = (50, 100, 10)
        self.renderfirst = True
        self.policy = {}
        self.episode_step = 0
        self._max_epi_step = 1000

        self.wall_group = pg.sprite.Group()
        self.state_group = pg.sprite.Group()

        self.state_dict = defaultdict(lambda: 0)

        i = 0
        for y, et_row in enumerate(self.world):
            for x, block_type in enumerate(et_row):

                if block_type == 'w':
                    self.wall_group.add(Wall(col=x, row=y))

                elif block_type == 'a':
                    self.agent = Agent(col=x, row=y)
                    self.state_group.add(State(col=x, row=y))
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': -1,
                        'done': False
                    }
                    i += 1

                elif block_type == 'g':
                    self.goal = Goal(col=x, row=y)
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': 10,
                        'done': True
                    }
                    i += 1

                elif block_type == ' ':
                    self.state_group.add(State(col=x, row=y))
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': -1,
                        'done': False
                    }
                    i += 1

        self.state_dict = dict(self.state_dict)
        self.state_count = len(self.state_dict)
Code example #4
File: main.py Project: Cranial-XIX/COPA
def test_training():
    config = Config()
    n = 1
    env = make_parallel_env(n, 100000)
    update_config(env, config)

    model_path = "/home/liub/Desktop/mount/teamstrategy/oldmodels/mpe/aqmix+coach+vi2+ctr8+l10.0001+l20.0001/run0"
    #model_path = "/home/liub/Desktop/mount/teamstrategy/models/mpe/aqmix+ctr8+l10.0001+l20.0001/run0"

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []

    #orders = tt_orders = 0
    orders = 0
    tt_orders = 1e-12
    for it in tqdm(range(100)):
        o, e, c, m, ms = reset_wrapper(env)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

        episode_reward = 0
        prev_z = None
        for t in range(config.max_steps):
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                _, z_team, logvar = qlearner.coach(o_, e_, c_, ms_)
                if prev_z is None:
                    mac.set_team_strategy(z_team)
                    prev_z = z_team
                else:
                    bs, n = z_team.shape[:2]
                    mask = ms_.sum(-1).gt(0).float()
                    #normal = D.Normal(z_team, (0.5*logvar).exp())
                    #logprob = normal.log_prob(prev_z).sum(-1)
                    #prob = logprob.exp()
                    #broadcast = (prob > 0.001).float()
                    #import pdb; pdb.set_trace()
                    l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()
                    broadcast = (l2 > 5).float()
                    mac.set_part_team_strategy(z_team, broadcast)
                    #import pdb; pdb.set_trace()
                    orders += (broadcast * mask).sum()
                    tt_orders += mask.sum()
                    prev_z = mac.z_team.clone()

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, epsilon=0.)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()

        all_rewards.append(episode_reward)
    all_rewards = np.array(all_rewards)
    print(f"broadcast rate {orders/tt_orders}")
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
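
The gate in test_training() above only re-broadcasts the coach's team strategy to an agent when the new z_team has drifted far enough, in L2 distance, from the strategy that agent last received; orders / tt_orders is then the fraction of agent-steps that triggered a broadcast. Below is a small self-contained sketch of that gate with made-up shapes and drift values; only the threshold of 5 is taken from the code above.

import torch

# Illustrative shapes: (batch, n_agents, strategy_dim). All values are made up.
z_team = torch.zeros(1, 4, 8)          # new strategies proposed by the coach
prev_z = torch.zeros(1, 4, 8)          # strategies the agents currently hold
prev_z[0, 0] = 1.0                     # agent 0: small drift  (L2 ~ 2.8)
prev_z[0, 1] = 3.0                     # agent 1: large drift  (L2 ~ 8.5)

l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()   # per-agent distance to the old strategy
broadcast = (l2 > 5).float()                   # 1 = send the new strategy, 0 = keep the old one
print(broadcast)                               # tensor([[0., 1., 0., 0.]])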
Code example #5
File: main.py Project: Cranial-XIX/COPA
def render_episodes():
    from PIL import Image
    config = Config()
    n = 1
    env = make_parallel_env(n, 9999)
    update_config(env, config)

    model_path = "/home/liub/Desktop/mount/teamstrategy/coach1/mpe/aqmix+coach+vi2+ctr4+l20.001/run0"
    #save_path = f"imgs/{config.method}/"

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []

    for it in range(20):
        save_path = f"imgs/{config.method}/it{it}/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        #fourcc = VideoWriter_fourcc(*'MP4V')
        #video = VideoWriter(f"{save_path}/epi{it+1}.mp4", fourcc, float(12), (700,700))
        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        prev_z = torch.zeros(o.shape[0], o.shape[1], config.coach_hidden_dim).to(config.device)
        print(c[0,:4])

        episode_reward = 0
        for t in range(config.max_steps):
            if "full" in config.method:
                m = ms
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                mac.set_team_strategy(z_team)

            frame = env.envs[0].render(mode="rgb_array")[0]
            #video.write(np.uint8(frame))
            #if t == 10:
                #print(o[0,:4])
            im = Image.fromarray(frame)
            im.save(f"{save_path}t{t}.jpg")

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon=0.)
            prev_a = torch.LongTensor(actions).to(config.device)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()

            #if (t+1) % config.centralized_every == 0 and config.has_coach:
            #    prev_z = z

        all_rewards.append(episode_reward)
        #video.release()
    all_rewards = np.array(all_rewards)
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
Code example #6
def train():
    env = make_atari(conf.env_name)
    env = bench.Monitor(env, os.path.join(conf.path_game_scan, conf.env_name))
    env = wrap_deepmind(env,
                        episode_life=True,
                        clip_rewards=True,
                        frame_stack=False,
                        scale=True)
    env = WrapPyTorch(env)
    agent = Agent(conf=conf, env=env, test=False)

    episode_reward = 0
    losses = []
    all_rewards = []
    state = env.reset()  # (1, 84, 84)
    for frame_idx in range(1, conf.max_train_steps + 1):
        epsilon = conf.epsilon_by_frame(frame_idx)

        action = agent.act(state, epsilon, test=False)
        # agent.save_action(action, frame_idx)

        next_state, reward, done, _ = env.step(action)
        next_state = None if done else next_state
        loss = agent.update(state,
                            action,
                            reward,
                            next_state,
                            done,
                            test=False,
                            frame=frame_idx)

        state = next_state  # advance to the next observation (env.reset() below replaces it when done)
        episode_reward += reward

        if done:
            agent.finish_nstep()
            state = env.reset()
            agent.save_reward(episode_reward)
            episode_reward = 0
        if loss is not None:
            losses.append(loss.item())

        if frame_idx % conf.log_freq == 0 and loss is not None:
            print("frame: {}, loss: {}, reward: {}.".format(
                frame_idx, loss.item(), episode_reward))

    if conf.save_curve:
        curve_plot(conf.path_plot, frame_idx, agent.all_rewards, losses)
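
train() above takes its exploration rate from conf.epsilon_by_frame(frame_idx), which is not shown here. The following is a hedged sketch of one common way such a schedule is built, an exponential anneal; the start/final/decay constants and the factory function are illustrative assumptions, not the project's actual conf.

import math

def make_epsilon_by_frame(epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=30000):
    # Exponentially anneal epsilon from epsilon_start toward epsilon_final.
    def epsilon_by_frame(frame_idx):
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)
    return epsilon_by_frame

epsilon_by_frame = make_epsilon_by_frame()
print(epsilon_by_frame(1), epsilon_by_frame(30000))   # ~1.0 at the start, ~0.37 after one decay constant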
Code example #7
def run():

    # uncomment these if you want
    #memory_fix()
    #memory_hard_fix()

    # setup data feed
    dm = DataManager()

    # for _ in range(10):
    #     print(dm.renderer_stream.next())

    # setup exchange. Needs raw data
    binance_exchange = BinanceExchange(data=dm.data)

    # setup portfolio
    binance_portfolio = BinancePortfolio(exchange=binance_exchange)

    # setup environment. Needs data feed stream
    env = Environment(portfolio=binance_portfolio,
                      data_stream=dm.stream,
                      renderer_stream=dm.renderer_stream)

    # for _ in range(10):
    #     print(env.observer.feed.next())

    # setup agent
    agent = Agent(environment=env)

    # train agent
    print(agent.train(steps=100, episodes=4, render_interval=10))

    # show plots of performance
    a = binance_portfolio.performance.plot()
    plt.show()
    b = binance_portfolio.performance.net_worth.plot()
    plt.show()
Code example #8
def play():
  client = carla.Client(settings.CONNECTION_IP, settings.CONNECTION_PORT)
  client.set_timeout(20.0)

  # Create controllers
  trafic_control = TraficControlThread(client)
  weather_control = WeatherControlThread(client)
  trafic_control.start()
  weather_control.start()
  logger.info("Controllers started")

  predicter = ModelHandler(settings.MODEL_NAME, target_weights_path=MODEL_WEIGHTS, train=False)
  agent = Agent(999999, client, False)

  try:
    while True:
      step = 1

      state = agent.spawn()

      while True:
        start_step_time = time.time()

        action = int(np.argmax(predicter.get_qs(state)))
        new_state, _, done = agent.step(action)
        state = new_state

        if done:
          agent.clear_agent()
          break

        time_diff1 = agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
        time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
        if time_diff1 > 0:
          time.sleep(min(0.125, time_diff1))
        elif time_diff2 > 0:
          time.sleep(min(0.125, time_diff2))

        step += 1
  except KeyboardInterrupt:
    logger.info("Exiting playing - Keyboard interrupt")
  except Exception:
    logger.error("Playing failed")
  finally:
    trafic_control.terminate = True
    weather_control.terminate = True
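
play() above and Trainer.run() in code example #14 pace their control loops the same way: sleep just long enough to hold the loop near settings.FPS_COMPENSATION steps per second, capped at 0.125 s so a stalled simulator cannot block the thread for long. Below is a hedged refactoring sketch of that shared piece; pace_step and its parameter names are hypothetical, not part of the project.

import time

def pace_step(episode_start, step, fps, step_start, cap=0.125):
    # Prefer pacing against the episode start; fall back to pacing against the last step.
    time_diff1 = episode_start + step / fps - time.time()
    time_diff2 = step_start + 1 / fps - time.time()
    if time_diff1 > 0:
        time.sleep(min(cap, time_diff1))
    elif time_diff2 > 0:
        time.sleep(min(cap, time_diff2))

# e.g. inside the step loop:
# pace_step(agent.episode_start, step, settings.FPS_COMPENSATION, start_step_time)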
Code example #9
def test_exp(config, fn, exp, threshold=0.):
    env = make_parallel_env(1, 9999, fn)
    update_config(env, config)
    config.method = exp
    k = exp.find("ctr")
    config.centralized_every = int(exp[k + 3:k + 4])
    if "165" in exp:
        config.agent_hidden_dim = 165
    else:
        config.agent_hidden_dim = 128

    if "coach" in exp:
        config.has_coach = True

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)

    R = []
    OR = []

    for run_num in tqdm([0, 1, 2, 3, 4]):
        model_path = f"/home/liub/Desktop/mount/teamstrategy/coach1/mpe/{exp}/run{run_num}"

        qlearner.load_models(model_path)
        qlearner.cuda()

        reward = 0
        n_orders = 0
        n_total_orders = 1e-12

        for n_ep in range(n_eval):
            o, e, c, m, ms = reset_wrapper(env)
            prev_a = torch.zeros(o.shape[0],
                                 o.shape[1]).long().to(config.device)
            rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

            prev_z = None

            for t in range(145):
                if "full" in exp:
                    m = ms
                if "interval" in exp and t % config.centralized_every == 0:
                    m = ms
                o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

                if config.has_coach and t % config.centralized_every == 0:
                    ma = ms_.sum(-1).gt(0).float()
                    with torch.no_grad():
                        _, z_team, _ = qlearner.coach(o_, e_, c_, ms_)
                    if prev_z is None:
                        mac.set_team_strategy(z_team * ma.unsqueeze(-1))
                        prev_z = z_team
                        n_orders += ma.sum().item()
                        n_total_orders += ma.sum().item()
                    else:
                        bs, n = z_team.shape[:2]
                        #normal = D.Normal(z_team, (0.5*logvar).exp())
                        #logprob = normal.log_prob(prev_z).sum(-1)
                        #prob = logprob.exp()
                        #broadcast = (prob > 0.001).float()
                        #import pdb; pdb.set_trace()
                        l2 = (z_team * ma.unsqueeze(-1) -
                              prev_z * ma.unsqueeze(-1)).pow(2).sum(-1).sqrt()
                        broadcast = (l2 > threshold).float()
                        mac.set_part_team_strategy(z_team, broadcast)
                        #import pdb; pdb.set_trace()
                        n_orders += broadcast.sum().item()
                        n_total_orders += ma.sum().item()
                        prev_z = mac.z_team.clone()

                actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden,
                                               prev_a, 0)
                prev_a = torch.LongTensor(actions).to(config.device)
                o, e, m, ms, r, d = step_wrapper(env, actions)
                reward += r.sum()

        reward = reward / n_eval
        rate = n_orders / n_total_orders

        R.append(reward)
        OR.append(rate)

    R = np.array(R)
    OR = np.array(OR)
    print(
        f"{exp:30s}[{threshold:5.1f}] | muR: {R.mean():.4f} stdR: {R.std()/np.sqrt(5):.4f} | muC: {OR.mean():.4f} stdC: {OR.std()/np.sqrt(5):.4f}"
    )
    return R.mean(), R.std(), OR.mean(), OR.std()
Code example #10
File: main.py Project: Cranial-XIX/COPA
def run():
    config = Config()
    run_dir, log_dir = prerun(config)

    env = make_parallel_env(config.n_rollout_threads, config.seed)
    update_config(env, config)

    config.pprint()

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    if config.device == "cuda":
        qlearner.cuda()

    train_stats = {
        "reward": [],
    }

    step = 0
    reward_buffer = collections.deque(maxlen=100)

    use_tqdm = True
    n_iters = config.total_steps // config.max_steps // config.n_rollout_threads

    if use_tqdm:
        pbar = tqdm(total=n_iters)

    prev_update_step = 0

    start_epsilon = 1.0
    end_epsilon = 0.05

    delta = -np.log(end_epsilon) / n_iters

    logger = SummaryWriter(log_dir)

    for it in range(n_iters):
        o, e, c, m, ms = reset_wrapper(env)
        prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)

        temporal_buffer = collections.deque(maxlen=config.centralized_every+1) # record t=0,1,...T

        episode_reward = 0.
        epsilon = min(start_epsilon, max(end_epsilon, np.exp(-it * delta)))

        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

        for t in range(config.max_steps):
            step += config.n_rollout_threads

            if "full" in config.method:
                m = ms
            if "interval" in config.method and t % config.centralized_every == 0:
                m = ms

            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

            if config.has_coach and t % config.centralized_every == 0:
                with torch.no_grad():
                    z_team, _, _ = qlearner.coach(o_, e_, c_, ms_)
                    mac.set_team_strategy(z_team)

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, epsilon) # [n_agents,]
            prev_a = torch.LongTensor(actions).to(config.device)

            no, ne, nm, nms, r, d = step_wrapper(env, actions)

            temporal_buffer.append((o, e, c, m, ms, actions, r))
            episode_reward += r

            if t % config.centralized_every == 0 and t > 0:
                O, E, C, M, MS, A, R = map(np.stack, zip(*temporal_buffer))
                for j in range(config.n_rollout_threads):
                    qlearner.buffer.push(O[:,j], E[:,j], C[:,j],
                                         M[:,j], MS[:,j], A[:,j], R[:,j])

            if (step - prev_update_step) >= config.update_every:
                prev_update_step = step
                qlearner.update(logger, step)

            o = no; e = ne; m = nm; ms = nms

        reward_buffer.extend(episode_reward)
        running_reward_mean = np.array(reward_buffer).mean()
        train_stats["reward"].append((step, running_reward_mean))
        logger.add_scalar("reward", running_reward_mean, step)
        if use_tqdm:
            pbar.update(1)
            pbar.set_description(f"ep {it:10d} | {running_reward_mean:8.4f} |")

        if (it+1) % 100 == 0 or (it+1 == n_iters):
            with open(f"{log_dir}/stats.npy", 'wb') as f:
                np.save(f, train_stats)
            qlearner.save_models(f"{run_dir}")

    if use_tqdm:
        pbar.close()
    env.close()
Code example #11
def qLearning(learning_rate, discount_factor, epsilon, reward_map, state_grid,
              max_steps, epochs):

    agent = Agent(learning_rate, discount_factor, reward_map, state_grid,
                  max_steps)

    stateDic = {}
    # all_epochs = []
    # epoch_rewards = []
    # every_5 = []
    # mean_every_5 = []
    # epochs_mean = []

    for e in range(epochs):

        current_state = Agent.choose_start(reward_map, state_grid, max_steps)

        # epoch_reward = 0
        # all_epochs.append(e)
        # current_epoch_rewards = []

        for _ in range(0, len(reward_map) * len(reward_map[0]) * 2):

            action = Agent.epsilon_greedy_policy(epsilon, current_state)

            next_state, reward = agent.take_action(current_state, action,
                                                   stateDic)

            # update
            current_state.update_qvalue(learning_rate, reward, discount_factor,
                                        next_state, action)

            stateDic[(current_state.posX, current_state.posY,
                      current_state.steps)] = current_state

            # epoch_reward += reward
            # current_epoch_rewards.append(reward)

            if next_state.is_terminal:
                # epoch_rewards.append(epoch_reward)
                # every_5.append(epoch_reward)
                break

            current_state = next_state

        # epochs_mean.append(sum(current_epoch_rewards) / len(current_epoch_rewards))

        # if(e % 5 == 0):
        #     mean_every_5.append(sum(every_5) / len(every_5))
        #     every_5 = []

    # plt.style.use(['dark_background'])
    # plt.figure(figsize=(18,12))
    # plt.plot(epochs_mean)
    # plt.xlabel("Episodios")
    # plt.ylabel("Reward médio por episodio", size=10)
    # plt.title("Reward médio por episodio 0.9 Lambda")
    # plt.savefig('epochs_mean.png')
    # plt.show()

    # plt.style.use(['dark_background'])
    # plt.figure(figsize=(20,10))
    # plt.plot(epoch_rewards)
    # plt.xlabel("Episodios")
    # plt.ylabel("Reward axumulativa", size=10)
    # plt.title("Reward acumulativa por episodio")
    # plt.savefig('reward_cumulative.png')
    # plt.show()

    # plt.figure(figsize=(20,10))
    # plt.plot(mean_every_5)
    # plt.xlabel("Episodios")
    # plt.ylabel("Média Móvel a cada 5 episodios", size=10)
    # plt.title("Média Móvel")
    # plt.savefig('mean_every5.png')
    # plt.show()

    # plt.figure(figsize=(20,10))
    # plt.plot(epochs_mean)
    # plt.xlabel("Episodios")
    # plt.ylabel("Reward médio por episodio", size=10)
    # plt.title("Reward médio por episodio")
    # plt.savefig('epochs_mean.png')
    # plt.show()

    return stateDic
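
A hedged call sketch for qLearning() above: only the signature comes from the code, while the reward_map and state_grid contents and the hyperparameters are placeholder assumptions, since their real formats are defined by the project's Agent class.

# Placeholder inputs - the real formats are defined by the project's Agent class.
reward_map = [[-1, -1, -1, 10],
              [-1, -1, -1, -1],
              [-1, -1, -1, -1]]
state_grid = [[None] * len(row) for row in reward_map]

q_states = qLearning(learning_rate=0.1,
                     discount_factor=0.9,
                     epsilon=0.1,
                     reward_map=reward_map,
                     state_grid=state_grid,
                     max_steps=50,
                     epochs=500)
print(len(q_states), "(posX, posY, steps) states visited during training")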
Code example #12
class ATC(core.Entity):
    ''' Example new entity object for BlueSky. '''
    def __init__(self):
        super().__init__()
        self.super_start = time.perf_counter()

        self.initilized = False

        self.epoch_counter = 0
        # [Success, Fail]
        self.results = np.zeros(2)

        self.all_success = []
        self.all_fail = []
        self.mean_success = 0
        self.all_mean_success, self.best = 0, 0
        self.mean_rewards = []
        self.epoch_actions = np.zeros(ACTION_SHAPE)

        self.start = None
        self.stop = None

        self.dist = [0, -1]
        self.spd = [0, -1]
        self.trk = [0, 360]
        self.vs = [0, -1]

        self.last_observation = {}
        self.last_reward_observation = {}
        self.previous_action = {}
        self.observation = {}

    def on_load(self):
        self.sector_manager = Sector_Manager(SECTORS)
        self.route_manager = Route_Manager(ROUTES,
                                           test_routes=VISUALIZE,
                                           draw_paths=VISUALIZE)
        self.traffic_manager = Traffic_Manager(
            max_ac=MAX_AC,
            times=TIME_SEP,
            max_spd=CONSTRAINTS["cas"]["max"],
            min_spd=CONSTRAINTS["cas"]["min"],
            max_alt=32000,
            min_alt=32000,
            network=self.route_manager)

        self.memory = Memory()

        self.agent = Agent(state_size=STATE_SHAPE,
                           action_size=ACTION_SHAPE,
                           value_size=VALUE_SHAPE)

        try:
            self.agent.load(path=FILE + "best.h5")
        except Exception:
            # Fall back to the regular checkpoint; start from scratch if neither exists
            try:
                self.agent.load(path=FILE + ".h5")
            except Exception:
                pass

        self.initilized = True

        print("ATC: READY")
        string = "=================================\n   UPDATE: RUNNING EPOCH {}\n=================================\n".format(
            self.format_epoch())
        self.print_all(string)

    # Functions that need to be called periodically can be indicated to BlueSky
    # with the timed_function decorator

    @core.timed_function(name='example', dt=12)
    def update(self):
        # Initialize the system on the first update call
        if not self.initilized:
            self.on_load()

        # Start epoch timer
        if not self.start:
            self.start = time.perf_counter()

        # Create aircraft
        self.traffic_manager.spawn()
        # Update Aircraft active sectors
        self.traffic_manager.update_active(self.sector_manager.system_sectors)

        # Generate a full distance matrix between all aircraft
        full_dist_matrix = self.get_dist_matrix()

        # Get nearest ac in a matrix
        nearest_ac = self.get_nearest_ac(dist_matrix=full_dist_matrix)

        # Get goal distances for each aircraft
        g_distance = self.get_goal_distances()

        # Get an array of terminal aircraft
        terminal_ac, terminal_id = self.get_terminal(nearest_ac, g_distance)

        self.handle_terminal(terminal_id)

        if self.traffic_manager.check_done():
            self.epoch_reset()
            return

        if not TRAIN and (self.traffic_manager.total % 50 == 0):
            string = "Success: {} | Fail: {} | Mean Success: {:.3f}%".format(
                int(self.results[0]), int(self.results[1]),
                (self.results[0] / MAX_AC) * 100)
            self.print_all(string)

        if len(traf.id) <= 0:
            return

        if not len(traf.id) == 0:
            policy, normal_state, normal_context = self.get_actions(
                terminal_ac, g_distance, full_dist_matrix)

            if len(policy) > 0:
                idx = 0
                new_actions = {}
                for i in range(len(traf.id)):
                    if terminal_ac[i] == 0 and len(
                            self.traffic_manager.active_sectors[i]) > 0:
                        if not np.any(np.isnan(policy[idx])):
                            _id = traf.id[i]

                            if not _id in self.last_observation.keys():
                                self.last_observation[_id] = [
                                    normal_state[idx], normal_context[idx]
                                ]

                            action = np.random.choice(
                                ACTION_SHAPE, 1, p=policy[idx].flatten())[0]

                            # print(policy[idx], action)

                            self.epoch_actions[action] += 1

                            if not _id in self.observation.keys(
                            ) and _id in self.previous_action.keys():
                                self.observation[_id] = [
                                    normal_state[idx], normal_context[idx]
                                ]

                                self.memory.store(_id,
                                                  self.last_observation[_id],
                                                  self.previous_action[_id],
                                                  nearest_ac[idx])

                                self.last_observation[_id] = self.observation[
                                    _id]

                                del self.observation[_id]

                            self.perform_action(i, action)

                            new_actions[_id] = action

                        self.previous_action = new_actions

                        idx += 1

    # Act
    def get_actions(self, terminal_ac, g_dists, dist_matrix):
        ids = []
        new_actions = {}

        state = self.get_state()

        normal_state, normal_context = self.normalise_all(
            state, terminal_ac, g_dists, dist_matrix)

        policy = []
        if not len(normal_state) == 0:
            policy = self.agent.act(normal_state, normal_context)

        return policy, normal_state, normal_context

    # For an aircraft perform an action
    def perform_action(self, i, action):
        if action < 3:
            traf_alt = int(traf.alt[i] / ft)
            new_alt = int(round((traf_alt + ACTIONS[action])))

            alt = max(CONSTRAINTS["alt"]["min"],
                      min(CONSTRAINTS["alt"]["max"], new_alt))

            # print(traf_alt, alt)

            stack.stack("{} alt {}".format(traf.id[i], alt))
        elif action == 4:
            traf_alt = traf.alt[i] / ft
            new_alt = int(round((traf_alt)))

    # Get the current state

    def get_state(self):
        state = np.zeros((len(traf.id), 6))

        start_ids, end_ids = self.get_all_nodes()

        state[:, 0] = traf.lat
        state[:, 1] = traf.lon
        state[:, 2] = traf.trk
        state[:, 3] = traf.alt
        state[:, 4] = traf.tas
        state[:, 5] = traf.vs

        return state

    # Get all nodes for each aircraft
    def get_all_nodes(self):
        start_ids = np.zeros(len(traf.id), dtype=int)
        end_ids = np.zeros(len(traf.id), dtype=int)

        for i in range(len(traf.id)):
            _id = traf.id[i]
            route = self.traffic_manager.routes[_id]
            start_ids[i] = np.argwhere(
                self.route_manager.idx_array == route[0])
            end_ids[i] = np.argwhere(self.route_manager.idx_array == route[-1])

        return start_ids, end_ids

    # Normalise the state and context
    def normalise_all(self, state, terminal_ac, g_dists, dist_matrix):
        normal_states = self.normalise_state(state, terminal_ac, g_dists)

        normal_context = []

        start_ids, end_ids = self.get_all_nodes()

        max_agents = 0
        for _id in traf.id:
            if terminal_ac[traf.id2idx(_id)] > 0 or len(
                    self.traffic_manager.active_sectors[traf.id2idx(
                        _id)]) <= 0:
                continue

            new_context = self.normalise_context(_id, terminal_ac, dist_matrix,
                                                 start_ids, end_ids)

            max_agents = max(max_agents, len(new_context))

            if len(normal_context) == 0:
                normal_context = new_context
            else:
                normal_context = np.append(
                    keras.preprocessing.sequence.pad_sequences(
                        normal_context, max_agents, dtype='float32'),
                    keras.preprocessing.sequence.pad_sequences(
                        new_context, max_agents, dtype='float32'),
                    axis=0)

        if len(normal_context) == 0:
            normal_context = np.array([0, 0, 0, 0, 0, 0, 0]).reshape(1, 1, 7)

        # print(normal_states.shape, normal_context.shape)
        return normal_states, normal_context

    # Normalise the agent state only
    def normalise_state(self, state, terminal_ac, g_dists):
        total_active = 0

        for i in range(len(terminal_ac)):
            if terminal_ac[i] == 0 and len(
                    self.traffic_manager.active_sectors[i]) > 0:
                total_active += 1

        normalised_state = np.zeros((total_active, STATE_SHAPE))

        count = 0
        for i in range(len(traf.id)):
            if terminal_ac[i] > 0 or len(
                    self.traffic_manager.active_sectors[i]) <= 0:
                continue

            normalised_state[count, :] = self.normalise(state[i],
                                                        'state',
                                                        traf.id[i],
                                                        g_dist=g_dists[i])

            count += 1

        return normalised_state

    # Get and normalise context
    def normalise_context(self, _id, terminal_ac, dist_matrix, start_ids,
                          end_ids):
        context = []
        idx = traf.id2idx(_id)

        distances = dist_matrix[:, idx]
        this_sectors = self.traffic_manager.active_sectors[idx]

        this_lat, this_lon = traf.lat[idx], traf.lon[idx]

        for i in range(len(distances)):
            # Ignore current aircraft
            if i == idx:
                continue

            if terminal_ac[i] > 0 or len(
                    self.traffic_manager.active_sectors[i]) <= 0:
                continue

            sectors = self.traffic_manager.active_sectors[i]

            # Only care if the ac in a matching sector
            flag = False
            for x in sectors:
                if x in this_sectors:
                    flag = True

            if not flag:
                continue

            dist = get_dist([this_lat, this_lon], [traf.lat[i], traf.lon[i]])

            # Only care about visible distance aircraft
            if dist > 40:
                continue

            spd = traf.tas[i]
            alt = traf.alt[i]
            trk = traf.trk[i]
            vs = traf.vs[i]
            start_id = start_ids[i]
            end_id = end_ids[i]

            self.dist[1] = max(self.dist[1], dist)
            self.spd[1] = max(self.spd[1], spd)
            self.vs[1] = max(self.vs[1], vs)

            dist = dist / self.dist[1]
            spd = spd / self.spd[1]
            trk = trk / self.trk[1]
            alt = ((alt/ft)-CONSTRAINTS["alt"]["min"]) / \
                (CONSTRAINTS["alt"]["max"]-CONSTRAINTS["alt"]["min"])

            if not vs == 0:
                vs = vs / self.vs[1]

            n_nodes, dist2next = get_n_nodes(traf.id[i], self.traffic_manager,
                                             self.route_manager)

            self.dist[1] = max(self.dist[1], dist2next)
            dist2next = dist2next / self.dist[1]

            if len(context) == 0:
                context = np.array([
                    spd, alt, trk, vs, dist, dist2next, n_nodes[0], n_nodes[1],
                    n_nodes[2]
                ]).reshape(1, 1, 9)
            else:
                context = np.append(context,
                                    np.array([
                                        spd, alt, trk, vs, dist, dist2next,
                                        n_nodes[0], n_nodes[1], n_nodes[2]
                                    ]).reshape(1, 1, 9),
                                    axis=1)

        if len(context) == 0:
            context = np.zeros(9).reshape(1, 1, 9)

        return context

    # perform normalisation
    def normalise(self, state, what, _id, g_dist=None):

        # Normalise the entire state
        if what == 'state':
            if not g_dist:
                raise Exception(
                    "For normalising a state please pass the distance to the goal."
                )

            self.dist[1] = max(self.dist[1], g_dist)
            self.spd[1] = max(self.spd[1], state[4])
            self.vs[1] = max(self.vs[1], state[5])

            dist = g_dist / self.dist[1]
            spd = state[4] / self.spd[1]
            trk = state[2] / self.trk[1]
            alt = ((state[3]/ft)-CONSTRAINTS["alt"]["min"]) / \
                (CONSTRAINTS["alt"]["max"]-CONSTRAINTS["alt"]["min"])

            vs = 0
            if not state[5] == 0:
                vs = state[5] / self.vs[1]

            n_nodes, dist2next = get_n_nodes(_id, self.traffic_manager,
                                             self.route_manager)

            self.dist[1] = max(self.dist[1], dist2next)
            dist2next = dist2next / self.dist[1]

            return np.array([
                spd, alt, trk, vs, dist, dist2next, n_nodes[0], n_nodes[1],
                n_nodes[2]
            ])

    # Get the terminal aircraft
    def get_terminal(self, nearest_ac, g_dists):
        terminal_ac = np.zeros(len(traf.id), dtype=int)
        terminal_id = []

        # Loop through all aircraft
        for i in range(len(traf.id)):
            # Terminal state 0 = not terminal, 1 = collision, 2 = success
            T = 0

            # Only care about aircraft in a sector
            if len(self.traffic_manager.active_sectors[i]) > 0:
                close_ac = nearest_ac[i]
                n_ac_data = (close_ac[0], close_ac[1])

                # Get the terminal state
                T = self.agent.terminal(i, n_ac_data, g_dists[i])

                # Only care about terminal aircraft
                if not T == 0:
                    # Update collision aircraft
                    if T == 1:
                        terminal_ac[i] = 1
                        terminal_ac[traf.id2idx(close_ac[2])] = 1
                    elif not terminal_ac[i] == 1:
                        terminal_ac[i] = 2

                    _id = traf.id[i]
                    self.memory.store(_id, self.last_observation[_id],
                                      self.previous_action[_id], nearest_ac[i],
                                      T)

        for i in range(len(terminal_ac)):
            if terminal_ac[i] > 0:
                terminal_id.append([traf.id[i], terminal_ac[i]])

        return terminal_ac, terminal_id

    # Handle terminal aircraft
    def handle_terminal(self, terminal_id):
        for ac in terminal_id:
            stack.stack('DEL {}'.format(ac[0]))

            self.traffic_manager.active -= 1

            if ac[1] == 1:
                self.results[1] += 1
            elif ac[1] == 2:
                self.results[0] += 1

    # Generates a distance matrix of all aircraft in the system
    def get_dist_matrix(self):
        size = traf.lat.shape[0]
        return geo.latlondist_matrix(np.repeat(traf.lat, size),
                                     np.repeat(traf.lon, size),
                                     np.tile(traf.lat, size),
                                     np.tile(traf.lon,
                                             size)).reshape(size, size)

    # Get the nearest aircraft to agents
    def get_nearest_ac(self, dist_matrix):
        nearest = []

        # Loop through all aircraft
        for i in range(len(traf.id)):
            a_alt = traf.alt[i] / ft

            ac_dists = dist_matrix[:, i]

            close = 10e+25
            alt_sep = 10e+25

            nearest_id = None

            # Loop through the row on the dist matrix
            for x in range(len(ac_dists)):
                # Ensure the aircraft is in controlled airspace and not the current aircraft
                if not x == i and len(
                        self.traffic_manager.active_sectors[x]) > 0:

                    # See if it is closest and update
                    if ac_dists[x] < close:
                        close = float(ac_dists[x])
                        i_alt = traf.alt[x] / ft

                        alt_sep = abs(a_alt - i_alt)

                        nearest_id = traf.id[x]
            nearest.append([close, alt_sep, nearest_id])

        return np.array(nearest)

    # Returns an array of each aircraft's distance to its goal
    def get_goal_distances(self):
        goal_ds = np.zeros(len(traf.id), dtype=float)

        for i in range(len(traf.id)):
            goal_ds[i] = get_goal_dist(traf.id[i], self.traffic_manager,
                                       self.route_manager)

        return goal_ds

    # Reset the environment for the next epoch
    def epoch_reset(self):
        # Reset the traffic creation
        self.traffic_manager.reset()

        # Keep track of all success and failures
        self.all_success.append(self.results[0])
        self.all_fail.append(self.results[1])

        # Calculate total mean success
        self.all_mean_success = np.mean(self.all_success)

        # Calculate rolling mean success
        if (self.epoch_counter + 1) >= 50:
            self.mean_success = np.mean(self.all_success[-50:])

        if (self.epoch_counter + 1) % 5 == 0:
            if self.mean_success > self.best:
                if TRAIN:
                    print('::::::: Saving Best ::::::')
                    self.agent.save(path=NEW_FILE + "best.h5")
                self.best = self.mean_success
            if TRAIN:
                print(':::::: Saving Model ::::::')
                self.agent.save(path=NEW_FILE + ".h5")
                print(":::::::: Training ::::::::")
                self.agent.train(self.memory)
                print(":::::::: Complete ::::::::")

        temp = np.array([np.array(self.all_success), np.array(self.all_fail)])
        np.savetxt("Files/" + NEW_FILE + "_numpy.csv", temp, delimiter=',')

        # Stop the timer
        self.stop = time.perf_counter()
        # -------- Printing Outputs --------
        string = "Epoch run in {:.2f} seconds".format(self.stop - self.start)
        self.print_all(string)
        string = "Success: {} | Fail: {} | Mean Success: {:.3f}% | (50) Mean Success Rolling {:.3f}% | Best {:.3f}%".format(
            int(self.results[0]), int(self.results[1]),
            (self.all_mean_success / MAX_AC) * 100,
            (self.mean_success / MAX_AC) * 100, (self.best / MAX_AC) * 100)
        self.print_all(string)
        string = "Actions -> Descend: {}, Hold Current: {}, Climb: {}, Maintain Climb: {}".format(
            self.epoch_actions[0], self.epoch_actions[1],
            self.epoch_actions[2], self.epoch_actions[3])
        # string = "Actions -> Descend: {}, Climb: {}".format(
        #     self.epoch_actions[1], self.epoch_actions[0])
        self.print_all(string)

        if self.epoch_counter + 1 >= EPOCHS:
            super_stop = time.perf_counter()
            stack.stack("STOP")
            string = "::END:: Training {} episodes took {:.2f} hours".format(
                EPOCHS, ((super_stop - self.super_start) / 60) / 60)
            self.print_all(string)
            return

        self.epoch_counter += 1
        string = "=================================\n   UPDATE: RUNNING EPOCH {}\n=================================\n".format(
            self.format_epoch())
        self.print_all(string)

        # Reset values
        self.results = np.zeros(2)
        self.stop = None
        self.start = None
        self.mean_rewards = []
        self.epoch_actions = np.zeros(ACTION_SHAPE)

        self.previous_action = {}
        self.last_observation = {}
        self.observation = {}

    # Scripts for printing values
    def print_all(self, string):
        stack.stack(f'ECHO {string}')
        print(string)

    def format_epoch(self):
        epoch_string = ""

        if self.epoch_counter + 1 < 10:
            epoch_string += "0"
        if self.epoch_counter + 1 < 100:
            epoch_string += "0"
        if self.epoch_counter + 1 < 1000:
            epoch_string += "0"
        if self.epoch_counter + 1 < 10000:
            epoch_string += "0"

        epoch_string += str(self.epoch_counter + 1)
        return epoch_string
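
The distance-matrix helper in the ATC class above builds the full pairwise matrix by repeating the latitude/longitude vectors against their tiled copies. Here is a numpy-only sketch of the same repeat/tile pattern; geo.latlondist_matrix belongs to BlueSky, so plain Euclidean distance stands in for it purely for illustration.

import numpy as np

lat = np.array([0.0, 1.0, 2.0])
lon = np.array([0.0, 0.0, 0.0])
size = lat.shape[0]

# Pair every aircraft with every other: repeat supplies the "row" aircraft,
# tile supplies the "column" aircraft, and reshape recovers the (size, size) matrix.
d = np.sqrt((np.repeat(lat, size) - np.tile(lat, size)) ** 2 +
            (np.repeat(lon, size) - np.tile(lon, size)) ** 2).reshape(size, size)
print(d)
# [[0. 1. 2.]
#  [1. 0. 1.]
#  [2. 1. 0.]]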
Code example #13
class GridWorld:
    def __init__(self, world):

        self.world = world.split('\n    ')[1:-1]
        self.action_map = {0: 'right', 1: 'down', 2: 'left', 3: 'up'}
        self.action_space = [0, 1, 2, 3]
        self.slip = 0.2  # 20% chance of taking the wrong action

        self.col = len(self.world[0])  # number of columns in the world string (e.g. 10)
        self.row = len(self.world)  # number of rows in the world string (e.g. 5)
        self.state_color = (50, 100, 10)
        self.renderfirst = True
        self.policy = {}
        self.episode_step = 0
        self._max_epi_step = 1000

        self.wall_group = pg.sprite.Group()
        self.state_group = pg.sprite.Group()

        self.state_dict = defaultdict(lambda: 0)

        i = 0
        for y, et_row in enumerate(self.world):
            for x, block_type in enumerate(et_row):

                if block_type == 'w':
                    self.wall_group.add(Wall(col=x, row=y))

                elif block_type == 'a':
                    self.agent = Agent(col=x, row=y)
                    self.state_group.add(State(col=x, row=y))
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': -1,
                        'done': False
                    }
                    i += 1

                elif block_type == 'g':
                    self.goal = Goal(col=x, row=y)
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': 10,
                        'done': True
                    }
                    i += 1

                elif block_type == ' ':
                    self.state_group.add(State(col=x, row=y))
                    self.state_dict[(x, y)] = {
                        'state': i,
                        'reward': -1,
                        'done': False
                    }
                    i += 1

        self.state_dict = dict(self.state_dict)
        self.state_count = len(self.state_dict)

    def reset(self):
        self.episode_step = 0
        self.agent.reInitilizeAgent()
        return self.state_dict[(self.agent.initial_position.x,
                                self.agent.initial_position.y)]['state']

    def get_action_with_probof_slip(self, action):  #slip property of env
        individual_slip = self.slip / 3
        prob = [individual_slip for a in self.action_space]
        prob[action] = 1 - self.slip
        act = np.random.choice(self.action_space, p=prob)
        return act

    def step(self, action, testing=False):
        if not testing:
            action = self.get_action_with_probof_slip(action)
        action = self.action_map[action]
        response = self.agent.move(action, self.wall_group, self.state_dict)
        self.episode_step += 1

        if self.episode_step <= self._max_epi_step:
            return response['state'], response['reward'], response['done'], {
            }  #info
        else:
            return response['state'], response['reward'], True, {
                'TimeLimit': True
            }

    def render(self):
        if self.renderfirst:
            pg.init()
            self.screen = pg.display.set_mode((self.col * 50, self.row * 50))
            self.renderfirst = False  # only create the window once

        self.screen.fill(self.state_color)

        self.wall_group.draw(self.screen)
        self.goal.draw(self.screen)
        self.agent.draw(self.screen)

        pg.display.update()
        pg.display.flip()

    def close(self):
        self.renderfirst = True
        pg.quit()

    def setPolicy(self, policy):
        for i, act in enumerate(policy):
            self.policy[i] = self.action_map[act]

        for s in self.state_group:
            s.change_with_policy(self.state_dict, self.policy)

    def play_as_human(self,
                      show_policy=False):  #policy={state_no:action('left')}
        if show_policy and len(self.policy) == 0:
            raise Exception(
                "No policy set. Call setPolicy first, e.g. world.setPolicy([list of actions, one per state])"
            )

        pg.init()
        screen = pg.display.set_mode((self.col * 50, self.row * 50))

        clock = pg.time.Clock()

        done = False

        while not done:

            for event in pg.event.get():
                if event.type == pg.QUIT:
                    done = True

                elif event.type == pg.KEYDOWN:
                    if event.key == pg.K_LEFT:
                        response = self.agent.move('left', self.wall_group,
                                                   self.state_dict)
                        #print(response)
                    elif event.key == pg.K_RIGHT:
                        response = self.agent.move('right', self.wall_group,
                                                   self.state_dict)
                        #print(response)
                    elif event.key == pg.K_UP:
                        response = self.agent.move('up', self.wall_group,
                                                   self.state_dict)
                        #print(response)
                    elif event.key == pg.K_DOWN:
                        response = self.agent.move('down', self.wall_group,
                                                   self.state_dict)
                        #print(response)

            screen.fill(self.state_color)

            self.wall_group.draw(screen)
            if show_policy: self.state_group.draw(screen)
            self.goal.draw(screen)
            self.agent.draw(screen)

            pg.display.update()
            pg.display.flip()
            clock.tick(60)
        pg.quit()
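
A minimal usage sketch for GridWorld above. It assumes the Agent, Wall, Goal and State sprite classes the constructor relies on are importable and that pygame is installed; the world layout itself is made up. The four-space indentation inside the string matters, because __init__ splits the world on a newline followed by four spaces.

import numpy as np

world = """
    wwwwwwwwww
    wa       w
    w   w    w
    w   w   gw
    wwwwwwwwww
    """

env = GridWorld(world)
state = env.reset()
done = False
while not done:
    action = np.random.choice(env.action_space)   # random policy, just to exercise the API
    state, reward, done, info = env.step(action)
env.close()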
Code example #14
class Trainer(Thread):
    def __init__(self, client, identifier, epsilon, get_qs_callbatch,
                 update_replay_memory_callback):
        super().__init__()
        self.daemon = True
        self.client = client

        self.terminate = False
        self.fail_flag = False
        self.halt = False

        self.get_qs = get_qs_callbatch
        self.update_replay_memory = update_replay_memory_callback
        self.identifier = identifier

        self.agent = Agent(identifier, self.client, True)

        self.action = None
        self.episode = 0
        self.epsilon = epsilon
        self.scores_history = deque(maxlen=settings.LOG_EVERY)
        self.score_record = None
        self.steps_per_second = deque(maxlen=settings.LOG_EVERY)

        self.actions_statistic = deque(
            maxlen=int(settings.LOG_EVERY * settings.SECONDS_PER_EXPISODE *
                       settings.FPS_COMPENSATION))

    def get_action(self, action: int):
        num_of_logged_actions = len(self.actions_statistic)
        if num_of_logged_actions <= 0: return 0
        return self.actions_statistic.count(action) / num_of_logged_actions

    def get_steps_per_second(self):
        if len(self.steps_per_second) > 0:
            return sum(self.steps_per_second) / len(self.steps_per_second)
        return 0

    def get_preview_data(self):
        if self.agent.prev_camera is not None and self.agent.initialized:
            return cv2.cvtColor(self.agent.prev_camera, cv2.COLOR_RGB2BGR)
        return np.zeros((settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[1],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[0],
                         settings.PREVIEW_CAMERA_IMAGE_DIMENSIONS[2]))

    def get_mean_score(self):
        if len(self.scores_history) > 0:
            return sum(self.scores_history) / len(self.scores_history)
        return 0

    def get_episode(self):
        return self.episode

    def run(self) -> None:
        logger.info(f"Trainer {self.identifier} started")

        while not self.terminate:
            if self.halt:
                time.sleep(0.1)
                continue

            reward = None
            episode_reward = 0
            step = 1

            try:
                state = self.agent.spawn()
                self.fail_flag = False
            except:
                self.fail_flag = True
                break

            episode_data_memory = deque()

            while not self.fail_flag:
                start_step_time = time.time()

                if self.epsilon is None or np.random.random() > self.epsilon:
                    self.action = int(np.argmax(self.get_qs(state)))
                    self.actions_statistic.append(self.action)
                else:
                    self.action = random.choice(list(settings.ACTIONS.keys()))

                try:
                    new_state, reward, done = self.agent.step(self.action)
                except:
                    logger.error(
                        f"Trainer {self.identifier} - Failed to make step")
                    self.fail_flag = True
                    break

                episode_data_memory.append(
                    (state, self.action, reward, new_state, done))
                state = new_state

                episode_reward += reward

                if done:
                    self.agent.clear_agent()
                    self.action = None
                    break

                time_diff1 = self.agent.episode_start + step / settings.FPS_COMPENSATION - time.time()
                time_diff2 = start_step_time + 1 / settings.FPS_COMPENSATION - time.time()
                if time_diff1 > 0:
                    time.sleep(min(0.125, time_diff1))
                elif time_diff2 > 0:
                    time.sleep(min(0.125, time_diff2))

                step += 1

            if not reward or not self.agent.episode_start: continue

            episode_time = time.time() - self.agent.episode_start
            if episode_time == 0: episode_time = 1e-9  # guard against division by zero
            average_steps_per_second = step / episode_time

            self.steps_per_second.append(average_steps_per_second)

            reward_factor = settings.FPS_COMPENSATION / average_steps_per_second
            episode_reward_weighted = (
                (episode_reward - reward) * reward_factor +
                reward) * settings.EPISODE_REWARD_MULTIPLIER

            if episode_time > settings.MINIMUM_EPISODE_LENGTH:
                self.update_replay_memory(episode_data_memory)
                self.scores_history.append(episode_reward_weighted)
                self.episode += 1

            del episode_data_memory

        logger.info(f"Trainer {self.identifier} stopped")