Example #1
import gym
from gym.utils import seeding

from feh_simulator.simulator import Simulator


class fehEnv(gym.Env):
    def __init__(self):
        self.width = 6
        self.height = 8
        # inputs are row, col, verbose, difficulty
        self.simulator = Simulator()
        self.viewer = None
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        action_space = self.simulator.get_action_space()
        assert action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        s, r, d = self.simulator.step(action)
        return s, r, d

    def render(self, mode='human'):
        screen_width = 600
        screen_height = 400
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def reset(self):
        s, r, d = self.simulator.reset()
        return s, r, d
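A minimal rollout sketch for the fehEnv wrapper above (a sketch only; it assumes the class and its Simulator are importable, that step()/reset() return (state, reward, done) as defined in the class, and that get_action_space() returns a list of legal action objects as in Example #4):

from random import choice

env = fehEnv()
s, r, done = env.reset()
while not done:
    # sample a random legal action from the simulator (illustrative only)
    a = choice(env.simulator.get_action_space())
    s, r, done = env.step(a)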
Example #2
    def __init__(self):
        self.width = 6
        self.height = 8
        # inputs are row, col, verbose, difficulty
        self.simulator = Simulator()
        self.viewer = None
        self.seed()
Example #3
    def __init__(self, identifier, model_name, learning_rate,
                 use_replay_memory, memory_size, burn_in, difficulty):
        self.difficulty = difficulty
        self.simu = Simulator(verbose=False, difficulty=difficulty)
        for i in range(8):
            self.simu.create_unit(i, int(i / 4))

        self.identifier = identifier
        self.ns = 48
        self.na = 4
        self.net = model.QANet(self.ns, self.na, model_name, learning_rate)
        if use_replay_memory:
            self.memory = Memory(memory_size, burn_in, difficulty)
        self.use_replay = use_replay_memory
Example #4
    def __init__(self, memory_size=50000, burn_in=10000, difficulty=0.0):
        """
        Memory unit: sa_pair, reward, next_sa_pairs, done
        """
        self.memory = []
        self.length = 0
        self.memory_size = memory_size
        self.burn_in = burn_in
        self.full = False
        # burn in
        simu = Simulator(verbose=False, difficulty=difficulty)
        for i in range(8):
            simu.create_unit(i, int(i / 4))

        iteration = 0
        while iteration <= burn_in:
            s, _, _ = simu.reset()
            while True:
                a = choice(simu.get_action_space())
                s_, r, done = simu.step(a)

                sa_pair = np.concatenate((s.flatten(), a.get_values()))
                next_sa_pairs = [
                    np.concatenate((s_.flatten(), action.get_values()))
                    for action in simu.get_action_space()
                ]

                self.remember((sa_pair, r, next_sa_pairs, done))
                s = s_
                iteration += 1
                if done:
                    break
        print("Memory burned in with current index at {}".format(self.length))
        print("Memory size is {}".format(self.memory_size))
Example #5
from collections import deque

import numpy as np
import tensorflow as tf

import model
import memory_replay
from feh_simulator.simulator import Simulator


def train(args=None):
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops, log_device_placement=False)
    sess = tf.Session(config=config)

    env = Simulator()
    for i in range(8):
        env.create_unit(i, int(i / 4))

    ns = 48
    na = 4

    dqn = model.DeepQN(state_shape=(ns + na, ), num_actions=1)
    dqn.reset_sess(sess)
    dqn.set_train(lr=0.001)

    # set mr
    mr = memory_replay.MemoryReplayer((ns + na, ),
                                      capacity=10000,
                                      enabled=True)

    score = deque([], maxlen=100)

    for epi in range(5000):
        s = env.reset()

        done = False

        rc = 0

        while not done:
            a = dqn.select_action_eps_greedy(get_eps(epi), s)
            a_ = a[0]
            s_, r, done, _ = env.step(a_)
            mr.remember(s, s_, r, a_, done)  # store the transition in the replay buffer created above
            s = s_
            rc += r
        score.append(rc)
        # replay
        s, s_, r, a, done = mr.replay(batch_size=32)
        dqn.train(s, s_, r, a, done)

        # args is expected to provide performance_plot_interval; the
        # args=None default will not work here
        if (epi + 1) % args.performance_plot_interval == 0:
            print('train-r-mod reward avg: ', np.mean(score))

    return
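train() above relies on a get_eps(epi) schedule that is not shown; a hypothetical linear epsilon-decay helper it could correspond to:

def get_eps(epi, eps_start=1.0, eps_min=0.05, decay_episodes=1000):
    # Hypothetical stand-in for the missing helper: linearly anneal epsilon
    # from eps_start down to eps_min over decay_episodes episodes.
    return max(eps_min, eps_start - (eps_start - eps_min) * epi / decay_episodes)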
Example #6
from random import random, choice
from pickle import dump

import numpy as np

import model
from feh_simulator.simulator import Simulator
# Memory is the replay buffer shown in Example #4; its import path depends on
# the project layout.


class DQN_Agent:
    def __init__(self, identifier, model_name, learning_rate,
                 use_replay_memory, memory_size, burn_in, difficulty):
        self.difficulty = difficulty
        self.simu = Simulator(verbose=False, difficulty=difficulty)
        for i in range(8):
            self.simu.create_unit(i, int(i / 4))

        self.identifier = identifier
        self.ns = 48
        self.na = 4
        self.net = model.QANet(self.ns, self.na, model_name, learning_rate)
        if use_replay_memory:
            self.memory = Memory(memory_size, burn_in, difficulty)
        self.use_replay = use_replay_memory

    @staticmethod
    def epsilon_greedy_policy(q_values, eps, actions):
        if random() <= eps:
            return choice(actions)
        else:
            return actions[np.argmax(q_values)]

    def train(self, max_iteration, eps, eps_decay, eps_min, interval_iteration,
              gamma, test_size):
        self.test(0, test_size=test_size)
        iteration = 0
        performance = []
        while iteration <= max_iteration:
            while iteration <= max_iteration:
                s, _, _ = self.simu.reset()
                if not self.use_replay:
                    mini_batch = []
                while True:
                    eps = max(eps - eps_decay * iteration, eps_min)
                    actions = self.simu.get_action_space()
                    action_numbers = [
                        action.get_values() for action in actions
                    ]
                    q_values = [
                        self.net.qvalue(
                            np.concatenate(
                                (s.flatten(), number)).reshape(1, -1))[0][0]
                        for number in action_numbers
                    ]
                    a = self.epsilon_greedy_policy(q_values, eps, actions)

                    sa_pair = np.concatenate((s.flatten(), a.get_values()))
                    s_, r, done = self.simu.step(a)

                    # pair the *next* state with each available action
                    # (uses s_, consistent with the burn-in in Example #4)
                    next_sa_pairs = [
                        np.concatenate((s_.flatten(), action.get_values()))
                        for action in self.simu.get_action_space()
                    ]
                    if not self.use_replay:
                        mini_batch.append((sa_pair, r, next_sa_pairs, done))
                    else:
                        mini_batch = self.memory.sample()
                        self.memory.remember((sa_pair, r, next_sa_pairs, done))
                        self.train_on_minibatch(mini_batch, gamma)

                    s = s_
                    iteration += 1
                    # save model
                    if iteration % int(max_iteration / 3) == 0:
                        self.net.save_model(self.identifier, iteration)
                        dump(
                            performance,
                            open(
                                './model/{}{}.p'.format(
                                    iteration, self.identifier), 'wb'))
                        break
                    # test
                    if iteration % interval_iteration == 0:
                        performance.append((iteration,
                                            self.test(iteration,
                                                      test_size=test_size)))
                        break
                    if done:
                        # print("hold for {} sec".format(i - start))
                        break

                if not self.use_replay:
                    self.train_on_minibatch(mini_batch, gamma)

        dump(performance, open('./model/{}.p'.format(self.identifier), 'wb'))

    def test(self, iteration, test_size):
        rewards = 0
        count = 0
        win_round = 0
        for _ in range(test_size):
            s2, _, _ = self.simu.reset()
            while True:
                actions = self.simu.get_action_space()
                q_values = [
                    self.net.qvalue(
                        np.concatenate(
                            (s2.flatten(),
                             action.get_values())).reshape(1, -1))[0][0]
                    for action in actions
                ]
                a = self.epsilon_greedy_policy(q_values, 0, actions)
                s2, r2, done2 = self.simu.step(a)
                rewards += r2
                count += 1
                if done2:
                    if r2 == 100:
                        win_round += 1
                    break
        print("The average reward of {} iteration is {}".format(
            iteration, rewards / test_size))
        print("The average iterations taken per episode is {}".format(
            count / test_size))
        print("The win rate of this model is {}".format(win_round / test_size))
        return rewards / test_size, count / test_size, win_round / test_size

    def train_on_minibatch(self, mini_batch, gamma):
        x_train = np.zeros((len(mini_batch), self.na + self.ns))
        y_train = np.zeros((len(mini_batch), 1))
        for i1, (sa_pair1, r1, next_sa_pairs1, done1) in enumerate(mini_batch):
            # target
            if done1 is True:
                target = r1
            else:
                target = r1 + gamma * np.max([
                    self.net.qvalue(sa_pair_.reshape(1, -1))[0][0]
                    for sa_pair_ in next_sa_pairs1
                ])
            x_train[i1] = sa_pair1
            y_train[i1] = target
        self.net.train(x_train, y_train, len(mini_batch))
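A sketch of driving the agent above end-to-end; the constructor and train() arguments follow the signatures shown, but the concrete values here are illustrative, not the project's defaults:

agent = DQN_Agent(identifier='dqn_run0', model_name='qanet',
                  learning_rate=0.001, use_replay_memory=True,
                  memory_size=50000, burn_in=10000, difficulty=0.0)
agent.train(max_iteration=90000, eps=0.5, eps_decay=1e-6, eps_min=0.05,
            interval_iteration=5000, gamma=0.99, test_size=20)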
Example #7
from feh_simulator.simulator import Simulator

s = Simulator()
ret = s.reset()
s.map.render()
# step with an action picked out of the structure returned by reset()/step();
# the exact indexing depends on this version of the simulator's return value
ret = s.step(ret[2][3][17])
s.map.render()
ret = s.step(ret[2][3][35])
s.map.render()