Example #1
class GameManager:
    def __init__(self, id):

        self.visualize = False

        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)

        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        input_laser, rotation = self.process_observation(observation)
        state_map = StateMap(input_laser)
        obs = np.array([[state_map.S_image], [rotation]])
        return obs

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)

            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            # obs = np.array([[state_map.States_map, state_map.Reward_map], [rotation]])
            obs = np.array([[state_map.S_image], [rotation]])
            reward = 0
            done = False
        else:

            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])

        return obs, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()

    def process_observation(self, observation):
        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
        return laser_scan, orientation
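
A minimal driver sketch for the GameManager above (hedged: it assumes the same Config, Environment, StateMap, map_action, and numpy imports as the example; the random action choice is only a placeholder policy, not part of the original code):

import random

if __name__ == "__main__":
    manager = GameManager(0)              # agent id 0 selects the first configured world
    obs = manager.reset()
    done = False
    while not done:
        action = random.randrange(5)      # placeholder policy
        obs, reward, done, info = manager.step(action)
    print("Episode finished with reward {}".format(reward))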
Example #2
class GameManager:
    def __init__(self, id):

        self.visualize = False

        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(
            Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)

        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(
                linear, angular, 20)
        return observation, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()
    def _build_graph(self):
        env = Environment(self.world_name)  # @TODO do this properly
        env.set_cluster_size(CLUSTER_SIZE)
        env.use_observation_rotation_size(self.use_target)
        input = tflearn.layers.input_data(shape=(None, env.observation_size()),
                                          dtype=tf.float32)
        input = tf.expand_dims(input, -1)
        net = input
        net = tflearn.layers.conv_1d(net, 16, 3, padding='same')
        net = tflearn.layers.max_pool_1d(net, 3)
        net = tflearn.layers.conv_1d(net, 16, 2)
        net = tflearn.layers.max_pool_1d(net, 2)
        net = tflearn.layers.fully_connected(net, 64, activation='relu')
        net = tflearn.layers.fully_connected(net,
                                             self.action_mapper.ACTION_SIZE,
                                             activation='linear')
        # net = tflearn.layers.fully_connected(net, 512, activation='relu')
        # net = tflearn.layers.fully_connected(net, 256, activation='relu')
        # net = tflearn.layers.fully_connected(net, self.action_size, activation='linear')
        return input, net
class WorkerAgent(threading.Thread):
    def __init__(self, name, graph_ops, update_ops, world_name, use_target,
                 session, saver):
        super().__init__()

        self.name = name
        self.graph_ops = graph_ops
        self.session = session
        self.saver = saver

        self.graph_ops = graph_ops
        self.update_ops = update_ops

        self.env = Environment(world_name)
        self.env.use_observation_rotation_size(use_target)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.state_size = self.env.observation_size()
        self.action_size = action_mapper.ACTION_SIZE

    def run(self):
        global global_episode, global_step
        print('Thread {} started.'.format(self.name))

        local_episodes = 0
        accumulated_reward = 0
        best_reward = 0
        epsilon = INITIAL_EPSILON

        state_batch = []
        reward_batch = []
        action_batch = []

        period_start_time = time.time()

        while global_episode <= MAX_EPISODES:
            self.env.reset()
            state, _, _, _ = self.env.step(0, 0)
            state = self.reshape_state(state)

            episode_step = 0
            episode_reward = 0

            while True:
                q_output = self.graph_ops['network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['network']['input']: [state]})

                if random() <= epsilon:
                    action_index = randrange(self.action_size)
                else:
                    action_index = np.argmax(q_output)

                a_t = np.zeros([self.action_size])
                a_t[action_index] = 1

                if epsilon > final_epsilon:
                    epsilon -= (INITIAL_EPSILON -
                                final_epsilon) / anneal_epsilon_timesteps

                #print("Choosing Action {}".format(action_index))

                x1, x2 = action_mapper.map_action(action_index)
                next_state, reward, term, info = self.env.step(x1, x2, 10)
                next_state = self.reshape_state(next_state)
                episode_reward += reward

                if visualize:
                    self.env.visualize()

                #print("Reward: {} \n\n".format(reward))

                next_q_values = self.graph_ops['target_network'][
                    'q_values'].eval(
                        session=self.session,
                        feed_dict={
                            self.graph_ops['target_network']['input']:
                            [next_state]
                        })

                if not term:
                    reward = reward + gamma * np.amax(next_q_values)

                state_batch.append(state)
                action_batch.append(a_t)
                reward_batch.append(reward)

                if global_step % target_update_timestep == 0:
                    self.session.run(self.update_ops['reset_target_network'])
                    print("Target Net Resetted")

                # start = time.time()
                if episode_step % UPDATE_PERIOD == 0 or term:
                    self.session.run(self.update_ops['minimize'],
                                     feed_dict={
                                         self.update_ops['y']:
                                         reward_batch,
                                         self.update_ops['a']:
                                         action_batch,
                                         self.graph_ops['network']['input']:
                                         state_batch
                                     })

                    state_batch = []
                    action_batch = []
                    reward_batch = []

                # end = time.time()
                # print('Time for updating: ', end - start)

                if global_step % CHECKPOINT_PERIOD_TIMESTEPS == 0:
                    self.saver.save(self.session,
                                    CHECKPOINT_PATH,
                                    global_step=global_step)

                global_step += 1
                state = next_state
                episode_step += 1

                if term:
                    break

            accumulated_reward += episode_reward
            best_reward = max(best_reward, episode_reward)

            local_episodes += 1
            global_episode += 1

            if local_episodes % PRINT_EVERY == 0:
                period_end_time = time.time()
                #writer.add_summary(tf.summary.scalar('AVG Reward', accumulated_reward / PRINT_EVERY))
                print(
                    "Thread {0:}. Total Episodes {1:}. Reward AVG: {2:.3f}, Best Reward: {3:.3f}, Globalstep: {4:6d}, Epsilon: {5:f}, Time: {6:}"
                    .format(self.name, global_episode,
                            accumulated_reward / PRINT_EVERY, best_reward,
                            global_step, epsilon,
                            period_end_time - period_start_time))
                accumulated_reward = 0
                best_reward = -99999
                period_start_time = time.time()

    def reshape_state(self, state):
        return np.reshape(state, [self.state_size, 1])
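
A hedged launch sketch for the WorkerAgent threads above (the graph_ops/update_ops builders, the list of world names, and the checkpointing constants are assumed to exist elsewhere in this project; the helper name and worker count below are illustrative only):

import tensorflow as tf

def start_workers(graph_ops, update_ops, worlds, use_target, num_workers=4):
    # One shared TF1 session and saver for all worker threads.
    session = tf.Session()
    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())
    workers = [WorkerAgent("worker_{}".format(i), graph_ops, update_ops,
                           worlds[i % len(worlds)], use_target, session, saver)
               for i in range(num_workers)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()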
class Worker(object):
    def __init__(self, name, globalAC):
        if MULTIPLE_ROOMS:
            if name == "W_0" or name == "W_1" or name == "W_2":
                self.env = Environment(ENV_NAME)
            elif name == "W_3" or name == "W_4" or name == "W_5":
                self.env = Environment(ENV_NAME_2)
            else:
                self.env = Environment(ENV_NAME_3)
        else:
            self.env = Environment(ENV_NAME)

        self.env.set_cluster_size(CLUSTER_SIZE)
        self.env.set_observation_rotation_size(64)  # TODO
        self.env.use_observation_rotation_size(True)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        angular = 0
        linear = 0

        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5

        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0
            # rnn_state = SESS.run(self.AC.init_state)    # zero rnn state at beginning
            # keep_state = deepcopy(rnn_state)      # keep rnn state for updating global net
            for ep_t in range(MAX_EP_STEP):

                # a, rnn_state_ = self.AC.choose_action(s, rnn_state)  # get the action and next rnn state
                a = self.AC.choose_action(s)  # get the action (the rnn variant is commented out)
                b = np.asarray(a)
                b = b[0][0]

                action = np.argmax(b)

                linear, angular = self.convert_action(action)

                s_, r, done, _ = self.env.step(linear, angular, SKIP_LRF)
                s_ = np.reshape(s_, [1, N_S])

                # if (self.name == 'W_0' or self.name == "W_3") and VISUALIZE:
                if (self.name == 'W_0') and VISUALIZE:
                    self.env.visualize()

                done = True if ep_t == MAX_EP_STEP - 1 else done

                ep_r += r
                buffer_s.append(s)
                buffer_a.append(b)
                buffer_r.append(r)
                # buffer_r.append((r+8)/8)    # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        # v_s_ = SESS.run(self.AC.v, {self.AC.s: s_, self.AC.init_state: rnn_state_})[0, 0]
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.vstack(buffer_a), np.vstack(
                            buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        # self.AC.init_state: keep_state,
                    }

                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                    # keep_state = deepcopy(rnn_state_)   # replace the keep_state as the new initial rnn state_

                s = s_
                # rnn_state = rnn_state_  # renew rnn state
                total_step += 1

                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] +
                                                0.1 * ep_r)

                    if self.name == "W_0":
                        print(self.name, "Ep:", GLOBAL_EP, "Ep_r:", ep_r)
                        # print(
                        #     self.name,
                        #     "Ep:", GLOBAL_EP,
                        #     "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        #       )
                    GLOBAL_EP += 1
                    if GLOBAL_EP % SAVE_INTERVAL == 0:
                        print("Versuche zu Speichern...")
                        self.AC.save_global()
                        print("...gespeichert!")
                    break
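
The reversed-buffer loop in work() computes bootstrapped discounted return targets; the same computation as a standalone helper (a sketch only, the function name is hypothetical):

import numpy as np

def discounted_targets(rewards, bootstrap_value, gamma=0.9):
    # v_t = r_t + gamma * v_{t+1}, seeded with the value estimate of the
    # state after the last step (0 if that state was terminal).
    v = bootstrap_value
    targets = []
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return np.vstack(targets)

# e.g. discounted_targets([1.0, 0.0, -1.0], bootstrap_value=0.0) -> [[0.19], [-0.9], [-1.0]]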
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001  # 0.0001    # learning rate for actor
LR_C = 0.001  # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

ENV_NAME = "square"
ENV_NAME_2 = "roblab"
ENV_NAME_3 = "room"

CLUSTER_SIZE = 10
SKIP_LRF = 20

env = Environment(ENV_NAME)
env.set_cluster_size(CLUSTER_SIZE)

N_S = env.observation_size() + 64  # state_size  TODO
N_A = 5  # action size


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
Example #7
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = Environment(ENV_NAME)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        angular = 0
        linear = 0

        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5

        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0
            rnn_state = SESS.run(self.AC.init_state)  # zero rnn state at beginning
            keep_state = rnn_state.copy()  # keep rnn state for updating global net
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.visualize()
                a, rnn_state_ = self.AC.choose_action(
                    s, rnn_state)  # get the action and next rnn state

                action = np.argmax(a)

                linear, angular = self.convert_action(action)

                # the third argument means: skip this many laser scans
                s_, r, done, _ = self.env.step(linear, angular, 10)
                s_ = np.reshape(s_, [1, N_S])

                done = True if ep_t == MAX_EP_STEP - 1 else done

                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                # buffer_r.append((r+8)/8)    # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {
                            self.AC.s: s_,
                            self.AC.init_state: rnn_state_
                        })[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.vstack(buffer_a), np.vstack(
                            buffer_v_target)

                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        self.AC.init_state: keep_state,
                    }

                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    keep_state = rnn_state_.copy()  # replace keep_state with the new initial rnn state

                s = s_
                rnn_state = rnn_state_  # renew rnn state
                total_step += 1
                if self.name == 'W_0':
                    self.env.visualize()
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] +
                                                0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:",
                        GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    GLOBAL_EP += 1
                    break
Example #8
        angular = 0
        linear = 1.5
    elif action == 3:
        angular = -0.44
        linear = 1.25
    else:
        angular = -0.77
        linear = 0.75

    return linear, angular


if __name__ == "__main__":
    env = Environment("test")

    env.set_cluster_size(10)

    state_size = env.observation_size()  # number of laser scans

    action_size = 5
    agent = RNNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")

    done = False
    batch_size = 32

    print("START")

    for e in range(EPISODES):

        reward_sum = 0