Example #1
class GameManager:
    def __init__(self, id):

        self.visualize = False

        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)

        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        input_laser, rotation = self.process_observation(observation)
        state_map = StateMap(input_laser)
        obs = np.array([[state_map.S_image], [rotation]])
        return obs

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)

            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            # obs = np.array([[state_map.States_map, state_map.Reward_map], [rotation]])
            obs = np.array([[state_map.S_image], [rotation]])
            reward = 0
            done = False
        else:

            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])

        return obs, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()

    def process_observation(self, observation):
        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
        return laser_scan, orientation
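The `map_action` helper called in `step()` is not part of this excerpt. A minimal sketch of what it is assumed to do, mapping a discrete action index to a (linear, angular) velocity pair in the style of the partial mapper shown in Example #12; the values for actions 0-2 are illustrative assumptions:

def map_action(action):
    # Hypothetical discrete-to-continuous action mapping; only the values
    # for actions 3 and 4 are taken from Example #12, the rest are assumed.
    if action == 0:
        return 1.5, 0.0       # drive straight
    elif action == 1:
        return 1.25, 0.44     # gentle left turn
    elif action == 2:
        return 0.75, 0.77     # sharp left turn
    elif action == 3:
        return 1.25, -0.44    # gentle right turn
    else:
        return 0.75, -0.77    # sharp right turn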
Example #2
def main():
	sess = tf.Session()
	K.set_session(sess)
	env = Environment("test")
	actor_critic = ActorCritic(env, sess)
	done = False
	num_trials = 10000
	trial_len  = 500

	steps = []
	state_size = env.observation_size()
	for trial in range(num_trials):

		cur_state,_,_,_ = env.reset()
		cur_state = np.reshape(cur_state, [1,state_size])
		
		for step in range(trial_len):
			action = actor_critic.act(cur_state)
			linear, angular = convert_action(action)
			new_state, reward, done, _ = env.step(linear, angular,10)
			new_state = np.reshape(new_state, [1, state_size])
			actor_critic.remember(cur_state, action, reward, new_state, done)
			actor_critic.train()
			cur_state = new_state
			env.visualize()
			if done:
				break
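This training script assumes a TensorFlow 1.x / standalone-Keras setup and several project modules that are not shown. A minimal sketch of the surrounding boilerplate it relies on; the module names here are assumptions:

import numpy as np
import tensorflow as tf            # TensorFlow 1.x API (tf.Session)
from keras import backend as K     # standalone Keras backend

from environment import Environment        # assumed: provides Environment
from actor_critic import ActorCritic       # assumed: provides ActorCritic
from action_mapper import convert_action   # assumed helper, see Example #12

if __name__ == "__main__":
    main()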
Example #3
class GameManager:
    def __init__(self, id):

        self.visualize = False

        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(
            Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)

        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(
                linear, angular, 20)
        return observation, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()
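A minimal usage sketch for this `GameManager`, assuming the project's `Config` object and `map_action` helper are importable; the driver loop below is illustrative and not part of the original code:

manager = GameManager(0)   # agent id 0 picks the first world in Config.PATH_TO_WORLD
obs = manager.reset()
for _ in range(100):
    action = 0  # replace with a policy's output
    obs, reward, done, info = manager.step(action)
    if done:
        obs = manager.reset()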
Example #4
def main():
	sess = tf.Session()
	K.set_session(sess)
	env = Environment("test")
	actor_critic = ActorCritic(env, sess)
	done = False
	num_trials = 10000
	trial_len  = 500

	steps = []
	state_size = env.observation_size()
	for trial in range(num_trials):
		reward_sum = 0
		cur_state,_,_,_ = env.reset()
		cur_state = np.reshape(cur_state, [1,state_size])
		
		for step in range(trial_len):
			action = actor_critic.act(cur_state)
			action2 = np.argmax(action[0])
			linear, angular = convert_action(action2)
			print("action", action)
			#linear = action[0][0]
			#linear = np.array([linear])
			#linear = float(linear[0])
			#linear = (0.8/math.pi)*math.atan((linear-0.5))+0.45
			#2/pi*atan(50*(x-0.5))
			print("linear", linear)	
			#angular =action[0][1]# 0.77
			#angular = np.array([angular])
			#angular = float(angular[0])
			#1/pi*atan(15*(x-0.5))+0.5
			#angular = (2/math.pi)*math.atan((angular - 0.5))
			print("angular", angular)	
			new_state, reward, done, _ = env.step(linear, angular,20)
			new_state = np.reshape(new_state, [1, state_size])
			reward_sum = reward_sum + reward
			actor_critic.remember(cur_state, action, reward, new_state, done)
			cur_state = new_state
			env.visualize()
			if done:
				print("Break!")
				break
		actor_critic.train()
Example #5
def main():
    #env     = gym.make("MountainCar-v0")
    env = Environment("test")
    state_size = env.observation_size()
    gamma = 0.9
    epsilon = .95

    trials = 1000
    trial_len = 500

    # updateTargetNetwork = 1000
    dqn_agent = DQN(env=env)
    done = False
    batch_size = 32
    steps = []
    for trial in range(trials):

        reward_sum = 0
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            linear, angular = convert_action(action)

            new_state, reward, done, _ = env.step(linear, angular, 10)
            # reward = reward if not done else -20
            new_state = np.reshape(new_state, [1, state_size])
            reward_sum = reward_sum + reward
            dqn_agent.remember(cur_state, action, reward, new_state, done)

            #dqn_agent.replay()       # internally iterates default (prediction) model
            dqn_agent.target_train()  # iterates target model

            cur_state = new_state

            env.visualize()
            if done:
                print("episode: {}/{}, score: {}, e: {:.2} time:{}".format(
                    trial, trials, reward_sum, dqn_agent.epsilon, step))
                break
        if len(dqn_agent.memory) > batch_size:
            dqn_agent.replay()
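The `DQN` agent class itself is not shown. A minimal sketch of what `replay()` is assumed to do, i.e. a standard experience-replay update against a target network; `self.model`, `self.target_model`, `self.memory` and the hyperparameters are assumptions, not the project's actual implementation:

import random
import numpy as np

# Sketch of DQN.replay(); 'self' stands for the agent instance.
def replay(self, batch_size=32, gamma=0.95):
    # Sample stored transitions and fit the online model toward the
    # one-step TD target computed with the frozen target model.
    if len(self.memory) < batch_size:
        return
    minibatch = random.sample(self.memory, batch_size)
    for state, action, reward, next_state, done in minibatch:
        target = self.model.predict(state)
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + gamma * np.amax(
                self.target_model.predict(next_state)[0])
        self.model.fit(state, target, epochs=1, verbose=0)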
Example #6
    def _build_graph(self):
        env = Environment(self.world_name)  # @TODO do this properly
        env.set_cluster_size(CLUSTER_SIZE)
        env.use_observation_rotation_size(self.use_target)
        input_layer = tflearn.layers.input_data(shape=(None, env.observation_size()),
                                                dtype=tf.float32)
        input_layer = tf.expand_dims(input_layer, -1)
        net = input_layer
        net = tflearn.layers.conv_1d(net, 16, 3, padding='same')
        net = tflearn.layers.max_pool_1d(net, 3)
        net = tflearn.layers.conv_1d(net, 16, 2)
        net = tflearn.layers.max_pool_1d(net, 2)
        net = tflearn.layers.fully_connected(net, 64, activation='relu')
        net = tflearn.layers.fully_connected(net,
                                             self.action_mapper.ACTION_SIZE,
                                             activation='linear')
        # net = tflearn.layers.fully_connected(net, 512, activation='relu')
        # net = tflearn.layers.fully_connected(net, 256, activation='relu')
        # net = tflearn.layers.fully_connected(net, self.action_size, activation='linear')
        return input_layer, net
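The `tf.expand_dims(input_layer, -1)` call adds a channel axis because `tflearn.layers.conv_1d` expects 3-D input of shape [batch, steps, channels]. A hedged sketch of feeding a single observation through the returned tensors, mirroring `reshape_state` in Example #9; `session`, `env`, `input_layer` and `net` are assumed to come from the surrounding class:

state, _, _, _ = env.step(0, 0)             # raw 1-D observation
state = np.reshape(state, [len(state), 1])  # add the channel axis
q_values = session.run(net, feed_dict={input_layer: [state]})  # batch of one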
Example #7
    def process_info(self, info):
        return info

    def process_action(self, action):
        return action

    def process_state_batch(self, batch):
        return batch[:, 0, :]

env = Environment("Simulation2d/svg/proto_4", 4)
env.use_observation_rotation_size(True)
env.set_observation_rotation_size(128)
env.set_mode(Mode.ALL_RANDOM)

processor = DQNAgentProc()
states = env.observation_size()
actions = action_mapper.ACTION_SIZE

if DEBUG:
    print('states: {0}'.format(states))
    print('actions: {0}'.format(actions))

def build_callbacks(env_name):
    weights_filename = 'new_results/' + env_name + '{step}.h5f'
    log_filename = 'new_log/{}.json'.format(env_name)
    callbacks = [ModelIntervalCheckpoint(weights_filename, interval=10000)]
    callbacks += [FileLogger(log_filename, interval=1000)]
    return callbacks

def build_model(states, actions):
    model = Sequential()
Example #8
        self.rewards.append(reward)

    # update policy network and value network every episode
    def train_episode(self, done):
        discounted_rewards = self.discount_rewards(self.rewards, done)
        inp = np.reshape(self.states, (1, 5))  #np.reshape(values, len(values))
        values = self.critic.predict(inp)
        print("tesst")

        advantages = discounted_rewards - values

        self.optimizer[0]([self.states, self.actions, advantages])
        self.optimizer[1]([self.states, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]


if __name__ == "__main__":

    env = Environment("test")

    state_size = env.observation_size()
    action_size = 5

    global_agent = A3CAgent(state_size, action_size)

    global_agent.train()
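`discount_rewards` is referenced in `train_episode` but not included in this excerpt. A minimal sketch, assuming standard discounted returns computed backwards over the collected trajectory; the discount factor is an assumption:

# Sketch of A3CAgent.discount_rewards(); 'self' stands for the agent instance.
def discount_rewards(self, rewards, done, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, accumulated from the end of the episode;
    # when the episode is cut off (not done), one could bootstrap from the critic.
    discounted = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    return discounted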
Example #9
class WorkerAgent(threading.Thread):
    def __init__(self, name, graph_ops, update_ops, world_name, use_target,
                 session, saver):
        super().__init__()

        self.name = name
        self.graph_ops = graph_ops
        self.update_ops = update_ops
        self.session = session
        self.saver = saver

        self.env = Environment(world_name)
        self.env.use_observation_rotation_size(use_target)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.state_size = self.env.observation_size()
        self.action_size = action_mapper.ACTION_SIZE

    def run(self):
        global global_episode, global_step
        print('Thread {} started.'.format(self.name))

        local_episodes = 0
        accumulated_reward = 0
        best_reward = 0
        epsilon = INITIAL_EPSILON

        state_batch = []
        reward_batch = []
        action_batch = []

        period_start_time = time.time()

        while global_episode <= MAX_EPISODES:
            self.env.reset()
            state, _, _, _ = self.env.step(0, 0)
            state = self.reshape_state(state)

            episode_step = 0
            episode_reward = 0

            while True:
                q_output = self.graph_ops['network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['network']['input']: [state]})

                if random() <= epsilon:
                    action_index = randrange(self.action_size)
                else:
                    action_index = np.argmax(q_output)

                a_t = np.zeros([self.action_size])
                a_t[action_index] = 1

                if epsilon > final_epsilon:
                    epsilon -= (INITIAL_EPSILON -
                                final_epsilon) / anneal_epsilon_timesteps

                #print("Choosing Action {}".format(action_index))

                x1, x2 = action_mapper.map_action(action_index)
                next_state, reward, term, info = self.env.step(x1, x2, 10)
                next_state = self.reshape_state(next_state)
                episode_reward += reward

                if visualize:
                    self.env.visualize()

                #print("Reward: {} \n\n".format(reward))

                next_q_values = self.graph_ops['target_network'][
                    'q_values'].eval(
                        session=self.session,
                        feed_dict={
                            self.graph_ops['target_network']['input']:
                            [next_state]
                        })

                if not term:
                    reward = reward + gamma * np.amax(next_q_values)

                state_batch.append(state)
                action_batch.append(a_t)
                reward_batch.append(reward)

                if global_step % target_update_timestep == 0:
                    self.session.run(self.update_ops['reset_target_network'])
                    print("Target Net Resetted")

                # start = time.time()
                if episode_step % UPDATE_PERIOD == 0 or term:
                    self.session.run(self.update_ops['minimize'],
                                     feed_dict={
                                         self.update_ops['y']:
                                         reward_batch,
                                         self.update_ops['a']:
                                         action_batch,
                                         self.graph_ops['network']['input']:
                                         state_batch
                                     })

                    state_batch = []
                    action_batch = []
                    reward_batch = []

                # end = time.time()
                # print('Time for updating: ', end - start)

                if global_step % CHECKPOINT_PERIOD_TIMESTEPS == 0:
                    self.saver.save(self.session,
                                    CHECKPOINT_PATH,
                                    global_step=global_step)

                global_step += 1
                state = next_state
                episode_step += 1

                if term:
                    break

            accumulated_reward += episode_reward
            best_reward = episode_reward if (
                episode_reward > best_reward) else best_reward

            local_episodes += 1
            global_episode += 1

            if local_episodes % PRINT_EVERY == 0:
                period_end_time = time.time()
                #writer.add_summary(tf.summary.scalar('AVG Reward', accumulated_reward / PRINT_EVERY))
                print(
                    "Thread {0:}. Total Episodes {1:}. Reward AVG: {2:.3f}, Best Reward: {3:.3f}, Globalstep: {4:6d}, Epsilon: {5:f}, Time: {6:}"
                    .format(self.name, global_episode,
                            accumulated_reward / PRINT_EVERY, best_reward,
                            global_step, epsilon,
                            period_end_time - period_start_time))
                accumulated_reward = 0
                best_reward = -99999
                period_start_time = time.time()

    def reshape_state(self, state):
        return np.reshape(state, [self.state_size, 1])
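The `update_ops['reset_target_network']` operation run inside the loop is built elsewhere in the project. A minimal sketch of how such an op is typically constructed in TensorFlow 1.x; the scope names are assumptions:

import tensorflow as tf

def build_reset_target_op(online_scope='network', target_scope='target_network'):
    # Copy every online-network variable into its target-network counterpart;
    # running the grouped op synchronizes (resets) the target network.
    online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=online_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    assigns = [target.assign(online) for online, target in zip(online_vars, target_vars)]
    return tf.group(*assigns)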
Example #10
LR_A = 0.0001  # 0.0001    # learning rate for actor
LR_C = 0.001  # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

ENV_NAME = "square"
ENV_NAME_2 = "roblab"
ENV_NAME_3 = "room"

CLUSTER_SIZE = 10
SKIP_LRF = 20

env = Environment(ENV_NAME)
env.set_cluster_size(CLUSTER_SIZE)

N_S = env.observation_size() + 64  # state_size  TODO
N_A = 5  # action size


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1],
Example #11
MAX_GLOBAL_EP = 1500
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 5
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001  # learning rate for actor
LR_C = 0.001  # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
ENV_NAME = "test"
CLUSTER_SIZE = 10

env = Environment(ENV_NAME)
env.set_cluster_size(CLUSTER_SIZE)

N_S = env.observation_size()  # state_size
N_A = 5  # action size


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1],
Example #12
    elif action == 3:
        angular = -0.44
        linear = 1.25
    else:
        angular = -0.77
        linear = 0.75

    return linear, angular


if __name__ == "__main__":
    env = Environment("test")

    env.set_cluster_size(10)

    state_size = env.observation_size()  # number of laser scans

    action_size = 5
    agent = RNNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")

    done = False
    batch_size = 32

    print("START")

    for e in range(EPISODES):

        reward_sum = 0

        state, _, _, _ = env.reset()