class GameManager:
    def __init__(self, id):
        self.visualize = False
        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)
        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        input_laser, rotation = self.process_observation(observation)
        state_map = StateMap(input_laser)
        obs = np.array([[state_map.S_image], [rotation]])
        return obs

    def step(self, action):
        self._update_display()
        if action is None:
            # A None action performs a no-op step so the agent still receives an observation.
            observation, reward, done, info = self.env.step(0, 0, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])
        return obs, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()

    def process_observation(self, observation):
        # Split the raw observation into laser scan values and the target orientation part.
        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
        return laser_scan, orientation
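# The helpers StateMap and map_action used above are defined outside this excerpt. The sketch
# below only illustrates the contract that step() assumes for map_action: a discrete action
# index mapped to a (linear, angular) velocity pair. The concrete velocity values here are
# illustrative assumptions, not taken from the original code.
def map_action(action):
    mapping = {
        0: (0.75, 0.77),    # sharp left
        1: (1.25, 0.44),    # left
        2: (1.50, 0.00),    # straight
        3: (1.25, -0.44),   # right
        4: (0.75, -0.77),   # sharp right
    }
    linear, angular = mapping[action]
    return linear, angular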
def main():
    sess = tf.Session()
    K.set_session(sess)

    env = Environment("test")
    actor_critic = ActorCritic(env, sess)

    done = False
    num_trials = 10000
    trial_len = 500
    steps = []
    state_size = env.observation_size()

    for trial in range(num_trials):
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = actor_critic.act(cur_state)
            linear, angular = convert_action(action)
            new_state, reward, done, _ = env.step(linear, angular, 10)
            new_state = np.reshape(new_state, [1, state_size])

            actor_critic.remember(cur_state, action, reward, new_state, done)
            actor_critic.train()

            cur_state = new_state
            env.visualize()
            if done:
                break
class GameManager:
    def __init__(self, id):
        self.visualize = False
        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)
        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
        return observation, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()
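# Hypothetical usage sketch (not part of the original code): a single rollout through
# GameManager with a random stand-in policy, to show the reset()/step() contract.
if __name__ == "__main__":
    manager = GameManager(0)
    obs = manager.reset()
    done = False
    while not done:
        action = np.random.randint(action_mapper.ACTION_SIZE)  # stand-in for the learned policy
        obs, reward, done, info = manager.step(action)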
def main():
    sess = tf.Session()
    K.set_session(sess)

    env = Environment("test")
    actor_critic = ActorCritic(env, sess)

    done = False
    num_trials = 10000
    trial_len = 500
    steps = []
    state_size = env.observation_size()

    for trial in range(num_trials):
        reward_sum = 0
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = actor_critic.act(cur_state)
            # Pick the discrete action with the highest predicted probability.
            action_index = np.argmax(action[0])
            linear, angular = convert_action(action_index)
            print("action", action)
            print("linear", linear)
            print("angular", angular)

            new_state, reward, done, _ = env.step(linear, angular, 20)
            new_state = np.reshape(new_state, [1, state_size])
            reward_sum = reward_sum + reward

            actor_critic.remember(cur_state, action, reward, new_state, done)
            cur_state = new_state
            env.visualize()
            if done:
                print("Break!")
                break
        actor_critic.train()
def main():
    env = Environment("test")
    state_size = env.observation_size()

    gamma = 0.9
    epsilon = .95
    trials = 1000
    trial_len = 500

    dqn_agent = DQN(env=env)
    done = False
    batch_size = 32
    steps = []

    for trial in range(trials):
        reward_sum = 0
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            linear, angular = convert_action(action)
            new_state, reward, done, _ = env.step(linear, angular, 10)
            new_state = np.reshape(new_state, [1, state_size])
            reward_sum = reward_sum + reward

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.target_train()  # update the target model

            cur_state = new_state
            env.visualize()
            if done:
                print("episode: {}/{}, score: {}, e: {:.2} time: {}".format(
                    trial, trials, reward_sum, dqn_agent.epsilon, step))
                break
            # Train the prediction model on a sampled minibatch once enough transitions are stored.
            if len(dqn_agent.memory) > batch_size:
                dqn_agent.replay()
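# The DQN class used above is not shown in this excerpt. The sketch below only illustrates
# the remember()/replay() pattern the loop relies on; the buffer size, gamma and the
# attribute names self.model / self.target_model are assumptions.
import random
from collections import deque

class ReplayMixin:
    def __init__(self):
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95

    def remember(self, state, action, reward, next_state, done):
        # store one transition for later minibatch training
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size=32):
        # sample a minibatch and fit the online model on bootstrapped Q-targets
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                target[0][action] = reward + self.gamma * np.amax(
                    self.target_model.predict(next_state)[0])
            self.model.fit(state, target, epochs=1, verbose=0)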
def _build_graph(self):
    env = Environment(self.world_name)  # TODO: handle this more cleanly
    env.set_cluster_size(CLUSTER_SIZE)
    env.use_observation_rotation_size(self.use_target)

    # 1D convolutional network over the laser scan, followed by dense layers that
    # output one Q-value per discrete action.
    input = tflearn.layers.input_data(shape=(None, env.observation_size()), dtype=tf.float32)
    input = tf.expand_dims(input, -1)

    net = input
    net = tflearn.layers.conv_1d(net, 16, 3, padding='same')
    net = tflearn.layers.max_pool_1d(net, 3)
    net = tflearn.layers.conv_1d(net, 16, 2)
    net = tflearn.layers.max_pool_1d(net, 2)
    net = tflearn.layers.fully_connected(net, 64, activation='relu')
    net = tflearn.layers.fully_connected(net, self.action_mapper.ACTION_SIZE, activation='linear')

    return input, net
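# Sketch (not part of the original excerpt) of how the (input, net) pair returned by
# _build_graph() could be wired into the Q-learning update that WorkerAgent.run() later
# feeds through update_ops['y'], update_ops['a'] and update_ops['minimize']; the learning
# rate and variable names here are assumptions.
state_input, q_values = agent._build_graph()
a = tf.placeholder(tf.float32, [None, action_mapper.ACTION_SIZE])  # one-hot selected actions
y = tf.placeholder(tf.float32, [None])                             # bootstrapped Q-targets
q_for_action = tf.reduce_sum(q_values * a, axis=1)
loss = tf.reduce_mean(tf.square(y - q_for_action))
minimize = tf.train.AdamOptimizer(1e-4).minimize(loss)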
    def process_info(self, info):
        return info

    def process_action(self, action):
        return action

    def process_state_batch(self, batch):
        return batch[:, 0, :]


env = Environment("Simulation2d/svg/proto_4", 4)
env.use_observation_rotation_size(True)
env.set_observation_rotation_size(128)
env.set_mode(Mode.ALL_RANDOM)

processor = DQNAgentProc()
states = env.observation_size()
actions = action_mapper.ACTION_SIZE

if DEBUG:
    print('states: {0}'.format(states))
    print('actions: {0}'.format(actions))


def build_callbacks(env_name):
    weights_filename = 'new_results/' + env_name + '{step}.h5f'
    log_filename = 'new_log/{}.json'.format(env_name)
    callbacks = [ModelIntervalCheckpoint(weights_filename, interval=10000)]
    callbacks += [FileLogger(log_filename, interval=1000)]
    return callbacks


def build_model(states, actions):
    model = Sequential()
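# One possible way (not shown in the original excerpt) to assemble the keras-rl agent from
# the pieces above, assuming build_model() returns a Keras model with `actions` outputs and
# that Environment is (or is wrapped to be) gym-compatible; memory size, policy parameters
# and training length are illustrative assumptions.
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy
from keras.optimizers import Adam

model = build_model(states, actions)
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy(eps=0.1)

dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, processor=processor,
               nb_steps_warmup=1000, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=500000, visualize=False, verbose=2,
        callbacks=build_callbacks('proto_4'))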
        self.rewards.append(reward)

    # update policy network and value network every episode
    def train_episode(self, done):
        discounted_rewards = self.discount_rewards(self.rewards, done)

        inp = np.reshape(self.states, (1, 5))
        values = self.critic.predict(inp)
        advantages = discounted_rewards - values

        # optimizer[0] updates the actor, optimizer[1] updates the critic
        self.optimizer[0]([self.states, self.actions, advantages])
        self.optimizer[1]([self.states, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]


if __name__ == "__main__":
    env = Environment("test")
    state_size = env.observation_size()
    action_size = 5

    global_agent = A3CAgent(state_size, action_size)
    global_agent.train()
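# The discount_rewards helper called in train_episode() is not part of this excerpt. Below is
# a plausible sketch of it as it would appear on the A3CAgent class: n-step discounted returns,
# bootstrapped from the critic's value of the last state when the episode has not terminated.
# The attribute name discount_factor is an assumption.
def discount_rewards(self, rewards, done):
    discounted = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0.0
    if not done:
        # bootstrap from the value estimate of the most recent state
        running_add = self.critic.predict(
            np.reshape(self.states[-1], [1, self.state_size]))[0][0]
    for t in reversed(range(len(rewards))):
        running_add = rewards[t] + self.discount_factor * running_add
        discounted[t] = running_add
    return discounted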
class WorkerAgent(threading.Thread):
    def __init__(self, name, graph_ops, update_ops, world_name, use_target, session, saver):
        super().__init__()
        self.name = name
        self.session = session
        self.saver = saver
        self.graph_ops = graph_ops
        self.update_ops = update_ops

        self.env = Environment(world_name)
        self.env.use_observation_rotation_size(use_target)
        self.env.set_cluster_size(CLUSTER_SIZE)

        self.state_size = self.env.observation_size()
        self.action_size = action_mapper.ACTION_SIZE

    def run(self):
        global global_episode, global_step
        print('Thread {} started.'.format(self.name))

        local_episodes = 0
        accumulated_reward = 0
        best_reward = 0
        epsilon = INITIAL_EPSILON

        state_batch = []
        reward_batch = []
        action_batch = []

        period_start_time = time.time()

        while global_episode <= MAX_EPISODES:
            self.env.reset()
            state, _, _, _ = self.env.step(0, 0)
            state = self.reshape_state(state)

            episode_step = 0
            episode_reward = 0

            while True:
                q_output = self.graph_ops['network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['network']['input']: [state]})

                # Epsilon-greedy action selection
                if random() <= epsilon:
                    action_index = randrange(self.action_size)
                else:
                    action_index = np.argmax(q_output)

                a_t = np.zeros([self.action_size])
                a_t[action_index] = 1

                if epsilon > final_epsilon:
                    epsilon -= (INITIAL_EPSILON - final_epsilon) / anneal_epsilon_timesteps

                x1, x2 = action_mapper.map_action(action_index)
                next_state, reward, term, info = self.env.step(x1, x2, 10)
                next_state = self.reshape_state(next_state)
                episode_reward += reward

                if visualize:
                    self.env.visualize()

                next_q_values = self.graph_ops['target_network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['target_network']['input']: [next_state]})

                # Bootstrap the target from the target network unless the episode terminated.
                if not term:
                    reward = reward + gamma * np.amax(next_q_values)

                state_batch.append(state)
                action_batch.append(a_t)
                reward_batch.append(reward)

                if global_step % target_update_timestep == 0:
                    self.session.run(self.update_ops['reset_target_network'])
                    print("Target network reset")

                if episode_step % UPDATE_PERIOD == 0 or term:
                    self.session.run(self.update_ops['minimize'], feed_dict={
                        self.update_ops['y']: reward_batch,
                        self.update_ops['a']: action_batch,
                        self.graph_ops['network']['input']: state_batch
                    })
                    state_batch = []
                    action_batch = []
                    reward_batch = []

                if global_step % CHECKPOINT_PERIOD_TIMESTEPS == 0:
                    self.saver.save(self.session, CHECKPOINT_PATH, global_step=global_step)

                global_step += 1
                state = next_state
                episode_step += 1

                if term:
                    break

            accumulated_reward += episode_reward
            best_reward = episode_reward if (episode_reward > best_reward) else best_reward

            local_episodes += 1
            global_episode += 1

            if local_episodes % PRINT_EVERY == 0:
                period_end_time = time.time()
                print("Thread {0:}. Total Episodes {1:}. Reward AVG: {2:.3f}, Best Reward: {3:.3f}, "
                      "Globalstep: {4:6d}, Epsilon: {5:f}, Time: {6:}".format(
                          self.name, global_episode, accumulated_reward / PRINT_EVERY,
                          best_reward, global_step, epsilon,
                          period_end_time - period_start_time))
                accumulated_reward = 0
                best_reward = -99999
                period_start_time = time.time()

    def reshape_state(self, state):
        return np.reshape(state, [self.state_size, 1])
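# Hypothetical launcher for the worker threads (the original entry point is not part of this
# excerpt); build_graph_ops, build_update_ops and NUM_THREADS are assumed names.
with tf.Session() as session:
    graph_ops = build_graph_ops()             # builds the online and target networks
    update_ops = build_update_ops(graph_ops)  # builds loss, minimize and target-sync ops
    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())

    workers = [WorkerAgent('worker-{}'.format(i), graph_ops, update_ops,
                           'test', True, session, saver)
               for i in range(NUM_THREADS)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()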
LR_A = 0.0001    # learning rate for actor
LR_C = 0.001     # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
ENV_NAME = "square"
ENV_NAME_2 = "roblab"
ENV_NAME_3 = "room"
CLUSTER_SIZE = 10
SKIP_LRF = 20

env = Environment(ENV_NAME)
env.set_cluster_size(CLUSTER_SIZE)
N_S = env.observation_size() + 64  # state size (TODO)
N_A = 5                            # action size


class ACNet(object):
    def __init__(self, scope, globalAC=None):
        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1],
MAX_GLOBAL_EP = 1500
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 5
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001    # learning rate for actor
LR_C = 0.001     # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
ENV_NAME = "test"
CLUSTER_SIZE = 10

env = Environment(ENV_NAME)
env.set_cluster_size(CLUSTER_SIZE)
N_S = env.observation_size()  # state size
N_A = 5                       # action size


class ACNet(object):
    def __init__(self, scope, globalAC=None):
        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1],
    elif action == 3:
        angular = -0.44
        linear = 1.25
    else:
        angular = -0.77
        linear = 0.75
    return linear, angular


if __name__ == "__main__":
    env = Environment("test")
    env.set_cluster_size(10)

    state_size = env.observation_size()  # number of laser scan values
    action_size = 5

    agent = RNNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    print("START")
    for e in range(EPISODES):
        reward_sum = 0
        state, _, _, _ = env.reset()