def _set_local_network(self, device, network_scope, scene_scope, task_scope):
    # Build the thread-local network branch for a single scene/target pair.
    self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                    device=device,
                                    network_scope=network_scope,
                                    scene_scopes=[scene_scope])
    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)
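# Hedged usage sketch (not from the original file; the names below are
# illustrative assumptions): a worker thread would typically build its local
# branch once, right after construction, e.g.
#
#     thread._set_local_network(device="/cpu:0",
#                               network_scope="thread-1",
#                               scene_scope="<scene name>",
#                               task_scope="<target id>")
#
# after which thread.local_network holds the per-scene heads and
# thread.scopes selects them when running or updating the network.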
def __init__(self):
    if not os.path.exists(CHECKPOINT_DIR):
        os.mkdir(CHECKPOINT_DIR)

    # self.evaluation_gap = 10**6
    print(MAX_TIME_STEP)

    self.device = "/gpu:0" if USE_GPU else "/cpu:0"
    self.network_scope = TASK_TYPE
    self.list_of_tasks = TASK_LIST
    self.scene_scopes = self.list_of_tasks.keys()

    self.global_t = 0
    self.stop_requested = False

    self.initial_learning_rate = self.log_uniform(LR_ALPHA_LOW,
                                                  LR_ALPHA_HIGH,
                                                  LR_ALPHA_LOG_RATE)

    self.global_network = DRLNetwork(action_size=ACTION_SIZE,
                                     device=self.device,
                                     network_scope=self.network_scope,
                                     scene_scopes=self.scene_scopes)

    # One (scene, target) branch per training task.
    self.branches = []
    for scene in self.scene_scopes:
        for task in self.list_of_tasks[scene]:
            self.branches.append((scene, task))
    self.NUM_TASKS = len(self.branches)
    assert NUM_THREADS >= self.NUM_TASKS, \
        "Not enough threads for multitasking: at least {} threads needed.".format(self.NUM_TASKS)

    self.learning_rate_input = tf.placeholder("float")
    self.grad_applier = RMSPropApplier(learning_rate=self.learning_rate_input,
                                       decay=RMSP_ALPHA,
                                       momentum=0.0,
                                       epsilon=RMSP_EPSILON,
                                       clip_norm=GRAD_NORM_CLIP,
                                       device=self.device)

    # Instantiate the training threads;
    # each thread trains towards one target in one scene.
    self.training_threads = []
    for i in range(NUM_THREADS):
        scene, task = self.branches[i % self.NUM_TASKS]
        training_thread = ADQN_Thread(i,
                                      self.global_network,
                                      self.initial_learning_rate,
                                      self.learning_rate_input,
                                      self.grad_applier,
                                      MAX_TIME_STEP,
                                      device=self.device,
                                      network_scope="thread-%d" % (i + 1),
                                      scene_scope=scene,
                                      task_scope=task)
        self.training_threads.append(training_thread)
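# Hedged launch sketch (assumption, not part of the original class): once the
# ADQN_Thread objects are constructed, training would typically start by giving
# each of them its own Python thread, e.g.
#
#     sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
#     sess.run(tf.global_variables_initializer())
#     workers = [threading.Thread(target=t.process, args=(sess,))
#                for t in self.training_threads]
#     for w in workers:
#         w.start()
#
# "process" is a stand-in for whatever per-thread training method ADQN_Thread
# actually exposes; this file does not confirm its name or signature.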
def __init__(self, global_step):
    device = "/cpu:0"  # use CPU for the display/evaluation tool
    network_scope = TASK_TYPE
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    global_network = DRLNetwork(action_size=4,
                                device=device,
                                network_scope=network_scope,
                                scene_scopes=scene_scopes)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()

    # Restore the network from the latest checkpoint file.
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")
        # Read the checkpoint directly from the 'meta' file:
        # saver.restore(sess, checkpoint.model_checkpoint_path)

    scene_stats = dict()
    self.results = dict()
    for scene_scope in scene_scopes:
        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:
            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': int(task_scope)
            })
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []
            scopes = [network_scope, scene_scope, task_scope]

            for i_episode in range(NUM_EVAL_EPISODES):
                env.reset()
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0

                while not terminal:
                    pi_values = global_network.run_policy(sess, env.s_t, env.target, scopes)
                    # Renormalize the policy output before sampling an action.
                    pi_values = np.array(pi_values) / np.sum(pi_values)
                    action = np.random.choice(np.arange(len(pi_values)), p=pi_values)
                    env.step(action)
                    terminal = env.terminal

                    if ep_t == 10000:
                        break
                    if env.collided:
                        ep_collision += 1
                    ep_reward += env.reward
                    ep_t += 1

                ep_lengths.append(ep_t)
                ep_rewards.append(ep_reward)
                ep_collisions.append(ep_collision)
                if VERBOSE:
                    print("episode #{} ends after {} steps".format(i_episode, ep_t))

            print('evaluation: %s %s' % (scene_scope, task_scope))
            print('mean episode reward: %.2f' % np.mean(ep_rewards))
            print('mean episode length: %.2f' % np.mean(ep_lengths))
            print('mean episode collision: %.2f' % np.mean(ep_collisions))

            scene_stats[scene_scope].extend(ep_lengths)

    print('\nResults (average trajectory length):')
    for scene_scope in scene_stats:
        self.results[scene_scope] = np.mean(scene_stats[scene_scope])
        print('%s: %.2f steps' % (scene_scope, self.results[scene_scope]))

    with open("./Evaluation/result_%d.txt" % global_step, 'wb') as fp:
        pickle.dump(self.results, fp)
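# Hedged usage sketch (not from the original file): the per-scene averages
# pickled above can be read back for later inspection, e.g.
#
#     with open("./Evaluation/result_%d.txt" % global_step, 'rb') as fp:
#         results = pickle.load(fp)
#     for scene, mean_len in results.items():
#         print('%s: %.2f steps' % (scene, mean_len))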
scene_scopes = list_of_tasks.keys()

global_t = 0
stop_requested = False

if not os.path.exists(CHECKPOINT_DIR):
    os.mkdir(CHECKPOINT_DIR)

# Sample the initial learning rate log-uniformly between LR_ALPHA_LOW and LR_ALPHA_HIGH.
log_lo = np.log(LR_ALPHA_LOW)
log_hi = np.log(LR_ALPHA_HIGH)
v = log_lo * (1 - LR_ALPHA_LOG_RATE) + log_hi * LR_ALPHA_LOG_RATE
initial_learning_rate = np.exp(v)

# Create the global network
global_network = DRLNetwork(action_size=ACTION_SIZE,
                            device=device,
                            network_scope=network_scope,
                            scene_scopes=scene_scopes)

# Initialize the scene/task-specific branches
branches = []
for scene in scene_scopes:
    for task in list_of_tasks[scene]:
        branches.append((scene, task))
NUM_TASKS = len(branches)
assert NUM_THREADS >= NUM_TASKS, \
    "Not enough threads for multitasking: at least {} threads needed.".format(NUM_TASKS)

# Create the gradient applier
learning_rate_input = tf.placeholder("float")
grad_applier = RMSPropApplier(learning_rate=learning_rate_input,