def _set_local_network(self, device, network_scope, scene_scope, task_scope):
    # per-thread local network, built for a single scene branch of the global model
    self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                    device=device,
                                    network_scope=network_scope,
                                    scene_scopes=[scene_scope])

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    # (network, scene, task) scopes select the branch the loss is built for
    self.scopes = [network_scope, scene_scope, task_scope]
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)
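
# Hypothetical usage sketch (not in the source): the scope arguments are presumably
# supplied by the ADQN_Thread constructor further below, along the lines of
#   self._set_local_network(device,
#                           network_scope="thread-1",   # "thread-%d" per worker
#                           scene_scope=scene,          # one scene from TASK_LIST
#                           task_scope=task)            # one target id in that scene
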
    def __init__(self):
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

        # self.evaluation_gap = 10**6
        print("max time step: {}".format(MAX_TIME_STEP))
        self.device = "/gpu:0" if USE_GPU else "/cpu:0"
        self.network_scope = TASK_TYPE
        self.list_of_tasks = TASK_LIST
        self.scene_scopes = self.list_of_tasks.keys()
        self.global_t = 0
        self.stop_requested = False

        self.initial_learning_rate = self.log_uniform(LR_ALPHA_LOW,
                                                      LR_ALPHA_HIGH,
                                                      LR_ALPHA_LOG_RATE)

        self.global_network = DRLNetwork(action_size=ACTION_SIZE,
                                         device=self.device,
                                         network_scope=self.network_scope,
                                         scene_scopes=self.scene_scopes)

        self.branches = []
        for scene in self.scene_scopes:
            for task in self.list_of_tasks[scene]:
                self.branches.append((scene, task))

        self.NUM_TASKS = len(self.branches)
        assert NUM_THREADS >= self.NUM_TASKS, \
            "Not enough threads for multitasking: at least {} threads needed.".format(self.NUM_TASKS)

        self.learning_rate_input = tf.placeholder("float")
        self.grad_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=RMSP_ALPHA,
            momentum=0.0,
            epsilon=RMSP_EPSILON,
            clip_norm=GRAD_NORM_CLIP,
            device=self.device)

        # instantiate each training thread
        # each thread trains one target in one scene
        self.training_threads = []
        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            training_thread = ADQN_Thread(i,
                                          self.global_network,
                                          self.initial_learning_rate,
                                          self.learning_rate_input,
                                          self.grad_applier,
                                          MAX_TIME_STEP,
                                          device=self.device,
                                          network_scope="thread-%d" % (i + 1),
                                          scene_scope=scene,
                                          task_scope=task)
            self.training_threads.append(training_thread)
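
# The trainer above samples its initial learning rate with self.log_uniform(...),
# which is not shown in this snippet. A minimal self-contained sketch, assuming it
# implements the same log-uniform interpolation written out in the script further below:
import numpy as np

def log_uniform(lo, hi, rate):
    # interpolate between log(lo) and log(hi), then map back through exp
    log_lo = np.log(lo)
    log_hi = np.log(hi)
    v = log_lo * (1 - rate) + log_hi * rate
    return np.exp(v)

# e.g. log_uniform(1e-4, 1e-2, 0.5) -> 1e-3, the geometric midpoint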
Example 3
    def __init__(self, global_step):

        device = "/cpu:0"  # use CPU for display tool
        network_scope = TASK_TYPE
        list_of_tasks = TASK_LIST
        scene_scopes = list_of_tasks.keys()

        global_network = DRLNetwork(action_size=4,
                                    device=device,
                                    network_scope=network_scope,
                                    scene_scopes=scene_scopes)

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)

        saver = tf.train.Saver()

        # Read network from checkpoint file
        checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded: {}".format(
                checkpoint.model_checkpoint_path))
        else:
            print("Could not find old checkpoint")

        # Read checkpoint directly, 'meta' file
        # saver.restore(sess, checkpoint.model_checkpoint_path)

        scene_stats = dict()
        self.results = dict()
        for scene_scope in scene_scopes:

            scene_stats[scene_scope] = []
            for task_scope in list_of_tasks[scene_scope]:

                env = Environment({
                    'scene_name': scene_scope,
                    'terminal_state_id': int(task_scope)
                })
                ep_rewards = []
                ep_lengths = []
                ep_collisions = []

                scopes = [network_scope, scene_scope, task_scope]

                for i_episode in range(NUM_EVAL_EPISODES):

                    env.reset()
                    terminal = False
                    ep_reward = 0
                    ep_collision = 0
                    ep_t = 0

                    while not terminal:

                        pi_values = global_network.run_policy(
                            sess, env.s_t, env.target, scopes)
                        pi_values = np.array(pi_values) / np.sum(pi_values)
                        action = np.random.choice(np.arange(len(pi_values)),
                                                  p=pi_values)
                        env.step(action)

                        terminal = env.terminal
                        if ep_t == 10000: break
                        if env.collided: ep_collision += 1
                        ep_reward += env.reward
                        ep_t += 1

                    ep_lengths.append(ep_t)
                    ep_rewards.append(ep_reward)
                    ep_collisions.append(ep_collision)
                    if VERBOSE:
                        print("episode #{} ends after {} steps".format(
                            i_episode, ep_t))

                print('evaluation: %s %s' % (scene_scope, task_scope))
                print('mean episode reward: %.2f' % np.mean(ep_rewards))
                print('mean episode length: %.2f' % np.mean(ep_lengths))
                print('mean episode collision: %.2f' % np.mean(ep_collisions))

                scene_stats[scene_scope].extend(ep_lengths)

        print('\nResults (average trajectory length):')
        for scene_scope in scene_stats:
            self.results[scene_scope] = np.mean(scene_stats[scene_scope])
            print('%s: %.2f steps' % (scene_scope, self.results[scene_scope]))

        with open("./Evaluation/result_%d.txt" % global_step, 'wb') as fp:
            pickle.dump(self.results, fp)
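
# A minimal sketch (assumption) for reading the saved results back; the step value
# in the filename is a placeholder for whichever global_step the file was written at.
import pickle

with open("./Evaluation/result_%d.txt" % 1000000, "rb") as fp:
    results = pickle.load(fp)
for scene, mean_length in results.items():
    print("%s: %.2f steps" % (scene, mean_length))
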
    scene_scopes = list_of_tasks.keys()
    global_t = 0
    stop_requested = False

    if not os.path.exists(CHECKPOINT_DIR):
        os.mkdir(CHECKPOINT_DIR)

    # Initialize learning rate
    log_lo = np.log(LR_ALPHA_LOW)
    log_hi = np.log(LR_ALPHA_HIGH)
    v = log_lo * (1 - LR_ALPHA_LOG_RATE) + log_hi * LR_ALPHA_LOG_RATE
    initial_learning_rate = np.exp(v)

    # Create global network
    global_network = DRLNetwork(action_size=ACTION_SIZE,
                                device=device,
                                network_scope=network_scope,
                                scene_scopes=scene_scopes)

    # Initialize scene-task specific branch
    branches = []
    for scene in scene_scopes:
        for task in list_of_tasks[scene]:
            branches.append((scene, task))

    NUM_TASKS = len(branches)
    assert NUM_THREADS >= NUM_TASKS, \
      "Not enough threads for multitasking: at least {} threads needed.".format(NUM_TASKS)

    # Create gradient applier
    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)