Example 1
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               env_type,
               env_name,
               use_lstm,
               use_pixel_change,
               use_value_replay,
               use_reward_prediction,
               pixel_change_lambda,
               entropy_beta,
               local_t_max,
               gamma,
               gamma_pc,
               experience_history_size,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_lstm = use_lstm
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.objective_size = Environment.get_objective_size(env_type, env_name)
    
    self.local_network = UnrealModel(self.action_size,
                                     self.objective_size,
                                     thread_index,
                                     use_lstm,
                                     use_pixel_change,
                                     use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda,
                                     entropy_beta,
                                     device)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars())
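    # minimize_local computes gradients of the local (worker) loss and applies
    # them to the shared global network; `sync` below copies the global weights
    # back into this worker, following the standard A3C pattern.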
    
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
Example 2
 def __init__(self):
     self.action_size = Environment.get_action_size(flags.env_type,
                                                    flags.env_name)
     self.objective_size = Environment.get_objective_size(
         flags.env_type, flags.env_name)
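     # Thread index -1 denotes the shared/global network; pixel_change_lambda and
     # entropy_beta are passed as 0.0 since they only affect the training loss,
     # which this snippet never builds (prepare_loss is not called here).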
     self.global_network = UnrealModel(self.action_size,
                                       self.objective_size,
                                       -1,
                                       flags.use_lstm,
                                       flags.use_pixel_change,
                                       flags.use_value_replay,
                                       flags.use_reward_prediction,
                                       0.0,
                                       0.0,
                                       "/cpu:0",
                                       for_display=True)
     self.environment = Environment.create_environment(
         flags.env_type,
         flags.env_name,
         env_args={
             'episode_schedule': flags.split,
             'log_action_trace': flags.log_action_trace,
             'seed': flags.seed,
             # 'max_states_per_scene': flags.episodes_per_scene,
             'episodes_per_scene_test': flags.episodes_per_scene
         })
     self.episode_reward = 0
     self.cnt_success = 0
Example 3
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.action_size = Environment.get_action_size()
    self.local_network = UnrealModel(self.action_size, thread_index, device)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars())
    
    self.sync = self.local_network.sync_from(global_network)
    self.environment = Environment.create_environment()
    self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
Example 4
 def __init__(self):
     self.action_size = Environment.get_action_size(flags.env_type,
                                                    flags.env_name)
     self.objective_size = Environment.get_objective_size(
         flags.env_type, flags.env_name)
     print('flags:use_pixel_change {}'.format(flags.use_pixel_change))
     sleep(10)
     self.global_network = UnrealModel(self.action_size,
                                       self.objective_size,
                                       -1,
                                       flags.use_lstm,
                                       flags.use_pixel_change,
                                       flags.use_value_replay,
                                       flags.use_reward_prediction,
                                       0.0,
                                       0.0,
                                       "/cpu:0",
                                       for_display=True)
     self.environment = Environment.create_environment(
         flags.env_type,
         flags.env_name,
         env_args={
             'episode_schedule': flags.split,
             'log_action_trace': flags.log_action_trace,
             'max_states_per_scene': flags.episodes_per_scene,
             'episodes_per_scene_test': flags.episodes_per_scene
         })
     print('\n======\nENV in Evaluate::ctor')
     print(self.environment)
     print(self.global_network)
     print('val_replay!!! {}'.format(flags.use_value_replay))
     print(flags.split)
     print('=======\n')
     sleep(10)
     self.episode_reward = 0
Example 5
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        pygame.display.set_caption('UNREAL')

        self.action_size = Environment.get_action_size(flags.env_type,
                                                       flags.env_name)
        self.objective_size = Environment.get_objective_size(
            flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size,
                                          self.objective_size,
                                          -1,
                                          flags.use_lstm,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          0.0,
                                          0.0,
                                          "/cpu:0",
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type,
            flags.env_name,
            env_args={
                'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene
            })
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0
Example 6
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        name = 'UNREAL' if flags.segnet == 0 else "A3C ErfNet"
        pygame.display.set_caption(name)

        env_config = sim_config.get(flags.env_name)
        self.image_shape = [
            env_config.get('height', 88),
            env_config.get('width', 88)
        ]
        segnet_param_dict = {'segnet_mode': flags.segnet}
        is_training = tf.placeholder(tf.bool, name="training")
        map_file = env_config.get('objecttypes_file', '../../objectTypes.csv')
        self.label_mapping = pd.read_csv(map_file, sep=',', header=0)
        self.get_col_index()

        self.action_size = Environment.get_action_size(flags.env_type,
                                                       flags.env_name)
        self.objective_size = Environment.get_objective_size(
            flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size,
                                          self.objective_size,
                                          -1,
                                          flags.use_lstm,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          0.0,
                                          0.0,
                                          "/gpu:0",
                                          segnet_param_dict=segnet_param_dict,
                                          image_shape=self.image_shape,
                                          is_training=is_training,
                                          n_classes=flags.n_classes,
                                          segnet_lambda=flags.segnet_lambda,
                                          dropout=flags.dropout,
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type,
            flags.env_name,
            flags.termination_time_sec,
            env_args={
                'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene
            })
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0
Example 7
 def __init__(self):
     self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
     self.action_size = Environment.get_action_size()
     self.global_network = UnrealModel(self.action_size,
                                       -1,
                                       "/cpu:0",
                                       for_display=True)
     self.env = Environment.create_environment()
     self.value_history = ValueHistory()
     self.state_history = StateHistory()
     self.ep_reward = 0
     self.mazemap = MazeMap()
Example 8
class Evaluate(object):
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0,
                                      0.0,
                                      "/cpu:0",
                                      for_display=True)
    self.environment = Environment.create_environment(flags.env_type, flags.env_name,
                                                      env_args={'episode_schedule': flags.split,
                                                                'log_action_trace': flags.log_action_trace,
                                                                'max_states_per_scene': flags.episodes_per_scene,
                                                                'episodes_per_scene_test': flags.episodes_per_scene})
    self.episode_reward = 0

  def update(self, sess):
    self.process(sess)

  def is_done(self):
    return self.environment.is_all_scheduled_episodes_done()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def process(self, sess):
    last_action = self.environment.last_action
    last_reward = np.clip(self.environment.last_reward, -1, 1)
    last_action_reward = ExperienceFrame.concat_action_and_reward(last_action, self.action_size,
                                                                  last_reward, self.environment.last_state)
    
    if not flags.use_pixel_change:
      pi_values, v_value = self.global_network.run_base_policy_and_value(sess,
                                                                         self.environment.last_state,
                                                                         last_action_reward)
    else:
      pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(sess,
                                                                                self.environment.last_state,
                                                                                last_action_reward)
    action = self.choose_action(pi_values)
    state, reward, terminal, pixel_change = self.environment.process(action)
    self.episode_reward += reward
  
    if terminal:
      self.environment.reset()
      self.episode_reward = 0
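
A minimal driver sketch for the Evaluate class above (not part of the original snippet; it assumes the graph is built by the constructor and that weights would normally be restored from a checkpoint, e.g. with tf.train.Saver as in Example 10):

evaluator = Evaluate()                        # builds the global UnrealModel graph
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# a saver.restore(sess, checkpoint_path) call would normally go here
while not evaluator.is_done():                # until all scheduled episodes finish
    evaluator.update(sess)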
Example 9
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config['height'], env_config['width']]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")  # its value is driven by the for_display param of UnrealModel

    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0, #flags.pixel_change_lambda
                                      0.0, #flags.entropy_beta
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(flags.env_type, flags.env_name, flags.termination_time_sec,
                                                      env_args={'episode_schedule': flags.split,
                                                                'log_action_trace': flags.log_action_trace,
                                                                'max_states_per_scene': flags.episodes_per_scene,
                                                                'episodes_per_scene_test': flags.episodes_per_scene})

    self.global_network.prepare_loss()

    self.total_loss = []
    self.segm_loss = []
    self.episode_reward = [0]
    self.episode_roomtype = []
    self.roomType_dict  = {}
    self.segnet_class_dict = {}
    self.success_rate = []
    self.batch_size = 20
    self.batch_cur_num = 0
    self.batch_prev_num = 0
    self.batch_si = []
    self.batch_sobjT = []
    self.batch_a = []
    self.batch_reward = []
Example 10
def main(args):
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type,
                                                    flags.env_name)
    global_network = UnrealModel(action_size, objective_size, -1,
                                 flags.use_lstm, flags.use_pixel_change,
                                 flags.use_value_replay,
                                 flags.use_reward_prediction, 0.0, 0.0,
                                 "/cpu:0")  # use CPU for weight visualize tool

    sess = tf.Session()

    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    vars = {}
    var_list = global_network.get_vars()
    for v in var_list:
        vars[v.name] = v

    W_conv1 = sess.run(vars['net_-1/base_conv/W_base_conv1:0'])
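    # 'net_-1' is the variable scope of the global network (thread index -1);
    # W_base_conv1 presumably has shape (k, k, 3, 16), which matches the 3x16
    # grid of input-channel/output-channel filter images plotted below.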

    # show graph of W_conv1
    fig, axes = plt.subplots(3,
                             16,
                             figsize=(12, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(3 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()
Example 11
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        pygame.display.set_caption('UNREAL')

        self.action_size = Environment.get_action_size()
        self.global_network = UnrealModel(self.action_size,
                                          -1,
                                          "/cpu:0",
                                          for_display=True)
        self.environment = Environment.create_environment()
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0
Example 12
def get_prediction(history, action, env_name, check_dir):
    action_size = Environment.get_action_size(env_type, env_name)
    global_network = UnrealModel(
        action_size,
        -1,
        False,  # flags.use_pixel_change
        False,  # flags.use_value_replay
        False,  # flags.use_reward_prediction
        True,   # flags.use_future_reward_prediction
        True,   # flags.use_autoencoder
        .0,
        .0,
        "/cpu:0")

    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(check_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    feed_dict = {
        global_network.frp_input: np.zeros((4, 84, 84, 3)),
        global_network.frp_action_input: np.zeros((1, action_size))
    }
    encoder_output = sess.run(global_network.encoder_output, feed_dict)
    print(encoder_output)
Example 13
class Display(object):
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        name = 'UNREAL' if flags.segnet == 0 else "A3C ErfNet"
        pygame.display.set_caption(name)

        env_config = sim_config.get(flags.env_name)
        self.image_shape = [
            env_config.get('height', 88),
            env_config.get('width', 88)
        ]
        segnet_param_dict = {'segnet_mode': flags.segnet}
        is_training = tf.placeholder(tf.bool, name="training")
        map_file = env_config.get('objecttypes_file', '../../objectTypes.csv')
        self.label_mapping = pd.read_csv(map_file, sep=',', header=0)
        self.get_col_index()

        self.action_size = Environment.get_action_size(flags.env_type,
                                                       flags.env_name)
        self.objective_size = Environment.get_objective_size(
            flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size,
                                          self.objective_size,
                                          -1,
                                          flags.use_lstm,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          0.0,
                                          0.0,
                                          "/gpu:0",
                                          segnet_param_dict=segnet_param_dict,
                                          image_shape=self.image_shape,
                                          is_training=is_training,
                                          n_classes=flags.n_classes,
                                          segnet_lambda=flags.segnet_lambda,
                                          dropout=flags.dropout,
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type,
            flags.env_name,
            flags.termination_time_sec,
            env_args={
                'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene
            })
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0

    def update(self, sess):
        self.surface.fill(BLACK)
        self.process(sess)
        pygame.display.update()

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def scale_image(self, image, scale):
        return image.repeat(scale, axis=0).repeat(scale, axis=1)

    def draw_text(self, str, left, top, color=WHITE):
        text = self.font.render(str, True, color, BLACK)
        text_rect = text.get_rect()
        text_rect.left = left
        text_rect.top = top
        self.surface.blit(text, text_rect)

    def draw_center_text(self, str, center_x, top):
        text = self.font.render(str, True, WHITE, BLACK)
        text_rect = text.get_rect()
        text_rect.centerx = center_x
        text_rect.top = top
        self.surface.blit(text, text_rect)

    def show_pixel_change(self, pixel_change, left, top, rate, label):
        """
    Show pixel change
    """
        if "PC" in label:
            pixel_change_ = np.clip(pixel_change * 255.0 * rate, 0.0, 255.0)
            data = pixel_change_.astype(np.uint8)
            data = np.stack([data for _ in range(3)], axis=2)
            data = self.scale_image(data, 4)
            #print("PC shape", data.shape)
            image = pygame.image.frombuffer(data, (20 * 4, 20 * 4), 'RGB')
        else:
            pixel_change = self.scale_image(pixel_change, 2)
            #print("Preds shape", pixel_change.shape)
            image = pygame.image.frombuffer(pixel_change.astype(
                np.uint8), (self.image_shape[0] * 2, self.image_shape[1] * 2),
                                            'RGB')
        self.surface.blit(image, (2 * left + 16 + 8, 2 * top + 16 + 8))
        self.draw_center_text(label, 2 * left + 200 / 2, 2 * top + 200)

    def show_policy(self, pi):
        """
    Show action probability.
    """
        start_x = 10

        y = 150

        for i in range(len(pi)):
            width = pi[i] * 100
            pygame.draw.rect(self.surface, WHITE,
                             (2 * start_x, 2 * y, 2 * width, 2 * 10))
            y += 20
        self.draw_center_text("PI", 2 * 50, 2 * y)

    def show_image(self, state):
        """
    Show input image
    """
        state_ = state * 255.0
        data = state_.astype(np.uint8)
        data = self.scale_image(data, 2)
        image = pygame.image.frombuffer(
            data, (self.image_shape[0] * 2, self.image_shape[1] * 2), 'RGB')
        self.surface.blit(image, (8 * 2, 8 * 2))
        self.draw_center_text("input", 2 * 50, 2 * 100)

    def show_value(self):
        if self.value_history.is_empty:
            return

        min_v = float("inf")
        max_v = float("-inf")

        values = self.value_history.values

        for v in values:
            min_v = min(min_v, v)
            max_v = max(max_v, v)

        top = 150 * 2
        left = 150 * 2
        width = 100 * 2
        height = 100 * 2
        bottom = top + width
        right = left + height

        d = max_v - min_v
        last_r = 0.0
        for i, v in enumerate(values):
            r = (v - min_v) / d
            if i > 0:
                x0 = i - 1 + left
                x1 = i + left
                y0 = bottom - last_r * height
                y1 = bottom - r * height
                pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1)
            last_r = r

        pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1)
        pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1)
        pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1)
        pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom),
                         1)

        self.draw_center_text("V", left + width / 2, bottom + 10)

    def show_reward_prediction(self, rp_c, reward):
        start_x = 310
        reward_index = 0
        if reward == 0:
            reward_index = 0
        elif reward > 0:
            reward_index = 1
        elif reward < 0:
            reward_index = 2

        y = 150

        labels = ["0", "+", "-"]

        for i in range(len(rp_c)):
            width = rp_c[i] * 100

            if i == reward_index:
                color = RED
            else:
                color = WHITE
            pygame.draw.rect(self.surface, color,
                             (2 * start_x + 2 * 15, 2 * y, 2 * width, 2 * 10))
            self.draw_text(labels[i], 2 * start_x, 2 * y - 2 * 1, color)
            y += 20

        self.draw_center_text("RP", 2 * start_x + 2 * 100 / 2, y)

    def show_reward(self):
        self.draw_text("REWARD: {:.4}".format(float(self.episode_reward)), 300,
                       2 * 10)

    def process(self, sess):
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        #sess.run(tf.initialize_all_variables())

        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward,
            self.environment.last_state)
        preds = None
        mode = "segnet" if flags.segnet >= 2 else ""
        mode = ""  #don't want preds
        if not flags.use_pixel_change:
            pi_values, v_value, preds = self.global_network.run_base_policy_and_value(
                sess,
                self.environment.last_state,
                last_action_reward,
                mode=mode)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.environment.last_state, last_action_reward)

        #print(preds)
        self.value_history.add_value(v_value)

        prev_state = self.environment.last_state

        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = self.environment.process(
            action)
        self.episode_reward += reward

        if terminal:
            self.environment.reset()
            self.episode_reward = 0

        self.show_image(state['image'])
        self.show_policy(pi_values)
        self.show_value()
        self.show_reward()

        if not flags.use_pixel_change:
            if preds is not None:
                self.show_pixel_change(self.label_to_rgb(preds), 100, 0, 3.0,
                                       "Preds")
                self.show_pixel_change(self.label_to_rgb(state['objectType']),
                                       200, 0, 0.4, "Segm Mask")
        else:
            self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
            self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

        if flags.use_reward_prediction:
            if self.state_history.is_full:
                rp_c = self.global_network.run_rp_c(sess,
                                                    self.state_history.states)
                self.show_reward_prediction(rp_c, reward)

        self.state_history.add_state(state)

    def get_frame(self):
        data = self.surface.get_buffer().raw
        return data

    def get_col_index(self):
        ind_col = self.label_mapping[["index", "color"]].values
        index = ind_col[:, 0].astype(np.int)
        self.index, ind = np.unique(index, return_index=True)
        self.col = np.array([[int(x) for x in col.split('_')]
                             for col in ind_col[ind, 1]])

    def label_to_rgb(self, labels):
        #print(self.col)
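        # Match each label against self.index, pick the corresponding RGB colour
        # from self.col, and reshape back to the label map's shape plus a colour axis.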
        rgb_img = self.col[np.where(
            self.index[np.newaxis, :] == labels.ravel()[:, np.newaxis]
        )[1]].reshape(labels.shape + (3,))
        return rgb_img
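
A hypothetical pygame main loop for the Display class above (not from the original source; the window size and frame rate are illustrative, and flags must be configured as in the snippet):

display = Display((440, 400))
sess = tf.Session()
clock = pygame.time.Clock()
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
    display.update(sess)          # fill, process one step, flip the display
    clock.tick(15)                # cap at roughly 15 frames per second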
Example 14
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_lstm, use_pixel_change, use_value_replay,
                 use_reward_prediction, pixel_change_lambda, entropy_beta,
                 local_t_max, n_step_TD, gamma, gamma_pc,
                 experience_history_size, max_global_time_step, device,
                 segnet_param_dict, image_shape, is_training, n_classes,
                 random_state, termination_time, segnet_lambda, dropout):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_lstm = use_lstm
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.n_step_TD = n_step_TD
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.objective_size = Environment.get_objective_size(
            env_type, env_name)

        self.segnet_param_dict = segnet_param_dict
        self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None)

        self.is_training = is_training
        self.n_classes = n_classes
        self.segnet_lambda = segnet_lambda

        self.run_metadata = tf.RunMetadata()
        self.many_runs_timeline = TimeLiner()

        self.random_state = random_state
        self.termination_time = termination_time
        self.dropout = dropout

        try:
            self.local_network = UnrealModel(
                self.action_size,
                self.objective_size,
                thread_index,
                use_lstm,
                use_pixel_change,
                use_value_replay,
                use_reward_prediction,
                pixel_change_lambda,
                entropy_beta,
                device,
                segnet_param_dict=self.segnet_param_dict,
                image_shape=image_shape,
                is_training=is_training,
                n_classes=n_classes,
                segnet_lambda=self.segnet_lambda,
                dropout=dropout)

            self.local_network.prepare_loss()

            self.apply_gradients = grad_applier.minimize_local(
                self.local_network.total_loss, global_network.get_vars(),
                self.local_network.get_vars(), self.thread_index)

            self.sync = self.local_network.sync_from(global_network)
            self.experience = Experience(self.experience_history_size,
                                         random_state=self.random_state)
            self.local_t = 0
            self.initial_learning_rate = initial_learning_rate
            self.episode_reward = 0
            # For log output
            self.prev_local_t = -1
            self.prev_local_t_loss = 0
            self.sr_size = 50
            self.success_rates = deque(maxlen=self.sr_size)
        except Exception as e:
            print(str(e))  #, flush=True)
            raise Exception(
                "Problem in Trainer {} initialization".format(thread_index))
Example 15
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_lstm, use_pixel_change, use_value_replay,
                 use_reward_prediction, pixel_change_lambda, entropy_beta,
                 local_t_max, n_step_TD, gamma, gamma_pc,
                 experience_history_size, max_global_time_step, device,
                 segnet_param_dict, image_shape, is_training, n_classes,
                 random_state, termination_time, segnet_lambda, dropout):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_lstm = use_lstm
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.n_step_TD = n_step_TD
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.objective_size = Environment.get_objective_size(
            env_type, env_name)

        self.segnet_param_dict = segnet_param_dict
        self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None)

        self.is_training = is_training
        self.n_classes = n_classes
        self.segnet_lambda = segnet_lambda

        self.run_metadata = tf.RunMetadata()
        self.many_runs_timeline = TimeLiner()

        self.random_state = random_state
        self.termination_time = termination_time
        self.dropout = dropout

        try:
            self.local_network = UnrealModel(
                self.action_size,
                self.objective_size,
                thread_index,
                use_lstm,
                use_pixel_change,
                use_value_replay,
                use_reward_prediction,
                pixel_change_lambda,
                entropy_beta,
                device,
                segnet_param_dict=self.segnet_param_dict,
                image_shape=image_shape,
                is_training=is_training,
                n_classes=n_classes,
                segnet_lambda=self.segnet_lambda,
                dropout=dropout)

            self.local_network.prepare_loss()

            self.apply_gradients = grad_applier.minimize_local(
                self.local_network.total_loss, global_network.get_vars(),
                self.local_network.get_vars(), self.thread_index)

            self.sync = self.local_network.sync_from(global_network)
            self.experience = Experience(self.experience_history_size,
                                         random_state=self.random_state)
            self.local_t = 0
            self.initial_learning_rate = initial_learning_rate
            self.episode_reward = 0
            # For log output
            self.prev_local_t = -1
            self.prev_local_t_loss = 0
            self.sr_size = 50
            self.success_rates = deque(maxlen=self.sr_size)
        except Exception as e:
            print(str(e))  #, flush=True)
            raise Exception(
                "Problem in Trainer {} initialization".format(thread_index))

    def prepare(self, termination_time=50.0, termination_dist_value=-10.0):
        self.environment = Environment.create_environment(
            self.env_type,
            self.env_name,
            self.termination_time,
            thread_index=self.thread_index)

    def stop(self):
        self.environment.stop()

    def _anneal_learning_rate(self, global_time_step):
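        # Anneal the learning rate linearly from initial_learning_rate to 0
        # over max_global_time_step global steps.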
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return self.random_state.choice(len(pi_values), p=pi_values)

    def _record_one(self, sess, summary_writer, summary_op, score_input, score,
                    global_t):
        if self.thread_index >= 0:
            summary_str = sess.run(summary_op, feed_dict={score_input: score})
            for sum_wr in summary_writer:
                sum_wr.add_summary(summary_str, global_t)

    def _record_all(self, sess, summary_writer, summary_op, dict_input,
                    dict_eval, global_t):
        if self.thread_index >= 0:
            assert set(dict_input.keys()) == set(dict_eval.keys()), \
                "Key mismatch: {} vs {}".format(dict_input.keys(), dict_eval.keys())

            feed_dict = {}
            for key in dict_input.keys():
                feed_dict.update({dict_input[key]: dict_eval[key]})
            summary_str = sess.run(summary_op, feed_dict=feed_dict)
            for sum_wr in summary_writer:
                sum_wr.add_summary(summary_str, global_t)

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
    Fill experience buffer until buffer is full.
    """
        #print("Start experience filling", flush=True)
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, prev_state)

        #print("Local network run base policy, value!", flush=True)
        pi_, _, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action, flag=0)

        frame = ExperienceFrame(
            {
                key: val
                for key, val in prev_state.items() if 'objectType' not in key
            }, reward, action, terminal, pixel_change, last_action,
            last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))  #, flush=True)
            # print("### Experience : {}".format(self.experience.get_debug_string()))

    def _process_base(self, sess, global_t, summary_writer, summary_op_dict,
                      summary_dict):  #, losses_input):
        # [Base A3C]
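        # On-policy A3C rollout of up to n_step_TD steps; discounted returns and
        # advantages are computed backwards from the bootstrap value R further below.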
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = None
        if self.use_lstm:
            start_lstm_state = self.local_network.base_lstm_state_out

        mode = "segnet" if self.segnet_mode >= 2 else ""
        # t_max times loop
        flag = 0
        for _ in range(self.n_step_TD):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward,
                self.environment.last_state)

            pi_, value_, losses = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward, mode)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("Trainer {}>>> Local step {}:".format(
                    self.thread_index, self.local_t))
                print("Trainer {}>>> pi={}".format(self.thread_index, pi_))
                print("Trainer {}>>> V={}".format(self.thread_index, value_))
                flag = 1

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action, flag=flag)
            frame = ExperienceFrame(
                {
                    key: val
                    for key, val in prev_state.items()
                    if 'objectType' not in key
                }, reward, action, terminal, pixel_change, last_action,
                last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            # Use to know about Experience collection
            #print(self.experience.get_debug_string())

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if terminal:
                terminal_end = True
                print("Trainer {}>>> score={}".format(
                    self.thread_index, self.episode_reward))  #, flush=True)

                summary_dict['values'].update(
                    {'score_input': self.episode_reward})

                success = 1 if self.environment._last_full_state[
                    "success"] else 0
                #print("Type:", type(self.environment._last_full_state["success"]), len(self.success_rates), success)
                self.success_rates.append(success)
                summary_dict['values'].update({
                    'sr_input':
                    np.mean(self.success_rates)
                    if len(self.success_rates) == self.sr_size else 0
                })

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                if flag:
                    flag = 0
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []
        batch_sobjT = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si['image'])
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)
            if self.segnet_param_dict["segnet_mode"] >= 2:
                batch_sobjT.append(si['objectType'])

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()
        batch_sobjT.reverse()

        #print(np.unique(batch_sobjT))

        ## NOTE: possible mathematical error in this A3C variant: only the last
        ## values should be used for the base update, or they should be aggregated
        ## with the last one computed.

        return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        # [pixel change]
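        # Auxiliary pixel-control task: accumulate n-step discounted returns of the
        # 20x20 pixel-change maps, bootstrapped from the max of the PC Q-head.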
        # Sample 20+1 frame (+1 for last next state)
        #print(">>> Process run!", flush=True)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        # pc_experience_frames.reverse()
        pc_experience_frames = pc_experience_frames[::-1]
        #print(">>> Process ran!", flush=True)

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        #print(">>> Process run!", flush=True)

        for frame in pc_experience_frames[1:]:

            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state['image'])
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        #print(">>> Process ended!", flush=True)
        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        # [Value replay]
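        # Auxiliary value-replay task: recompute discounted returns over a replayed
        # sequence and use them as regression targets for the value head.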
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state['image'])
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    def _process_rp(self):
        # [Reward prediction]
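        # Auxiliary reward-prediction task: from 3 consecutive frames, classify the
        # reward of the following frame as zero, positive or negative (one-hot target).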
        rp_experience_frames = self.experience.sample_rp_sequence()
        # 4 frames

        batch_rp_si = []
        batch_rp_c = []

        for i in range(3):
            batch_rp_si.append(rp_experience_frames[i].state['image'])

        # one hot vector for target reward
        r = rp_experience_frames[3].reward
        rp_c = [0.0, 0.0, 0.0]
        if -1e-10 < r < 1e-10:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c

    def process(self, sess, global_t, summary_writer, summary_op_dict,
                score_input, sr_input, eval_input, entropy_input,
                term_global_t, losses_input):

        if self.prev_local_t == -1 and self.segnet_mode >= 2:
            self.prev_local_t = 0
            sess.run(self.local_network.reset_evaluation_vars)
        # Fill experience replay buffer
        #print("Inside train process of thread!", flush=True)
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0, None

        start_local_t = self.local_t
        episode_score = None

        cur_learning_rate = self._anneal_learning_rate(global_t)

        #print("Weights copying!", flush=True)
        # Copy weights from shared to local
        sess.run(self.sync)
        #print("Weights copied successfully!", flush=True)

        summary_dict = {'placeholders': {}, 'values': {}}
        summary_dict['placeholders'].update(losses_input)

        # [Base]
        #print("[Base]", flush=True)
        batch_si, batch_sobjT, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
              self._process_base(sess,
                                 global_t,
                                 summary_writer,
                                 summary_op_dict,
                                 summary_dict)
        if summary_dict['values'].get('score_input', None) is not None:
            self._record_one(sess, summary_writer,
                             summary_op_dict['score_input'], score_input,
                             summary_dict['values']['score_input'], global_t)
            self._record_one(sess, summary_writer, summary_op_dict['sr_input'],
                             sr_input, summary_dict['values']['sr_input'],
                             global_t)
            #self._record_one(sess, summary_writer, summary_op_dict['term_global_t'], term_global_t,
            #                 global_t, global_t)
            #summary_writer[0].flush()
            # summary_writer[1].flush()
            # Return advanced local step size
            episode_score = summary_dict['values'].get('score_input', None)
            summary_dict['values'] = {}

        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            # [common]
            self.learning_rate_input: cur_learning_rate,
            self.is_training: True
        }

        if self.use_lstm:
            feed_dict[
                self.local_network.base_initial_lstm_state] = start_lstm_state

        if self.segnet_param_dict["segnet_mode"] >= 2:
            feed_dict[self.local_network.base_segm_mask] = batch_sobjT

        #print("[Pixel change]", flush=True)
        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        #print("[Value replay]", flush=True)
        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        #print("[Reward prediction]", flush=True)
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c = self._process_rp()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)
            #print(len(batch_rp_c), batch_rp_c)

        grad_check = None
        #if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
        #  grad_check = [tf.add_check_numerics_ops()]
        #print("Applying gradients in train!", flush=True)
        # Calculate gradients and copy them to global network.
        out_list = [self.apply_gradients]
        out_list += [
            self.local_network.total_loss, self.local_network.base_loss,
            self.local_network.policy_loss, self.local_network.value_loss,
            self.local_network.entropy
        ]
        if self.segnet_mode >= 2:
            out_list += [self.local_network.decoder_loss]
            out_list += [self.local_network.regul_loss]
        if self.use_pixel_change:
            out_list += [self.local_network.pc_loss]
        if self.use_value_replay:
            out_list += [self.local_network.vr_loss]
        if self.use_reward_prediction:
            out_list += [self.local_network.rp_loss]
        if self.segnet_mode >= 2:
            out_list += [self.local_network.update_evaluation_vars]
            if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
                out_list += [self.local_network.evaluation]

        import time

        now = time.time()
        with tf.control_dependencies(grad_check):
            if GPU_LOG:
                return_list = sess.run(out_list,
                                       feed_dict=feed_dict,
                                       options=run_options,
                                       run_metadata=self.run_metadata)
            else:
                return_list = sess.run(out_list,
                                       feed_dict=feed_dict,
                                       options=run_options)

        if time.time() - now > 30.0:
            print("Too much time spent on sess.run: check tensorflow")  #, flush=True)
            sys.exit(0)
            raise ValueError("More than 30 seconds spent on a single tensorflow update!")

        gradients_tuple, total_loss, base_loss, policy_loss, value_loss, entropy = return_list[:6]
        grad_norm = gradients_tuple[1]
        return_list = return_list[6:]
        return_string = "Trainer {}>>> Total loss: {}, Base loss: {}\n".format(
            self.thread_index, total_loss, base_loss)
        return_string += "\t\tPolicy loss: {}, Value loss: {}, Grad norm: {}\nEntropy: {}\n".format(
            policy_loss, value_loss, grad_norm, entropy)
        losses_eval = {
            'all/total_loss': total_loss,
            'all/base_loss': base_loss,
            'all/policy_loss': policy_loss,
            'all/value_loss': value_loss,
            'all/loss/grad_norm': grad_norm
        }
        if self.segnet_mode >= 2:
            decoder_loss, l2_loss = return_list[:2]
            return_list = return_list[2:]
            return_string += "\t\tDecoder loss: {}, L2 weights loss: {}\n".format(
                decoder_loss, l2_loss)
            losses_eval.update({
                'all/decoder_loss': decoder_loss,
                'all/l2_weights_loss': l2_loss
            })
        if self.use_pixel_change:
            pc_loss = return_list[0]
            return_list = return_list[1:]
            return_string += "\t\tPC loss: {}\n".format(pc_loss)
            losses_eval.update({'all/pc_loss': pc_loss})
        if self.use_value_replay:
            vr_loss = return_list[0]
            return_list = return_list[1:]
            return_string += "\t\tVR loss: {}\n".format(vr_loss)
            losses_eval.update({'all/vr_loss': vr_loss})
        if self.use_reward_prediction:
            rp_loss = return_list[0]
            return_list = return_list[1:]
            return_string += "\t\tRP loss: {}\n".format(rp_loss)
            losses_eval.update({'all/rp_loss': rp_loss})
        if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
            if self.segnet_mode >= 2:
                return_string += "\t\tmIoU: {}\n".format(return_list[-1])

        summary_dict['values'].update(losses_eval)

        # Printing losses
        if self.local_t - self.prev_local_t_loss >= LOSS_AND_EVAL_LOG_INTERVAL:
            if self.segnet_mode >= 2:
                self._record_one(sess, summary_writer,
                                 summary_op_dict['eval_input'], eval_input,
                                 return_list[-1], global_t)
            self._record_one(sess, summary_writer, summary_op_dict['entropy'],
                             entropy_input, entropy, global_t)
            # summary_writer[0].flush()
            # summary_writer[1].flush()
            print(return_string)
            self.prev_local_t_loss += LOSS_AND_EVAL_LOG_INTERVAL

        if GPU_LOG:
            fetched_timeline = timeline.Timeline(self.run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            self.many_runs_timeline.update_timeline(chrome_trace)

        self._print_log(global_t)

        #Recording score and losses
        self._record_all(sess, summary_writer, summary_op_dict['losses_input'],
                         summary_dict['placeholders'], summary_dict['values'],
                         global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, episode_score
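    # Hedged aside (not part of this repo): the positional unpacking of return_list above has to
    # mirror exactly the order in which out_list was assembled, which is easy to break when a new
    # auxiliary loss is added. sess.run also accepts a dict of fetches and returns results under
    # the same keys; a minimal standalone sketch of that pattern:
    #
    #   import tensorflow as tf
    #
    #   demo_policy_loss = tf.constant(0.1)
    #   demo_value_loss = tf.constant(0.2)
    #   fetches = {"policy_loss": demo_policy_loss, "value_loss": demo_value_loss}
    #
    #   with tf.Session() as demo_sess:
    #       out = demo_sess.run(fetches)
    #       print(out["policy_loss"], out["value_loss"])  # 0.1 0.2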
Esempio n. 16
0
class Tester(object):
    def __init__(self):
        self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
        self.action_size = Environment.get_action_size()
        self.global_network = UnrealModel(self.action_size,
                                          -1,
                                          "/cpu:0",
                                          for_display=True)
        self.env = Environment.create_environment()
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.ep_reward = 0
        self.mazemap = MazeMap()

    def process(self, sess):
        self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
        last_action = self.env.last_action
        last_reward = np.clip(self.env.last_reward, -1, 1)
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        if not USE_PIXEL_CHANGE:
            pi_values, v_value = self.global_network.run_base_policy_and_value(
                sess, self.env.last_state, last_action_reward)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.env.last_state, last_action_reward)
        self.value_history.add_value(v_value)
        action = self.choose_action(pi_values)
        state, reward, terminal, pc, vtrans, vrot = self.env.process(action)
        self.state_history.add_state(state)
        self.ep_reward += reward
        self.mazemap.update(vtrans, vrot)
        if reward > 9:  # reaching the maze goal gives a reward of 10, so the map needs to be reset
            self.mazemap.reset()
        if terminal:  # the lab environment ends an episode after 3600 frames by default, rather than signalling terminal at the maze goal
            self.env.reset()
            self.ep_reward = 0
            self.mazemap.reset()

        self.show_ob(state, 3, 3, "Observation")
        self.show_pc(pc, 100, 3, 3.0, "Pixel Change")
        if USE_PIXEL_CHANGE:
            # pc_q is only computed in the pixel-change branch above.
            self.show_pc(pc_q[:, :, action], 200, 3, 0.4, "PC Q")
        self.show_map(300, 3, "Maze Map")
        self.show_pi(pi_values)
        self.show_reward()
        self.show_rp()
        self.show_value()

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def scale_image(self, image, scale):
        return image.repeat(scale, axis=0).repeat(scale, axis=1)

    def draw_text(self, text, left, bottom, color=WHITE):
        font = cv2.FONT_HERSHEY_COMPLEX
        cv2.putText(self.img, text, (left, bottom), font, 0.35, color)

    def show_pc(self, pc, left, top, rate, label):
        pc = np.clip(pc * 255.0 * rate, 0.0, 255.0)
        data = pc.astype(np.uint8)
        data = np.stack([data for _ in range(3)], axis=2)
        data = self.scale_image(data, 4)
        h = data.shape[0]
        w = data.shape[1]
        self.img[top:top + h, left:left + w, :] = data
        self.draw_text(label, (left + 2), (top + h + 15))

    def show_map(self, left, top, label):
        maze = self.mazemap.get_map(84, 84)
        maze = (maze * 255).astype(np.uint8)
        h = maze.shape[0]
        w = maze.shape[1]
        self.img[top:top + h, left:left + w, :] = maze
        self.draw_text(label, (left + 2), (top + h + 5))

    def show_pi(self, pi):
        for i in range(len(pi)):
            width = int(pi[i] * 100)
            cv2.rectangle(self.img, (3, 113 + 15 * i), (width, 120 + 15 * i),
                          WHITE)
        self.draw_text("Policy", 20, 120 + 15 * len(pi))

    def show_ob(self, state, left, top, label):
        state = (state * 255.0).astype(np.uint8)
        h = state.shape[0]
        w = state.shape[1]
        self.img[top:top + h, left:left + w, :] = state
        self.draw_text(label, (left + 2), (top + h + 15))

    def show_value(self, left=120, top=100, height=100, width=100):
        # NOTE: process() calls show_value() with no arguments, so defaults are required here;
        # these particular layout values are assumptions.
        if self.value_history.is_empty:
            return

        min_v = float("inf")
        max_v = float("-inf")
        values = self.value_history.values

        for v in values:
            min_v = min(min_v, v)
            max_v = max(max_v, v)

        bottom = top + height
        right = left + width
        d = max_v - min_v
        last_r = 0.0
        for i, v in enumerate(values):
            # Guard against a flat value history (d == 0) to avoid division by zero.
            r = (v - min_v) / d if d > 0 else 0.5
            if i > 0:
                # cv2 points are (x, y): x runs along the history index, y is the scaled value.
                x0 = i - 1 + left
                x1 = i + left
                y0 = int(bottom - last_r * height)
                y1 = int(bottom - r * height)
                cv2.line(self.img, (x0, y0), (x1, y1), BLUE, 2)
            last_r = r

        # Plot border (points again given as (x, y)).
        cv2.line(self.img, (left, top), (left, bottom), WHITE, 1)
        cv2.line(self.img, (right, top), (right, bottom), WHITE, 1)
        cv2.line(self.img, (left, top), (right, top), WHITE, 1)
        cv2.line(self.img, (left, bottom), (right, bottom), WHITE, 1)
        self.draw_text("Q Value", 120, 215)

    def show_rp(self):
        pass

    def show_reward(self):
        self.draw_text("Reward: {}".format(int(self.ep_reward)), 10, 230)

    def get_frame(self):
        return self.img
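
# A minimal, standalone illustration of the nearest-neighbour upscaling used by scale_image
# and show_pc above: np.repeat along both axes turns every pixel into a scale x scale block.
import numpy as np

patch = np.arange(4, dtype=np.uint8).reshape(2, 2)
scaled = patch.repeat(4, axis=0).repeat(4, axis=1)
print(scaled.shape)  # (8, 8): each original pixel now covers a 4x4 block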
Esempio n. 17
0
device = "/cpu:0"
if USE_GPU:
  device = "/gpu:0"

initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
                                    INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)
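
# Hedged sketch: log_uniform (imported elsewhere in this repo) is typically a log-space
# interpolation between the two bounds; the helper below is an illustration under that
# assumption, not the repo's own implementation.
import math

def _log_uniform_sketch(lo, hi, rate):
  # rate in [0, 1]; rate=0 returns lo, rate=1 returns hi, values in between are
  # spaced uniformly in log space.
  return math.exp(math.log(lo) * (1.0 - rate) + math.log(hi) * rate)

# e.g. _log_uniform_sketch(1e-4, 1e-2, 0.5) == 1e-3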

global_t = 0

stop_requested = False
terminate_requested = False

action_size = Environment.get_action_size()
global_network = UnrealModel(action_size, -1, device)

trainers = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = RMSP_ALPHA,
                              momentum = 0.0,
                              epsilon = RMSP_EPSILON,
                              clip_norm = GRAD_NORM_CLIP,
                              device = device)

for i in range(PARALLEL_SIZE):
  trainer = Trainer(i,
                    global_network,
Esempio n. 18
0

device = "/cpu:0"
if USE_GPU:
    device = "/gpu:0"

initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)

global_t = 0

stop_requested = False
terminate_requested = False

action_size = Environment.get_action_size()
global_network = UnrealModel(action_size, -1, device)

trainers = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

for i in range(PARALLEL_SIZE):
    trainer = Trainer(i,
                      global_network,
Esempio n. 19
0
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               env_type,
               env_name,
               use_pixel_change,
               use_value_replay,
               use_reward_prediction,
               pixel_change_lambda,
               entropy_beta,
               vf_coeff,
               local_t_max,
               gamma,
               gamma_pc,
               experience_history_size,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    
    self.local_network = UnrealModel(self.action_size,
                                     thread_index,
                                     use_pixel_change,
                                     use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda,
                                     entropy_beta,
                                     vf_coeff,
                                     device)
    #self.local_network.prepare_loss()

    #adding things for acktr
    self.local_network.prepare_loss_acktr()

    self.optim = optim = KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

    update_stats_op = optim.compute_and_apply_stats(self.local_network.fisher_loss, var_list=self.local_network.params)
    train_op, q_runner = optim.apply_gradients(list(zip(self.local_network.grads,self.local_network.params)))
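    # Hedged notes on the K-FAC pattern above (based on the OpenAI baselines ACKTR optimizer,
    # which this KfacOptimizer call resembles): update_stats_op maintains the running Fisher
    # statistics used to build the preconditioner, and q_runner is a queue runner that, once
    # started (e.g. with a tf.train.Coordinator), performs those statistics updates
    # asynchronously alongside train_op. Note also that `async` became a reserved keyword in
    # Python 3.7, so this call only parses on older Pythons (or with that argument renamed).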

    

    #update the rest according to normal stuff
    self.apply_gradients = grad_applier.minimize_local(self.local_network.intermediate_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars())
    
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
Esempio n. 21
0
class Application(object):
  def __init__(self):
    pass
  
  def train_function(self, parallel_index, preparing):
    """ Train each environment. """
    
    trainer = self.trainers[parallel_index]
    try:
      if preparing:
        trainer.prepare()
    except Exception as e:
      print(str(e))#, flush=True)
      raise Exception("Problem with trainer environment creation")

    # set start_time
    trainer.set_start_time(self.start_time)
    print("Trainer ", parallel_index, " process (re)start!")

    prev_print_t = 0
    while True:
      if (self.global_t - prev_print_t >= 1000) or (not prev_print_t and self.global_t != prev_print_t):
        prev_print_t = self.global_t
        print("Trainer {0}>>> stop_requested: {1}, terminate_requested: {2}, global_t: {3}".format(
            parallel_index, self.stop_requested, self.terminate_requested, self.global_t))
        #if parallel_index == 0:
        #    print("Graph size is {}".format(
        #        len([n.name for n in self.sess.graph.as_graph_def().node])))

      if self.stop_requested:
        print("Trainer ", parallel_index, ": stop requested!")
        break
      if self.terminate_requested:
        print("Trainer ", parallel_index, ": terminate_requested => process stop!")
        trainer.stop()
        break
      if self.global_t > flags.max_time_step:
        print("Trainer ", parallel_index, ": end of training!")
        trainer.stop()
        break
      if parallel_index == 0 and self.global_t > self.next_save_steps:
        # Save checkpoint
        self.save()


      try:
        diff_global_t, score = trainer.process(self.sess,
                                            self.global_t,
                                            self.summary_writer[parallel_index],
                                            self.summary_op_dict,
                                            self.score_input,
                                            self.sr_input,
                                            self.mIoU_input,
                                            self.entropy_input,
                                            self.term_global_t,
                                            self.losses_input)

        self.global_t += diff_global_t
        # if parallel_index == 0 and score is not None:
        #     #print("Got score", flush=True)
        #     self.last_scores += [score]
        #     if len(self.last_scores) >= 50:
        #         print("Last scores len >= 50")
        #         cur_score = np.mean(self.last_scores)
        #         print("Best score: {}, Cur score: {}".format(self.best_score, cur_score))
        #         self.last_scores = self.last_scores[-10:]
        #         if cur_score > self.best_score:
        #             self.best_score = cur_score
        #             self.save(name="best-checkpoint")

        # [n.name for n in tf.get_default_graph().as_graph_def().node]
        # [op for op in tf.get_default_graph().get_operations()] #op.name


        # GPU logging memory
        # prev_runs_t = 0
        # if self.global_t - prev_runs_t > 1000 or prev_runs_t == 0:
        #   prev_runs_t = self.global_t
        #   trainer.many_runs_timeline.save(os.path.join(flags.checkpoint_dir,
        #                                                'timeline_{}_merged_{}_runs.json'.format(
        #                                                  parallel_index, self.global_t)))
        # else:
        #   trainer.many_runs_timeline.save(os.path.join(flags.checkpoint_dir,
        #                                                'timeline_{}_merged_{}_runs.json'.format(
        #                                                  parallel_index, prev_runs_t)))
        #

      except Exception as e:
        print(traceback.format_exc())#, flush=True)
        trainer.stop()
        ## Let it be here!!!
        print("Trainer ", parallel_index, " process Error!")#, flush=True)
        break

    print("Trainer ", parallel_index, " after a while return!")

  def run(self):
    device = "/cpu:0"
    if USE_GPU:
      device = "/gpu:0"

    self.print_flags_info()

    if flags.segnet == -1:
      with open(flags.segnet_config) as f:
        self.config = json.load(f)

      self.num_classes = self.config["NUM_CLASSES"]
      self.use_vgg = self.config["USE_VGG"]

      if self.use_vgg is False:
        self.vgg_param_dict = None
        print("No VGG path in config, so learning from scratch")
      else:
        self.vgg16_npy_path = self.config["VGG_FILE"]
        self.vgg_param_dict = np.load(self.vgg16_npy_path, encoding='latin1').item()
        print("VGG parameter loaded")

      self.bayes = self.config["BAYES"]
      segnet_param_dict = {'segnet_mode': flags.segnet, 'vgg_param_dict': self.vgg_param_dict, 'use_vgg': self.use_vgg,
                       'num_classes': self.num_classes, 'bayes': self.bayes}
    else: # 0, 1, 2, 3
      segnet_param_dict = {'segnet_mode': flags.segnet}

    if flags.env_type != 'indoor':
        env_config = {}
    else:
        env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config.get('height', 84), env_config.get('width', 84)]
    self.map_file = env_config.get('objecttypes_file', '../../objectTypes_1x.csv')
    
    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)
    self.global_t = 0
    
    self.stop_requested = False
    self.terminate_requested = False
    
    action_size = Environment.get_action_size(flags.env_type,
                                              flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

    is_training = tf.placeholder(tf.bool, name="training")

    self.random_state = np.random.RandomState(seed=env_config.get("seed", 0xA3C))

    print("Global network initializing!")#, flush=True)

    self.global_network = UnrealModel(action_size,
                                      objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta,
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout)
    self.trainers = []
    
    learning_rate_input = tf.placeholder("float")

    
    grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                                  decay = flags.rmsp_alpha,
                                  momentum = 0.0,
                                  epsilon = flags.rmsp_epsilon,
                                  clip_norm = flags.grad_norm_clip,
                                  device = device)
    
    for i in range(flags.parallel_size):
      trainer = Trainer(i,
                        self.global_network,
                        initial_learning_rate,
                        learning_rate_input,
                        grad_applier,
                        flags.env_type,
                        flags.env_name,
                        flags.use_lstm,
                        flags.use_pixel_change,
                        flags.use_value_replay,
                        flags.use_reward_prediction,
                        flags.pixel_change_lambda,
                        flags.entropy_beta,
                        flags.local_t_max,
                        flags.n_step_TD,
                        flags.gamma,
                        flags.gamma_pc,
                        flags.experience_history_size,
                        flags.max_time_step,
                        device,
                        segnet_param_dict=segnet_param_dict,
                        image_shape=self.image_shape,
                        is_training=is_training,
                        n_classes = flags.n_classes,
                        random_state=self.random_state,
                        termination_time=flags.termination_time_sec,
                        segnet_lambda=flags.segnet_lambda,
                        dropout=flags.dropout)
      self.trainers.append(trainer)


    self.last_scores = []
    self.best_score = -1.0
    
    # prepare session
    config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    # Wrap sess.run for debugging messages!
    def run_(*args, **kwargs):
      #print(">>> RUN!", args[0] if args else None)#, flush=True)
      return self.sess.__run(*args, **kwargs)  # getattr(self, "__run")(self, *args, **kwargs)
    self.sess.__run, self.sess.run = self.sess.run, run_

    self.sess.run(tf.global_variables_initializer())

    # summary for tensorboard
    self.score_input = tf.placeholder(tf.float32)
    self.sr_input = tf.placeholder(tf.float32)
    self.mIoU_input = tf.placeholder(tf.float32)
    self.term_global_t = tf.placeholder(tf.int32)

    self.losses_input = {}

    self.total_loss = tf.placeholder(tf.float32,  name='total_loss')
    self.losses_input.update({'all/total_loss': self.total_loss})

    self.base_loss = tf.placeholder(tf.float32, name='base_loss')
    self.losses_input.update({'all/base_loss': self.base_loss})

    self.policy_loss = tf.placeholder(tf.float32,  name='policy_loss')
    self.losses_input.update({'all/policy_loss': self.policy_loss})

    self.value_loss = tf.placeholder(tf.float32, name='value_loss')
    self.losses_input.update({'all/value_loss': self.value_loss})

    self.grad_norm = tf.placeholder(tf.float32, name='grad_norm')
    self.losses_input.update({'all/loss/grad_norm': self.grad_norm})

    self.entropy_input = tf.placeholder(tf.float32, shape=[None], name='entropy')

    if segnet_param_dict["segnet_mode"] >= 2:
      self.decoder_loss = tf.placeholder(tf.float32,  name='decoder_loss')
      self.losses_input.update({'all/decoder_loss': self.decoder_loss})
      self.l2_weights_loss = tf.placeholder(tf.float32, name='regul_weights_loss')
      self.losses_input.update({'all/l2_weights_loss': self.l2_weights_loss})
    if flags.use_pixel_change:
      self.pc_loss = tf.placeholder(tf.float32,  name='pc_loss')
      self.losses_input.update({'all/pc_loss': self.pc_loss})
    if flags.use_value_replay:
      self.vr_loss = tf.placeholder(tf.float32,  name='vr_loss')
      self.losses_input.update({'all/vr_loss': self.vr_loss})
    if flags.use_reward_prediction:
      self.rp_loss = tf.placeholder(tf.float32,  name='rp_loss')
      self.losses_input.update({'all/rp_loss': self.rp_loss})

    score_summary = tf.summary.scalar("all/eval/score", self.score_input)
    sr_summary = tf.summary.scalar("all/eval/success_rate", self.sr_input)
    term_summary = tf.summary.scalar("all/eval/term_global_t", self.term_global_t)
    eval_summary = tf.summary.scalar("all/eval/mIoU_all", self.mIoU_input)
    losses_summary_list = []
    for key, val in self.losses_input.items():
      losses_summary_list += [tf.summary.scalar(key, val)]


    self.summary_op_dict = {'score_input': score_summary, 'eval_input': eval_summary, 'sr_input':sr_summary,
                            'losses_input': tf.summary.merge(losses_summary_list),
                            'entropy': tf.summary.scalar('all/eval/entropy_stepTD', tf.reduce_mean(self.entropy_input)),
                            'term_global_t': term_summary}
    flags.checkpoint_dir = os.path.join(base_dir, flags.checkpoint_dir)
    #print("First dirs {}::{}".format(flags.log_dir, flags.checkpoint_dir))
    flags.checkpoint_dir = flags.checkpoint_dir
    print("Checkpoint dir: {}, Log dir: {}".format(flags.checkpoint_dir, flags.log_dir))
    overall_FW = tf.summary.FileWriter(os.path.join(flags.log_dir, 'overall'),
                          self.sess.graph)
    self.summary_writer = [(tf.summary.FileWriter(os.path.join(flags.log_dir, 'worker_{}'.format(i)),
                                                self.sess.graph), overall_FW) for i in range(flags.parallel_size)]
    
    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_global_vars(), max_to_keep=20)


    
    #checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir, latest_filename ="best-checkpoint")
    #if checkpoint is None or checkpoint.model_checkpoint_path is None:
    #  checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)

    if checkpoint and checkpoint.model_checkpoint_path:
      if flags.segnet == -1:
          from tensorflow.python import pywrap_tensorflow
          reader = pywrap_tensorflow.NewCheckpointReader(checkpoint.model_checkpoint_path)
          big_var_to_shape_map = reader.get_variable_to_shape_map()
          s = []
          for key in big_var_to_shape_map:
              s += [key]
              # print("tensor_name: ", key)
          glob_var_names = [v.name for v in tf.global_variables()]
          endings = [r.split('/')[-1][:-2] for r in glob_var_names]
          old_ckpt_to_new_ckpt = {[k for k in s if endings[i] in k][0]: v for i, v in enumerate(tf.global_variables())}
          saver1 = tf.train.Saver(var_list=old_ckpt_to_new_ckpt)
          saver1.restore(self.sess, checkpoint.model_checkpoint_path)
      else:
          self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
      print("checkpoint loaded:", checkpoint.model_checkpoint_path)
      tokens = checkpoint.model_checkpoint_path.split("-")
      # set global step
      if 'best' in checkpoint.model_checkpoint_path:
          files = os.listdir(flags.checkpoint_dir)
          max_g_step = 0
          max_best_score = -10
          for file in files:
            if '.meta' not in file or 'checkpoint' not in file:
              continue
            if len(tokens) == 2:
              continue
            if len(tokens) > 3:
              best_score = float('-0.'+file.split('-')[2]) if 'best' in file else float('-0.'+file.split('-')[1])
              if best_score > max_best_score:
                g_step = int(file.split('-')[3].split('.')[0]) if 'best' in file else int(file.split('-')[2].split('.')[0])
                if max_g_step < g_step:
                  max_g_step = g_step
            else:
              self.best_score = -1.0
              g_step = int(file.split('-')[2]) if 'best' in file else int(file.split('-')[1])
              if max_g_step < g_step:
                max_g_step = g_step
          self.best_score = max_best_score
          self.global_t = max_g_step
          print("Chosen g_step >>", g_step)
      else:
        if len(tokens) == 3:
          self.global_t = int(tokens[2])
        else:
          self.global_t = int(tokens[1])
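      # Worked example with hypothetical checkpoint names: "/ckpt/checkpoint-885000" splits on
      # "-" into ['/ckpt/checkpoint', '885000'] (len 2), so global_t = 885000; a name with the
      # score embedded, e.g. "checkpoint-035421-885000", gives len(tokens) == 3 and tokens[2].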
      #for i in range(flags.parallel_size):
      #  self.trainers[i].local_t = self.global_t
      print(">>> global step set: ", self.global_t)
      # set wall time
      wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
      with open(wall_t_fname, 'r') as f:
        self.wall_t = float(f.read())
        self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step
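        # e.g. global_t = 123456 and save_interval_step = 100000:
        #   (123456 + 100000) // 100000 * 100000 = 200000, the next multiple of the interval.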
      print_tensors_in_checkpoint_file(file_name=checkpoint.model_checkpoint_path,
                                     tensor_name='', all_tensors=False, all_tensor_names=True)
    else:
      print("Could not find old checkpoint")
      # set wall time
      self.wall_t = 0.0
      self.next_save_steps = flags.save_interval_step
    print("Global step {}, max best score {}".format(self.global_t, self.best_score))

    if flags.segnet_pretrain:
      checkpoint_dir = "../erfnet_segmentation/models"
      checkpoint_dir = os.path.join(checkpoint_dir, "aug_erfnetC_0_{}x{}_{}x/snapshots_best".format(
          self.image_shape[1],
          self.image_shape[0],
          self.map_file.split('_')[1].split('x')[0]))
      checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)

      big_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_encoder')
      big_weights += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_decoder')

      erfnet_weights = [l.name.split(':')[0].rsplit('net_-1/base_encoder/')[-1] for l in big_weights
             if len(l.name.split(':')[0].rsplit('net_-1/base_encoder/')) == 2]
      erfnet_weights += [l.name.split(':')[0].rsplit('net_-1/base_decoder/')[-1] for l in big_weights
              if len(l.name.split(':')[0].rsplit('net_-1/base_decoder/')) == 2]

      if checkpoint and checkpoint.model_checkpoint_path:
          saver2 = tf.train.Saver(var_list=dict(zip(erfnet_weights, big_weights)))
          saver2.restore(self.sess, checkpoint.model_checkpoint_path)
          print("ErfNet pretrained weights restored from file ", checkpoint_dir)
      else:
          print("Can't load pretrained weights for ErfNet from file ", checkpoint_dir)

    # run training threads
    self.train_threads = []
    for i in range(flags.parallel_size):
      self.train_threads.append(threading.Thread(target=self.train_function, args=(i,True)))
      
    signal.signal(signal.SIGINT, self.signal_handler)
  
    # set start time
    self.start_time = time.time() - self.wall_t

    print("Ready to start")
    for t in self.train_threads:
      t.start()
  
    print('Press Ctrl+C to stop')#, flush=True)
    signal.pause()

  def save(self, name=""):
    """ Save checkpoint. 
    Called from thread-0.
    """
    self.stop_requested = True
  
    # Wait for all other threads to stop
    print("Waiting for childs!")#, flush=True)
    for (i, t) in enumerate(self.train_threads):
      if i != 0:
        t.join()
  
    # Save
    try:
        if not os.path.exists(flags.checkpoint_dir):
          os.mkdir(flags.checkpoint_dir)

        # Write wall time
        wall_t = time.time() - self.start_time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
        #if not os.path.exists(wall_t_fname):
        #  os.mkdir(wall_t_fname)

        with open(wall_t_fname, 'w') as f:
          f.write(str(wall_t))

        ckpt_name = 'checkpoint'
        if name != "":
            ckpt_name = name
        ckpt_name += '-' + str(abs(self.best_score))[2:8] # Here may be a bug
        print('Start saving.')
        self.saver.save(self.sess,
                        flags.checkpoint_dir + '/' + ckpt_name,
                        global_step = self.global_t)
        print('End saving.')

        self.stop_requested = False
        self.next_save_steps += flags.save_interval_step
    except Exception as e:
        self.terminate_requested = True
        ## Let it be here for debug save() function!!!
        print(traceback.format_exc())#, flush=True)
        raise Exception("Error in 'save' occured!")
    finally:
        # Restart other threads
        print("Restarting other threads!")
        for i in range(flags.parallel_size):
          if i != 0:
            thread = threading.Thread(target=self.train_function, args=(i,False))
            self.train_threads[i] = thread
            thread.start()
    
  def signal_handler(self, signal, frame):
    print('You pressed Ctrl+C!')#, flush=True)
    self.terminate_requested = True

  def print_flags_info(self):
    return_string = "\n\n\n"
    return_string += "Envs FILE:{}\n".format(flags.env_name)
    return_string += "Checkpoint dir: {}, Termination time in sec: " \
                     "{}, Max steps to train: {:2.3E}, Parallel threads:{}\n".format(flags.checkpoint_dir,
                                                                                   flags.termination_time_sec,
                                                                                   flags.max_time_step,
                                                                                   flags.parallel_size)
    return_string += "Use ErfNet Encoder-Decoder, N classes: {}\n".format(flags.n_classes) if flags.segnet >= 2 else ""
    return_string += "Use ErfNet Encoder only\n" if flags.segnet == 1 else ""
    return_string += "Use vanilla encoder\n" if flags.segnet == 0 else ""
    return_string += "Use PC:{}, Use VR:{}, use RP:{}\n".format(flags.use_pixel_change,
                                                              flags.use_value_replay,
                                                              flags.use_reward_prediction)
    return_string += "Experience hist size: {}, Local_t max: {}, n-step-TD: {}\n".format(flags.experience_history_size,
                                                                                   flags.local_t_max,
                                                                                   flags.n_step_TD)
    return_string += "Entropy beta: {}, Gradient norm clipping: {}, Rmsprop alpha: {}, Saving step: {}\n".format(
                                                                                     flags.entropy_beta,
                                                                                     flags.grad_norm_clip,
                                                                                     flags.rmsp_alpha,
                                                                                     flags.save_interval_step)
    print(return_string)
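
# Hedged standalone sketch of the placeholder-backed summary pattern used throughout run()
# above: scalar summaries are built once over placeholders, then evaluated with a feed_dict
# and written through a FileWriter (paths and names here are illustrative only).
import tensorflow as tf

demo_loss_ph = tf.placeholder(tf.float32, name="demo_loss")
demo_summary_op = tf.summary.scalar("demo/total_loss", demo_loss_ph)

with tf.Session() as demo_sess:
  demo_writer = tf.summary.FileWriter("/tmp/demo_logs", demo_sess.graph)
  serialized = demo_sess.run(demo_summary_op, feed_dict={demo_loss_ph: 0.5})
  demo_writer.add_summary(serialized, global_step=0)
  demo_writer.flush()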
Esempio n. 22
0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import matplotlib.pyplot as plt

from environment.environment import Environment
from model.model import UnrealModel
from constants import *

# use CPU for weight visualize tool
device = "/cpu:0"

action_size = Environment.get_action_size()
global_network = UnrealModel(action_size, -1, device)

sess = tf.Session()

init = tf.global_variables_initializer()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
else:
    print("Could not find old checkpoint")

vars = {}
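
# Hedged sketch (names and shapes are assumptions, not this repo's API): once the checkpoint is
# restored, a weight-visualisation tool typically pulls a conv kernel out of the graph and tiles
# its filters with matplotlib. Illustrated here with random data standing in for a restored
# [h, w, in_ch, out_ch] kernel:
import numpy as np

def _show_filters(weights, cols=8):
    n = weights.shape[-1]
    rows = (n + cols - 1) // cols
    for i in range(n):
        plt.subplot(rows, cols, i + 1)
        plt.imshow(weights[:, :, 0, i], cmap="gray", interpolation="nearest")
        plt.axis("off")
    plt.show()

_show_filters(np.random.randn(8, 8, 3, 16))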
Esempio n. 23
0
class Application(object):
    def __init__(self):
        pass

    def base_train_function(self):
        """ Train routine for base_trainer. """

        trainer = self.base_trainer

        # set start_time
        trainer.set_start_time(self.start_time, self.global_t)

        while True:
            if self.stop_requested:
                break
            if self.terminate_requested:
                break
            if self.global_t > flags.max_time_step:
                break
            if self.global_t > self.next_save_steps:
                # Save checkpoint
                logger.debug("Steps:{}".format(self.global_t))
                logger.debug(self.next_save_steps)

                self.save()

            diff_global_t = trainer.process(self.sess, self.global_t,
                                            self.summary_writer,
                                            self.summary_op,
                                            self.summary_values,
                                            flags.base_lambda)
            self.global_t += diff_global_t
        logger.warn("exiting training!")
        self.environment.stop()
        #sys.exit(0)
        time.sleep(1)
        os._exit(0)

    def aux_train_function(self, aux_index):
        """ Train routine for aux_trainer. """

        trainer = self.aux_trainers[aux_index]

        while True:
            # Busy-wait until the base trainer has made some progress and training is not
            # paused for checkpointing; note this loop spins without sleeping.
            if self.global_t < 500:
                continue
            if self.stop_requested:
                continue
            if self.terminate_requested:
                break
            if self.global_t > flags.max_time_step:
                break

            diff_aux_t = trainer.process(self.sess, self.global_t, self.aux_t,
                                         self.summary_writer,
                                         self.summary_op_aux, self.summary_aux)
            self.aux_t += diff_aux_t
            #logger.debug("aux_t:{}".format(self.aux_t))

    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"
        logger.debug("start App")
        initial_learning_rate = flags.initial_learning_rate

        self.global_t = 0
        self.aux_t = 0
        self.stop_requested = False
        self.terminate_requested = False
        logger.debug("getting action size...")
        visinput = [flags.vision, flags.vis_h, flags.vis_w]
        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)
        # Setup Global Network
        logger.debug("loading global model...")
        self.global_network = UnrealModel(
            action_size, visinput, -1, flags.entropy_beta, device,
            flags.use_pixel_change, flags.use_value_replay,
            flags.use_reward_prediction, flags.use_temporal_coherence,
            flags.pixel_change_lambda, flags.temporal_coherence_lambda)
        logger.debug("done loading global model")
        learning_rate_input = tf.placeholder("float")

        # Setup gradient calculator
        #"""
        grad_applier = RMSPropApplier(
            learning_rate=learning_rate_input,
            #decay = flags.rmsp_alpha,
            momentum=0.0,
            #epsilon = flags.rmsp_epsilon,
            clip_norm=flags.grad_norm_clip,
            device=device)
        """
        grad_applier = AdamApplier(learning_rate = learning_rate_input,
                                   clip_norm=flags.grad_norm_clip,
                                   device=device)
        """
        # Start environment
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name, visinput)
        logger.debug("done loading environment")

        # Setup runner
        self.runner = RunnerThread(flags, self.environment,
                                   self.global_network, action_size, visinput,
                                   device, visualise)
        logger.debug("done setting up RunnerTread")

        # Setup experience
        self.experience = Experience(flags.experience_history_size)

        #@TODO check device usage: should we build a cluster?
        # Setup Base Network
        self.base_trainer = BaseTrainer(
            self.runner, self.global_network, initial_learning_rate,
            learning_rate_input, grad_applier, visinput, flags.env_type,
            flags.env_name, flags.entropy_beta, flags.gamma, self.experience,
            flags.max_time_step, device)

        # Setup Aux Networks
        self.aux_trainers = []
        for k in range(flags.parallel_size):
            self.aux_trainers.append(
                AuxTrainer(
                    self.global_network,
                    k + 2,  #-1 is global, 0 is runnerthread, 1 is base
                    flags.use_pixel_change,
                    flags.use_value_replay,
                    flags.use_reward_prediction,
                    flags.use_temporal_coherence,
                    flags.pixel_change_lambda,
                    flags.temporal_coherence_lambda,
                    flags.aux_initial_learning_rate,
                    learning_rate_input,
                    grad_applier,
                    visinput,
                    self.aux_t,
                    flags.env_type,
                    flags.env_name,
                    flags.entropy_beta,
                    flags.local_t_max,
                    flags.gamma,
                    flags.aux_lambda,
                    flags.gamma_pc,
                    self.experience,
                    flags.max_time_step,
                    device))

        # Start tensorflow session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        self.init_tensorboard()

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            checkpointpath = checkpoint.model_checkpoint_path.replace(
                "/", "\\")
            logger.info("checkpoint loaded: {}".format(checkpointpath))
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            logger.info(">>> global step set: {}".format(self.global_t))
            logger.info(">>> aux step: {}".format(self.aux_t))
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step
                logger.debug("next save steps:{}".format(self.next_save_steps))
        else:
            logger.info("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t
        # Start runner
        self.runner.start_runner(self.sess)
        # Start base_network thread
        self.base_train_thread = threading.Thread(
            target=self.base_train_function, args=())
        self.base_train_thread.start()

        # Start aux_network threads
        self.aux_train_threads = []
        for k in range(flags.parallel_size):
            self.aux_train_threads.append(
                threading.Thread(target=self.aux_train_function, args=(k, )))
            self.aux_train_threads[k].start()

        logger.debug(threading.enumerate())

        logger.info('Press Ctrl+C to stop')
        #signal.pause()

    def init_tensorboard(self):
        # tensorboard summary for base
        self.score_input = tf.placeholder(tf.int32)
        self.epl_input = tf.placeholder(tf.int32)
        self.policy_loss = tf.placeholder(tf.float32)
        self.value_loss = tf.placeholder(tf.float32)
        self.base_entropy = tf.placeholder(tf.float32)
        self.base_gradient = tf.placeholder(tf.float32)
        self.base_lr = tf.placeholder(tf.float32)
        self.laststate = tf.placeholder(
            tf.float32, [1, flags.vis_w, flags.vis_h,
                         len(flags.vision)],
            name="laststate")
        score = tf.summary.scalar("env/score", self.score_input)
        epl = tf.summary.scalar("env/ep_length", self.epl_input)
        policy_loss = tf.summary.scalar("base/policy_loss", self.policy_loss)
        value_loss = tf.summary.scalar("base/value_loss", self.value_loss)
        entropy = tf.summary.scalar("base/entropy", self.base_entropy)
        gradient = tf.summary.scalar("base/gradient", self.base_gradient)
        lr = tf.summary.scalar("base/learning_rate", self.base_lr)
        laststate = tf.summary.image("base/laststate", self.laststate)

        self.summary_values = [
            self.score_input, self.epl_input, self.policy_loss,
            self.value_loss, self.base_entropy, self.base_gradient,
            self.base_lr, self.laststate
        ]
        # merge_all() so that any model histogram summaries are merged as well.
        self.summary_op = tf.summary.merge_all()

        # tensorboard summary for aux
        self.summary_aux = []
        aux_losses = []
        self.aux_basep_loss = tf.placeholder(tf.float32)
        self.aux_basev_loss = tf.placeholder(tf.float32)
        self.aux_entropy = tf.placeholder(tf.float32)
        self.aux_gradient = tf.placeholder(tf.float32)
        self.summary_aux.append(self.aux_basep_loss)
        self.summary_aux.append(self.aux_basev_loss)

        aux_losses.append(
            tf.summary.scalar("aux/basep_loss", self.aux_basep_loss))
        aux_losses.append(
            tf.summary.scalar("aux/basev_loss", self.aux_basev_loss))

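        # Loss scalars for the optional auxiliary tasks are only created when
        # the corresponding task is enabled by its flag.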
        if flags.use_pixel_change:
            self.pc_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.pc_loss)
            aux_losses.append(tf.summary.scalar("aux/pc_loss", self.pc_loss))
        if flags.use_value_replay:
            self.vr_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.vr_loss)
            aux_losses.append(tf.summary.scalar("aux/vr_loss", self.vr_loss))
        if flags.use_reward_prediction:
            self.rp_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.rp_loss)
            aux_losses.append(tf.summary.scalar("aux/rp_loss", self.rp_loss))
        if flags.use_temporal_coherence:
            self.tc_loss = tf.placeholder(tf.float32)
            self.summary_aux.append(self.tc_loss)
            aux_losses.append(tf.summary.scalar("aux/tc_loss", self.tc_loss))

        # append entropy and gradient last
        self.summary_aux.append(self.aux_entropy)
        self.summary_aux.append(self.aux_gradient)
        aux_losses.append(tf.summary.scalar("aux/entropy", self.aux_entropy))
        aux_losses.append(tf.summary.scalar("aux/gradient", self.aux_gradient))

        self.summary_op_aux = tf.summary.merge(aux_losses)

        #self.summary_op = tf.summary.merge_all()
        tensorboard_path = flags.temp_dir + TRAINING_NAME + "/"
        logger.info("tensorboard path:" + tensorboard_path)
        if not os.path.exists(tensorboard_path):
            os.makedirs(tensorboard_path)
        self.summary_writer = tf.summary.FileWriter(tensorboard_path)
        self.summary_writer.add_graph(self.sess.graph)

    def save(self):
        """ Save checkpoint. 
        Called from base_trainer.
        """
        self.stop_requested = True

        # Save
        if not os.path.exists(flags.checkpoint_dir):
            os.mkdir(flags.checkpoint_dir)

        # Write wall time
        wall_t = time.time() - self.start_time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(wall_t_fname, 'w') as f:
            f.write(str(wall_t))
        logger.info('Start saving.')
        self.saver.save(self.sess,
                        flags.checkpoint_dir + '/' + 'checkpoint',
                        global_step=self.global_t)
        logger.info('End saving.')

        self.stop_requested = False
        self.next_save_steps += flags.save_interval_step

    def signal_handler(self, signal, frame):
        logger.warning('Ctrl+C detected, shutting down...')
        logger.info('run name: {} -- terminated'.format(TRAINING_NAME))
        self.terminate_requested = True
Example No. 24
0
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"
        logger.debug("start App")
        initial_learning_rate = flags.initial_learning_rate

        self.global_t = 0
        self.aux_t = 0
        self.stop_requested = False
        self.terminate_requested = False
        logger.debug("getting action size...")
        visinput = [flags.vision, flags.vis_h, flags.vis_w]
        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)
        # Setup Global Network
        logger.debug("loading global model...")
        self.global_network = UnrealModel(
            action_size, visinput, -1, flags.entropy_beta, device,
            flags.use_pixel_change, flags.use_value_replay,
            flags.use_reward_prediction, flags.use_temporal_coherence,
            flags.pixel_change_lambda, flags.temporal_coherence_lambda)
        logger.debug("done loading global model")
        learning_rate_input = tf.placeholder("float")

        # Setup gradient calculator
        grad_applier = RMSPropApplier(
            learning_rate=learning_rate_input,
            # decay=flags.rmsp_alpha,
            momentum=0.0,
            # epsilon=flags.rmsp_epsilon,
            clip_norm=flags.grad_norm_clip,
            device=device)
        # Alternative optimizer, kept for reference:
        # grad_applier = AdamApplier(learning_rate=learning_rate_input,
        #                            clip_norm=flags.grad_norm_clip,
        #                            device=device)
        # Start environment
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name, visinput)
        logger.debug("done loading environment")

        # Setup runner
        self.runner = RunnerThread(flags, self.environment,
                                   self.global_network, action_size, visinput,
                                   device, visualise)
        logger.debug("done setting up RunnerThread")

        # Setup experience
        self.experience = Experience(flags.experience_history_size)

        #@TODO check device usage: should we build a cluster?
        # Setup Base Network
        self.base_trainer = BaseTrainer(
            self.runner, self.global_network, initial_learning_rate,
            learning_rate_input, grad_applier, visinput, flags.env_type,
            flags.env_name, flags.entropy_beta, flags.gamma, self.experience,
            flags.max_time_step, device)

        # Setup Aux Networks
        self.aux_trainers = []
        for k in range(flags.parallel_size):
            self.aux_trainers.append(
                AuxTrainer(
                    self.global_network,
                    k + 2,  #-1 is global, 0 is runnerthread, 1 is base
                    flags.use_pixel_change,
                    flags.use_value_replay,
                    flags.use_reward_prediction,
                    flags.use_temporal_coherence,
                    flags.pixel_change_lambda,
                    flags.temporal_coherence_lambda,
                    flags.aux_initial_learning_rate,
                    learning_rate_input,
                    grad_applier,
                    visinput,
                    self.aux_t,
                    flags.env_type,
                    flags.env_name,
                    flags.entropy_beta,
                    flags.local_t_max,
                    flags.gamma,
                    flags.aux_lambda,
                    flags.gamma_pc,
                    self.experience,
                    flags.max_time_step,
                    device))

        # Start tensorflow session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        self.init_tensorboard()

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            checkpointpath = checkpoint.model_checkpoint_path.replace(
                "/", "\\")
            logger.info("checkpoint loaded: {}".format(checkpointpath))
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            logger.info(">>> global step set: {}".format(self.global_t))
            logger.info(">>> aux step: {}".format(self.aux_t))
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step
                logger.debug("next save steps:{}".format(self.next_save_steps))
        else:
            logger.info("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t
        # Start runner
        self.runner.start_runner(self.sess)
        # Start base_network thread
        self.base_train_thread = threading.Thread(
            target=self.base_train_function, args=())
        self.base_train_thread.start()

        # Start aux_network threads
        self.aux_train_threads = []
        for k in range(flags.parallel_size):
            self.aux_train_threads.append(
                threading.Thread(target=self.aux_train_function, args=(k, )))
            self.aux_train_threads[k].start()

        logger.debug(threading.enumerate())

        logger.info('Press Ctrl+C to stop')
Example No. 25
0
class Display(object):
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        pygame.display.set_caption('UNREAL')

        self.action_size = Environment.get_action_size(flags.env_type,
                                                       flags.env_name)
        self.objective_size = Environment.get_objective_size(
            flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size,
                                          self.objective_size,
                                          -1,
                                          flags.use_lstm,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          0.0,
                                          0.0,
                                          "/cpu:0",
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type,
            flags.env_name,
            env_args={
                'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene
            })
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0

    def update(self, sess):
        self.surface.fill(BLACK)
        self.process(sess)
        pygame.display.update()

    def choose_action(self, pi_values):
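        # Sample an action index from the policy's probability distribution.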
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def scale_image(self, image, scale):
        return image.repeat(scale, axis=0).repeat(scale, axis=1)

    def draw_text(self, str, left, top, color=WHITE):
        text = self.font.render(str, True, color, BLACK)
        text_rect = text.get_rect()
        text_rect.left = left
        text_rect.top = top
        self.surface.blit(text, text_rect)

    def draw_center_text(self, str, center_x, top):
        text = self.font.render(str, True, WHITE, BLACK)
        text_rect = text.get_rect()
        text_rect.centerx = center_x
        text_rect.top = top
        self.surface.blit(text, text_rect)

    def show_pixel_change(self, pixel_change, left, top, rate, label):
        """
        Show pixel change
        """
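        # Scale the 20x20 change map into a visible 80x80 greyscale RGB image.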
        pixel_change_ = np.clip(pixel_change * 255.0 * rate, 0.0, 255.0)
        data = pixel_change_.astype(np.uint8)
        data = np.stack([data for _ in range(3)], axis=2)
        data = self.scale_image(data, 4)
        image = pygame.image.frombuffer(data, (20 * 4, 20 * 4), 'RGB')
        self.surface.blit(image, (left + 8 + 4, top + 8 + 4))
        self.draw_center_text(label, left + 100 / 2, top + 100)

    def show_policy(self, pi):
        """
        Show action probability.
        """
        start_x = 10

        y = 150

        for i in range(len(pi)):
            width = pi[i] * 100
            pygame.draw.rect(self.surface, WHITE, (start_x, y, width, 10))
            y += 20
        self.draw_center_text("PI", 50, y)

    def show_image(self, state):
        """
        Show input image
        """
        state_ = state * 255.0
        data = state_.astype(np.uint8)
        image = pygame.image.frombuffer(data, (84, 84), 'RGB')
        self.surface.blit(image, (8, 8))
        self.draw_center_text("input", 50, 100)

    def show_value(self):
        if self.value_history.is_empty:
            return

        min_v = float("inf")
        max_v = float("-inf")

        values = self.value_history.values

        for v in values:
            min_v = min(min_v, v)
            max_v = max(max_v, v)

        top = 150
        left = 150
        width = 100
        height = 100
        bottom = top + height
        right = left + width

        d = max_v - min_v
        if d == 0.0:
            d = 1.0  # guard against division by zero when all values are equal
        last_r = 0.0
        for i, v in enumerate(values):
            r = (v - min_v) / d
            if i > 0:
                x0 = i - 1 + left
                x1 = i + left
                y0 = bottom - last_r * height
                y1 = bottom - r * height
                pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1)
            last_r = r

        pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1)
        pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1)
        pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1)
        pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom),
                         1)

        self.draw_center_text("V", left + width / 2, bottom + 10)

    def show_reward_prediction(self, rp_c, reward):
        start_x = 310
        reward_index = 0
        if reward == 0:
            reward_index = 0
        elif reward > 0:
            reward_index = 1
        elif reward < 0:
            reward_index = 2

        y = 150

        labels = ["0", "+", "-"]

        for i in range(len(rp_c)):
            width = rp_c[i] * 100

            if i == reward_index:
                color = RED
            else:
                color = WHITE
            pygame.draw.rect(self.surface, color, (start_x + 15, y, width, 10))
            self.draw_text(labels[i], start_x, y - 1, color)
            y += 20

        self.draw_center_text("RP", start_x + 100 / 2, y)

    def show_reward(self):
        self.draw_text("REWARD: {}".format(int(self.episode_reward)), 310, 10)

    def process(self, sess):
        last_action = self.environment.last_action
        last_reward = np.clip(self.environment.last_reward, -1, 1)
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward,
            self.environment.last_state)

        if not flags.use_pixel_change:
            pi_values, v_value = self.global_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.environment.last_state, last_action_reward)
        self.value_history.add_value(v_value)

        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = self.environment.process(
            action)
        self.episode_reward += reward

        if terminal:
            self.environment.reset()
            self.episode_reward = 0

        self.show_image(state['image'])
        self.show_policy(pi_values)
        self.show_value()
        self.show_reward()

        if flags.use_pixel_change:
            self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
            self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

        if flags.use_reward_prediction:
            if self.state_history.is_full:
                rp_c = self.global_network.run_rp_c(sess,
                                                    self.state_history.states)
                self.show_reward_prediction(rp_c, reward)

        self.state_history.add_state(state)

    def get_frame(self):
        data = self.surface.get_buffer().raw
        return data
Example No. 26
0
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 use_future_reward_prediction, use_autoencoder, reward_length,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device, log_file, skip_step):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.use_future_reward_prediction = use_future_reward_prediction
        self.use_autoencoder = use_autoencoder
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.skip_step = skip_step
        self.action_size = Environment.get_action_size(env_type, env_name)

        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         use_future_reward_prediction,
                                         use_autoencoder, pixel_change_lambda,
                                         entropy_beta, device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size,
                                     reward_length)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0
        self.log_file = log_file
        self.prediction_res_file = log_file + '/' + 'res.pkl'

    def prepare(self):
        self.environment = Environment.create_environment(
            self.env_type, self.env_name, self.skip_step)

    def stop(self):
        self.environment.stop()

    def add_summary(self, step, name, value, writer):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = float(value)
        summary_value.tag = name
        writer.add_summary(summary, step)

    def _anneal_learning_rate(self, global_time_step):
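        # Linearly decay the learning rate to zero over max_global_time_step;
        # e.g. with an initial rate of 1e-3 and training 75% complete the
        # annealed rate is 2.5e-4. It is clamped at 0.0 past the step limit.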
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
        Fill experience buffer until buffer is full.
        """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

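        # Remember the LSTM state at the start of the rollout so the training
        # pass can replay the same sequence from the same recurrent state.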
        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

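        # Walk the trajectory backwards: accumulate the discounted n-step
        # return R = r_i + gamma * R from the bootstrap value, and use
        # R - V(s_i) as the advantage for the policy-gradient update.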
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        # [pixel change]
        # Sample 20+1 frame (+1 for last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

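        # Accumulate the discounted pixel-change return backwards with
        # gamma_pc; the 20x20 return map is seeded with max_a Q_pc of the
        # extra sampled frame, or zeros if the sequence ended in a terminal.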
        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        # [Value replay]
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

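        # Value replay reuses stored transitions: the discounted return is
        # rebuilt backwards with gamma, bootstrapped from the value head on
        # the extra sampled frame unless the sequence ended in a terminal.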
        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    '''
  def _process_rp(self):
    # [Reward prediction]
    rp_experience_frames, total_raw_reward, _ = self.experience.sample_rp_sequence()
    # 4 frames

    batch_rp_si = []
    batch_rp_c = []
    
    for i in range(4):
      batch_rp_si.append(rp_experience_frames[i].state)

    # one hot vector for target reward
    r = total_raw_reward
    rp_c = [0.0, 0.0, 0.0]
    if r == 0:
      rp_c[0] = 1.0 # zero
    elif r > 0:
      rp_c[1] = 1.0 # positive
    else:
      rp_c[2] = 1.0 # negative
    batch_rp_c.append(rp_c)
    return batch_rp_si, batch_rp_c
  '''

    def _process_replay(self, action=False):
        # [Reward prediction]
        rp_experience_frames, total_raw_reward, next_frame = self.experience.sample_rp_sequence(
            flag=True)
        # 4 frames

        batch_rp_si = []
        batch_rp_c = []

        for i in range(4):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward
        r = total_raw_reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)

        result = [batch_rp_si, batch_rp_c, next_frame]

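        # For future reward prediction the caller also needs the action taken
        # at the last input frame, returned as an extra one-hot vector.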
        if action:
            batch_rp_action = []
            action_index = rp_experience_frames[3].action
            action_one_hot = np.zeros([self.action_size])
            action_one_hot[action_index] = 1.0
            batch_rp_action.append(action_one_hot)
            result.append(batch_rp_action)
        return result

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
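        # One training iteration: fill the replay buffer first, then sync
        # weights from the global network, roll out up to local_t_max steps
        # for the A3C base loss, attach the enabled auxiliary-task batches to
        # the same feed_dict, and apply a single combined gradient update.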
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
              self._process_base(sess,
                                 global_t,
                                 summary_writer,
                                 summary_op,
                                 score_input)
        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        next_frame = None
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c, next_frame = self._process_replay()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # [Future reward prediction]
        if self.use_future_reward_prediction:
            batch_frp_si, batch_frp_c, next_frame, batch_frp_action = self._process_replay(
                action=True)
            frp_feed_dict = {
                self.local_network.frp_input: batch_frp_si,
                self.local_network.frp_c_target: batch_frp_c,
                self.local_network.frp_action_input: batch_frp_action
            }
            feed_dict.update(frp_feed_dict)

        if next_frame and self.use_autoencoder:
            ae_feed_dict = {
                self.local_network.ground_truth:
                np.expand_dims(next_frame.state, axis=0)
            }
            feed_dict.update(ae_feed_dict)

        # Calculate gradients and copy them to global network.
        #sess.run( self.apply_gradients, feed_dict=feed_dict)
        ln = self.local_network
        if self.use_future_reward_prediction:
            if self.use_autoencoder:
                frp_c, decoder_loss, frp_loss, value_loss, policy_loss, _ = sess.run(
                    [
                        ln.frp_c, ln.decoder_loss, ln.frp_loss, ln.value_loss,
                        ln.policy_loss, self.apply_gradients
                    ],
                    feed_dict=feed_dict)
                self.add_summary(global_t, 'decoder_loss', decoder_loss,
                                 summary_writer)
                self.add_summary(global_t, 'frp_loss', frp_loss,
                                 summary_writer)
            else:
                frp_c, value_loss, policy_loss, _ = sess.run(
                    [
                        ln.frp_c, ln.value_loss, ln.policy_loss,
                        self.apply_gradients
                    ],
                    feed_dict=feed_dict)
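            # "Accuracy" on the single sampled sequence: 1.0 when the
            # highest-probability predicted class matches the one-hot target.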
            acc = ((frp_c == frp_c.max()) * batch_frp_c).sum()
            self.add_summary(global_t, 'reward prediction accuracy', acc,
                             summary_writer)
        else:
            value_loss, policy_loss, _ = sess.run(
                [ln.value_loss, ln.policy_loss, self.apply_gradients],
                feed_dict=feed_dict)

        self.add_summary(global_t, 'value_loss', value_loss, summary_writer)
        self.add_summary(global_t, 'policy_loss', policy_loss, summary_writer)
        self.add_summary(global_t, 'base_loss', policy_loss + value_loss,
                         summary_writer)

        if self.use_autoencoder and global_t % 25000 == 0:
            current_res = {
                'next_frame_ground_truth': next_frame,
                'step': global_t
            }
            if self.use_reward_prediction:
                predicted_frame, predicted_reward = sess.run(
                    [
                        self.local_network.encoder_output,
                        self.local_network.rp_c
                    ],
                    feed_dict=feed_dict)
                current_res['states'] = batch_rp_si
                current_res['target_reward'] = batch_rp_c
            elif self.use_future_reward_prediction:
                predicted_frame, predicted_reward = sess.run(
                    [
                        self.local_network.encoder_output,
                        self.local_network.frp_c
                    ],
                    feed_dict=feed_dict)
                current_res['states'] = batch_frp_si
                current_res['target_reward'] = batch_frp_c
                current_res['action'] = batch_frp_action
            current_res['next_frame_prediction'] = predicted_frame
            current_res['next_reward_prediction'] = predicted_reward
            if os.path.exists(self.prediction_res_file) and os.path.getsize(
                    self.prediction_res_file) > 0:
                with open(self.prediction_res_file, 'rb') as f:
                    res = pickle.load(f)
            else:
                res = []
            res.append(current_res)
            with open(self.prediction_res_file, 'wb') as f:
                pickle.dump(res, f)

        self._print_log(global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example No. 27
0
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)

        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0

    def prepare(self):
        self.environment = Environment.create_environment(
            self.env_type, self.env_name)

    def stop(self):
        self.environment.stop()

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
        Fill experience buffer until buffer is full.
        """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            #Modify Last State - with attention
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)  #Modify New State - with attention
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        # [pixel change]
        # Sample 20+1 frame (+1 for last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        # [Value replay]
        # Sample 20+1 frame (+1 for last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse sequence to calculate from the last
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # t_max times loop
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    def _process_rp(self):
        # [Reward prediction]
        rp_experience_frames = self.experience.sample_rp_sequence()
        # 4 consecutive frames: the first 3 are the RP input states and the
        # reward of the 4th is the classification target

        batch_rp_si = []
        batch_rp_c = []

        for i in range(3):
            batch_rp_si.append(rp_experience_frames[i].state)

        # one hot vector for target reward
        r = rp_experience_frames[3].reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        # Fill experience replay buffer
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
              self._process_base(sess,
                                 global_t,
                                 summary_writer,
                                 summary_op,
                                 score_input)
        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c = self._process_rp()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # Calculate gradients and copy them to global network.
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        self._print_log(global_t)

        # Return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example No. 28
0
class Application(object):
    def __init__(self):
        pass

    def train_function(self, parallel_index, preparing):
        """ Train each environment. """

        trainer = self.trainers[parallel_index]
        if preparing:
            trainer.prepare()

        # set start_time
        trainer.set_start_time(self.start_time)

        while True:
            if self.stop_requested:
                break
            if self.terminate_requested:
                trainer.stop()
                break
            if self.global_t > flags.max_time_step:
                trainer.stop()
                break
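            # Only thread 0 triggers checkpointing; save() pauses the other
            # training threads, writes the checkpoint, then restarts them.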
            if parallel_index == 0 and self.global_t > self.next_save_steps:
                # Save checkpoint
                self.save()

            # Each trainer runs its own process(); the auxiliary sub-tasks
            # are handled inside that call.
            diff_global_t = trainer.process(self.sess, self.global_t,
                                            self.summary_writer,
                                            self.summary_op, self.score_input)
            self.global_t += diff_global_t

    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"

        initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                            flags.initial_alpha_high,
                                            flags.initial_alpha_log_rate)

        self.global_t = 0

        self.stop_requested = False
        self.terminate_requested = False

        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)

        self.global_network = UnrealModel(action_size, -1,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          flags.pixel_change_lambda,
                                          flags.entropy_beta, device)
        self.trainers = []

        learning_rate_input = tf.placeholder("float")

        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=flags.rmsp_alpha,
                                      momentum=0.0,
                                      epsilon=flags.rmsp_epsilon,
                                      clip_norm=flags.grad_norm_clip,
                                      device=device)

        # Each Trainer creates its own UnrealModel in its __init__.
        for i in range(flags.parallel_size):
            trainer = Trainer(i, self.global_network, initial_learning_rate,
                              learning_rate_input, grad_applier,
                              flags.env_type, flags.env_name,
                              flags.use_pixel_change, flags.use_value_replay,
                              flags.use_reward_prediction,
                              flags.pixel_change_lambda, flags.entropy_beta,
                              flags.local_t_max, flags.gamma, flags.gamma_pc,
                              flags.experience_history_size,
                              flags.max_time_step, device)
            self.trainers.append(trainer)

        # prepare session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        # summary for tensorboard
        self.score_input = tf.placeholder(tf.int32)
        tf.summary.scalar("score", self.score_input)

        self.summary_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                    self.sess.graph)

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step

        else:
            print("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        # run training threads; each environment runs in its own parallel thread
        self.train_threads = []
        for i in range(flags.parallel_size):
            self.train_threads.append(
                threading.Thread(target=self.train_function, args=(i, True)))

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t

        for t in self.train_threads:
            t.start()

        print('Press Ctrl+C to stop')
        signal.pause()

    def save(self):
        """Save checkpoint.

        Called from thread-0.
        """
        self.stop_requested = True

        # Wait for all other threads to stop
        for (i, t) in enumerate(self.train_threads):
            if i != 0:
                t.join()

        # Save
        if not os.path.exists(flags.checkpoint_dir):
            os.mkdir(flags.checkpoint_dir)

        # Write wall time
        wall_t = time.time() - self.start_time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(wall_t_fname, 'w') as f:
            f.write(str(wall_t))

        print('Start saving.')
        self.saver.save(self.sess,
                        flags.checkpoint_dir + '/' + 'checkpoint',
                        global_step=self.global_t)
        print('End saving.')

        self.stop_requested = False
        self.next_save_steps += flags.save_interval_step

        # Restart other threads
        for i in range(flags.parallel_size):
            if i != 0:
                thread = threading.Thread(target=self.train_function,
                                          args=(i, False))
                self.train_threads[i] = thread
                thread.start()

    def signal_handler(self, signal, frame):
        print('You pressed Ctrl+C!')
        self.terminate_reqested = True
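    # The train_function used as the thread target above is not included in this
    # snippet. The sketch below is illustrative only: it assumes the Trainer
    # exposes prepare(), set_start_time(), process() and stop(); those method
    # names and signatures are assumptions, not taken from this code.
    def train_function(self, parallel_index, preparing):
        """Worker loop run by each training thread (illustrative sketch)."""
        trainer = self.trainers[parallel_index]
        if preparing:
            trainer.prepare()  # e.g. fill the initial experience history
        trainer.set_start_time(self.start_time)

        while True:
            if self.stop_requested:      # set by save() on thread-0
                break
            if self.terminate_reqested:  # set by the SIGINT handler
                trainer.stop()
                if parallel_index == 0:
                    self.save()
                break
            if self.global_t > flags.max_time_step:
                trainer.stop()
                break
            if parallel_index == 0 and self.global_t > self.next_save_steps:
                # checkpoint from thread-0; save() pauses the other threads
                self.save()

            diff_global_t = trainer.process(self.sess, self.global_t,
                                            self.summary_writer,
                                            self.summary_op, self.score_input)
            self.global_t += diff_global_t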
Example no. 29
0
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"

        initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                            flags.initial_alpha_high,
                                            flags.initial_alpha_log_rate)
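        # initial learning rate chosen between initial_alpha_low and initial_alpha_high on a log scale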

        self.global_t = 0

        self.stop_requested = False
        self.terminate_reqested = False

        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)

        self.global_network = UnrealModel(action_size, -1,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          flags.pixel_change_lambda,
                                          flags.entropy_beta, device)
        self.trainers = []

        learning_rate_input = tf.placeholder("float")

        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=flags.rmsp_alpha,
                                      momentum=0.0,
                                      epsilon=flags.rmsp_epsilon,
                                      clip_norm=flags.grad_norm_clip,
                                      device=device)

        # each Trainer builds its own local UnrealModel in __init__
        for i in range(flags.parallel_size):
            trainer = Trainer(i, self.global_network, initial_learning_rate,
                              learning_rate_input, grad_applier,
                              flags.env_type, flags.env_name,
                              flags.use_pixel_change, flags.use_value_replay,
                              flags.use_reward_prediction,
                              flags.pixel_change_lambda, flags.entropy_beta,
                              flags.local_t_max, flags.gamma, flags.gamma_pc,
                              flags.experience_history_size,
                              flags.max_time_step, device)
            self.trainers.append(trainer)

        # prepare session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        # summary for tensorboard
        self.score_input = tf.placeholder(tf.int32)
        tf.summary.scalar("score", self.score_input)

        self.summary_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                    self.sess.graph)

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step

        else:
            print("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        # run training threads; each environment runs in its own parallel thread
        self.train_threads = []
        for i in range(flags.parallel_size):
            self.train_threads.append(
                threading.Thread(target=self.train_function, args=(i, True)))

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t

        for t in self.train_threads:
            t.start()

        print('Press Ctrl+C to stop')
        signal.pause()
Example no. 30
0
class Evaluate(object):
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config['height'], env_config['width']]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")  # the for_display param in UnrealModel determines its value

    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0, #flags.pixel_change_lambda
                                      0.0, #flags.entropy_beta
                                      device,  # device string defined outside this snippet, e.g. "/cpu:0"
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(flags.env_type, flags.env_name, flags.termination_time_sec,
                                                      env_args={'episode_schedule': flags.split,
                                                                'log_action_trace': flags.log_action_trace,
                                                                'max_states_per_scene': flags.episodes_per_scene,
                                                                'episodes_per_scene_test': flags.episodes_per_scene})

    self.global_network.prepare_loss()
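    # prepare_loss builds the loss ops; the decoder (segmentation) loss is evaluated below in process()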

    self.total_loss = []
    self.segm_loss = []
    self.episode_reward = [0]
    self.episode_roomtype = []
    self.roomType_dict = {}
    self.segnet_class_dict = {}
    self.success_rate = []
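    # buffers for mini-batches of images, segmentation masks, actions and rewards,
    # accumulated by process() for periodic segmentation-loss evaluation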
    self.batch_size = 20
    self.batch_cur_num = 0
    self.batch_prev_num = 0
    self.batch_si = []
    self.batch_sobjT = []
    self.batch_a = []
    self.batch_reward = []

  def update(self, sess):
    self.process(sess)

  def is_done(self):
    return self.environment.is_all_scheduled_episodes_done()

  def choose_action(self, pi_values):
    return np.random.choice(range(len(pi_values)), p=pi_values)

  def process(self, sess):
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(last_action, self.action_size,
                                                                  last_reward, self.environment.last_state)
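    # the last action (one-hot) and last reward are packed into one auxiliary input for the network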
    if random_policy:  # flag defined outside this snippet: evaluate a uniform random baseline
      pi_values = [1/3.0, 1/3.0, 1/3.0]  # uniform policy; assumes a 3-action space
      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward
    else:
      mode = "segnet" if flags.segnet >= 2 else ""
      segnet_preds = None
      if not flags.use_pixel_change:
        pi_values, v_value, segnet_preds = self.global_network.run_base_policy_and_value(sess,
                                                                           self.environment.last_state,
                                                                           last_action_reward, mode=mode)
      else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(sess,
                                                                                  self.environment.last_state,
                                                                                  last_action_reward)

      # accumulate [correctly predicted pixels, total pixels] per ground-truth class
      if segnet_preds is not None:
        mask = self.environment.last_state.get('objectType', None)
        if mask is not None:
          new_classes = np.unique(mask)
          if segnet_preds.shape != mask.shape:
            print("Predictions have shape {}, but groundtruth mask has shape {}".format(segnet_preds.shape, mask.shape))
          else:
            similar = segnet_preds == mask
            for id_class in new_classes:
              id_list = self.segnet_class_dict.get(id_class, None)
              if id_list is None:
                id_list = []
              id_list += [[np.sum(similar[mask == id_class]), np.sum(mask == id_class)]]
              self.segnet_class_dict[id_class] = id_list
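      # per-class pixel accuracy can later be derived from these pairs as
      # sum(correct pixels) / sum(total pixels)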

      self.batch_cur_num += 1
      if flags.segnet == -1:  # batched segmentation-loss evaluation; normally not needed
        if self.batch_cur_num != 0 and self.batch_cur_num - self.batch_prev_num >= self.batch_size:

          #print(np.unique(self.batch_sobjT))
          feed_dict = {self.global_network.base_input: self.batch_si,
                       self.global_network.base_segm_mask: self.batch_sobjT,
                       self.global_network.is_training: False}

          segm_loss, preds, confusion_mtx = sess.run([self.global_network.decoder_loss,
                                                    self.global_network.preds, self.global_network.update_evaluation_vars],
                                                   feed_dict=feed_dict)
          total_loss = 0
          self.total_loss += [total_loss]
          self.segm_loss += [segm_loss]  # TODO: do something with this value, e.g. store it somewhere

          # update everything else
          self.batch_prev_num = self.batch_cur_num
          self.batch_si = []
          self.batch_sobjT = []
          self.batch_a = []
        else:
          self.batch_si += [self.environment.last_state["image"]]
          self.batch_sobjT += [self.environment.last_state["objectType"]]
          self.batch_a += [self.environment.ACTION_LIST[self.environment.last_action]]

      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward

    if terminal:
      ep_info = self.environment._episode_info
      if ep_info['task'] == 'room_goal':
          one_hot_room = ep_info['goal']['roomTypeEncoded']
          room_type = ep_info['goal']['roomType']
          ind = np.where(one_hot_room)[0][0]
          self.roomType_dict[ind] = room_type
          self.episode_roomtype += [ind]
      self.success_rate += [int(self.environment._last_full_state["success"])]
      self.environment.reset()
      self.episode_reward += [0]