# Module-level imports reconstructed for runnability. The project-local import
# paths below are assumptions based on the symbols used in this file; adjust
# them to the actual package layout.
import logging
import os
import pickle as pkl
from multiprocessing import Pipe

import cv2
import numpy as np
import tensorflow as tf

import baselines.common.tf_util as U  # assumed source of Float32Input / Uint8Input
from .ps_learning_process import PSLearningProcess  # assumed project-local path
from .models import (  # assumed project-local path
    build_train_contrast,
    build_train_mer,
    build_train_mer_attention,
    build_train_mer_bvae_attention,
    build_train_dbc,
    build_random_input,
)


class PSMPLearnAgent(object):
    def __init__(self, model_func, exploration_schedule, obs_shape, input_type,
                 lr=1e-4, buffer_size=1000000, num_actions=6, latent_dim=32,
                 gamma=0.99, knn=4, eval_epsilon=0.01, queue_threshold=5e-5,
                 batch_size=32, tf_writer=None):
        self.conn, child_conn = Pipe()
        self.replay_buffer = np.empty((buffer_size,) + obs_shape, np.float32)
        self.ec_buffer = PSLearningProcess(num_actions, buffer_size, latent_dim,
                                           obs_shape, child_conn, gamma)
        self.obs = None
        self.z = None
        self.h = None
        self.ind = -1
        self.writer = tf_writer
        self.sequence = []
        self.gamma = gamma
        self.queue_threshold = queue_threshold
        self.num_actions = num_actions
        self.exploration_schedule = exploration_schedule
        self.latent_dim = latent_dim
        self.knn = knn
        self.steps = 0
        self.batch_size = batch_size
        self.rmax = 100000
        self.logger = logging.getLogger("ecbp")
        self.eval_epsilon = eval_epsilon
        self.train_step = 4
        self.alpha = 1
        self.burnin = 2000
        self.burnout = 1000000
        self.loss_type = ["contrast"]
        self.hash_func, self.train_func, self.eval_func, self.norm_func = build_train_contrast(
            make_obs_ph=lambda name: input_type(obs_shape, name=name),
            model_func=model_func,
            num_actions=num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-4),
            gamma=gamma,
            grad_norm_clipping=10,
            loss_type=self.loss_type)
        self.ec_buffer.start()

    def log(self, *args, logtype='debug', sep=' '):
        getattr(self.logger, logtype)(sep.join(str(a) for a in args))

    def send_and_receive(self, msg, obj):
        # Blocking request/response over the pipe to the PSLearningProcess.
        self.conn.send((msg, obj))
        self.log("waiting")
        if self.conn.poll(timeout=None):
            recv_msg, recv_obj = self.conn.recv()
            assert msg == recv_msg
            return recv_obj

    def train(self):
        # Sample a batch of (target, positive, negative) indices from the
        # episodic-control process, then look the observations up locally.
        self.log("begin training")
        samples = self.send_and_receive(4, self.batch_size)
        index_tar, index_pos, index_neg, value_tar, action_tar = samples
        obs_tar = [self.replay_buffer[ind] for ind in index_tar]
        obs_pos = [self.replay_buffer[ind] for ind in index_pos]
        obs_neg = [self.replay_buffer[ind] for ind in index_neg]
        if "regression" in self.loss_type:
            value_original = self.norm_func(np.array(obs_tar))
            value_tar = np.array(value_tar)
            self.log(value_original, "value original")
            self.log(value_tar, "value tar")
            value_original = np.array(value_original).squeeze() / self.alpha
            assert value_original.shape == np.array(value_tar).shape, \
                "{}{}".format(value_original.shape, np.array(value_tar).shape)
            # Fill missing targets with the network's own estimate.
            value_tar[np.isnan(value_tar)] = value_original[np.isnan(value_tar)]
            assert not np.isnan(value_tar).any(), "{}{}".format(value_original, obs_tar)
        inputs = [obs_tar]  # renamed from `input` to avoid shadowing the builtin
        if "contrast" in self.loss_type:
            inputs += [obs_pos, obs_neg]
        if "regression" in self.loss_type:
            inputs += [value_tar]
        if "linear_model" in self.loss_type:
            inputs += [action_tar]
            if "contrast" not in self.loss_type:
                inputs += [obs_pos]
        func = self.train_func if self.steps < self.burnout else self.eval_func
        if "contrast" in self.loss_type:
            z_tar, z_pos, z_neg, loss, summary = func(*inputs)
            self.send_and_receive(5, (index_tar, z_tar))
            self.send_and_receive(5, (index_pos, z_pos))
            self.send_and_receive(5, (index_neg, z_neg))
        elif "linear_model" in self.loss_type:
            z_tar, z_pos, loss, summary = func(*inputs)
            self.send_and_receive(5, (index_tar, z_tar))
            self.send_and_receive(5, (index_pos, z_pos))
        else:
            z_tar, loss, summary = func(*inputs)
            self.send_and_receive(5, (index_tar, z_tar))
        self.log("finish training")
        self.writer.add_summary(summary, global_step=self.steps)

    def act(self, obs, is_train=True):
        if is_train:
            self.steps += 1
        # Some wrappers hand back [{'observation': ...}]; unwrap if so.
        try:
            obs = obs[0]['observation']
        except (IndexError, KeyError, TypeError):
            pass
        self.obs = obs
        self.z, self.h = self.hash_func(np.array(obs))
        self.z = np.array(self.z).reshape((self.latent_dim,))
        self.h = tuple(np.array(self.h).reshape((self.latent_dim,)))
        if self.ind == -1:
            self.ind = self.send_and_receive(1, (np.array([self.z]), self.h))
        self.replay_buffer[self.ind] = obs
        epsilon = max(0, self.exploration_schedule.value(self.steps)) if is_train else self.eval_epsilon
        if np.random.random() < epsilon:
            return np.random.randint(0, self.num_actions)
        extrinsic_qs, intrinsic_qs, find = self.send_and_receive(
            0, (np.array([self.z]), self.h, self.knn))
        extrinsic_qs, intrinsic_qs = np.array(extrinsic_qs), np.array(intrinsic_qs)
        # Greedy selection uses the extrinsic value only; intrinsic_qs is
        # returned by the buffer but currently unused.
        q = np.squeeze(extrinsic_qs)
        q_max = np.nanmax(q)
        if np.isnan(q_max):
            max_action = np.arange(self.num_actions)
        else:
            max_action = np.where(q >= q_max - 1e-7)[0]
        self.log("action selection", max_action)
        self.log("q", q, q_max)
        action_selected = np.random.randint(0, len(max_action))
        return max_action[action_selected]

    def observe(self, action, reward, state_tp1, done, train=True):
        if isinstance(state_tp1, dict):
            state_tp1 = state_tp1['observation']
        z_tp1, h_tp1 = self.hash_func(np.array(state_tp1)[np.newaxis, ...])
        z_tp1 = np.array(z_tp1).reshape((self.latent_dim,))
        h_tp1 = tuple(np.array(h_tp1).reshape((self.latent_dim,)))
        if train:
            self.ind = self.send_and_receive(
                2, (self.ind, action, reward, z_tp1, h_tp1, done))
        else:
            self.ind = self.send_and_receive(1, (np.array([z_tp1]), h_tp1))
        if done:
            self.ind = -1
        if self.steps % self.train_step == 0 and self.steps >= self.burnin and train:
            self.train()

    def finish(self):
        self.send_and_receive(3, (True,))
class PSMPLearnTargetAgent(object):
    """Episodic-control agent with a periodically refreshed target encoder.

    Base class for the agents defined below.
    """

    def __init__(self, model_func, exploration_schedule, obs_shape, vector_input=True,
                 lr=1e-4, buffer_size=1000000, num_actions=6, latent_dim=32,
                 gamma=0.99, knn=4, eval_epsilon=0.1, queue_threshold=5e-5,
                 batch_size=32, density=True, trainable=True, num_neg=10,
                 debug=False, debug_dir=None, tf_writer=None):
        self.obs_shape = obs_shape
        self.debug = debug
        self.debug_dir = debug_dir
        self.conn, child_conn = Pipe()
        self.replay_buffer = np.empty((buffer_size + 10,) + obs_shape,
                                      np.float32 if vector_input else np.uint8)
        self.ec_buffer = PSLearningProcess(num_actions, buffer_size, latent_dim,
                                           obs_shape, child_conn, gamma, density=density)
        self.obs = None
        self.z = None
        self.cur_capacity = 0
        self.ind = -1
        self.writer = tf_writer
        self.sequence = []
        self.gamma = gamma
        self.queue_threshold = queue_threshold
        self.num_actions = num_actions
        self.exploration_schedule = exploration_schedule
        self.latent_dim = latent_dim
        self.knn = knn
        self.steps = 0
        self.batch_size = batch_size
        self.rmax = 100000
        self.logger = logging.getLogger("ecbp")
        self.log("psmp learning agent here")
        self.eval_epsilon = eval_epsilon
        self.train_step = 4
        self.alpha = 1
        self.burnin = 2000
        self.burnout = 10000000000
        self.update_target_freq = 10000
        self.buffer_capacity = 0
        self.trainable = trainable
        self.num_neg = num_neg
        self.loss_type = ["contrast"]
        input_type = U.Float32Input if vector_input else U.Uint8Input
        (self.hash_func, self.train_func, self.eval_func, self.norm_func,
         self.update_target_func) = build_train_mer(
            input_type=input_type,
            obs_shape=obs_shape,
            model_func=model_func,
            num_actions=num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-4),
            gamma=gamma,
            grad_norm_clipping=10,
            latent_dim=latent_dim,
            loss_type=self.loss_type,
            batch_size=batch_size,
            num_neg=num_neg,
            c_loss_type="infonce",
        )
        self.finds = [0, 0]
        self.contrast_type = "predictive"
        self.augment_input_func, self.rand_init_func = build_random_input(
            input_type=input_type, obs_shape=obs_shape)
        self.ec_buffer.start()

    def log(self, *args, logtype='debug', sep=' '):
        getattr(self.logger, logtype)(sep.join(str(a) for a in args))

    def send_and_receive(self, msg, obj):
        # Blocking request/response over the pipe to the PSLearningProcess.
        self.conn.send((msg, obj))
        if self.conn.poll(timeout=None):
            recv_msg, recv_obj = self.conn.recv()
            assert msg == recv_msg
            return recv_obj

    def save(self, filedir, sess, saver):
        if not os.path.isdir(filedir):
            os.makedirs(filedir, exist_ok=True)
        self.send_and_receive(10, filedir)
        # Only pickle the raw observations when the buffer is under 1 GB;
        # otherwise persist just the counters.
        with open(os.path.join(filedir, "replay_buffer.pkl"), "wb") as replay_buffer_file:
            if np.size(self.replay_buffer) < 1024 * 1024 * 1024:
                pkl.dump((self.steps, self.replay_buffer, self.cur_capacity),
                         replay_buffer_file)
            else:
                pkl.dump((self.steps, self.buffer_capacity), replay_buffer_file)
        model_file = os.path.join(filedir, "model_{}.pkl".format(self.steps))
        saver.save(sess, model_file)

    def load(self, filedir, sess, saver, num_steps, load_model=True):
        self.send_and_receive(11, filedir)
        if os.path.exists(os.path.join(filedir, "replay_buffer.pkl")):
            try:
                with open(os.path.join(filedir, "replay_buffer.pkl"), "rb") as f:
                    self.steps, self.replay_buffer, self.cur_capacity = pkl.load(f)
            except ValueError:
                # Older checkpoints store only (steps, buffer_capacity).
                with open(os.path.join(filedir, "replay_buffer.pkl"), "rb") as f:
                    self.steps, self.buffer_capacity = pkl.load(f)
        if load_model:
            model_file = os.path.join(filedir, "model_{}.pkl".format(num_steps))
            for var_name, _ in tf.contrib.framework.list_variables(filedir):
                print(var_name)
            saver.restore(sess, model_file)

    def train(self):
        samples = self.send_and_receive(4, (self.batch_size, self.num_neg))
        samples_u = self.send_and_receive(4, (self.batch_size, self.num_neg))
        samples_v = self.send_and_receive(4, (self.batch_size, self.num_neg))
        index_u, _, _, _, value_u, _, _, _ = samples_u
        index_v, _, _, _, value_v, _, _, _ = samples_v
        (index_tar, index_pos, index_neg, reward_tar, value_tar, action_tar,
         neighbours_index, neighbours_value) = samples
        if len(index_tar) < self.batch_size:
            return
        obs_tar = [self.replay_buffer[ind] for ind in index_tar]
        obs_neighbour = [self.replay_buffer[ind] for ind in neighbours_index]
        obs_u = [self.replay_buffer[ind] for ind in index_u]
        obs_v = [self.replay_buffer[ind] for ind in index_v]
        # Positives (and optionally targets/negatives) come from the successor
        # state ("predictive"), from random augmentation of the target
        # ("augment"), or from a half-and-half mixture ("both").
        if self.contrast_type == "predictive":
            obs_pos = [self.replay_buffer[ind] for ind in index_pos]
            obs_neg = [self.replay_buffer[ind] for ind in index_neg]
        elif self.contrast_type == "augment":
            self.rand_init_func()
            obs_pos = self.augment_input_func(self.replay_buffer[index_tar])[0]
            self.rand_init_func()
            obs_tar = self.augment_input_func(self.replay_buffer[index_tar])[0]
            self.rand_init_func()
            obs_neg = self.augment_input_func(self.replay_buffer[index_neg])[0]
        elif self.contrast_type == "both":  # mixture
            self.rand_init_func()
            augment_inds = np.random.choice(self.batch_size, self.batch_size // 2)
            obs_pos = np.array([self.replay_buffer[ind] for ind in index_pos])
            obs_pos_augment = self.augment_input_func(self.replay_buffer[index_tar])[0]
            obs_pos[augment_inds] = obs_pos_augment[augment_inds]
            obs_tar = np.array([self.replay_buffer[ind] for ind in index_tar])
            self.rand_init_func()
            obs_tar[augment_inds] = self.augment_input_func(
                self.replay_buffer[index_tar])[0][augment_inds]
            # obs_neg must be an ndarray for the fancy-indexed assignment below.
            obs_neg = np.array([self.replay_buffer[ind] for ind in index_neg])
            self.rand_init_func()
            obs_neg[augment_inds] = self.augment_input_func(
                self.replay_buffer[index_neg])[0][augment_inds]
        else:
            raise NotImplementedError
        if "regression" in self.loss_type:
            value_original = self.norm_func(np.array(obs_tar))
            value_tar = np.array(value_tar)
            self.log(value_original, "value original")
            self.log(value_tar, "value tar")
            value_original = np.array(value_original).squeeze() / self.alpha
            assert value_original.shape == np.array(value_tar).shape, \
                "{}{}".format(value_original.shape, np.array(value_tar).shape)
            value_tar[np.isnan(value_tar)] = value_original[np.isnan(value_tar)]
            assert not np.isnan(value_tar).any(), "{}{}".format(value_original, obs_tar)
        inputs = [obs_tar]  # renamed from `input` to avoid shadowing the builtin
        if "contrast" in self.loss_type:
            inputs += [obs_pos, obs_neg]
        if "regression" in self.loss_type:
            inputs += [np.nan_to_num(value_tar)]
        if "linear_model" in self.loss_type:
            inputs += [action_tar]
            if "contrast" not in self.loss_type:
                inputs += [obs_pos]
        if "fit" in self.loss_type:
            inputs += [obs_neighbour, np.nan_to_num(neighbours_value)]
            if "regression" not in self.loss_type:
                inputs += [np.nan_to_num(value_tar)]
        if "causality" in self.loss_type:
            inputs += [reward_tar, action_tar]
        if "weight_product" in self.loss_type:
            value_u = np.nan_to_num(np.array(value_u))
            value_v = np.nan_to_num(np.array(value_v))
            inputs += [obs_u, obs_v, obs_u, obs_v, value_u, value_v]
        func = self.train_func if self.steps < self.burnout else self.eval_func
        loss, summary = func(*inputs)
        self.writer.add_summary(summary, global_step=self.steps)

    def update_target(self):
        self.log("begin updating target")
        self.log("self.cur capacity", self.cur_capacity)
        self.update_target_func()
        # Re-embed every stored observation with the refreshed target encoder
        # and push the new latents to the episodic-control process.
        for i in range(int(np.ceil((self.cur_capacity + 1) / self.batch_size))):
            low = i * self.batch_size
            high = min(self.cur_capacity + 1, (i + 1) * self.batch_size)
            self.log("low,high", low, high)
            obs_to_update = self.replay_buffer[low:high]
            z_to_update = self.hash_func(np.array(obs_to_update).astype(np.float32))
            self.send_and_receive(5, (np.arange(low, high), np.array(z_to_update)[0]))
        self.send_and_receive(8, 0)  # recompute density
        self.log("finish updating target")

    def act(self, obs, is_train=True, debug=False):
        if is_train:
            self.steps += 1
            if self.steps % 100 == 0:
                self.log("steps", self.steps)
        self.obs = obs
        self.z = self.hash_func(np.array(obs))
        self.z = np.array(self.z).reshape((self.latent_dim,))
        if is_train:
            if self.ind < 0 or self.ind >= self.buffer_capacity:
                self.ind = self.send_and_receive(1, (np.array(self.z), None))
                self.cur_capacity = max(self.ind, self.cur_capacity)
            self.replay_buffer[self.ind] = obs
            self.buffer_capacity = max(self.ind, self.buffer_capacity)
        epsilon = max(0, self.exploration_schedule.value(self.steps)) if is_train else self.eval_epsilon
        if np.random.random() < epsilon:
            self.log("Random action")
            return np.random.randint(0, self.num_actions)
        extrinsic_qs, intrinsic_qs, find, inds, dists = self.send_and_receive(
            0, (np.array([self.z]), None, self.knn))
        extrinsic_qs, intrinsic_qs = np.array(extrinsic_qs), np.array(intrinsic_qs)
        inds = np.array(inds).reshape(-1)
        if len(inds) > 1 and debug:
            print("saving neighbour")
            self.save_neighbour(inds, dists)
        self.finds[0] += sum(find)
        self.finds[1] += 1
        # Greedy selection uses the extrinsic value only; intrinsic_qs is
        # returned by the buffer but currently unused.
        q = np.squeeze(extrinsic_qs)
        q_max = np.nanmax(q)
        if np.isnan(q_max):
            max_action = np.arange(self.num_actions)
        else:
            max_action = np.where(q >= q_max - 1e-7)[0]
        self.log("action selection", max_action)
        self.log("q", q, q_max)
        action_selected = np.random.randint(0, len(max_action))
        return int(max_action[action_selected])

    def save_neighbour(self, inds, dists):
        save_path = os.path.join(self.debug_dir, "neighbour", str(self.steps))
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        for i, (ind, dist) in enumerate(zip(inds, dists)):
            assert 0 <= ind < self.cur_capacity
            cv2.imwrite(os.path.join(save_path, "{}_{}.png".format(i, dist)),
                        self.replay_buffer[ind].transpose(1, 0, 2))

    def empty_buffer(self):
        self.cur_capacity = 0
        self.steps = 0
        self.send_and_receive(9, 0)

    def observe(self, action, reward, state_tp1, done, train=True):
        if self.steps <= 1:
            self.update_target_func()
        z_tp1 = self.hash_func(np.array(state_tp1)[np.newaxis, ...])
        z_tp1 = np.array(z_tp1).reshape((self.latent_dim,))
        if train:
            self.ind = self.send_and_receive(
                2, (self.ind, action, reward, z_tp1, None, done))
            self.cur_capacity = max(self.ind, self.cur_capacity)
            self.replay_buffer[self.ind] = state_tp1
            self.buffer_capacity = max(self.ind, self.buffer_capacity)
        else:
            self.ind = -1
        if done:
            self.ind = -1
            if self.writer is not None:
                find_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="find rate",
                                     simple_value=self.finds[0] / (self.finds[1] + 1e-9))
                ])
                self.writer.add_summary(find_summary, global_step=self.steps)
            self.finds = [0, 0]
        if self.steps > self.burnout:
            return
        if self.steps % self.train_step == 0 and self.steps >= self.burnin and train and self.trainable:
            self.train()
        if self.steps % self.update_target_freq == 0 and self.steps >= self.burnin and train and self.trainable:
            self.update_target()

    def finish(self):
        self.send_and_receive(3, (True,))


class BVAEAttentionAgent(PSMPLearnTargetAgent):
    def __init__(self, encoder_func, decoder_func, exploration_schedule, obs_shape,
                 vector_input=True, lr=1e-4, buffer_size=1000000, num_actions=6,
                 latent_dim=32, gamma=0.99, knn=4, eval_epsilon=0.1,
                 queue_threshold=5e-5, batch_size=32, density=True, trainable=True,
                 num_neg=10, tf_writer=None):
        self.conn, child_conn = Pipe()
        self.replay_buffer = np.empty((buffer_size + 10,) + obs_shape,
                                      np.float32 if vector_input else np.uint8)
        # The stored latent is latent_dim * 2 wide (the BVAE posterior keeps
        # two statistics per dimension).
        self.ec_buffer = PSLearningProcess(num_actions, buffer_size, latent_dim * 2,
                                           obs_shape, child_conn, gamma, density=density)
        self.obs = None
        self.z = None
        self.cur_capacity = 0
        self.ind = -1
        self.writer = tf_writer
        self.sequence = []
        self.gamma = gamma
        self.queue_threshold = queue_threshold
        self.num_actions = num_actions
        self.exploration_schedule = exploration_schedule
        self.latent_dim = latent_dim
        self.knn = knn
        self.steps = 0
        self.batch_size = batch_size
        self.rmax = 100000
        self.logger = logging.getLogger("ecbp")
        self.log("psmp learning agent here")
        self.eval_epsilon = eval_epsilon
        self.train_step = 4
        self.alpha = 1
        self.burnin = 2000
        self.burnout = 10000000000
        self.update_target_freq = 10000
        self.buffer_capacity = 0
        self.trainable = trainable
        self.num_neg = num_neg
        self.loss_type = ["attention"]
        input_type = U.Float32Input if vector_input else U.Uint8Input
        (self.hash_func, self.unmask_z_func, self.train_func, self.eval_func,
         self.norm_func, self.attention_func, self.value_func,
         self.reconstruct_func, self.update_target_func) = build_train_mer_bvae_attention(
            input_type=input_type,
            obs_shape=obs_shape,
            encoder_func=encoder_func,
            decoder_func=decoder_func,
            num_actions=num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-4),
            gamma=gamma,
            grad_norm_clipping=10,
            latent_dim=latent_dim,
            loss_type=self.loss_type,
            batch_size=batch_size,
            num_neg=num_neg,
            c_loss_type="sqmargin",
        )
        self.finds = [0, 0]
        self.ec_buffer.start()

    def train(self):
        # Three independent batches: one for the main loss and two (u, v) for
        # the weight-product regularizer when it is enabled.
        noise = np.random.randn(9, self.batch_size, self.latent_dim)
        samples = self.send_and_receive(4, (self.batch_size, self.num_neg))
        samples_u = self.send_and_receive(4, (self.batch_size, self.num_neg))
        samples_v = self.send_and_receive(4, (self.batch_size, self.num_neg))
        index_u, _, _, _, value_u, _, _, _ = samples_u
        index_v, _, _, _, value_v, _, _, _ = samples_v
        (index_tar, index_pos, index_neg, reward_tar, value_tar, action_tar,
         neighbours_index, neighbours_value) = samples
        if len(index_tar) < self.batch_size:
            return
        obs_tar = [self.replay_buffer[ind] for ind in index_tar]
        obs_pos = [self.replay_buffer[ind] for ind in index_pos]
        obs_neg = [self.replay_buffer[ind] for ind in index_neg]
        obs_neighbour = [self.replay_buffer[ind] for ind in neighbours_index]
        obs_u = [self.replay_buffer[ind] for ind in index_u]
        obs_v = [self.replay_buffer[ind] for ind in index_v]
        if "regression" in self.loss_type:
            value_original = self.norm_func(np.array(obs_tar))
            value_tar = np.array(value_tar)
            self.log(value_original, "value original")
            self.log(value_tar, "value tar")
            value_original = np.array(value_original).squeeze() / self.alpha
            assert value_original.shape == np.array(value_tar).shape, \
                "{}{}".format(value_original.shape, np.array(value_tar).shape)
            value_tar[np.isnan(value_tar)] = value_original[np.isnan(value_tar)]
            assert not np.isnan(value_tar).any(), "{}{}".format(value_original, obs_tar)
        inputs = [noise, obs_tar]  # renamed from `input` to avoid shadowing the builtin
        if "contrast" in self.loss_type:
            inputs += [obs_pos, obs_neg]
        if "regression" in self.loss_type:
            inputs += [np.nan_to_num(value_tar)]
        if "linear_model" in self.loss_type:
            inputs += [action_tar]
            if "contrast" not in self.loss_type:
                inputs += [obs_pos]
        if "fit" in self.loss_type:
            inputs += [obs_neighbour, np.nan_to_num(neighbours_value)]
            if "regression" not in self.loss_type:
                inputs += [np.nan_to_num(value_tar)]
        if "causality" in self.loss_type:
            inputs += [reward_tar, action_tar]
        if "weight_product" in self.loss_type:
            value_u = np.nan_to_num(np.array(value_u))
            value_v = np.nan_to_num(np.array(value_v))
            inputs += [obs_u, obs_v, obs_u, obs_v, value_u, value_v]
        if "attention" in self.loss_type:
            value_original = self.value_func(noise, np.array(obs_tar))
            value_tar = np.array(value_tar)
            value_original = np.array(value_original).squeeze()
            value_tar[np.isnan(value_tar)] = value_original[np.isnan(value_tar)]
            inputs += [value_tar]
        func = self.train_func if self.steps < self.burnout else self.eval_func
        loss, summary = func(*inputs)
        self.writer.add_summary(summary, global_step=self.steps)

    def save_attention(self, filedir, step):
        subdir = os.path.join(filedir, "attention")
        noise = np.random.randn(9, 1, self.latent_dim)
        origin_z = np.array(self.unmask_z_func(noise, np.array(self.obs)))[0]
        z = np.array(self.hash_func(noise, np.array(self.obs)))[0]
        attention = np.array(self.attention_func(noise, np.array(self.obs)))
        origin_image, _ = self.reconstruct_func(noise, origin_z)
        origin_image = origin_image[0]
        attentioned_image, predict_value = self.reconstruct_func(noise, z)
        attentioned_image = attentioned_image[0]
        print("attention var mean max min", np.var(attention), np.mean(attention),
              np.max(attention), np.min(attention), predict_value)
        os.makedirs(os.path.join(subdir, "ori"), exist_ok=True)
        os.makedirs(os.path.join(subdir, "atten"), exist_ok=True)
        cv2.imwrite(os.path.join(subdir, "ori", "origin_{}.png".format(step)),
                    origin_image.transpose((1, 0, 2)) * 255)
        cv2.imwrite(os.path.join(subdir, "atten", "attention_{}.png".format(step)),
                    attentioned_image.transpose((1, 0, 2)) * 255)

    def act(self, obs, is_train=True):
        if is_train:
            self.steps += 1
            if self.steps % 100 == 0:
                self.log("steps", self.steps)
        self.obs = obs
        noise = np.random.randn(9, self.batch_size, self.latent_dim)
        self.z = self.hash_func(noise, np.array(obs))
        self.z = np.array(self.z).reshape((self.latent_dim * 2,))
        if is_train:
            if self.ind < 0 or self.ind >= self.buffer_capacity:
                self.ind = self.send_and_receive(1, (np.array(self.z), None))
                self.cur_capacity = max(self.ind, self.cur_capacity)
            self.replay_buffer[self.ind] = obs
            self.buffer_capacity = max(self.ind, self.buffer_capacity)
        epsilon = max(0, self.exploration_schedule.value(self.steps)) if is_train else self.eval_epsilon
        if np.random.random() < epsilon:
            return np.random.randint(0, self.num_actions)
        extrinsic_qs, intrinsic_qs, find = self.send_and_receive(
            0, (np.array([self.z]), None, self.knn))
        extrinsic_qs, intrinsic_qs = np.array(extrinsic_qs), np.array(intrinsic_qs)
        self.finds[0] += sum(find)
        self.finds[1] += 1
        # Greedy selection uses the extrinsic value only.
        q = np.squeeze(extrinsic_qs)
        q_max = np.nanmax(q)
        if np.isnan(q_max):
            max_action = np.arange(self.num_actions)
        else:
            max_action = np.where(q >= q_max - 1e-7)[0]
        self.log("action selection", max_action)
        self.log("q", q, q_max)
        action_selected = np.random.randint(0, len(max_action))
        return int(max_action[action_selected])

    def observe(self, action, reward, state_tp1, done, train=True):
        if self.steps <= 1:
            self.update_target_func()
        noise = np.random.randn(9, self.batch_size, self.latent_dim)
        z_tp1 = self.hash_func(noise, np.array(state_tp1)[np.newaxis, ...])
        z_tp1 = np.array(z_tp1).reshape((self.latent_dim * 2,))
        if train:
            self.ind = self.send_and_receive(
                2, (self.ind, action, reward, z_tp1, None, done))
            self.cur_capacity = max(self.ind, self.cur_capacity)
            self.replay_buffer[self.ind] = state_tp1
            self.buffer_capacity = max(self.ind, self.buffer_capacity)
        else:
            self.ind = -1
        if done:
            self.ind = -1
            if self.writer is not None:
                find_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="find rate",
                                     simple_value=self.finds[0] / (self.finds[1] + 1e-9))
                ])
                self.writer.add_summary(find_summary, global_step=self.steps)
            self.finds = [0, 0]
        if self.steps > self.burnout:
            return
        if self.steps % self.train_step == 0 and self.steps >= self.burnin and train and self.trainable:
            self.train()
        if self.steps % self.update_target_freq == 0 and self.steps >= self.burnin and train and self.trainable:
            self.update_target()

    def update_target(self):
        self.log("begin updating target")
        self.log("self.cur capacity", self.cur_capacity)
        self.update_target_func()
        noise = np.random.randn(9, self.batch_size, self.latent_dim)
        # Re-embed every stored observation with the refreshed target encoder
        # and push the new latents to the episodic-control process.
        for i in range(int(np.ceil((self.cur_capacity + 1) / self.batch_size))):
            low = i * self.batch_size
            high = min(self.cur_capacity + 1, (i + 1) * self.batch_size)
            self.log("low,high", low, high)
            obs_to_update = self.replay_buffer[low:high]
            z_to_update = self.hash_func(noise, np.array(obs_to_update).astype(np.float32))
            self.send_and_receive(5, (np.arange(low, high), np.array(z_to_update)[0]))
        self.send_and_receive(8, 0)  # recompute density
        self.log("finish updating target")
# NOTE: this class has the same name as the agent defined right after it, so
# that later definition shadows this one at import time. It is kept because it
# wires up the DBC training graph (build_train_dbc) instead of the MER one.
class MERAttentionAgent(PSMPLearnTargetAgent):
    def __init__(self, repr_func, model_func, exploration_schedule, obs_shape,
                 vector_input=True, lr=1e-4, buffer_size=1000000, num_actions=6,
                 latent_dim=32, gamma=0.99, knn=4, eval_epsilon=0.1,
                 queue_threshold=5e-5, batch_size=32, density=True,
                 trainable=True, num_neg=10, tf_writer=None):
        self.conn, child_conn = Pipe()
        self.replay_buffer = np.empty((buffer_size + 10,) + obs_shape,
                                      np.float32 if vector_input else np.uint8)
        self.ec_buffer = PSLearningProcess(num_actions, buffer_size, latent_dim,
                                           obs_shape, child_conn, gamma, density=density)
        self.obs = None
        self.z = None
        self.cur_capacity = 0
        self.ind = -1
        self.writer = tf_writer
        self.sequence = []
        self.gamma = gamma
        self.queue_threshold = queue_threshold
        self.num_actions = num_actions
        self.exploration_schedule = exploration_schedule
        self.latent_dim = latent_dim
        self.knn = knn
        self.steps = 0
        self.batch_size = batch_size
        self.rmax = 100000
        self.logger = logging.getLogger("ecbp")
        self.log("psmp learning agent here")
        self.eval_epsilon = eval_epsilon
        self.train_step = 4
        self.alpha = 1
        self.burnin = 2000
        self.burnout = 10000000000
        self.update_target_freq = 1000
        self.buffer_capacity = 0
        self.trainable = trainable
        self.num_neg = num_neg
        self.loss_type = ["contrast"]
        input_type = U.Float32Input if vector_input else U.Uint8Input
        (self.hash_func, self.train_func, self.eval_func,
         self.update_target_func) = build_train_dbc(
            input_type=input_type,
            obs_shape=obs_shape,
            repr_func=repr_func,
            model_func=model_func,
            num_actions=num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-4),
            gamma=gamma,
            grad_norm_clipping=10,
            latent_dim=latent_dim,
            loss_type=self.loss_type,
            batch_size=batch_size,
            num_neg=num_neg,
            c_loss_type="margin",
        )
        self.finds = [0, 0]
        self.ec_buffer.start()

    def train(self):
        # DBC-style update: one transition batch (u, u_tp1) plus an
        # independent batch (v) to compare against.
        samples_u = self.send_and_receive(4, (self.batch_size, self.num_neg))
        samples_v = self.send_and_receive(4, (self.batch_size, self.num_neg))
        index_u, index_u_tp1, _, reward_u, _, action_u, _, _ = samples_u
        index_v, _, _, _, _, _, _, _ = samples_v
        if len(index_u) < self.batch_size:
            return
        obs_u = [self.replay_buffer[ind] for ind in index_u]
        obs_u_tp1 = [self.replay_buffer[ind] for ind in index_u_tp1]
        obs_v = [self.replay_buffer[ind] for ind in index_v]
        inputs = [obs_u, obs_u_tp1, obs_v, action_u, reward_u]
        func = self.train_func if self.steps < self.burnout else self.eval_func
        loss, summary = func(*inputs)
        self.writer.add_summary(summary, global_step=self.steps)
class MERAttentionAgent(PSMPLearnTargetAgent):
    def __init__(self, model_func, exploration_schedule, obs_shape, vector_input=True,
                 lr=1e-4, buffer_size=1000000, num_actions=6, latent_dim=32,
                 gamma=0.99, knn=4, eval_epsilon=0.1, queue_threshold=5e-5,
                 batch_size=32, density=True, trainable=True, num_neg=10,
                 debug=False, debug_dir=None, tf_writer=None):
        self.debug = debug
        self.debug_dir = debug_dir
        self.obs_shape = obs_shape
        self.conn, child_conn = Pipe()
        self.replay_buffer = np.empty((buffer_size + 10,) + obs_shape,
                                      np.float32 if vector_input else np.uint8)
        self.ec_buffer = PSLearningProcess(num_actions, buffer_size, latent_dim,
                                           obs_shape, child_conn, gamma, density=density)
        self.obs = None
        self.z = None
        self.cur_capacity = 0
        self.ind = -1
        self.writer = tf_writer
        self.sequence = []
        self.gamma = gamma
        self.queue_threshold = queue_threshold
        self.num_actions = num_actions
        self.exploration_schedule = exploration_schedule
        self.latent_dim = latent_dim
        self.knn = knn
        self.steps = 0
        self.batch_size = batch_size
        self.rmax = 100000
        self.logger = logging.getLogger("ecbp")
        self.log("psmp learning agent here")
        self.eval_epsilon = eval_epsilon
        self.train_step = 4
        self.alpha = 1
        self.burnin = 2000
        self.burnout = 10000000000
        self.update_target_freq = 10000
        self.buffer_capacity = 0
        self.trainable = trainable
        self.num_neg = num_neg
        self.loss_type = ["contrast"]
        input_type = U.Float32Input if vector_input else U.Uint8Input
        (self.hash_func, self.train_func, self.eval_func, self.norm_func,
         self.attention_func, self.value_func,
         self.update_target_func) = build_train_mer_attention(
            input_type=input_type,
            obs_shape=obs_shape,
            model_func=model_func,
            num_actions=num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-4),
            gamma=gamma,
            grad_norm_clipping=10,
            latent_dim=latent_dim,
            loss_type=self.loss_type,
            batch_size=batch_size,
            num_neg=num_neg,
            c_loss_type="sqmargin",
        )
        self.contrast_type = "predictive"
        self.augment_input_func, self.rand_init_func = build_random_input(
            input_type=input_type, obs_shape=obs_shape)
        self.finds = [0, 0]
        print("writer", self.writer)
        self.ec_buffer.start()

    def train(self):
        self.log("begin training")
        samples = self.send_and_receive(4, (self.batch_size, self.num_neg, 'uniform'))
        samples_u = self.send_and_receive(4, (self.batch_size, self.num_neg, 'uniform'))
        samples_v = self.send_and_receive(4, (self.batch_size, self.num_neg, 'uniform'))
        samples_attention = self.send_and_receive(4, (self.batch_size, self.num_neg, 'uniform'))
        index_u, _, _, _, q_value_u, _, _, _ = samples_u
        index_v, _, _, _, q_value_v, _, _, _ = samples_v
        (index_tar, index_pos, index_neg, reward_tar, value_tar, action_tar,
         neighbours_index, neighbours_value) = samples
        index_tar_attention, _, _, _, q_value_tar_attention, _, _, _ = samples_attention
        self.log("finish sampling")
        if len(index_tar) < self.batch_size:
            return
        obs_tar = [self.replay_buffer[ind] for ind in index_tar]
        obs_tar_attention = [self.replay_buffer[ind] for ind in index_tar_attention]
        obs_neg = [self.replay_buffer[ind] for ind in index_neg]
        obs_neighbour = [self.replay_buffer[ind] for ind in neighbours_index]
        obs_u = [self.replay_buffer[ind] for ind in index_u]
        obs_v = [self.replay_buffer[ind] for ind in index_v]
        # Choose how positives for the contrastive loss are generated.
        if self.contrast_type == "predictive":
            obs_pos = [self.replay_buffer[ind] for ind in index_pos]
        elif self.contrast_type == "augment":
            self.rand_init_func()
            obs_pos = self.augment_input_func(self.replay_buffer[index_tar])[0]
        elif self.contrast_type == "both":  # mixture of the two
            self.rand_init_func()
            augment_inds = np.random.choice(self.batch_size, self.batch_size // 2)
            obs_pos = np.array([self.replay_buffer[ind] for ind in index_pos])
            obs_pos_augment = self.augment_input_func(self.replay_buffer[index_tar])[0]
            obs_pos[augment_inds] = obs_pos_augment[augment_inds]
        else:
            raise NotImplementedError
        if "regression" in self.loss_type:
            value_original = self.norm_func(np.array(obs_tar))
            value_tar = np.array(value_tar)
            self.log(value_original, "value original")
            self.log(value_tar, "value tar")
            value_original = np.array(value_original).squeeze() / self.alpha
            assert value_original.shape == np.array(value_tar).shape, \
                "{}{}".format(value_original.shape, np.array(value_tar).shape)
            value_tar[np.isnan(value_tar)] = value_original[np.isnan(value_tar)]
            assert not np.isnan(value_tar).any(), "{}{}".format(value_original, obs_tar)
        inputs = [obs_tar]  # renamed from `input` to avoid shadowing the builtin
        if "contrast" in self.loss_type:
            inputs += [obs_pos, obs_neg]
        if "regression" in self.loss_type:
            inputs += [np.nan_to_num(value_tar)]
        if "linear_model" in self.loss_type:
            inputs += [action_tar]
            if "contrast" not in self.loss_type:
                inputs += [obs_pos]
        if "fit" in self.loss_type:
            inputs += [obs_neighbour, np.nan_to_num(neighbours_value)]
            if "regression" not in self.loss_type:
                inputs += [np.nan_to_num(value_tar)]
        if "causality" in self.loss_type:
            inputs += [reward_tar, action_tar]
        if "weight_product" in self.loss_type:
            value_u = np.nanmax(np.array(q_value_u), axis=1)
            value_v = np.nanmax(np.array(q_value_v), axis=1)
            value_u = np.nan_to_num(value_u)
            value_v = np.nan_to_num(value_v)
            inputs += [obs_u, obs_v, obs_u, obs_v, value_u, value_v]
        if "attention" in self.loss_type:
            value_original = self.value_func(np.array(obs_tar_attention))
            value_tar_attention = np.array(q_value_tar_attention)
            value_original = np.array(value_original).squeeze()
            value_tar_attention[np.isnan(value_tar_attention)] = \
                value_original[np.isnan(value_tar_attention)]
            inputs += [obs_tar_attention, value_tar_attention]
        func = self.train_func if self.steps < self.burnout else self.eval_func
        loss, summary = func(*inputs)
        self.log("finish training")
        self.writer.add_summary(summary, global_step=self.steps)
        if self.debug:
            self.save_debug(self.debug_dir, obs_tar, obs_pos, obs_neg)

    def save_debug(self, filedir, tar, pos, neg):
        subdir = os.path.join(filedir, "debug_sample")
        # Create the per-step directories up front; only the top-level
        # directories existed before, so cv2.imwrite silently failed.
        for name in ("tar", "pos", "neg"):
            os.makedirs(os.path.join(subdir, name, str(self.steps)), exist_ok=True)
        for i, tar_image in enumerate(tar):
            cv2.imwrite(os.path.join(subdir, "tar", str(self.steps), "{}.png".format(i)),
                        tar_image * 255)
        for i, pos_image in enumerate(pos):
            cv2.imwrite(os.path.join(subdir, "pos", str(self.steps), "{}.png".format(i)),
                        pos_image * 255)
        for i, neg_image in enumerate(neg):
            cv2.imwrite(os.path.join(subdir, "neg", str(self.steps), "{}.png".format(i)),
                        neg_image * 255)

    def save_attention(self, filedir, step):
        subdir = os.path.join(filedir, "attention")
        attention = np.array(self.attention_func(np.array(self.obs)))
        value = np.array(self.value_func(np.array(self.obs)))
        print("var", np.var(attention), np.max(attention), np.min(attention), value)
        # The attention map comes back flat; reshape it to a square, min-max
        # normalize (with a small epsilon against a constant map), and resize
        # to the observation resolution before masking the image.
        length = int(np.sqrt(np.size(attention)))
        attention = attention.reshape(length, length)
        attention = (attention - np.min(attention)) / \
            (np.max(attention) - np.min(attention) + 1e-12)
        attention = cv2.resize(attention, (self.obs_shape[0], self.obs_shape[1]))
        attention = np.repeat(attention[..., np.newaxis], 3, axis=2)
        image = np.array(self.obs)[0, ..., :3]
        print("image min max", np.max(image), np.min(image))
        attentioned_image = image * attention
        os.makedirs(os.path.join(subdir, "mask"), exist_ok=True)
        os.makedirs(os.path.join(subdir, "masked_image"), exist_ok=True)
        os.makedirs(os.path.join(subdir, "image"), exist_ok=True)
        cv2.imwrite(os.path.join(subdir, "masked_image",
                                 "masked_image_{}.png".format(step)),
                    attentioned_image.transpose((1, 0, 2)))
        cv2.imwrite(os.path.join(subdir, "mask", "attention_{}.png".format(step)),
                    attention.transpose((1, 0, 2)) * 255)
        cv2.imwrite(os.path.join(subdir, "image", "obs_{}.png".format(step)),
                    image.transpose((1, 0, 2)))

    def save_neighbour(self, inds, dists):
        self.rand_init_func()
        save_path = os.path.join(self.debug_dir, "neighbour", str(self.steps))
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        for i, (ind, dist) in enumerate(zip(inds, dists)):
            assert 0 <= ind <= self.cur_capacity, \
                "ind:{},cur_capacity:{}".format(ind, self.cur_capacity)
            cv2.imwrite(os.path.join(save_path, "{}_{}_{}.png".format(i, dist, ind)),
                        self.replay_buffer[ind].transpose(1, 0, 2))
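
# save_attention above normalizes a flat attention map, resizes it to the
# observation, and multiplies it into the image. The same steps as a small
# reusable sketch; the function name and epsilon guard are mine, not part of
# the original API, and the (shape[0], shape[1]) resize order mirrors the
# convention used above.
def overlay_attention(attention, image):
    """Return image weighted by a min-max normalized attention map.

    attention: flat array of length d*d; image: an HxWx3 float array.
    """
    side = int(np.sqrt(np.size(attention)))
    mask = np.asarray(attention, dtype=np.float32).reshape(side, side)
    mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-12)
    mask = cv2.resize(mask, (image.shape[0], image.shape[1]))
    mask = np.repeat(mask[..., np.newaxis], 3, axis=2)
    return image * mask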