def train():
    feature_mapper = FeatureMapper()
    score = Score()
    segmentation = Segmentation()
    feature_mapper.load_weights("./immutable_weights/feature_mapper")
    # score.load_weights("./weights/score")
    # segmentation.load_weights("./weights/segmentation")

    opt = Adam(learning_rate=5e-5)
    with open("../data/data_classification_train.json") as json_file:
        data = json.load(json_file)

    data_index = 0
    while str(data_index) in data:
        img = get_img("../pictures/pictures_classification_train/{}.png".format(data_index))
        true_masks = get_true_mask(data[str(data_index)])
        features = feature_mapper(img)

        def get_loss():
            segmentation_prediction = segmentation(features)
            score_prediction = score(features)
            show_evaluation(segmentation_prediction, true_masks, data_index)
            return calculate_loss(segmentation_prediction, score_prediction, true_masks)

        opt.minimize(get_loss, [score.trainable_weights, segmentation.trainable_weights])

        if data_index % 100 == 99:
            score.save_weights("./weights/score")
            segmentation.save_weights("./weights/segmentation")
        data_index += 1
def train():
    feature_mapper = FeatureMapper()
    rpn = Rpn()
    roi_pooling = RoiPooling()
    classifier = Classifier()
    regr = Regr()
    feature_mapper.load_weights("./weights/feature_mapper")
    rpn.load_weights("./weights/rpn")
    classifier.load_weights("./weights/classifier")
    regr.load_weights("./weights/regr")

    opt = Adam(learning_rate=5e-5)
    with open("../data/data_detect_local_evaluate_10000.json") as json_file:
        data = json.load(json_file)

    data_index = 0
    while str(data_index) in data:
        raw_data = data[str(data_index)]
        target, bounding_box_target = get_localization_data(raw_data)
        img = get_img("../pictures/pictures_detect_local_evaluate_10000/{}.png".format(data_index))

        def get_loss():
            features = feature_mapper(img)
            rpn_map = rpn(features)
            boxes, probs = get_boxes(rpn_map)
            feature_areas = roi_pooling(features, boxes)

            classification_logits = classifier(feature_areas)
            regression_values = regr(feature_areas)
            labels_boxes = get_labels_boxes(boxes, target)

            localization_loss = get_localization_loss(rpn_map, target)
            regression_loss = get_regression_loss(regression_values, boxes, bounding_box_target, probs)
            classification_loss = get_classification_loss(classification_logits, labels_boxes, probs)

            no_regr_boxes_precision = get_boxes_precision(boxes, np.zeros(regression_values.shape), target)
            final_boxes_precision = get_boxes_precision(boxes, regression_values.numpy(), target)
            save_data(
                data_index,
                raw_data,
                boxes.tolist(),
                [a.numpy().tolist() for a in classification_logits],
                labels_boxes,
                no_regr_boxes_precision,
                final_boxes_precision,
                probs.tolist(),
            )

            return localization_loss + classification_loss + regression_loss

        opt.minimize(
            get_loss,
            [feature_mapper.trainable_weights, rpn.trainable_weights, classifier.trainable_weights, regr.trainable_weights],
        )

        data_index += 1
        if data_index % 100 == 99:
            feature_mapper.save_weights("./weights/feature_mapper")
            rpn.save_weights("./weights/rpn")
            classifier.save_weights("./weights/classifier")
            regr.save_weights("./weights/regr")
class Agent:
    def __init__(self, input_dim, output_dim, hidden_layers=[32, 32], lr=1e-3):
        self.input_dim = input_dim
        self.output_dim = output_dim

        # Build the policy network
        self.input = Input(shape=(input_dim, ))
        X = self.input
        for size in hidden_layers:
            X = Dense(size, activation="relu")(X)
        X = Dense(output_dim, activation="softmax")(X)
        self.model = Model(inputs=self.input, outputs=X)

        # Build the optimizer
        self.optimizer = Adam(lr)

    def update(self, states, actions, weights):
        """Does one step of policy gradient update.

        Args:
            states: np.array of sample states. dim = (n_samples, self.input_dim)
            actions: np.array of sample actions. dim = (n_samples,)
            weights: np.array of sample weights, e.g. rewards-to-go. dim = (n_samples,)
        """
        def loss():
            action_prob = self.model(states)
            action_mask = utils.to_categorical(actions, num_classes=self.output_dim)
            probs = tf.reduce_sum(action_prob * action_mask, axis=1)
            log_probs = tf.math.log(probs)
            return -tf.reduce_mean(log_probs * weights)

        self.optimizer.minimize(loss, lambda: self.model.trainable_weights)

    def sample_action(self, s):
        """Samples an action from the current policy."""
        state = np.expand_dims(s, axis=0)
        action_prob = self.model.predict(state)[0]
        return np.random.choice(range(self.output_dim), p=action_prob)

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        del self.model
        self.model = tf.keras.models.load_model(path)
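# A minimal usage sketch for the Agent class above -- not part of the original
# code. It assumes a classic Gym-style environment with the 4-tuple step API
# ("CartPole-v1" is only an illustrative choice) and uses discounted
# rewards-to-go as the policy-gradient weights.
import gym
import numpy as np


def rewards_to_go(rewards, gamma=0.99):
    # Discounted reward-to-go for each timestep of a single episode.
    rtg = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg


env = gym.make("CartPole-v1")
agent = Agent(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)

states, actions, rewards = [], [], []
s, done = env.reset(), False
while not done:
    a = agent.sample_action(s)
    s2, r, done, _ = env.step(a)
    states.append(s)
    actions.append(a)
    rewards.append(r)
    s = s2

agent.update(np.array(states, dtype=np.float32), np.array(actions), rewards_to_go(rewards))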
def train():
    feature_mapper = FeatureMapper()
    rpn = Rpn()
    roi_pooling = RoiPooling()
    regr = Regr()
    segmentation = Segmentation()
    feature_mapper.load_weights("./weights/feature_mapper")
    rpn.load_weights("./weights/rpn")
    regr.load_weights("./weights/regr")
    segmentation.load_weights("./weights/segmentation")

    opt = Adam(learning_rate=5e-5)
    with open("../data/data_detect_local_evaluate_10000.json") as json_file:
        data = json.load(json_file)

    data_index = 0
    while str(data_index) in data:
        raw_data = data[str(data_index)]
        true_mask = get_true_mask(raw_data)
        img = get_img("../pictures/pictures_detect_local_evaluate_10000/{}.png".format(data_index))

        features = feature_mapper(img)
        rpn_map = rpn(features)
        boxes, probs = get_boxes(rpn_map)
        feature_areas = roi_pooling(features, boxes)
        regression_values = regr(feature_areas)
        regr_boxes = [
            get_final_box(boxes[i], regression_values[i].numpy())
            for i in range(len(boxes)) if probs[i] > .9
        ]

        if len(regr_boxes) > 0:
            regr_feature_areas = roi_pooling(features, regr_boxes)
            box_true_masks = get_box_true_mask(regr_boxes, true_mask)

            def get_loss():
                predicted_masks = segmentation(regr_feature_areas)
                return get_segmentation_loss(predicted_masks, box_true_masks)

            opt.minimize(get_loss, [segmentation.trainable_weights])

        data_index += 1
        if data_index % 100 == 99:
            print("{} - Weights saved".format(data_index))
            segmentation.save_weights("./weights/segmentation")
def train():
    yolo = Yolo()
    yolo.load_weights("./weights/yolo")

    opt = Adam(learning_rate=5e-5)
    with open("../data/data_detect_local_train.json") as json_file:
        data = json.load(json_file)

    data_index = 0
    while str(data_index) in data:
        img = get_img("../pictures/pictures_detect_local_train/{}.png".format(data_index))
        true_labels, true_boxes, true_preds = get_localization_data(data[str(data_index)])

        def get_loss():
            preds = yolo(img)
            return calculate_loss(preds, true_labels, true_boxes, true_preds)

        opt.minimize(get_loss, [yolo.trainable_weights])

        if data_index % 100 == 99:
            yolo.save_weights("./weights/yolo")
        data_index += 1
# Target value for the Q network loss.
# These target values should never be differentiated, so we do not
# backpropagate through them; the target-network weights are never learned
# directly, they are only updated with a moving average.
q_target = tf.stop_gradient(R + gamma * (1 - D) * q_mu_target)

# DDPG losses
mu_loss = -tf.reduce_mean(q_mu)
q_loss = tf.reduce_mean((q - q_target) * (q - q_target))

# Train each network separately
mu_optimizer = Adam(learning_rate=mu_lr)
q_optimizer = Adam(learning_rate=q_lr)

# We want to maximize Q wrt μ
mu_train_op = mu_optimizer.minimize(mu_loss, var_list=[mu])
q_train_op = q_optimizer.minimize(q_loss, var_list=[q])

# Use soft updates to update the target networks
target_update = tf.group([
    tf.assign(v_targ, decay * v_targ + (1 - decay) * v_main)
    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
])


class ReplayBuffer:
    """The experience replay memory."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
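# The DDPG fragment above mixes TF1-style ops (tf.assign, tf.group, minimize on
# a tensor loss) with the Keras Adam used elsewhere in these examples. Below is
# a rough eager-mode (TF2) sketch of the same losses and soft target update; it
# is only an illustration under assumptions: make_net, the four networks, and
# the dummy transition batch are stand-ins, not from the original code.
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam


def make_net(in_dim, out_dim):
    inp = layers.Input(shape=(in_dim,))
    x = layers.Dense(32, activation="relu")(inp)
    return Model(inp, layers.Dense(out_dim)(x))


obs_dim, act_dim, gamma, decay = 3, 1, 0.99, 0.95
mu_net = make_net(obs_dim, act_dim)
q_net = make_net(obs_dim + act_dim, 1)
mu_target_net = make_net(obs_dim, act_dim)
q_target_net = make_net(obs_dim + act_dim, 1)
mu_target_net.set_weights(mu_net.get_weights())
q_target_net.set_weights(q_net.get_weights())

# A dummy batch of transitions (S, A, R, S2, D), just to make the sketch runnable.
S = tf.random.normal((8, obs_dim))
A = tf.random.normal((8, act_dim))
R = tf.random.normal((8, 1))
S2 = tf.random.normal((8, obs_dim))
D = tf.zeros((8, 1))

mu_optimizer = Adam(learning_rate=1e-3)
q_optimizer = Adam(learning_rate=1e-3)


def q_loss():
    # Bellman backup; stop_gradient keeps the target fixed during this step.
    q_targ_in = tf.concat([S2, mu_target_net(S2)], axis=-1)
    backup = tf.stop_gradient(R + gamma * (1.0 - D) * q_target_net(q_targ_in))
    return tf.reduce_mean((q_net(tf.concat([S, A], axis=-1)) - backup) ** 2)


def mu_loss():
    # Maximizing Q wrt the policy == minimizing -Q.
    return -tf.reduce_mean(q_net(tf.concat([S, mu_net(S)], axis=-1)))


q_optimizer.minimize(q_loss, var_list=q_net.trainable_variables)
mu_optimizer.minimize(mu_loss, var_list=mu_net.trainable_variables)

# Soft (Polyak) update of the target networks.
for main, targ in ((mu_net, mu_target_net), (q_net, q_target_net)):
    for v_main, v_targ in zip(main.trainable_variables, targ.trainable_variables):
        v_targ.assign(decay * v_targ + (1 - decay) * v_main)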
def minimize_cost(self):
    self.total_cost = self.compute_total_cost()
    opt = Adam(lr=self.lr)
    # NOTE: with tf.keras optimizers, minimize() expects a callable loss and an
    # explicit var_list; passing a precomputed tensor like this is the TF1-style call.
    return opt.minimize(self.total_cost)
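# For contrast with the method above, a minimal, self-contained illustration of
# the callable-loss form that tf.keras optimizers expect (w and total_cost are
# hypothetical, chosen only for the example).
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

w = tf.Variable([1.0, -2.0])
opt = Adam(learning_rate=0.1)


def total_cost():
    # Any differentiable scalar works as the callable loss.
    return tf.reduce_sum(w ** 2)


opt.minimize(total_cost, var_list=[w])  # one gradient step on w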
class DiscreteAgent:
    def __init__(self, input_dim, output_dim, hidden_layers=[32, 32], policy_lr=1e-3, v_lr=1e-3, v_update_steps=80):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.v_update_steps = v_update_steps

        self._build_policy_model(input_dim, output_dim, hidden_layers, policy_lr)
        self._build_value_model(input_dim, hidden_layers, v_lr)

    def _build_policy_model(self, input_dim, output_dim, hidden_layers, lr):
        policy_input = Input(shape=(input_dim,))
        X = policy_input
        for size in hidden_layers:
            X = Dense(size, activation="tanh", kernel_initializer="glorot_normal")(X)
        X = Dense(output_dim, activation="softmax", kernel_initializer="glorot_normal")(X)
        self.policy = Model(inputs=policy_input, outputs=X)
        self.policy_opt = Adam(lr)

    def _build_value_model(self, input_dim, hidden_layers, lr):
        v_input = Input(shape=(input_dim,))
        X = v_input
        for size in hidden_layers:
            X = Dense(size, activation="tanh", kernel_initializer="glorot_normal")(X)
        X = Dense(1, activation=None, kernel_initializer="glorot_normal")(X)
        self.v = Model(inputs=v_input, outputs=X)
        self.v_opt = Adam(lr)

    def _gaussian_log_likelihood(self, actions, means, stds, log_stds, eps=1e-8):
        return -0.5 * (tf.reduce_sum(((actions - means) / (stds + eps)) ** 2 + 2 * log_stds + np.log(2 * np.pi), axis=1))

    def update(self, states, actions, rewards_to_go):
        """Does one step of policy gradient update.

        Args:
            states: np.array of sample states. dim = (n_samples, self.input_dim)
            actions: np.array of sample actions. dim = (n_samples,)
            rewards_to_go: np.array of rewards-to-go, used to compute the advantages. dim = (n_samples,)
        """
        # Update the policy
        def policy_loss():
            action_prob = self.policy(states)
            action_mask = utils.to_categorical(actions, num_classes=self.output_dim)
            probs = tf.reduce_sum(action_prob * action_mask, axis=1)
            log_probs = tf.math.log(probs)
            advs = rewards_to_go - self.v(states)
            return -tf.reduce_mean(log_probs * advs)

        self.policy_opt.minimize(policy_loss, lambda: self.policy.trainable_weights)

        # Update the value function
        def v_loss():
            values = self.v(states)
            return tf.reduce_mean(tf.math.squared_difference(values, rewards_to_go))

        for _ in range(self.v_update_steps):
            self.v_opt.minimize(v_loss, lambda: self.v.trainable_weights)

    def sample_action(self, s):
        state = np.expand_dims(s, axis=0)
        action_prob = self.policy.predict(state)[0]
        return np.random.choice(range(self.output_dim), p=action_prob)

    def get_value(self, s):
        state = np.expand_dims(s, axis=0)
        value = self.v.predict(state)[0]
        return value

    def save(self, path, extension="h5"):
        self.policy.save(f"{path}_pi.{extension}")
        self.v.save(f"{path}_v.{extension}")

    def load(self, path, extension="h5"):
        del self.policy
        self.policy = tf.keras.models.load_model(f"{path}_pi.{extension}")
        del self.v
        self.v = tf.keras.models.load_model(f"{path}_v.{extension}")
def ddpg(env_fn,
         ac_kwargs=dict(),
         seed=0,
         num_train_episodes=100,
         test_agent_every=25,
         replay_size=int(1e6),
         gamma=0.99,
         decay=0.95,
         mu_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         action_noise=0.1,
         max_episode_length=1000):
    # randomness
    tf.random.set_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # state and action spaces
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # maximum possible action value for each dimension
    action_max = env.action_space.high[0]

    # replay buffer
    replay_buffer = ReplayBuffer(obs_dim=num_states, act_dim=num_actions, size=replay_size)

    mu, q = create_networks(num_actions, action_max, **ac_kwargs)
    mu_targ, q_targ = create_networks(num_actions, action_max, **ac_kwargs)

    ### NOTE: Copy target weights for init
    # q_targ.set_weights(q.get_weights())
    # mu_targ.set_weights(mu.get_weights())

    s = np.array([0.23164732, 0.97279984, -0.74811356])
    a = np.array([1.849152])
    s2 = [0.21903736, 0.97571647, 0.25885912]
    r = -1.8470242261833913
    d = False

    # s = np.array([-0.6460527, -0.76329281, 0.60891966])
    # a = np.array([1.5983584])
    # s2 = np.array([-0.63545021, -0.77214185, 0.27620381])
    # r = -5.2070620242099315
    # d = False

    np.set_printoptions(threshold=np.inf)

    s = np.expand_dims(s, axis=0)
    a = np.expand_dims(a, axis=0)
    s2 = np.expand_dims(s2, axis=0)

    # Initializes weights
    print("INITIAL")
    input = tf.concat([s, mu(s)], axis=-1)
    print("INITIAL")
    q(input)
    print("INITIAL")
    input = tf.concat([s2, mu_targ(s2)], axis=-1)
    print("INITIAL")
    q_targ(input)

    load_network(q, "q")
    load_network(mu, "mu")

    # mu_value = mu(s)
    # input = tf.concat([s, mu_value], axis=-1)
    # q_mu_value = q(input)
    # input = tf.concat([s, a], axis=-1)
    # q_value = q(input)
    # print("Q___", mu_value, q_value, q_mu_value)
    # print(mu_value, q_value, q_mu_value)

    mu_optimizer = Adam(learning_rate=mu_lr, epsilon=1e-08)
    q_optimizer = Adam(learning_rate=q_lr, epsilon=1e-08)

    def q_loss():
        # Q-loss
        print("Mu targ for Q loss")
        q_targ_input = tf.concat([s2, mu_targ(s2)], axis=-1)
        print("Q targ for Q loss")
        q_target = r + gamma * (1 - d) * q_targ(q_targ_input)
        q_input = tf.concat([s, a], axis=-1)
        print("Q for q loss")
        q_loss = tf.math.reduce_mean((q(q_input) - q_target)**2)
        # q_losses.append(q_loss)
        print("QLOSS", q_loss)
        return q_loss

    def mu_loss():
        # Mu-loss
        print("Mu for loss mu")
        q_input = tf.concat([s, mu(s)], axis=-1)
        print("Q for Mu loss")
        # print("QQQQ", q_input, q(q_input))
        mu_loss = -tf.math.reduce_mean(q(q_input))
        # mu_losses.append(mu_loss)
        print("MULOSS", mu_loss)
        return mu_loss

    print("SETTING WEIGHTS")
    q_targ.set_weights(q.get_weights())
    mu_targ.set_weights(mu.get_weights())

    q_optimizer.minimize(q_loss, var_list=q.trainable_variables)

    q_birn = tf.concat([s, a], axis=-1)
    print("Q-new", q(q_birn))

    mu_optimizer.minimize(mu_loss, var_list=mu.trainable_variables)
class Agent:
    i = 0

    def __init__(self, input_dim, output_dim, hidden_layers=[32, 32], policy_lr=1e-3, v_lr=1e-3, v_update_steps=80):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.v_update_steps = v_update_steps

        self._build_policy_model(input_dim, output_dim, hidden_layers, policy_lr)
        self._build_value_model(input_dim, hidden_layers, v_lr)

    def _build_policy_model(self, input_dim, output_dim, hidden_layers, lr):
        policy_input = Input(shape=(input_dim, ))
        X = policy_input
        for size in hidden_layers:
            X = Dense(size, activation="tanh", kernel_initializer="glorot_normal")(X)
        mu = Dense(output_dim, activation=None, kernel_initializer="zeros", use_bias=False)(X)
        # sigma = Dense(output_dim, activation="softplus", kernel_initializer="glorot_normal", use_bias=False)(X)
        # sigma = Dense(output_dim, activation=None, kernel_initializer="zeros", use_bias=False)(X)
        # self.policy = Model(inputs=policy_input, outputs=[mu, sigma])
        self.policy = Model(inputs=policy_input, outputs=mu)
        self.log_stds = tf.Variable(-0.75 * np.ones((output_dim, )), dtype="float32", name="log_stds", trainable=True)
        self.policy_opt = Adam(lr)

    def _build_value_model(self, input_dim, hidden_layers, lr):
        v_input = Input(shape=(input_dim, ))
        X = v_input
        for size in hidden_layers:
            X = Dense(size, activation="tanh", kernel_initializer="glorot_normal")(X)
        X = Dense(1, activation=None, kernel_initializer="glorot_normal")(X)
        self.v = Model(inputs=v_input, outputs=X)
        self.v_opt = Adam(lr)

    def update(self, state, action, G, I):
        """Does one step of policy gradient update for a single transition.

        Args:
            state: np.array for the current state. dim = (self.input_dim,)
            action: np.array for the action taken. dim = (self.output_dim,)
            G: observed return (reward-to-go) from this state
            I: accumulated discount factor used to scale the policy update
        """
        state = np.array([state], dtype="float32")
        action = np.array([action], dtype="float32")
        G = np.array([G], dtype="float32")

        # Update the policy
        def policy_loss():
            def gaussian_log_likelihood(actions, means, stds, log_stds, eps=1e-8):
                pre_sum = -0.5 * (((actions - means) / (stds + eps))**2 + 2 * log_stds + np.log(2 * np.pi))
                return tf.reduce_sum(pre_sum, axis=1)
                # return -0.5 * (tf.reduce_sum(((actions - means) / (stds + eps)) ** 2 + 2 * log_stds, axis=1) + self.output_dim * np.log(2 * np.pi))

            # mean, std = self.policy(state)
            # log_std = tf.math.log(std + 1e-6)
            # mean, log_std = self.policy(state)
            mean = self.policy(state)
            log_std = self.log_stds
            std = tf.exp(log_std)

            log_prob = gaussian_log_likelihood(action, mean, std, log_std)
            adv = G - self.v(state)
            loss = -I * tf.reduce_mean(log_prob * adv)
            # loss = -tf.reduce_mean(log_prob * adv)
            return loss

        self.policy_opt.minimize(policy_loss, lambda: self.policy.trainable_weights + [self.log_stds])

        # Update the value function
        def v_loss():
            value = self.v(state)
            return tf.reduce_mean((G - value)**2)

        for _ in range(self.v_update_steps):
            self.v_opt.minimize(v_loss, lambda: self.v.trainable_weights)

        if Agent.i % 200 == 0:
            # mean, std = self.policy(state)
            # mean, log_std = self.policy(state)
            # std = tf.exp(log_std)
            # mean = self.policy(state)
            log_std = self.log_stds
            std = tf.exp(log_std)
            # print("States:", state)
            # print("Means:", mean)
            print("Stds:", std)
            # print("Log Stds:", log_stds)
            # print("Log Probs:", log_probs)
            # print("Values:", values)
            # print("Rewards:", rtg)
            # print("Advs:", advs)
            # print("Loss:", loss)
        Agent.i += 1

    def sample_action(self, s):
        """Samples an action from the Gaussian policy."""
        state = np.expand_dims(s, axis=0)
        mean = self.policy.predict(state)
        std = [tf.exp(self.log_stds)]
        # mean, std = self.policy.predict(state)
        # mean, log_std = self.policy.predict(state)
        # std = tf.exp(log_std)
        noise = tf.random.normal((self.output_dim, ))
        sample = mean[0] + std[0] * noise
        return tf.clip_by_value(sample, -1.0, 1.0).numpy()

    def get_value(self, s):
        state = np.expand_dims(s, axis=0)
        value = self.v.predict(state)[0]
        return value

    def save(self, path, extension="h5"):
        self.policy.save(f"{path}_pi.{extension}")
        self.v.save(f"{path}_v.{extension}")
        # np.save(f"{path}_log_stds", self.log_stds.numpy())

    def load(self, path, extension="h5"):
        del self.policy
        self.policy = tf.keras.models.load_model(f"{path}_pi.{extension}")
        del self.v
        self.v = tf.keras.models.load_model(f"{path}_v.{extension}")
class Agent:
    def __init__(self, ft, input_dim, output_dim, hidden_layers=[32, 32], policy_lr=1e-3, v_lr=1e-3, v_update_steps=80):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.ft = ft
        self.v_update_steps = v_update_steps

        self._build_policy_model(input_dim, output_dim, hidden_layers, policy_lr)
        self._build_value_model(input_dim, hidden_layers, v_lr)

    def _build_policy_model(self, input_dim, output_dim, hidden_layers, lr):
        policy_input = Input(shape=(input_dim, ))
        X = policy_input
        for size in hidden_layers:
            X = Dense(size, activation="tanh")(X)
        mu = Dense(output_dim, activation=None)(X)
        # sigma = Dense(output_dim, kernel_initializer="identity", activation=None)(X)
        # self.policy = Model(inputs=policy_input, outputs=[mu, sigma])
        self.policy = Model(inputs=policy_input, outputs=mu)
        self.log_stds = tf.Variable(-np.ones((output_dim, )) / 2.0, dtype="float32", name="log_stds", trainable=True)
        self.policy_opt = Adam(lr)

    def _build_value_model(self, input_dim, hidden_layers, lr):
        v_input = Input(shape=(input_dim, ))
        X = v_input
        for size in hidden_layers:
            X = Dense(size, activation="tanh")(X)
        X = Dense(1)(X)
        self.v = Model(inputs=v_input, outputs=X)
        self.v_opt = Adam(lr)

    def _gaussian_log_likelihood(self, actions, means, stds, log_stds):
        # log N(a; mu, sigma) of a diagonal Gaussian, summed over action dimensions.
        return -0.5 * tf.reduce_sum(
            (actions - means)**2 / (stds**2) + 2 * log_stds + np.log(2 * np.pi),
            axis=1)

    def update(self, states, actions, rewards_to_go):
        """Does one step of policy gradient update.

        Args:
            states: np.array of sample states. dim = (n_samples, self.input_dim)
            actions: np.array of sample actions. dim = (n_samples,)
            rewards_to_go: np.array of rewards-to-go. dim = (n_samples,)
        """
        states = self.ft.transform(states)

        # Update the policy
        def policy_loss():
            # means, log_stds = self.policy(states)
            # stds = tf.exp(log_stds)
            # log_probs = self._gaussian_log_likelihood(actions, means, stds, log_stds)
            # advs = rewards_to_go - self.v(states)
            # return -tf.reduce_mean(log_probs * advs)
            means = self.policy(states)
            stds = tf.exp(self.log_stds)
            log_probs = self._gaussian_log_likelihood(actions, means, stds, self.log_stds)
            advs = rewards_to_go - self.v(states)
            return -tf.reduce_mean(log_probs * advs)

        # self.policy_opt.minimize(policy_loss, lambda: self.policy.trainable_weights)
        self.policy_opt.minimize(policy_loss, lambda: self.policy.trainable_weights + [self.log_stds])

        # Update the value function
        def v_loss():
            values = self.v(states)
            return tf.reduce_mean(tf.math.squared_difference(values, rewards_to_go))

        for _ in range(self.v_update_steps):
            self.v_opt.minimize(v_loss, lambda: self.v.trainable_weights)

    def sample_action(self, s):
        """Samples an action from the Gaussian policy."""
        state = np.expand_dims(s, axis=0)
        state = self.ft.transform(state)
        means = self.policy.predict(state)[0]
        stds = tf.exp(self.log_stds)
        noises = tf.random.normal((self.output_dim, ))
        sample = means + stds * noises
        return tf.clip_by_value(sample, -1, 1).numpy()

    def get_value(self, s):
        state = np.expand_dims(s, axis=0)
        state = self.ft.transform(state)
        value = self.v.predict(state)[0]
        return value

    def save(self, path, extension="h5"):
        self.policy.save(f"{path}_pi.{extension}")
        self.v.save(f"{path}_v.{extension}")
        np.save(f"{path}_log_stds", self.log_stds.numpy())

    def load(self, path, extension="h5"):
        del self.policy
        self.policy = tf.keras.models.load_model(f"{path}_pi.{extension}")
        del self.v
        self.v = tf.keras.models.load_model(f"{path}_v.{extension}")
        self.log_stds.assign(np.load(f"{path}_log_stds.npy"))
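# A quick numerical sanity check of the diagonal-Gaussian log-likelihood used
# by the policy updates above -- not part of the original code. It rebuilds the
# same formula as a standalone function and compares it against scipy.stats
# (scipy is only needed for this check, not by the agents).
import numpy as np
import tensorflow as tf
from scipy.stats import norm


def gaussian_log_likelihood(actions, means, stds, log_stds):
    # log N(a; mu, sigma) summed over action dimensions, one value per sample.
    return -0.5 * tf.reduce_sum(
        ((actions - means) / stds) ** 2 + 2 * log_stds + np.log(2 * np.pi),
        axis=1)


actions = np.array([[0.3, -0.1]], dtype=np.float32)
means = np.array([[0.0, 0.2]], dtype=np.float32)
log_stds = np.array([[-0.5, -0.5]], dtype=np.float32)
stds = np.exp(log_stds)

ours = gaussian_log_likelihood(actions, means, stds, log_stds).numpy()
reference = norm.logpdf(actions, loc=means, scale=stds).sum(axis=1)
print(ours, reference)  # the two values should agree up to float precision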