def _spectral_norm(self, call_method, inputs, **kwargs):
    """Run ``call_method`` with ``self.kernel`` divided by its estimated spectral norm.

    The kernel is reshaped to 2-D, one step of power iteration (starting from
    ``self.u``) produces the vectors ``u_hat`` / ``v_hat``, and
    ``sigma = v_hat @ w @ u_hat^T`` is used to rescale the kernel. The
    rescaled kernel is installed on ``self`` only for the duration of the
    wrapped call.

    Args:
        call_method: Callable invoked as ``call_method(inputs, **kwargs)``.
        inputs: Forwarded to ``call_method`` unchanged.
        **kwargs: Keyword arguments forwarded to ``call_method``.

    Returns:
        Whatever ``call_method`` returns.
    """
    w_shape = K.int_shape(self.kernel)
    w = tf.reshape(self.kernel, [-1, w_shape[-1]])

    u = self.u
    u_hat = u
    v_hat = None
    # Power iteration; a single step is usually enough.
    for _ in range(1):
        v_hat = K.l2_normalize(K.dot(u_hat, tf.transpose(w)))
        u_hat = K.l2_normalize(K.dot(v_hat, w))

    # The iteration vectors are treated as constants by the optimizer.
    u_hat = K.stop_gradient(u_hat)
    v_hat = K.stop_gradient(v_hat)
    sigma = K.dot(K.dot(v_hat, w), K.transpose(u_hat))

    # Tie the update of the persistent vector ``u`` to the normalized kernel.
    with tf.control_dependencies([u.assign(u_hat)]):
        w_norm = w / sigma
        w_norm = K.reshape(w_norm, w_shape)

    original_kernel = self.kernel
    self.kernel = w_norm
    try:
        # Forward keyword arguments as keywords (``*kwargs`` would pass the
        # dict's keys positionally).
        return call_method(inputs, **kwargs)
    finally:
        # Always restore the raw kernel, even if the wrapped call raises.
        self.kernel = original_kernel
def _encoder(x):
    """Encode ``x`` and attend to it with 30 learned query embeddings.

    Two embedding tables (3 entries and 10 entries) are broadcast-added and
    reshaped into 30 query vectors of width ``d_model``. ``x`` is passed
    through ``n_layers`` transformer layers, then the queries attend to the
    result, followed by dropout, batch normalization and (when ``softmax``
    is truthy) a softmax.

    ``d_model``, ``n_heads``, ``dff``, ``rate``, ``n_layers`` and ``softmax``
    are read from the enclosing scope.
    """
    def _index_embedding(size, axis):
        # Embed the constant index range [0, size) and insert a broadcast axis.
        indices = K.stop_gradient(K.expand_dims(K.arange(size), axis=0))
        embedded = tf.keras.layers.Embedding(size, d_model)(indices)
        return K.expand_dims(embedded, axis=axis)

    # Two embeddings (3 for classes, 10 for degrees).
    cls = _index_embedding(3, axis=2)       # (1, 3, 1, d_model)
    direct = _index_embedding(10, axis=1)   # (1, 1, 10, d_model)
    embedding = tf.keras.layers.Reshape((30, d_model))(cls + direct)

    for _ in range(n_layers):
        x = transformer_layer(d_model, n_heads, dff, rate)(x)

    attention = multi_head_attention(d_model, n_heads, perm_and_reshape=False)
    x = attention(embedding, x, x)
    x = tf.keras.layers.Dropout(rate)(x)
    x = tf.keras.layers.BatchNormalization()(x)

    if not softmax:
        return x
    return tf.keras.layers.Softmax()(x)
def style_loss(style_features, combination_features, image_shape,
               style_mask=None, content_mask=None):
    """Return the Gram-matrix style loss between two feature tensors.

    Args:
        style_features: Features of the style image.
        combination_features: Features of the generated image.
        image_shape: Indexable whose first two entries are multiplied to get
            the normalization size.
        style_mask: Optional mask, passed to ``process_mask`` together with
            ``style_features.shape``.
        content_mask: Optional mask, passed to ``process_mask`` together with
            ``combination_features.shape``.

    Returns:
        ``sum((G_style - G_combination)^2) / (4 * 3^2 * size^2)``.
    """
    has_content_mask = content_mask is not None

    if has_content_mask:
        content_weights = K.variable(
            process_mask(content_mask, combination_features.shape))
        combination_features = combination_features * K.stop_gradient(content_weights)
        del content_weights

    if style_mask is not None:
        style_weights = K.variable(
            process_mask(style_mask, style_features.shape))
        style_features = style_features * K.stop_gradient(style_weights)
        # NOTE(review): the style mask is applied to the combination features
        # only when a content mask is ALSO given; confirm this condition is
        # intended (rather than "when no content mask is given").
        if has_content_mask:
            combination_features = combination_features * K.stop_gradient(style_weights)
        del style_weights

    style_gram = gram_matrix(style_features, use_shifted_activations)
    combination_gram = gram_matrix(combination_features, use_shifted_activations)

    pixel_count = image_shape[0] * image_shape[1]
    channel_count = 3
    squared_difference = tf.reduce_sum(tf.square(style_gram - combination_gram))
    return squared_difference / (4.0 * (channel_count**2) * (pixel_count**2))
def _init_models(self):
    """Build and compile ``self.train_model``, the joint policy/value model.

    The model takes ``[S, A, G]`` as inputs and outputs a scalar loss made of
    the policy loss plus ``self.value_loss_weight`` times the average of the
    state-value loss and the two q-function losses. All partial losses are
    registered as metrics on the model.
    """
    # The policy must use the 'sac' update strategy; force it if necessary.
    if self.policy.update_strategy != 'sac':
        self.policy.update_strategy = 'sac'
        self.logger.warn("policy.update_strategy has been set to 'sac'")

    beta = self.policy.entropy_beta

    # Inputs: states and actions come from the policy model, G is new.
    S, A = self.policy.train_model.inputs[:2]
    G = keras.Input(name='G', shape=(1,), dtype='float')

    # log(pi(a_sampled | s)) for an action sampled from the policy
    # (the original author marks the sample as differentiable).
    A_sampled = self.policy.dist.sample()
    log_pi = self.policy.dist.log_proba(A_sampled)

    # Q-values of the sampled action under both q-functions. The original
    # author notes that target (non-trainable) models are used here; that
    # depends on ``_get_q_value`` and cannot be confirmed from this method.
    Q1 = self._get_q_value(self.q_func1, S, A_sampled)
    Q2 = self._get_q_value(self.q_func2, S, A_sampled)
    Q_both = keras.layers.Concatenate()([Q1, Q2])
    check_tensor(Q_both, ndim=2, axis_size=2, axis=1)

    # Entropy-corrected target for the state value function.
    Q_min = keras.layers.Lambda(lambda x: K.min(x, axis=1))(Q_both)
    V_target = K.stop_gradient(Q_min - beta * log_pi)
    check_tensor(V_target, ndim=1)

    # Advantage: mean Q minus entropy term minus the (constant) state value.
    V = self.v_func.predict_model(S)
    check_tensor(V, axis_size=1, axis=1)
    V = K.stop_gradient(K.squeeze(V, axis=1))
    Q = keras.layers.Lambda(lambda x: K.mean(x, axis=1))(Q_both)
    Adv = Q - beta * log_pi - V

    # Individual losses; the advantage comes straight from the graph.
    policy_loss, metrics = self.policy.policy_loss_with_metrics(Adv)
    v_loss = self.v_func.train_model([S, V_target])
    q_loss1 = self.q_func1.train_model([S, A, G])
    q_loss2 = self.q_func2.train_model([S, A, G])
    value_loss = (v_loss + q_loss1 + q_loss2) / 3.

    # Expose every partial loss as a metric.
    loss_metrics = {
        'policy/loss': policy_loss,
        'v_func/loss': v_loss,
        'q_func1/loss': q_loss1,
        'q_func2/loss': q_loss2,
        'value/loss': value_loss,
    }
    metrics.update(loss_metrics)

    # Combined loss; must be a scalar.
    loss = policy_loss + self.value_loss_weight * value_loss
    check_tensor(loss, ndim=0)

    # Joint model whose output is the loss itself.
    self.train_model = keras.Model([S, A, G], loss)
    self.train_model.add_loss(loss)
    for name, metric in metrics.items():
        self.train_model.add_metric(metric, name=name, aggregation='mean')
    self.train_model.compile(optimizer=self.policy.train_model.optimizer)
def call(self, inputs, **kwargs):
    """Crop and resize regions of a feature map for every box.

    Args:
        inputs: Indexable with the boxes at position 0 and the feature map
            at position 1. Gradients are stopped on both. Box columns are
            read as ``x1, y1, x2, y2``.
        **kwargs: Unused.

    Returns:
        The crops produced by ``tf.image.crop_and_resize`` with size
        ``self.crop_size``, mapped over the leading axis. When the boxes
        tensor has rank 4, the two leading axes are merged for processing
        and restored on the output.
    """
    boxes = K.stop_gradient(inputs[0])
    fpn = K.stop_gradient(inputs[1])

    time_distributed = K.ndim(boxes) == 4
    if time_distributed:
        # Merge the two leading axes so map_fn iterates over a single axis.
        boxes_shape = K.shape(boxes)
        fpn_shape = K.shape(fpn)
        merged_boxes_shape = [-1] + [boxes_shape[i] for i in range(2, K.ndim(boxes))]
        merged_fpn_shape = [-1] + [fpn_shape[i] for i in range(2, K.ndim(fpn))]
        boxes = K.reshape(boxes, merged_boxes_shape)
        fpn = K.reshape(fpn, merged_fpn_shape)

    image_shape = K.cast(K.shape(fpn), K.floatx())

    def _roi_align(args):
        # Handles one element of the leading axis: its boxes and feature map.
        sample_boxes, feature_map = args[0], args[1]
        x1, y1, x2, y2 = (sample_boxes[:, column] for column in range(4))

        map_shape = K.cast(K.shape(feature_map), dtype=K.floatx())
        rows, cols = map_shape[0], map_shape[1]

        # Boxes in the normalized (y1, x1, y2, x2) layout passed to
        # crop_and_resize.
        norm_boxes = K.stack([
            (y1 / image_shape[1] * rows) / (rows - 1),
            (x1 / image_shape[2] * cols) / (cols - 1),
            (y2 / image_shape[1] * rows - 1) / (rows - 1),
            (x2 / image_shape[2] * cols - 1) / (cols - 1),
        ], axis=1)

        # Every box refers to the single feature map at batch index 0.
        box_indices = tf.zeros((K.shape(norm_boxes)[0], ), dtype='int32')
        return tf.image.crop_and_resize(
            K.expand_dims(feature_map, axis=0),
            norm_boxes,
            box_indices,
            self.crop_size)

    roi_batch = tf.map_fn(_roi_align,
                          elems=[boxes, fpn],
                          dtype=K.floatx(),
                          parallel_iterations=self.parallel_iterations)

    if time_distributed:
        # Split the merged leading axis back into its two original axes.
        roi_shape = tf.shape(roi_batch)
        trailing = [roi_shape[i] for i in range(1, K.ndim(roi_batch))]
        roi_batch = tf.reshape(roi_batch, [boxes_shape[0], boxes_shape[1]] + trailing)

    return roi_batch
def call(self, inputs):
    """Convolve ``inputs`` with the sign of the kernel scaled by ``self.multiplier``.

    Both steps use the ``x + stop_gradient(f(x) - x)`` form: the forward
    value is ``f(x)`` while the gradient is taken as if the value were ``x``,
    so gradients reach ``self.kernel`` unchanged.
    """
    kernel = self.kernel

    # Forward value: sign(kernel).
    signed_kernel = kernel + K.stop_gradient(K.sign(kernel) - kernel)
    # Forward value: sign(kernel) * multiplier.
    scaled_kernel = signed_kernel + K.stop_gradient(
        signed_kernel * self.multiplier - signed_kernel)

    return K.conv2d(
        inputs,
        scaled_kernel,
        strides=self.strides,
        padding=self.padding,
        data_format=self.data_format,
        dilation_rate=self.dilation_rate,
    )
def round_through(self, x, rounding_method=None):
    '''Element-wise rounding with full gradient propagation.

    The result has the rounded value in the forward pass, while its gradient
    is that of `x` (the rounding residual is wrapped in `stop_gradient`).
    A trick from [Sergey Ioffe](http://stackoverflow.com/a/36480182)

    # Arguments
        x: tensor to round.
        rounding_method: one of 'nearest', 'down', 'stochastic' or 'zero';
            defaults to `self.rounding_method` when None.
            - 'nearest': `tf.math.rint`.
            - 'down': floor.
            - 'stochastic': despite the name no random numbers are drawn;
              the whole tensor is rounded up if the mean fractional part
              exceeds 0.5 and rounded down otherwise.
            - 'zero': rounds towards zero.

    # Returns
        Tensor with the rounded values of `x`.

    # Raises
        ValueError: if `rounding_method` is not one of the supported names.
    '''
    if rounding_method is None:
        rounding_method = self.rounding_method

    if rounding_method == 'nearest':
        rounded = tf.math.rint(x)
    elif rounding_method == 'down':
        rounded = tf.floor(x)
    elif rounding_method == 'stochastic':
        mean_fraction = tf.reduce_mean(x - tf.floor(x))
        # tf.math.ceil is available in both TF1 and TF2 (tf.ceil is TF1-only).
        rounded = tf.cond(tf.greater(mean_fraction, 0.5),
                          lambda: tf.math.ceil(x),
                          lambda: tf.floor(x))
    elif rounding_method == 'zero':
        # -1 where x < 0, +1 elsewhere: floor(|x|) with the sign restored.
        neg_alter = tf.add(tf.multiply(tf.cast(tf.less(x, 0), 'float32'), -2.0), 1.0)
        rounded = tf.multiply(tf.floor(tf.multiply(x, neg_alter)), neg_alter)
    else:
        # Fail with a clear error instead of printing and then crashing on
        # the unbound `rounded` variable.
        raise ValueError(
            'Wrong Rounding Type {!r}: choose between '
            '\'nearest\', \'down\', \'zero\', \'stochastic\''.format(rounding_method))

    return x + K.stop_gradient(rounded - x)
def loss_uncertainty_gaussian_likelihood(y_true, y_pred):
    """
    Loss function that calculates something similar to a Gaussian Likelihood.

    Requires that y_pred contains only one predicted value (label).
    Column 0 of y_pred is the predicted label and column 1 the predicted std
    for that label; column 0 of y_true is the true label.

    L = ln(std ** 2 + eps) + (y_label_pred - y_label_true) ** 2 / (std ** 2 + eps)

    Returns
    -------
    loss : Gaussian Likelihood loss, one value per sample

    """
    # The gradient must not flow back over the label network, so the
    # predicted label is treated as a constant here.
    label_pred = K.stop_gradient(y_pred[:, 0])
    std_pred = y_pred[:, 1]
    label_true = y_true[:, 0]

    # eps = 1e-3 on the variance is equal to a lower std limit of 3.16e-2.
    eps = tf.constant(1e-3, dtype="float32")
    variance = K.pow(std_pred, 2) + eps
    squared_error = K.pow(label_pred - label_true, 2)

    return K.log(variance) + squared_error / variance
def get_updates(self, loss, params):
    """Build the list of update ops for one optimizer step.

    Computes first and second moment estimates per parameter, divides the
    bias-corrected step size by the second-moment denominator, and bounds
    the resulting per-element rate between ``lower_bound`` and
    ``upper_bound`` (both functions of ``self.gamma`` and the step count)
    before applying it to the first moment.

    Args:
        loss: Loss tensor to differentiate.
        params: Variables to update.

    Returns:
        ``self.updates``, the list of update ops.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        elapsed = K.cast(self.iterations, K.dtype(self.decay))
        lr = lr * (1. / (1. + self.decay * elapsed))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Bias-corrected step size and the bounds on the actual learning rate.
    bias_correction = K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))
    step_size = lr * bias_correction
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    def _zeros_like_params():
        # One zero-initialized slot per parameter, same shape and dtype.
        return [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

    ms = _zeros_like_params()
    vs = _zeros_like_params()
    if self.amsbound:
        vhats = _zeros_like_params()
    else:
        # Placeholders only; not updated when amsbound is off.
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Weight decay is folded into the gradient before the moments.
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            self.updates.append(K.update(vhat, vhat_t))
            denom = K.sqrt(vhat_t) + self.epsilon
        else:
            denom = K.sqrt(v_t) + self.epsilon

        # Per-element rate, bounded from below and above.
        rate = (step_size * K.ones_like(denom)) / denom
        bounded_rate = K.minimum(K.maximum(rate, lower_bound), upper_bound)
        new_p = p - m_t * bounded_rate

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        constraint = getattr(p, 'constraint', None)
        if constraint is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    return self.updates
def _train(self, map_input, reward, policy_mask, policy_one_hot):
    """Run one gradient step of the policy/value model.

    Args:
        map_input: Model input.
        reward: Target for the value head; also used to form the advantage.
        policy_mask: Converted to float32 but otherwise unused (the original
            author left it as an open question whether to apply it).
        policy_one_hot: One-hot encoding of the chosen action, summed with
            the policy over axes 1-3.

    Returns:
        ``[value_loss, policy_loss, entropy]`` as Python floats, where the
        policy entry is the mean absolute policy loss.
    """
    policy_mask = policy_mask.astype('float32')
    spatial_axes = [1, 2, 3]

    with tf.GradientTape() as tape:
        logits, value = self.model(map_input)
        value = K.squeeze(value, axis=1)

        # NOTE(review): the normalizer sums over ALL axes, including the
        # first one - confirm that is intended if that axis is a batch of
        # more than one sample.
        policy = K.exp(logits) / (K.sum(K.exp(logits)))

        value_loss = .5 * K.square(reward - value)
        entropy = -K.sum(policy * K.log(policy + 1e-10), axis=spatial_axes)
        chosen_prob = K.sum(policy * policy_one_hot, axis=spatial_axes)
        log_prob = K.log(chosen_prob + 1e-10)

        # The advantage is a constant with respect to the value head.
        advantage = reward - K.stop_gradient(value)
        policy_loss = -log_prob * advantage - entropy * ENTROPY_RATE
        total_loss = policy_loss + value_loss

        mean_entropy = K.mean(entropy)
        mean_abs_policy_loss = K.mean(K.abs(policy_loss))
        mean_value_loss = K.mean(value_loss)

    variables = self.model.trainable_variables
    gradients = tape.gradient(total_loss, variables)
    gradients, _ = tf.clip_by_global_norm(gradients, GRADIENT_CLIP_MAX)
    self.opt.apply_gradients(zip(gradients, self.model.trainable_variables))

    return [float(mean_value_loss), float(mean_abs_policy_loss), float(mean_entropy)]
def __call__(self, layer):
    """Attach label outputs and label-error outputs to ``layer``.

    For every name in ``self.output_names`` a ``Dense(1)`` label output is
    created on a 128-32 dense stack. A second 128-64-32 dense stack, fed with
    a gradient-stopped copy of the same input, predicts one error value per
    label; each error is concatenated with its label in a layer named
    ``<name>_err``.

    Returns:
        List with all label outputs first, followed by all merged
        ``[label, error]`` outputs, both in ``self.output_names`` order.
    """
    flatten = layers.Flatten()(layer) if self.flatten else layer

    # Network for the labels.
    x = DenseBlock(units=128, **self.kwargs)(flatten)
    x = DenseBlock(units=32, **self.kwargs)(x)
    label_outputs = [
        layers.Dense(units=1, name=name)(x) for name in self.output_names
    ]

    # Network for the errors of the labels; it must not send gradients back
    # into whatever produced ``flatten``.
    x_err = layers.Lambda(lambda a: K.stop_gradient(a))(flatten)
    for units in (128, 64, 32):
        x_err = DenseBlock(units=units, **self.kwargs)(x_err)

    merged_outputs = []
    for label_output, name in zip(label_outputs, self.output_names):
        label_error = layers.Dense(
            units=1, activation='linear', name=name + '_err_temp')(x_err)
        # Predicted label gets concatenated with its error (needed for the
        # loss function).
        merged = layers.Concatenate(name=name + '_err')([label_output, label_error])
        merged_outputs.append(merged)

    return label_outputs + merged_outputs
def cnn(config):
    """Build the convolutional classifier described by ``config``.

    Keys read from ``config``:
        batch_norm: selects ``conv2d_bn`` over ``conv2d``.
        ds_type: ``'samples'`` prepends the ``genc`` front end to a
            16000-sample input; anything else takes ``(None, 40, 1)`` input
            directly.
        load_genc: load ``genc.h5`` from the current working directory
            (only read for ``'samples'``).
        train_genc: when falsy, gradients into the front end are stopped
            (only read for ``'samples'``).

    Returns:
        ``keras.Model`` ending in global average pooling over 30 channels.
    """
    conv = conv2d_bn if config['batch_norm'] else conv2d

    if config['ds_type'] == 'samples':
        x = keras.Input(shape=(16000, ), name='input')
        genc = genc_model(dim_z=40)
        features = genc(x)
        if config['load_genc']:
            genc.load_weights(str(Path.cwd() / 'genc.h5'))
        if not config['train_genc']:
            features = backend.stop_gradient(features)
        features = keras.layers.Reshape((-1, 40, 1))(features)
        features = keras.layers.Cropping2D(((5, 0), (0, 0)))(features)
        y = conv(features, 16)
    else:
        x = keras.Input(shape=(None, 40, 1), name='input')
        y = conv(x, 16)

    y = conv(y, 16)
    y = pool(y)
    for filters in (32, 32):
        y = conv(y, filters)
    y = pool(y)
    for filters in (64, 30):
        y = conv(y, filters)
    y = keras.layers.GlobalAveragePooling2D()(y)

    return keras.Model(x, y)
def optimizer(self):
    """
    Build the Keras function that performs one actor-critic training step.

    Actor loss: negative mean of log-probability times advantage, plus
    1e-3 times the mean of sum(p * log p) (an entropy term to encourage
    exploration, cf. https://arxiv.org/abs/1602.01783).
    Critic loss: mean squared error between the discounted reward and the
    critic output.

    Returns a function taking
    [model input, action, advantages, discounted_reward] and returning
    [actor_loss, critic_loss] while applying the optimizer updates.
    """
    model = self.actor_critic
    actor, critic = model(model.input)

    action = K.placeholder(shape=(None, self.out_dim))
    advantages = K.placeholder(shape=(None, ))

    # Probability of the taken action (``action`` weights the actor output).
    action_prob = K.sum(action * actor, axis=1)
    # Advantages are treated as constants.
    eligibility = K.log(action_prob + 1e-10) * K.stop_gradient(advantages)

    # sum(p * log p) per sample, i.e. the negative entropy, averaged.
    neg_entropy = K.mean(K.sum(actor * K.log(actor + 1e-10), axis=1))
    actor_loss = 1.0e-3 * neg_entropy - K.mean(eligibility)

    discounted_reward = K.placeholder(shape=(None, 1))
    critic_loss = K.mean(K.square(discounted_reward - critic))

    # Both losses are handed to the optimizer as a list rather than as a
    # single weighted sum.
    losses = [actor_loss, critic_loss]
    updates = self.adam_optimizer.get_updates(
        loss=losses, params=self.actor_critic.trainable_weights)

    return K.function(
        inputs=[self.actor_critic.input, action, advantages, discounted_reward],
        outputs=[actor_loss, critic_loss],
        updates=updates)
def call(self, inputs, mask=None, **kwargs):
    """Return softmax scores of the features against an embedding matrix.

    Args:
        inputs: Pair ``(features, embeddings)``.
        mask: Unused.
        **kwargs: Unused.

    Returns:
        ``softmax(features . embeddings^T [+ bias])``. Gradients into
        ``embeddings`` are blocked when ``self.stop_gradient`` is set.
    """
    features, embeddings = inputs

    if self.stop_gradient:
        embeddings = K.stop_gradient(embeddings)

    scores = K.dot(features, K.transpose(embeddings))
    if self.use_bias:
        scores = K.bias_add(scores, self.bias)

    return keras.activations.softmax(scores)
def __init__(self, G, base_loss=keras.losses.mse):
    """Store a gradient-stopped, rank-1 version of ``G`` and the base loss.

    Args:
        G: Tensor of rank 1, or of rank 2 with a second axis of size 1
            (which is squeezed away).
        base_loss: Loss callable kept as ``self.base_loss``.
    """
    self.base_loss = base_loss

    if K.ndim(G) == 2:
        # Accept a column tensor, but only if its second axis has size 1.
        shape = K.int_shape(G)
        assert shape[1] == 1, f"bad shape: {shape}"
        G = K.squeeze(G, axis=1)

    assert K.ndim(G) == 1, "bad shape"
    self.G = K.stop_gradient(G)
def call(self, inputs, training=None):
    """Normalize ``inputs`` with minibatch statistics plus clipped corrections.

    The output is ``gamma * (((x - mean) / sigma) * r + d) + beta`` where
    ``r`` and ``d`` are gradient-stopped, clipped corrections towards the
    moving statistics. Moving averages and the ``rmax`` / ``dmax`` schedules
    are registered as layer updates.

    Args:
        inputs: A single tensor (lists are rejected).
        training: Forwarded to ``K.in_train_phase``.

    Returns:
        The normalized, scaled and shifted tensor.
    """
    x = inputs
    assert not isinstance(x, list)

    # Minibatch statistics.
    batch_mean, batch_var = self._moments(x)
    batch_sigma = K.sqrt(batch_var + self.epsilon)

    # In the training phase the scheduled limits are used. Outside of it the
    # limits are made very large, so r and d are effectively unclipped and
    # the expression below reduces to (x - moving_mean) / moving_sigma.
    rmax = K.in_train_phase(self.rmax, K.constant(1e5), training)
    dmax = K.in_train_phase(self.dmax, K.constant(1e5), training)

    # Corrections; treated as constants during backpropagation.
    scale_ratio = self._clip(batch_sigma / self.moving_sigma, 1. / rmax, rmax)
    r = K.stop_gradient(scale_ratio)
    mean_shift = self._clip(
        (batch_mean - self.moving_mean) / self.moving_sigma, -dmax, dmax)
    d = K.stop_gradient(mean_shift)

    # Normalize, correct, then rescale.
    xnorm = ((x - batch_mean) / batch_sigma) * r + d
    y = self.gamma * xnorm + self.beta

    # Moving-average updates (conditional on the input).
    moving_updates = [
        K.moving_average_update(self.moving_mean, batch_mean, self.momentum),
        K.moving_average_update(self.moving_sigma, batch_sigma, self.momentum),
    ]
    self.add_update(moving_updates, x)

    # Step counter and the rmax / dmax schedules: each limit moves linearly
    # from its *_0 value to its *_inf value over *_dur steps.
    rmax_prog = K.minimum(1., self.steps / self.rmax_dur)
    dmax_prog = K.minimum(1., self.steps / self.dmax_dur)
    schedule_updates = [
        K.update_add(self.steps, 1),
        K.update(self.rmax,
                 self.rmax_0 + rmax_prog * (self.rmax_inf - self.rmax_0)),
        K.update(self.dmax,
                 self.dmax_0 + dmax_prog * (self.dmax_inf - self.dmax_0)),
    ]
    self.add_update(schedule_updates)

    # The output depends on the learning phase exactly when rmax does.
    y._uses_learning_phase = rmax._uses_learning_phase

    return y
def ternarize(W, H=1):
    '''Ternarize the weights `W` while keeping the gradient of `W`.

    The forward value is `_ternarize(W, H)`; the difference to `W` is
    wrapped in `stop_gradient`, so the gradient flows to `W` unchanged.

    # References:
    - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902)
    - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711)
    '''
    ternary_weights = _ternarize(W, H)
    residual = K.stop_gradient(ternary_weights - W)
    return W + residual
def get_updates(self, loss, params):
    """Build the list of update ops for one optimizer step.

    Computes first and second moment estimates per parameter and clips the
    per-element rate ``lr_t / denom`` between ``lower_bound`` and
    ``upper_bound`` before applying it to the first moment.

    Args:
        loss: Loss tensor to differentiate.
        params: Variables to update.

    Returns:
        ``self.updates``, the list of update ops.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    t = math_ops.cast(self.iterations, K.floatx()) + 1

    lr_t = self.lr
    if self.initial_decay > 0:
        lr_t = lr_t * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
    # Bias correction folded into the learning rate.
    lr_t = lr_t * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                   (1. - K.pow(self.beta_1, t)))

    # Bounds on the per-element rate as functions of the step count.
    final_lr = self.final_lr * lr_t / self.base_lr
    lower_bound = final_lr * (1 - 1 / ((1 - self.beta_2) * (t + 1)))
    upper_bound = final_lr * (1 + 1 / ((1 - self.beta_2) * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        # Placeholders only; not updated when amsbound is off.
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Weight decay must be folded into the gradient BEFORE the moment
        # estimates are computed; applied afterwards it has no effect
        # because ``g`` is not read again.
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = K.sqrt(vhat_t) + self.epsilon
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            denom = K.sqrt(v_t) + self.epsilon

        eta_hat = tf.clip_by_value(lr_t / denom, lower_bound, upper_bound)
        p_t = p - m_t * eta_hat

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))

        new_p = p_t
        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))

    return self.updates
def _decode_layer(self, input, residual, layer_idx):
    """Apply decoder stage ``layer_idx``: upsample, merge the skip connection, convolve.

    ``self.layers[layer_idx]`` is indexed as
    ``[upsample_layer, concat_layer, iterable_of_conv_layers]``.
    The stage's level is ``self.n_up - 1 - layer_idx``; levels listed in
    ``self.omit_skip_connection_levels`` skip the merge, and levels listed
    in ``self.stop_grad_skip_connection_levels`` block gradients through
    the skip connection.

    Args:
        input: Tensor fed to the upsampling layer.
        residual: Skip-connection tensor, or a list of them.
        layer_idx: Index of the decoder stage.

    Returns:
        Output of the last convolution layer of the stage.
    """
    level = self.n_up - 1 - layer_idx
    stage = self.layers[layer_idx]

    upsample = stage[0](input)
    stop_grad = level in self.stop_grad_skip_connection_levels

    if level in self.omit_skip_connection_levels:
        merged = upsample
    elif isinstance(residual, list):
        skips = [stop_gradient(r) for r in residual] if stop_grad else residual
        merged = stage[1](flatten_list(skips + [upsample]))
    else:
        skip = stop_gradient(residual) if stop_grad else residual
        merged = stage[1]([skip, upsample])

    out = merged
    for conv_layer in stage[2]:
        out = conv_layer(out)
    return out
def style_loss(style, combination, mask_path=None, nb_channels=None):
    """Return the masked, normalized squared difference of two rank-3 tensors.

    Reads ``content_mask_path``, ``img_width`` and ``img_height`` from the
    enclosing/module scope.

    Args:
        style: Rank-3 tensor for the style image.
        combination: Rank-3 tensor for the generated image.
        mask_path: Optional path of the style mask. It is applied to
            ``style``, and also to ``combination`` when no content mask
            path is configured.
        nb_channels: Forwarded to ``load_mask``.

    Returns:
        ``sum((style - combination)^2) / (4 * 3^2 * (img_width * img_height)^2)``.
    """
    assert K.ndim(style) == 3
    assert K.ndim(combination) == 3

    has_content_mask = content_mask_path is not None

    if has_content_mask:
        content_mask = K.variable(load_mask(content_mask_path, nb_channels))
        combination = combination * K.stop_gradient(content_mask)
        del content_mask

    if mask_path is not None:
        style_mask = K.variable(load_mask(mask_path, nb_channels))
        style = style * K.stop_gradient(style_mask)
        if not has_content_mask:
            combination = combination * K.stop_gradient(style_mask)
        del style_mask

    # NOTE(review): the tensors are compared directly here, without a Gram
    # matrix step - confirm the callers already pass Gram-like inputs.
    channels = 3
    size = img_width * img_height
    squared_difference = K.sum(K.square(style - combination))
    return squared_difference / (4. * (channels ** 2) * (size ** 2))
def dice_loss(y_true, y_pred):
    """Return the negative mean over classes of ``2 * p * r / (p + r)``.

    For every class column ``i`` (``class_count`` and ``smooth`` come from
    the enclosing scope), a prediction counts as positive where the class
    score equals the row-wise maximum of ``y_pred``. ``p`` divides the
    smoothed hit count by the smoothed number of positives, ``r`` divides it
    by the smoothed sum of the true column. The hard counts are written as
    ``stop_gradient(hard - soft) + soft`` so that the forward value is the
    hard count while gradients flow through the soft prediction.
    """
    row_max = K.max(y_pred, axis=-1, keepdims=False)

    per_class_scores = []
    for class_idx in range(class_count):
        true_col = y_true[:, class_idx]
        pred_col = y_pred[:, class_idx]

        # 1.0 where this class attains the row maximum, 0.0 elsewhere.
        is_top = K.cast(K.equal(pred_col, row_max), dtype="float32")
        hits = true_col * is_top

        hit_count = K.sum(K.stop_gradient(hits - pred_col) + pred_col) + smooth
        positive_count = K.sum(K.stop_gradient(is_top - pred_col) + pred_col) + smooth

        p = hit_count / positive_count
        r = hit_count / (K.sum(true_col) + smooth)
        per_class_scores.append(2 * p * r / (p + r))

    return -K.mean(K.stack(per_class_scores))
def __init__(self, G, base_loss=keras.losses.Huber()):
    """Store a gradient-stopped, rank-1 version of ``G`` and the base loss.

    Args:
        G: Tensor of rank 1, or of rank 2 with a second axis of size 1
            (which is squeezed away). Validated with ``check_tensor``.
        base_loss: Loss object kept as ``self.base_loss``.
    """
    self.base_loss = base_loss

    check_tensor(G)
    is_column = K.ndim(G) == 2
    if is_column:
        # A rank-2 input is only accepted with a second axis of size 1.
        check_tensor(G, axis_size=1, axis=1)
        G = K.squeeze(G, axis=1)
    check_tensor(G, ndim=1)

    self.G = K.stop_gradient(G)
def actor_loss(y_true, y_pred):
    """Clipped-ratio surrogate loss for the actor (proximal policy optimization).

    ``DELTA``, ``advantages`` and ``self.epsilon`` come from the enclosing
    scope. The "old" log-likelihood is the gradient-stopped value of the
    current one, so the ratio evaluates to 1 in the forward pass and only
    its gradient carries information.

    Args:
        y_true: Weights over actions, multiplied with the log-probabilities.
        y_pred: Predicted action probabilities; clipped to
            ``[DELTA, 1 - DELTA]`` before the logarithm.

    Returns:
        Negative mean of ``min(ratio * adv, clip(ratio) * adv)``.
    """
    probs = K.clip(y_pred, DELTA, 1 - DELTA)

    log_lik = K.sum(y_true * K.log(probs), axis=-1)
    old_log_lik = K.stop_gradient(log_lik)

    ratio = K.exp(log_lik - old_log_lik)
    clipped_ratio = K.clip(ratio, 1 - self.epsilon, 1 + self.epsilon)

    unclipped_objective = ratio * advantages
    clipped_objective = clipped_ratio * advantages
    return -K.mean(K.minimum(unclipped_objective, clipped_objective))
def loss_uncertainty_gaussian_likelihood_dir(y_true, y_pred):
    """
    Loss function that calculates something similar to a Gaussian Likelihood
    for predicted directions.

    Requires that y_pred contains three predicted values (labels):
    dir_x, dir_y, dir_z in columns 0-2, followed by their predicted stds in
    columns 3-5. Columns 0-2 of y_true hold the true labels.

    Per component:
    L = ln(std ** 2 + eps) + (y_label_pred - y_label_true) ** 2 / (std ** 2 + eps)

    Returns
    -------
    loss : Gaussian Likelihood loss for the directional error, i.e. the sum
        of the x, y and z component losses

    """
    # eps = 1e-6 on the variance is equal to a lower std limit of 1e-3.
    eps = tf.constant(1e-6, dtype="float32")

    def _component_loss(axis):
        # The gradient must not flow back over the label network, so the
        # predicted label is treated as a constant.
        label_pred = K.stop_gradient(y_pred[:, axis])
        std_pred = y_pred[:, axis + 3]
        label_true = y_true[:, axis]
        variance = K.pow(std_pred, 2) + eps
        return K.log(variance) + K.pow(label_pred - label_true, 2) / variance

    loss_dir_x = _component_loss(0)
    loss_dir_y = _component_loss(1)
    loss_dir_z = _component_loss(2)

    return loss_dir_x + loss_dir_y + loss_dir_z
def clip_through(self, X, min_val=None, max_val=None):
    '''Element-wise clipping with gradient propagation.

    Analogue to round_through: the forward value is `X` clipped to
    `[min_val, max_val]`, while the gradient is that of `X`.

    # Arguments
        X: tensor to clip.
        min_val: lower limit; defaults to `self.min_value` when None.
        max_val: upper limit; defaults to `self.max_value` when None.
    '''
    lower = self.min_value if min_val is None else min_val
    upper = self.max_value if max_val is None else max_val

    correction = K.stop_gradient(K.clip(X, lower, upper) - X)
    return X + correction
def Av_CNN_GCN_trans_model(patch_sz,
                           number_class,
                           number_neighbors=2,
                           droupout_rate=0.5,
                           kernel_reg=None):
    """Build the CNN+GCN model on top of a pretrained CNN feature extractor.

    A trained ``Av_CNN3D_model`` is loaded from disk and the weights of its
    layer 1 are copied into a fresh ``phi_fun`` feature extractor. That
    extractor is applied to the centre patch and to each neighbour patch;
    gradients through the concatenated neighbour features are stopped
    before they enter the ``gcn_layer``.

    Args:
        patch_sz: Indexable with the three spatial patch sizes.
        number_class: Forwarded to ``Av_CNN3D_model``.
        number_neighbors: Number of neighbour patches per sample.
        droupout_rate: Dropout rate for ``phi_fun``. The pretrained CNN is
            always built with 0.5.
        kernel_reg: Kernel regularizer for ``phi_fun``. The pretrained CNN
            is always built with ``None``.

    Returns:
        ``keras.Model`` named ``'AV_GCN'`` with inputs
        ``[X_batch, NX_batch]`` and a softmax output.

    Raises:
        SystemExit: If the trained CNN weights file does not exist.
    """
    CNNmodel = Av_CNN3D_model(patch_sz,
                              number_class,
                              droupout_rate=0.5,
                              kernel_reg=None)
    modeldir = './path-to-trained-CNNmodel/models.h5'
    if not os.path.isfile(modeldir):
        # sys.exit raises SystemExit, so nothing after it would run.
        sys.exit(
            "Error! Please provide a trained model in Av_CNN_GCN_trans_model!")
    CNNmodel.load_weights(modeldir)

    X_batch = Input(shape=(patch_sz[0], patch_sz[1], patch_sz[2], 1),
                    name="X_batch")
    NX_batch = Input(shape=(number_neighbors, patch_sz[0], patch_sz[1],
                            patch_sz[2]),
                     name="NX_batch")

    reshape = Reshape((patch_sz[0], patch_sz[1], patch_sz[2], 1))
    Phi_fun = phi_fun(patch_sz=patch_sz,
                      kernel_reg=kernel_reg,
                      droupout_rate=droupout_rate)
    Phi_fun.set_weights(CNNmodel.layers[1].get_weights())

    X = Phi_fun(X_batch)

    # Axis 1 of NX_batch has exactly ``number_neighbors`` entries by
    # construction, so iterate over that instead of the TF1-only
    # ``shape[1].value``.
    NX = []
    for i in range(number_neighbors):
        NX_batch_i = slicelayer(index=i)(NX_batch)
        tmp = Phi_fun(reshape(NX_batch_i))
        NX.append(tmp)
    # Per the original author: each entry is [b, F]; NX is a list of n such.
    NX = concatenate(NX, axis=1)
    NX = Lambda(lambda t: stop_gradient(t))(NX)

    x = gcn_layer(X,
                  NX,
                  Num_Gaussian=1,
                  n_hidden_feat=1,
                  OFeat_len=2,
                  lamda=1.0)
    xout = Activation('softmax')(x)

    model = keras.Model(inputs=[X_batch, NX_batch], outputs=xout, name='AV_GCN')
    return model
def call(self, x):
    """Rescale the first column of the second input by a clipped ratio.

    Args:
        x: Pair ``(s, x1)``; column 0 of ``x1`` is ``a`` and column 1 is
            ``s_hat``.

    Returns:
        ``a * clip(s_hat / s, self.min_decrease, self.max_increase)`` with
        gradients stopped.
    """
    s, packed = x
    a = packed[:, :1]
    s_hat = packed[:, 1:2]

    # Rescale the weights, making sure we mostly scale down.
    ratio = K.clip(s_hat / s, self.min_decrease, self.max_increase)
    a_hat = a * ratio

    # Extra scale factor kept at 1 (placeholder for making the reported
    # loss comparable to the other ones).
    t = 1
    stopped = K.stop_gradient([a_hat * t])
    return stopped[0]
def call(self, inputs):
    """Convolve ``inputs`` with a binarized, scaled version of the kernel.

    Binary layer as in https://arxiv.org/abs/1802.08530
    (M. D. McDonnell, "Training wide residual networks for deployment using
    a single bit for each weight", ICLR 2018).

    The full-precision weights are replaced by ``sign(kernel) * multiplier``
    for forward and backward propagation, while the
    ``w + stop_gradient(f(w) - w)`` form lets the gradients update the
    full-precision weights, which are used only during training.
    """
    full_precision = self.kernel

    sign_residual = K.stop_gradient(K.sign(full_precision) - full_precision)
    binary_kernel = full_precision + sign_residual

    scale_residual = K.stop_gradient(
        binary_kernel * self.multiplier - binary_kernel)
    binary_kernel = binary_kernel + scale_residual

    conv_options = dict(
        strides=self.strides,
        padding=self.padding,
        data_format=self.data_format,
        dilation_rate=self.dilation_rate,
    )
    return K.conv2d(inputs, binary_kernel, **conv_options)
def Lossnet(inputs_lossnet, embedding_size):
    """LossNet network.

    Each feature map gets its own embedding head (global average pooling,
    dense, ReLU). Every head is applied twice: to the feature map itself
    ("whole") and to a gradient-stopped copy ("split"), so that the split
    path cannot send gradients back to the network producing the features.
    The concatenated embeddings are mapped to a single predicted loss by a
    shared dense layer.

    Args:
        inputs_lossnet: Sequence whose first element is the class prediction
            and whose remaining elements are the feature maps.
        embedding_size: Number of units of every embedding head.

    Returns:
        ``[concat_w, concat_s, embedding_whole, embedding_split]`` where the
        ``concat_*`` tensors are the class prediction concatenated with the
        predicted loss of the whole / split path.
    """
    def _embedding_head():
        return Sequential([
            layers.GlobalAveragePooling2D(),
            layers.Dense(embedding_size),
            layers.Activation("relu"),
        ])

    c_pred, features_w = inputs_lossnet[0], inputs_lossnet[1:]

    # Gradient-stopped copies of the features (s = split, w = whole).
    features_s = [
        layers.Lambda(lambda x: backend.stop_gradient(x))(feature)
        for feature in features_w
    ]

    # One embedding head per feature map.
    embedding_heads = [_embedding_head() for _ in features_w]

    # Layers shared by the whole and the split path.
    dense_fn = layers.Dense(1, name="L_pred")
    concat_same = layers.Concatenate(name="Embedding")

    embeddings_whole = []
    embeddings_split = []
    for head, feature_w, feature_s in zip(embedding_heads, features_w, features_s):
        embeddings_split.append(head(feature_s))
        embeddings_whole.append(head(feature_w))

    embedding_whole = concat_same(embeddings_whole)
    embedding_split = concat_same(embeddings_split)

    l_pred_w = dense_fn(embedding_whole)
    l_pred_s = dense_fn(embedding_split)

    # Concatenate the class prediction with the predicted loss so that both
    # are available when the loss is computed.
    concat_w = layers.Concatenate(axis=-1, name='l_pred_w')([c_pred, l_pred_w])
    concat_s = layers.Concatenate(axis=-1, name='l_pred_s')([c_pred, l_pred_s])

    return [concat_w, concat_s, embedding_whole, embedding_split]
def _train(self, screens_input, action_input, select_input, reward, action,
           screen_action, screen_used):
    """Run one gradient step of the spatial / non-spatial policy and value model.

    Args:
        screens_input, action_input, select_input: Model inputs.
        reward: Target for the value head; also used to form the advantage.
        action: Index of the non-spatial action (one-hot over
            ``ACTION_OPTIONS``).
        screen_action: Index of the screen position (one-hot over
            ``SCREEN_SIZE * SCREEN_SIZE``).
        screen_used: Multiplier applied to the spatial log-probability, used
            to mask it out when the action taken did not use the screen.

    Returns:
        ``[value_loss, policy_loss, entropy, global_norm]``; the first three
        are Python floats (the policy entry is the mean absolute policy
        loss), the last is the unclipped global gradient norm as returned by
        ``tf.linalg.global_norm``.
    """
    with tf.GradientTape() as tape:
        spatial_policy, ns_policy, value = self.model(
            [screens_input, action_input, select_input])
        value = K.squeeze(value, axis=1)

        ns_action_one_hot = K.one_hot(action, len(ACTION_OPTIONS))
        screen_action_one_hot = K.one_hot(screen_action,
                                          SCREEN_SIZE * SCREEN_SIZE)

        value_loss = .5 * K.square(reward - value)

        ns_entropy = -K.sum(ns_policy * K.log(ns_policy + 1e-10), axis=1)
        spatial_neg_entropy = K.sum(
            spatial_policy * K.log(spatial_policy + 1e-10), axis=1)
        entropy = ns_entropy - spatial_neg_entropy

        ns_log_prob = K.log(
            K.sum(ns_policy * ns_action_one_hot, axis=1) + 1e-10)
        spatial_log_prob = K.log(
            K.sum(spatial_policy * screen_action_one_hot, axis=1) + 1e-10)

        # The advantage is a constant with respect to the value head.
        advantage = reward - K.stop_gradient(value)

        # Mask out spatial_log_prob when the action taken did not use the
        # screen.
        joint_log_prob = ns_log_prob + spatial_log_prob * screen_used
        policy_loss = -joint_log_prob * advantage - entropy * ENTROPY_RATE

        total_loss = policy_loss + value_loss

        mean_entropy = K.mean(entropy)
        mean_abs_policy_loss = K.mean(K.abs(policy_loss))
        mean_value_loss = K.mean(value_loss)

    gradients = tape.gradient(total_loss, self.model.trainable_variables)

    global_norm = tf.linalg.global_norm(gradients)
    print(global_norm)

    # Clip by global norm to limit the size of a single update.
    gradients, _ = tf.clip_by_global_norm(gradients, GRADIENT_CLIP_MAX)
    self.opt.apply_gradients(zip(gradients, self.model.trainable_variables))

    return [
        float(mean_value_loss),
        float(mean_abs_policy_loss),
        float(mean_entropy),
        global_norm,
    ]