def correct_boxes(box_xy, box_wh, input_shape, image_shape):
    """Convert centre/size boxes in network-input space to image corners.

    The x/y (and w/h) components are flipped to y/x (h/w) order, the
    offset and scale between the network input and the resized image are
    removed, and the result is multiplied by ``image_shape``.

    NOTE(review): the offset/scale arithmetic looks like it assumes the
    image was resized with its aspect ratio kept and centred inside
    ``input_shape`` -- confirm against the preprocessing code.

    Parameters
    ----------
    box_xy: tensor, box centres, last axis ordered (x, y)
    box_wh: tensor, box sizes, last axis ordered (w, h)
    input_shape: network input shape, cast to the dtype of the boxes
    image_shape: original image shape, cast to the dtype of the boxes

    Returns
    -------
    boxes: tensor, last axis ordered (y_min, x_min, y_max, x_max)
    """
    centers_yx = box_xy[..., ::-1]
    sizes_hw = box_wh[..., ::-1]

    box_dtype = K.dtype(centers_yx)
    input_shape = K.cast(input_shape, box_dtype)
    image_shape = K.cast(image_shape, box_dtype)

    # Size of the image after scaling by the smallest input/image ratio.
    resize_ratio = K.min(input_shape / image_shape)
    new_shape = K.round(image_shape * resize_ratio)
    offset = (input_shape - new_shape) / 2. / input_shape
    scale = input_shape / new_shape

    centers_yx = (centers_yx - offset) * scale
    sizes_hw = sizes_hw * scale

    half_hw = sizes_hw / 2.
    top_left = centers_yx - half_hw
    bottom_right = centers_yx + half_hw
    corners = K.concatenate([
        top_left[..., 0:1],      # y_min
        top_left[..., 1:2],      # x_min
        bottom_right[..., 0:1],  # y_max
        bottom_right[..., 1:2],  # x_max
    ])

    # Scale boxes back to original image shape.
    return corners * K.concatenate([image_shape, image_shape])
def yolo_head(feats, anchors, num_classes, input_shape):
    """Decode raw final-layer features into box parameters.

    Parameters
    ----------
    feats: tensor, raw layer output; axes 1 and 2 are used as grid
        height and width
    anchors: sequence of (w, h) anchor pairs used by this layer
    num_classes: integer
    input_shape: network input shape, indexed as (height, width)

    Returns
    -------
    box_xy: sigmoid offsets plus cell index, divided by the grid size
    box_wh: exp of the raw sizes times the anchors, divided by the
        reversed input shape
    box_confidence: sigmoid of channel 4
    box_class_probs: sigmoid of channels 5 onwards
    """
    num_anchors = len(anchors)
    feat_dtype = K.dtype(feats)

    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors),
                               [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    grid_h, grid_w = grid_shape[0], grid_shape[1]

    # Per-cell row and column indices, combined as (x, y) on the last axis.
    row_index = K.reshape(K.arange(0, stop=grid_h), [-1, 1, 1, 1])
    col_index = K.reshape(K.arange(0, stop=grid_w), [1, -1, 1, 1])
    grid_y = K.tile(row_index, [1, grid_w, 1, 1])
    grid_x = K.tile(col_index, [grid_h, 1, 1, 1])
    grid = K.cast(K.concatenate([grid_x, grid_y]), feat_dtype)

    feats = K.reshape(
        feats, [-1, grid_h, grid_w, num_anchors, num_classes + 5])

    xy_offset = K.sigmoid(feats[..., :2])
    wh_ratio = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (xy_offset + grid) / K.cast(grid_shape[::-1], feat_dtype)
    box_wh = wh_ratio * anchors_tensor / K.cast(input_shape[::-1], feat_dtype)

    return box_xy, box_wh, box_confidence, box_class_probs
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5):
    '''Return yolo_loss tensor

    Parameters
    ----------
    args: list; the first three entries are used as the network outputs
        (yolo_outputs) and the remaining ones as the targets (y_true)
    anchors: array, shape=(T, 2), wh
    num_classes: integer
    ignore_thresh: float, predictions whose best IoU with a true box is
        not below this value are left out of the no-object confidence term

    Returns
    -------
    loss: tensor, summed squared-error loss divided by the batch size

    '''
    yolo_outputs, y_true = args[:3], args[3:]
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    true_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
                   for idx in range(3)]
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    loss = 0

    for layer in range(3):
        layer_true = y_true[layer]
        object_mask = layer_true[..., 4:5]
        true_class_probs = layer_true[..., 5:]

        pred_xy, pred_wh, pred_confidence, pred_class_probs = yolo_head(
            yolo_outputs[layer], anchors[anchor_mask[layer]],
            num_classes, input_shape)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet box loss.
        xy_delta = (layer_true[..., :2] - pred_xy) * grid_shapes[layer][::-1]
        wh_delta = K.log(layer_true[..., 2:4]) - K.log(pred_wh)
        # Cells without an object have w = h = 0, so log gives -inf there;
        # replace those entries with zeros.
        wh_delta = K.switch(object_mask, wh_delta, K.zeros_like(wh_delta))
        box_delta = K.concatenate([xy_delta, wh_delta], axis=-1)
        box_delta_scale = 2 - layer_true[..., 2:3] * layer_true[..., 3:4]

        # Find ignore mask, iterate over each sample of the batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # True boxes of sample b, selected by its object mask.
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            low_iou = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, ignore_mask.write(b, low_iou)

        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *args: b < m, loop_body, [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        box_loss = object_mask * K.square(box_delta * box_delta_scale)
        object_term = object_mask * K.square(1 - pred_confidence)
        background_term = ((1 - object_mask)
                           * K.square(0 - pred_confidence) * ignore_mask)
        confidence_loss = object_term + background_term
        class_loss = object_mask * K.square(
            true_class_probs - pred_class_probs)

        loss += K.sum(box_loss) + K.sum(confidence_loss) + K.sum(class_loss)

    return loss / K.cast(m, K.dtype(loss))
def _get_anchor_positive_triplet_mask(self, y_true: Tensor, pairwise_dist: Tensor) -> Tensor:
    """Build a 0/1 mask of valid (anchor, positive) pairs.

    An entry is 1 only when both items carry the same label and their
    pairwise distance is non-zero. The non-zero test is what removes the
    pair of an item with itself; any other pair at exactly zero distance
    is dropped as well. The result has the dtype of ``pairwise_dist``.
    """
    dist_dtype = K.dtype(pairwise_dist)

    # 1 where label(a) == label(p).
    labels_match = K.equal(K.expand_dims(y_true, 0),
                           K.expand_dims(y_true, 1))
    same_label = K.cast(labels_match, dist_dtype)

    # 1 where dist(a, p) != 0.
    nonzero_dist = K.cast(K.not_equal(pairwise_dist, 0.0), dist_dtype)

    return same_label * nonzero_dist
def _get_semihard_anchor_negative_triplet_mask(self, negative_dist: Tensor, hardest_positive_dist: Tensor, mask_negative: Tensor) -> Tensor:
    """Select negatives lying farther away than the hardest positive.

    For each row, the returned mask marks entries where
    ``negative_dist > hardest_positive_dist``. Rows that have no such
    entry fall back to ``mask_negative`` unchanged.
    """
    dist_dtype = K.dtype(negative_dist)

    # 1 where max(dist(a, p)) < dist(a, n).
    farther = K.cast(K.greater(negative_dist, hardest_positive_dist),
                     dist_dtype)

    # Per-row flag: does the row contain at least one such negative?
    row_has_any = K.greater(K.sum(farther, 1), 0.0)
    row_flag = K.cast(K.expand_dims(row_has_any, 1), dist_dtype)

    return mask_negative * (1 - row_flag) + farther * row_flag
def call(self, inputs, mask=None):
    """Add normalised speaker vectors into the life-long memory.

    ``inputs[0]`` holds speaker indices and ``inputs[1]`` the speech
    vectors. Each vector is L2-normalised along axis 1, added to the
    memory rows selected by the indices, and the resulting memory is
    normalised along axis 1 again and returned.

    Raises ``TypeError`` when ``inputs`` is not a list of at least two
    tensors.
    """
    if not (isinstance(inputs, list) and len(inputs) > 1):
        raise TypeError('SpkLifeLongMemory must be called on a list of tensors '
                        '(at least 2). Got: ' + str(inputs))

    def _normalise_rows(matrix):
        # Exact zeros are replaced by a tiny value before the norm is
        # taken, so the division below never sees a zero norm.
        nonzero = K.switch(K.equal(matrix, 0.), np.spacing(1), matrix)
        norm = K.sqrt(K.sum(nonzero**2, axis=1))
        norm = norm.dimshuffle((0, 'x'))
        return T.true_div(matrix,
                          K.repeat_elements(norm, self.vec_dim, axis=1))

    # (None(batch), 1), index of speaker -- flattened to a vector
    speaker_idx = inputs[0]
    speaker_idx = K.reshape(speaker_idx, (speaker_idx.shape[0], ))
    if K.dtype(speaker_idx) != 'int32':
        speaker_idx = K.cast(speaker_idx, 'int32')

    # (None(batch), embed_dim)
    unit_vectors = _normalise_rows(inputs[1])

    # Store speech vector into life-long memory according to the
    # speaker identity, then normalise the memory.
    updated_mem = T.inc_subtensor(self.life_long_mem[speaker_idx, :],
                                  unit_vectors)
    return _normalise_rows(updated_mem)
def get_updates(self, loss, params):
    """Build momentum-SGD update ops with per-parameter lr multipliers.

    A parameter whose ``name`` is a key of ``self.lr_mult`` has the
    (optionally decayed) learning rate multiplied by that entry; every
    other parameter uses the learning rate as is. Returns the list of
    update ops, which is also stored on ``self.updates``.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        iteration = K.cast(self.iterations, K.dtype(self.decay))
        lr *= (1. / (1. + self.decay * iteration))

    # momentum
    moments = [K.zeros(K.int_shape(param)) for param in params]
    self.weights = [self.iterations] + moments

    for param, grad, moment in zip(params, grads, moments):
        scaled_lr = (lr * self.lr_mult[param.name]
                     if param.name in self.lr_mult else lr)

        velocity = self.momentum * moment - scaled_lr * grad
        self.updates.append(K.update(moment, velocity))

        if self.nesterov:
            new_param = param + self.momentum * velocity - scaled_lr * grad
        else:
            new_param = param + velocity

        # Apply constraints.
        constraint = getattr(param, 'constraint', None)
        if constraint is not None:
            new_param = constraint(new_param)

        self.updates.append(K.update(param, new_param))
    return self.updates
def _batch_all_triplet_loss(self, y_true: Tensor, pairwise_dist: Tensor) -> Tensor:
    """Average the margin loss over all triplets with a positive loss.

    For every (a, p, n) index triple the term
    ``dist(a, p) - dist(a, n) + margin`` is formed, multiplied by the
    mask from ``_get_triplet_mask`` and clipped at zero. The sum is
    divided by the number of triplets whose loss exceeds 1e-16 (plus
    1e-16 to avoid dividing by zero).
    """
    # Broadcasting (a, p, 1) against (a, 1, n) gives one entry per triplet.
    dist_ap = K.expand_dims(pairwise_dist, 2)
    dist_an = K.expand_dims(pairwise_dist, 1)
    raw_loss = dist_ap - dist_an + self.margin

    triplet_mask = self._get_triplet_mask(y_true, pairwise_dist)
    hinge_loss = K.clip(triplet_mask * raw_loss, 0.0, None)

    is_active = K.cast(K.greater(hinge_loss, 1e-16), K.dtype(hinge_loss))
    active_count = K.sum(is_active)

    return K.sum(hinge_loss) / (active_count + 1e-16)
def call(self, x, mask=None):
    """Average ``x`` over axis 1, counting only unmasked steps.

    Without a mask the parent ``GlobalAveragePooling1D.call`` is used.
    With a mask, masked positions contribute zero to the sum and the
    divisor is the number of unmasked steps, floored at one.
    """
    if mask is not None:
        # Broadcast the mask across the feature axis of x.
        weights = K.tile(K.expand_dims(mask), [1, 1, K.shape(x)[2]])
        weights = K.cast(weights, K.dtype(x))

        step_count = K.sum(weights, axis=1)
        step_count = K.maximum(step_count, K.ones_like(step_count))

        return K.sum(weights * x, axis=1) / step_count

    return super(GlobalAveragePooling1D, self).call(x)
def call(self, x, mask=None):
    """Gather rows of the column-normalised weight matrix for indices ``x``.

    ``x`` is cast to int32 when needed. When ``0 < self.dropout < 1``,
    whole rows of ``self.W`` are randomly zeroed (and the kept rows
    rescaled by ``1 / retain_p``) during training. The matrix is then
    divided by its sum over axis 0 before the lookup.
    """
    indices = x if K.dtype(x) == 'int32' else K.cast(x, 'int32')

    weights = self.W
    if 0. < self.dropout < 1.:
        retain_p = 1. - self.dropout
        # One keep/drop draw per row, rescaled to keep the expectation.
        row_keep = K.random_binomial((self.input_dim,), p=retain_p)
        row_keep = K.expand_dims(row_keep * (1. / retain_p))
        weights = K.in_train_phase(self.W * row_keep, self.W)

    column_sum = K.sum(weights, axis=0)
    return K.gather(weights / column_sum, indices)
def _pairwise_distances(self, inputs: List[Tensor]) -> Tensor:
    """Euclidean distances between the two embedding batches.

    Both batches are stacked, all squared distances are computed through
    the ``|a|^2 - 2ab + |b|^2`` expansion, and the block of rows
    ``[0, bs)`` and columns ``[bs, 2 * bs)`` is kept, where ``bs`` is the
    number of rows of the first batch.

    NOTE(review): the slice assumes the second batch also has ``bs``
    rows -- confirm with the caller.
    """
    emb_c, emb_r = inputs
    bs = K.shape(emb_c)[0]

    stacked = K.concatenate([emb_c, emb_r], 0)
    gram = K.dot(stacked, K.transpose(stacked))
    sq_norm = K.batch_dot(stacked, stacked, axes=1)

    sq_dist = K.transpose(sq_norm) - 2.0 * gram + sq_norm
    sq_dist = K.slice(sq_dist, (0, bs), (bs, bs))
    # Clamp small negative values caused by rounding.
    sq_dist = K.clip(sq_dist, 0.0, None)

    # sqrt is taken on a slightly shifted value where the distance is
    # exactly zero; those entries are forced back to zero afterwards.
    is_zero = K.cast(K.equal(sq_dist, 0.0), K.dtype(sq_dist))
    dist = K.sqrt(sq_dist + is_zero * 1e-16)

    return dist * (1.0 - is_zero)
def call(self, inputs, mask=None):
    """Gather the memory entries addressed by the speaker indices.

    ``inputs[0]`` holds the speaker identities (flattened to a vector
    and cast to int32 when needed) and ``inputs[1]`` the life-long
    memory. Returns ``K.gather(memory, indices)``.

    Raises ``TypeError`` when ``inputs`` is not a list of at least two
    tensors.
    """
    if not (isinstance(inputs, list) and len(inputs) > 1):
        raise TypeError('SelectSpkMemory must be called on a list of tensors '
                        '(at least 2). Got: ' + str(inputs))

    # (None(batch), 1), speaker identity
    speaker_idx = inputs[0]
    speaker_idx = K.reshape(speaker_idx, (speaker_idx.shape[0], ))
    if K.dtype(speaker_idx) != 'int32':
        speaker_idx = K.cast(speaker_idx, 'int32')

    # Life-long memory; the acoustic feature of each requested speaker
    # is extracted from it.
    memory = inputs[1]
    return K.gather(memory, speaker_idx)
def _preprocess_conv2d_input(x, data_format):
    """Cast and transpose the input ahead of a conv2d call.

    # Arguments
        x: input tensor.
        data_format: string, `"channels_last"` or `"channels_first"`.

    # Returns
        The tensor, cast to float32 if it was float64 and, for
        `"channels_first"`, with its channel axis moved last.
    """
    needs_downcast = K.dtype(x) == "float64"
    if needs_downcast:
        x = tf.cast(x, "float32")

    if data_format != "channels_first":
        return x

    # TF uses the last dimension as channel dimension,
    # instead of the 2nd one.
    # TH input shape: (samples, input_depth, rows, cols)
    # TF input shape: (samples, rows, cols, input_depth)
    return tf.transpose(x, (0, 2, 3, 1))
def call(self, inputs, mask=None):
    """Emit a linear ramp from ``self.start`` to ``self.stop`` along axis 1.

    The position of each step is the running count of unmasked steps
    minus one; it is divided by the number of unmasked steps minus one
    (floored at one) and mapped onto ``[start, stop]``. Masked steps
    output zero. Without a mask every step counts as unmasked. A
    trailing axis of size one is appended to the result.
    """
    if mask is not None:
        mask = K.cast(mask, K.dtype(inputs))
    else:
        # All-ones mask with the last axis of the input reduced away.
        mask = 1 + K.sum(K.zeros_like(inputs), axis=-1)

    last_position = K.sum(mask, axis=1) - 1
    last_position = K.maximum(last_position, K.ones_like(last_position))
    last_position = K.expand_dims(last_position)

    position = tf.cumsum(mask, axis=1) - 1
    ramp = self.start + (self.stop - self.start) * position / last_position

    return K.expand_dims(mask * ramp)
def keras_wrap(model, target, output, loss):
    """ Wraps a Keras loss function around a model output.

    A placeholder with the rank and dtype of ``model.outputs[target]`` is
    created, and ``loss`` (looked up through ``keras.objectives`` when
    given as a string) is applied to that placeholder and ``output``.

    Returns a pair ``(ins, out)`` where ``ins`` is a one-element list
    holding ``(target, placeholder)`` and ``out`` is the loss result.
    """
    # pylint: disable=import-error
    import keras.objectives as O
    import keras.backend as K
    # pylint: enable=import-error

    loss_fn = O.get(loss) if isinstance(loss, str) else loss

    target_value = model.outputs[target].value
    shape = target_value._keras_shape # pylint: disable=protected-access
    truth = K.placeholder(
        ndim=len(shape),
        dtype=K.dtype(target_value),
        name=target
    )

    ins = [(target, truth)]
    return ins, loss_fn(truth, output)
def loop_body(b, ignore_mask):
    """Write the low-IoU flags of batch item ``b`` and advance the index.

    Reads ``y_true``, ``l``, ``object_mask_bool``, ``pred_box`` and
    ``ignore_thresh`` from the enclosing scope. A prediction is flagged
    with 1 when its best IoU against the true boxes of item ``b`` is
    below ``ignore_thresh``.
    """
    has_object = object_mask_bool[b, ..., 0]
    true_box = tf.boolean_mask(y_true[l][b, ..., 0:4], has_object)

    overlaps = box_iou(pred_box[b], true_box)
    best_iou = K.max(overlaps, axis=-1)
    low_iou = K.cast(best_iou < ignore_thresh, K.dtype(true_box))

    return b + 1, ignore_mask.write(b, low_iou)
def get_updates(self, loss, params):
    """Build Adam-style update ops with a clipped per-element step size.

    The per-element step ``step_size / denom`` is clipped between a
    lower and an upper bound derived from ``self.final_lr``,
    ``self.gamma`` and the iteration count before it multiplies the
    first-moment estimate. With ``self.amsbound`` the running maximum of
    the second moment is used in the denominator. Returns the list of
    update ops, which is also stored on ``self.updates``.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        iteration = K.cast(self.iterations, K.dtype(self.decay))
        lr = lr * (1. / (1. + self.decay * iteration))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Applies bounds on actual learning rate
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    def _slot_like(param):
        # Zero-initialised state with the shape and dtype of the parameter.
        return K.zeros(K.int_shape(param), dtype=K.dtype(param))

    ms = [_slot_like(p) for p in params]
    vs = [_slot_like(p) for p in params]
    if self.amsbound:
        vhats = [_slot_like(p) for p in params]
    else:
        # Dummy slots so the zip below stays aligned.
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # apply weight decay
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Compute the bounds
        per_element_step = (step_size * K.ones_like(denom)) / denom
        # TODO: Replace with K.clip after release of Keras > 2.2.4
        bounded_step = tf.clip_by_value(per_element_step,
                                        lower_bound, upper_bound)
        new_p = p - m_t * bounded_step

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        constraint = getattr(p, 'constraint', None)
        if constraint is not None:
            new_p = constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Return yolo_loss tensor

    Parameters
    ----------
    args: list; the first ``len(anchors)//3`` entries are used as the
        network outputs (yolo_outputs), the rest as the targets (y_true)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, predictions whose best IoU with a true box is
        not below this value are left out of the no-object confidence term
    print_loss: bool, wrap the running loss in tf.Print after each layer

    Returns
    -------
    loss: tensor, sum over layers of the xy, wh, confidence and class
        terms, each divided by the batch size

    '''
    num_layers = len(anchors)//3  # default setting
    yolo_outputs, y_true = args[:num_layers], args[num_layers:]

    # NOTE(review): the two-layer mask uses anchor 3 in both layers and
    # never anchor 0 -- confirm it matches the target encoding.
    if num_layers == 3:
        anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
    else:
        anchor_mask = [[3,4,5], [1,2,3]]

    true_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
                   for idx in range(num_layers)]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for layer in range(num_layers):
        layer_true = y_true[layer]
        layer_anchors = anchors[anchor_mask[layer]]
        object_mask = layer_true[..., 4:5]
        true_class_probs = layer_true[..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[layer], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = layer_true[..., :2]*grid_shapes[layer][::-1] - grid
        raw_true_wh = K.log(
            layer_true[..., 2:4] / layer_anchors * input_shape[::-1])
        # avoid log(0)=-inf
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - layer_true[..., 2:3]*layer_true[..., 3:4]

        # Find ignore mask, iterate over each sample of the batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            low_iou = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b+1, ignore_mask.write(b, low_iou)

        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *args: b < m, loop_body, [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        coord_weight = object_mask * box_loss_scale
        xy_loss = coord_weight * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = coord_weight * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1-object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss

        if print_loss:
            loss = tf.Print(
                loss,
                [loss, xy_loss, wh_loss, confidence_loss, class_loss,
                 K.sum(ignore_mask)],
                message='loss: ')
    return loss
# Fragment of a larger routine: decodes the raw outputs of three network
# layers into box centres, sizes, confidences and class probabilities.
# `anchors`, `net_out`, `input_shape` and `num_classes` come from code
# outside this fragment.
boxes = list()
box_scores = list()
# classes = list()
for i in range(3):  # 52 26 13
    # Three consecutive anchors per layer, taken along the second-to-last axis.
    anchor = anchors[..., 3 * i:3 * (i + 1), :]
    # feats = model.output[i]
    feats = net_out[i]
    grid_shape = K.shape(feats)[1:3]  # height, width
    # Per-cell row (y) and column (x) indices of the grid.
    grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), [1, grid_shape[1], 1, 1])
    grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))
    # Split the channel axis into 3 anchors x (num_classes + 5) values.
    feats = K.reshape(feats, [-1, grid_shape[0], grid_shape[1], 3, num_classes + 5])
    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
        grid_shape[::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchor / K.cast(input_shape[::-1], K.dtype(feats))
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])
    # box_xy = (box_xy - offset) * scale
    # box_wh *= scale
    # box_mins = box_xy - (box_wh / 2.)
def Constant(c, reference=None):
    """Create a backend constant, optionally matching another tensor's dtype.

    Parameters
    ----------
    c: value of the constant
    reference: optional tensor; when given, ``c`` is converted to the
        dtype of this tensor and the constant is created with that dtype

    Returns
    -------
    The tensor returned by ``K.constant``.
    """
    if reference is None:
        return K.constant(c)

    dtype = K.dtype(reference)
    # A numpy dtype instance is not callable; its ``type`` attribute is
    # the scalar constructor (e.g. ``np.float32``) that converts ``c``.
    return K.constant(np.dtype(dtype).type(c), dtype=dtype)
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Return yolo_loss tensor

    Parameters
    ----------
    args: list; the first ``len(anchors) // 3`` entries are used as the
        network outputs (yolo_outputs), the rest as the targets (y_true)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, predictions whose best IoU with a true box is
        not below this value are left out of the no-object confidence term
    print_loss: bool, wrap the running loss in tf.Print after each layer

    Returns
    -------
    loss: tensor, sum over layers of the xy, wh, confidence and class
        terms, each divided by the batch size

    '''
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs, y_true = args[:num_layers], args[num_layers:]

    # NOTE(review): the two-layer mask uses anchor 3 in both layers and
    # never anchor 0 -- confirm it matches the target encoding.
    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]

    true_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
                   for idx in range(num_layers)]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        layer_true = y_true[l]
        layer_anchors = anchors[anchor_mask[l]]
        object_mask = layer_true[..., 4:5]
        true_class_probs = layer_true[..., 5:]
        # NOTE(review): debug output emitted while the graph is built.
        print(f'now it is in layer {l}', object_mask)

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = layer_true[..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(
            layer_true[..., 2:4] / layer_anchors * input_shape[::-1])
        # avoid log(0)=-inf
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - layer_true[..., 2:3] * layer_true[..., 3:4]

        # Find ignore mask, iterate over each sample of the batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            low_iou = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, ignore_mask.write(b, low_iou)

        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body,
                                       [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        coord_weight = object_mask * box_loss_scale
        xy_loss = coord_weight * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = coord_weight * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss

        if print_loss:
            loss = tf.Print(loss, [
                loss, xy_loss, wh_loss, confidence_loss, class_loss,
                K.sum(ignore_mask)
            ], message='loss: ')
    return loss
def get_updates(self, loss, params):
    """Build momentum-SGD update ops with a Gaussian learning-rate cycle.

    The learning rate follows ``min_lr + peak_lr * exp(-(i - c)**2 /
    (c * sigma)**2)`` with ``i`` the iteration count, ``c`` =
    ``self.peaklriter`` and ``sigma`` = ``self.lrsigma``. Parameters
    whose name matches a key of ``self.lr`` / ``self.momentum`` /
    ``self.clips`` (via ``set_pattern_find``) get their own settings.
    The magnitude of each update is clipped to
    ``mean(|p|) * self.UPCLIP``. Returns the list of update ops, which
    is also stored on ``self.updates``.
    """
    grads = self.get_gradients(loss, params)
    # first update the number of iterations
    self.updates = [K.update_add(self.iterations, 1)]

    # Cycling Gaussian LR
    def gauss_lr(min_lr, max_lr, center, lrsigma, i):
        return (min_lr + max_lr * K.exp(-(i - center)**2 / (center * lrsigma)**2))

    ite_casted = K.cast(self.iterations, K.dtype(self.peaklriter))
    all_lr = gauss_lr(self.min_lr['all'], self.peak_lr['all'],
                      self.peaklriter, self.lrsigma, ite_casted)
    self.updates.append(K.update(self.lr['all'], all_lr))

    moments = [K.zeros(K.int_shape(param)) for param in params]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        # --- learning rate for this parameter ---
        lrptrkey = set_pattern_find(p.name, self.lr.keys())
        if not lrptrkey:
            p_lr = self.lr['all']
        else:
            if self.verbose > 0:
                print("Setting different learning rate for ", p.name, " : ",
                      K.eval(self.lr[lrptrkey]))
            has_own_cycle = (set_pattern_find(p.name, self.min_lr.keys())
                             and set_pattern_find(p.name, self.peak_lr.keys()))
            cycle_key = lrptrkey if has_own_cycle else 'all'
            p_lr = gauss_lr(self.min_lr[cycle_key], self.peak_lr[cycle_key],
                            self.peaklriter, self.lrsigma, ite_casted)

        # --- momentum for this parameter ---
        momptrkey = set_pattern_find(p.name, self.momentum.keys())
        if momptrkey:
            if self.verbose > 0:
                print("Setting different momentum for ", p.name, " , ",
                      K.eval(self.momentum[momptrkey]))
            momentum = self.momentum[momptrkey]
        else:
            momentum = self.momentum['all']

        lr_step = p_lr * g
        updt = momentum * m - lr_step
        if self.nesterov:
            updt = momentum * updt - lr_step

        # CHANGE CLIP
        # Looked up as in the original code; not used below.
        _to_tensor = K.tensorflow_backend._to_tensor
        _clip_by_val = K.tf.clip_by_value

        # Limit the magnitude of the update, keeping its sign.
        margin = K.mean(K.abs(p)) * K.constant(self.UPCLIP)
        min_v = K.zeros_like(margin)
        clipped_magnitude = _clip_by_val(K.abs(updt), min_v, margin)
        v = K.sign(updt) * clipped_magnitude  # velocity
        new_p = p + v

        self.updates.append(K.update(m, v))

        # Apply constraints.
        constraint = getattr(p, 'constraint', None)
        if constraint is not None:
            new_p = constraint(new_p)

        clptrkey = set_pattern_find(p.name, self.clips.keys())
        if self.clips_val and clptrkey:
            c = K.eval(self.clips[clptrkey])
            if self.verbose > 0:
                print("Clipping variable",p.name," to ", c)
            new_p = K.clip(new_p, c[0], c[1])

        self.updates.append(K.update(p, new_p))
    return self.updates
def yolo_head(feats, anchors, num_classes, tree_):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features.
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.
    tree_ : object
        Class grouping; ``group_num``, ``group_offset[i]`` and
        ``group_size[i]`` are read to softmax each class group separately.

    Returns
    -------
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_conf : tensor
        Sigmoid of the objectness channel.
    box_class_pred : tensor
        Per-group softmax over the class channels, concatenated.
    """
    num_anchors = len(anchors)
    feat_dtype = K.dtype(feats)

    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors, name='anchor'),
                               [1, 1, 1, num_anchors, 2])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    conv_h, conv_w = conv_dims[0], conv_dims[1]

    # In YOLO the height index is the inner most iteration.
    # The height index is the height coordinate of each cell's
    # top-left corner.
    height_index = K.tile(K.arange(0, stop=conv_h), [conv_w])
    width_index = K.tile(K.expand_dims(K.arange(0, stop=conv_w), 0),
                         [conv_h, 1])
    width_index = K.flatten(K.transpose(width_index))

    conv_index = K.transpose(K.stack([height_index, width_index]))
    conv_index = K.reshape(conv_index, [1, conv_h, conv_w, 1, 2])
    conv_index = K.cast(conv_index, feat_dtype)

    feats = K.reshape(
        feats, [-1, conv_h, conv_w, num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]),
                       K.dtype(feats))

    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])

    # One softmax per class group; the first 5 channels are box values.
    group_probs = []
    for i in range(tree_.group_num):
        start = 5 + tree_.group_offset[i]
        stop = 5 + tree_.group_offset[i] + tree_.group_size[i]
        group_probs.append(K.softmax(feats[..., start:stop]))
    box_class_probs = K.concatenate(group_probs, axis=-1)

    # Adjust predictions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    # The results are positions relative to the whole image.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    """Return yolo_loss tensor

    Parameters
    ----------
    :param args: list; the first ``len(anchors) // 3`` entries are used as
        the network outputs, the rest as the matching targets (y_true)
    :param anchors: array, shape=(N, 2), wh
    :param num_classes: integer
    :param ignore_thresh: float, predictions whose best IoU with a true box
        is not below this value are left out of the no-object confidence term
    :param print_loss: bool, wrap the running loss in tf.Print after each layer

    Returns
    -------
    loss: tensor, sum over layers of the xy, wh, confidence and class
        terms, each divided by the batch size

    """
    num_layers = len(anchors) // 3  # default setting
    # yolo_outputs are the model outputs, y_true the targets.
    yolo_outputs, y_true = args[:num_layers], args[num_layers:]

    # The two-layer mask uses anchors 0-2 for the second layer.
    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [0, 1, 2]]

    true_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
                   for idx in range(num_layers)]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for layer in range(num_layers):
        layer_true = y_true[layer]
        layer_anchors = anchors[anchor_mask[layer]]
        object_mask = layer_true[..., 4:5]
        true_class_probs = layer_true[..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[layer], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = layer_true[..., :2] * grid_shapes[layer][::-1] - grid
        raw_true_wh = K.log(
            layer_true[..., 2:4] / layer_anchors * input_shape[::-1])
        # avoid log(0)=-inf
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        # 2 - true_w * true_h
        box_loss_scale = 2 - layer_true[..., 2:3] * layer_true[..., 3:4]

        # Find ignore mask, iterate over each sample of the batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            low_iou = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, ignore_mask.write(b, low_iou)

        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *args: b < m, loop_body, [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        coord_weight = object_mask * box_loss_scale
        xy_loss = coord_weight * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = coord_weight * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss

        if print_loss:
            loss = tf.Print(loss, [
                loss, xy_loss, wh_loss, confidence_loss, class_loss,
                K.sum(ignore_mask)
            ], message='loss: ')
    return loss
def step(best_indices, previous_indices):
    """Pick, for every batch row, the entry addressed by the previous index.

    ``previous_indices`` is a batch_size vector of state indices and
    ``best_indices`` a (batch_size, num_states) matrix. The result is
    ``[best_indices[b, previous_indices[b]] for b in range(batch_size)]``.
    ``batch_size`` comes from the enclosing scope.
    """
    index_dtype = K.dtype(previous_indices)
    batch_rows = arange(batch_size, dtype=index_dtype)
    return multi_index(best_indices, [batch_rows, previous_indices])
def gaussian_yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False,
                       use_focal_confidence_loss=False, use_focal_class_loss=False):
    """Build the Gaussian-YOLO training loss tensor.

    Box coordinates are scored with the negative log-likelihood of a
    Gaussian whose mean and (sigmoid-squashed) sigma are both predicted.
    The raw prediction layout read here is: 0:4 box means (x, y, w, h),
    4:8 box sigmas, 8:9 objectness, 9: class logits.

    Parameters
    ----------
    args: list of tensors, ``[*model_outputs, *y_true]``; the first
        ``len(anchors) // 3`` entries are the model outputs.
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, cells without an object whose best IoU against the
        ground-truth boxes reaches this value are excluded from the
        (non-focal) confidence loss.
    print_loss: bool, wrap the running loss in ``tf.Print`` when true.
    use_focal_confidence_loss: bool, use ``sigmoid_focal_loss`` for the
        objectness term instead of masked binary cross-entropy.
    use_focal_class_loss: bool, use ``sigmoid_focal_loss`` for the class
        term instead of masked binary cross-entropy.

    Returns
    -------
    loss: tensor, sum over layers of all components divided by batch size.
    """
    num_layers = len(anchors) // 3
    # args = [*model_body.output, *y_true]
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]

    # [6, 7, 8]: [(116, 90), (156, 198), (373, 326)]
    # [3, 4, 5]: [(30 , 61), (62, 45), (59, 119)]
    # [0, 1, 2]: [(10, 13), (16, 30), (33, 23)]
    # NOTE(review): the two-layer mask [1, 2, 3] shares anchor 3 with the
    # first layer and never uses anchor 0. It is left unchanged because it
    # must agree with whatever code encodes y_true - confirm against it.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # Network input size (h, w), e.g. [416, 416]; first output has stride 32.
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    # Per-layer grid size (h, w), e.g. [[13, 13], [26, 26], [52, 52]].
    grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0]))
                   for l in range(num_layers)]

    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        object_mask = y_true[l][..., 4:5]      # 1 where a box is assigned
        true_class_probs = y_true[l][..., 5:]

        # pred_xy and pred_wh are normalized.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], anchors[anchor_mask[l]], num_classes,
            input_shape, calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Find ignore mask, iterating over each batch element.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # Ground-truth boxes of batch element b, shape (n, 4).
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            # Below the threshold the cell is a plain negative; at or above
            # it (and not a positive) the cell is ignored, which limits the
            # number of negatives that sit close to a ground-truth box.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *args: b < m, loop_body, [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        # Trailing axis added so the mask broadcasts against object_mask.
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Encode the ground-truth boxes in raw prediction space.
        # grid holds (x, y) offsets while grid_shapes is (h, w), so the grid
        # size must be reversed to (w, h) before scaling the (x, y) centres.
        raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] * input_shape[::-1])
        # log(0) = -inf for empty cells and 0 * inf = NaN, so zero them out.
        raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh))

        # Weights errors on small ground-truth boxes more heavily.
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        def _gaussian_nll(mu, sigma_logits, target):
            # Masked, scaled negative log-likelihood of one box coordinate.
            density = gaussian_distribution(
                mu=mu, sigma=K.sigmoid(sigma_logits), x=target)
            return (-1) * object_mask * box_loss_scale * \
                K.log(density + K.epsilon())

        x_loss = _gaussian_nll(K.sigmoid(raw_pred[..., 0:1]),
                               raw_pred[..., 4:5], raw_true_xy[..., 0:1])
        y_loss = _gaussian_nll(K.sigmoid(raw_pred[..., 1:2]),
                               raw_pred[..., 5:6], raw_true_xy[..., 1:2])
        w_loss = _gaussian_nll(raw_pred[..., 2:3],
                               raw_pred[..., 6:7], raw_true_wh[..., 0:1])
        h_loss = _gaussian_nll(raw_pred[..., 3:4],
                               raw_pred[..., 7:8], raw_true_wh[..., 1:2])

        if use_focal_confidence_loss:
            confidence_loss = sigmoid_focal_loss(object_mask, raw_pred[..., 8:9])
        else:
            confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 8:9], from_logits=True) + \
                (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 8:9], from_logits=True) * ignore_mask

        if use_focal_class_loss:
            class_loss = sigmoid_focal_loss(true_class_probs, raw_pred[..., 9:])
        else:
            class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[..., 9:], from_logits=True)

        x_loss = K.sum(x_loss) / mf
        y_loss = K.sum(y_loss) / mf
        w_loss = K.sum(w_loss) / mf
        h_loss = K.sum(h_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += x_loss + y_loss + w_loss + h_loss + confidence_loss + class_loss

        if print_loss:
            loss = tf.Print(loss,
                            [loss, x_loss, y_loss, w_loss, h_loss,
                             confidence_loss, class_loss, K.sum(ignore_mask)],
                            message='loss: ')
    return loss
def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
               input_data=None, expected_output=None,
               expected_output_dtype=None, expected_output_shape=None,
               validate_training=True, adapt_data=None, custom_objects=None,
               test_harness=None, supports_masking=None):
    """Test routine for a layer with a single input and single output.

    The layer is exercised in the functional API, round-tripped through
    `get_config`/`from_config`, optionally trained for one batch, and then
    exercised again as the first layer of a `Sequential` model.

    Args:
      layer_cls: Layer class object.
      kwargs: Optional dictionary of keyword arguments for instantiating the
        layer.
      input_shape: Input shape tuple.
      input_dtype: Data type of the input data.
      input_data: Numpy array of input data.
      expected_output: Numpy array of the expected output.
      expected_output_dtype: Data type expected for the output.
      expected_output_shape: Shape tuple for the expected shape of the output.
      validate_training: Whether to attempt to validate training on this
        layer. This might be set to False for non-differentiable layers that
        output string or integer values.
      adapt_data: Optional data for an 'adapt' call. If None, adapt() will not
        be tested for this layer. This is only relevant for
        PreprocessingLayers.
      custom_objects: Optional dictionary mapping name strings to custom
        objects in the layer class. This is helpful for testing custom layers.
      test_harness: The Tensorflow test, if any, that this function is being
        called in.
      supports_masking: Optional boolean to check the `supports_masking`
        property of the layer. If None, the check will not be performed.

    Returns:
      The output data (Numpy array) returned by the layer, for additional
      checks to be done by the calling code.

    Raises:
      ValueError: if both `input_data` and `input_shape` are None.
      AssertionError: if an observed dtype, shape or `supports_masking` value
        differs from the expected one.
    """
    if input_data is None:
        # No data supplied: synthesize random data of the requested shape.
        if input_shape is None:
            raise ValueError('input_shape is None')
        if not input_dtype:
            input_dtype = 'float32'
        input_data_shape = list(input_shape)
        # Unknown dimensions get a small random size in [1, 4).
        for i, e in enumerate(input_data_shape):
            if e is None:
                input_data_shape[i] = np.random.randint(1, 4)
        input_data = 10 * np.random.random(input_data_shape)
        if input_dtype[:5] == 'float':
            input_data -= 0.5
        input_data = input_data.astype(input_dtype)
    elif input_shape is None:
        input_shape = input_data.shape
    if input_dtype is None:
        input_dtype = input_data.dtype
    if expected_output_dtype is None:
        expected_output_dtype = input_dtype

    # Pick the comparison function: exact equality for strings, closeness
    # otherwise; prefer the test harness' assertions when one is given.
    if tf.as_dtype(expected_output_dtype) == tf.string:
        if test_harness:
            assert_equal = test_harness.assertAllEqual
        else:
            assert_equal = string_test
    else:
        if test_harness:
            assert_equal = test_harness.assertAllClose
        else:
            assert_equal = numeric_test

    # instantiation
    kwargs = kwargs or {}
    layer = layer_cls(**kwargs)

    # NOTE(review): the two message fragments below join without a space
    # ("...is %rbut expected...") - left as is, confirm whether intended.
    if (supports_masking is not None and
            layer.supports_masking != supports_masking):
        raise AssertionError(
            'When testing layer %s, the `supports_masking` property is %r'
            'but expected to be %r.\nFull kwargs: %s' %
            (layer_cls.__name__, layer.supports_masking, supports_masking,
             kwargs))

    # Test adapt, if data was passed.
    if adapt_data is not None:
        layer.adapt(adapt_data)

    # test get_weights , set_weights at layer level
    weights = layer.get_weights()
    layer.set_weights(weights)

    # test and instantiation from weights
    # NOTE(review): membership is tested against the getargspec() result
    # itself rather than its argument-name list - confirm this is intended.
    if 'weights' in tf_inspect.getargspec(layer_cls.__init__):
        kwargs['weights'] = weights
        layer = layer_cls(**kwargs)

    # test in functional API
    x = layers.Input(shape=input_shape[1:], dtype=input_dtype)
    y = layer(x)
    if backend.dtype(y) != expected_output_dtype:
        raise AssertionError(
            'When testing layer %s, for input %s, found output '
            'dtype=%s but expected to find %s.\nFull kwargs: %s' %
            (layer_cls.__name__, x, backend.dtype(y), expected_output_dtype,
             kwargs))

    def assert_shapes_equal(expected, actual):
        """Asserts that the output shape from the layer matches the actual shape."""
        if len(expected) != len(actual):
            raise AssertionError(
                'When testing layer %s, for input %s, found output_shape='
                '%s but expected to find %s.\nFull kwargs: %s' %
                (layer_cls.__name__, x, actual, expected, kwargs))

        for expected_dim, actual_dim in zip(expected, actual):
            # Unwrap v1 Dimension objects so plain values are compared.
            if isinstance(expected_dim, tf.compat.v1.Dimension):
                expected_dim = expected_dim.value
            if isinstance(actual_dim, tf.compat.v1.Dimension):
                actual_dim = actual_dim.value
            # A None expected dimension matches anything.
            if expected_dim is not None and expected_dim != actual_dim:
                raise AssertionError(
                    'When testing layer %s, for input %s, found output_shape='
                    '%s but expected to find %s.\nFull kwargs: %s' %
                    (layer_cls.__name__, x, actual, expected, kwargs))

    if expected_output_shape is not None:
        assert_shapes_equal(tf.TensorShape(expected_output_shape), y.shape)

    # check shape inference
    model = models.Model(x, y)
    computed_output_shape = tuple(
        layer.compute_output_shape(tf.TensorShape(input_shape)).as_list())
    computed_output_signature = layer.compute_output_signature(
        tf.TensorSpec(shape=input_shape, dtype=input_dtype))
    actual_output = model.predict(input_data)
    actual_output_shape = actual_output.shape
    assert_shapes_equal(computed_output_shape, actual_output_shape)
    assert_shapes_equal(computed_output_signature.shape, actual_output_shape)
    if computed_output_signature.dtype != actual_output.dtype:
        raise AssertionError(
            'When testing layer %s, for input %s, found output_dtype='
            '%s but expected to find %s.\nFull kwargs: %s' %
            (layer_cls.__name__, x, actual_output.dtype,
             computed_output_signature.dtype, kwargs))
    if expected_output is not None:
        assert_equal(actual_output, expected_output)

    # test serialization, weight setting at model level
    model_config = model.get_config()
    recovered_model = models.Model.from_config(model_config, custom_objects)
    if model.weights:
        weights = model.get_weights()
        recovered_model.set_weights(weights)
        output = recovered_model.predict(input_data)
        assert_equal(output, actual_output)

    # test training mode (e.g. useful for dropout tests)
    # Rebuild the model to avoid the graph being reused between predict() and
    # the train_on_batch() call below.
    # See b/120160788 for more details. This should be mitigated after 2.0.
    layer_weights = layer.get_weights()  # Get the layer weights BEFORE training.
    if validate_training:
        model = models.Model(x, layer(x))
        if _thread_local_data.run_eagerly is not None:
            model.compile(
                'rmsprop',
                'mse',
                weighted_metrics=['acc'],
                run_eagerly=should_run_eagerly())
        else:
            model.compile('rmsprop', 'mse', weighted_metrics=['acc'])
        model.train_on_batch(input_data, actual_output)

    # test as first layer in Sequential API
    layer_config = layer.get_config()
    layer_config['batch_input_shape'] = input_shape
    layer = layer.__class__.from_config(layer_config)

    # Test adapt, if data was passed.
    if adapt_data is not None:
        layer.adapt(adapt_data)

    model = models.Sequential()
    model.add(layers.Input(shape=input_shape[1:], dtype=input_dtype))
    model.add(layer)

    # Restore the pre-training weights so the output is comparable with
    # `expected_output`.
    layer.set_weights(layer_weights)
    actual_output = model.predict(input_data)
    actual_output_shape = actual_output.shape
    for expected_dim, actual_dim in zip(computed_output_shape,
                                        actual_output_shape):
        if expected_dim is not None:
            if expected_dim != actual_dim:
                raise AssertionError(
                    'When testing layer %s **after deserialization**, '
                    'for input %s, found output_shape='
                    '%s but expected to find inferred shape %s.\nFull kwargs: %s' %
                    (layer_cls.__name__, x, actual_output_shape,
                     computed_output_shape, kwargs))
    if expected_output is not None:
        assert_equal(actual_output, expected_output)

    # test serialization, weight setting at model level
    model_config = model.get_config()
    recovered_model = models.Sequential.from_config(model_config, custom_objects)
    if model.weights:
        weights = model.get_weights()
        recovered_model.set_weights(weights)
        output = recovered_model.predict(input_data)
        assert_equal(output, actual_output)

    # for further checks in the caller function
    return actual_output
def custom_loss(args, anchors, num_classes, global_step=0., rescore_confidence=False):
    """Modified YOLO localization loss with an evidential (EDL) class term.

    Parameters
    ----------
    args : tuple
        ``(yolo_output, true_boxes, detectors_mask, matching_true_boxes)``:

        yolo_output : tensor
            Final convolutional layer features.
        true_boxes : tensor
            Ground truth boxes tensor with shape [batch, num_true_boxes, 5]
            containing box x_center, y_center, width, height, and class.
        detectors_mask : array
            0/1 mask for detector positions where there is a matching ground
            truth.
        matching_true_boxes : array
            Corresponding ground truth boxes for positive detector positions.
            Already adjusted for conv height and width.
    anchors : tensor
        Anchor boxes for model.
    num_classes : int
        Number of object classes.
    global_step : float, default=0.
        Currently unused; the KL annealing coefficient is fixed at 5.0.
    rescore_confidence : bool, default=False
        If true then set confidence target to IOU of best predicted box with
        the closest matching ground truth box.

    Returns
    -------
    tensor
        Stack of 12 scalars, in order: total_loss, confidence_loss_sum,
        classification_loss_sum, edl_loss_sum, coordinates_loss_sum, acc,
        mean_ev_succ, mean_ev_fail, annealing_coeff, exp_ce_loss_sum,
        kl_loss_sum, akl_loss_sum.
    """
    (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args
    num_anchors = len(anchors)
    object_scale = 5
    no_object_scale = 1
    class_scale = 2.5
    coordinates_scale = 1
    edl_scale = 2.5

    yad2kOutput, edlOutput = yolo_head(yolo_output, anchors, num_classes, clip=5.)
    pred_xy, pred_wh, pred_confidence, pred_softmax_class_probs = yad2kOutput
    pred_class_logits, pred_box_class_evidence, pred_alpha, \
        pred_S, pred_uncertainty, pred_class_probs = edlOutput

    # Unadjusted box predictions for loss.
    # TODO: Remove extra computation shared with yolo_head.
    yolo_output_shape = K.shape(yolo_output)
    feats = K.reshape(yolo_output, [
        -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
        num_classes + 5
    ])
    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

    # TODO: Adjust predictions by image width/height for non-square images?
    # IOUs may be off due to different aspect ratio.

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    true_boxes_shape = K.shape(true_boxes)

    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # Find IOU of each predicted box with each ground truth box.
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # TODO: Darknet region training includes extra coordinate loss for early
    # training steps to encourage predictions to match anchor priors.

    # Determine confidence weights from object and no_object weights.
    # NOTE: YOLO does not use binary cross-entropy here.
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)

    if rescore_confidence:
        objects_loss = (object_scale * detectors_mask *
                        K.square(best_ious - pred_confidence))
    else:
        objects_loss = (object_scale * detectors_mask *
                        K.square(1 - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLO does not use categorical cross-entropy loss here.
    matching_classes = K.cast(matching_true_boxes[..., 4], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_probs))

    # Coordinate loss for matching detection boxes.
    matching_boxes = matching_true_boxes[..., 0:4]
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))

    # ------------------------------------------------------------------
    # EDL loss: expected value of the cross-entropy loss over the
    # predicted Dirichlet distribution + KL regularization term.
    # ------------------------------------------------------------------

    def _sum_over_classes(tensor):
        # Reduce the class axis (4) while keeping it for broadcasting.
        # `keepdims` is used throughout; `keep_dims` is a deprecated alias.
        return tf.reduce_sum(tensor, axis=4, keepdims=True)

    # Expected value of cross entropy loss
    A = _sum_over_classes(
        matching_classes * (tf.digamma(pred_S) - tf.digamma(pred_alpha)))

    # KL term: evidence for the true class is removed before comparing the
    # remaining Dirichlet parameters against the uniform Dirichlet (all ones).
    alp = pred_box_class_evidence * (1 - matching_classes) + 1
    beta = K.ones_like(alp)
    S_alpha = _sum_over_classes(alp)
    S_beta = _sum_over_classes(beta)
    lnB = tf.lgamma(S_alpha) - _sum_over_classes(tf.lgamma(alp))
    lnB_uni = _sum_over_classes(tf.lgamma(beta)) - tf.lgamma(S_beta)

    dg0 = tf.digamma(S_alpha)
    dg1 = tf.digamma(alp)

    kl = _sum_over_classes((alp - beta) * (dg1 - dg0)) + lnB + lnB_uni

    # The KL weight is fixed rather than annealed with global_step.
    annealing_coeff = 5.0
    B = annealing_coeff * kl

    # Apply detector mask and sum the loss components
    edl_loss = edl_scale * detectors_mask * (A + B)

    # EDL loss components
    exp_ce_loss_sum = tf.reduce_sum(detectors_mask * A)
    kl_loss_sum = tf.reduce_sum(detectors_mask * kl)
    akl_loss_sum = annealing_coeff * kl_loss_sum

    # ------------------------------------------------------------------
    # EDL metrics
    # ------------------------------------------------------------------
    preds = tf.cast(tf.argmax(pred_box_class_evidence, 4), 'int32')
    truth = tf.cast(matching_true_boxes[..., 4], 'int32')
    matchs = tf.cast(tf.equal(preds, truth), tf.float32)
    match = tf.boolean_mask(tf.expand_dims(matchs, 4), detectors_mask)
    acc = tf.reduce_mean(match)

    total_evidence = tf.reduce_sum(pred_box_class_evidence, 4, keepdims=True)
    total_evidence = tf.boolean_mask(total_evidence, detectors_mask)
    # The 1e-20 terms guard the divisions when there are no hits / misses.
    mean_ev_succ = tf.reduce_sum(
        total_evidence * match) / tf.reduce_sum(match + 1e-20)
    mean_ev_fail = tf.reduce_sum(
        total_evidence * (1 - match)) / (tf.reduce_sum(tf.abs(1 - match)) + 1e-20)

    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)
    edl_loss_sum = K.sum(edl_loss)
    total_loss = 0.5 * (confidence_loss_sum + edl_loss_sum +
                        coordinates_loss_sum + classification_loss_sum)

    return tf.stack([
        total_loss, confidence_loss_sum, classification_loss_sum,
        edl_loss_sum, coordinates_loss_sum, acc, mean_ev_succ, mean_ev_fail,
        annealing_coeff, exp_ce_loss_sum, kl_loss_sum, akl_loss_sum
    ])
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    """Build the YOLOv3 loss tensor for a three-layer model.

    `args` holds the model outputs followed by the matching ground-truth
    tensors; `anchors` is the (N, 2) array of anchor sizes. The returned
    tensor sums the xy, wh, confidence and class losses of every layer,
    each divided by the batch size.
    """
    # Equals 3 with the usual 9 anchors: 3 anchors per prediction layer.
    num_layers = len(anchors) // 3
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]

    # Anchor indices used by each prediction layer.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    label_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, label_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[l])[1:3], label_dtype)
        for l in range(num_layers)
    ]

    m = K.shape(yolo_outputs[0])[0]  # batch size (tensor)
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    def batch_mean(term):
        # Sum a per-cell loss term and normalise by the batch size.
        return K.sum(term) / mf

    loss = 0  # running total over the layers
    for l in range(num_layers):
        labels = y_true[l]
        selected_anchors = anchors[anchor_mask[l]]

        # Ground truth: 1 where an object exists, 0 otherwise.
        object_mask = labels[..., 4:5]
        # Ground truth: 1 for the class the object belongs to, 0 otherwise.
        true_class_probs = labels[..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], selected_anchors, num_classes, input_shape,
            calc_loss=True)
        # Join the xy and wh predictions into a single box tensor.
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Raw box targets used to compute the loss.
        raw_true_xy = labels[..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(
            labels[..., 2:4] / selected_anchors * input_shape[::-1])
        # avoid log(0)=-inf
        raw_true_wh = K.switch(
            object_mask, raw_true_wh, K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - labels[..., 2:3] * labels[..., 3:4]

        # Find the ignore mask by iterating over each batch element.
        object_mask_bool = K.cast(object_mask, 'bool')
        mask_buffer = tf.TensorArray(label_dtype, size=1, dynamic_size=True)

        def loop_body(b, buffer):
            true_box = tf.boolean_mask(
                labels[b, ..., 0:4], object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            buffer = buffer.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, buffer

        _, mask_buffer = tf.while_loop(
            lambda b, *_: b < m, loop_body, [0, mask_buffer])
        ignore_mask = K.expand_dims(mask_buffer.stack(), -1)

        # K.binary_crossentropy helps to avoid overflow in exp().
        objectness_ce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)

        xy_loss = batch_mean(
            object_mask * box_loss_scale * K.binary_crossentropy(
                raw_true_xy, raw_pred[..., 0:2], from_logits=True))
        wh_loss = batch_mean(
            object_mask * box_loss_scale * 0.5 * K.square(
                raw_true_wh - raw_pred[..., 2:4]))
        confidence_loss = batch_mean(
            object_mask * objectness_ce +
            (1 - object_mask) * objectness_ce * ignore_mask)
        class_loss = batch_mean(
            object_mask * K.binary_crossentropy(
                true_class_probs, raw_pred[..., 5:], from_logits=True))

        loss += xy_loss + wh_loss + confidence_loss + class_loss
        if print_loss:
            loss = tf.Print(loss, [
                loss, xy_loss, wh_loss, confidence_loss, class_loss,
                K.sum(ignore_mask)
            ], message='loss: ')
    return loss
def yolo_loss(args, anchors, num_classes, ignore_thresh=0.5, print_loss=False):
    """Return yolo_loss tensor

    Parameters
    ----------
    args: list of tensor, the model outputs followed by the y_true tensors
        (the output of preprocess_true_boxes)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object
        confidence loss
    print_loss: bool, log the loss components through tf.Print

    Returns
    -------
    loss: tensor, shape=(1,)
    """
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs, y_true = args[:num_layers], args[num_layers:]

    three_layer_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    two_layer_mask = [[3, 4, 5], [0, 1, 2]]
    anchor_mask = three_layer_mask if num_layers == 3 else two_layer_mask

    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32,
                         K.dtype(y_true[0]))
    grid_shapes = []
    for layer in range(num_layers):
        grid_hw = K.shape(yolo_outputs[layer])[1:3]
        grid_shapes.append(K.cast(grid_hw, K.dtype(y_true[0])))

    n_batch = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    n_batch_f = K.cast(n_batch, K.dtype(yolo_outputs[0]))

    loss = 0
    for layer in range(num_layers):
        target = y_true[layer]
        object_mask = target[..., 4:5]
        true_class_probs = target[..., 5:]

        head = yolo_head(yolo_outputs[layer], anchors[anchor_mask[layer]],
                         num_classes, input_shape, calc_loss=True)
        grid, raw_pred, pred_xy, pred_wh = head
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = target[..., :2] * grid_shapes[layer][::-1] - grid
        raw_true_wh = K.log(target[..., 2:4] / anchors[anchor_mask[layer]]
                            * input_shape[::-1])
        # `object_mask` has a last dimension of 1 while the two branches have
        # 2. An element-wise alternative would be tf.where on
        # tf.greater(K.concatenate([object_mask] * 2), 0).
        # The switch avoids log(0)=-inf for cells without an object.
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - target[..., 2:3] * target[..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def _loop_body(b, ignore_mask):
            has_object = object_mask_bool[b, ..., 0]
            true_box = tf.boolean_mask(target[b, ..., 0:4], has_object)
            overlaps = box_iou(pred_box[b], true_box)
            is_negative = K.max(overlaps, axis=-1) < ignore_thresh
            ignore_mask = ignore_mask.write(
                b, K.cast(is_negative, K.dtype(true_box)))
            return b + 1, ignore_mask

        def _loop_cond(b, *_):
            return b < n_batch

        _, ignore_mask = K.control_flow_ops.while_loop(
            _loop_cond, _loop_body, [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        xy_ce = K.binary_crossentropy(raw_true_xy, raw_pred[..., 0:2],
                                      from_logits=True)
        wh_sq = K.square(raw_true_wh - raw_pred[..., 2:4])
        obj_ce = K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                       from_logits=True)
        cls_ce = K.binary_crossentropy(true_class_probs, raw_pred[..., 5:],
                                       from_logits=True)

        xy_loss = K.sum(object_mask * box_loss_scale * xy_ce) / n_batch_f
        wh_loss = K.sum(
            object_mask * box_loss_scale * 0.5 * wh_sq) / n_batch_f
        confidence_loss = K.sum(
            object_mask * obj_ce
            + (1 - object_mask) * obj_ce * ignore_mask) / n_batch_f
        class_loss = K.sum(object_mask * cls_ce) / n_batch_f

        loss += xy_loss + wh_loss + confidence_loss + class_loss
        if print_loss:
            loss = tf.Print(
                loss,
                [loss, xy_loss, wh_loss, confidence_loss, class_loss,
                 K.sum(ignore_mask)],
                message='loss: ')

    # see: https://github.com/qqwweee/keras-yolo3/issues/129#issuecomment-408855511
    return K.expand_dims(loss, axis=0)
def get_updates(self, loss, params):
    """Build the list of update ops for one momentum-SGD training step.

    Learning rate, decay, momentum and clip range are looked up per
    parameter: `set_pattern_find` is called with the parameter name and the
    keys of the corresponding dict, and the ``'all'`` entry is used when it
    returns a falsy value.

    Args:
        loss: loss tensor passed to `self.get_gradients`.
        params: variables to update.

    Returns:
        `self.updates`, the list of update ops in the order they were added.
    """
    grads = self.get_gradients(loss, params)
    # first update the number of iterations
    self.updates = [K.update_add(self.iterations, 1)]

    if self.decay_epochs:
        # True when the iteration counter equals any entry of decay_epochs.
        ite_casted = K.cast(self.iterations, K.dtype(self.decay_epochs))
        hit_decay_epoch = K.any(K.equal(ite_casted, self.decay_epochs))
        # Multiply the global learning rate by its decay factor on a hit.
        lr = K.switch(hit_decay_epoch, self.lr['all']*self.decay['all'],
                      self.lr['all'])
        self.updates.append(K.update(self.lr['all'],lr))

    # One zero-initialised momentum accumulator per parameter.
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(s) for s in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        # --- learning rate for this parameter ---
        lrptrkey= set_pattern_find(p.name,self.lr.keys())
        if lrptrkey:
            if self.verbose>0:
                print("Setting different learning rate for ",
                      p.name, " : ", K.eval(self.lr[lrptrkey]))
            lr = self.lr[lrptrkey]
            dcptrkey=set_pattern_find(p.name,self.decay.keys())
            # NOTE(review): the lr update below is appended once per
            # matching parameter, so several parameters sharing one key add
            # several updates of the same variable - confirm intended.
            if self.decay_epochs and dcptrkey:
                # Parameter-specific decay factor.
                lr = K.switch(hit_decay_epoch, self.lr[lrptrkey]*self.decay[dcptrkey],
                              self.lr[lrptrkey])
                self.updates.append(K.update(self.lr[lrptrkey],lr))
                if self.verbose>0:
                    print("Added decay to ", p.name, ": ", K.eval(lr),",",self.decay[dcptrkey])
            elif self.decay_epochs:
                # Parameter-specific rate with the global decay factor.
                lr = K.switch(hit_decay_epoch, self.lr[lrptrkey]*self.decay['all'],self.lr[lrptrkey])
                self.updates.append(K.update(self.lr[lrptrkey],lr))
                if self.verbose>0:
                    print("Added decay to ", p.name, ": ", K.eval(lr),",",self.decay['all'])
            else:
                lr = self.lr[lrptrkey]
        else:
            lr = self.lr['all']

        # --- momentum for this parameter ---
        momptrkey = set_pattern_find(p.name,self.momentum.keys())
        if momptrkey:
            if self.verbose>0:
                print("Setting different momentum for ", p.name, " , ",
                      K.eval(self.momentum[momptrkey]))
            momentum = self.momentum[momptrkey]
        else:
            momentum = self.momentum['all']

        v = momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            # Same as p + momentum * v - lr * g.
            new_p = p + momentum * (momentum * m - lr * g) - lr * g
        else:
            # Same as p + v.
            new_p = p + momentum * m - lr * g

        # CHANGE CLIP
        # Keep the new value within +/- margin of the current value, where
        # margin is mean(|p * UPCLIP|).
        if self.UPCLIP:
            _to_tensor = K.tensorflow_backend._to_tensor
            _clip_by_val = K.tf.clip_by_value
            margin = K.mean(K.abs(p*K.constant(self.UPCLIP)))
            min_value = _to_tensor(p-margin, p.dtype.base_dtype)
            max_value = _to_tensor(p+margin, p.dtype.base_dtype)
            max_v = K.maximum(min_value, max_value)
            min_v = K.minimum(min_value, max_value)
            new_p = _clip_by_val(new_p, min_v, max_v)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        # Optional absolute clipping to the (low, high) pair for this name.
        clptrkey = set_pattern_find(p.name,self.clips.keys())
        if self.clips_val and clptrkey:
            if self.verbose>0:
                print("Clipping variable",p.name," to ", self.clips[clptrkey])
            c = K.eval(self.clips[clptrkey])
            new_p = K.clip(new_p, c[0], c[1])
        self.updates.append(K.update(p, new_p))
    return self.updates
def yolo4_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0,
               use_focal_loss=False, use_focal_obj_loss=False,
               use_softmax_loss=False, use_giou_loss=False,
               use_diou_loss=False):
    '''Return yolo4_loss tensor

    Parameters
    ----------
    args: list of tensor, the model outputs followed by the y_true tensors
        (the output of preprocess_true_boxes)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object
        confidence loss
    label_smoothing: when truthy, class targets go through _smooth_labels
    use_focal_loss: bool, focal loss for the classification term
    use_focal_obj_loss: bool, focal loss for the objectness term
    use_softmax_loss: bool, softmax instead of sigmoid classification
    use_giou_loss: bool, GIoU as location loss (checked before DIoU)
    use_diou_loss: bool, DIoU as location loss

    Returns
    -------
    loss: tensor, shape=(1,)
    '''
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]
    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [0, 1, 2]]

    y_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, y_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[i])[1:3], y_dtype)
        for i in range(num_layers)
    ]

    loss = 0
    total_location_loss = 0
    total_confidence_loss = 0
    total_class_loss = 0
    batch = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    batch_f = K.cast(batch, K.dtype(yolo_outputs[0]))

    for i in range(num_layers):
        y_layer = y_true[i]
        object_mask = y_layer[..., 4:5]
        true_class_probs = y_layer[..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[i], anchors[anchor_mask[i]], num_classes,
            input_shape, calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = y_layer[..., :2] * grid_shapes[i][::-1] - grid
        raw_true_wh = K.log(
            y_layer[..., 2:4] / anchors[anchor_mask[i]] * input_shape[::-1])
        raw_true_wh = K.switch(
            object_mask, raw_true_wh,
            K.zeros_like(raw_true_wh))  # avoid log(0)=-inf
        box_loss_scale = 2 - y_layer[..., 2:3] * y_layer[..., 3:4]

        # Find ignore mask, iterate over each of batch.
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, collected):
            true_box = tf.boolean_mask(
                y_layer[b, ..., 0:4], object_mask_bool[b, ..., 0])
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            flag = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, collected.write(b, flag)

        _, collected = tf.while_loop(
            lambda b, *_: b < batch, loop_body,
            [0, tf.TensorArray(y_dtype, size=1, dynamic_size=True)])
        ignore_mask = K.expand_dims(collected.stack(), -1)

        # ---- objectness ----
        if use_focal_obj_loss:
            # Focal loss for objectness confidence
            confidence_loss = sigmoid_focal_loss(object_mask,
                                                 raw_pred[..., 4:5])
        else:
            obj_ce = K.binary_crossentropy(
                object_mask, raw_pred[..., 4:5], from_logits=True)
            confidence_loss = object_mask * obj_ce + \
                (1 - object_mask) * obj_ce * ignore_mask

        # ---- classification ----
        class_logits = raw_pred[..., 5:]
        if use_focal_loss and use_softmax_loss:
            class_loss = softmax_focal_loss(true_class_probs, class_logits)
        elif use_focal_loss:
            class_loss = sigmoid_focal_loss(true_class_probs, class_logits)
        elif use_softmax_loss:
            # use softmax style classification output
            class_loss = object_mask * K.expand_dims(
                K.categorical_crossentropy(
                    true_class_probs, class_logits, from_logits=True),
                axis=-1)
        else:
            # use sigmoid style classification output
            class_loss = object_mask * K.binary_crossentropy(
                true_class_probs, class_logits, from_logits=True)

        # ---- location ----
        if use_giou_loss:
            # Calculate GIoU loss as location loss
            raw_true_box = y_layer[..., 0:4]
            giou = box_giou(pred_box, raw_true_box)
            giou_loss = object_mask * box_loss_scale * (1 - giou)
            location_loss = K.sum(giou_loss) / batch_f
        elif use_diou_loss:
            # Calculate DIoU loss as location loss
            raw_true_box = y_layer[..., 0:4]
            diou = box_diou(pred_box, raw_true_box)
            diou_loss = object_mask * box_loss_scale * (1 - diou)
            location_loss = K.sum(diou_loss) / batch_f
        else:
            # Standard YOLO location loss
            # K.binary_crossentropy is helpful to avoid exp overflow.
            xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
                raw_true_xy, raw_pred[..., 0:2], from_logits=True)
            wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
                raw_true_wh - raw_pred[..., 2:4])
            location_loss = K.sum(xy_loss) / batch_f + K.sum(wh_loss) / batch_f

        confidence_loss = K.sum(confidence_loss) / batch_f
        class_loss = K.sum(class_loss) / batch_f

        loss += location_loss + confidence_loss + class_loss
        total_location_loss += location_loss
        total_confidence_loss += confidence_loss
        total_class_loss += class_loss

    # Fit for tf 2.0.0 loss shape
    loss = K.expand_dims(loss, axis=-1)
    return loss  # , total_location_loss, total_confidence_loss, total_class_loss
def _model_head(self, feats, anchors, num_classes, input_shape, batch_size,
                calc_loss=False, verbose=False):
    """Decode one raw detection feature map into box parameters.

    No score threshold or NMS is applied here.

    Args:
        feats: raw output tensor of one detection layer. Axes 1 and 2 are
            read as grid height and width (e.g. shape (N, 13, 13, 255)
            according to the original notes -- TODO confirm with caller).
        anchors: anchor sizes for this layer, reshaped to (..., 2);
            presumably (width, height) pairs.
        num_classes: number of classes.
        input_shape: network input shape; it is reversed before being used
            to normalise the box sizes.
        batch_size: extra dimension inserted right after the leading ``-1``
            when ``feats`` is reshaped.
        calc_loss: when exactly ``True``, return the tensors used by the
            loss instead of the decoded predictions.
        verbose: when exactly ``True``, additionally return the raw logits
            (values before sigmoid/exp).

    Returns:
        ``(grid, reshaped_feats, box_xy, box_wh)`` if ``calc_loss is True``;
        ``(box_xy, box_wh, box_confidence, box_class_probs,
        box_coord_logits, box_confidence_logits, box_class_probs_logits)``
        if ``verbose is True``; otherwise
        ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    anchor_count = len(anchors)
    # Anchors broadcast against (batch, height, width, num_anchors, 2).
    anchor_grid = K.reshape(K.constant(anchors), [1, 1, 1, anchor_count, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    rows = grid_shape[0]
    cols = grid_shape[1]

    # Per-cell (x, y) offsets.
    grid_y = K.tile(
        K.reshape(K.arange(0, stop=rows), [-1, 1, 1, 1]),
        [1, cols, 1, 1])
    grid_x = K.tile(
        K.reshape(K.arange(0, stop=cols), [1, -1, 1, 1]),
        [rows, 1, 1, 1])
    feat_dtype = K.dtype(feats)
    grid = K.cast(K.concatenate([grid_x, grid_y]), feat_dtype)

    raw = K.reshape(
        feats,
        [-1, batch_size, rows, cols, anchor_count, num_classes + 5])

    # Shift centres by the cell offset and scale sizes by the anchors, then
    # normalise by the (reversed) grid shape and input shape respectively.
    box_xy = (K.sigmoid(raw[..., :2]) + grid) / \
        K.cast(grid_shape[::-1], feat_dtype)
    box_wh = K.exp(raw[..., 2:4]) * anchor_grid / \
        K.cast(input_shape[::-1], feat_dtype)
    box_confidence = K.sigmoid(raw[..., 4:5])
    box_class_probs = K.sigmoid(raw[..., 5:])

    if calc_loss is True:
        return grid, raw, box_xy, box_wh

    decoded = (box_xy, box_wh, box_confidence, box_class_probs)
    if verbose is True:
        # Logits BEFORE the sigmoid/exp activations.
        return decoded + (raw[..., :4], raw[..., 4:5], raw[..., 5:])
    return decoded
def call(self, inputs):
    """Apply the two-layer feed-forward transform to ``inputs``.

    Inputs that are not already float32 are cast first. The result is
    ``relu(inputs . weights_inner + bais_inner) . weights_out + bais_out``.
    """
    x = inputs if K.dtype(inputs) == 'float32' else K.cast(inputs, 'float32')
    hidden = K.relu(K.dot(x, self.weights_inner) + self.bais_inner)
    return K.dot(hidden, self.weights_out) + self.bais_out
def get_updates(self, loss, params):
    """Build the list of update ops for one Adam-style optimisation step.

    Args:
        loss: loss tensor passed to ``self.get_gradients``.
        params: list of variables to optimise.

    Returns:
        ``self.updates``: counter increments followed by, per parameter,
        the moment updates (and ``vhat`` when ``self.amsgrad`` is set) and
        the parameter update.

    Side effects: sets ``self.updates``, ``self.weights``, ``self.lr_t``,
    ``self._init_notified`` and, when cosine annealing is active,
    ``self.eta_t``.
    """
    grads = self.get_gradients(loss, params)
    # Both the global step and the current-cycle step advance by one.
    self.updates = [K.update_add(self.iterations, 1)]
    self.updates.append(K.update_add(self.t_cur, 1))

    lr = self.learning_rate
    if self.initial_decay > 0:
        # Inverse-time decay of the base learning rate.
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Adam bias correction folded into the step size.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    # First and second moment slots, one per parameter.
    ms = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    vs = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    if self.amsgrad:
        vhats = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
            for (i, p) in enumerate(params)
        ]
    else:
        # Size-1 placeholders so the zip() below still lines up.
        vhats = [
            K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
        ]
    self.weights = [self.iterations] + ms + vs + vhats
    total_iterations = self.total_iterations

    # Cosine annealing
    if self.use_cosine_annealing and total_iterations != 0:
        self.eta_t = _compute_eta_t(self)
    # NOTE(review): reads self.eta_t even when the branch above is skipped,
    # so it presumably has to be initialised elsewhere -- confirm.
    self.lr_t = lr_t * self.eta_t  # for external tracking

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers
        # NOTE(review): lr_t is rebound here and carried into the next
        # iteration; if the helper returns a scaled copy of its input, the
        # multipliers would compound across parameters -- confirm intent.
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            # AMSGrad: normalise by the running maximum of v.
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Weight decays
        if p.name in self.weight_decays.keys() and total_iterations != 0:
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    self._init_notified = True
    return self.updates
def yolo_loss(args, anchors, num_classes, rescore_confidence=False,
              print_loss=False):
    """Squared-error YOLO loss over confidence, class and box coordinates.

    Parameters
    ----------
    args : sequence of four tensors
        ``(yolo_output, true_boxes, detectors_mask, matching_true_boxes)``:
        the final convolutional features, the ground-truth boxes (reshaped
        here from 3 to 6 dimensions; columns 0:2 are read as centre and 2:4
        as size), the 0/1 mask of detector positions that own a ground
        truth, and the ground-truth box assigned to each such position
        (columns 0:4 box, column 4 class index).
    anchors : sequence
        Anchor boxes; only its length is used here, the values are
        forwarded to ``yolo_head``.
    num_classes : int
        Number of object classes.
    rescore_confidence : bool, default=False
        If true, the confidence target for matched detectors is the best
        IOU with any ground-truth box instead of 1.
    print_loss : bool, default=False
        If true, wrap the result in ``tf.Print`` to log the components.

    Returns
    -------
    total_loss : tensor
        0.5 * (confidence + classification + coordinate) summed loss.
    """
    yolo_output, true_boxes, detectors_mask, matching_true_boxes = args
    num_anchors = len(anchors)

    # Relative weights of the loss components.
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    coordinates_scale = 1

    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_head(
        yolo_output, anchors, num_classes)

    # Unadjusted box predictions for the coordinate loss.
    # TODO: Remove extra computation shared with yolo_head.
    output_shape = K.shape(yolo_output)
    feats = K.reshape(
        yolo_output,
        [-1, output_shape[1], output_shape[2], num_anchors,
         num_classes + 5])
    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

    # TODO: Adjust predictions by image width/height for non-square images?
    # IOUs may be off due to different aspect ratio.

    # Insert a num_true_boxes axis so predictions broadcast against every
    # ground-truth box:
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)
    pred_half = pred_wh / 2.
    pred_mins = pred_xy - pred_half
    pred_maxes = pred_xy + pred_half

    gt_shape = K.shape(true_boxes)
    true_boxes = K.reshape(
        true_boxes, [gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # IOU of each predicted box with each ground-truth box.
    true_half = true_wh / 2.
    true_mins = true_xy - true_half
    true_maxes = true_xy + true_half

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOU over the ground-truth axis, with a trailing unit axis.
    best_ious = K.expand_dims(K.max(iou_scores, axis=4))

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # TODO: Darknet region training includes extra coordinate loss for early
    # training steps to encourage predictions to match anchor priors.

    # Confidence loss (squared error, not binary cross-entropy).
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)

    confidence_target = best_ious if rescore_confidence else 1
    objects_loss = (object_scale * detectors_mask *
                    K.square(confidence_target - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections (squared error, not
    # categorical cross-entropy).
    matching_classes = K.one_hot(
        K.cast(matching_true_boxes[..., 4], 'int32'), num_classes)
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_prob))

    # Coordinate loss for matching detection boxes.
    matching_boxes = matching_true_boxes[..., 0:4]
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))

    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)
    total_loss = 0.5 * (
        confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)

    if print_loss:
        total_loss = tf.Print(
            total_loss,
            [total_loss, confidence_loss_sum, classification_loss_sum,
             coordinates_loss_sum],
            message='yolo_loss, conf_loss, class_loss, box_coord_loss:')
    return total_loss
def call(self, inputs):
    """Pool ``inputs`` over axes (1, 2) and squash with a rescaled sigmoid.

    The mean over axes 1 and 2 goes through ``sigmoid(a * (mean - b))`` and
    is then shifted and scaled so that a mean of 0 maps to 0 and a mean of
    1 maps to 1.
    """
    a = K.cast(self.a, dtype=K.dtype(inputs))
    pooled = K.mean(inputs, axis=(1, 2))
    at_zero = K.sigmoid(-a * self.b)          # sigmoid value for mean == 0
    at_one = K.sigmoid(a * (1. - self.b))     # sigmoid value for mean == 1
    return (K.sigmoid(a * (pooled - self.b)) - at_zero) / (at_one - at_zero)
def get_updates(self, loss, params):
    """Build the update ops: noisy inner steps plus a gated Adam outer step.

    ``params`` plays the role of the inner iterate ``x_prime``; copies of
    it are kept as the outer iterate ``x`` and the running average ``mu``.
    Most updates are gated on ``self.state_counter`` with ``K.switch`` so
    that the Adam step on ``x`` is only committed when the counter equals
    ``self.L + 1``.

    Args:
        loss: loss tensor passed to ``self.get_gradients``.
        params: list of variables to optimise.

    Returns:
        ``self.updates``, the list of update ops.
    """
    self.updates = []
    # Three counters advance on every call.
    self.updates.append(K.update_add(self.state_counter, 1))
    self.updates.append(K.update_add(self.iterator, 1))
    self.updates.append(K.update_add(self.iterations, 1))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr = self.lr
    if self.initial_decay > 0:
        # Inverse-time decay of the base learning rate.
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
    # Adam bias correction folded into the step size.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    shapes = [K.int_shape(p) for p in params]
    # x and mu start out as copies of the parameters.
    x = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]
    mu = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]

    grads = self.get_gradients(loss, params)

    # Adam moment slots, one per parameter.
    ms = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    vs = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    if self.amsgrad:
        vhats = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
            for (i, p) in enumerate(params)
        ]
    else:
        # Size-1 placeholders so the zip() below still lines up.
        vhats = [
            K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
        ]

    for x_i, x_prime_i, mu_i, g, m, v, vhat in zip(x, params, mu, grads, ms,
                                                   vs, vhats):
        # Inner (Langevin) step on x_prime: gradient plus a pull towards x_i
        # weighted by gamma, plus Gaussian noise. When state_counter == 0 or
        # iterator == num_steps, x_prime is reset to x_i instead.
        dx_prime_i = g - self.gamma * (x_i - x_prime_i)
        x_prime_update_i = K.switch(
            K.any(K.stack([
                K.equal(self.state_counter, 0),
                K.equal(self.num_steps, self.iterator)
            ], axis=0), axis=0),
            x_i,
            x_prime_i - self.sgld_step * dx_prime_i +
            K.sqrt(self.sgld_step) * self.sgld_noise *
            K.random_normal(K.int_shape(x_prime_i)))

        # Apply constraints.
        if getattr(x_prime_i, 'constraint', None) is not None:
            x_prime_update_i = x_prime_i.constraint(x_prime_update_i)
        self.updates.append(K.update(x_prime_i, x_prime_update_i))

        # mu is an exponential moving average of x_prime (weight alpha);
        # it is reset to x_i when state_counter == 0.
        mu_update_i = K.switch(K.equal(self.state_counter, 0), x_i,
                               (1 - self.alpha) * mu_i +
                               self.alpha * x_prime_i)
        self.updates.append(K.update(mu_i, mu_update_i))

        # x is only changed when state_counter == L + 1; on every other
        # step the switches below write the old values back.
        # Per the original notes, gamma is left out of this update term
        # because it interferes with the learning-rate annealing.
        # Adam update, using (x - mu) in place of the gradient.
        gradient = (x_i - mu_i)
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * gradient
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(gradient)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            x_i_t = x_i - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(
                K.update(
                    vhat,
                    K.switch(K.equal(self.state_counter, self.L + 1),
                             vhat_t, vhat)))
        else:
            x_i_t = x_i - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(
            K.update(
                m,
                K.switch(K.equal(self.state_counter, self.L + 1), m_t, m)))
        self.updates.append(
            K.update(
                v,
                K.switch(K.equal(self.state_counter, self.L + 1), v_t, v)))
        new_x_i = x_i_t
        x_i_update = K.switch(K.equal(self.state_counter, self.L + 1),
                              new_x_i, x_i)
        self.updates.append(K.update(x_i, x_i_update))

    # Gamma scoping: gamma is multiplied by (1 + scoping) on every step
    # EXCEPT when state_counter == L + 1.
    # NOTE(review): the original notes say gamma is rescaled "after each
    # update", which reads like the opposite branch order -- confirm.
    gamma_update = K.switch(K.equal(self.state_counter, self.L + 1),
                            self.gamma, self.gamma * (1. + self.scoping))
    self.updates.append(K.update(self.gamma, gamma_update))

    # Wrap the state counter back to 0 once it reaches L + 2.
    counter = K.switch(K.equal(self.state_counter, self.L + 2),
                       K.constant(0, dtype='int64'), self.state_counter)
    self.updates.append(K.update(self.state_counter, counter))
    return self.updates
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5):
    '''Return the squared-error YOLO loss summed over three output scales.

    Parameters
    ----------
    args: sequence; the first three items are the yolo_body outputs and
        the remaining items are the matching y_true tensors (output of
        preprocess_true_boxes, per the original notes)
    anchors: array indexed with the per-scale anchor masks; presumably
        shape=(T, 2), wh
    num_classes: integer
    ignore_thresh: float, predictions whose best IoU with a true box is
        at or above this value get no no-object confidence loss

    Returns
    -------
    loss: tensor, the summed loss divided by the batch size

    '''
    yolo_outputs = args[:3]
    y_true = args[3:]
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    true_dtype = K.dtype(y_true[0])

    # First output's grid size * 32 is taken as the network input size.
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
        for idx in range(3)
    ]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size (tensor)

    for layer, mask in enumerate(anchor_mask):
        layer_true = y_true[layer]
        object_mask = layer_true[..., 4:5]
        true_class_probs = layer_true[..., 5:]

        pred_xy, pred_wh, pred_confidence, pred_class_probs = yolo_head(
            yolo_outputs[layer], anchors[mask], num_classes, input_shape)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet box loss.
        xy_delta = (layer_true[..., :2] - pred_xy) * grid_shapes[layer][::-1]
        wh_delta = K.log(layer_true[..., 2:4]) - K.log(pred_wh)
        # Zero the delta where there is no object: avoids log(0) = -inf.
        wh_delta = K.switch(object_mask, wh_delta, K.zeros_like(wh_delta))
        box_delta = K.concatenate([xy_delta, wh_delta], axis=-1)
        box_delta_scale = 2 - layer_true[..., 2:3] * layer_true[..., 3:4]

        # Build the ignore mask one batch element at a time.
        ignore_mask = tf.TensorArray(true_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *args: b < m, loop_body, [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        box_loss = object_mask * K.square(box_delta * box_delta_scale)
        confidence_loss = object_mask * K.square(1 - pred_confidence) + \
            (1 - object_mask) * K.square(0 - pred_confidence) * ignore_mask
        class_loss = object_mask * K.square(
            true_class_probs - pred_class_probs)
        loss += K.sum(box_loss) + K.sum(confidence_loss) + K.sum(class_loss)

    return loss / K.cast(m, K.dtype(loss))
def yolo_loss(args, anchors, num_seen, ignore_thresh=.5, plus=False):
    """Return the YOLO loss with an embedding term instead of a class term.

    Parameters
    ----------
    args: [*yolo_outputs, *y_true, y_embedding]
        yolo_outputs: the first ``len(anchors) // 3`` items
        y_true: the items between the outputs and the last one (output of
            preprocess_true_boxes, per the original notes)
        y_embedding: the last item, the class embeddings
    anchors: array indexed with the per-scale anchor masks; presumably
        shape=(N, 2), wh
    num_seen: integer, number of leading classes/embeddings used in the
        embedding loss
    ignore_thresh: float, predictions whose best IoU with a true box is
        at or above this value get no no-object confidence loss
    plus: forwarded to yolo_head

    Returns
    -------
    loss: tensor, sum of xy, wh, objectness and embedding losses, each
        divided by the batch size

    """
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:-1]
    embeddings = args[-1]
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    true_dtype = K.dtype(y_true[0])

    # First output's grid size * 32 is taken as the network input size.
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
        for idx in range(num_layers)
    ]
    loss = 0.
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    # Insert three unit axes at position 1 of the embeddings tensor.
    for _ in range(3):
        embeddings = K.expand_dims(embeddings, 1)

    for layer in range(num_layers):
        layer_true = y_true[layer]
        layer_anchors = anchors[anchor_mask[layer]]
        object_mask = layer_true[..., 4:5]
        true_class_probs = layer_true[..., 5:]

        grid, raw_pred, pred_xy, pred_wh, pred_embedding = \
            yolo_head(yolo_outputs[layer], layer_anchors, input_shape,
                      calc_loss=True, plus=plus)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = layer_true[..., :2] * grid_shapes[layer][::-1] - grid
        raw_true_wh = K.log(
            layer_true[..., 2:4] / layer_anchors * input_shape[::-1])
        # Zero where there is no object: avoids log(0) = -inf.
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - layer_true[..., 2:3] * layer_true[..., 3:4]

        # Build the ignore mask one batch element at a time.
        ignore_mask = tf.TensorArray(true_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, mask):
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            mask = mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, mask

        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *arg: b < m, loop_body, [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        raw_pred_xy = raw_pred[..., 0:2]
        raw_pred_wh = raw_pred[..., 2:4]
        raw_pred_objectness = raw_pred[..., 4:]
        raw_pred_embedding = pred_embedding

        # Rescale the relation to [0, 1].
        true_relation = 0.5 * (
            class_relation(true_class_probs, embeddings) + 1)

        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred_xy, True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred_wh)

        # Objectness target is the object mask scaled by the relation;
        # no-object cells only count where the ignore mask is 1.
        objectness_bce = K.binary_crossentropy(
            object_mask * true_relation, raw_pred_objectness, True)
        object_loss = object_mask * objectness_bce + \
            (1 - object_mask) * objectness_bce * ignore_mask

        embedding_loss = object_mask * category_loss(
            embeddings[..., :num_seen, :], raw_pred_embedding,
            true_class_probs[..., :num_seen])

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        object_loss = K.sum(object_loss) / mf
        embedding_loss = K.sum(embedding_loss) / mf
        loss += xy_loss + wh_loss + object_loss + embedding_loss

    return loss
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features (channels last assumed).
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_confidence : tensor
        Sigmoid of the objectness logit of each box.
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_class_probs : tensor
        Softmax over the class logits of each box.

    Note that the confidence comes FIRST in the returned tuple.
    """
    num_anchors = len(anchors)
    # Anchors broadcast against (batch, height, width, num_anchors, 2).
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic conv dims so the model stays fully convolutional.
    # TODO: Remove or add option for a static implementation.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    height = conv_dims[0]
    width = conv_dims[1]

    # In YOLO the height index is the inner most iteration.
    height_index = K.tile(K.arange(0, stop=height), [width])
    # TODO: Repeat_elements and tf.split doesn't support dynamic splits.
    width_index = K.tile(
        K.expand_dims(K.arange(0, stop=width), 0), [height, 1])
    width_index = K.flatten(K.transpose(width_index))

    conv_index = K.transpose(K.stack([height_index, width_index]))
    conv_index = K.reshape(conv_index, [1, height, width, 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    # The five axes of feats after this reshape are: image index, height,
    # width, anchor index, and (xy(2), wh(2), objectness(1), class scores).
    feats = K.reshape(
        feats, [-1, height, width, num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    box_confidence = K.sigmoid(feats[..., 4:5])
    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_confidence, box_xy, box_wh, box_class_probs
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Return the YOLO loss computed on raw (pre-activation) predictions.

    Parameters
    ----------
    args: sequence; the first ``len(anchors) // 3`` items are the outputs
        of yolo_body or tiny_yolo_body, the rest are the y_true tensors
        (output of preprocess_true_boxes, per the original notes)
    anchors: array indexed with the per-scale anchor masks; presumably
        shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, predictions whose best IoU with a true box is
        at or above this value get no no-object confidence loss
    print_loss: bool, if true wrap the running loss in tf.Print after
        every scale

    Returns
    -------
    loss: tensor, sum over scales of xy, wh, confidence and class losses,
        each divided by the batch size

    '''
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]
    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]
    true_dtype = K.dtype(y_true[0])

    # First output's grid size * 32 is taken as the network input size
    # (e.g. 13 * 32 = 416).
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
        for idx in range(num_layers)
    ]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for layer in range(num_layers):
        layer_true = y_true[layer]
        layer_anchors = anchors[anchor_mask[layer]]
        object_mask = layer_true[..., 4:5]       # objectness target
        true_class_probs = layer_true[..., 5:]   # class targets

        # yolo_head decodes the raw offsets into boxes; the decoded boxes
        # are only used for the IoU-based ignore mask, while the loss
        # itself is computed on the raw offsets.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[layer], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss: convert the true box centre
        # and size back into the raw offset space.
        raw_true_xy = layer_true[..., :2] * grid_shapes[layer][::-1] - grid
        raw_true_wh = K.log(
            layer_true[..., 2:4] / layer_anchors * input_shape[::-1])
        # Zero where there is no object: avoids log(0) = -inf.
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        # (2 - box area): keeps large boxes from dominating small ones.
        box_loss_scale = 2 - layer_true[..., 2:3] * layer_true[..., 3:4]

        # Build the ignore mask one batch element at a time, in a
        # dynamically sized TensorArray.
        ignore_mask = tf.TensorArray(true_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # Keep only the rows of image b that hold a true box.
            true_box = tf.boolean_mask(layer_true[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # IoU of every predicted box with every true box of image b.
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            # 1 where the best IoU is below the threshold, i.e. the
            # prediction is treated as containing no object.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Loop starts from b = 0 with the empty TensorArray.
        _, ignore_mask = K.control_flow_ops.while_loop(
            lambda b, *args: b < m, loop_body, [0, ignore_mask])
        # Trailing unit axis so the mask broadcasts in the loss below.
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        # Box losses are only counted for cells that contain an object.
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])

        # Confidence loss covers object cells and, where the ignore mask
        # is 1, no-object cells.
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = object_mask * objectness_bce + \
            (1 - object_mask) * objectness_bce * ignore_mask

        # Class loss is only counted for cells that contain an object.
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss

        if print_loss:
            loss = tf.Print(
                loss,
                [loss, xy_loss, wh_loss, confidence_loss, class_loss,
                 K.sum(ignore_mask)],
                message='loss: ')

    return loss
def _get_anchor_negative_triplet_mask(self, y_true: Tensor,
                                      pairwise_dist: Tensor) -> Tensor:
    """Return a mask that is 1 where two labels differ and 0 elsewhere.

    ``y_true`` is compared against itself along two newly inserted axes
    (0 and 1), and the boolean result is cast to the dtype of
    ``pairwise_dist``. Presumably entry [i, j] marks j as a valid negative
    for anchor i -- confirm against the caller.
    """
    labels_across = K.expand_dims(y_true, 0)
    labels_down = K.expand_dims(y_true, 1)
    labels_differ = K.not_equal(labels_across, labels_down)
    return K.cast(labels_differ, K.dtype(pairwise_dist))
def get_updates(self, loss, params):
    """Build the update ops for one rectified-Adam style step.

    Args:
        loss: loss tensor passed to ``self.get_gradients``.
        params: list of variables to optimise.

    Returns:
        ``self.updates``: the iteration increment followed by, per
        parameter, the ``vhat`` update (only with ``self.amsgrad``), the
        two moment updates and the parameter update.

    Side effects: sets ``self.updates`` and ``self.weights``.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        # Inverse-time decay of the base learning rate.
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    if self.initial_total_steps > 0:
        # Linear warmup up to warmup_steps, then linear decay towards
        # min_lr over the remaining steps.
        warmup_steps = self.total_steps * self.warmup_proportion
        decay_steps = K.maximum(self.total_steps - warmup_steps, 1)
        decay_rate = (self.min_lr - lr) / decay_steps
        lr = K.switch(
            t <= warmup_steps,
            lr * (t / warmup_steps),
            lr + decay_rate * K.minimum(t - warmup_steps, decay_steps),
        )

    def _param_shaped_slots(prefix):
        # One zero-initialised slot per parameter, same shape and dtype.
        return [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name=prefix + str(i))
            for (i, p) in enumerate(params)
        ]

    ms = _param_shaped_slots('m_')
    vs = _param_shaped_slots('v_')
    if self.amsgrad:
        vhats = _param_shaped_slots('vhat_')
    else:
        # Size-1 placeholders so the zip() below still lines up.
        vhats = [
            K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
        ]

    self.weights = [self.iterations] + ms + vs + vhats

    beta_1_t = K.pow(self.beta_1, t)
    beta_2_t = K.pow(self.beta_2, t)

    # Length of the approximated simple moving average (limit and at t).
    sma_inf = 2.0 / (1.0 - self.beta_2) - 1.0
    sma_t = sma_inf - 2.0 * t * beta_2_t / (1.0 - beta_2_t)

    for param, grad, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad)

        # Bias-corrected moments.
        m_corr_t = m_t / (1.0 - beta_1_t)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            v_corr_t = K.sqrt(vhat_t / (1.0 - beta_2_t))
            self.updates.append(K.update(vhat, vhat_t))
        else:
            v_corr_t = K.sqrt(v_t / (1.0 - beta_2_t))

        # Rectification term; only used when sma_t >= 5, otherwise the
        # step falls back to the bias-corrected first moment alone.
        rectifier = (sma_t - 4.0) / (sma_inf - 4.0) * \
            (sma_t - 2.0) / (sma_inf - 2.0) * sma_inf / sma_t
        r_t = K.sqrt(rectifier)
        step = K.switch(sma_t >= 5,
                        r_t * m_corr_t / (v_corr_t + self.epsilon),
                        m_corr_t)

        if self.initial_weight_decay > 0:
            step += self.weight_decay * param

        stepped = param - lr * step

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        new_param = stepped
        if getattr(param, 'constraint', None) is not None:
            new_param = param.constraint(new_param)

        self.updates.append(K.update(param, new_param))

    return self.updates
def on_epoch_end(self, epoch, logs=None):
    """Print the optimizer's learning rate after inverse-time decay.

    Reads ``lr``, ``decay`` and ``iterations`` from ``self.model.optimizer``
    and prints the evaluated value of ``lr / (1 + decay * iterations)``.

    Args:
        epoch: index of the epoch that just ended (unused).
        logs: metrics dict supplied by the caller (unused).
    """
    optimizer = self.model.optimizer
    lr = optimizer.lr
    decay = optimizer.decay
    iterations = optimizer.iterations
    lr_with_decay = lr / (1. + decay * K.cast(iterations, K.dtype(decay)))
    # Call form instead of the Python 2 print statement: a single-argument
    # print(...) produces the same output on Python 2 and Python 3, whereas
    # the statement form is a SyntaxError on Python 3.
    print(K.eval(lr_with_decay))
def obj_detection_loss_by_parts(y_true, y_pred):
    """Return the detection loss split into its three components.

    Both inputs are reshaped to
    ``[N, conv_height, conv_width, 1, -1]`` using the module-level
    ``conv_height`` / ``conv_width``. After the reshape, ``y_true`` holds
    the detector mask in channel 0 followed by the matching true box
    (3 coordinate values, then class targets); ``y_pred`` holds the
    confidence logit in channel 0, 3 box values in channels 1:4 and class
    logits from channel 4 on.

    Args:
        y_true: packed detector mask and matching true boxes.
        y_pred: raw network output (values before the final activations).

    Returns:
        Tuple ``(0.5 * confidence_loss, 0.5 * classification_loss,
        0.5 * coordinates_loss)``, each reduced over the last four axes.
    """
    max_boxes = 20
    # Relative weights of the loss components.
    object_scale = 5.
    no_object_scale = 1.
    class_scale = 1.
    coordinates_scale = 1.
    reduce_axes = (-4, -3, -2, -1)

    N = K.shape(y_true)[0]  # number of samples in batch

    # Retrieve the detectors_mask and matching_true_boxes from y_true.
    masks_and_true_boxes = K.reshape(
        y_true, [N, conv_height, conv_width, 1, -1])
    detectors_mask = masks_and_true_boxes[..., 0:1]
    matching_true_boxes = masks_and_true_boxes[..., 1:]

    # Reshape y_pred as well; these are the "t" parameters, i.e. values
    # before the final activations.
    t_pred = K.reshape(y_pred, [N, conv_height, conv_width, 1, -1])

    # Loss related to classification.
    matching_classes = matching_true_boxes[..., 3:]
    y_pred_class = K.softmax(t_pred[..., 4:])
    classification_loss = K.sum(
        class_scale * detectors_mask *
        K.square(matching_classes - y_pred_class),
        axis=reduce_axes)

    # Loss related to coordinates.
    matching_box_coord = matching_true_boxes[..., :3]
    y_pred_coord = t_pred[..., 1:4]
    coordinates_loss = K.sum(
        coordinates_scale * detectors_mask *
        K.square(matching_box_coord - y_pred_coord),
        axis=reduce_axes)

    # Box tensor whose 2nd dimension lists the individual boxes, reshaped
    # so it broadcasts against every prediction.
    boxes = inv_preprocess_true_boxes(
        detectors_mask, matching_true_boxes, conv_index, conv_dims,
        max_boxes=max_boxes)
    boxes_shape = K.shape(boxes)
    boxes = K.reshape(
        boxes, [boxes_shape[0], 1, 1, 1, boxes_shape[1], boxes_shape[2]])

    pred_xy, pred_r = transform_predicted_from_t_to_actual(
        t_pred, conv_index, conv_dims)
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_r = K.expand_dims(pred_r, 4)
    true_xy, true_r = boxes[..., 0:2], boxes[..., 2:3]

    # Find IOU of each predicted box with each ground truth box. Boxes are
    # squares of half-side r centred on xy.
    pred_mins = pred_xy - pred_r
    pred_maxes = pred_xy + pred_r
    true_mins = true_xy - true_r
    true_maxes = true_xy + true_r

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = 4. * pred_r[..., 0] * pred_r[..., 0]  # a square
    true_areas = 4. * true_r[..., 0] * true_r[..., 0]
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOU over the ground-truth axis, with a trailing unit axis.
    best_ious = K.expand_dims(K.max(iou_scores, axis=4))

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(
        -K.sigmoid(t_pred[..., 0:1]))
    objects_loss = object_scale * detectors_mask * K.square(
        1 - K.sigmoid(t_pred[..., 0:1]))
    confidence_loss = K.sum(objects_loss + no_objects_loss, axis=reduce_axes)

    # The combined total was previously computed here but never used; only
    # the three halved components are returned.
    return (0.5 * confidence_loss,
            0.5 * classification_loss,
            0.5 * coordinates_loss)
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False):
    """Return the YOLO training loss (CIoU location + confidence + class).

    This function relies on helpers that are not defined in it:
    ``_smooth_labels``, ``box_iou``, ``box_ciou`` and a ``yolo_head`` that
    accepts ``(feats, anchors, num_classes, input_shape, calc_loss=True)`` and
    returns ``(grid, raw_pred, pred_xy, pred_wh)``.

    Parameters
    ----------
    args : list of tensor
        The first ``num_layers`` entries are the network outputs, the
        remaining entries are the matching ``y_true`` tensors, where
        ``num_layers = len(anchors) // 3``.
    anchors : array
        Anchor boxes; indexed with a per-layer mask of three anchors.
    num_classes : int
        Number of object classes.
    ignore_thresh : float
        A non-object prediction whose best IOU with any true box of the same
        image is at or above this value is left out of the confidence loss.
    label_smoothing : float
        When truthy, class targets are passed through ``_smooth_labels``.
    print_loss : bool
        Currently unused; kept for interface compatibility.

    Returns
    -------
    loss : tensor
        Sum over all layers of location, confidence and class losses, each
        divided by the batch size.
    """
    # Three anchors per output layer.
    num_layers = len(anchors) // 3

    # yolo_outputs = [(None, h//32, w//32, num_anchors*(5+num_classes)),
    #                 (None, h//16, w//16, num_anchors*(5+num_classes)),
    #                 (None, h//8,  w//8,  num_anchors*(5+num_classes))]
    yolo_outputs = args[:num_layers]
    # y_true = [(None, h//32, w//32, num_anchors, 5+num_classes),
    #           (None, h//16, w//16, num_anchors, 5+num_classes),
    #           (None, h//8,  w//8,  num_anchors, 5+num_classes)]
    y_true = args[num_layers:]

    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]

    # input_shape = (h, w); the first output has a stride of 32.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    bs = K.shape(yolo_outputs[0])[0]
    batch_size = K.cast(bs, K.dtype(yolo_outputs[0]))

    # Example shapes for a 416x416 input with 80 classes:
    #   y_true:       (bs,13,13,3,85), (bs,26,26,3,85), (bs,52,52,3,85)
    #   yolo_outputs: (bs,13,13,255),  (bs,26,26,255),  (bs,52,52,255)
    for i in range(num_layers):
        # Taking the first layer (bs,13,13,3,85) as the example below.
        # Positions that contain an object: (bs,13,13,3,1).
        object_mask = y_true[i][..., 4:5]
        # Class targets for those positions: (bs,13,13,3,80).
        true_class_probabilities = y_true[i][..., 5:]
        if label_smoothing:
            true_class_probabilities = _smooth_labels(
                true_class_probabilities, label_smoothing)

        # Decode the layer output. raw_pred is the unprocessed prediction
        # (bs,13,13,3,85); pred_xy / pred_wh are the decoded box centre and
        # size (bs,13,13,3,2). The returned grid is not needed here.
        _, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[i], anchors[anchor_mask[i]], num_classes,
            input_shape, calc_loss=True)

        # Decoded predicted boxes: (bs,13,13,3,4).
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Build the ignore mask image by image, since every image has a
        # different number of true boxes.
        ignore_mask = tf.TensorArray(
            K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # All true boxes of image b: (n, 4).
            true_box = tf.boolean_mask(
                y_true[i][b, ..., 0:4], object_mask_bool[b, ..., 0])
            # IOU of every predicted box (13,13,3,4) with every true box:
            # (13,13,3,n).
            iou = box_iou(pred_box[b], true_box)
            # Best IOU per predicted box: (13,13,3).
            best_iou = K.max(iou, axis=-1)
            # Predictions that overlap a true box by ignore_thresh or more
            # get weight 0, i.e. they are not treated as negatives.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Iterate over all images in the batch. tf.while_loop is the public
        # entry point; K.control_flow_ops is only an incidental re-export of
        # one particular Keras backend module.
        _, ignore_mask = tf.while_loop(
            lambda b, *_: b < bs, loop_body, [0, ignore_mask])

        # Stack the per-image masks and add a trailing axis: (bs,13,13,3,1).
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Weight is 2 - w*h of the true box, so smaller boxes weigh more.
        box_loss_scale = 2 - y_true[i][..., 2:3] * y_true[i][..., 3:4]

        # CIoU loss as the location loss.
        raw_true_box = y_true[i][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        location_loss = K.sum(ciou_loss) / batch_size

        # Where a box exists, the cross-entropy against target 1 is used.
        # Where no box exists, the position counts as a negative sample only
        # if best_iou < ignore_thresh, which limits the number of negatives.
        confidence_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (
            object_mask * confidence_bce
            + (1 - object_mask) * confidence_bce * ignore_mask)

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probabilities, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / batch_size
        class_loss = K.sum(class_loss) / batch_size
        loss += location_loss + confidence_loss + class_loss

    return loss
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    The grid offsets and the normalisation are built so that they are also
    correct when the feature map is not square: the cell at row ``i`` and
    column ``j`` receives the offset ``(x=j, y=i)``, and ``x``/``w`` are
    divided by the grid width while ``y``/``h`` are divided by the grid
    height. For square feature maps this matches the previous behaviour.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features, channels last.
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_conf : tensor
        Probability estimate for whether each box contains any object.
    box_class_pred : tensor
        Probability distribution estimate for each box over class labels.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic conv dims so the head works for fully convolutional models.
    conv_dims = K.shape(feats)[1:3]  # (height, width), assuming channels last
    conv_height, conv_width = conv_dims[0], conv_dims[1]

    # Row (y) and column (x) index of every cell, each shaped
    # [1, height, width, 1, 1] so they broadcast over batch and anchors.
    row_index = K.tile(
        K.reshape(K.arange(0, stop=conv_height), [1, -1, 1, 1, 1]),
        [1, 1, conv_width, 1, 1])
    col_index = K.tile(
        K.reshape(K.arange(0, stop=conv_width), [1, 1, -1, 1, 1]),
        [1, conv_height, 1, 1, 1])
    # Last axis is ordered (x, y) to line up with box_xy below.
    conv_index = K.concatenate([col_index, row_index])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, conv_height, conv_width, num_anchors, num_classes + 5])

    # Normalisation factors in (width, height) order, matching (x, y)/(w, h).
    conv_dims = K.cast(
        K.reshape(conv_dims[::-1], [1, 1, 1, 1, 2]), K.dtype(feats))

    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs
def yolo_loss(args, anchors, num_classes, rescore_confidence=False, print_loss=False):
    """YOLO loss: squared-error confidence, class and coordinate terms.

    Parameters
    ----------
    args : sequence of four tensors
        ``(yolo_output, true_boxes, detectors_mask, matching_true_boxes)``:

        * ``yolo_output`` -- final convolutional layer features.
        * ``true_boxes`` -- ground truth boxes, reshaped here from
          ``[batch, num_true_boxes, box_params]``; the first four box
          parameters are read as x_center, y_center, width, height.
        * ``detectors_mask`` -- 0/1 mask for detector positions where there
          is a matching ground truth.
        * ``matching_true_boxes`` -- ground truth box for each positive
          detector position; entries ``0:4`` are the box targets and entry
          ``4`` is the class index.
    anchors : array-like
        Anchor boxes for the model.
    num_classes : int
        Number of object classes.
    rescore_confidence : bool, default=False
        If true, the confidence target of a positive detector is its best
        IOU with any ground truth box instead of 1.
    print_loss : bool, default=False
        If true, wrap the result in ``tf.Print`` to print the loss parts.

    Returns
    -------
    total_loss : tensor
        Half of the sum of the confidence, classification and coordinate
        losses, each summed over every element of the batch.
    """
    yolo_output, true_boxes, detectors_mask, matching_true_boxes = args
    num_anchors = len(anchors)
    object_scale, no_object_scale = 5, 1
    class_scale, coordinates_scale = 1, 1

    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_head(
        yolo_output, anchors, num_classes)

    # Box predictions in the parametrisation used by the coordinate loss:
    # sigmoid-activated x, y and raw w, h.
    # TODO: Remove extra computation shared with yolo_head.
    output_shape = K.shape(yolo_output)
    raw_feats = K.reshape(
        yolo_output,
        [-1, output_shape[1], output_shape[2], num_anchors, num_classes + 5])
    pred_boxes = K.concatenate(
        (K.sigmoid(raw_feats[..., 0:2]), raw_feats[..., 2:4]), axis=-1)

    # TODO: Adjust predictions by image width/height for non-square images?
    # IOUs may be off due to different aspect ratio.

    # Add an axis for the ground truth boxes so that the layout becomes
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params.
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)
    pred_half_extent = pred_wh / 2.
    pred_mins = pred_xy - pred_half_extent
    pred_maxes = pred_xy + pred_half_extent

    # Same layout for the ground truth, with singleton grid/anchor axes.
    gt_shape = K.shape(true_boxes)
    true_boxes = K.reshape(
        true_boxes, [gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]
    true_half_extent = true_wh / 2.
    true_mins = true_xy - true_half_extent
    true_maxes = true_xy + true_half_extent

    # IOU of each predicted box with each ground truth box.
    overlap_mins = K.maximum(pred_mins, true_mins)
    overlap_maxes = K.minimum(pred_maxes, true_maxes)
    overlap_wh = K.maximum(overlap_maxes - overlap_mins, 0.)
    intersect_areas = overlap_wh[..., 0] * overlap_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOU per detector over all ground truth boxes, with a trailing
    # singleton axis restored.
    best_ious = K.expand_dims(K.max(iou_scores, axis=4))

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # TODO: Darknet region training includes extra coordinate loss for early
    # training steps to encourage predictions to match anchor priors.

    # Confidence loss. NOTE: YOLO does not use binary cross-entropy here.
    no_object_weights = (
        no_object_scale * (1 - object_detections) * (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)
    confidence_target = best_ious if rescore_confidence else 1
    objects_loss = (
        object_scale * detectors_mask
        * K.square(confidence_target - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLO does not use categorical cross-entropy loss here.
    class_indices = K.cast(matching_true_boxes[..., 4], 'int32')
    class_targets = K.one_hot(class_indices, num_classes)
    classification_loss = (
        class_scale * detectors_mask
        * K.square(class_targets - pred_class_prob))

    # Coordinate loss for matching detection boxes.
    box_targets = matching_true_boxes[..., 0:4]
    coordinates_loss = (
        coordinates_scale * detectors_mask
        * K.square(box_targets - pred_boxes))

    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)
    total_loss = 0.5 * (
        confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)

    if print_loss:
        loss_parts = [
            total_loss,
            confidence_loss_sum,
            classification_loss_sum,
            coordinates_loss_sum,
        ]
        total_loss = tf.Print(
            total_loss,
            loss_parts,
            message='yolo_loss, conf_loss, class_loss, box_coord_loss:')

    return total_loss
def get_loss(self, model, target, output):
    """ Builds the CTC loss for the backend the model is running on.

        For the Keras backend the result is a pair
        ``(placeholder_spec, loss_tensor)``; for the PyTorch backend it is a
        two-element list ``[placeholder_spec, loss_callable]``. In both
        cases ``placeholder_spec`` pairs data-source names with the
        placeholders created for them.

        Raises ``ValueError`` for an unsupported backend, for a "warp"
        variant on a non-Theano Keras toolchain, and for a non-"warp"
        variant on PyTorch. Raises ``ImportError`` when the required
        warp-CTC binding cannot be imported.
    """
    backend = model.get_backend()

    if backend.get_name() == 'keras':

        import keras.backend as K

        use_warp = 'warp' in self.variant
        if use_warp:
            logger.info('Attaching Warp-CTC loss function to model '
                'output "%s".', target)
            # The warp-ctc binding used below is Theano-only.
            if backend.get_toolchain() != 'theano':
                logger.error('If you want to use warp-ctc, you need to '
                    'use the Theano backend to Keras.')
                raise ValueError('Warp-CTC is currently only supported '
                    'with the Theano backend to Keras.')
        else:
            # Just use the built-in Keras CTC loss function.
            logger.debug('Attaching built-in Keras CTC loss function to '
                'model output "%s".', target)

        ctc_scaled = 'ctc_scaled_{}'.format(self.input_length)
        flattened_labels = 'ctc_flattened_labels_{}'.format(target)

        # Names of the data sources that feed the transcript and the
        # utterance-length placeholders.
        transcript_name = flattened_labels if use_warp else self.output
        if self.relative_to is None:
            utterance_name = self.input_length
        else:
            utterance_name = ctc_scaled

        transcript_length = K.placeholder(
            ndim=2, dtype='int32', name=self.output_length)
        transcript = K.placeholder(
            ndim=2, dtype='int32', name=transcript_name)
        utterance_length = K.placeholder(
            ndim=2, dtype='int32', name=utterance_name)

        if self.relative_to is not None:
            scaled_source = ScaledSource(
                model,
                relative_to=self.relative_to,
                to_this=target,
                scale_this=self.input_length
            )
            model.add_data_source(ctc_scaled, scaled_source)

        if use_warp:
            model.add_data_source(
                flattened_labels,
                FlattenSource(self.output, self.output_length)
            )

            try:
                import ctc  # pylint: disable=import-error
            except ImportError:
                logger.error('The warp-CTC loss function was requested, '
                    'but we cannot find the "ctc" library. See our '
                    'troubleshooting page for helpful tips.')
                raise ImportError('Cannot find the "ctc" library, which '
                    'is needed when using the "warp" variant of the CTC '
                    'loss function.')

            out = ctc.cpu_ctc_th(
                output.dimshuffle((1, 0, 2)),
                K.squeeze(utterance_length, -1),
                transcript[0]+1,
                K.squeeze(transcript_length, -1)
            )
        else:
            out = K.ctc_batch_cost(
                transcript,
                output,
                utterance_length,
                transcript_length
            )

        if 'loss_scale' in self.variant:
            logger.debug('Loss scaling is active.')
            mean_length = K.mean(K.cast(utterance_length, K.dtype(out)))
            out = out * mean_length / 100

        placeholders = (
            (self.output_length, transcript_length),
            (transcript_name, transcript),
            (utterance_name, utterance_length)
        )
        return (placeholders, out)

    if backend.get_name() == 'pytorch':

        use_warp = 'warp' in self.variant
        if not use_warp:
            logger.error('PyTorch does not include a native CTC loss '
                'function yet. However, PyTorch bindings to Warp CTC are '
                'available (SeanNaren/warp-ctc). Try installing that, and '
                'then settings variant=warp.')
            raise ValueError('Only Warp CTC is supported for PyTorch '
                'right now.')

        ctc_scaled = 'ctc_scaled_{}'.format(self.input_length)
        flattened_labels = 'ctc_flattened_labels_{}'.format(target)

        if self.relative_to is None:
            utterance_name = self.input_length
        else:
            utterance_name = ctc_scaled

        transcript_length = model.data.placeholder(
            self.output_length, location='cpu', data_type='int')
        transcript = model.data.placeholder(
            flattened_labels, location='cpu', data_type='int')
        utterance_length = model.data.placeholder(
            utterance_name, location='cpu', data_type='int')

        if self.relative_to is not None:
            scaled_source = ScaledSource(
                model,
                relative_to=self.relative_to,
                to_this=target,
                scale_this=self.input_length
            )
            model.add_data_source(ctc_scaled, scaled_source)

        if use_warp:
            model.add_data_source(
                flattened_labels,
                FlattenSource(self.output, self.output_length)
            )

        try:
            from warpctc_pytorch import CTCLoss  # pylint: disable=import-error
        except ImportError:
            logger.error('The warp-CTC loss function was requested, '
                'but we cannot find the "warpctc_pytorch" library. See '
                'out troubleshooting page for helpful tips.')
            raise ImportError('Cannot find the "warpctc_pytorch" library, '
                'which is needed when using the "warp" variant of the CTC '
                'loss function.')

        criterion = model.data.move(CTCLoss())

        def basic_ctc_loss(inputs, output):
            """ Computes the CTC loss, divided by ``output.size(0)``.

                ``inputs`` follows the order of the placeholder list
                returned below: transcript, utterance length, transcript
                length.
            """
            value = criterion(
                output.transpose(1, 0).contiguous(),
                inputs[0][0]+1,
                inputs[1].squeeze(1),
                inputs[2].squeeze(1)
            )
            return value / output.size(0)

        if 'loss_scale' in self.variant:
            logger.debug('Loss scaling is active.')

            def loss_scale(inputs, output):
                """ Computes the CTC loss scaled by the mean utterance
                    length divided by 100.
                """
                factor = inputs[1].float().mean().data[0] / 100.
                return basic_ctc_loss(inputs, output) * factor

            loss_fn = loss_scale
        else:
            loss_fn = basic_ctc_loss

        transcript_name = flattened_labels if use_warp else self.output
        placeholders = [
            (transcript_name, transcript),
            (utterance_name, utterance_length),
            (self.output_length, transcript_length)
        ]
        return [placeholders, loss_fn]

    raise ValueError('Unsupported backend "{}" for loss function "{}"'
        .format(backend.get_name(), self.get_name()))