class YOLO(object):
    """YOLO-style detector: a backend feature extractor plus a 1x1 detection conv.

    The network output has shape ``(grid_h, grid_w, nb_box, 4 + 1 + nb_class)``
    per image; the last axis holds box x/y, box w/h, objectness and class
    logits, in that order (see the slicing in ``custom_loss``).
    """

    # Backends this revision of the class can actually construct.
    _SUPPORTED_BACKENDS = ('SqueezeNet', 'MobileNet', 'Tiny Yolo')

    # Default anchor (w, h) pairs, flattened; copied per instance so the
    # default is never shared between objects.
    _DEFAULT_ANCHORS = (0.57273, 0.677385, 1.87446, 2.06253, 3.33843,
                        5.47434, 7.88282, 3.52778, 9.77052, 9.16828)

    def __init__(self,
                 backend,
                 input_size,
                 labels,
                 max_box_per_image=50,
                 anchors=None):
        """Build the Keras model.

        # Arguments
            backend           : One of 'SqueezeNet', 'MobileNet', 'Tiny Yolo'.
            input_size        : Side length of the (square) network input.
            labels            : Iterable of class names.
            max_box_per_image : Size of the ground-truth box buffer input.
            anchors           : Flat sequence of anchor sizes, two values per
                                anchor box. Defaults to five built-in anchors.

        # Raises
            ValueError : If ``backend`` is not supported.
        """
        # Fail fast, before any Keras graph nodes are created.
        if backend not in self._SUPPORTED_BACKENDS:
            raise ValueError(
                'Architecture not supported! Only support Tiny Yolo, '
                'MobileNet, and SqueezeNet at the moment!')

        if anchors is None:
            anchors = list(self._DEFAULT_ANCHORS)

        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        self.nb_box = len(anchors) // 2
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4))

        if backend == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif backend == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        else:  # 'Tiny Yolo' (membership was validated above)
            self.feature_extractor = TinyYoloFeature(self.input_size)

        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class), (1, 1),
                        strides=(1, 1),
                        padding='same',
                        name='DetectionLayer',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box,
                          4 + 1 + self.nb_class))(output)
        # Identity on `output`; its only visible effect is to connect
        # `true_boxes` to the graph so it can be declared as a model input.
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer; looked up by name
        # rather than by position so the lookup survives layer reordering
        layer = self.model.get_layer('DetectionLayer')
        weights = layer.get_weights()

        grid_cells = self.grid_h * self.grid_w
        new_kernel = np.random.normal(size=weights[0].shape) / grid_cells
        new_bias = np.random.normal(size=weights[1].shape) / grid_cells

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()

    def custom_loss(self, y_true, y_pred):
        """Compute the YOLO loss (coordinates + confidence + class terms).

        Reads ``self.batch_size``, the ``*_scale`` weights and ``self.debug``,
        all of which are set by ``train``; ``train`` must therefore run before
        this function is traced.

        # Arguments
            y_true : Ground-truth tensor; last axis is x, y, w, h, objectness,
                     then one-hot class scores.
            y_pred : Raw network output with the same layout.

        # Returns
            Scalar loss tensor.
        """
        mask_shape = tf.shape(y_true)[:4]

        cell_x = tf.to_float(
            tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]),
                       (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))

        cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1),
                            [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(
            self.anchors, [1, 1, 1, self.nb_box, 2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2]  # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4]  # number of cells across, horizontally and vertically

        ### adjust confidence
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(
            best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(
            self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale / 2.)
        seen = tf.assign_add(seen, 1.)

        # NOTE(review): `seen` is incremented before it is compared with 1,
        # so the warm-up branch looks unreachable in this revision - confirm.
        true_box_xy, true_box_wh, coord_mask = tf.cond(
            tf.less(seen, 1),
            lambda: [
                true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                true_box_wh + tf.ones_like(true_box_wh) * np.reshape(
                    self.anchors, [1, 1, 1, self.nb_box, 2]) * no_boxes_mask,
                tf.ones_like(coord_mask)
            ],
            lambda: [true_box_xy, true_box_wh, coord_mask])

        #
        # Finalize the loss
        #
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        # The 1e-6 terms guard against division by zero when a mask is empty.
        loss_xy = tf.reduce_sum(
            tf.square(true_box_xy - pred_box_xy) *
            coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(
            tf.square(true_box_wh - pred_box_wh) *
            coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(
            tf.square(true_box_conf - pred_box_conf) *
            conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(
            loss_class * class_mask) / (nb_class_box + 1e-6)

        loss = tf.cond(
            tf.less(seen, 1),
            lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,
            lambda: loss_xy + loss_wh + loss_conf + loss_class)

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(
                tf.to_float(true_box_conf > 0.5) *
                tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box / (nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
            loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
            loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
            loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
            loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
            loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
            loss = tf.Print(loss, [total_recall / seen], message='Average Recall \t', summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path`` into ``self.model``."""
        self.model.load_weights(weight_path)

    def train(
            self,
            train_imgs,  # the list of images to train the model
            valid_imgs,  # the list of images used to validate the model
            nb_epochs,  # number of epochs
            learning_rate,  # the learning rate
            batch_size,  # the size of the batch
            object_scale,
            no_object_scale,
            coord_scale,
            class_scale,
            saved_weights_name='best_weights.h5',
            debug=False):
        """Train the model, checkpoint the best weights, then print per-class AP and mAP.

        The scale arguments weight the individual terms of ``custom_loss``;
        ``debug`` additionally enables the loss print-outs and switches Keras
        to ``verbose=2``.
        """
        self.batch_size = batch_size

        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale = coord_scale
        self.class_scale = class_scale

        self.debug = debug

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H': self.input_size,
            'IMAGE_W': self.input_size,
            'GRID_H': self.grid_h,
            'GRID_W': self.grid_w,
            'BOX': self.nb_box,
            'LABELS': self.labels,
            'CLASS': len(self.labels),
            'ANCHORS': self.anchors,
            'BATCH_SIZE': self.batch_size,
            'TRUE_BOX_BUFFER': self.max_box_per_image,
        }

        train_generator = BatchGenerator(
            train_imgs,
            generator_config,
            norm=self.feature_extractor.normalize)
        valid_generator = BatchGenerator(
            valid_imgs,
            generator_config,
            norm=self.feature_extractor.normalize,
            jitter=False)

        ############################################
        # Compile the model
        ############################################

        optimizer = Adam(lr=learning_rate,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         decay=0.0)
        self.model.compile(loss=self.custom_loss, optimizer=optimizer)

        ############################################
        # Make a few callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   mode='min',
                                   verbose=1)
        checkpoint = ModelCheckpoint(saved_weights_name,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     period=1)
        tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/'),
                                  histogram_freq=0,
                                  write_graph=True,
                                  write_images=False)

        ############################################
        # Start the training process
        ############################################

        self.model.fit_generator(
            generator=train_generator,
            steps_per_epoch=len(train_generator),
            epochs=nb_epochs,
            verbose=2 if debug else 1,
            validation_data=valid_generator,
            validation_steps=len(valid_generator),
            callbacks=[early_stop, checkpoint, tensorboard])

        ############################################
        # Compute mAP on the validation set
        ############################################
        average_precisions = self.evaluate(valid_generator)

        # print evaluation
        for label, average_precision in average_precisions.items():
            print(self.labels[label], '{:.4f}'.format(average_precision))
        print('mAP: {:.4f}'.format(
            sum(average_precisions.values()) / len(average_precisions)))

    @staticmethod
    def _split_annotations_by_label(annotations, num_classes):
        """Split an ``(N, 5)`` annotation array into per-class ``(n, 4)`` box arrays.

        Column 4 is read as the class index. An empty annotation array (any
        shape with zero elements) yields an empty ``(0, 4)`` array per class
        instead of raising ``IndexError``.
        """
        annotations = np.asarray(annotations)
        if annotations.size == 0:
            return [np.zeros((0, 4)) for _ in range(num_classes)]
        return [
            annotations[annotations[:, 4] == label, :4].copy()
            for label in range(num_classes)
        ]

    @staticmethod
    def _average_precision_for_label(all_detections, all_annotations, label,
                                     iou_threshold):
        """Return the AP of one class, or 0 when it has no ground-truth boxes."""
        false_positives = []
        true_positives = []
        scores = []
        num_annotations = 0.0

        for detections, annotations in zip(all_detections, all_annotations):
            detections = detections[label]
            annotations = annotations[label]
            num_annotations += annotations.shape[0]
            matched = set()  # ground-truth indices already claimed in this image

            for d in detections:
                scores.append(d[4])

                if annotations.shape[0] == 0:
                    false_positives.append(1)
                    true_positives.append(0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0),
                                           annotations)
                assigned = int(np.argmax(overlaps, axis=1)[0])
                max_overlap = overlaps[0, assigned]

                if max_overlap >= iou_threshold and assigned not in matched:
                    false_positives.append(0)
                    true_positives.append(1)
                    matched.add(assigned)
                else:
                    false_positives.append(1)
                    true_positives.append(0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            return 0

        # sort by score (descending), then accumulate
        indices = np.argsort(-np.asarray(scores, dtype=np.float64))
        false_positives = np.cumsum(
            np.asarray(false_positives, dtype=np.float64)[indices])
        true_positives = np.cumsum(
            np.asarray(true_positives, dtype=np.float64)[indices])

        # compute recall and precision
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(
            true_positives + false_positives,
            np.finfo(np.float64).eps)

        return compute_ap(recall, precision)

    def evaluate(self,
                 generator,
                 iou_threshold=0.3,
                 score_threshold=0.3,
                 max_detections=100,
                 save_path=None):
        """ Evaluate a given dataset using this model.
        code originally from https://github.com/fizyr/keras-retinanet

        # Arguments
            generator       : The generator that represents the dataset to evaluate.
            iou_threshold   : Minimum IoU for a detection to count as a true positive.
            score_threshold : Accepted for interface compatibility; not used here.
            max_detections  : Accepted for interface compatibility; not used here.
            save_path       : Accepted for interface compatibility; not used here.
        # Returns
            A dict mapping class index to its average precision.
        """
        num_classes = generator.num_classes()
        num_images = generator.size()

        # gather all detections and annotations
        all_detections = [[None] * num_classes for _ in range(num_images)]
        all_annotations = [[None] * num_classes for _ in range(num_images)]

        for i in range(num_images):
            raw_image = generator.load_image(i)
            raw_height, raw_width, _ = raw_image.shape

            # make the boxes and the labels
            pred_boxes = self.predict(raw_image)

            score = np.array([box.score for box in pred_boxes])
            pred_labels = np.array([box.label for box in pred_boxes])

            if len(pred_boxes) > 0:
                # box coordinates are scaled by the raw image size here
                pred_boxes = np.array([[
                    box.xmin * raw_width, box.ymin * raw_height,
                    box.xmax * raw_width, box.ymax * raw_height, box.score
                ] for box in pred_boxes])
            else:
                pred_boxes = np.array([[]])

            # sort the boxes and the labels according to scores
            score_sort = np.argsort(-score)
            pred_labels = pred_labels[score_sort]
            pred_boxes = pred_boxes[score_sort]

            # copy detections to all_detections
            for label in range(num_classes):
                all_detections[i][label] = pred_boxes[pred_labels == label, :]

            # copy annotations to all_annotations (empty-safe)
            all_annotations[i] = self._split_annotations_by_label(
                generator.load_annotation(i), num_classes)

        # compute mAP by comparing all detections and all annotations
        return {
            label: self._average_precision_for_label(
                all_detections, all_annotations, label, iou_threshold)
            for label in range(num_classes)
        }

    def predict(self, image):
        """Run the detector on one image and return the decoded boxes.

        # Arguments
            image : Image array of shape (height, width, channels).
        # Returns
            Whatever ``decode_netout`` returns for the network output.
        """
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        # reverse the channel axis and add a leading batch axis
        input_image = np.expand_dims(image[:, :, ::-1], 0)
        # the true-boxes input only matters for the loss; feed zeros at inference
        dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes = decode_netout(netout, self.anchors, self.nb_class)

        return boxes
class YOLO(object):
    """YOLO-style detector with multi-GPU training and HDF5 prediction helpers.

    The network output has shape ``(grid_h, grid_w, nb_box, 4 + 1 + nb_class)``
    per image; the last axis holds box x/y, box w/h, objectness and class
    logits, in that order (see the slicing in ``custom_loss``).
    """

    def __init__(self, backend, input_size, labels, max_box_per_image, anchors):
        """Build the Keras model.

        # Arguments
            backend           : One of 'Inception3', 'SqueezeNet', 'MobileNet',
                                'Full Yolo', 'Tiny Yolo', 'VGG16', 'ResNet50'.
            input_size        : Side length of the (square) network input.
            labels            : Iterable of class names.
            max_box_per_image : Size of the ground-truth box buffer input.
            anchors           : Flat sequence of anchor sizes, two values per anchor box.

        # Raises
            Exception : If ``backend`` is not supported.
        """
        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        self.nb_box = len(anchors) // 2
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4))

        if backend == 'Inception3':
            self.feature_extractor = Inception3Feature(self.input_size)
        elif backend == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif backend == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif backend == 'Full Yolo':
            self.feature_extractor = FullYoloFeature(self.input_size)
        elif backend == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        elif backend == 'VGG16':
            self.feature_extractor = VGG16Feature(self.input_size)
        elif backend == 'ResNet50':
            self.feature_extractor = ResNet50Feature(self.input_size)
        else:
            raise Exception('Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!')

        print("Output Shape of feature extractor: ",
              self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class),
                        (1, 1), strides=(1, 1),
                        padding='same',
                        name='DetectionLayer',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box,
                          4 + 1 + self.nb_class))(output)
        # Identity on `output`; its only visible effect is to connect
        # `true_boxes` to the graph so it can be declared as a model input.
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer; looked up by name
        # rather than by position so the lookup survives layer reordering
        layer = self.model.get_layer('DetectionLayer')
        weights = layer.get_weights()

        grid_cells = self.grid_h * self.grid_w
        new_kernel = np.random.normal(size=weights[0].shape) / grid_cells
        new_bias = np.random.normal(size=weights[1].shape) / grid_cells

        layer.set_weights([new_kernel, new_bias])

    def custom_loss(self, y_true, y_pred):
        """Compute the YOLO loss (coordinates + confidence + class terms).

        Reads ``self.batch_size``, ``self.warmup_batches``, the ``*_scale``
        weights and ``self.debug``, all of which are set by ``train``;
        ``train`` must therefore run before this function is traced.

        # Arguments
            y_true : Ground-truth tensor; last axis is x, y, w, h, objectness,
                     then one-hot class scores.
            y_pred : Raw network output with the same layout.

        # Returns
            Scalar loss tensor.
        """
        mask_shape = tf.shape(y_true)[:4]

        cell_x = tf.to_float(
            tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]),
                       (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))

        cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1),
                            [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(
            self.anchors, [1, 1, 1, self.nb_box, 2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2]  # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4]  # number of cells across, horizontally and vertically

        ### adjust confidence
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(
            best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(
            self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale / 2.)
        seen = tf.assign_add(seen, 1.)

        # During the first `warmup_batches` batches, cells without a ground
        # truth box are given the cell centre and anchor size as targets.
        true_box_xy, true_box_wh, coord_mask = tf.cond(
            tf.less(seen, self.warmup_batches + 1),
            lambda: [
                true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                true_box_wh + tf.ones_like(true_box_wh) * np.reshape(
                    self.anchors, [1, 1, 1, self.nb_box, 2]) * no_boxes_mask,
                tf.ones_like(coord_mask)
            ],
            lambda: [true_box_xy, true_box_wh, coord_mask])

        #
        # Finalize the loss
        #
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        # The 1e-6 terms guard against division by zero when a mask is empty.
        loss_xy = tf.reduce_sum(
            tf.square(true_box_xy - pred_box_xy) *
            coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(
            tf.square(true_box_wh - pred_box_wh) *
            coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(
            tf.square(true_box_conf - pred_box_conf) *
            conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(
            loss_class * class_mask) / (nb_class_box + 1e-6)

        loss = tf.cond(
            tf.less(seen, self.warmup_batches + 1),
            lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,
            lambda: loss_xy + loss_wh + loss_conf + loss_class)

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(
                tf.to_float(true_box_conf > 0.5) *
                tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box / (nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            loss = tf.Print(loss, [loss_xy], message='\nLoss XY \t', summarize=1000)
            loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
            loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
            loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
            loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
            loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path`` into ``self.model``."""
        self.model.load_weights(weight_path)

    def load_data_generators_seq(self, batch_size, labels, input_path=None):
        """Create train/validation ``SequenceH5Generator`` instances.

        # Arguments
            batch_size : Batch size passed to both generators.
            labels     : Labels passed to both generators.
            input_path : Dataset directory; defaults to the original
                         hard-coded network-drive location.
        # Returns
            ``(train_batch, valid_batch)``.
        """
        if input_path is None:
            input_path = "/media/sf_N_DRIVE/randd/MachineLearning/TeamMembers/Anuar/sequence_data/optical_teeth"
        num_samples_in_h5 = 50
        sequence_length = 1  # 1 is for detection

        train_batch = SequenceH5Generator(input_path, batch_size,
                                          num_samples_in_h5, sequence_length,
                                          labels, skip_rate=1, is_debug=False)
        valid_batch = SequenceH5Generator(input_path, batch_size,
                                          num_samples_in_h5, sequence_length,
                                          labels, skip_rate=1,
                                          is_validation=True)
        print("Valid batch: ", len(valid_batch))

        return train_batch, valid_batch

    def train(self, train_imgs, valid_imgs,
              train_times,  # the number of time to repeat the training set, often used for small datasets
              valid_times,  # the number of times to repeat the validation set, often used for small datasets
              nb_epochs,  # number of epochs
              learning_rate,  # the learning rate
              batch_size,  # the size of the batch
              warmup_epochs,  # number of initial batches to let the model familiarize with the new dataset
              object_scale,
              no_object_scale,
              coord_scale,
              class_scale,
              full_log_dir,
              early_stop_patience,
              early_stop_min_delta,
              learning_rate_decay_factor,
              learning_rate_decay_patience,
              learning_rate_decay_min_lr,
              saved_weights_name='best_weights.h5',
              debug=False,
              sequence_length=10):
        """Train on two GPUs, save the final model, then print AP/mAP twice.

        The model is compiled on the CPU and wrapped with ``multi_gpu_model``
        (``gpus=2``). After training, ``saved_weights_name + "_final.h5"`` is
        written and the validation set is evaluated at thresholds 0.5 and 0.3.
        """
        self.batch_size = batch_size
        self.sequence_length = sequence_length

        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale = coord_scale
        self.class_scale = class_scale

        self.debug = debug
        self.full_log_dir = full_log_dir
        self.early_stop_patience = early_stop_patience
        self.early_stop_min_delta = early_stop_min_delta
        self.learning_rate_decay_factor = learning_rate_decay_factor
        self.learning_rate_decay_patience = learning_rate_decay_patience
        self.learning_rate_decay_min_lr = learning_rate_decay_min_lr

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H': self.input_size,
            'IMAGE_W': self.input_size,
            'GRID_H': self.grid_h,
            'GRID_W': self.grid_w,
            'BOX': self.nb_box,
            'LABELS': self.labels,
            'CLASS': len(self.labels),
            'ANCHORS': self.anchors,
            'BATCH_SIZE': self.batch_size,
            'TRUE_BOX_BUFFER': self.max_box_per_image,
            'SEQUENCE_LENGTH': self.sequence_length,
        }

        train_generator = BatchGenerator(
            train_imgs,
            generator_config,
            norm=self.feature_extractor.normalize,
            debug=self.debug)
        valid_generator = BatchGenerator(
            valid_imgs,
            generator_config,
            norm=self.feature_extractor.normalize,
            jitter=False)

        self.warmup_batches = warmup_epochs * (
            train_times * len(train_generator) +
            valid_times * len(valid_generator)) / 4

        print("Using %d warmup batches" % self.warmup_batches)

        ############################################
        # Define your callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=self.early_stop_min_delta,
                                   patience=self.early_stop_patience,
                                   verbose=1)

        # The stock Keras ModelCheckpoint did not work with the multi-GPU
        # wrapper (per the original author notes), hence MultiGPUCheckpoint.
        checkpoint_multi_hs = MultiGPUCheckpoint(
            '{name}_{{epoch:02d}}_hsBb_valLoss-{{val_loss:.2f}}.h5'.format(
                name=saved_weights_name),
            verbose=1,
            save_best_only=True)

        reduce_lr_hs = ReduceLROnPlateau(
            monitor='val_loss',
            factor=self.learning_rate_decay_factor,
            patience=self.learning_rate_decay_patience,
            min_lr=self.learning_rate_decay_min_lr)

        evaluate_callback_val = EvaluateCallback(valid_generator,
                                                 self.evaluate)

        optimizer = Adam(lr=learning_rate,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         decay=0.0)

        ############################################
        # Compile the model
        ############################################
        with tf.device("/cpu:0"):
            self.model.compile(loss=self.custom_loss, optimizer=optimizer)

        ############################################
        # Start the training process
        ############################################
        steps_per_epoch = len(train_generator) * train_times
        parallel_model = multi_gpu_model(self.model, gpus=2)
        parallel_model.compile(loss=self.custom_loss, optimizer=optimizer)
        parallel_model.fit_generator(
            generator=train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=warmup_epochs + nb_epochs,
            verbose=2 if debug else 1,
            validation_data=valid_generator,
            validation_steps=len(valid_generator) * valid_times,
            callbacks=[
                early_stop,
                checkpoint_multi_hs,
                TrainValTensorBoard_HS(self.full_log_dir,
                                       write_graph=False,
                                       write_images=True),
                ValOnlyProgbarLogger(verbose=1, count_mode='steps'),
                reduce_lr_hs,
                evaluate_callback_val,
            ],
            workers=4,
            max_queue_size=10,
            use_multiprocessing=True)

        self.model.save(saved_weights_name + "_final.h5")

        ############################################
        # Compute mAP on the validation set
        ############################################
        for threshold in (0.5, 0.3):
            average_precisions = self.evaluate(valid_generator,
                                               iou_threshold=threshold,
                                               score_threshold=threshold)
            for label, average_precision in average_precisions.items():
                print(self.labels[label], '{:.4f}'.format(average_precision))
            print('mAP: {:.4f}'.format(
                sum(average_precisions.values()) / len(average_precisions)))

    @staticmethod
    def _split_annotations_by_label(annotations, num_classes):
        """Split an ``(N, 5)`` annotation array into per-class ``(n, 4)`` box arrays.

        Column 4 is read as the class index. An empty annotation array (any
        shape with zero elements) yields an empty ``(0, 4)`` array per class,
        so callers never see ``None`` entries.
        """
        annotations = np.asarray(annotations)
        if annotations.size == 0:
            return [np.zeros((0, 4)) for _ in range(num_classes)]
        return [
            annotations[annotations[:, 4] == label, :4].copy()
            for label in range(num_classes)
        ]

    @staticmethod
    def _average_precision_for_label(all_detections, all_annotations, label,
                                     iou_threshold):
        """Return the AP of one class, or 0 when it has no ground-truth boxes."""
        false_positives = []
        true_positives = []
        scores = []
        num_annotations = 0.0

        for detections, annotations in zip(all_detections, all_annotations):
            detections = detections[label]
            annotations = annotations[label]
            num_annotations += annotations.shape[0]
            matched = set()  # ground-truth indices already claimed in this image

            for d in detections:
                scores.append(d[4])

                if annotations.shape[0] == 0:
                    false_positives.append(1)
                    true_positives.append(0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0),
                                           annotations)
                assigned = int(np.argmax(overlaps, axis=1)[0])
                max_overlap = overlaps[0, assigned]

                if max_overlap >= iou_threshold and assigned not in matched:
                    false_positives.append(0)
                    true_positives.append(1)
                    matched.add(assigned)
                else:
                    false_positives.append(1)
                    true_positives.append(0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            return 0

        # sort by score (descending), then accumulate
        indices = np.argsort(-np.asarray(scores, dtype=np.float64))
        false_positives = np.cumsum(
            np.asarray(false_positives, dtype=np.float64)[indices])
        true_positives = np.cumsum(
            np.asarray(true_positives, dtype=np.float64)[indices])

        # compute recall and precision
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(
            true_positives + false_positives,
            np.finfo(np.float64).eps)

        return compute_ap(recall, precision)

    def evaluate(self,
                 generator,
                 iou_threshold=0.3,
                 score_threshold=0.3,
                 max_detections=100,
                 save_path=None):
        """ Evaluate a given dataset using this model.
        code originally from https://github.com/fizyr/keras-retinanet

        # Arguments
            generator       : The generator that represents the dataset to evaluate.
            iou_threshold   : Minimum IoU for a detection to count as a true positive.
            score_threshold : Passed to ``predict`` as ``obj_threshold``.
            max_detections  : Accepted for interface compatibility; not used here.
            save_path       : Accepted for interface compatibility; not used here.
        # Returns
            A dict mapping class index to its average precision.
        """
        print("\nUsing %.2f IOU and %.2f Score thresholds!" %
              (iou_threshold, score_threshold))

        num_classes = generator.num_classes()
        num_images = generator.size()

        # gather all detections and annotations
        all_detections = [[None] * num_classes for _ in range(num_images)]
        all_annotations = [[None] * num_classes for _ in range(num_images)]

        for i in range(num_images):
            if i % 100 == 0:
                print("%d/%d" % (i, num_images))
            raw_image = generator.load_image(i)
            raw_height, raw_width, _ = raw_image.shape

            pred_boxes, _ = self.predict(raw_image,
                                         obj_threshold=score_threshold,
                                         is_filter_bboxes=False)

            score = np.array([box.score for box in pred_boxes])
            pred_labels = np.array([box.label for box in pred_boxes])

            if len(pred_boxes) > 0:
                # box coordinates are scaled by the raw image size here
                pred_boxes = np.array([[
                    box.xmin * raw_width, box.ymin * raw_height,
                    box.xmax * raw_width, box.ymax * raw_height, box.score
                ] for box in pred_boxes])
            else:
                pred_boxes = np.array([[]])

            # sort the boxes and the labels according to scores
            score_sort = np.argsort(-score)
            pred_labels = pred_labels[score_sort]
            pred_boxes = pred_boxes[score_sort]

            # copy detections to all_detections
            for label in range(num_classes):
                all_detections[i][label] = pred_boxes[pred_labels == label, :]

            # Copy annotations to all_annotations. Images without annotations
            # get empty arrays, so their detections count as false positives
            # rather than the image being skipped.
            all_annotations[i] = self._split_annotations_by_label(
                generator.load_annotation(i), num_classes)

        # compute mAP by comparing all detections and all annotations
        return {
            label: self._average_precision_for_label(
                all_detections, all_annotations, label, iou_threshold)
            for label in range(num_classes)
        }

    def predict(self,
                image,
                obj_threshold=0.3,
                nms_threshold=0.01,
                is_filter_bboxes=False,
                shovel_type="Hydraulic",
                class_obj_threshold=None):
        """Run the detector on one image.

        # Arguments
            image               : Image array of shape (height, width, channels).
            obj_threshold       : Forwarded to ``decode_netout``.
            nms_threshold       : Forwarded to ``decode_netout``.
            is_filter_bboxes    : When true, post-filter with ``filter_all_objects``.
            shovel_type         : Forwarded to ``filter_all_objects``.
            class_obj_threshold : Per-class thresholds forwarded to
                                  ``decode_netout``; defaults to six 0.3 values.
        # Returns
            ``(boxes, filtered_boxes)``; ``filtered_boxes`` is empty unless
            ``is_filter_bboxes`` is set.
        """
        if class_obj_threshold is None:
            class_obj_threshold = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3]

        image_h, image_w, _ = image.shape
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        # reverse the channel axis and add a leading batch axis
        input_image = np.expand_dims(image[:, :, ::-1], 0)
        # the true-boxes input only matters for the loss; feed zeros at inference
        dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes = decode_netout(netout, self.anchors, self.nb_class,
                              obj_threshold=obj_threshold,
                              nms_threshold=nms_threshold,
                              class_obj_threshold=class_obj_threshold)

        filtered_boxes = []
        if is_filter_bboxes:
            boxes, filtered_boxes = filter_all_objects(boxes, shovel_type,
                                                       image_size=image_h)

        return boxes, filtered_boxes

    def get_inference_model(self):
        """Return a model from the training inputs to the ``reshape_1`` layer output."""
        return Model(self.model.input,
                     self.model.get_layer("reshape_1").output)

    def get_feature_model(self, is_before_activation=True):
        """Return a model exposing one backend layer's output.

        Uses layer ``conv_22`` when ``is_before_activation`` is true, otherwise
        ``leaky_re_lu_22``.
        """
        backbone = self.feature_extractor.feature_extractor
        layer_name = "conv_22" if is_before_activation else "leaky_re_lu_22"
        return Model(backbone.inputs, backbone.get_layer(layer_name).output)

    def predict_on_h5(self, h5_path, idx, path_to_save,
                      sequence_length=30,
                      stride=1,
                      obj_threshold=0.3,
                      nms_threshold=0.1):
        """Predict on the last frame of one HDF5 sample and save the drawn image.

        Reads sample ``idx % len(x_batches)`` from dataset ``x_batches`` and
        writes ``pred_<h5 file name><idx>.jpg`` into ``path_to_save``.
        """
        # The context manager closes the file even if reading raises.
        with h5py.File(h5_path, 'r') as f:
            x_batches = f["x_batches"]
            id_in_h5 = idx % x_batches.shape[0]
            x_batch = x_batches[id_in_h5, ...]  # read from disk

        # keep the last `sequence_length` frames, strided from the end
        x_batch = x_batch[::-1, ...][:sequence_length:stride][::-1, ...]

        image = x_batch[-1, ...].copy()

        boxes, filtered_boxes = self.predict(image,
                                             obj_threshold=obj_threshold,
                                             nms_threshold=nms_threshold,
                                             is_filter_bboxes=False,
                                             shovel_type="Cable")
        boxes += filtered_boxes

        image = draw_boxes(image, boxes, self.labels,
                           score_threshold=obj_threshold)

        h5_name = h5_path.split('/')[-1]
        filepath = os.path.join(path_to_save,
                                "pred_" + h5_name + str(idx) + ".jpg")
        cv2.imwrite(filepath, image)
class YOLO(object):
    """YOLO object detector: a selectable feature extractor plus a 1x1 detection head.

    The Keras model takes two inputs, the image and a ``true_boxes`` tensor.
    ``true_boxes`` is passed through a Lambda layer untouched; it is only read
    by ``custom_loss`` during training, and ``predict`` feeds zeros for it.
    """

    def __init__(self, architecture, input_size, labels, max_box_per_image,
                 anchors):
        """Build the detection model.

        Args:
            architecture: one of 'Inception3', 'SqueezeNet', 'MobileNet',
                'Full Yolo', 'Tiny Yolo', 'VGG16', 'ResNet50'.
            input_size: side length of the square network input.
            labels: iterable of class names.
            max_box_per_image: size of the ground-truth box buffer input.
            anchors: flat sequence of anchor sizes, two values per anchor box.

        Raises:
            Exception: if ``architecture`` is not one of the supported names.
        """
        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        # Two anchor values per box; int() keeps the count integral on Python 3.
        self.nb_box = int(len(anchors) / 2)
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4))

        if architecture == 'Inception3':
            self.feature_extractor = Inception3Feature(self.input_size)
        elif architecture == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif architecture == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif architecture == 'Full Yolo':
            self.feature_extractor = FullYoloFeature(self.input_size)
        elif architecture == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        elif architecture == 'VGG16':
            self.feature_extractor = VGG16Feature(self.input_size)
        elif architecture == 'ResNet50':
            self.feature_extractor = ResNet50Feature(self.input_size)
        else:
            raise Exception(
                'Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!'
            )

        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class), (1, 1),
                        strides=(1, 1),
                        padding='same',
                        name='conv_23',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box,
                          4 + 1 + self.nb_class))(output)
        # Identity on `output`; listing true_boxes as an input keeps it part
        # of the model graph so custom_loss can read it.
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer
        # NOTE(review): layers[-4] is expected to be the 'conv_23' layer
        # created above -- confirm if layers are ever added after it.
        layer = self.model.layers[-4]
        weights = layer.get_weights()

        grid_cells = self.grid_h * self.grid_w
        new_kernel = np.random.normal(size=weights[0].shape) / grid_cells
        new_bias = np.random.normal(size=weights[1].shape) / grid_cells

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()

    def custom_loss(self, y_true, y_pred):
        """Return the scalar training loss (xy + wh + confidence + class terms).

        The last axis of ``y_true`` / ``y_pred`` is read as
        ``[x, y, w, h, confidence, class scores...]``. Relies on attributes
        set by ``train`` (``batch_size``, ``warmup_bs``, the ``*_scale``
        weights and ``debug``), so ``train`` must run before this is used.
        """
        mask_shape = tf.shape(y_true)[:4]

        cell_x = tf.to_float(
            tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]),
                       (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))

        cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1),
                            [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(
            self.anchors, [1, 1, 1, self.nb_box, 2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2]  # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4]  # number of cells across, horizontally and vertically

        ### adjust confidence: IoU between each prediction and its own target
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        # extra axis 4 broadcasts every prediction against the box buffer
        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(
            best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(
            self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale / 2.)
        seen = tf.assign_add(seen, 1.)

        # While fewer than warmup_bs batches have been seen, cells without a
        # ground-truth box are given the cell centre / anchor size as target
        # and every cell gets a coordinate weight of 1.
        true_box_xy, true_box_wh, coord_mask = tf.cond(
            tf.less(seen, self.warmup_bs),
            lambda: [
                true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                true_box_wh + tf.ones_like(true_box_wh) * np.reshape(
                    self.anchors, [1, 1, 1, self.nb_box, 2]) * no_boxes_mask,
                tf.ones_like(coord_mask)
            ],
            lambda: [true_box_xy, true_box_wh, coord_mask])

        #
        # Finalize the loss
        #
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        # the 1e-6 terms guard against division by zero when a mask is empty
        loss_xy = tf.reduce_sum(
            tf.square(true_box_xy - pred_box_xy) *
            coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(
            tf.square(true_box_wh - pred_box_wh) *
            coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(
            tf.square(true_box_conf - pred_box_conf) *
            conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(
            loss_class * class_mask) / (nb_class_box + 1e-6)

        loss = loss_xy + loss_wh + loss_conf + loss_class

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(
                tf.to_float(true_box_conf > 0.5) *
                tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box / (nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            loss = tf.Print(loss, [tf.zeros((1))],
                            message='Dummy Line \t',
                            summarize=1000)
            loss = tf.Print(loss, [loss_xy],
                            message='Loss XY \t',
                            summarize=1000)
            loss = tf.Print(loss, [loss_wh],
                            message='Loss WH \t',
                            summarize=1000)
            loss = tf.Print(loss, [loss_conf],
                            message='Loss Conf \t',
                            summarize=1000)
            loss = tf.Print(loss, [loss_class],
                            message='Loss Class \t',
                            summarize=1000)
            loss = tf.Print(loss, [loss],
                            message='Total Loss \t',
                            summarize=1000)
            loss = tf.Print(loss, [current_recall],
                            message='Current Recall \t',
                            summarize=1000)
            loss = tf.Print(loss, [total_recall / seen],
                            message='Average Recall \t',
                            summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path`` into the Keras model."""
        self.model.load_weights(weight_path)

    def predict(self, image):
        """Run the detector on one image and return the decoded boxes.

        The image is resized to the network input size, normalized by the
        feature extractor, and its last (channel) axis is reversed before
        inference.
        """
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        input_image = image[:, :, ::-1]
        input_image = np.expand_dims(input_image, 0)
        # true_boxes is only needed by the loss; zeros satisfy the model input
        dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes = self.decode_netout(netout)

        return boxes

    def bbox_iou(self, box1, box2):
        """Return intersection-over-union of two centre/size boxes.

        Both boxes must expose ``x``, ``y`` (centre) and ``w``, ``h``.
        """
        x1_min = box1.x - box1.w / 2
        x1_max = box1.x + box1.w / 2
        y1_min = box1.y - box1.h / 2
        y1_max = box1.y + box1.h / 2

        x2_min = box2.x - box2.w / 2
        x2_max = box2.x + box2.w / 2
        y2_min = box2.y - box2.h / 2
        y2_max = box2.y + box2.h / 2

        intersect_w = self.interval_overlap([x1_min, x1_max],
                                            [x2_min, x2_max])
        intersect_h = self.interval_overlap([y1_min, y1_max],
                                            [y2_min, y2_max])

        intersect = intersect_w * intersect_h

        union = box1.w * box1.h + box2.w * box2.h - intersect

        return float(intersect) / union

    def interval_overlap(self, interval_a, interval_b):
        """Return the overlap length of two 1-D intervals ``[lo, hi]`` (0 if disjoint)."""
        x1, x2 = interval_a
        x3, x4 = interval_b

        if x3 < x1:
            if x4 < x1:
                return 0
            else:
                return min(x2, x4) - x1
        else:
            if x2 < x3:
                return 0
            else:
                return min(x2, x4) - x3

    def decode_netout(self, netout, obj_threshold=0.3, nms_threshold=0.3):
        """Turn raw network output into a list of ``BoundBox`` detections.

        Args:
            netout: array of shape (grid_h, grid_w, nb_box, 4 + 1 + nb_class).
            obj_threshold: minimum class score for a box to be kept.
            nms_threshold: IoU at or above which the lower-scoring box of a
                pair has that class score suppressed.

        Returns:
            List of boxes whose ``get_score()`` exceeds ``obj_threshold``.
        """
        # Decode on a private copy so the caller's array is left untouched.
        netout = netout.copy()

        grid_h, grid_w, nb_box = netout.shape[:3]

        boxes = []

        # decode the output by the network
        netout[..., 4] = self.sigmoid(netout[..., 4])
        netout[..., 5:] = netout[..., 4][..., np.newaxis] * self.softmax(
            netout[..., 5:])
        netout[..., 5:] *= netout[..., 5:] > obj_threshold

        for row in range(grid_h):
            for col in range(grid_w):
                for b in range(nb_box):
                    # from 4th element onwards are confidence and class classes
                    classes = netout[row, col, b, 5:]

                    if np.sum(classes) > 0:
                        # first 4 elements are x, y, w, and h
                        x, y, w, h = netout[row, col, b, :4]

                        x = (col + self.sigmoid(x)) / grid_w  # center position, unit: image width
                        y = (row + self.sigmoid(y)) / grid_h  # center position, unit: image height
                        w = self.anchors[2 * b + 0] * np.exp(w) / grid_w  # unit: image width
                        h = self.anchors[2 * b + 1] * np.exp(h) / grid_h  # unit: image height
                        confidence = netout[row, col, b, 4]

                        box = BoundBox(x, y, w, h, confidence, classes)

                        boxes.append(box)

        # suppress non-maximal boxes
        for c in range(self.nb_class):
            sorted_indices = list(
                reversed(np.argsort([box.classes[c] for box in boxes])))

            for i in range(len(sorted_indices)):
                index_i = sorted_indices[i]

                if boxes[index_i].classes[c] == 0:
                    continue
                else:
                    for j in range(i + 1, len(sorted_indices)):
                        index_j = sorted_indices[j]

                        if self.bbox_iou(boxes[index_i],
                                         boxes[index_j]) >= nms_threshold:
                            boxes[index_j].classes[c] = 0

        # remove the boxes which are less likely than a obj_threshold
        boxes = [box for box in boxes if box.get_score() > obj_threshold]

        return boxes

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x`` (scalar or array)."""
        return 1. / (1. + np.exp(-x))

    def softmax(self, x, axis=-1, t=-100.):
        """Return softmax of ``x`` along ``axis``.

        ``x`` is shifted by its global maximum; if the shifted minimum is
        below ``t`` the values are rescaled so the minimum equals ``t``.
        """
        x = x - np.max(x)

        if np.min(x) < t:
            x = x / np.min(x) * t

        e_x = np.exp(x)

        return e_x / e_x.sum(axis, keepdims=True)

    def train(
            self,
            train_imgs,  # the list of images to train the model
            valid_imgs,  # the list of images used to validate the model
            train_times,  # the number of time to repeat the training set, often used for small datasets
            valid_times,  # the number of times to repeat the validation set, often used for small datasets
            nb_epoch,  # number of epoches
            learning_rate,  # the learning rate
            batch_size,  # the size of the batch
            warmup_epochs,  # number of initial epochs to let the model familiarize with the new dataset
            object_scale,
            no_object_scale,
            coord_scale,
            class_scale,
            saved_weights_name='best_weights.h5',
            debug=False):
        """Compile the model with ``custom_loss`` and fit it with generators.

        Stores the loss hyper-parameters on ``self``, builds train and
        validation ``BatchGenerator`` objects, and trains with early stopping,
        best-only checkpointing to ``saved_weights_name`` and TensorBoard
        logging under ``./logs/``. When ``warmup_epochs > 0`` only that many
        epochs are run.
        """
        self.batch_size = batch_size
        # Number of batches treated as warm-up by custom_loss. Floor division
        # keeps this an integer batch count on Python 3.
        self.warmup_bs = warmup_epochs * (
            train_times * (len(train_imgs) // batch_size + 1) +
            valid_times * (len(valid_imgs) // batch_size + 1))

        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale = coord_scale
        self.class_scale = class_scale

        self.debug = debug

        if warmup_epochs > 0:
            nb_epoch = warmup_epochs  # if it's warmup stage, don't train more than warmup_epochs

        ############################################
        # Compile the model
        ############################################

        optimizer = Adam(lr=learning_rate,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         decay=0.0)
        self.model.compile(loss=self.custom_loss, optimizer=optimizer)

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H': self.input_size,
            'IMAGE_W': self.input_size,
            'GRID_H': self.grid_h,
            'GRID_W': self.grid_w,
            'BOX': self.nb_box,
            'LABELS': self.labels,
            'CLASS': len(self.labels),
            'ANCHORS': self.anchors,
            'BATCH_SIZE': self.batch_size,
            'TRUE_BOX_BUFFER': self.max_box_per_image,
        }

        train_batch = BatchGenerator(train_imgs,
                                     generator_config,
                                     norm=self.feature_extractor.normalize)
        valid_batch = BatchGenerator(valid_imgs,
                                     generator_config,
                                     norm=self.feature_extractor.normalize,
                                     jitter=False)

        ############################################
        # Make a few callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   mode='min',
                                   verbose=1)
        checkpoint = ModelCheckpoint(saved_weights_name,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     period=1)

        # The run counter is derived from the existing log folders, so the
        # log root must exist before it is listed.
        log_root = os.path.expanduser('./logs/')
        if not os.path.isdir(log_root):
            os.makedirs(log_root)
        tb_counter = len(
            [log for log in os.listdir(log_root) if 'yolo' in log]) + 1
        tensorboard = TensorBoard(
            log_dir=log_root + 'yolo' + '_' + str(tb_counter),
            histogram_freq=0,
            #write_batch_performance=True,
            write_graph=True,
            write_images=False)

        ############################################
        # Start the training process
        ############################################

        self.model.fit_generator(
            generator=train_batch,
            steps_per_epoch=len(train_batch) * train_times,
            epochs=nb_epoch,
            verbose=1,
            validation_data=valid_batch,
            validation_steps=len(valid_batch) * valid_times,
            callbacks=[early_stop, checkpoint, tensorboard],
            workers=3,
            max_queue_size=8)
class YOLO(object):
    """YOLO object detector: a selectable feature extractor plus a 1x1 detection head.

    The Keras model takes the image and a ``true_boxes`` tensor as inputs;
    ``true_boxes`` is forwarded unchanged through a Lambda layer and is only
    read by ``custom_loss``. The code runs on both Python 2 and Python 3.
    """

    def __init__(self, architecture, input_size, labels, max_box_per_image, anchors):
        """Build the detection model.

        Args:
            architecture: one of 'Inception3', 'SqueezeNet', 'MobileNet',
                'Full Yolo', 'Tiny Yolo', 'VGG16', 'ResNet50'.
            input_size: side length of the square network input.
            labels: iterable of class names.
            max_box_per_image: size of the ground-truth box buffer input.
            anchors: flat sequence of anchor sizes, two values per anchor box.

        Raises:
            Exception: if ``architecture`` is not one of the supported names.
        """
        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        # Derived from the anchors (two values per box) rather than fixed at
        # 5; identical for the usual 10-value anchor list.
        self.nb_box = len(anchors) // 2
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4))

        if architecture == 'Inception3':
            self.feature_extractor = Inception3Feature(self.input_size)
        elif architecture == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif architecture == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif architecture == 'Full Yolo':
            self.feature_extractor = FullYoloFeature(self.input_size)
        elif architecture == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        elif architecture == 'VGG16':
            self.feature_extractor = VGG16Feature(self.input_size)
        elif architecture == 'ResNet50':
            self.feature_extractor = ResNet50Feature(self.input_size)
        else:
            raise Exception('Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!')

        # print() with a single argument behaves the same on Python 2 and 3
        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class),
                        (1, 1), strides=(1, 1),
                        padding='same',
                        name='conv_23',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output)
        # Identity on `output`; listing true_boxes as an input keeps it part
        # of the model graph so custom_loss can read it.
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer
        # NOTE(review): layers[-4] is expected to be the 'conv_23' layer
        # created above -- confirm if layers are ever added after it.
        layer = self.model.layers[-4]
        weights = layer.get_weights()

        new_kernel = np.random.normal(size=weights[0].shape) / (self.grid_h * self.grid_w)
        new_bias = np.random.normal(size=weights[1].shape) / (self.grid_h * self.grid_w)

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()

    def custom_loss(self, y_true, y_pred):
        """Return the scalar training loss (xy + wh + confidence + class terms).

        The last axis of ``y_true`` / ``y_pred`` is read as
        ``[x, y, w, h, confidence, class scores...]``. Relies on attributes
        set by ``train`` (``batch_size``, ``warmup_bs``, the ``*_scale``
        weights and ``debug``), so ``train`` must run before this is used.
        """
        mask_shape = tf.shape(y_true)[:4]

        cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))

        # tiled per anchor box; uses nb_box instead of a literal 5
        cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1, 1, 1, self.nb_box, 2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2]  # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4]  # number of cells across, horizontally and vertically

        ### adjust confidence: IoU between each prediction and its own target
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        # extra axis 4 broadcasts every prediction against the box buffer
        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale / 2.)
        seen = tf.assign_add(seen, 1.)

        # While fewer than warmup_bs batches have been seen, cells without a
        # ground-truth box are given the cell centre / anchor size as target
        # and every cell gets a coordinate weight of 1.
        true_box_xy, true_box_wh, coord_mask = tf.cond(
            tf.less(seen, self.warmup_bs),
            lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                     true_box_wh + tf.ones_like(true_box_wh) * np.reshape(self.anchors, [1, 1, 1, self.nb_box, 2]) * no_boxes_mask,
                     tf.ones_like(coord_mask)],
            lambda: [true_box_xy,
                     true_box_wh,
                     coord_mask])

        #
        # Finalize the loss
        #
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        # the 1e-6 terms guard against division by zero when a mask is empty
        loss_xy = tf.reduce_sum(tf.square(true_box_xy - pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(tf.square(true_box_wh - pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(tf.square(true_box_conf - pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

        loss = loss_xy + loss_wh + loss_conf + loss_class

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box / (nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            loss = tf.Print(loss, [tf.zeros((1))], message='Dummy Line \t', summarize=1000)
            loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
            loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
            loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
            loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
            loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
            loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
            loss = tf.Print(loss, [total_recall / seen], message='Average Recall \t', summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path`` into the Keras model."""
        self.model.load_weights(weight_path)

    def predict(self, image):
        """Run the detector on one image and return the decoded boxes.

        The image is resized to the network input size, normalized by the
        feature extractor, and its last (channel) axis is reversed before
        inference.
        """
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        input_image = image[:, :, ::-1]
        input_image = np.expand_dims(input_image, 0)
        # true_boxes is only needed by the loss; zeros satisfy the model input
        dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes = self.decode_netout(netout)

        return boxes

    def bbox_iou(self, box1, box2):
        """Return intersection-over-union of two centre/size boxes.

        Both boxes must expose ``x``, ``y`` (centre) and ``w``, ``h``.
        """
        x1_min = box1.x - box1.w / 2
        x1_max = box1.x + box1.w / 2
        y1_min = box1.y - box1.h / 2
        y1_max = box1.y + box1.h / 2

        x2_min = box2.x - box2.w / 2
        x2_max = box2.x + box2.w / 2
        y2_min = box2.y - box2.h / 2
        y2_max = box2.y + box2.h / 2

        intersect_w = self.interval_overlap([x1_min, x1_max], [x2_min, x2_max])
        intersect_h = self.interval_overlap([y1_min, y1_max], [y2_min, y2_max])

        intersect = intersect_w * intersect_h

        union = box1.w * box1.h + box2.w * box2.h - intersect

        return float(intersect) / union

    def interval_overlap(self, interval_a, interval_b):
        """Return the overlap length of two 1-D intervals ``[lo, hi]`` (0 if disjoint)."""
        x1, x2 = interval_a
        x3, x4 = interval_b

        if x3 < x1:
            if x4 < x1:
                return 0
            else:
                return min(x2, x4) - x1
        else:
            if x2 < x3:
                return 0
            else:
                return min(x2, x4) - x3

    def decode_netout(self, netout, obj_threshold=0.3, nms_threshold=0.3):
        """Turn raw network output into a list of ``BoundBox`` detections.

        Args:
            netout: array of shape (grid_h, grid_w, nb_box, 4 + 1 + nb_class).
            obj_threshold: minimum class score for a box to be kept.
            nms_threshold: IoU at or above which the lower-scoring box of a
                pair has that class score suppressed.

        Returns:
            List of boxes whose ``get_score()`` exceeds ``obj_threshold``.
        """
        # Decode on a private copy so the caller's array is left untouched.
        netout = netout.copy()

        grid_h, grid_w, nb_box = netout.shape[:3]

        boxes = []

        # decode the output by the network
        netout[..., 4] = self.sigmoid(netout[..., 4])
        netout[..., 5:] = netout[..., 4][..., np.newaxis] * self.softmax(netout[..., 5:])
        netout[..., 5:] *= netout[..., 5:] > obj_threshold

        for row in range(grid_h):
            for col in range(grid_w):
                for b in range(nb_box):
                    # from 4th element onwards are confidence and class classes
                    classes = netout[row, col, b, 5:]

                    if np.sum(classes) > 0:
                        # first 4 elements are x, y, w, and h
                        x, y, w, h = netout[row, col, b, :4]

                        x = (col + self.sigmoid(x)) / grid_w  # center position, unit: image width
                        y = (row + self.sigmoid(y)) / grid_h  # center position, unit: image height
                        w = self.anchors[2 * b + 0] * np.exp(w) / grid_w  # unit: image width
                        h = self.anchors[2 * b + 1] * np.exp(h) / grid_h  # unit: image height
                        confidence = netout[row, col, b, 4]

                        box = BoundBox(x, y, w, h, confidence, classes)

                        boxes.append(box)

        # suppress non-maximal boxes
        for c in range(self.nb_class):
            sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes])))

            # range() instead of xrange() so the loop also runs on Python 3
            for i in range(len(sorted_indices)):
                index_i = sorted_indices[i]

                if boxes[index_i].classes[c] == 0:
                    continue
                else:
                    for j in range(i + 1, len(sorted_indices)):
                        index_j = sorted_indices[j]

                        if self.bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold:
                            boxes[index_j].classes[c] = 0

        # remove the boxes which are less likely than a obj_threshold
        boxes = [box for box in boxes if box.get_score() > obj_threshold]

        return boxes

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x`` (scalar or array)."""
        return 1. / (1. + np.exp(-x))

    def softmax(self, x, axis=-1, t=-100.):
        """Return softmax of ``x`` along ``axis``.

        ``x`` is shifted by its global maximum; if the shifted minimum is
        below ``t`` the values are rescaled so the minimum equals ``t``.
        """
        x = x - np.max(x)

        if np.min(x) < t:
            x = x / np.min(x) * t

        e_x = np.exp(x)

        return e_x / e_x.sum(axis, keepdims=True)

    def train(self, train_imgs,     # the list of images to train the model
                    valid_imgs,     # the list of images used to validate the model
                    train_times,    # the number of time to repeat the training set, often used for small datasets
                    valid_times,    # the number of times to repeat the validation set, often used for small datasets
                    nb_epoch,       # number of epoches
                    learning_rate,  # the learning rate
                    batch_size,     # the size of the batch
                    warmup_epochs,  # number of initial epochs to let the model familiarize with the new dataset
                    object_scale,
                    no_object_scale,
                    coord_scale,
                    class_scale,
                    saved_weights_name='best_weights.h5',
                    debug=False):
        """Compile the model with ``custom_loss`` and fit it with generators.

        Stores the loss hyper-parameters on ``self``, builds train and
        validation ``BatchGenerator`` objects, and trains with early stopping,
        best-only checkpointing to ``saved_weights_name`` and TensorBoard
        logging under ``~/logs/``. When ``warmup_epochs > 0`` only that many
        epochs are run.
        """
        self.batch_size = batch_size
        # Number of batches treated as warm-up by custom_loss. Explicit floor
        # division keeps the Python 2 integer result on Python 3 as well.
        self.warmup_bs = warmup_epochs * (train_times * (len(train_imgs) // batch_size + 1) +
                                          valid_times * (len(valid_imgs) // batch_size + 1))

        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale = coord_scale
        self.class_scale = class_scale

        self.debug = debug

        if warmup_epochs > 0:
            nb_epoch = warmup_epochs  # if it's warmup stage, don't train more than warmup_epochs

        ############################################
        # Compile the model
        ############################################

        optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        self.model.compile(loss=self.custom_loss, optimizer=optimizer)

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H'         : self.input_size,
            'IMAGE_W'         : self.input_size,
            'GRID_H'          : self.grid_h,
            'GRID_W'          : self.grid_w,
            'BOX'             : self.nb_box,
            'LABELS'          : self.labels,
            'CLASS'           : len(self.labels),
            'ANCHORS'         : self.anchors,
            'BATCH_SIZE'      : self.batch_size,
            'TRUE_BOX_BUFFER' : self.max_box_per_image,
        }

        train_batch = BatchGenerator(train_imgs,
                                     generator_config,
                                     norm=self.feature_extractor.normalize)
        valid_batch = BatchGenerator(valid_imgs,
                                     generator_config,
                                     norm=self.feature_extractor.normalize,
                                     jitter=False)

        ############################################
        # Make a few callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   mode='min',
                                   verbose=1)
        checkpoint = ModelCheckpoint(saved_weights_name,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     period=1)

        # The run counter is derived from the existing log folders, so the
        # log root must exist before it is listed.
        log_root = os.path.expanduser('~/logs/')
        if not os.path.isdir(log_root):
            os.makedirs(log_root)
        tb_counter = len([log for log in os.listdir(log_root) if 'yolo' in log]) + 1
        tensorboard = TensorBoard(log_dir=log_root + 'yolo' + '_' + str(tb_counter),
                                  histogram_freq=0,
                                  #write_batch_performance=True,
                                  write_graph=True,
                                  write_images=False)

        ############################################
        # Start the training process
        ############################################

        self.model.fit_generator(generator        = train_batch,
                                 steps_per_epoch  = len(train_batch) * train_times,
                                 epochs           = nb_epoch,
                                 verbose          = 1,
                                 validation_data  = valid_batch,
                                 validation_steps = len(valid_batch) * valid_times,
                                 callbacks        = [early_stop, checkpoint, tensorboard],
                                 workers          = 3,
                                 max_queue_size   = 8)
class YOLO(object):
    """Inference-only YOLO detector backed by the Tiny Yolo feature extractor.

    The Keras model takes the image and a ``true_boxes`` tensor as inputs;
    ``true_boxes`` is forwarded unchanged through a Lambda layer and
    ``predict`` feeds zeros for it.
    """

    def __init__(self, backend, input_size, labels, max_box_per_image, anchors):
        """Build the detection model.

        Args:
            backend: must be 'Tiny Yolo'.
            input_size: side length of the square network input.
            labels: iterable of class names.
            max_box_per_image: size of the ``true_boxes`` buffer input.
            anchors: flat sequence of anchor sizes, two values per anchor box.

        Raises:
            Exception: if ``backend`` is anything other than 'Tiny Yolo'.
        """
        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        self.nb_box = len(anchors) // 2  # two anchor values per box
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4))

        if backend == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        else:
            raise Exception('Only Tiny Yolo is supported')

        print('Tiny Yolo is loaded')
        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class), (1, 1),
                        strides=(1, 1),
                        padding='same',
                        name='DetectionLayer',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box,
                          4 + 1 + self.nb_class))(output)
        # Identity on `output`; keeps true_boxes as a model input.
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer
        # NOTE(review): layers[-4] is expected to be 'DetectionLayer' --
        # confirm if layers are ever added after it.
        layer = self.model.layers[-4]
        weights = layer.get_weights()

        grid_cells = self.grid_h * self.grid_w
        new_kernel = np.random.normal(size=weights[0].shape) / grid_cells
        new_bias = np.random.normal(size=weights[1].shape) / grid_cells

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()
        print('Yolo-weight is successfully loaded')

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path`` into the Keras model."""
        self.model.load_weights(weight_path)

    def predict(self, image, obj_threshold=0.4, nms_threshold=0.2):
        """Run the detector on one image and return the decoded boxes.

        Args:
            image: array accepted by ``cv2.resize``; it is resized to the
                network input size, normalized by the feature extractor and
                has its last (channel) axis reversed before inference.
            obj_threshold: fourth positional argument forwarded to
                ``decode_netout`` (default keeps the previous fixed 0.4).
            nms_threshold: fifth positional argument forwarded to
                ``decode_netout`` (default keeps the previous fixed 0.2).

        Returns:
            Whatever ``decode_netout`` returns for the network output.
        """
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        input_image = image[:, :, ::-1]
        input_image = np.expand_dims(input_image, 0)
        # true_boxes carries no information at inference; zeros satisfy the input
        dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes = decode_netout(netout, self.anchors, self.nb_class,
                              obj_threshold, nms_threshold)

        return boxes
class Vehicle(object):
    """Two-headed Keras model: YOLO-style object detection plus a
    direction/action classifier.

    Outputs:
        ``obj_output``: detection tensor of shape
            ``(grid_h, grid_w, nb_box, 4 + 1 + nb_class)``, trained with
            :meth:`yolo_loss`.
        ``dir_output``: 6-way softmax computed from a small convolutional
            branch on the raw image concatenated with the flattened
            detection output, trained with categorical cross-entropy.
    """

    # Immutable default anchors; copied into a fresh list per instance so
    # instances never share (or mutate) one default object.
    _DEFAULT_ANCHORS = (0.57273, 0.677385, 1.87446, 2.06253, 3.33843,
                        5.47434, 7.88282, 3.52778, 9.77052, 9.16828)

    def __init__(self, backend, input_size, labels, actions, ob_weights,
                 max_box_per_image=50, anchors=None):
        """Build the model and load pre-trained weights by layer name.

        Args:
            backend: ``'SqueezeNet'``, ``'MobileNet'`` or ``'Tiny Yolo'``.
            input_size: Side length of the square network input.
            labels: Iterable of object class labels.
            actions: Iterable of action labels.
            ob_weights: Path of the weight file loaded with
                ``by_name=True`` after the model is assembled.
            max_box_per_image: Size of the ``true_boxes`` input buffer.
            anchors: Flat sequence of anchor values, two per anchor box.
                ``None`` (the default) selects the built-in ten values.

        Raises:
            Exception: If ``backend`` is not one of the supported names.
        """
        if anchors is None:
            anchors = list(self._DEFAULT_ANCHORS)

        self.input_size = input_size

        self.labels = list(labels)
        self.actions = list(actions)
        self.nb_moves = len(self.actions)
        self.nb_class = len(self.labels)
        self.nb_box = len(anchors)//2
        # Uniform per-class weights, used by yolo_loss for the class mask.
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors

        self.max_box_per_image = max_box_per_image

        # One loss per named model output, and their relative weights.
        self.losses = {
            "obj_output": self.yolo_loss,
            "dir_output": "categorical_crossentropy",
        }
        self.lossWeights = {"obj_output": 0.5, "dir_output": 1.0}

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4))

        if backend == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif backend == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif backend == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        else:
            raise Exception('Architecture not supported! Only support Tiny Yolo, MobileNet, SqueezeNet at the moment!')

        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)
        print(self.feature_extractor.get_output_shape())

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class),
                        (1,1), strides=(1,1),
                        padding='same',
                        name='DetectionLayer',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output)
        # The Lambda returns its first input unchanged; true_boxes is only
        # wired in so that it is an input of the model.
        output = Lambda(lambda args: args[0], name='obj_output')([output, self.true_boxes])

        # Direction branch: two conv/conv/pool stages on the raw image.
        convdir = Conv2D(2, (3, 3), activation='relu', padding='same', use_bias=False, name='dir1')(input_image)
        convdir2 = Conv2D(2, (3, 3), activation='relu', padding='same', use_bias=False, name='dir2')(convdir)
        pooldir = MaxPooling2D(pool_size=2, name='dir3')(convdir2)
        convdir1 = Conv2D(4, (3, 3), activation='relu', padding='same', use_bias=False, name='dir4')(pooldir)
        convdir12 = Conv2D(4, (3, 3), activation='relu', padding='same', use_bias=False, name='dir5')(convdir1)
        pooldir1 = AveragePooling2D(pool_size=2, name='dir6')(convdir12)
        # Classify from the image features together with the detections.
        flat1 = Flatten()(pooldir1)
        flat2 = Flatten()(output)
        added = Concatenate()([flat1, flat2])
        #fc1 = Dense(32, activation='relu', name='fchingona')(added)
        # NOTE(review): the output width is hard-coded to 6 while
        # self.nb_moves is computed and never used - confirm whether this
        # should follow len(actions).
        fc2 = Dense(6, activation='softmax', name='dir_output')(added)

        self.model = Model([input_image, self.true_boxes], [output, fc2])
        self.model.load_weights(ob_weights, by_name=True)

        # print a summary of the whole model
        self.model.summary()

    def yolo_loss(self, y_true, y_pred):
        """Detection loss for the ``obj_output`` head.

        Sums coordinate (xy, wh), confidence and class terms, each
        normalised by the number of positions with a non-zero mask.
        Reads ``batch_size``, ``coord_scale``, ``object_scale``,
        ``no_object_scale``, ``class_scale`` and ``debug``, which are set
        by :meth:`train`.

        Args:
            y_true: Ground-truth tensor; the last axis is indexed as
                ``[0:2]`` xy, ``[2:4]`` wh, ``[4]`` objectness and
                ``[5:]`` class scores.
            y_pred: Network output with the same last-axis layout.

        Returns:
            Scalar loss tensor.
        """
        mask_shape = tf.shape(y_true)[:4]

        # Per-cell column/row indices, tiled over batch and anchor boxes.
        cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0,2,1,3,4))

        cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        # Counter incremented once per evaluation of the loss, and the
        # running recall total used by the debug output.
        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1,1,1,self.nb_box,2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2] # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4] # number of cells across, horizontally and vertically

        ### adjust confidence
        # Target confidence is the IoU between each predicted box and the
        # ground-truth box at the same position, gated by objectness.
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        # Each prediction is compared against every box in the true_boxes
        # buffer (broadcast over the added axis 4).
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale/2.)
        seen = tf.assign_add(seen, 1.)

        # While warming up, positions without a box are pulled towards the
        # cell centre and the anchor sizes, with a coordinate mask of ones.
        # NOTE(review): `seen` is compared after being incremented, so
        # `seen < 1` looks like it can never hold, i.e. warm-up appears to
        # be disabled in this class - confirm.
        true_box_xy, true_box_wh, coord_mask = tf.cond(
            tf.less(seen, 1),
            lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                     true_box_wh + tf.ones_like(true_box_wh) *
                     np.reshape(self.anchors, [1,1,1,self.nb_box,2]) *
                     no_boxes_mask,
                     tf.ones_like(coord_mask)],
            lambda: [true_box_xy,
                     true_box_wh,
                     coord_mask])

        #
        # Finalize the loss
        #
        # Number of positions contributing to each term; the 1e-6 below
        # avoids a division by zero when a mask is empty.
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

        loss = tf.cond(tf.less(seen, 1),
                       lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,
                       lambda: loss_xy + loss_wh + loss_conf + loss_class)

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box/(nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            # Each tf.Print wraps the loss so the values are emitted
            # whenever the loss tensor is evaluated.
            loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
            loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
            loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
            loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
            loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
            loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
            loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path``."""
        self.model.load_weights(weight_path)

    def train(self, train_imgs,     # the list of images to train the model
                    valid_imgs,     # the list of images used to validate the model
                    nb_epochs,      # number of epochs
                    learning_rate,  # the learning rate
                    batch_size,     # the size of the batch
                    object_scale,
                    no_object_scale,
                    coord_scale,
                    class_scale,
                    saved_weights_name='best_weights.h5',
                    debug=False):
        """Compile the model and train it with batch generators.

        The four ``*_scale`` arguments, ``batch_size`` and ``debug`` are
        stored on the instance because :meth:`yolo_loss` reads them.
        The best weights by ``val_loss`` are saved to
        ``saved_weights_name``; TensorBoard logs go to ``~/logs/``.
        """
        self.batch_size = batch_size

        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale = coord_scale
        self.class_scale = class_scale

        self.debug = debug

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H'         : self.input_size,
            'IMAGE_W'         : self.input_size,
            'GRID_H'          : self.grid_h,
            'GRID_W'          : self.grid_w,
            'BOX'             : self.nb_box,
            'LABELS'          : self.labels,
            'CLASS'           : len(self.labels),
            'ACTIONS'         : self.actions,
            'MOVES'           : len(self.actions),
            'ANCHORS'         : self.anchors,
            'BATCH_SIZE'      : self.batch_size,
            'TRUE_BOX_BUFFER' : self.max_box_per_image,
        }

        train_generator = BatchGenerator(train_imgs,
                                         generator_config,
                                         norm=self.feature_extractor.normalize)
        valid_generator = BatchGenerator(valid_imgs,
                                         generator_config,
                                         norm=self.feature_extractor.normalize,
                                         jitter=False)

        ############################################
        # Compile the model
        ############################################

        optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        self.model.compile(loss=self.losses, loss_weights=self.lossWeights, optimizer=optimizer)

        ############################################
        # Make a few callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   mode='min',
                                   verbose=1)
        checkpoint = ModelCheckpoint(saved_weights_name,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     period=1)
        tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/'),
                                  histogram_freq=0,
                                  #write_batch_performance=True,
                                  write_graph=True,
                                  write_images=False)

        ############################################
        # Start the training process
        ############################################

        self.model.fit_generator(generator        = train_generator,
                                 steps_per_epoch  = len(train_generator),
                                 epochs           = nb_epochs,
                                 verbose          = 2 if debug else 1,
                                 validation_data  = valid_generator,
                                 validation_steps = len(valid_generator),
                                 callbacks        = [early_stop, checkpoint, tensorboard])

    def predict(self, image):
        """Run both heads on one image.

        Args:
            image: Array with three dimensions (height, width, channels).

        Returns:
            The raw result of ``self.model.predict`` for a batch of one
            (the model has two outputs: ``obj_output`` and ``dir_output``).
        """
        # Unused values, but the unpacking rejects inputs that are not 3-D.
        image_h, image_w, _ = image.shape
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        # Reverse the channel axis, then add a leading batch dimension.
        input_image = image[:,:,::-1]
        input_image = np.expand_dims(input_image, 0)
        # The model requires a true_boxes input; zeros are fed at inference.
        dummy_array = np.zeros((1,1,1,1,self.max_box_per_image,4))

        netout = self.model.predict([input_image, dummy_array])

        return netout
class YOLO(object):
    """YOLO object detector wrapping a Keras model.

    The constructor does three things:
        1. obtains the feature-extractor backend,
        2. appends the detection head, which completes the model,
        3. stores parameters (some are consumed later by ``train``).

    Constructor inputs:
        input_size: int
        labels: iterable of class labels
        max_box_per_image: int
        anchors: flat sequence of anchor values, two per anchor box
        threshold: forwarded to ``decode_netout`` as ``obj_threshold`` and
            used as the default IoU threshold in ``evaluate``
        max_sur: forwarded to ``decode_netout`` as ``nms_threshold``

    Attributes:
        input_size: side length of the square input image
            (per the original notes, 416 is the YOLO default)
        labels, nb_class, nb_box (``len(anchors) // 2``)
        class_wt: per-class weights, all ones (not customised here)
        anchors, max_box_per_image, threshold, max_sur
        true_boxes: ``Input(shape=(1, 1, 1, max_box_per_image, 4))``
        feature_extractor: the backend wrapper object
        grid_h, grid_w: ``feature_extractor.get_output_shape()``
        model: ``Model([input_image, true_boxes], output)``
    """

    def __init__(self, backend, input_size, labels, max_box_per_image, anchors, threshold, max_sur):
        """Build the detection model.

        Raises:
            Exception: If ``backend`` is not one of the supported names.
        """
        self.input_size = input_size

        self.labels = list(labels)
        self.nb_class = len(self.labels)
        # Two anchor values per box.
        self.nb_box = len(anchors)//2
        # Uniform class weights; not customised here, but per-class weights
        # could be set.
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors = anchors
        self.threshold = threshold
        self.max_sur = max_sur
        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        # Input layer for the image.
        input_image = Input(shape=(self.input_size, self.input_size, 3))
        # Input layer for the ground-truth bounding boxes.
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4))

        # Pick the convolutional backbone. Per the original author's notes,
        # each of these classes is a BaseFeatureExtractor subclass defined
        # in the backend module that wraps a Keras model in its own
        # `feature_extractor` attribute - not to be confused with
        # `self.feature_extractor` here, which holds the wrapper object.
        if backend == 'Inception3':
            self.feature_extractor = Inception3Feature(self.input_size)
        elif backend == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif backend == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif backend == 'Full Yolo':
            self.feature_extractor = FullYoloFeature(self.input_size)
        elif backend == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        elif backend == 'VGG16':
            self.feature_extractor = VGG16Feature(self.input_size)
        elif backend == 'ResNet50':
            self.feature_extractor = ResNet50Feature(self.input_size)
        else:
            raise Exception('Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!')

        # Size of the output feature map, from the backend wrapper.
        print(issubclass(Model,Layer))
        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        # Connect the image input layer to the feature-extraction module.
        # `features` is the half-built model (the feature extractor only).
        # NOTE(review): the original author considered this step possibly
        # redundant because the backend already has its own input layer.
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        # 1x1 convolution producing nb_box * (4 + 1 + nb_class) channels
        # per grid cell.
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class),
                        (1,1), strides=(1,1),
                        padding='same',
                        name='DetectionLayer',
                        kernel_initializer='lecun_normal')(features)
        # One prediction vector per grid cell and anchor box.
        output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output)
        # The Lambda returns its first input unchanged; true_boxes is only
        # wired in so that it is an input of the model (custom_loss reads
        # self.true_boxes).
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)
        # Per the original notes the model is now
        #   input -> BACKEND -> Conv2D -> Reshape -> (input) -> Lambda,
        # six layers in total: the whole backend counts as one layer
        # because a Model is itself a Layer.

        # Output shape: (grid_h, grid_w, nb_box, 4 + 1 + nb_class).
        print(self.model.layers)
        print(f"number of layers:{len(self.model.layers)}")
        print(self.model.output_shape)

        # initialize the weights of the detection layer
        # Look the layer up by its explicit name instead of a positional
        # index, which silently depends on Keras' internal layer ordering.
        layer = self.model.get_layer(name='DetectionLayer')
        weights = layer.get_weights()
        print(f"weigth_2D:{weights}")
        # weights[0] is the kernel, shaped
        # (w, h, number of incoming feature maps, number of kernels);
        # weights[1] is the bias, one value per kernel.
        print(f"weigth_2D_shape:{(weights[0].shape,weights[1].shape)}")
        # Gaussian initialisation scaled down by the number of grid cells.
        new_kernel = np.random.normal(size=weights[0].shape)/(self.grid_h*self.grid_w)
        new_bias = np.random.normal(size=weights[1].shape)/(self.grid_h*self.grid_w)

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()

    def custom_loss(self, y_true, y_pred):
        """YOLO detection loss.

        Sums coordinate (xy, wh), confidence and class terms, each
        normalised by the number of positions with a non-zero mask.
        Reads ``batch_size``, ``warmup_batches``, ``coord_scale``,
        ``object_scale``, ``no_object_scale``, ``class_scale`` and
        ``debug``, which are set by :meth:`train`.

        Args:
            y_true: Ground-truth tensor; the last axis is indexed as
                ``[0:2]`` xy, ``[2:4]`` wh, ``[4]`` objectness and
                ``[5:]`` class scores.
            y_pred: Network output with the same last-axis layout.

        Returns:
            Scalar loss tensor.
        """
        mask_shape = tf.shape(y_true)[:4]

        # Per-cell column/row indices, tiled over batch and anchor boxes.
        cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0,2,1,3,4))

        cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        # Counter incremented once per evaluation of the loss, and the
        # running recall total used by the debug output.
        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1,1,1,self.nb_box,2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2] # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4] # number of cells across, horizontally and vertically

        ### adjust confidence
        # Target confidence is the IoU between each predicted box and the
        # ground-truth box at the same position, gated by objectness.
        true_wh_half = true_box_wh / 2.
        true_mins = true_box_xy - true_wh_half
        true_maxes = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins = pred_box_xy - pred_wh_half
        pred_maxes = pred_box_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        # Each prediction is compared against every box in the true_boxes
        # buffer (broadcast over the added axis 4).
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half

        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half

        intersect_mins = tf.maximum(pred_mins, true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale/2.)
        seen = tf.assign_add(seen, 1.)

        # During the first `warmup_batches` evaluations, positions without
        # a box are pulled towards the cell centre and the anchor sizes,
        # with a coordinate mask of ones.
        true_box_xy, true_box_wh, coord_mask = tf.cond(
            tf.less(seen, self.warmup_batches+1),
            lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                     true_box_wh + tf.ones_like(true_box_wh) *
                     np.reshape(self.anchors, [1,1,1,self.nb_box,2]) *
                     no_boxes_mask,
                     tf.ones_like(coord_mask)],
            lambda: [true_box_xy,
                     true_box_wh,
                     coord_mask])

        #
        # Finalize the loss
        #
        # Number of positions contributing to each term; the 1e-6 below
        # avoids a division by zero when a mask is empty.
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

        # A constant 10 is added to the loss while warming up.
        loss = tf.cond(tf.less(seen, self.warmup_batches+1),
                       lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,
                       lambda: loss_xy + loss_wh + loss_conf + loss_class)

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box/(nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            # Each tf.Print wraps the loss so the values are emitted
            # whenever the loss tensor is evaluated.
            loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
            loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
            loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
            loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
            loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
            loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
            loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path``."""
        self.model.load_weights(weight_path)

    def train(self, train_imgs,     # the list of images to train the model
                    valid_imgs,     # the list of images used to validate the model
                    train_times,    # the number of times to repeat the training set, often used for small datasets
                    valid_times,    # the number of times to repeat the validation set, often used for small datasets
                    nb_epochs,      # number of epochs
                    learning_rate,  # the learning rate
                    batch_size,     # the size of the batch
                    warmup_epochs,  # number of warm-up epochs (converted to a batch count below)
                    object_scale,
                    no_object_scale,
                    coord_scale,
                    class_scale,
                    saved_weights_name='best_weights.h5',
                    debug=False):
        """Compile and train the model, then print per-class AP and mAP.

        The four ``*_scale`` arguments, ``batch_size`` and ``debug`` are
        stored on the instance because :meth:`custom_loss` reads them.
        Training runs for ``warmup_epochs + nb_epochs`` epochs; the best
        weights by ``val_loss`` are saved to ``saved_weights_name`` and
        TensorBoard logs go to ``~/logs/``.
        """
        self.batch_size = batch_size

        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale = coord_scale
        self.class_scale = class_scale

        self.debug = debug

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H'         : self.input_size,
            'IMAGE_W'         : self.input_size,
            'GRID_H'          : self.grid_h,
            'GRID_W'          : self.grid_w,
            'BOX'             : self.nb_box,
            'LABELS'          : self.labels,
            'CLASS'           : len(self.labels),
            'ANCHORS'         : self.anchors,
            'BATCH_SIZE'      : self.batch_size,
            'TRUE_BOX_BUFFER' : self.max_box_per_image,
        }

        train_generator = BatchGenerator(train_imgs,
                                         generator_config,
                                         norm=self.feature_extractor.normalize)
        valid_generator = BatchGenerator(valid_imgs,
                                         generator_config,
                                         norm=self.feature_extractor.normalize,
                                         jitter=False)

        # Number of loss evaluations (training and validation batches)
        # that make up the warm-up phase; read by custom_loss.
        self.warmup_batches = warmup_epochs * (train_times*len(train_generator) + valid_times*len(valid_generator))

        ############################################
        # Compile the model
        ############################################

        optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        self.model.compile(loss=self.custom_loss, optimizer=optimizer)

        ############################################
        # Make a few callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   mode='min',
                                   verbose=1)
        checkpoint = ModelCheckpoint(saved_weights_name,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     period=1)
        tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/'),
                                  histogram_freq=0,
                                  #write_batch_performance=True,
                                  write_graph=True,
                                  write_images=False)

        ############################################
        # Start the training process
        ############################################

        self.model.fit_generator(generator        = train_generator,
                                 steps_per_epoch  = len(train_generator) * train_times,
                                 epochs           = warmup_epochs + nb_epochs,
                                 verbose          = 2 if debug else 1,
                                 validation_data  = valid_generator,
                                 validation_steps = len(valid_generator) * valid_times,
                                 callbacks        = [early_stop, checkpoint, tensorboard],
                                 workers          = 3,
                                 max_queue_size   = 8)

        ############################################
        # Compute mAP on the validation set
        ############################################
        average_precisions = self.evaluate(valid_generator)
        print(average_precisions)

        # print evaluation
        for label, average_precision in average_precisions.items():
            print(self.labels[label], '{:.4f}'.format(average_precision))
        # Guard the mean: with no classes there is nothing to average.
        if average_precisions:
            print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions)))

    def evaluate(self,
                 generator,
                 max_detections=100,
                 save_path=None,
                 iou_threshold=None):
        """ Evaluate this model on a dataset.

        code originally from https://github.com/fizyr/keras-retinanet

        # Arguments
            generator       : The generator that represents the dataset to
                              evaluate. Must provide size(), num_classes(),
                              load_image(i) and load_annotation(i).
            max_detections  : Currently unused.
            save_path       : Currently unused.
            iou_threshold   : Minimum overlap for a detection to match an
                              annotation. Defaults to self.threshold.
        # Returns
            A dict mapping each class index to its average precision
            (0 for classes without any annotation).
        """
        if iou_threshold is None:
            iou_threshold = self.threshold

        num_images = generator.size()
        num_classes = generator.num_classes()

        # gather all detections and annotations
        all_detections = [[None for _ in range(num_classes)] for _ in range(num_images)]
        all_annotations = [[None for _ in range(num_classes)] for _ in range(num_images)]

        for i in range(num_images):
            raw_image = generator.load_image(i)
            raw_height, raw_width, raw_channels = raw_image.shape

            # make the boxes and the labels
            pred_boxes = self.predict(raw_image)

            score = np.array([box.score for box in pred_boxes])
            pred_labels = np.array([box.label for box in pred_boxes])

            if len(pred_boxes) > 0:
                # Scale box corners by the raw image size.
                pred_boxes = np.array([[box.xmin*raw_width, box.ymin*raw_height, box.xmax*raw_width, box.ymax*raw_height, box.score] for box in pred_boxes])
            else:
                pred_boxes = np.array([[]])

            # sort the boxes and the labels according to scores
            score_sort = np.argsort(-score)
            pred_labels = pred_labels[score_sort]
            pred_boxes = pred_boxes[score_sort]

            # copy detections to all_detections
            for label in range(num_classes):
                all_detections[i][label] = pred_boxes[pred_labels == label, :]

            annotations = generator.load_annotation(i)

            # copy annotations to all_annotations
            for label in range(num_classes):
                all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()

        # compute mAP by comparing all detections and all annotations
        average_precisions = {}

        for label in range(num_classes):
            # Plain lists, converted to arrays once below: np.append copies
            # the whole array on every call.
            false_positives = []
            true_positives = []
            scores = []
            num_annotations = 0.0

            for i in range(num_images):
                detections = all_detections[i][label]
                annotations = all_annotations[i][label]
                num_annotations += annotations.shape[0]
                # Annotations already matched in this image; each one may
                # be claimed by a single detection only.
                detected_annotations = []

                for d in detections:
                    scores.append(d[4])

                    if annotations.shape[0] == 0:
                        false_positives.append(1)
                        true_positives.append(0)
                        continue

                    overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                    assigned_annotation = np.argmax(overlaps, axis=1)
                    max_overlap = overlaps[0, assigned_annotation]

                    if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                        false_positives.append(0)
                        true_positives.append(1)
                        detected_annotations.append(assigned_annotation)
                    else:
                        false_positives.append(1)
                        true_positives.append(0)

            # no annotations -> AP for this class is 0 (is this correct?)
            if num_annotations == 0:
                average_precisions[label] = 0
                continue

            scores = np.array(scores, dtype=np.float64)
            false_positives = np.array(false_positives, dtype=np.float64)
            true_positives = np.array(true_positives, dtype=np.float64)

            # sort by score
            indices = np.argsort(-scores)
            false_positives = false_positives[indices]
            true_positives = true_positives[indices]

            # compute false positives and true positives
            false_positives = np.cumsum(false_positives)
            true_positives = np.cumsum(true_positives)

            # compute recall and precision
            recall = true_positives / num_annotations
            precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)

            # compute average precision
            average_precision = compute_ap(recall, precision)
            average_precisions[label] = average_precision

        return average_precisions

    def predict(self, image):
        """Run the detector on one image and return the decoded boxes.

        Args:
            image: Array with three dimensions (height, width, channels).

        Returns:
            The result of ``decode_netout`` with ``self.threshold`` as
            ``obj_threshold`` and ``self.max_sur`` as ``nms_threshold``.
        """
        # Unused values, but the unpacking rejects inputs that are not 3-D.
        image_h, image_w, _ = image.shape
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        # Reverse the channel axis, then add a leading batch dimension.
        input_image = image[:,:,::-1]
        input_image = np.expand_dims(input_image, 0)
        # The model requires a true_boxes input; zeros are fed at inference.
        dummy_array = np.zeros((1,1,1,1,self.max_box_per_image,4))

        netout = self.model.predict([input_image, dummy_array])[0]
        # print(netout)
        boxes = decode_netout(netout, self.anchors, self.nb_class,
                              obj_threshold=self.threshold,
                              nms_threshold=self.max_sur)

        return boxes
class YOLO(object):
    """YOLO-style detector: a feature-extractor backend plus a 1x1 detection head.

    The constructor builds the Keras model; ``train`` compiles and fits it with
    ``custom_loss``; ``predict`` runs a single image through the network and
    decodes the output; ``evaluate`` computes per-class average precision.
    """

    def __init__(self, backend,
                       input_size,
                       labels,
                       max_box_per_image,
                       anchors):
        """Build the detection model.

        # Arguments
            backend           : Name of the feature extractor; one of
                                'Inception3', 'SqueezeNet', 'MobileNet',
                                'Full Yolo', 'Tiny Yolo', 'VGG16', 'ResNet50'.
            input_size        : Side length of the square network input.
            labels            : Iterable of class labels.
            max_box_per_image : Size of the ground-truth box buffer fed to the
                                second model input.
            anchors           : Flat sequence of anchor values, two per box
                                (``nb_box = len(anchors) // 2``).

        # Raises
            Exception : If ``backend`` is not one of the supported names.
        """
        self.input_size = input_size

        self.labels   = list(labels)
        self.nb_class = len(self.labels)
        self.nb_box   = len(anchors)//2
        self.class_wt = np.ones(self.nb_class, dtype='float32')
        self.anchors  = anchors

        self.max_box_per_image = max_box_per_image

        ##########################
        # Make the model
        ##########################

        # make the feature extractor layers
        input_image     = Input(shape=(self.input_size, self.input_size, 3))
        self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4))

        # The chain is kept as if/elif so that only the selected backend's
        # class name is looked up.
        if backend == 'Inception3':
            self.feature_extractor = Inception3Feature(self.input_size)
        elif backend == 'SqueezeNet':
            self.feature_extractor = SqueezeNetFeature(self.input_size)
        elif backend == 'MobileNet':
            self.feature_extractor = MobileNetFeature(self.input_size)
        elif backend == 'Full Yolo':
            self.feature_extractor = FullYoloFeature(self.input_size)
        elif backend == 'Tiny Yolo':
            self.feature_extractor = TinyYoloFeature(self.input_size)
        elif backend == 'VGG16':
            self.feature_extractor = VGG16Feature(self.input_size)
        elif backend == 'ResNet50':
            self.feature_extractor = ResNet50Feature(self.input_size)
        else:
            raise Exception('Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!')

        print(self.feature_extractor.get_output_shape())
        self.grid_h, self.grid_w = self.feature_extractor.get_output_shape()
        features = self.feature_extractor.extract(input_image)

        # make the object detection layer
        output = Conv2D(self.nb_box * (4 + 1 + self.nb_class),
                        (1,1), strides=(1,1),
                        padding='same',
                        name='DetectionLayer',
                        kernel_initializer='lecun_normal')(features)
        output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output)
        # Pass-through layer: returns the detection output unchanged while
        # wiring true_boxes into the graph as a model input (custom_loss reads
        # it through self.true_boxes).
        output = Lambda(lambda args: args[0])([output, self.true_boxes])

        self.model = Model([input_image, self.true_boxes], output)

        # initialize the weights of the detection layer
        layer = self.model.layers[-4]
        weights = layer.get_weights()

        new_kernel = np.random.normal(size=weights[0].shape)/(self.grid_h*self.grid_w)
        new_bias   = np.random.normal(size=weights[1].shape)/(self.grid_h*self.grid_w)

        layer.set_weights([new_kernel, new_bias])

        # print a summary of the whole model
        self.model.summary()

    def custom_loss(self, y_true, y_pred):
        """Detection loss: coordinate + confidence + class terms.

        Reads ``self.batch_size``, ``self.coord_scale``, ``self.object_scale``,
        ``self.no_object_scale``, ``self.class_scale``, ``self.warmup_batches``
        and ``self.debug``, all of which are assigned in ``train``.

        # Arguments
            y_true : Ground-truth tensor; the last axis is indexed as
                     [0:2] box centre, [2:4] box size, [4] objectness,
                     [5:] class scores (reduced with argmax).
            y_pred : Raw network output with the same last-axis layout.

        # Returns
            Scalar loss tensor.
        """
        mask_shape = tf.shape(y_true)[:4]

        cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1)))
        cell_y = tf.transpose(cell_x, (0,2,1,3,4))

        cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1])

        coord_mask = tf.zeros(mask_shape)
        conf_mask  = tf.zeros(mask_shape)
        class_mask = tf.zeros(mask_shape)

        seen = tf.Variable(0.)
        total_recall = tf.Variable(0.)

        #
        # Adjust prediction
        #
        ### adjust x and y
        pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

        ### adjust w and h
        pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1,1,1,self.nb_box,2])

        ### adjust confidence
        pred_box_conf = tf.sigmoid(y_pred[..., 4])

        ### adjust class probabilities
        pred_box_class = y_pred[..., 5:]

        #
        # Adjust ground truth
        #
        ### adjust x and y
        true_box_xy = y_true[..., 0:2] # relative position to the containing cell

        ### adjust w and h
        true_box_wh = y_true[..., 2:4] # number of cells across, horizontally and vertically

        ### adjust confidence
        true_wh_half = true_box_wh / 2.
        true_mins    = true_box_xy - true_wh_half
        true_maxes   = true_box_xy + true_wh_half

        pred_wh_half = pred_box_wh / 2.
        pred_mins    = pred_box_xy - pred_wh_half
        pred_maxes   = pred_box_xy + pred_wh_half

        intersect_mins  = tf.maximum(pred_mins,  true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh    = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
        pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores  = tf.truediv(intersect_areas, union_areas)

        true_box_conf = iou_scores * y_true[..., 4]

        ### adjust class probabilities
        true_box_class = tf.argmax(y_true[..., 5:], -1)

        #
        # Determine the masks
        #
        ### coordinate mask: simply the position of the ground truth boxes (the predictors)
        coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

        ### confidence mask: penalize predictors + penalize boxes with low IOU
        # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
        true_xy = self.true_boxes[..., 0:2]
        true_wh = self.true_boxes[..., 2:4]

        true_wh_half = true_wh / 2.
        true_mins    = true_xy - true_wh_half
        true_maxes   = true_xy + true_wh_half

        pred_xy = tf.expand_dims(pred_box_xy, 4)
        pred_wh = tf.expand_dims(pred_box_wh, 4)

        pred_wh_half = pred_wh / 2.
        pred_mins    = pred_xy - pred_wh_half
        pred_maxes   = pred_xy + pred_wh_half

        intersect_mins  = tf.maximum(pred_mins,  true_mins)
        intersect_maxes = tf.minimum(pred_maxes, true_maxes)
        intersect_wh    = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

        true_areas = true_wh[..., 0] * true_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores  = tf.truediv(intersect_areas, union_areas)

        best_ious = tf.reduce_max(iou_scores, axis=4)
        conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale

        # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
        conf_mask = conf_mask + y_true[..., 4] * self.object_scale

        ### class mask: simply the position of the ground truth boxes (the predictors)
        class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale

        #
        # Warm-up training
        #
        # While fewer than warmup_batches + 1 batches have been seen, cells
        # without a ground-truth box get a target at the cell centre with the
        # anchor's size, and every coordinate is weighted 1.
        no_boxes_mask = tf.to_float(coord_mask < self.coord_scale/2.)
        seen = tf.assign_add(seen, 1.)

        true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, self.warmup_batches+1),
                              lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                                       true_box_wh + tf.ones_like(true_box_wh) * \
                                       np.reshape(self.anchors, [1,1,1,self.nb_box,2]) * \
                                       no_boxes_mask,
                                       tf.ones_like(coord_mask)],
                              lambda: [true_box_xy,
                                       true_box_wh,
                                       coord_mask])

        #
        # Finalize the loss
        #
        nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
        nb_conf_box  = tf.reduce_sum(tf.to_float(conf_mask  > 0.0))
        nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

        # The 1e-6 terms keep the normalisation finite when a mask is all zero.
        loss_xy    = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy)     * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_wh    = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh)     * coord_mask) / (nb_coord_box + 1e-6) / 2.
        loss_conf  = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask)  / (nb_conf_box  + 1e-6) / 2.
        loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
        loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

        # A constant 10 is added to the loss during the warm-up batches.
        loss = tf.cond(tf.less(seen, self.warmup_batches+1),
                      lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,
                      lambda: loss_xy + loss_wh + loss_conf + loss_class)

        if self.debug:
            nb_true_box = tf.reduce_sum(y_true[..., 4])
            nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))

            current_recall = nb_pred_box/(nb_true_box + 1e-6)
            total_recall = tf.assign_add(total_recall, current_recall)

            loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
            loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
            loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
            loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
            loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
            loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
            loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000)

        return loss

    def load_weights(self, weight_path):
        """Load model weights from ``weight_path`` into ``self.model``."""
        self.model.load_weights(weight_path)

    def train(self, train_imgs,     # the list of images to train the model
                    valid_imgs,     # the list of images used to validate the model
                    train_times,    # the number of times to repeat the training set, often used for small datasets
                    valid_times,    # the number of times to repeat the validation set, often used for small datasets
                    nb_epochs,      # number of epochs (run after the warm-up epochs)
                    learning_rate,  # the learning rate
                    batch_size,     # the size of the batch
                    warmup_epochs,  # number of initial epochs to let the model familiarize with the new dataset
                    object_scale,
                    no_object_scale,
                    coord_scale,
                    class_scale,
                    saved_weights_name='best_weights.h5',
                    debug=False):
        """Compile and fit the model, then print per-class AP and mAP.

        Stores the loss hyper-parameters on ``self`` (they are read by
        ``custom_loss``), builds the train/validation batch generators, fits
        for ``warmup_epochs + nb_epochs`` epochs with early stopping,
        checkpointing and TensorBoard callbacks, and finally evaluates on the
        validation generator.
        """
        self.batch_size = batch_size

        self.object_scale    = object_scale
        self.no_object_scale = no_object_scale
        self.coord_scale     = coord_scale
        self.class_scale     = class_scale

        self.debug = debug

        ############################################
        # Make train and validation generators
        ############################################

        generator_config = {
            'IMAGE_H'         : self.input_size,
            'IMAGE_W'         : self.input_size,
            'GRID_H'          : self.grid_h,
            'GRID_W'          : self.grid_w,
            'BOX'             : self.nb_box,
            'LABELS'          : self.labels,
            'CLASS'           : len(self.labels),
            'ANCHORS'         : self.anchors,
            'BATCH_SIZE'      : self.batch_size,
            'TRUE_BOX_BUFFER' : self.max_box_per_image,
        }

        train_generator = BatchGenerator(train_imgs,
                                     generator_config,
                                     norm=self.feature_extractor.normalize)
        valid_generator = BatchGenerator(valid_imgs,
                                     generator_config,
                                     norm=self.feature_extractor.normalize,
                                     jitter=False)

        # Number of loss evaluations covered by the warm-up phase: the
        # per-epoch training plus validation step count, times warmup_epochs.
        self.warmup_batches  = warmup_epochs * (train_times*len(train_generator) + valid_times*len(valid_generator))

        ############################################
        # Compile the model
        ############################################

        optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        self.model.compile(loss=self.custom_loss, optimizer=optimizer)

        ############################################
        # Make a few callbacks
        ############################################

        early_stop = EarlyStopping(monitor='val_loss',
                           min_delta=0.001,
                           patience=3,
                           mode='min',
                           verbose=1)
        checkpoint = ModelCheckpoint(saved_weights_name,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     period=1)
        tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/'),
                                  histogram_freq=0,
                                  write_graph=True,
                                  write_images=False)

        ############################################
        # Start the training process
        ############################################

        self.model.fit_generator(generator        = train_generator,
                                 steps_per_epoch  = len(train_generator) * train_times,
                                 epochs           = warmup_epochs + nb_epochs,
                                 verbose          = 2 if debug else 1,
                                 validation_data  = valid_generator,
                                 validation_steps = len(valid_generator) * valid_times,
                                 callbacks        = [early_stop, checkpoint, tensorboard],
                                 workers          = 3,
                                 max_queue_size   = 8)

        ############################################
        # Compute mAP on the validation set
        ############################################
        average_precisions = self.evaluate(valid_generator)

        # print evaluation
        for label, average_precision in average_precisions.items():
            print(self.labels[label], '{:.4f}'.format(average_precision))
        print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions)))

    def evaluate(self,
                 generator,
                 iou_threshold=0.3,
                 score_threshold=0.3,
                 max_detections=100,
                 save_path=None):
        """ Evaluate a given dataset using this model.

        code originally from https://github.com/fizyr/keras-retinanet

        # Arguments
            generator       : The generator that represents the dataset to evaluate.
                              Must provide size(), num_classes(), load_image(i)
                              and load_annotation(i).
            iou_threshold   : Minimum overlap for a detection to count as a
                              true positive.
            score_threshold : Accepted for interface compatibility; not used
                              by this implementation.
            max_detections  : Accepted for interface compatibility; not used
                              by this implementation.
            save_path       : Accepted for interface compatibility; not used
                              by this implementation.
        # Returns
            A dict mapping each integer class index to its average precision.
            Classes without any annotation get 0.
        """
        num_images  = generator.size()
        num_classes = generator.num_classes()

        # gather all detections and annotations
        all_detections  = [[None for _ in range(num_classes)] for _ in range(num_images)]
        all_annotations = [[None for _ in range(num_classes)] for _ in range(num_images)]

        for i in range(num_images):
            raw_image = generator.load_image(i)
            raw_height, raw_width, _ = raw_image.shape

            # make the boxes and the labels
            pred_boxes = self.predict(raw_image)

            score = np.array([box.score for box in pred_boxes])
            pred_labels = np.array([box.label for box in pred_boxes])

            if len(pred_boxes) > 0:
                # box coordinates are scaled by the raw image size here
                pred_boxes = np.array([[box.xmin*raw_width, box.ymin*raw_height, box.xmax*raw_width, box.ymax*raw_height, box.score] for box in pred_boxes])
            else:
                # explicit empty (0 detections x 5 columns) placeholder
                pred_boxes = np.zeros((0, 5))

            # sort the boxes and the labels according to scores
            score_sort = np.argsort(-score)
            pred_labels = pred_labels[score_sort]
            pred_boxes  = pred_boxes[score_sort]

            # copy detections to all_detections
            for label in range(num_classes):
                all_detections[i][label] = pred_boxes[pred_labels == label, :]

            annotations = generator.load_annotation(i)

            # copy annotations to all_annotations (column 4 is the class index)
            for label in range(num_classes):
                all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()

        # compute mAP by comparing all detections and all annotations
        average_precisions = {}

        for label in range(num_classes):
            # Plain lists are appended to and converted once below; repeated
            # np.append would copy the whole array on every detection.
            false_positives = []
            true_positives  = []
            scores          = []
            num_annotations = 0.0

            for i in range(num_images):
                detections  = all_detections[i][label]
                annotations = all_annotations[i][label]
                num_annotations += annotations.shape[0]
                # indices of annotations already claimed by an earlier
                # (higher-scoring) detection in this image
                detected_annotations = set()

                for d in detections:
                    scores.append(d[4])

                    matched = False
                    if annotations.shape[0] > 0:
                        overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                        assigned_annotation = int(np.argmax(overlaps, axis=1)[0])
                        max_overlap = overlaps[0, assigned_annotation]

                        if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                            detected_annotations.add(assigned_annotation)
                            matched = True

                    true_positives.append(1 if matched else 0)
                    false_positives.append(0 if matched else 1)

            # no annotations -> AP for this class is 0 (is this correct?)
            if num_annotations == 0:
                average_precisions[label] = 0
                continue

            # sort by score
            indices         = np.argsort(-np.array(scores, dtype=np.float64))
            false_positives = np.array(false_positives, dtype=np.float64)[indices]
            true_positives  = np.array(true_positives, dtype=np.float64)[indices]

            # compute false positives and true positives
            false_positives = np.cumsum(false_positives)
            true_positives  = np.cumsum(true_positives)

            # compute recall and precision
            recall    = true_positives / num_annotations
            precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)

            # compute average precision
            average_precision  = compute_ap(recall, precision)
            average_precisions[label] = average_precision

        return average_precisions

    def predict(self, image):
        """Run the detector on one image and return the decoded boxes.

        # Arguments
            image : Array with three axes (height, width, channels).
        # Returns
            Whatever ``decode_netout`` returns for the network output.
        """
        # The unpacked values are not used below; the unpacking itself
        # requires ``image`` to have exactly three axes.
        image_h, image_w, _ = image.shape
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = self.feature_extractor.normalize(image)

        # reverse the channel axis and add a leading batch axis
        input_image = image[:,:,::-1]
        input_image = np.expand_dims(input_image, 0)
        # all-zero placeholder for the model's true_boxes input
        dummy_array = np.zeros((1,1,1,1,self.max_box_per_image,4))

        netout = self.model.predict([input_image, dummy_array])[0]
        boxes  = decode_netout(netout, self.anchors, self.nb_class)

        return boxes