def __call__(self, img):
    """Run the detector on one BGR image and return NMS-filtered detections.

    Args:
        img: H x W x C image in OpenCV BGR layout.

    Returns:
        List of detection dicts (``label``, ``probs``, ``conf``,
        ``objectness``, ``box``) surviving non-maximum suppression at
        ``self.iou_thresh``; boxes are in original-image pixel coordinates.
    """
    orig_input_height, orig_input_width, _ = img.shape
    #img = cv2.resize(orig_img, (640, 640))

    # preprocess: BGR -> RGB, scale to [0, 1], HWC -> CHW
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.asarray(img, dtype=np.float32) / 255.0
    img = img.transpose(2, 0, 1)

    # forward
    x = Variable(img[np.newaxis, :, :, :])
    if self.gpu >= 0:
        x.to_gpu()
    x, y, w, h, conf, prob = self.model.predict(x)

    # parse results: collapse the batch dimension (batch size is 1 here)
    _, _, _, grid_h, grid_w = x.shape
    x = F.reshape(x, (self.n_boxes, grid_h, grid_w)).data
    y = F.reshape(y, (self.n_boxes, grid_h, grid_w)).data
    w = F.reshape(w, (self.n_boxes, grid_h, grid_w)).data
    h = F.reshape(h, (self.n_boxes, grid_h, grid_w)).data
    conf = F.reshape(conf, (self.n_boxes, grid_h, grid_w)).data
    prob = F.transpose(
        F.reshape(prob, (self.n_boxes, self.n_classes_yolo, grid_h, grid_w)),
        (1, 0, 2, 3)).data
    # an anchor/cell is "detected" when its best class score (conf * prob)
    # exceeds the detection threshold
    detected_indices = (conf * prob).max(axis=0) > self.detection_thresh

    if self.gpu >= 0:
        x = cuda.to_cpu(x)
        y = cuda.to_cpu(y)
        w = cuda.to_cpu(w)
        h = cuda.to_cpu(h)
        conf = cuda.to_cpu(conf)
        prob = cuda.to_cpu(prob)
        detected_indices = cuda.to_cpu(detected_indices)

    # hoist the masked views out of the loop: the original recomputed the
    # transpose and every boolean indexing once per detection
    det_probs = prob.transpose(1, 2, 3, 0)[detected_indices]
    det_conf = conf[detected_indices]
    det_x = x[detected_indices]
    det_y = y[detected_indices]
    det_w = w[detected_indices]
    det_h = h[detected_indices]

    results = []
    for i in range(int(detected_indices.sum())):
        results.append({
            "label": self.labels[det_probs[i].argmax()],
            "probs": det_probs[i],
            "conf": det_conf[i],
            "objectness": det_conf[i] * det_probs[i].max(),
            # box params are relative to the input; rescale to original pixels
            "box": Box(det_x[i] * orig_input_width,
                       det_y[i] * orig_input_height,
                       det_w[i] * orig_input_width,
                       det_h[i] * orig_input_height).crop_region(
                           orig_input_height, orig_input_width)
        })

    # nms
    return nms(results, self.iou_thresh)
def __call__(self, input_x, t, ignore_t):
    """Compute YOLOv2-style training losses for one batch.

    Args:
        input_x: input batch (array or ``chainer.Variable``).
        t: per-image lists of ground-truth dicts with keys
            ``"x"``, ``"y"``, ``"w"``, ``"h"`` (coordinates relative to the
            image) and ``"label"``.
        ignore_t: per-image lists of regions that must be treated as
            neither positive nor negative for the confidence loss.

    Returns:
        Tuple ``(x_loss, y_loss, w_loss, h_loss, c_loss, p_loss)``.
    """
    if isinstance(input_x, chainer.Variable):
        device = cuda.get_device(input_x.data)
    else:
        device = cuda.get_device(input_x)
    xp = self.predictor.xp
    with device:
        output = self.predictor(input_x)
        batch_size, _, grid_h, grid_w = output.shape
        self.seen += batch_size
        # split the raw head output into per-anchor x/y/w/h, objectness and
        # class scores: (B, n_boxes, n_classes + 5, grid_h, grid_w)
        x, y, w, h, conf, prob = F.split_axis(F.reshape(
            output, (batch_size, self.predictor.n_boxes,
                     self.predictor.n_classes + 5, grid_h, grid_w)),
            (1, 2, 3, 4, 5), axis=2)
        x = F.sigmoid(x)
        y = F.sigmoid(y)
        conf = F.sigmoid(conf)
        prob = F.transpose(prob, (0, 2, 1, 3, 4))
        prob = F.softmax(prob)

        # training labels: default targets pull boxes toward cell centers
        # (x=y=0.5) with anchor-sized w/h (tw=th=0 so e^w = e^h = 1)
        tw = np.zeros(w.shape, dtype=np.float32)
        th = np.zeros(h.shape, dtype=np.float32)
        tx = np.tile(0.5, x.shape).astype(np.float32)
        ty = np.tile(0.5, y.shape).astype(np.float32)
        # set low learning rate for bounding boxes that have no object
        if self.seen < self.unstable_seen:
            box_learning_scale = np.tile(0.1, x.shape).astype(np.float32)
        else:
            box_learning_scale = np.tile(0, x.shape).astype(np.float32)
        tconf = np.zeros(conf.shape, dtype=np.float32)
        conf_learning_scale = np.zeros(conf.shape, dtype=np.float32)
        # pristine CPU copy of the predicted confidences for the tconf
        # assignments; conf_data is a working copy that gets zeroed below
        # (fixes the CPU path: the original called conf.data.get(), which
        # only exists on GPU arrays)
        conf_cpu = cuda.to_cpu(conf.data)
        conf_data = conf_cpu.copy()
        tprob = prob.data.copy()

        # per-cell offsets and per-anchor sizes, broadcast to box-map shape
        x_shift = np.broadcast_to(
            np.arange(grid_w, dtype=np.float32), x.shape[1:])
        y_shift = np.broadcast_to(
            np.arange(grid_h, dtype=np.float32).reshape(grid_h, 1),
            y.shape[1:])
        w_anchor = np.broadcast_to(
            np.reshape(
                np.array(self.anchors, dtype=np.float32)[:, 0],
                (self.predictor.n_boxes, 1, 1, 1)), w.shape[1:])
        h_anchor = np.broadcast_to(
            np.reshape(
                np.array(self.anchors, dtype=np.float32)[:, 1],
                (self.predictor.n_boxes, 1, 1, 1)), h.shape[1:])
        x_data = cuda.to_cpu(x.data)
        y_data = cuda.to_cpu(y.data)
        w_data = cuda.to_cpu(w.data)
        h_data = cuda.to_cpu(h.data)

        def _best_ious(truth_lists):
            """Per-image max IOU of every predicted box vs. truth_lists.

            Returns an array shaped like one batch of the conf map; zeros
            where an image has no truth boxes. (The original duplicated
            this loop verbatim for t and ignore_t.)
            """
            best = []
            for batch in range(batch_size):
                box_x = (x_data[batch] + x_shift) / grid_w
                box_y = (y_data[batch] + y_shift) / grid_h
                box_w = np.exp(w_data[batch]) * w_anchor / grid_w
                box_h = np.exp(h_data[batch]) * h_anchor / grid_h
                ious = []
                for truth_box in truth_lists[batch]:
                    truth_box_x = np.broadcast_to(
                        np.array(truth_box["x"], dtype=np.float32),
                        box_x.shape)
                    truth_box_y = np.broadcast_to(
                        np.array(truth_box["y"], dtype=np.float32),
                        box_y.shape)
                    truth_box_w = np.broadcast_to(
                        np.array(truth_box["w"], dtype=np.float32),
                        box_w.shape)
                    truth_box_h = np.broadcast_to(
                        np.array(truth_box["h"], dtype=np.float32),
                        box_h.shape)
                    ious.append(
                        multi_box_iou(
                            Box(box_x, box_y, box_w, box_h),
                            Box(truth_box_x, truth_box_y,
                                truth_box_w, truth_box_h)))
                if len(ious) > 0:
                    best.append(np.max(np.asarray(ious), axis=0))
                else:
                    best.append(np.zeros_like(x_data[0]))
            return np.array(best)

        # keep confidence of anchor that has more confidence than threshold
        positive = _best_ious(t) > self.thresh
        tconf[positive] = conf_cpu[positive]
        conf_learning_scale[positive] = 0
        conf_data[positive] = 0

        # ignored regions are not considered either positive or negative
        ignored = _best_ious(ignore_t) > self.ignore_thresh
        tconf[ignored] = conf_cpu[ignored]
        conf_learning_scale[ignored] = 0
        conf_data[ignored] = 0

        # adjust x, y, w, h, conf, prob of anchor boxes that have objects
        abs_anchors = self.anchors / np.array([grid_w, grid_h])
        for batch in range(batch_size):
            for truth_box in t[batch]:
                truth_w = int(float(truth_box["x"]) * grid_w)
                truth_h = int(float(truth_box["y"]) * grid_h)
                truth_n = 0
                best_iou = 0.0
                # pick the anchor whose shape best matches the truth box
                for anchor_index, abs_anchor in enumerate(abs_anchors):
                    iou = box_iou(
                        Box(0, 0, float(truth_box["w"]),
                            float(truth_box["h"])),
                        Box(0, 0, abs_anchor[0], abs_anchor[1]))
                    if best_iou < iou:
                        best_iou = iou
                        truth_n = anchor_index
                box_learning_scale[batch, truth_n, :, truth_h, truth_w] = 1.0
                tx[batch, truth_n, :, truth_h, truth_w] = \
                    float(truth_box["x"]) * grid_w - truth_w
                ty[batch, truth_n, :, truth_h, truth_w] = \
                    float(truth_box["y"]) * grid_h - truth_h
                tw[batch, truth_n, :, truth_h, truth_w] = np.log(
                    float(truth_box["w"]) / abs_anchors[truth_n][0])
                th[batch, truth_n, :, truth_h, truth_w] = np.log(
                    float(truth_box["h"]) / abs_anchors[truth_n][1])
                # one-hot class target on the responsible anchor
                tprob[batch, :, truth_n, truth_h, truth_w] = 0
                tprob[batch, int(truth_box["label"]), truth_n,
                      truth_h, truth_w] = 1

                # confidence target is the current IOU between the
                # responsible anchor's predicted box and the truth box
                full_truth_box = Box(float(truth_box["x"]),
                                     float(truth_box["y"]),
                                     float(truth_box["w"]),
                                     float(truth_box["h"]))
                # use the existing CPU copies instead of per-element
                # .data.get() (which crashed on CPU and forced one GPU
                # transfer per scalar)
                predicted_box = Box(
                    (x_data[batch, truth_n, 0, truth_h, truth_w]
                     + truth_w) / grid_w,
                    (y_data[batch, truth_n, 0, truth_h, truth_w]
                     + truth_h) / grid_h,
                    np.exp(w_data[batch, truth_n, 0, truth_h, truth_w])
                    * abs_anchors[truth_n][0],
                    np.exp(h_data[batch, truth_n, 0, truth_h, truth_w])
                    * abs_anchors[truth_n][1])
                predicted_iou = box_iou(full_truth_box, predicted_box)
                tconf[batch, truth_n, :, truth_h, truth_w] = predicted_iou
                conf_learning_scale[batch, truth_n, :, truth_h, truth_w] = 10.0
                conf_data[batch, truth_n, :, truth_h, truth_w] = 0

        # hard-negative mining: also train confidence on the top remaining
        # (most confident, not yet assigned) predictions per image
        n_all = np.prod(conf_learning_scale.shape[1:])
        for batch in range(batch_size):
            n_truth_boxes = len(t[batch])
            n_top = np.maximum(n_truth_boxes * 3, 6)
            ids = np.argsort(conf_data[batch].ravel())
            flags = np.zeros(n_all, dtype=bool)
            flags[ids[-n_top:]] = True
            conf_learning_scale[batch][flags.reshape(
                conf_learning_scale[batch].shape)] = 10.0

        # move targets to the device only when the model runs on GPU
        # (the original called to_gpu unconditionally, breaking CPU runs)
        if xp is not np:
            tx = cuda.to_gpu(tx)
            ty = cuda.to_gpu(ty)
            tw = cuda.to_gpu(tw)
            th = cuda.to_gpu(th)
            tconf = cuda.to_gpu(tconf)
            tprob = cuda.to_gpu(tprob)
            box_learning_scale = cuda.to_gpu(box_learning_scale)
            conf_learning_scale = cuda.to_gpu(conf_learning_scale)

        # squared-error losses masked by the per-element learning scales
        x_loss = F.sum((tx - x)**2 * box_learning_scale) / 2
        y_loss = F.sum((ty - y)**2 * box_learning_scale) / 2
        w_loss = F.sum((tw - w)**2 * box_learning_scale) / 2
        h_loss = F.sum((th - h)**2 * box_learning_scale) / 2
        c_loss = F.sum((tconf - conf)**2 * conf_learning_scale) / 2
        p_loss = F.sum((tprob - prob)**2) / 2
        return x_loss, y_loss, w_loss, h_loss, c_loss, p_loss
def __call__(self, input_x, t, train=True):
    """Compute the training loss: FCN softmax cross-entropy when self.FCN
    is set, otherwise the YOLOv2-style detection loss over the YOLO head.

    Args:
        input_x: input batch for the predictor.
        t: targets. For the FCN branch, segmentation labels. For the YOLO
            branch, per-image rows of [label, x, y, w, h]; rows whose first
            element equals 10.0 appear to be padding — TODO confirm.
        train: when False on the FCN branch, returns softmax probabilities
            instead of a loss.

    Returns:
        A scalar loss Variable (or softmax output on the FCN eval path).
    """
    output_fcn, output_yolo = self.predictor(input_x, train=train)
    if self.FCN:
        if train:
            loss_fcn = F.softmax_cross_entropy(output_fcn, t)
            reporter.report({'loss': loss_fcn}, self)
            return loss_fcn
        else:
            # eval path: return class probabilities, not a loss
            loss = F.softmax(output_fcn)
            return loss
    batch_size, _, grid_h, grid_w = output_yolo.shape
    self.seen += batch_size
    # split the YOLO head output into per-anchor x/y/w/h, objectness and
    # class scores: (B, n_boxes, n_classes_yolo + 5, grid_h, grid_w)
    x, y, w, h, conf, prob = F.split_axis(F.reshape(
        output_yolo,
        (batch_size, self.predictor.n_boxes,
         self.predictor.n_classes_yolo + 5, grid_h, grid_w)),
        (1, 2, 3, 4, 5), axis=2)
    x = F.sigmoid(x)
    y = F.sigmoid(y)
    conf = F.sigmoid(conf)
    prob = F.transpose(prob, (0, 2, 1, 3, 4))
    prob = F.softmax(prob)

    # train w and h toward 0 (e^w and e^h approach 1, i.e. the responsible
    # bbox keeps the anchor's scale)
    tw = np.zeros(w.shape, dtype=np.float32)
    th = np.zeros(h.shape, dtype=np.float32)
    # train the activated x and y toward 0.5 (cell center)
    tx = np.tile(0.5, x.shape).astype(np.float32)
    ty = np.tile(0.5, y.shape).astype(np.float32)
    if self.seen < self.unstable_seen:
        box_learning_scale = np.tile(0.1, x.shape).astype(np.float32)
    else:
        box_learning_scale = np.tile(0, x.shape).astype(np.float32)
    # confidence truth defaults to 0; anchors whose IOU exceeds the
    # threshold are excluded from training; only the best box of a grid
    # cell that contains an object is pushed toward the true IOU
    tconf = np.zeros(
        conf.shape, dtype=np.float32
    )
    conf_learning_scale = np.tile(0.1, conf.shape).astype(np.float32)
    # anchors other than the best one are not trained (squared error
    # against their own prediction = 0)
    tprob = prob.data.copy()

    # compute the IOU of every bbox against the truth boxes (per batch)
    x_shift = Variable(
        np.broadcast_to(np.arange(grid_w, dtype=np.float32), x.shape[1:]))
    y_shift = Variable(
        np.broadcast_to(
            np.arange(grid_h, dtype=np.float32).reshape(grid_h, 1),
            y.shape[1:]))
    w_anchor = Variable(
        np.broadcast_to(
            np.reshape(
                np.array(self.anchors, dtype=np.float32)[:, 0],
                (self.predictor.n_boxes, 1, 1, 1)), w.shape[1:]))
    h_anchor = Variable(
        np.broadcast_to(
            np.reshape(
                np.array(self.anchors, dtype=np.float32)[:, 1],
                (self.predictor.n_boxes, 1, 1, 1)), h.shape[1:]))
    # NOTE(review): unconditional to_gpu assumes a GPU is available
    x_shift.to_gpu(), y_shift.to_gpu(), w_anchor.to_gpu(), h_anchor.to_gpu(
    )
    best_ious = []
    for batch in range(batch_size):
        #n_truth_boxes = len(t[batch])
        # count rows that are not the 10.0 padding sentinel — presumably
        # the number of real truth boxes; TODO confirm sentinel semantics
        n_truth_boxes = int(sum(x[0] != 10.0 for x in t[batch]))  # ??
        box_x = (x[batch] + x_shift) / grid_w
        box_y = (y[batch] + y_shift) / grid_h
        box_w = F.exp(w[batch]) * w_anchor / grid_w
        box_h = F.exp(h[batch]) * h_anchor / grid_h
        ious = []
        for truth_index in range(n_truth_boxes):
            # NOTE(review): rebinds t to a CPU copy on every iteration —
            # wasteful after the first pass; looks safe but verify
            t = chainer.cuda.to_cpu(t)  # ??
            # truth row layout: [label, x, y, w, h]
            truth_box_x = Variable(
                np.broadcast_to(
                    np.array(t[batch][truth_index][1], dtype=np.float32),
                    box_x.shape))
            truth_box_y = Variable(
                np.broadcast_to(
                    np.array(t[batch][truth_index][2], dtype=np.float32),
                    box_y.shape))
            truth_box_w = Variable(
                np.broadcast_to(
                    np.array(t[batch][truth_index][3], dtype=np.float32),
                    box_w.shape))
            truth_box_h = Variable(
                np.broadcast_to(
                    np.array(t[batch][truth_index][4], dtype=np.float32),
                    box_h.shape))
            truth_box_x.to_gpu(), truth_box_y.to_gpu(), truth_box_w.to_gpu(
            ), truth_box_h.to_gpu()
            ious.append(
                multi_box_iou(
                    Box(box_x, box_y, box_w, box_h),
                    Box(truth_box_x, truth_box_y, truth_box_w,
                        truth_box_h)).data.get())
        if ious:
            ious = np.array(ious)
            best_ious.append(np.max(ious, axis=0))
        else:
            # NOTE(review): appends a scalar 0 while the other branch
            # appends an array — np.array(best_ious) becomes ragged when
            # images with and without boxes are mixed; verify
            best_ious.append(0)
    best_ious = np.array(best_ious)

    # for anchors whose IOU exceeds the threshold, do not push conf down
    # to 0 (grid cells around a truth box keep their predicted conf)
    # NOTE(review): conf.data.get() requires a GPU array (cupy)
    tconf[best_ious > self.thresh] = conf.data.get()[
        best_ious > self.thresh]
    conf_learning_scale[best_ious > self.thresh] = 0

    # individually correct x, y, w, h, conf, prob only for the anchor
    # boxes that contain an object
    abs_anchors = self.anchors / np.array([grid_w, grid_h])
    for batch in range(batch_size):
        for truth_box in t[batch]:
            # skip padding rows — presumably 10.0 marks "no box" here too
            if truth_box[0] == 10.0:  # ??
                continue
            truth_w = int(float(truth_box[1]) * grid_w)
            truth_h = int(float(truth_box[2]) * grid_h)
            truth_n = 0
            best_iou = 0.0
            # choose the anchor whose shape best matches the truth box
            for anchor_index, abs_anchor in enumerate(abs_anchors):
                iou = box_iou(
                    Box(0, 0, float(truth_box[3]), float(truth_box[4])),
                    Box(0, 0, abs_anchor[0], abs_anchor[1]))
                if best_iou < iou:
                    best_iou = iou
                    truth_n = anchor_index
            # for the anchor containing an object: push the center toward
            # the true coordinates instead of 0.5, push the anchor scale
            # toward the true scale instead of 1, and set the learning
            # scale to 1
            box_learning_scale[batch, truth_n, :, truth_h, truth_w] = 1.0
            tx[batch, truth_n, :, truth_h,
               truth_w] = float(truth_box[1]) * grid_w - truth_w
            ty[batch, truth_n, :, truth_h,
               truth_w] = float(truth_box[2]) * grid_h - truth_h
            tw[batch, truth_n, :, truth_h, truth_w] = np.log(
                float(truth_box[3]) / abs_anchors[truth_n][0])
            th[batch, truth_n, :, truth_h, truth_w] = np.log(
                float(truth_box[4]) / abs_anchors[truth_n][1])
            # one-hot class target on the responsible anchor
            tprob[batch, :, truth_n, truth_h, truth_w] = 0
            tprob[batch, int(truth_box[0]), truth_n, truth_h, truth_w] = 1

            # observe the IOU: confidence target is the current IOU
            # between the predicted box and the truth box
            full_truth_box = Box(float(truth_box[1]), float(truth_box[2]),
                                 float(truth_box[3]), float(truth_box[4]))
            predicted_box = Box(
                (x[batch][truth_n][0][truth_h][truth_w].data.get() +
                 truth_w) / grid_w,
                (y[batch][truth_n][0][truth_h][truth_w].data.get() +
                 truth_h) / grid_h,
                np.exp(w[batch][truth_n][0][truth_h][truth_w].data.get())
                * abs_anchors[truth_n][0],
                np.exp(h[batch][truth_n][0][truth_h][truth_w].data.get())
                * abs_anchors[truth_n][1])
            predicted_iou = box_iou(full_truth_box, predicted_box)
            tconf[batch, truth_n, :, truth_h, truth_w] = predicted_iou
            conf_learning_scale[batch, truth_n, :, truth_h,
                                truth_w] = 10.0

        # debug prints
        # NOTE(review): maps is computed but never used — debug leftover
        maps = F.transpose(prob[batch], (2, 3, 1, 0)).data
        #print("seen = %d" % self.seen)

    # loss computation
    tx, ty, tw, th, tconf, tprob = Variable(tx), Variable(ty), Variable(
        tw), Variable(th), Variable(tconf), Variable(tprob)
    box_learning_scale, conf_learning_scale = Variable(
        box_learning_scale), Variable(conf_learning_scale)
    tx.to_gpu(), ty.to_gpu(), tw.to_gpu(), th.to_gpu(), tconf.to_gpu(
    ), tprob.to_gpu()
    box_learning_scale.to_gpu()
    conf_learning_scale.to_gpu()

    # squared-error losses masked by the per-element learning scales
    x_loss = F.sum((tx - x)**2 * box_learning_scale) / 2
    y_loss = F.sum((ty - y)**2 * box_learning_scale) / 2
    w_loss = F.sum((tw - w)**2 * box_learning_scale) / 2
    h_loss = F.sum((th - h)**2 * box_learning_scale) / 2
    c_loss = F.sum((tconf - conf)**2 * conf_learning_scale) / 2
    p_loss = F.sum((tprob - prob)**2) / 2
    #print("x_loss: %f y_loss: %f w_loss: %f h_loss: %f c_loss: %f p_loss: %f" %
    #      (F.sum(x_loss).data, F.sum(y_loss).data, F.sum(w_loss).data, F.sum(h_loss).data, F.sum(c_loss).data, F.sum(p_loss).data)
    #)
    reporter.report({'x_loss': F.sum(x_loss).data}, self)
    reporter.report({'y_loss': F.sum(y_loss).data}, self)
    reporter.report({'w_loss': F.sum(w_loss).data}, self)
    reporter.report({'h_loss': F.sum(h_loss).data}, self)
    reporter.report({'c_loss': F.sum(c_loss).data}, self)
    reporter.report({'p_loss': F.sum(p_loss).data}, self)
    loss_yolo = x_loss + y_loss + w_loss + h_loss + c_loss + p_loss
    reporter.report({'loss': loss_yolo}, self)
    return loss_yolo