def _get_predicted_bounding_boxes(self, predicted_center_x_values,
                                  predicted_center_y_values,
                                  predicted_width_values,
                                  predicted_height_values, batch_size,
                                  feature_map_width, feature_map_height):
    default_shape = (batch_size * self.num_anchors * feature_map_height
                     * feature_map_width)
    pred_boxes = self._to_cuda(FloatTensor(4, default_shape))
    # x-coordinates of the grid cells, tiled over every anchor and every
    # image in the batch:
    grid_x = linspace(0, feature_map_width - 1, feature_map_width).repeat(
        feature_map_height, 1).repeat(
        batch_size * self.num_anchors, 1, 1).view(default_shape)
    grid_x = self._to_cuda(grid_x)
    # y-coordinates of the grid cells (transposed so values vary by row):
    grid_y = linspace(0, feature_map_height - 1, feature_map_height).repeat(
        feature_map_width, 1).t().repeat(
        batch_size * self.num_anchors, 1, 1).view(default_shape)
    grid_y = self._to_cuda(grid_y)
    # anchor widths and heights, broadcast over all grid cells:
    anchor_w = self._to_cuda(Tensor(self.anchors)).view(
        self.num_anchors, 2).index_select(1, self._to_cuda(LongTensor([0])))
    anchor_w = anchor_w.repeat(batch_size, 1).repeat(
        1, 1, feature_map_height * feature_map_width)
    anchor_w = anchor_w.view(default_shape)
    anchor_h = self._to_cuda(Tensor(self.anchors)).view(
        self.num_anchors, 2).index_select(1, self._to_cuda(LongTensor([1])))
    anchor_h = anchor_h.repeat(batch_size, 1).repeat(
        1, 1, feature_map_height * feature_map_width)
    anchor_h = anchor_h.view(default_shape)
    # decode the raw network outputs into box centers and sizes - see
    # https://github.com/marvis/pytorch-yolo2/issues/131#issuecomment-460989919
    pred_boxes[0] = torch_reshape(
        predicted_center_x_values.data, (1, default_shape)) + grid_x
    pred_boxes[1] = torch_reshape(
        predicted_center_y_values.data, (1, default_shape)) + grid_y
    pred_boxes[2] = torch_reshape(
        torch_exp(predicted_width_values.data), (1, default_shape)) * anchor_w
    pred_boxes[3] = torch_reshape(
        torch_exp(predicted_height_values.data), (1, default_shape)) * anchor_h
    return self._convert_to_cpu(
        pred_boxes.transpose(0, 1).contiguous().view(-1, 4))
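# For reference, the grid and anchor tensors built above can also be
# produced with torch.meshgrid, which makes the tiling explicit. This is
# a minimal standalone sketch, not part of the original class: it assumes
# a recent PyTorch where torch.meshgrid accepts indexing='ij', and the
# helper name build_grids_and_anchors is made up for illustration.
import torch

def build_grids_and_anchors(anchors, num_anchors, batch_size,
                            feature_map_width, feature_map_height):
    """Equivalent construction of grid_x, grid_y, anchor_w and anchor_h."""
    flat_size = (batch_size * num_anchors * feature_map_height
                 * feature_map_width)
    ys, xs = torch.meshgrid(
        torch.arange(feature_map_height, dtype=torch.float32),
        torch.arange(feature_map_width, dtype=torch.float32),
        indexing='ij')
    # tile the (H, W) coordinate grids over every anchor and batch element:
    grid_x = xs.reshape(1, -1).repeat(
        batch_size * num_anchors, 1).view(flat_size)
    grid_y = ys.reshape(1, -1).repeat(
        batch_size * num_anchors, 1).view(flat_size)
    # each anchor's width/height is constant across all H*W grid cells:
    wh = torch.tensor(anchors, dtype=torch.float32).view(num_anchors, 2)
    anchor_w = wh[:, 0].repeat(batch_size).repeat_interleave(
        feature_map_height * feature_map_width)
    anchor_h = wh[:, 1].repeat(batch_size).repeat_interleave(
        feature_map_height * feature_map_width)
    return grid_x, grid_y, anchor_w, anchor_h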
def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    """Decode bounding boxes from their corresponding prior boxes, both
    given in center-size coordinates, as used in the SSD rewrite in
    PyTorch.

    This is implemented as shown in
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection,
    with some modifications. All credits to @sgrvinod.
    """
    return torch_cat([
        gcxgcy[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],
        torch_exp(gcxgcy[:, 2:] / 5) * priors_cxcy[:, 2:]
    ], 1)
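# The corresponding encoder is not shown in this section; the following is
# a minimal sketch of the inverse transformation, derived directly from
# the decoding formula above. The name cxcy_to_gcxgcy mirrors sgrvinod's
# tutorial but is an assumption here, not taken from this codebase.
from torch import cat as torch_cat, log as torch_log

def cxcy_to_gcxgcy(cxcy, priors_cxcy):
    """Encode center-size boxes as offsets (gcx, gcy, gw, gh) relative to
    their prior boxes, i.e. the exact inverse of gcxgcy_to_cxcy."""
    return torch_cat([
        (cxcy[:, :2] - priors_cxcy[:, :2]) / priors_cxcy[:, 2:] * 10,
        torch_log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5
    ], 1)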
def __init__(self, token_representation_dimension: int, dropout_prob: float,
             max_sequence_length: int) -> None:
    super(PositionalEncoding, self).__init__()
    self.dropout_layer = Dropout(p=dropout_prob)
    # defining the positional signals added to the embeddings:
    # initialization:
    positional_signals = torch_zeros(
        (max_sequence_length, token_representation_dimension),
        requires_grad=False
    )
    positions = torch_arange(
        start=0, end=max_sequence_length, requires_grad=False
    ).unsqueeze(dim=1)
    # angular inputs of the sinusoids, one frequency for every pair of
    # feature dimensions: position / 10000^(2i / d_model)
    wave_inputs = positions * torch_exp(
        torch_arange(start=0, end=token_representation_dimension, step=2)
        * (-log(10000.0) / token_representation_dimension)
    )
    # interleaving sinusoidal and cosinusoidal components along the
    # feature dimension (starting with sine), yielding positional signals
    # for all the allowed positions (for sequences up to the maximum
    # allowed length):
    positional_signals[:, 0::2] = torch_sin(wave_inputs)
    positional_signals[:, 1::2] = torch_cos(wave_inputs)
    positional_signals = positional_signals.unsqueeze(dim=0)
    # parameters not requiring backpropagation (i.e. gradient computation
    # and update):
    self.register_buffer('positional_signals', positional_signals)
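# The forward pass consuming the 'positional_signals' buffer is not shown
# here; the following is a minimal sketch of what such a method typically
# looks like for this module (the slicing up to token_embeddings.size(1),
# which handles inputs shorter than max_sequence_length, is an assumption):
def forward(self, token_embeddings):
    """Add the fixed positional signals to the token embeddings and apply
    dropout; expects a (batch, sequence, feature) tensor."""
    return self.dropout_layer(
        token_embeddings
        + self.positional_signals[:, :token_embeddings.size(1)]
    )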
def detect_objects(self, image_as_tensor, confidence_threshold,
                   nms_threshold):
    self.forward(image_as_tensor)
    yolov2_loss = self.loss_function.layer
    num_anchors = yolov2_loss.num_anchors
    if self.predictions.dim() == 3:
        self.predictions = self.predictions.unsqueeze(0)
    assert self.predictions.size(1) == (5 + self.class_count) * num_anchors
    batch_size = self.predictions.size(0)
    feature_map_height = self.predictions.size(2)
    feature_map_width = self.predictions.size(3)
    number_of_pixels = feature_map_height * feature_map_width
    view_size = batch_size * num_anchors * number_of_pixels
    # rearrange the raw predictions so that the 5 + class_count channels of
    # every anchor/cell pair can be indexed along dimension 0:
    output = self.predictions.view(
        batch_size * num_anchors, 5 + self.class_count,
        number_of_pixels).transpose(0, 1).contiguous().view(
        5 + self.class_count, view_size)
    output = output.cuda() if self.use_cuda else output
    grid_x = torch_linspace(
        0, feature_map_width - 1, feature_map_width).repeat(
        feature_map_height, 1).repeat(
        batch_size * num_anchors, 1, 1).view(view_size)
    grid_x = grid_x.cuda() if self.use_cuda else grid_x
    grid_y = torch_linspace(
        0, feature_map_height - 1, feature_map_height).repeat(
        feature_map_width, 1).t().repeat(
        batch_size * num_anchors, 1, 1).view(view_size)
    grid_y = grid_y.cuda() if self.use_cuda else grid_y
    anchor_w = Tensor(yolov2_loss.anchors).view(
        num_anchors, yolov2_loss.anchor_step).index_select(
        1, LongTensor([0])).repeat(batch_size, 1).repeat(
        1, 1, number_of_pixels).view(view_size)
    anchor_w = anchor_w.cuda() if self.use_cuda else anchor_w
    anchor_h = Tensor(yolov2_loss.anchors).view(
        num_anchors, yolov2_loss.anchor_step).index_select(
        1, LongTensor([1])).repeat(batch_size, 1).repeat(
        1, 1, number_of_pixels).view(view_size)
    anchor_h = anchor_h.cuda() if self.use_cuda else anchor_h
    # class probabilities (softmax over the class logits, which lie along
    # dimension 1 after the transpose) and objectness scores:
    class_scores = Softmax(dim=1)(
        output[5:5 + self.class_count].transpose(0, 1)).data
    max_class_scores, top_classes = torch_max(class_scores, 1)
    objectness_confidences = torch_sigmoid(output[4])
    max_class_scores = max_class_scores.view(-1)
    top_classes = top_classes.view(-1)
    confidences = objectness_confidences * max_class_scores
    # keep only the predictions whose combined confidence exceeds the
    # threshold, decoding the box coordinates relative to the image:
    objectness_confidences = objectness_confidences[
        confidences > confidence_threshold]
    x_predictions = (torch_sigmoid(output[0]) + grid_x)[
        confidences > confidence_threshold] / feature_map_width
    y_predictions = (torch_sigmoid(output[1]) + grid_y)[
        confidences > confidence_threshold] / feature_map_height
    w_predictions = (torch_exp(output[2]) * anchor_w)[
        confidences > confidence_threshold] / feature_map_width
    h_predictions = (torch_exp(output[3]) * anchor_h)[
        confidences > confidence_threshold] / feature_map_height
    class_scores = class_scores[confidences > confidence_threshold].view(
        -1, self.class_count)
    max_class_scores = max_class_scores[confidences > confidence_threshold]
    top_classes = top_classes[confidences > confidence_threshold]
    all_boxes = []
    # NOTE: the confidence mask above flattens the batch dimension, so
    # this loop effectively assumes batch_size == 1:
    for b_index in range(batch_size):
        boxes = []
        for index in range(x_predictions.size(0)):
            objectness = objectness_confidences[index].item()
            cx = x_predictions[index].item()
            cy = y_predictions[index].item()
            w = w_predictions[index].item()
            h = h_predictions[index].item()
            max_class_score = max_class_scores[index].item()
            top_class = top_classes[index].item()
            box = [cx, cy, w, h, objectness, max_class_score, top_class]
            # also keep any other class whose score, weighted by the
            # objectness, exceeds the confidence threshold:
            possible_classes = (class_scores[index] * objectness
                                > confidence_threshold).nonzero()[:, 0]
            possible_classes = possible_classes[
                possible_classes != top_class]
            for cls in possible_classes:
                box.append(class_scores[index][cls].item())
                box.append(cls.item())
            boxes.append(box)
        all_boxes.append(boxes)
    detections = []
    image_width, image_height = image_as_tensor.size(
        3), image_as_tensor.size(2)
    for b_index in range(batch_size):
        batch_detections = []
        boxes = nms(all_boxes[b_index], nms_threshold)
        for box in boxes:
            # convert from normalized center-size coordinates to corner
            # coordinates in pixels, clipping to the image boundaries:
            x1 = max(box[0] - box[2] / 2.0, 0) * image_width
            y1 = max(box[1] - box[3] / 2.0, 0) * image_height
            x2 = min(box[0] + box[2] / 2.0, 1) * image_width
            y2 = min(box[1] + box[3] / 2.0, 1) * image_height
            objectness = box[4]
            # emit one detection per (class score, class id) pair stored
            # in the box:
            for j in range((len(box) - 5) // 2):
                cls_conf = box[5 + 2 * j]
                cls_id = box[6 + 2 * j]
                prob = objectness * cls_conf
                batch_detections.append([cls_id, prob, x1, y1, x2, y2])
        detections.append(batch_detections)
    return detections
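# nms is referenced above but not defined in this section; the following
# is a minimal greedy non-maximum-suppression sketch consistent with the
# box layout used here ([cx, cy, w, h, objectness, class score, class id,
# ...]), sorting by the objectness stored in box[4]. The IoU helper and
# both names are illustrative assumptions, not the codebase's own nms.
def _iou_cxcywh(a, b):
    """Intersection over union of two [cx, cy, w, h, ...] boxes."""
    ax1, ax2 = a[0] - a[2] / 2.0, a[0] + a[2] / 2.0
    ay1, ay2 = a[1] - a[3] / 2.0, a[1] + a[3] / 2.0
    bx1, bx2 = b[0] - b[2] / 2.0, b[0] + b[2] / 2.0
    by1, by2 = b[1] - b[3] / 2.0, b[1] + b[3] / 2.0
    inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    intersection = inter_w * inter_h
    union = a[2] * a[3] + b[2] * b[3] - intersection
    return intersection / union if union > 0 else 0.0

def nms(boxes, nms_threshold):
    """Greedily keep the most confident box and drop any remaining box
    that overlaps it beyond the given IoU threshold."""
    remaining = sorted(boxes, key=lambda box: box[4], reverse=True)
    kept = []
    while remaining:
        best = remaining.pop(0)
        kept.append(best)
        remaining = [box for box in remaining
                     if _iou_cxcywh(best, box) <= nms_threshold]
    return kept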