def execute(self, inputs):
    """Fuse all input levels into one map, then rebuild a pooled pyramid.

    Every level above 0 is upsampled to level-0 resolution, the stack is
    concatenated on channels and reduced by ``self.reduction_conv``; the
    pyramid is then re-created by strided pooling and refined per level.
    """
    assert len(inputs) == len(self.in_channels)
    # Bring every level up to the resolution of inputs[0].
    upsampled = [inputs[0]]
    for level in range(1, len(inputs)):
        upsampled.append(
            nn.interpolate(inputs[level], scale_factor=2**level, mode='bilinear'))
    fused = jt.contrib.concat(upsampled, dim=1)
    fused = self.reduction_conv(fused)
    # Pool the fused map back down into num_level resolutions.
    pyramid = [fused]
    for level in range(1, self.num_level):
        pyramid.append(
            nn.pool(fused, kernel_size=2**level, stride=2**level, op=self.pooling))
    if self.share_conv:
        # A single refinement conv shared across all levels.
        outputs = [self.fpn_conv(feat) for feat in pyramid]
    else:
        outputs = []
        for level in range(self.num_level):
            feat = pyramid[level]
            # Checkpoint only levels that still require gradients.
            if not feat.is_stop_grad() and self.with_checkpoint:
                refined = checkpoint(self.fpn_conv[level], feat)
            else:
                refined = self.fpn_conv[level](feat)
            outputs.append(refined)
    return tuple(outputs)
def semantic_segmentation_loss(self, segment_data, mask_t, class_t, interpolation_mode='bilinear'):
    """Auxiliary semantic-segmentation loss over the class score maps.

    Note: num_classes here is without the background class, i.e. cfg.num_classes - 1.
    Loss is summed over pixels and normalized by the map area at the end.
    """
    batch_size, num_classes, mask_h, mask_w = segment_data.shape
    loss_s = 0
    for idx in range(batch_size):
        cur_segment = segment_data[idx]
        cur_class_t = class_t[idx]
        # The target construction must not contribute gradients.
        with jt.no_grad():
            # Downsample the GT instance masks to the prediction resolution,
            # then binarize (interpolation produces soft values).
            downsampled_masks = nn.interpolate(
                mask_t[idx].unsqueeze(0), (mask_h, mask_w),
                mode=interpolation_mode, align_corners=False).squeeze(0)
            downsampled_masks = (downsampled_masks > 0.5).float()
            # Construct the per-class semantic target: each class channel is the
            # elementwise max over all instances of that class.
            segment_t = jt.zeros_like(cur_segment)
            segment_t.stop_grad()
            for obj_idx in range(downsampled_masks.shape[0]):
                segment_t[cur_class_t[obj_idx]] = jt.maximum(
                    segment_t[cur_class_t[obj_idx]], downsampled_masks[obj_idx])
        # Summed (not averaged) BCE; normalization happens in the return.
        loss_s += nn.BCEWithLogitsLoss(size_average=False)(cur_segment, segment_t)
    return loss_s / mask_h / mask_w * cfg.semantic_segmentation_alpha
def execute(self, img):
    """Resize and normalize a batch of BGR images on-device.

    Input layout is [n, h, w, c] (BGR); output is [n, c, h, w] in RGB,
    resized to cfg.max_size (optionally preserving aspect ratio) and
    normalized according to the backbone transform settings.
    """
    if cfg.preserve_aspect_ratio:
        _, h, w, _ = img.size()
        target = Resize.calc_size_preserve_ar(w, h, cfg.max_size)
        # interpolate expects (h, w), calc_size returns (w, h).
        target = (target[1], target[0])
    else:
        target = (cfg.max_size, cfg.max_size)
    out = img.permute(0, 3, 1, 2)
    out = nn.interpolate(out, target, mode='bilinear', align_corners=False)
    if self.transform.normalize:
        out = (out - self.mean) / self.std
    elif self.transform.subtract_means:
        out = out - self.mean
    elif self.transform.to_float:
        out = out / 255
    if self.transform.channel_order != 'RGB':
        raise NotImplementedError
    # BGR -> RGB channel swap.
    out = out[:, [2, 1, 0], :, :]
    return out
def execute(self, x):
    """
    Arguments:
        x (list[Tensor]): feature maps for each feature level, lowest level
            (highest resolution) first.
    Returns:
        results (tuple[Tensor]): feature maps after FPN layers.
            They are ordered from highest resolution first.
    """
    # Start the top-down pathway from the coarsest level.
    last_inner = getattr(self, self.inner_blocks[-1])(x[-1])
    results = []
    results.append(getattr(self, self.layer_blocks[-1])(last_inner))
    # Walk the remaining levels from coarse to fine, merging each lateral
    # connection with the upsampled top-down signal.
    for feature, inner_block, layer_block in zip(
            x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1]):
        # An empty block name means this level is disabled.
        if not inner_block:
            continue
        inner_top_down = nn.interpolate(last_inner, scale_factor=2, mode="nearest")
        inner_lateral = getattr(self, inner_block)(feature)
        # TODO use size instead of scale to make it robust to different sizes
        # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:],
        # mode='bilinear', align_corners=False)
        last_inner = inner_lateral + inner_top_down
        # insert(0, ...) keeps the highest-resolution map first.
        results.insert(0, getattr(self, layer_block)(last_inner))
    # Optional extra coarse levels (P6/P7 or a max-pool level).
    if isinstance(self.top_blocks, LastLevelP6P7):
        last_results = self.top_blocks(x[-1], results[-1])
        results.extend(last_results)
    elif isinstance(self.top_blocks, LastLevelMaxPool):
        last_results = self.top_blocks(results[-1])
        results.extend(last_results)
    return tuple(results)
def create_grid(samples, scale_factor, img_file):
    """Save a square grid image of GAN samples.

    :param samples: generated samples for storing
    :param scale_factor: factor for upscaling the image
    :param img_file: name of file to write
    :return: None (saves a file)
    """
    # Optionally upscale the samples before writing them out.
    if scale_factor > 1:
        samples = nn.interpolate(samples, scale_factor=scale_factor, mode='nearest')
    # Lay the batch out as a (roughly) square grid.
    grid_cols = int(np.sqrt(len(samples)))
    jt.save_image_my(samples, img_file, nrow=grid_cols, normalize=True,
                     scale_each=True, pad_value=128, padding=1)
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
    """interpolate that tolerates empty inputs.

    Non-empty tensors are forwarded to nn.interpolate; for empty tensors the
    output shape is computed manually and an empty tensor of that shape is
    returned (nn.interpolate cannot handle numel() == 0).
    """
    if input.numel() > 0:
        return nn.interpolate(input, size, scale_factor, mode, align_corners)

    def _check_size_scale_factor(dim):
        # Exactly one of `size` / `scale_factor` must be given.
        if size is None and scale_factor is None:
            raise ValueError("either size or scale_factor should be defined")
        if size is not None and scale_factor is not None:
            raise ValueError(
                "only one of size or scale_factor should be defined")
        if (scale_factor is not None and isinstance(scale_factor, tuple)
                and len(scale_factor) != dim):
            raise ValueError("scale_factor shape must match input shape. "
                             "Input is {}D, scale_factor size is {}".format(
                                 dim, len(scale_factor)))

    def _output_size(dim):
        # Spatial output size for the trailing `dim` axes.
        _check_size_scale_factor(dim)
        if size is not None:
            return size
        scale_factors = _ntuple(dim)(scale_factor)
        # math.floor might return float in py2.7
        return [
            int(math.floor(input.size(i + 2) * scale_factors[i]))
            for i in range(dim)
        ]

    output_shape = tuple(_output_size(2))
    # NOTE(review): jt Var.shape may be a list; `list + tuple` would raise —
    # confirm shape type on the empty-tensor path.
    output_shape = input.shape[:-2] + output_shape
    return _NewEmptyTensorOp()(input, output_shape)
def execute(self, style, noise, step=0, alpha=-1, mixing_range=(-1, -1)):
    """Progressive StyleGAN generator forward pass.

    `style` is a list of latent codes (more than one enables style mixing),
    `noise` is per-resolution noise, `step` selects the output resolution,
    and `alpha` in [0, 1) fades in the newest block against the previous
    resolution's RGB output.
    """
    out = noise[0]
    # Pick the layer indices at which to switch latents for style mixing.
    if len(style) < 2:
        # Single latent: an index past the last layer means "never switch".
        inject_index = [len(self.progression) + 1]
    else:
        inject_index = sorted(random.sample(list(range(step)), len(style) - 1))
    crossover = 0
    for i, (conv, to_rgb) in enumerate(zip(self.progression, self.to_rgb)):
        if mixing_range == (-1, -1):
            # Random crossover points chosen above.
            if crossover < len(inject_index) and i > inject_index[crossover]:
                crossover = min(crossover + 1, len(style))
            style_step = style[crossover]
        else:
            # Explicit mixing range: use the second latent inside it.
            if mixing_range[0] <= i <= mixing_range[1]:
                style_step = style[1]
            else:
                style_step = style[0]
        # Remember the previous block's output for the fade-in skip branch.
        if i > 0 and step > 0:
            out_prev = out
        out = conv(out, style_step, noise[i])
        if i == step:
            out = to_rgb(out)
            # Fade-in: blend with the upsampled RGB of the previous block.
            if i > 0 and 0 <= alpha < 1:
                skip_rgb = self.to_rgb[i - 1](out_prev)
                skip_rgb = nn.interpolate(skip_rgb, scale_factor=2, mode='nearest')
                out = (1 - alpha) * skip_rgb + alpha * out
            break
    return out
def compute_prediction(self, original_image): """ Arguments: original_image (np.ndarray): an image as returned by OpenCV Returns: prediction (BoxList): the detected objects. Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ # apply pre-processing to image image = self.transforms(original_image) # convert to an ImageList, padded so that it is divisible by # cfg.DATALOADER.SIZE_DIVISIBILITY image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) # compute predictions with jt.no_grad(): predictions = self.model(image_list) # always single image is passed at a time prediction = predictions[0] # reshape prediction (a BoxList) into the original image size height, width = original_image.shape[:-1] input_w, input_h = prediction.size prediction = prediction.resize((width, height)) if prediction.has_field("mask"): # if we have masks, paste the masks in the right position # in the image, as defined by the bounding boxes masks = prediction.get_field("mask") if masks.ndim == 3: # resize masks stride_mask = float(prediction.get_field('stride').item()) h = math.ceil(masks.shape[1] * stride_mask * height / input_h) w = math.ceil(masks.shape[2] * stride_mask * width / input_w) mask_th = prediction.get_field('mask_th') masks = masks masks = nn.interpolate(X=masks.unsqueeze(1).float(), size=(h, w), mode="bilinear", align_corners=False) > mask_th masks = masks[:, :, :height, :width] #masks = masks.unsqueeze(1) prediction.add_field("mask", masks) else: # always single image is passed at a time masks = self.masker([masks], [prediction])[0] prediction.add_field("mask", masks) return prediction
def upsample_cat(self, p1, p2, p3, p4):
    """Upsample four feature maps to (numAngle, numRho) and concat on channels."""
    target = (self.numAngle, self.numRho)
    resized = [
        nn.interpolate(p, size=target, mode='bilinear', align_corners=True)
        for p in (p1, p2, p3, p4)
    ]
    return jt.concat(resized, dim=1)
def execute(self, convouts: List[jt.Var]):
    """
    Args:
        - convouts (list): A list of convouts for the corresponding layers in in_channels.
    Returns:
        - A list of FPN convouts in the same order as x with extra downsample layers if requested.
    """
    out = []
    # Placeholder; overwritten on the first lateral pass below.
    x = jt.zeros((1, ))
    for i in range(len(convouts)):
        out.append(x)

    # For backward compatability, the conv layers are stored in reverse but the input and output is
    # given in the correct order. Thus, use j=-i-1 for the input and output and i for the conv layers.
    j = len(convouts)
    for lat_layer in self.lat_layers.layers.values():
        j -= 1
        if j < len(convouts) - 1:
            # Upsample the running top-down map to this level's size.
            _, _, h, w = convouts[j].shape
            x = nn.interpolate(x, size=(h, w), mode=self.interpolation_mode,
                               align_corners=False)
        x = x + lat_layer(convouts[j])
        out[j] = x

    # This janky second loop is here because jtScript.
    j = len(convouts)
    for pred_layer in self.pred_layers.layers.values():
        j -= 1
        out[j] = pred_layer(out[j])
        if self.relu_pred_layers:
            out[j] = nn.relu(out[j])

    cur_idx = len(out)
    # In the original paper, this takes care of P6
    if self.use_conv_downsample:
        for downsample_layer in self.downsample_layers.layers.values():
            out.append(downsample_layer(out[-1]))
    else:
        for idx in range(self.num_downsample):
            # Note: this is an untested alternative to out.append(out[-1][:, :, ::2, ::2]). Thanks jtScript.
            out.append(nn.pool(out[-1], 1, stride=2, op='maximum'))

    if self.relu_downsample_layers:
        # NOTE(review): this writes relu'd downsample levels into out[0..],
        # overwriting the pyramid levels — matches the upstream code, but
        # looks suspicious; confirm intended indexing.
        for idx in range(len(out) - cur_idx):
            out[idx] = nn.relu(out[idx + cur_idx])
    return out
def enforce_size(img, targets, masks, num_crowds, new_w, new_h):
    """Resize + pad an image (and its masks/boxes) to exactly (new_h, new_w)
    without distorting the aspect ratio.

    :param img: CHW image tensor
    :param targets: [num_objs, 4+] boxes in relative coords (scaled in place)
    :param masks: [num_objs, h, w] instance masks
    :param num_crowds: passed through unchanged
    :return: (img, targets, masks, num_crowds) at the requested size
    """
    with jt.no_grad():
        _, h, w = img.size()
        if h == new_h and w == new_w:
            return img, targets, masks, num_crowds

        # Resize the image so that it fits within new_w, new_h.
        w_prime = new_w
        h_prime = h * new_w / w
        if h_prime > new_h:
            w_prime *= new_h / h_prime
            h_prime = new_h
        w_prime = int(w_prime)
        h_prime = int(h_prime)

        # Do all the resizing (masks treated as extra "color" channels).
        # Note: use .squeeze(0) assignment rather than torch-style in-place
        # squeeze_, which Jittor Vars do not provide.
        img = nn.interpolate(img.unsqueeze(0), (h_prime, w_prime),
                             mode='bilinear', align_corners=False)
        img = img.squeeze(0)

        masks = nn.interpolate(masks.unsqueeze(0), (h_prime, w_prime),
                               mode='bilinear', align_corners=False)
        masks = masks.squeeze(0)

        # Scale bounding boxes (this will put them in the top left corner in the case of padding)
        targets[:, [0, 2]] *= (w_prime / new_w)
        targets[:, [1, 3]] *= (h_prime / new_h)

        # Finally, pad everything to be the new_w, new_h.
        # Fix: use Jittor's nn.pad (the original torch-style F.pad is not
        # defined in this port; cf. scale_img which already uses nn.pad).
        pad_dims = (0, new_w - w_prime, 0, new_h - h_prime)
        img = nn.pad(img, pad_dims, mode='constant', value=0)
        masks = nn.pad(masks, pad_dims, mode='constant', value=0)

        return img, targets, masks, num_crowds
def scale_img(img, ratio=1.0, same_shape=False, gs=32):
    """Scale a batched image tensor (bs, 3, y, x) by `ratio`.

    Unless `same_shape`, the output is padded out to the next multiple of
    `gs` so strided layers still line up; otherwise it is padded back to
    the original (h, w).
    """
    if ratio == 1.0:
        return img
    h, w = img.shape[2:]
    resized = (int(h * ratio), int(w * ratio))  # new size
    img = nn.interpolate(img, size=resized, mode='bilinear', align_corners=False)
    if not same_shape:
        # Round the target dims up to gs-multiples.
        h, w = (math.ceil(dim * ratio / gs) * gs for dim in (h, w))
    # Pad to (h, w); 0.447 = imagenet mean.
    return nn.pad(img, [0, w - resized[1], 0, h - resized[0]], value=0.447)
def forward_for_mask(self, boxlists, pixel_embed):
    """Attach instance masks to each per-image BoxList.

    For every image, proposal embeddings are matched against the pixel
    embedding map to produce per-instance mask probabilities, restricted to
    each box. When `self.post_process_masks` the masks are upsampled to the
    input resolution and thresholded here; otherwise the stride/threshold
    are stored on the BoxList for later processing.
    """
    N, dim, m_h, m_w = pixel_embed.shape
    new_boxlists = []
    # Effective stride of the embedding map w.r.t. the network input.
    stride = self.fpn_strides[0] / self.mask_scale_factor
    for im in range(N):
        boxlist = boxlists[im]
        boxes = boxlist.bbox
        input_w, input_h = boxlist.size
        proposal_embed = boxlist.get_field('proposal_embed')
        if proposal_embed.shape[0] == 0:
            # No proposals: emit an empty-mask BoxList with consistent fields.
            new_boxlist = BoxList(boxes, boxlist.size, mode="xyxy")
            new_boxlist.add_field("labels", boxlist.get_field("labels"))
            new_boxlist.add_field("scores", boxlist.get_field("scores"))
            new_boxlist.add_field('mask', jt.array([]))
            if self.post_process_masks:
                # Already full-resolution / thresholded downstream.
                new_boxlist.add_field('stride', jt.array([1]))
                new_boxlist.add_field('mask_th', jt.array([0.0]))
            else:
                new_boxlist.add_field('stride', jt.array([stride]))
                new_boxlist.add_field('mask_th', jt.array([self.mask_th]))
            new_boxlists.append(new_boxlist)
            continue
        # Boxes in embedding-map coordinates; binary box regions.
        mask_boxes = boxes / stride
        box_masks = boxes_to_masks(mask_boxes, m_h, m_w)
        proposal_margin = boxlist.get_field('proposal_margin')
        mask_prob = self.compute_mask_prob(pixel_embed[im], proposal_embed,
                                           proposal_margin, mask_boxes)
        # Zero out probabilities outside each instance's box.
        masks = mask_prob * box_masks.float()
        if self.post_process_masks:
            # Upsample to input resolution, binarize, crop off padding.
            masks = nn.interpolate(X=masks.unsqueeze(1).float(),
                                   scale_factor=stride,
                                   mode="bilinear",
                                   align_corners=False) > self.mask_th
            masks = masks[:, 0, :input_h, :input_w]
        new_boxlist = BoxList(boxes, boxlist.size, mode="xyxy")
        new_boxlist.add_field('mask', masks)
        new_boxlist.add_field("labels", boxlist.get_field("labels"))
        new_boxlist.add_field("scores", boxlist.get_field("scores"))
        if self.post_process_masks:
            new_boxlist.add_field('stride', jt.array([1]))
            new_boxlist.add_field('mask_th', jt.array([0.0]))
        else:
            new_boxlist.add_field('stride', jt.array([stride]))
            new_boxlist.add_field('mask_th', jt.array([self.mask_th]))
        new_boxlists.append(new_boxlist)
    return new_boxlists
def __progressive_down_sampling(self, real_batch, depth, alpha):
    """
    private helper for down_sampling the original images in order to
    facilitate the progressive growing of the layers.
    :param real_batch: batch of real samples
    :param depth: depth at which training is going on
    :param alpha: current value of the fade-in alpha
    :return: real_samples => modified real batch of samples
    """
    # Fixed structure: no progressive growing, use samples as-is.
    if self.structure == 'fixed':
        return real_batch

    # Pooling factors for the current depth and the previous (coarser) one.
    factor = int(np.power(2, self.depth - depth - 1))
    prior_factor = max(int(np.power(2, self.depth - depth)), 0)

    current = nn.Pool(factor)(real_batch)
    if depth > 0:
        # Coarser version upsampled back by 2 to match `current`'s size.
        prior = nn.interpolate(nn.Pool(prior_factor)(real_batch),
                               scale_factor=2, mode='nearest')
    else:
        prior = current

    # Fade between the two resolutions with the current alpha.
    return alpha * current + (1 - alpha) * prior
def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels, interpolation_mode='bilinear'):
    """YOLACT linear-combination mask loss (and optional maskiou inputs).

    Combines prototype masks with predicted coefficients, compares against
    downsampled GT masks, and optionally collects inputs/targets for the
    maskiou net.

    Returns either `losses` (dict with 'M' and optionally 'D') or
    `(losses, [maskiou_net_input, maskiou_t, label_t])` when cfg.use_maskiou.
    """
    mask_h = proto_data.shape[1]
    mask_w = proto_data.shape[2]

    process_gt_bboxes = cfg.mask_proto_normalize_emulate_roi_pooling or cfg.mask_proto_crop

    if cfg.mask_proto_remove_empty_masks:
        # Make sure to store a copy of this because we edit it to get rid of all-zero masks
        pos = pos.clone()

    loss_m = 0
    loss_d = 0  # Coefficient diversity loss

    maskiou_t_list = []
    maskiou_net_input_list = []
    label_t_list = []

    for idx in range(mask_data.shape[0]):
        with jt.no_grad():
            # Downsample GT masks to prototype resolution; [mask_h, mask_w, num_objs].
            downsampled_masks = nn.interpolate(masks[idx].unsqueeze(0), (mask_h, mask_w),
                                               mode=interpolation_mode,
                                               align_corners=False).squeeze(0)
            downsampled_masks = downsampled_masks.permute(1, 2, 0)

            if cfg.mask_proto_binarize_downsampled_gt:
                downsampled_masks = (downsampled_masks > 0.5).float()

            if cfg.mask_proto_remove_empty_masks:
                # Get rid of gt masks that are so small they get downsampled away
                very_small_masks = (downsampled_masks.sum(0).sum(0) <= 0.0001)
                for i in range(very_small_masks.shape[0]):
                    if very_small_masks[i]:
                        pos[idx, idx_t[idx] == i] = 0

            if cfg.mask_proto_reweight_mask_loss:
                # Ensure that the gt is binary
                if not cfg.mask_proto_binarize_downsampled_gt:
                    bin_gt = (downsampled_masks > 0.5).float()
                else:
                    bin_gt = downsampled_masks

                gt_foreground_norm = bin_gt / (jt.sum(bin_gt, dim=(0, 1), keepdim=True) + 0.0001)
                gt_background_norm = (1 - bin_gt) / (jt.sum(1 - bin_gt, dim=(0, 1), keepdim=True) + 0.0001)

                mask_reweighting = gt_foreground_norm * cfg.mask_proto_reweight_coeff + gt_background_norm
                mask_reweighting *= mask_h * mask_w

        cur_pos = pos[idx]
        cur_pos = jt.where(cur_pos)[0]
        pos_idx_t = idx_t[idx, cur_pos]

        if process_gt_bboxes:
            # Note: this is in point-form
            if cfg.mask_proto_crop_with_pred_box:
                pos_gt_box_t = decode(loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors)[cur_pos]
            else:
                pos_gt_box_t = gt_box_t[idx, cur_pos]

        if pos_idx_t.shape[0] == 0:
            continue

        proto_masks = proto_data[idx]
        proto_coef = mask_data[idx, cur_pos, :]
        if cfg.use_mask_scoring:
            mask_scores = score_data[idx, cur_pos, :]

        if cfg.mask_proto_coeff_diversity_loss:
            if inst_data is not None:
                div_coeffs = inst_data[idx, cur_pos, :]
            else:
                div_coeffs = proto_coef
            loss_d += self.coeff_diversity_loss(div_coeffs, pos_idx_t)

        # If we have over the allowed number of masks, select a random sample
        old_num_pos = proto_coef.shape[0]
        if old_num_pos > cfg.masks_to_train:
            perm = jt.randperm(proto_coef.shape[0])
            select = perm[:cfg.masks_to_train]
            proto_coef = proto_coef[select, :]
            pos_idx_t = pos_idx_t[select]
            if process_gt_bboxes:
                pos_gt_box_t = pos_gt_box_t[select, :]
            if cfg.use_mask_scoring:
                mask_scores = mask_scores[select, :]

        num_pos = proto_coef.shape[0]
        mask_t = downsampled_masks[:, :, pos_idx_t]
        label_t = labels[idx][pos_idx_t]

        # Size: [mask_h, mask_w, num_pos]
        pred_masks = proto_masks @ proto_coef.transpose(1, 0)
        pred_masks = cfg.mask_proto_mask_activation(pred_masks)

        if cfg.mask_proto_double_loss:
            if cfg.mask_proto_mask_activation == activation_func.sigmoid:
                pre_loss = nn.bce_loss(jt.clamp(pred_masks, 0, 1), mask_t, size_average=False)
            else:
                pre_loss = nn.smooth_l1_loss(pred_masks, mask_t, reduction='sum')
            loss_m += cfg.mask_proto_double_loss_alpha * pre_loss

        if cfg.mask_proto_crop:
            pred_masks = crop(pred_masks, pos_gt_box_t)

        if cfg.mask_proto_mask_activation == activation_func.sigmoid:
            pre_loss = binary_cross_entropy(jt.clamp(pred_masks, 0, 1), mask_t)
        else:
            pre_loss = nn.smooth_l1_loss(pred_masks, mask_t, reduction='none')

        if cfg.mask_proto_normalize_mask_loss_by_sqrt_area:
            gt_area = jt.sum(mask_t, dim=(0, 1), keepdims=True)
            pre_loss = pre_loss / (jt.sqrt(gt_area) + 0.0001)

        if cfg.mask_proto_reweight_mask_loss:
            pre_loss = pre_loss * mask_reweighting[:, :, pos_idx_t]

        if cfg.mask_proto_normalize_emulate_roi_pooling:
            weight = mask_h * mask_w if cfg.mask_proto_crop else 1
            pos_gt_csize = center_size(pos_gt_box_t)
            gt_box_width = pos_gt_csize[:, 2] * mask_w
            gt_box_height = pos_gt_csize[:, 3] * mask_h
            pre_loss = pre_loss.sum(0).sum(0) / gt_box_width / gt_box_height * weight

        # If the number of masks were limited scale the loss accordingly
        if old_num_pos > num_pos:
            pre_loss *= old_num_pos / num_pos

        loss_m += jt.sum(pre_loss)

        if cfg.use_maskiou:
            if cfg.discard_mask_area > 0:
                gt_mask_area = jt.sum(mask_t, dim=(0, 1))
                select = gt_mask_area > cfg.discard_mask_area
                if jt.sum(select).item() < 1:
                    continue
                pos_gt_box_t = pos_gt_box_t[select, :]
                pred_masks = pred_masks[:, :, select]
                mask_t = mask_t[:, :, select]
                label_t = label_t[select]
            maskiou_net_input = pred_masks.permute(2, 0, 1).unsqueeze(1)
            pred_masks = (pred_masks > 0.5).float()
            maskiou_t = self._mask_iou(pred_masks, mask_t)
            maskiou_net_input_list.append(maskiou_net_input)
            maskiou_t_list.append(maskiou_t)
            label_t_list.append(label_t)

    losses = {'M': loss_m * cfg.mask_alpha / mask_h / mask_w}
    if cfg.mask_proto_coeff_diversity_loss:
        losses['D'] = loss_d

    if cfg.use_maskiou:
        # discard_mask_area discarded every mask in the batch, so nothing to do here
        if len(maskiou_t_list) == 0:
            return losses, None
        maskiou_t = jt.contrib.concat(maskiou_t_list)
        label_t = jt.contrib.concat(label_t_list)
        maskiou_net_input = jt.contrib.concat(maskiou_net_input_list)

        num_samples = maskiou_t.shape[0]
        if cfg.maskious_to_train > 0 and num_samples > cfg.maskious_to_train:
            perm = jt.randperm(num_samples)
            # Fix: cap with maskious_to_train (the guard above checks it);
            # previously this sliced with cfg.masks_to_train, an inconsistent limit.
            select = perm[:cfg.maskious_to_train]
            maskiou_t = maskiou_t[select]
            label_t = label_t[select]
            maskiou_net_input = maskiou_net_input[select]

        return losses, [maskiou_net_input, maskiou_t, label_t]

    return losses
def prepare_for_coco_segmentation(predictions, dataset):
    """Convert per-image BoxList predictions into COCO segmentation results
    (RLE-encoded) suitable for COCO evaluation."""
    import pycocotools.mask as mask_util
    import numpy as np
    masker = Masker(threshold=0.5, padding=1)
    # assert isinstance(dataset, COCODataset)
    coco_results = []
    for image_id in tqdm(predictions):
        prediction = predictions[image_id]
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction) == 0:
            continue

        img_info = dataset.get_img_info(image_id)
        image_width = img_info["width"]
        image_height = img_info["height"]
        prediction = prediction.resize((image_width, image_height))
        masks = prediction.get_field("mask")
        # Masker is necessary only if masks haven't been already resized.
        if prediction.has_field('mask_th'):
            # Masks are still at stride resolution: upsample to the original
            # image size, threshold, then crop off the padding.
            stride_mask = prediction.get_field('stride')
            input_w, input_h = prediction.size
            h = (masks.shape[1] * stride_mask.float() * image_height / input_h).ceil().int32().item()
            w = (masks.shape[2] * stride_mask.float() * image_width / input_w).ceil().int32().item()
            mask_th = prediction.get_field('mask_th')
            masks = (nn.interpolate(masks.unsqueeze(1).float(),
                                    size=(int(h), int(w)),
                                    mode="bilinear",
                                    align_corners=False) > mask_th)
            masks = masks[:, :, :image_height, :image_width]
        else:
            if list(masks.shape[-2:]) != [image_height, image_width]:
                masks = masker([masks], prediction)
                masks = masks[0]

        scores = prediction.get_field("scores").tolist()
        labels = prediction.get_field("labels").tolist()

        # RLE-encode each mask (Fortran order required by pycocotools).
        masks = masks.numpy()
        rles = [
            mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
            for mask in masks
        ]
        for rle in rles:
            rle["counts"] = rle["counts"].decode("utf-8")

        # Map contiguous label ids back to COCO category ids.
        mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels]

        coco_results.extend(
            [
                {
                    "image_id": original_id,
                    "category_id": mapped_labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                }
                for k, rle in enumerate(rles)
            ]
        )
    return coco_results
def execute(self, x):
    """Apply nn.interpolate to x with the positional and keyword arguments
    captured when this module was constructed."""
    args, kwargs = self.args, self.kwdargs
    return nn.interpolate(x, *args, **kwargs)
def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear', visualize_lincomb=False, crop_masks=True, score_threshold=0):
    """
    Postprocesses the output of Yolact on testing mode into a format that
    makes sense, accounting for all the possible configuration settings.

    Args:
        - det_output: The lost of dicts that Detect outputs.
        - w: The real with of the image.
        - h: The real height of the image.
        - batch_idx: If you have multiple images for this batch, the image's index in the batch.
        - interpolation_mode: Can be 'nearest' | 'area' | 'bilinear' (see jt.nn.functional.interpolate)

    Returns 4 jt Tensors (in the following order):
        - classes [num_det]: The class idx for each detection.
        - scores  [num_det]: The confidence score for each detection.
        - boxes   [num_det, 4]: The bounding box for each detection in absolute point form.
        - masks   [num_det, h, w]: Full image masks for each detection.
    """
    dets = det_output[batch_idx]
    net = dets['net']
    dets = dets['detection']

    if dets is None:
        return [jt.array([])] * 4  # Warning, this is 4 copies of the same thing

    if score_threshold > 0:
        keep = dets['score'] > score_threshold
        # Filter every field except the shared prototypes.
        for k in dets:
            if k != 'proto':
                dets[k] = dets[k][keep]
        if dets['score'].shape[0] == 0:
            return [jt.array([])] * 4

    # Actually extract everything from dets now
    classes = dets['class']
    boxes = dets['box']
    scores = dets['score']
    masks = dets['mask']

    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        # At this points masks is only the coefficients
        proto_data = dets['proto']

        # Test flag, do not upvote
        if cfg.mask_proto_debug:
            np.save('scripts/proto.npy', proto_data.numpy())

        if visualize_lincomb:
            display_lincomb(proto_data, masks)

        # Linear combination of prototypes with the per-detection coefficients.
        masks = jt.matmul(proto_data, masks.transpose(1, 0))
        masks = cfg.mask_proto_mask_activation(masks)

        # Crop masks before upsampling because you know why
        if crop_masks:
            masks = crop(masks, boxes)

        # Permute into the correct output shape [num_dets, proto_h, proto_w]
        masks = masks.permute(2, 0, 1)

        if cfg.use_maskiou:
            with timer.env('maskiou_net'):
                with jt.no_grad():
                    maskiou_p = net.maskiou_net(masks.unsqueeze(1))
                    # Pick each detection's iou prediction for its own class.
                    maskiou_p = jt.gather(maskiou_p, dim=1,
                                          index=classes.unsqueeze(1)).squeeze(1)
                    if cfg.rescore_mask:
                        if cfg.rescore_bbox:
                            scores = scores * maskiou_p
                        else:
                            scores = [scores, scores * maskiou_p]

        # Scale masks up to the full image
        masks = nn.interpolate(masks.unsqueeze(0), (h, w),
                               mode=interpolation_mode,
                               align_corners=False).squeeze(0)

        # Binarize the masks
        masks = masks > 0.5

    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, cast=False)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, cast=False)
    boxes = boxes.int32()

    if cfg.mask_type == mask_type.direct and cfg.eval_mask_branch:
        # Upscale masks: paste each fixed-size mask into its box region.
        # NOTE(review): jt.zeros called with unpacked dims — confirm the
        # installed Jittor accepts zeros(a, b, c) rather than zeros((a, b, c)).
        full_masks = jt.zeros(masks.shape[0], h, w)

        for jdx in range(masks.shape[0]):
            x1, y1, x2, y2 = boxes[jdx]

            mask_w = x2 - x1
            mask_h = y2 - y1

            # Just in case
            if mask_w * mask_h <= 0 or mask_w < 0:
                continue

            mask = masks[jdx].view(1, 1, cfg.mask_size, cfg.mask_size)
            mask = nn.interpolate(mask, (mask_h, mask_w),
                                  mode=interpolation_mode,
                                  align_corners=False)
            mask = (mask > 0.5).float()
            full_masks[jdx, y1:y2, x1:x2] = mask

        masks = full_masks

    return classes, scores, boxes, masks
def __init__(self, dlatent_size=512, num_channels=3, resolution=1024, fmap_base=8192, fmap_decay=1.0, fmap_max=512, use_styles=True, const_input_layer=True, use_noise=True, nonlinearity='lrelu', use_wscale=True, use_pixel_norm=False, use_instance_norm=True, blur_filter=None, structure='linear', **kwargs):
    """
    Synthesis network used in the StyleGAN paper.

    :param dlatent_size: Disentangled latent (W) dimensionality.
    :param num_channels: Number of output color channels.
    :param resolution: Output resolution.
    :param fmap_base: Overall multiplier for the number of feature maps.
    :param fmap_decay: log2 feature map reduction when doubling the resolution.
    :param fmap_max: Maximum number of feature maps in any layer.
    :param use_styles: Enable style inputs?
    :param const_input_layer: First layer is a learned constant?
    :param use_noise: Enable noise inputs?
    :param nonlinearity: Activation function: 'relu', 'lrelu'
    :param use_wscale: Enable equalized learning rate?
    :param use_pixel_norm: Enable pixel_wise feature vector normalization?
    :param use_instance_norm: Enable instance normalization?
    :param blur_filter: Low-pass filter to apply when resampling activations. None = no filtering.
    :param structure: 'fixed' = no progressive growing, 'linear' = human-readable
    :param kwargs: Ignore unrecognized keyword args.
    """
    super().__init__()

    # if blur_filter is None:
    #     blur_filter = [1, 2, 1]

    def nf(stage):
        # Feature-map count for a given stage, capped at fmap_max.
        return min(int(fmap_base / (2.0**(stage * fmap_decay))), fmap_max)

    self.structure = structure

    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4
    self.depth = resolution_log2 - 1

    self.num_layers = resolution_log2 * 2 - 2
    self.num_styles = self.num_layers if use_styles else 1

    act, gain = {
        'relu': (nn.ReLU(), np.sqrt(2)),
        'lrelu': (nn.LeakyReLU(scale=0.2), np.sqrt(2))
    }[nonlinearity]

    # Early layers.
    self.init_block = InputBlock(nf(1), dlatent_size, const_input_layer, gain,
                                 use_wscale, use_noise, use_pixel_norm,
                                 use_instance_norm, use_styles, act)
    # create the ToRGB layers for various outputs
    rgb_converters = [
        EqualizedConv2d(nf(1), num_channels, 1, gain=1, use_wscale=use_wscale)
    ]

    # Building blocks for remaining layers: one block (and one ToRGB) per
    # resolution doubling from 8x8 up to the target resolution.
    blocks = []
    for res in range(3, resolution_log2 + 1):
        last_channels = nf(res - 2)
        channels = nf(res - 1)
        # name = '{s}x{s}'.format(s=2 ** res)
        blocks.append(
            GSynthesisBlock(last_channels, channels, blur_filter, dlatent_size,
                            gain, use_wscale, use_noise, use_pixel_norm,
                            use_instance_norm, use_styles, act))
        rgb_converters.append(
            EqualizedConv2d(channels, num_channels, 1, gain=1,
                            use_wscale=use_wscale))

    self.blocks = nn.ModuleList(blocks)
    self.to_rgb = nn.ModuleList(rgb_converters)

    # register the temporary upsampler (used during progressive fade-in)
    # self.temporaryUpsampler = lambda x: interpolate(x, scale_factor=2)
    self.temporaryUpsampler = lambda x: nn.interpolate(
        x, scale_factor=2, mode='nearest')
def train(hyp, opt, tb_writer=None):
    """Train a YOLO model (Jittor port).

    :param hyp: dict of hyperparameters (lr0, lrf, momentum, weight_decay,
        warmup_*, box/cls/obj gains, anchor_t, ...).
    :param opt: parsed argparse namespace (data, cfg, weights, epochs,
        batch_size, img_size, save_dir, and the various boolean switches).
    :param tb_writer: optional TensorBoard SummaryWriter-like object.
    :return: the last `results` tuple
        (P, R, mAP@.5, mAP@.5-.95, val box/obj/cls losses).
    """
    logger.info(
        colorstr('hyperparameters: ') +
        ', '.join(f'{k}={v}' for k, v in hyp.items()))
    save_dir, epochs, batch_size, weights = Path(
        opt.save_dir), opt.epochs, opt.batch_size, opt.weights

    # Directories
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / 'last.pkl'
    best = wdir / 'best.pkl'
    results_file = save_dir / 'results.txt'

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = not opt.no_cuda
    if cuda:
        jt.flags.use_cuda = 1
    init_seeds(1)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # data dict
    check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if opt.single_cls and len(
        data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    model = Model(opt.cfg, ch=3, nc=nc)  # create
    pretrained = weights.endswith('.pkl')
    if pretrained:
        model.load(weights)  # load

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, jt.Var):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, jt.Var):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = optim.LambdaLR(optimizer, lr_lambda=lf)

    loggers = {}  # loggers dict
    start_epoch, best_fitness = 0, 0.0

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[
        -1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # EMA
    ema = ModelEMA(model)

    # Trainloader
    dataloader = create_dataloader(train_path,
                                   imgsz,
                                   batch_size,
                                   gs,
                                   opt,
                                   hyp=hyp,
                                   augment=True,
                                   cache=opt.cache_images,
                                   rect=opt.rect,
                                   workers=opt.workers,
                                   image_weights=opt.image_weights,
                                   quad=opt.quad,
                                   prefix=colorstr('train: '))
    mlc = np.concatenate(dataloader.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    ema.updates = start_epoch * nb // accumulate  # set EMA updates
    testloader = create_dataloader(
        test_path,
        imgsz_test,
        batch_size,
        gs,
        opt,  # testloader
        hyp=hyp,
        cache=opt.cache_images and not opt.notest,
        rect=True,
        workers=opt.workers,
        pad=0.5,
        prefix=colorstr('val: '))

    labels = np.concatenate(dataloader.labels, 0)
    c = jt.array(labels[:, 0])  # classes
    # cf = torch.bincount(c.int(), minlength=nc) + 1.  # frequency
    # model._initialize_biases(cf)
    if plots:
        plot_labels(labels, save_dir, loggers)
        if tb_writer:
            tb_writer.add_histogram('classes', c.numpy(), 0)

    # Anchors
    if not opt.noautoanchor:
        check_anchors(dataloader,
                      model=model,
                      thr=hyp['anchor_t'],
                      imgsz=imgsz)

    # Model parameters
    hyp['box'] *= 3. / nl  # scale to layers
    hyp['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640)**2 * 3. / nl  # scale to image size and layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(
        dataloader.labels, nc) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb),
             1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0
               )  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n'
                f'Using {dataloader.num_workers} dataloader workers\n'
                f'Logging results to {save_dir}\n'
                f'Starting training for {epochs} epochs...')

    for epoch in range(start_epoch, epochs):  # epoch ---------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            cw = model.class_weights.numpy() * (
                1 - maps)**2 / nc  # class weights
            iw = labels_to_image_weights(dataloader.labels,
                                         nc=nc,
                                         class_weights=cw)  # image weights
            dataloader.indices = random.choices(
                range(dataloader.n), weights=iw,
                k=dataloader.n)  # rand weighted idx

        mloss = jt.zeros((4, ))  # mean losses
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 7) %
                    ('Epoch', 'box', 'obj', 'cls', 'total', 'targets',
                     'img_size'))
        pbar = tqdm(pbar, total=nb)  # progress bar
        for i, (imgs, targets, paths, _) in pbar:  # batch -----------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [
                        hyp['warmup_bias_lr'] if j == 2 else 0.0,
                        x['initial_lr'] * lf(epoch)
                    ])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(
                            ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                # int(): random.randrange rejects float bounds on Python 3.12+.
                sz = random.randrange(int(imgsz * 0.5), int(
                    imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = nn.interpolate(imgs,
                                          size=ns,
                                          mode='bilinear',
                                          align_corners=False)

            # Forward
            pred = model(imgs)  # forward
            loss, loss_items = compute_loss(
                pred, targets, model)  # loss scaled by batch_size
            if opt.quad:
                loss *= 4.

            # Optimize
            optimizer.step(loss)
            if ema:
                ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            s = ('%10s' + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1),
                                           *mloss, targets.shape[0],
                                           imgs.shape[-1])
            pbar.set_description(s)

            # Plot
            if plots and ni < 3:
                f = save_dir / f'train_batch{ni}.jpg'  # filename
                Thread(target=plot_images,
                       args=(imgs, targets, paths, f),
                       daemon=True).start()
            # end batch -------------------------------------------------
        # end epoch -----------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # mAP
        if ema:
            ema.update_attr(model,
                            include=[
                                'yaml', 'nc', 'hyp', 'gr', 'names', 'stride',
                                'class_weights'
                            ])
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            results, maps, times = test.test(data=opt.data,
                                             batch_size=batch_size,
                                             imgsz=imgsz_test,
                                             model=ema.ema,
                                             single_cls=opt.single_cls,
                                             dataloader=testloader,
                                             save_dir=save_dir,
                                             plots=plots and final_epoch)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.4g' * 7 % results +
                    '\n')  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                      (results_file, opt.bucket, opt.name))

        # Log
        tags = [
            'train/box_loss',
            'train/obj_loss',
            'train/cls_loss',  # train loss
            'metrics/precision',
            'metrics/recall',
            'metrics/mAP_0.5',
            'metrics/mAP_0.5-0.95',
            'val/box_loss',
            'val/obj_loss',
            'val/cls_loss',  # val loss
            'x/lr0',
            'x/lr1',
            'x/lr2'
        ]  # params
        for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
            if tb_writer:
                if hasattr(x, "numpy"):
                    x = x.numpy()
                tb_writer.add_scalar(tag, x, epoch)  # tensorboard

        # Update best mAP
        fi = fitness(np.array(results).reshape(
            1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
        if fi > best_fitness:
            best_fitness = fi

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:  # Save last, best and delete
            jt.save(ema.ema.state_dict(), last)
            if best_fitness == fi:
                jt.save(ema.ema.state_dict(), best)
        # end epoch -----------------------------------------------------
    # end training

    # Strip optimizers
    final = best if best.exists() else last  # final model
    if opt.bucket:
        os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

    # Plots
    if plots:
        plot_results(save_dir=save_dir)  # save as results.png

    # Test best.pkl
    logger.info('%g epochs completed in %.3f hours.\n' %
                (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    best_model = Model(opt.cfg)
    best_model.load(str(final))
    best_model = best_model.fuse()
    if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
        for conf, iou, save_json in ([0.25, 0.45,
                                      False], [0.001, 0.65,
                                               True]):  # speed, mAP tests
            # Bug fix: the original referenced the undefined name
            # `total_batch_size` (a leftover multi-GPU variable) here,
            # which raised NameError; this single-process port uses
            # `batch_size` throughout.
            results, _, _ = test.test(opt.data,
                                      batch_size=batch_size,
                                      imgsz=imgsz_test,
                                      conf_thres=conf,
                                      iou_thres=iou,
                                      model=best_model,
                                      single_cls=opt.single_cls,
                                      dataloader=testloader,
                                      save_dir=save_dir,
                                      save_json=save_json,
                                      plots=False)

    return results
return 1 / (1 + np.exp(-x)) img_fmt = './data/coco/images/%012d.jpg' with open('info.txt', 'r') as f: img_id = int(f.read()) img = plt.imread(img_fmt % img_id).astype(np.float32) h, w, _ = img.shape gt_masks = np.load('gt.npy').astype(np.float32).transpose(1, 2, 0) proto_masks = np.load('proto.npy').astype(np.float32) proto_masks = jt.array(proto_masks).permute(2, 0, 1).unsqueeze(0) proto_masks = nn.interpolate(proto_masks, (h, w), mode='bilinear', align_corners=False).squeeze(0) proto_masks = proto_masks.permute(1, 2, 0).numpy() # # A x = b ls_A = proto_masks.reshape(-1, proto_masks.shape[-1]) ls_b = gt_masks.reshape(-1, gt_masks.shape[-1]) # x is size [256, num_gt] x = np.linalg.lstsq(ls_A, ls_b, rcond=None)[0] approximated_masks = (np.matmul(proto_masks, x) > 0.5).astype(np.float32) num_gt = approximated_masks.shape[2] ious = mask_iou( jt.array(approximated_masks.reshape(-1, num_gt).transpose(1, 0)),
def execute(self, x):
    """YOLACT forward pass.

    The input should be of size [batch_size, 3, img_h, img_w].

    Returns the raw prediction dict when training (plus optional extra-loss
    heads), or the post-processed detections from ``self.detect`` when
    evaluating.
    """
    _, _, img_h, img_w = x.shape
    # Stash the current input size in the global config so downstream code
    # (e.g. prior generation) can read it.
    cfg._tmp_img_h = img_h
    cfg._tmp_img_w = img_w

    with timer.env('backbone'):
        outs = self.backbone(x)

    if cfg.fpn is not None:
        with timer.env('fpn'):
            # Use backbone.selected_layers because we overwrote self.selected_layers
            outs = [outs[i] for i in cfg.backbone.selected_layers]
            outs = self.fpn(outs)

    proto_out = None
    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        with timer.env('proto'):
            # Prototype branch input: raw image or a chosen feature level.
            proto_x = x if self.proto_src is None else outs[self.proto_src]

            if self.num_grids > 0:
                # Append coordinate grids as extra input channels.
                grids = self.grid.repeat(proto_x.shape[0], 1, 1, 1)
                proto_x = jt.contrib.concat([proto_x, grids], dim=1)

            proto_out = self.proto_net(proto_x)
            proto_out = cfg.mask_proto_prototype_activation(proto_out)

            if cfg.mask_proto_prototypes_as_features:
                # Clone here because we don't want to permute this, though
                # idk if contiguous makes this unnecessary
                proto_downsampled = proto_out.clone()
                if cfg.mask_proto_prototypes_as_features_no_grad:
                    proto_downsampled = proto_out.detach()

            # Move the features last so the multiplication is easy
            proto_out = proto_out.permute(0, 2, 3, 1)

            if cfg.mask_proto_bias:
                # NOTE(review): the comprehension variable shadows the
                # parameter `x`, which is not used again afterwards.
                bias_shape = [x for x in proto_out.shape]
                bias_shape[-1] = 1
                proto_out = jt.contrib.concat(
                    [proto_out, jt.ones(bias_shape)], -1)

    with timer.env('pred_heads'):
        pred_outs = {'loc': [], 'conf': [], 'mask': [], 'priors': []}

        if cfg.use_mask_scoring:
            pred_outs['score'] = []
        if cfg.use_instance_coeff:
            pred_outs['inst'] = []

        for idx, pred_layer in zip(self.selected_layers,
                                   self.prediction_layers):
            pred_x = outs[idx]

            if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features:
                # Scale the prototypes down to the current prediction
                # layer's size and add it as inputs
                proto_downsampled = nn.interpolate(proto_downsampled,
                                                   size=outs[idx].shape[2:],
                                                   mode='bilinear',
                                                   align_corners=False)
                pred_x = jt.contrib.concat([pred_x, proto_downsampled],
                                           dim=1)

            # A hack for the way dataparallel works
            if cfg.share_prediction_module and pred_layer is not self.prediction_layers[
                    0]:
                pred_layer.parent = [self.prediction_layers[0]]

            p = pred_layer(pred_x)

            for k, v in p.items():
                pred_outs[k].append(v)

    # Concatenate per-level predictions along the anchor dimension.
    for k, v in pred_outs.items():
        pred_outs[k] = jt.contrib.concat(v, -2)

    if proto_out is not None:
        pred_outs['proto'] = proto_out

    if self.is_training():
        # For the extra loss functions
        if cfg.use_class_existence_loss:
            pred_outs['classes'] = self.class_existence_fc(
                outs[-1].mean(dim=(2, 3)))
        if cfg.use_semantic_segmentation_loss:
            pred_outs['segm'] = self.semantic_seg_conv(outs[0])
        return pred_outs
    else:
        if cfg.use_mask_scoring:
            pred_outs['score'] = jt.sigmoid(pred_outs['score'])

        if cfg.use_focal_loss:
            if cfg.use_sigmoid_focal_loss:
                # Note: even though conf[0] exists, this mode doesn't train
                # it so don't use it
                pred_outs['conf'] = jt.sigmoid(pred_outs['conf'])
                if cfg.use_mask_scoring:
                    pred_outs['conf'] *= pred_outs['score']
            elif cfg.use_objectness_score:
                # See focal_loss_sigmoid in multibox_loss.py for details
                objectness = jt.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = objectness.unsqueeze(
                    2) * nn.softmax(pred_outs['conf'][:, :, 1:], -1)
                pred_outs['conf'][:, :, 0] = 1 - objectness
            else:
                pred_outs['conf'] = nn.softmax(pred_outs['conf'], -1)
        else:
            if cfg.use_objectness_score:
                # Gate class scores by a hard objectness threshold of 0.10.
                objectness = jt.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = (objectness > 0.10).unsqueeze(-1) \
                    * nn.softmax(pred_outs['conf'][:, :, 1:], dim=-1)
            else:
                pred_outs['conf'] = nn.softmax(pred_outs['conf'], -1)

        return self.detect(pred_outs, self)