def __call__(self, images):
    """Predict one ``(2, 3)`` transformation per time step for ``images``.

    The batch runs through ``conv0``/``bn0`` and the stages ``rs1``..``rs3``
    with pooling in between. The pooled result is fed to ``self.lstm``
    ``num_timesteps`` times, and each LSTM output is mapped by
    ``transform_2`` and reshaped to ``(-1, 2, 3)``.

    Args:
        images: Input batch handed to ``conv0``.

    Returns:
        The ``rotation_dropout`` outputs of all time steps, concatenated
        along axis 0.

    Side effects:
        Resets the state of ``lstm`` and ``transform_2`` and stores the
        output of ``rs3`` in ``self.vis_anchor``.
    """
    # Start every forward pass without state left over from a previous call.
    self.lstm.reset_state()
    self.transform_2.reset_state()

    stem = F.relu(self.bn0(self.conv0(images)))
    features = F.average_pooling_2d(stem, 2, stride=2)
    features = F.max_pooling_2d(self.rs1(features), 2, stride=2)
    features = F.max_pooling_2d(self.rs2(features), 2, stride=2)
    features = self.rs3(features)
    # NOTE: an `rs4` stage and an attention-based LSTM input (`self.attend`)
    # existed only as commented-out code and are not applied here.
    self.vis_anchor = features
    pooled = F.average_pooling_2d(features, 5, stride=2)

    def predict_step():
        # The LSTM sees the same pooled features at every step; only its
        # internal state changes between steps.
        hidden = F.relu(self.lstm(pooled))
        params = F.reshape(self.transform_2(hidden), (-1, 2, 3))
        return rotation_dropout(params, ratio=self.dropout_ratio)

    with cuda.get_device_from_array(pooled.data):
        localizations = [predict_step() for _ in range(self.num_timesteps)]

    return F.concat(localizations, axis=0)
def __call__(self, images):
    """Predict ``num_timesteps`` 2D affine transformation matrices per image.

    Args:
        images: Input batch handed to ``conv0``.

    Returns:
        The ``rotation_dropout`` outputs of all time steps, each derived
        from a ``(-1, 2, 3)`` reshape, concatenated along axis 0.

    Side effects:
        Resets the state of ``self.lstm`` and stores the output of ``rs3``
        in ``self.vis_anchor``.
    """
    self.lstm.reset_state()

    # Feature extraction: conv stem followed by the stages rs1..rs3.
    # NOTE: an `rs4` stage existed only as commented-out code.
    feature_map = F.relu(self.bn0(self.conv0(images)))
    feature_map = F.average_pooling_2d(feature_map, 2, stride=2)
    feature_map = F.max_pooling_2d(self.rs1(feature_map), 2, stride=2)
    feature_map = F.max_pooling_2d(self.rs2(feature_map), 2, stride=2)
    feature_map = self.rs3(feature_map)
    self.vis_anchor = feature_map
    lstm_input = F.average_pooling_2d(feature_map, 5, stride=2)

    # Predict N 2D affine transformation matrices A (one per time step).
    localizations = []
    with cuda.get_device_from_array(lstm_input.data):
        steps_done = 0
        while steps_done < self.num_timesteps:
            hidden = F.relu(self.lstm(lstm_input))
            affine = F.reshape(self.transform_2(hidden), (-1, 2, 3))
            # rotation_dropout: rotation dropout, meant to guard against
            # excessive rotation.
            affine = rotation_dropout(affine, ratio=self.dropout_ratio)
            localizations.append(affine)
            steps_done += 1

    return F.concat(localizations, axis=0)
def localization_net(self, images):
    """Run the localization branch and return per-time-step transformations.

    Args:
        images: Input batch; it is normalised by ``data_bn`` before the
            convolutional stem.

    Returns:
        The ``rotation_dropout`` outputs of all ``num_timesteps`` steps,
        each derived from a ``(-1, 2, 3)`` reshape, concatenated along
        axis 0.

    Side effects:
        Resets the state of ``lstm`` and ``transform_2`` and stores the
        output of the last applied stage in
        ``self.localization_vis_anchor``.
    """
    self.lstm.reset_state()
    self.transform_2.reset_state()

    normalised = self.data_bn(images)
    features = F.relu(self.bn0(self.conv0(normalised)))
    features = F.max_pooling_2d(features, 3, stride=2, pad=1)

    # The stages are applied strictly in this order.
    # NOTE: `rs4_1` / `rs4_2` existed only as commented-out code.
    stages = (
        self.rs1_1, self.rs1_2,
        self.rs2_1, self.rs2_2,
        self.rs3_1, self.rs3_2,
    )
    for stage in stages:
        features = stage(features)

    self.localization_vis_anchor = features
    pooled = F.average_pooling_2d(features, 5, stride=1)

    localizations = []
    with cuda.get_device_from_array(pooled.data):
        for _ in range(self.num_timesteps):
            # Same pooled input every step; the LSTM state provides the
            # step-to-step variation.
            hidden = F.relu(self.lstm(pooled))
            params = F.reshape(self.transform_2(hidden), (-1, 2, 3))
            localizations.append(
                rotation_dropout(params, ratio=self.dropout_ratio)
            )

    return F.concat(localizations, axis=0)
def __call__(self, images):
    """Predict one transformation per image and sample the matching region.

    Args:
        images: Input batch. It is scaled by 255 and passed through
            ``prepare_images`` before feature extraction; the unscaled
            ``images`` are what the sampler reads from.

    Returns:
        ``h`` (the feature extractor output) when ``self.train_imagenet``
        is truthy; otherwise a tuple ``(rois, points)`` holding the sampled
        regions and the sampling grid.

    Side effects:
        Clears ``self.visual_backprop_anchors`` and, unless returning
        early, appends the final feature map to it.
    """
    self.visual_backprop_anchors.clear()

    # NOTE(review): the extent of this device context was reconstructed
    # from whitespace-collapsed source -- confirm it only wraps the
    # image preparation.
    with cuda.Device(images.data.device):
        scaled = images.copy() * 255
        input_images = self.prepare_images(scaled)

    h = self.feature_extractor(input_images)
    if self.train_imagenet:
        return h

    # Extra stages are only applied to taller inputs.
    height = images.shape[-2]
    if height > 224:
        h = self.res6(h)
    if height > 300:
        h = self.res7(h)

    self.visual_backprop_anchors.append(h)
    pooled = _global_average_pooling_2d(h)

    raw_params = self.param_predictor(pooled)
    # ratio=0.0 is passed explicitly here instead of a configurable ratio.
    transform_params = rotation_dropout(
        F.reshape(raw_params, (-1, 2, 3)),
        ratio=0.0,
    )

    points = F.spatial_transformer_grid(transform_params, self.out_size)
    rois = F.spatial_transformer_sampler(images, points)

    if not self.transform_rois_to_grayscale:
        return rois, points

    assert rois.shape[1] == 3, "rois are not in RGB, can not convert them to grayscale"
    # Channels are unpacked as (b, g, r) -- presumably BGR order; the
    # weights match the common Rec. 601 luma coefficients.
    b, g, r = F.split_axis(rois, 3, axis=1)
    rois = 0.299 * r + 0.587 * g + 0.114 * b
    return rois, points
def __call__(self, images):
    """Predict per-time-step transformations, optionally refining them.

    Args:
        images: Input batch handed to ``conv0``; also passed to every
            refinement step.

    Returns:
        A list of tensors. Entry ``i`` is the axis-0 concatenation, over all
        time steps, of the ``i``-th localization recorded in each time step
        (entry 0 is the initial prediction, later entries come from the
        refinement loop).

    Side effects:
        Resets the state of ``self.lstm`` and stores the output of ``rs3``
        in ``self.vis_anchor``.
    """
    self.lstm.reset_state()

    h = self.bn0(self.conv0(images))
    h = F.average_pooling_2d(F.relu(h), 2, stride=2)

    h = self.rs1(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    # h = self.rs4(h)
    self.vis_anchor = h
    h = F.average_pooling_2d(h, 5)

    localizations = []

    with cuda.get_device_from_array(h.data):
        for _ in range(self.num_timesteps):
            # Localizations recorded for this time step: the initial
            # prediction first, then the refined ones.
            timestep_localizations = []
            in_feature = h
            lstm_prediction = F.relu(self.lstm(in_feature))
            transformed = self.transform_2(lstm_prediction)
            transformed = F.reshape(transformed, (-1, 2, 3))
            transformation_params = rotation_dropout(
                transformed, ratio=self.dropout_ratio)
            timestep_localizations.append(transformation_params)

            # self.transform_2.disable_update()
            if self.do_parameter_refinement:
                transformation_params = self.to_homogeneous_coordinates(
                    transformation_params)
                # refine the transformation parameters
                # NOTE: the inner loop reuses `_` as its loop variable; this
                # is harmless because neither loop reads it.
                for _ in range(self.num_refinement_steps):
                    transformation_deltas = self.do_transformation_param_refinement_step(
                        images, transformation_params)
                    transformation_deltas = self.to_homogeneous_coordinates(
                        transformation_deltas)

                    # Compose the current parameters with the predicted
                    # deltas (parameters on the left, deltas on the right).
                    transformation_params = F.batch_matmul(
                        transformation_params, transformation_deltas)
                    # transformation_params = F.batch_matmul(transformation_deltas, transformation_params)
                    # Drop the last row again -- presumably the homogeneous
                    # row added by `to_homogeneous_coordinates`.
                    # NOTE(review): nesting reconstructed from
                    # whitespace-collapsed source -- confirm this append
                    # belongs inside the refinement loop.
                    timestep_localizations.append(
                        transformation_params[:, :-1, :])

            localizations.append(timestep_localizations)

    # Transpose [time step][level] -> [level][time step] and concatenate the
    # time steps of every level along axis 0.
    return [F.concat(loc, axis=0) for loc in zip(*localizations)]
def __call__(self, images):
    """Predict per-time-step transformations from two composed 3x3 matrices.

    For every time step two ``(-1, 2, 3)`` predictions are made from the
    LSTM output, one passed through ``disable_shearing`` and one through
    ``disable_translation``. Both are extended to 3x3 with a ``[0, 0, 1]``
    row, multiplied, and cut back to the first two rows.

    Args:
        images: Input batch handed to ``conv0``.

    Returns:
        The ``rotation_dropout`` outputs of all ``num_timesteps`` steps,
        concatenated along axis 0.

    Side effects:
        Resets the state of ``lstm`` and ``transform_2`` and stores the
        output of ``rs3`` in ``self.vis_anchor``.
    """
    self.lstm.reset_state()
    # NOTE(review): `transform_2` is reset here but never called in this
    # method -- confirm whether this reset is still needed.
    self.transform_2.reset_state()

    h = self.bn0(self.conv0(images))
    h = F.average_pooling_2d(F.relu(h), 2, stride=2)

    h = self.rs1(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h
    h = F.average_pooling_2d(h, 5, stride=2)

    localizations = []

    with cuda.get_device_from_array(h.data):
        # Constant row [0, 0, 1] per batch element, appended below to turn
        # each 2x3 matrix into a 3x3 one.
        homogenuous_addon = self.xp.zeros((len(h), 1, 3), dtype=h.data.dtype)
        homogenuous_addon[:, 0, 2] = 1

        for _ in range(self.num_timesteps):
            lstm_prediction = F.relu(self.lstm(h))

            # NOTE(review): the "translation" branch is computed with
            # `self.rotation_transform`, the same link as the rotation
            # branch below. This looks like a copy-paste slip -- confirm
            # whether a separate translation link was intended.
            translation_transform = F.reshape(
                self.rotation_transform(lstm_prediction), (-1, 2, 3))
            translation_transform = disable_shearing(translation_transform)
            translation_transform = F.concat(
                (translation_transform, homogenuous_addon), axis=1)

            rotation_transform = F.reshape(
                self.rotation_transform(lstm_prediction), (-1, 2, 3))
            rotation_transform = disable_translation(rotation_transform)
            rotation_transform = F.concat(
                (rotation_transform, homogenuous_addon), axis=1)

            # first rotate, then translate
            transform = F.batch_matmul(rotation_transform, translation_transform)
            # homogenuous_multiplier = F.get_item(transform, (..., 2, 2))
            #
            # # bring matrices from homogenous coordinates to normal coordinates
            # Keep only the first two rows of each 3x3 product.
            transform = transform[:, :2, :]
            # transform = transform / homogenuous_multiplier

            # NOTE: this method reads `self.dropout_factor`, not
            # `self.dropout_ratio`.
            localizations.append(
                rotation_dropout(transform, ratio=self.dropout_factor))

    return F.concat(localizations, axis=0)
def get_transform_params(self, features):
    """Predict one ``(2, 3)`` transformation per box from ``features``.

    ``features`` is mapped by ``pre_transform_params`` and split along
    axis 1 into ``num_bboxes_to_localize`` pieces. Each piece is fed to
    ``self.lstm`` in order, and the LSTM outputs are turned into
    transformation parameters by ``param_predictor``.

    Args:
        features: Input handed to ``pre_transform_params``.

    Returns:
        The output of ``rotation_dropout`` applied to the predictions
        reshaped to ``(-1, 2, 3)``.
    """
    projected = self.pre_transform_params(features)
    pieces = F.split_axis(projected, self.num_bboxes_to_localize, axis=1)

    # Feed the pieces to the LSTM one after another, in split order.
    per_box_outputs = []
    for piece in pieces:
        per_box_outputs.append(self.lstm(piece))
    stacked = F.stack(per_box_outputs, axis=1)

    # The unpacked values are unused; the unpacking doubles as a check that
    # the stacked tensor has exactly three axes.
    _batch_size, _num_boxes, _ = stacked.shape

    # Merge the first two axes so every box becomes its own row.
    flat_shape = (-1,) + stacked.shape[2:]
    flattened = F.reshape(stacked, flat_shape)

    raw_params = self.param_predictor(flattened)
    matrices = F.reshape(raw_params, (-1, 2, 3))
    return rotation_dropout(matrices, ratio=self.dropout_ratio)
def get_transform_params(self, features):
    """Predict one ``(2, 3)`` transformation per box from pooled features.

    ``features`` is reduced by ``_global_average_pooling_2d``. The same
    pooled tensor is fed to ``self.lstm`` ``num_bboxes_to_localize`` times,
    and the LSTM outputs are turned into transformation parameters by
    ``param_predictor``.

    Args:
        features: Feature map handed to ``_global_average_pooling_2d``.

    Returns:
        The output of ``rotation_dropout`` applied to the predictions
        reshaped to ``(-1, 2, 3)``.
    """
    pooled = _global_average_pooling_2d(features)

    # Identical input on every call; the outputs differ only through the
    # state the LSTM keeps between calls.
    per_box_outputs = []
    for _ in range(self.num_bboxes_to_localize):
        per_box_outputs.append(self.lstm(pooled))
    stacked = F.stack(per_box_outputs, axis=1)

    # The unpacked values are unused; the unpacking doubles as a check that
    # the stacked tensor has exactly three axes.
    _batch_size, _num_boxes, _ = stacked.shape

    # Merge the first two axes so every box becomes its own row.
    flat_shape = (-1,) + stacked.shape[2:]
    flattened = F.reshape(stacked, flat_shape)

    raw_params = self.param_predictor(flattened)
    matrices = F.reshape(raw_params, (-1, 2, 3))
    return rotation_dropout(matrices, ratio=self.dropout_ratio)
def get_transform_params(self, features):
    """Predict transformation parameters step by step with the decoder.

    Starting from an all-zero ``(batch, 1, 6)`` target, each of the
    ``num_bboxes_to_localize`` iterations embeds the targets produced so
    far, decodes them against the flattened features, and appends the
    newest prediction to the target sequence.

    Args:
        features: Four-axis feature tensor; the last two axes are flattened
            into one and moved in front of the channel axis.

    Returns:
        The output of ``rotation_dropout`` applied to all generated
        predictions (the zero start entry excluded) reshaped to
        ``(-1, 2, 3)``.
    """
    # The unpacking also enforces that `features` has exactly four axes.
    batch_size, num_channels, _height, _width = features.shape

    # (batch, channels, H, W) -> (batch, H*W, channels)
    memory = F.transpose(
        F.reshape(features, (batch_size, num_channels, -1)),
        (0, 2, 1),
    )

    start = self.xp.zeros((batch_size, 1, 6), dtype=chainer.get_dtype())
    target = chainer.Variable(start)

    for _ in range(self.num_bboxes_to_localize):
        # The raw array (not the Variable) of the targets so far is embedded.
        embedded = self.param_embedder(target.array, n_batch_axes=2)
        embedded = self.positional_encoding(embedded)
        decoded = self.decoder(embedded, memory, None, self.mask)
        predicted = self.param_predictor(decoded, n_batch_axes=2)
        # Only the prediction for the last position is appended; F.concat is
        # called with its default axis (1 in Chainer).
        newest = predicted[:, -1:]
        target = F.concat([target, newest])

    # Drop the zero start entry and merge batch and sequence axes.
    generated = target[:, 1:]
    flattened = F.reshape(generated, (-1,) + target.shape[2:])

    matrices = F.reshape(flattened, (-1, 2, 3))
    return rotation_dropout(matrices, ratio=self.dropout_ratio)
def do_transformation_param_refinement_step(self, images, transformation_params):
    """Predict a refined set of transformation parameters.

    The regions described by ``transformation_params`` are sampled from
    ``images`` and run through a second feature extractor (``conv0_1``,
    ``bn0_1``, ``rs4``, ``rs5``); ``refinement_transform`` maps the result
    to new parameters.

    Args:
        images: Batch the regions are sampled from.
        transformation_params: Current parameters; they are passed through
            ``remove_homogeneous_coordinates`` before building the grid.

    Returns:
        The output of ``rotation_dropout`` applied to the new prediction
        reshaped to ``(-1, 2, 3)``.
    """
    plain_params = self.remove_homogeneous_coordinates(transformation_params)

    # Sample the regions the current parameters describe.
    grid = F.spatial_transformer_grid(plain_params, self.target_shape)
    rois = F.spatial_transformer_sampler(images, grid)

    # Re-run part of the feature extraction on the sampled regions.
    features = F.relu(self.bn0_1(self.conv0_1(rois)))
    features = F.average_pooling_2d(features, 2, stride=2)
    features = F.max_pooling_2d(self.rs4(features), 2, stride=2)
    features = F.max_pooling_2d(self.rs5(features), 2, stride=2)

    refined = F.reshape(self.refinement_transform(features), (-1, 2, 3))
    return rotation_dropout(refined, ratio=self.dropout_ratio)