def create_train_pconv_unet():
    input_image = layers.Input((320, 240, 3))
    input_mask = layers.Input((320, 240, 3))
    input_groundtruth = layers.Input((320, 240, 3))

    ## U-Net
    # Encoder
    conv1, mask1 = conv_bn_relu(input_image, input_mask, filters=32,
                                kernel_size=3, downsampling=1, reps=2)   # 320x240
    conv2, mask2 = conv_bn_relu(conv1, mask1, filters=64,
                                kernel_size=7, downsampling=5, reps=1)
    conv2, mask2 = conv_bn_relu(conv2, mask2, filters=64,
                                kernel_size=3, downsampling=1, reps=2)   # 64x48
    conv3, mask3 = conv_bn_relu(conv2, mask2, filters=128,
                                kernel_size=3, downsampling=2, reps=3)   # 32x24
    conv4, mask4 = conv_bn_relu(conv3, mask3, filters=256,
                                kernel_size=3, downsampling=2, reps=3)   # 16x12
    conv5, mask5 = conv_bn_relu(conv4, mask4, filters=512,
                                kernel_size=3, downsampling=2, reps=3)   # 8x6

    ## Decoder
    img, mask = conv_bn_relu(conv5, mask5, filters=256, kernel_size=3, upsampling=2,
                             reps=3, concat_img=conv4, concat_mask=mask4)  # 16x12
    img, mask = conv_bn_relu(img, mask, filters=128, kernel_size=3, upsampling=2,
                             reps=3, concat_img=conv3, concat_mask=mask3)  # 32x24
    img, mask = conv_bn_relu(img, mask, filters=64, kernel_size=3, upsampling=2,
                             reps=3, concat_img=conv2, concat_mask=mask2)  # 64x48
    img, mask = conv_bn_relu(img, mask, filters=32, kernel_size=7, upsampling=5,
                             reps=1, concat_img=conv1, concat_mask=mask1)  # 320x240
    img, mask = conv_bn_relu(img, mask, filters=32, kernel_size=3,
                             upsampling=1, reps=2)
    img, mask = conv_bn_relu(img, mask, filters=3, kernel_size=1,
                             reps=1, act="custom_tanh")  # residual (difference) output

    # skip connection
    img = layers.Add()([img, input_image])  # should make convergence faster
    img = layers.Lambda(lambda x: K.clip(x, -1.0, 1.0))(img)

    ## Loss function
    # Ground-truth image in the unmasked regions + predicted image in the masked regions
    y_comp = layers.Lambda(
        lambda inputs: inputs[0] * inputs[1] + (1 - inputs[0]) * inputs[2])(
            [input_mask, input_groundtruth, img])

    # Convert to the Caffe color scale
    vgg_in_pred = layers.Lambda(convert_caffe_color_space)(img)
    vgg_in_groundtruth = layers.Lambda(convert_caffe_color_space)(input_groundtruth)
    vgg_in_comp = layers.Lambda(convert_caffe_color_space)(y_comp)

    # VGG features
    vgg_pred_1, vgg_pred_2, vgg_pred_3 = extract_vgg_features(
        vgg_in_pred, (320, 240, 3), 0)
    vgg_true_1, vgg_true_2, vgg_true_3 = extract_vgg_features(
        vgg_in_groundtruth, (320, 240, 3), 1)
    vgg_comp_1, vgg_comp_2, vgg_comp_3 = extract_vgg_features(
        vgg_in_comp, (320, 240, 3), 2)

    # Image + loss
    join = LossLayer()([
        input_mask, img, input_groundtruth, y_comp,
        vgg_pred_1, vgg_pred_2, vgg_pred_3,
        vgg_true_1, vgg_true_2, vgg_true_3,
        vgg_comp_1, vgg_comp_2, vgg_comp_3
    ])

    # Displaying loss/metrics does not work well with multiple outputs, so use a single output
    model = Model([input_image, input_mask, input_groundtruth], join)
    # This training model is >100 MB, whereas the inference-only model is 93 MB
    return model
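The VGG inputs above go through convert_caffe_color_space first, because ImageNet-pretrained VGG weights expect Caffe-style preprocessing (BGR channel order with the ImageNet channel means subtracted). The helper itself is not shown here; the following is a minimal sketch of that kind of conversion, assuming the network works on RGB images scaled to [-1, 1] (the clip to [-1, 1] above suggests this). The scaling assumption is mine; only the Caffe convention itself is standard.

from tensorflow.keras import backend as K

def convert_caffe_color_space(x):
    # [-1, 1] RGB -> [0, 255] RGB (assumed input range)
    x = (x + 1.0) * 127.5
    # RGB -> BGR channel order
    x = x[..., ::-1]
    # Subtract the ImageNet channel means used by Caffe-trained VGG weights
    mean = K.constant([103.939, 116.779, 123.68])
    return x - mean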
def create_train_pconv_unet():
    input_image = layers.Input((448, 336, 3))
    input_mask = layers.Input((448, 336, 3))
    input_groundtruth = layers.Input((448, 336, 3))

    ## U-Net
    # Encoder
    conv1, mask1 = conv_bn_relu(input_image, input_mask, filters=32,
                                kernel_size=3, downsampling=1, reps=2)   # 448x336
    conv2, mask2 = conv_bn_relu(conv1, mask1, filters=64,
                                kernel_size=11, downsampling=7, reps=1)
    conv2, mask2 = conv_bn_relu(conv2, mask2, filters=64,
                                kernel_size=3, downsampling=1, reps=2)   # 64x48
    conv3, mask3 = conv_bn_relu(conv2, mask2, filters=128,
                                kernel_size=3, downsampling=2, reps=3)   # 32x24
    conv4, mask4 = conv_bn_relu(conv3, mask3, filters=256,
                                kernel_size=3, downsampling=2, reps=3)   # 16x12
    conv5, mask5 = conv_bn_relu(conv4, mask4, filters=512,
                                kernel_size=3, downsampling=2, reps=3)   # 8x6

    ## Decoder
    img, mask = conv_bn_relu(conv5, mask5, filters=256, kernel_size=3, upsampling=2,
                             reps=3, concat_img=conv4, concat_mask=mask4)  # 16x12
    img, mask = conv_bn_relu(img, mask, filters=128, kernel_size=3, upsampling=2,
                             reps=3, concat_img=conv3, concat_mask=mask3)  # 32x24
    img, mask = conv_bn_relu(img, mask, filters=64, kernel_size=3, upsampling=2,
                             reps=3, concat_img=conv2, concat_mask=mask2)  # 64x48
    img, mask = conv_bn_relu(img, mask, filters=32, kernel_size=11, upsampling=7,
                             reps=1, concat_img=conv1, concat_mask=mask1)  # 448x336
    img, mask = conv_bn_relu(img, mask, filters=32, kernel_size=3,
                             upsampling=1, reps=2)
    img, mask = conv_bn_relu(img, mask, filters=3, kernel_size=1,
                             reps=1, act="output")  # output

    ## Loss function
    # Ground-truth image in the unmasked regions + predicted image in the masked regions
    y_comp = layers.Lambda(
        lambda inputs: inputs[0] * inputs[1] + (1 - inputs[0]) * inputs[2])(
            [input_mask, input_groundtruth, img])

    # Computing Gram matrices at 640x480 is too heavy, so insert pooling (to avoid NaN)
    # -> maybe unnecessary when starting from 448x336?
    # vgg_in_pred = layers.AveragePooling2D(7)(img)
    # vgg_in_groundtruth = layers.AveragePooling2D(7)(input_groundtruth)
    # vgg_in_comp = layers.AveragePooling2D(7)(y_comp)
    vgg_in_pred = img
    vgg_in_groundtruth = input_groundtruth
    vgg_in_comp = y_comp

    # VGG features
    vgg_pred_1, vgg_pred_2, vgg_pred_3 = extract_vgg_features(
        vgg_in_pred, (448, 336, 3), 0)
    vgg_true_1, vgg_true_2, vgg_true_3 = extract_vgg_features(
        vgg_in_groundtruth, (448, 336, 3), 1)
    vgg_comp_1, vgg_comp_2, vgg_comp_3 = extract_vgg_features(
        vgg_in_comp, (448, 336, 3), 2)

    # Image + loss
    join = LossLayer()([
        input_mask, img, input_groundtruth, y_comp,
        vgg_pred_1, vgg_pred_2, vgg_pred_3,
        vgg_true_1, vgg_true_2, vgg_true_3,
        vgg_comp_1, vgg_comp_2, vgg_comp_3
    ])

    # Displaying loss/metrics does not work well with multiple outputs, so use a single output
    model = Model([input_image, input_mask, input_groundtruth], join)
    # This training model is >100 MB, whereas the inference-only model is 93 MB
    return model
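Both variants call extract_vgg_features three times (for the prediction, the ground truth, and the composited image) to feed the perceptual and style terms of the loss. Its definition is not part of this listing; the sketch below shows one plausible shape for it, assuming a frozen ImageNet-pretrained VGG16 and the pool1/pool2/pool3 activations as the three returned feature maps. The chosen layers, the use of tf.keras, and the role of the third argument (a suffix to keep the three extractor instances distinct) are assumptions inferred from how the function is called above.

from tensorflow.keras import layers
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model

def extract_vgg_features(x, input_shape, index):
    # Frozen VGG16 used only as a fixed feature extractor for the perceptual/style losses
    vgg = VGG16(include_top=False, weights="imagenet", input_shape=input_shape)
    vgg.trainable = False
    extractor = Model(
        inputs=vgg.input,
        outputs=[vgg.get_layer(name).output
                 for name in ("block1_pool", "block2_pool", "block3_pool")],
        name="vgg_extractor_%d" % index)  # index keeps the three extractors uniquely named
    # Returns a list of three tensors, unpacked by the caller
    return extractor(x)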