Example 1

# Assumed imports for these snippets; conv_bn_relu, extract_vgg_features,
# convert_caffe_color_space, custom_tanh and LossLayer are project-local
# helpers defined elsewhere.
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
def create_train_pconv_unet():
    input_image = layers.Input((320, 240, 3))
    input_mask = layers.Input((320, 240, 3))
    input_groundtruth = layers.Input((320, 240, 3))

    ## U-Net
    # Encoder
    conv1, mask1 = conv_bn_relu(input_image,
                                input_mask,
                                filters=32,
                                kernel_size=3,
                                downsampling=1,
                                reps=2)  # 320x240
    conv2, mask2 = conv_bn_relu(conv1,
                                mask1,
                                filters=64,
                                kernel_size=7,
                                downsampling=5,
                                reps=1)
    conv2, mask2 = conv_bn_relu(conv2,
                                mask2,
                                filters=64,
                                kernel_size=3,
                                downsampling=1,
                                reps=2)  # 64x48
    conv3, mask3 = conv_bn_relu(conv2,
                                mask2,
                                filters=128,
                                kernel_size=3,
                                downsampling=2,
                                reps=3)  # 32x24
    conv4, mask4 = conv_bn_relu(conv3,
                                mask3,
                                filters=256,
                                kernel_size=3,
                                downsampling=2,
                                reps=3)  # 16x12
    conv5, mask5 = conv_bn_relu(conv4,
                                mask4,
                                filters=512,
                                kernel_size=3,
                                downsampling=2,
                                reps=3)  # 8x6
    ## Decoder
    img, mask = conv_bn_relu(conv5,
                             mask5,
                             filters=256,
                             kernel_size=3,
                             upsampling=2,
                             reps=3,
                             concat_img=conv4,
                             concat_mask=mask4)  # 16x12
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=128,
                             kernel_size=3,
                             upsampling=2,
                             reps=3,
                             concat_img=conv3,
                             concat_mask=mask3)  # 32x24
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=64,
                             kernel_size=3,
                             upsampling=2,
                             reps=3,
                             concat_img=conv2,
                             concat_mask=mask2)  # 64x48
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=32,
                             kernel_size=7,
                             upsampling=5,
                             reps=1,
                             concat_img=conv1,
                             concat_mask=mask1)  # 320x240
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=32,
                             kernel_size=3,
                             upsampling=1,
                             reps=2)
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=3,
                             kernel_size=1,
                             reps=1,
                             act="custom_tanh")  # 差分出力
    # skip connection
    img = layers.Add()([img, input_image])  # should speed up convergence
    img = layers.Lambda(lambda x: K.clip(x, -1.0, 1.0))(img)

    ## Loss function
    # true image in the unmasked region + predicted image in the masked region
    y_comp = layers.Lambda(lambda inputs: inputs[0] * inputs[1] +
                           (1 - inputs[0]) * inputs[2])(
                               [input_mask, input_groundtruth, img])
    # Convert to the Caffe color scale
    vgg_in_pred = layers.Lambda(convert_caffe_color_space)(img)
    vgg_in_groundtruth = layers.Lambda(convert_caffe_color_space)(
        input_groundtruth)
    vgg_in_comp = layers.Lambda(convert_caffe_color_space)(y_comp)
    # VGG features
    vgg_pred_1, vgg_pred_2, vgg_pred_3 = extract_vgg_features(
        vgg_in_pred, (320, 240, 3), 0)
    vgg_true_1, vgg_true_2, vgg_true_3 = extract_vgg_features(
        vgg_in_groundtruth, (320, 240, 3), 1)
    vgg_comp_1, vgg_comp_2, vgg_comp_3 = extract_vgg_features(
        vgg_in_comp, (320, 240, 3), 2)
    # Image + loss
    join = LossLayer()([
        input_mask, img, input_groundtruth, y_comp, vgg_pred_1, vgg_pred_2,
        vgg_pred_3, vgg_true_1, vgg_true_2, vgg_true_3, vgg_comp_1, vgg_comp_2,
        vgg_comp_3
    ])
    # Use a single output, since loss/metrics reporting misbehaves with multiple outputs
    model = Model([input_image, input_mask, input_groundtruth],
                  join)  # this model is >100 MB, but the inference model is 93 MB

    return model
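
Both examples rely on project-local helpers that are not shown. As orientation only, here is a minimal sketch of what the two VGG-related helpers plausibly do, assuming the network works on RGB images in [-1, 1] (suggested by the clip above) and a frozen ImageNet VGG16; every name and detail below is an assumption, not the author's implementation.

from tensorflow.keras.applications import VGG16

def convert_caffe_color_space(x):
    # Sketch (assumption): map [-1, 1] RGB to Caffe-style BGR with the
    # ImageNet channel means subtracted, as VGG16's 'imagenet' weights expect.
    x = (x + 1.0) * 127.5                      # [-1, 1] -> [0, 255]
    x = x[..., ::-1]                           # RGB -> BGR
    return x - K.constant([103.939, 116.779, 123.68])

def extract_vgg_features(x, input_shape, index):
    # Sketch (assumption): a frozen VGG16 returning three intermediate
    # feature maps; 'index' only disambiguates the wrapper model's name.
    vgg = VGG16(include_top=False, weights="imagenet", input_shape=input_shape)
    vgg.trainable = False
    outputs = [vgg.get_layer(name).output
               for name in ("block1_pool", "block2_pool", "block3_pool")]
    extractor = Model(vgg.input, outputs, name="vgg_features_%d" % index)
    return extractor(x)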
Example 2

# Same assumed imports and project-local helpers as in Example 1.
def create_train_pconv_unet():
    input_image = layers.Input((448, 336, 3))
    input_mask = layers.Input((448, 336, 3))
    input_groundtruth = layers.Input((448, 336, 3))

    ## U-Net
    # Encoder
    conv1, mask1 = conv_bn_relu(input_image,
                                input_mask,
                                filters=32,
                                kernel_size=3,
                                downsampling=1,
                                reps=2)  # 448x336
    conv2, mask2 = conv_bn_relu(conv1,
                                mask1,
                                filters=64,
                                kernel_size=11,
                                downsampling=7,
                                reps=1)
    conv2, mask2 = conv_bn_relu(conv2,
                                mask2,
                                filters=64,
                                kernel_size=3,
                                downsampling=1,
                                reps=2)  # 64x48
    conv3, mask3 = conv_bn_relu(conv2,
                                mask2,
                                filters=128,
                                kernel_size=3,
                                downsampling=2,
                                reps=3)  # 32x24
    conv4, mask4 = conv_bn_relu(conv3,
                                mask3,
                                filters=256,
                                kernel_size=3,
                                downsampling=2,
                                reps=3)  # 16x12
    conv5, mask5 = conv_bn_relu(conv4,
                                mask4,
                                filters=512,
                                kernel_size=3,
                                downsampling=2,
                                reps=3)  # 8x6
    ## Decoder
    img, mask = conv_bn_relu(conv5,
                             mask5,
                             filters=256,
                             kernel_size=3,
                             upsampling=2,
                             reps=3,
                             concat_img=conv4,
                             concat_mask=mask4)  # 16x12
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=128,
                             kernel_size=3,
                             upsampling=2,
                             reps=3,
                             concat_img=conv3,
                             concat_mask=mask3)  # 32x24
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=64,
                             kernel_size=3,
                             upsampling=2,
                             reps=3,
                             concat_img=conv2,
                             concat_mask=mask2)  # 64x48
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=32,
                             kernel_size=11,
                             upsampling=7,
                             reps=1,
                             concat_img=conv1,
                             concat_mask=mask1)  # 448x336
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=32,
                             kernel_size=3,
                             upsampling=1,
                             reps=2)
    img, mask = conv_bn_relu(img,
                             mask,
                             filters=3,
                             kernel_size=1,
                             reps=1,
                             act="output")  # 出力

    ## Loss function
    # true image in the unmasked region + predicted image in the masked region
    y_comp = layers.Lambda(lambda inputs: inputs[0] * inputs[1] +
                           (1 - inputs[0]) * inputs[2])(
                               [input_mask, input_groundtruth, img])
    # Computing Gram matrices at 640x480 is heavy, so pooling was inserted (to avoid NaN) -> perhaps unnecessary from 448x336?
    #vgg_in_pred = layers.AveragePooling2D(7)(img)
    #vgg_in_groundtruth = layers.AveragePooling2D(7)(input_groundtruth)
    #vgg_in_comp = layers.AveragePooling2D(7)(y_comp)
    vgg_in_pred = img
    vgg_in_groundtruth = input_groundtruth
    vgg_in_comp = y_comp
    # VGG features
    vgg_pred_1, vgg_pred_2, vgg_pred_3 = extract_vgg_features(
        vgg_in_pred, (448, 336, 3), 0)
    vgg_true_1, vgg_true_2, vgg_true_3 = extract_vgg_features(
        vgg_in_groundtruth, (448, 336, 3), 1)
    vgg_comp_1, vgg_comp_2, vgg_comp_3 = extract_vgg_features(
        vgg_in_comp, (448, 336, 3), 2)
    # Image + loss
    join = LossLayer()([
        input_mask, img, input_groundtruth, y_comp, vgg_pred_1, vgg_pred_2,
        vgg_pred_3, vgg_true_1, vgg_true_2, vgg_true_3, vgg_comp_1, vgg_comp_2,
        vgg_comp_3
    ])
    # Use a single output, since loss/metrics reporting misbehaves with multiple outputs
    model = Model([input_image, input_mask, input_groundtruth],
                  join)  # this model is >100 MB, but the inference model is 93 MB

    return model
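
LossLayer is likewise project-local. Given the partial-convolution inpainting formulation (Liu et al., 2018) and the Gram-matrix remark above, it presumably combines L1 terms on valid and hole pixels with perceptual (and likely style) terms on the VGG features, returning a single tensor so the model has one output. A trimmed, hypothetical sketch, using the paper's 1/6/0.05 weights and omitting the style and total-variation terms:

class LossLayer(layers.Layer):
    # Hypothetical sketch: attach the inpainting losses via add_loss and
    # pass the predicted image through as the model's single output.
    def call(self, inputs):
        (mask, y_pred, y_true, y_comp,
         vp1, vp2, vp3, vt1, vt2, vt3, vc1, vc2, vc3) = inputs

        def l1(a, b):
            return K.mean(K.abs(a - b))

        loss_valid = l1(mask * y_pred, mask * y_true)
        loss_hole = 6.0 * l1((1 - mask) * y_pred, (1 - mask) * y_true)
        loss_perc = 0.05 * sum(
            l1(p, t) + l1(c, t)
            for p, t, c in zip((vp1, vp2, vp3), (vt1, vt2, vt3),
                               (vc1, vc2, vc3)))
        self.add_loss(loss_valid + loss_hole + loss_perc)
        return y_pred

# Hypothetical usage: with the losses attached inside LossLayer via
# add_loss, the model compiles without an external loss function.
model = create_train_pconv_unet()
model.compile(optimizer="adam")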