def G(z, is_train=True):
    """Build the generator graph that turns a latent batch into images.

    Args:
        z: Latent input; it is reshaped to ``[batch_size, z_dim, 1, 1]`` using
            the module-level ``batch_size`` and ``z_dim``.
        is_train: Forwarded as ``batch_stat`` to every batch normalization.

    Returns:
        The sigmoid-activated, single-map output of the last convolution.
    """
    hidden = F.reshape(z, [batch_size, z_dim, 1, 1])

    # (scope name, output maps, extra deconvolution keyword arguments)
    stages = (
        ('deconv1', 256, {}),
        ('deconv2', 128, {'pad': (1, 1), 'stride': (2, 2)}),
        ('deconv3', 64, {'pad': (1, 1), 'stride': (2, 2)}),
        ('deconv4', 32, {'pad': (3, 3), 'stride': (2, 2)}),
    )

    with nn.parameter_scope('G'):
        # Every stage is deconvolution -> batch normalization -> leaky ReLU.
        for scope, maps, extra in stages:
            with nn.parameter_scope(scope):
                hidden = PF.deconvolution(hidden, maps, (4, 4),
                                          with_bias=False, **extra)
                hidden = PF.batch_normalization(hidden, batch_stat=is_train)
                hidden = F.leaky_relu(hidden)

        with nn.parameter_scope('output'):
            image = F.sigmoid(PF.convolution(hidden, 1, (3, 3), pad=(1, 1)))

    return image
def generator(z, maxh=256, test=False, output_hidden=False):
    """
    Building generator network which takes (B, Z, 1, 1) inputs and generates
    (B, 1, 56, 56) outputs.

    Args:
        z: Latent input of shape (B, Z, 1, 1).
        maxh: Number of maps of the first deconvolution. The following layers
            use ``maxh // 2``, ``maxh // 4`` and ``maxh // 8`` maps, so it
            must be at least 8.
        test: When True, batch normalization is called with
            ``batch_stat=False``.
        output_hidden: When True, the hidden activations ``[d1, d2, d3, d4]``
            are returned as well (``d5`` is not part of that list).

    Returns:
        The tanh output ``x``, or ``(x, [d1, d2, d3, d4])`` when
        ``output_hidden`` is set.

    The shape notes below are for the default ``maxh=256``.
    """
    # Define shortcut functions
    def bn(x):
        # Batch normalization
        return PF.batch_normalization(x, batch_stat=not test)

    def upsample2(x, c):
        # Twice upsampling with deconvolution.
        return PF.deconvolution(x, c, kernel=(4, 4), pad=(1, 1), stride=(2, 2), with_bias=False)

    # The narrowest layers use maxh // 8 maps, which must stay positive.
    # Floor division keeps every map count an int under Python 3.
    assert maxh // 8 > 0
    with nn.parameter_scope("gen"):
        # (Z, 1, 1) --> (256, 4, 4)
        with nn.parameter_scope("deconv1"):
            d1 = F.elu(bn(PF.deconvolution(z, maxh, (4, 4), with_bias=False)))
        # (256, 4, 4) --> (128, 8, 8)
        with nn.parameter_scope("deconv2"):
            d2 = F.elu(bn(upsample2(d1, maxh // 2)))
        # (128, 8, 8) --> (64, 16, 16)
        with nn.parameter_scope("deconv3"):
            d3 = F.elu(bn(upsample2(d2, maxh // 4)))
        # (64, 16, 16) --> (32, 28, 28)
        with nn.parameter_scope("deconv4"):
            # Convolution with kernel=4, pad=3 and stride=2 transforms a 28 x 28 map
            # to a 16 x 16 map. Deconvolution with those parameters behaves like an
            # inverse operation, i.e. maps 16 x 16 to 28 x 28.
            d4 = F.elu(bn(PF.deconvolution(
                d3, maxh // 8, (4, 4), pad=(3, 3), stride=(2, 2), with_bias=False)))
        # (32, 28, 28) --> (32, 56, 56)
        with nn.parameter_scope("deconv5"):
            # Plain twice upsampling; the number of maps is kept.
            d5 = F.elu(bn(upsample2(d4, maxh // 8)))
        # (32, 56, 56) --> (1, 56, 56)
        with nn.parameter_scope("conv6"):
            x = F.tanh(PF.convolution(d5, 1, (3, 3), pad=(1, 1)))
    if output_hidden:
        return x, [d1, d2, d3, d4]
    return x
def decoder(self, x, test):
    """Decode ``x`` with a residual stack followed by two deconvolutions.

    Args:
        x: Input passed to ``self.decoder_res_stack``.
        test: Forwarded to the residual stack; batch normalization uses
            ``batch_stat=not test``.

    Returns:
        The tanh-activated output with ``self.in_channels`` maps.
    """
    # Settings shared by both 4x4 deconvolutions.
    up_opts = dict(stride=(2, 2), pad=(1, 1), rng=self.rng)

    with nn.parameter_scope('decoder'):
        hidden = F.relu(self.decoder_res_stack(x, test=test))

        hidden = PF.deconvolution(hidden, self.num_hidden, (4, 4),
                                  name='deconv_1', **up_opts)
        hidden = F.relu(PF.batch_normalization(hidden, batch_stat=not test))

        hidden = PF.deconvolution(hidden, self.in_channels, (4, 4),
                                  name='deconv_2', **up_opts)
        return F.tanh(hidden)
def upsample(x, c):
    """Apply a bias-free 4x4 deconvolution (pad 1, stride 2) with ``c`` output maps."""
    settings = {
        'kernel': (4, 4),
        'pad': (1, 1),
        'stride': (2, 2),
        'with_bias': False,
    }
    return PF.deconvolution(x, c, **settings)
def decode(input_feature, output_nc, n_downsampling, ngf, norm_layer, use_bias):
    """Decode a feature map with stacked deconvolutions and a final 7x7 convolution.

    Args:
        input_feature: Feature map to decode.
        output_nc: Number of output maps of the last convolution.
        n_downsampling: Number of deconvolution stages to apply.
        ngf: Base number of maps; stage ``i`` outputs
            ``int(ngf * 2 ** (n_downsampling - i) / 2)`` maps.
        norm_layer: Callable applied after every deconvolution.
        use_bias: Forwarded as ``with_bias`` to all (de)convolutions.

    Returns:
        The tanh-activated output.
    """
    init = I.NormalInitializer(sigma=0.02, rng=None)
    feat = input_feature

    for level in range(n_downsampling):
        out_maps = int(ngf * 2**(n_downsampling - level) / 2)
        with nn.parameter_scope("dec_downsampling_{}".format(level)):
            # kernel changed 3 -> 4 to make the output fit to the desired size.
            feat = PF.deconvolution(feat, out_maps,
                                    kernel=(4, 4), stride=(2, 2), pad=(1, 1),
                                    w_init=init, with_bias=use_bias)
            feat = F.relu(norm_layer(feat))

    # Reflection padding of 3 on each side precedes the unpadded 7x7 convolution.
    feat = F.pad(feat, (3, 3, 3, 3), 'reflect')
    feat = PF.convolution(feat, output_nc, kernel=(7, 7), w_init=init,
                          with_bias=use_bias, name="dec_last_conv")
    return F.tanh(feat)
def deconv_unit(x, scope, maps, k=4, s=2, p=1, act=F.relu, test=False):
    """Deconvolution unit, optionally followed by batch normalization and ``act``.

    Args:
        x: Input variable.
        scope: Parameter scope name of the unit.
        maps: Number of output maps.
        k, s, p: Square kernel size, stride and padding of the deconvolution.
        act: Activation applied after batch normalization. When None, the raw
            deconvolution output is returned and no normalization is applied.
        test: Batch normalization uses ``batch_stat=not test``.

    Returns:
        The output variable of the unit.
    """
    with nn.parameter_scope(scope):
        out = PF.deconvolution(x, maps,
                               kernel=(k, k), stride=(s, s), pad=(p, p))
        if act is not None:
            out = act(PF.batch_normalization(out, batch_stat=not test))
    return out
def deconv(x, planes, kernel, pad, stride, with_bias):
    """Thin wrapper around ``PF.deconvolution`` producing ``planes`` output maps."""
    layer_opts = dict(kernel=kernel, pad=pad, stride=stride,
                      with_bias=with_bias)
    return PF.deconvolution(x, planes, **layer_opts)
def upsample2(x, c):
    """Twice upsampling of ``x`` to ``c`` maps with a bias-free deconvolution."""
    kernel, pad, stride = (4, 4), (1, 1), (2, 2)
    return PF.deconvolution(
        x, c, kernel=kernel, pad=pad, stride=stride, with_bias=False)
def deconvolution(x, n, kernel, stride, pad, init_method=None):
    """Biased deconvolution with a normally distributed weight initializer.

    Args:
        x: Input variable; ``x.shape[1]`` is used as the number of input maps.
        n: Number of output maps.
        kernel, stride, pad: Forwarded to ``PF.deconvolution``.
        init_method: ``"paper"`` selects a fixed standard deviation of 0.02;
            any other value uses the Glorot standard deviation computed from
            the layer's shape.

    Returns:
        The deconvolution output.
    """
    if init_method == "paper":
        std = 0.02
    else:
        std = nn.initializer.calc_normal_std_glorot(
            x.shape[1], n, kernel=kernel)
    weight_init = nn.initializer.NormalInitializer(std)
    return PF.deconvolution(x, n, kernel=kernel, stride=stride, pad=pad,
                            with_bias=True, w_init=weight_init)
def deconv(x, output_ch, karnel=(32, ), pad=(15, ), stride=(2, ), name=None):
    """One-dimensional deconvolution wrapper.

    Args:
        x: Input variable.
        output_ch: Number of output maps.
        karnel: Kernel shape (the parameter name is spelled this way and is
            kept for keyword callers).
        pad, stride, name: Forwarded to ``PF.deconvolution``.

    Returns:
        The deconvolution output.
    """
    layer_opts = {'pad': pad, 'stride': stride, 'name': name}
    return PF.deconvolution(x, output_ch, karnel, **layer_opts)
def upsampling_layer(self, inp, num_input_features, comp_rate=1.):
    '''
    Define Upsampling Layer

    Normalizes ``inp`` with ``self.batch_norm`` and applies a 2x2, stride-2
    deconvolution producing ``ceil(comp_rate * num_input_features)`` maps.
    '''
    out_maps = int(math.ceil(comp_rate * num_input_features))
    with nn.parameter_scope('upsample'):
        normed = self.batch_norm(inp, name='norm')
        upsampled = PF.deconvolution(normed, out_maps,
                                     kernel=(2, 2), stride=(2, 2),
                                     name='transconv')
    return upsampled
def Generator_early(z, scope_name="Generator", train=True, img_size=1024, ngf=64, big=False):
    """Build the early generator stages and return their feature maps.

    Args:
        z: Latent input fed to the initial deconvolution.
        scope_name: Parameter scope enclosing all layers.
        train: Forwarded as ``batch_stat`` / to the upsampling blocks.
        img_size: Not used by this function.
        ngf: Base number of maps; per-resolution widths are multiples of it.
        big: Use ``UpsampleComp`` instead of ``Upsample`` for the 4->8 and
            16->32 stages.

    Returns:
        Tuple ``(f_8, f_16, f_32, f_64)`` of feature maps.
    """
    with nn.parameter_scope(scope_name):
        # Number of channels per resolution.
        multipliers = {
            4: 16,
            8: 8,
            16: 4,
            32: 2,
            64: 2,
            128: 1,
            256: 0.5,
            512: 0.25,
            1024: 0.125
        }
        nfc = {res: int(mult * ngf) for res, mult in multipliers.items()}

        def sn_w(w):
            return PF.spectral_norm(w, dim=0)

        # InitLayer: ConvTranspose + BN + GLU -> 4x4
        with nn.parameter_scope("init"):
            init = PF.deconvolution(z, 2 * 16 * ngf, (4, 4), apply_w=sn_w,
                                    with_bias=False, name="deconv0")
            init = PF.batch_normalization(init, batch_stat=train, name="bn0")
            f_4 = GLU(init)

        # The "big" variant swaps the block used on every other stage.
        alternating_up = UpsampleComp if big else Upsample

        # Calc base features
        f_8 = alternating_up(f_4, nfc[8], "up4->8", train)
        f_16 = Upsample(f_8, nfc[16], "up8->16", train)
        f_32 = alternating_up(f_16, nfc[32], "up16->32", train)
        f_64 = Upsample(f_32, nfc[64], "up32->64", train)

    return f_8, f_16, f_32, f_64
def generator(z, maxh=256, test=False, output_hidden=False):
    """
    Building generator network which takes (B, Z, 1, 1) inputs and generates
    (B, 1, 28, 28) outputs.

    Args:
        z: Latent input of shape (B, Z, 1, 1).
        maxh: Number of maps of the first deconvolution. The following layers
            use ``maxh // 2``, ``maxh // 4`` and ``maxh // 8`` maps, so it
            must be at least 8.
        test: When True, batch normalization is called with
            ``batch_stat=False``.
        output_hidden: When True, the hidden activations ``[d1, d2, d3, d4]``
            are returned as well.

    Returns:
        The tanh output ``x``, or ``(x, [d1, d2, d3, d4])`` when
        ``output_hidden`` is set.

    The shape notes below are for the default ``maxh=256``.
    """
    # Define shortcut functions
    def bn(x):
        # Batch normalization
        return PF.batch_normalization(x, batch_stat=not test)

    def upsample2(x, c):
        # Twice upsampling with deconvolution.
        return PF.deconvolution(x, c, kernel=(4, 4), pad=(1, 1), stride=(2, 2), with_bias=False)

    # The narrowest layer uses maxh // 8 maps, which must stay positive.
    # Floor division keeps every map count an int under Python 3.
    assert maxh // 8 > 0
    with nn.parameter_scope("gen"):
        # (Z, 1, 1) --> (256, 4, 4)
        with nn.parameter_scope("deconv1"):
            d1 = F.elu(bn(PF.deconvolution(z, maxh, (4, 4), with_bias=False)))
        # (256, 4, 4) --> (128, 8, 8)
        with nn.parameter_scope("deconv2"):
            d2 = F.elu(bn(upsample2(d1, maxh // 2)))
        # (128, 8, 8) --> (64, 16, 16)
        with nn.parameter_scope("deconv3"):
            d3 = F.elu(bn(upsample2(d2, maxh // 4)))
        # (64, 16, 16) --> (32, 28, 28)
        with nn.parameter_scope("deconv4"):
            # Convolution with kernel=4, pad=3 and stride=2 transforms a 28 x 28 map
            # to a 16 x 16 map. Deconvolution with those parameters behaves like an
            # inverse operation, i.e. maps 16 x 16 to 28 x 28.
            d4 = F.elu(bn(PF.deconvolution(
                d3, maxh // 8, (4, 4), pad=(3, 3), stride=(2, 2), with_bias=False)))
        # (32, 28, 28) --> (1, 28, 28)
        with nn.parameter_scope("conv5"):
            x = F.tanh(PF.convolution(d4, 1, (3, 3), pad=(1, 1)))
    if output_hidden:
        return x, [d1, d2, d3, d4]
    return x
def upsample(x, factor, training, left_shape=None):
    """Upsample a 4-D input by interpolation or a 5-D input by a fixed deconvolution.

    Args:
        x: Input variable with a 4- or 5-element ``shape``.
        factor: Upsampling factor. For 4-D inputs it is only used when
            ``training`` is true.
        training: For 4-D inputs, selects scaling by ``factor`` (True) or
            resizing to a quarter of ``left_shape`` (False).
        left_shape: Reference shape; ``left_shape[2] // 4`` and
            ``left_shape[3] // 4`` give the output size of a 4-D input when
            ``training`` is false. Required in that case.

    Returns:
        The upsampled variable.

    Raises:
        ValueError: If ``x`` is neither 4- nor 5-dimensional.
    """
    rank = len(x.shape)

    if rank == 4:
        if training:
            return F.interpolate(x, scale=(factor, factor), mode='linear',
                                 align_corners=True)
        return F.interpolate(x,
                             output_size=(left_shape[2] // 4,
                                          left_shape[3] // 4),
                             mode='linear', align_corners=True)

    if rank != 5:
        # Previously this fell through to ``return h`` with ``h`` unbound.
        raise ValueError(
            'upsample expects a 4-D or 5-D input, got shape {}'.format(
                tuple(x.shape)))

    planes = x.shape[1]
    kernel_size = 2 * factor - factor % 2
    stride = int(factor)
    pad = int(math.ceil((factor - 1) / 2.))
    scale_factor = (kernel_size + 1) // 2
    if kernel_size % 2 == 1:
        center = scale_factor - 1
    else:
        center = scale_factor - 0.5

    # 1-D linear interpolation weights; the 3-D kernel is their separable
    # product over the three kernel axes (same values as the element-wise
    # triple loop, computed in float64 and stored as float32).
    taps = 1 - np.abs(np.arange(kernel_size) - center) / scale_factor
    kernel_3d = (taps[:, None, None] * taps[None, :, None]
                 * taps[None, None, :]).astype(np.float32)

    # With group=planes the deconvolution weight is laid out as
    # (inmaps, outmaps // group, *kernel) = (planes, 1, k, k, k); every plane
    # gets the same fixed kernel. For planes == 1 this equals the former
    # (1, planes, k, k, k) layout.
    w_filter = np.zeros([planes, 1, kernel_size, kernel_size, kernel_size])
    w_filter[:] = kernel_3d

    return PF.deconvolution(x, planes,
                            kernel=(kernel_size, kernel_size, kernel_size),
                            pad=(pad, pad, pad),
                            stride=(stride, stride, stride),
                            w_init=w_filter, fix_parameters=True,
                            group=planes)
def upsample(h, maps, up, test=False, name="convblock"):
    """Upsample ``h`` with the selected method, then batch-normalize and ReLU.

    Args:
        h: Input variable.
        maps: Number of maps of the 3x3 convolution; the ``"deconv"`` mode
            produces ``maps * 2`` maps instead.
        up: One of ``"nearest"``, ``"linear"``, ``"unpooling"``, ``"deconv"``.
        test: Batch normalization uses ``batch_stat=not test``.
        name: Name given to the (de)convolution and the batch normalization.

    Returns:
        The upsampled, normalized and activated variable.

    Raises:
        ValueError: If ``up`` is not one of the supported modes.
    """
    if up == "deconv":
        # Positional arguments after the kernel are pad and stride.
        h = PF.deconvolution(h, maps * 2, (2, 2), (0, 0), (2, 2), name=name)
    elif up in ("nearest", "linear", "unpooling"):
        # These modes share a 3x3 convolution followed by a 2x resize.
        h = PF.convolution(h, maps, (3, 3), (1, 1), name=name)
        if up == "unpooling":
            h = F.unpooling(h, (2, 2))
        else:
            h = F.interpolate(h, scale=(2, 2), mode=up)
    else:
        raise ValueError(
            'Set "up" option in ["nearest", "linear", "unpooling", "deconv"]')

    normalized = PF.batch_normalization(h, batch_stat=not test, name=name)
    return F.relu(normalized)
def back_end(self, x, channel):
    """Upsample ``x`` once and project it to ``self.n_outputs`` maps.

    Args:
        x: Input feature map.
        channel: Number of maps produced by the upsampling deconvolution.

    Returns:
        Tuple ``(output, last_feat)``: the tanh-activated output and the
        feature map after the upsampling layer.
    """
    with nn.parameter_scope("up_sample_layer"):
        upsampled = PF.deconvolution(x, channel, (4, 4),
                                     stride=(2, 2), pad=(1, 1),
                                     **self.conv_opts)
        last_feat = self.instance_norm_relu(upsampled)

    with nn.parameter_scope("last_layer"):
        # Pad by 3 on each side before the unpadded 7x7 convolution.
        pad_width = get_symmetric_padwidth(3, channel_last=self.channel_last)
        padded = F.pad(last_feat, pad_width=pad_width, mode=self.padding_type)
        out = F.tanh(
            PF.convolution(padded, self.n_outputs, (7, 7), **self.conv_opts))

    return out, last_feat
def deconv2d(input, output_channels, kernel, stride, name='', pad=(1, 1), output_padding=(1, 1), bias=True):
    """
    Deconvolution layer

    Calls ``PF.deconvolution`` with ``channel_last=True``; ``bias`` is
    forwarded as ``with_bias``.
    """
    layer_opts = dict(
        kernel=kernel,
        stride=stride,
        output_padding=output_padding,
        pad=pad,
        with_bias=bias,
        name=name,
        channel_last=True,
    )
    return PF.deconvolution(input, output_channels, **layer_opts)
def deconvblock(x, n=64, k=(3, 3), s=(2, 2), p=(1, 1), test=False, norm_type="batch_norm"):
    """Deconvolution -> normalization -> ReLU block.

    Args:
        x: Input variable.
        n: Number of output maps.
        k, s, p: NOTE(review): accepted but not used; the deconvolution
            always runs with kernel (3, 3), stride (2, 2) and pad (1, 1),
            which match these defaults. Confirm whether they should be wired in.
        test: Batch normalization uses ``batch_stat=not test``.
        norm_type: ``"instance_norm"`` selects instance normalization; any
            other value selects batch normalization.

    Returns:
        The activated output variable.
    """
    out = PF.deconvolution(x, n,
                           kernel=(3, 3), pad=(1, 1), stride=(2, 2),
                           output_padding=(1, 1), with_bias=False)

    use_instance_norm = norm_type == "instance_norm"
    if use_instance_norm:
        out = PF.instance_normalization(out, eps=1e-05)
    else:
        out = PF.batch_normalization(out, batch_stat=not test)

    return F.relu(out)
def encdec(self, x, n_downsamples):
    """Encoder-decoder: 7x7 stem, strided convolutions, deconvolutions, 7x7 head.

    Args:
        x: Input variable.
        n_downsamples: Number of stride-2 convolutions, and equally the number
            of stride-2 deconvolutions that follow them.

    Returns:
        The tanh-activated output with 3 maps.
    """
    with nn.parameter_scope("first layer"):
        pad_width = get_symmetric_padwidth(3, channel_last=self.channel_last)
        h = F.pad(x, pad_width=pad_width, mode=self.padding_type)
        h = PF.convolution(h, 32, (7, 7), **self.conv_opts)
        h = self.instance_norm_relu(h)

    # down sample layers
    for i in range(n_downsamples):
        with nn.parameter_scope("down_{}".format(i)):
            c = 32 * 2**(i + 1)
            # The keyword is ``stride`` (as in every deconvolution call);
            # ``strides`` is not an argument of PF.convolution.
            h = PF.convolution(h, c, (3, 3), stride=(2, 2), pad=(1, 1),
                               **self.conv_opts)
            h = self.instance_norm_relu(h)

    # up sample layers
    for i in range(n_downsamples):
        with nn.parameter_scope("up_{}".format(i)):
            c = 32 * 2**(n_downsamples - i - 1)
            h = PF.deconvolution(h, c, (3, 3), stride=(2, 2), pad=(1, 1),
                                 **self.conv_opts)
            # output padding
            # NOTE(review): this pads the end of the last two axes; if
            # self.channel_last is set those may not be the spatial axes -
            # confirm.
            h = F.pad(h, pad_width=(0, 1, 0, 1))
            h = self.instance_norm_relu(h)

    with nn.parameter_scope("last layer"):
        pad_width = get_symmetric_padwidth(3, channel_last=self.channel_last)
        h = F.pad(h, pad_width=pad_width, mode=self.padding_type)
        h = PF.convolution(h, 3, (7, 7), **self.conv_opts)
        h = F.tanh(h)

    return h
def pf_deconvolution(x, ochannels, kernel, stride=(1, 1), pad=(1, 1), dilation=(2, 2), with_bias=False, w_init=None, b_init=None, channel_last=False):
    """Ungrouped (``group=1``) ``PF.deconvolution`` with this module's defaults.

    All arguments are forwarded unchanged; ``ochannels`` is the number of
    output maps.
    """
    layer_opts = dict(
        pad=pad,
        stride=stride,
        dilation=dilation,
        w_init=w_init,
        group=1,
        with_bias=with_bias,
        b_init=b_init,
        channel_last=channel_last,
    )
    return PF.deconvolution(x, ochannels, kernel, **layer_opts)
def generator(x, c, conv_dim=64, c_dim=5, num_downsample=2, num_upsample=2, repeat_num=6, w_init=None, epsilon=1e-05):
    """Conditional generator: stem, down-sampling, bottleneck, up-sampling, head.

    Args:
        x: Input image variable.
        c: 4-D condition variable; it is tiled over the spatial size of ``x``
            and concatenated to ``x`` along axis 1.
        conv_dim: Number of maps of the first convolution.
        c_dim: Not used by this function.
        num_downsample: Number of stride-2 convolutions (maps double each time).
        num_upsample: Number of stride-2 deconvolutions (maps halve each time).
        repeat_num: Number of bottleneck ``resblock`` calls.
        w_init: Weight initializer forwarded to all (de)convolutions here.
        epsilon: ``eps`` of every instance normalization.

    Returns:
        The tanh-activated 3-map output.
    """
    assert len(c.shape) == 4

    def norm_relu(t, name):
        # Instance normalization followed by an in-place ReLU.
        t = PF.instance_normalization(t, eps=epsilon, name=name)
        return F.relu(t, inplace=True)

    tiled_c = F.tile(c, (1, 1) + x.shape[2:])
    h = F.concatenate(x, tiled_c, axis=1)

    h = PF.convolution(h, conv_dim, kernel=(7, 7), pad=(3, 3), stride=(1, 1),
                       with_bias=False, w_init=w_init, name="init_conv")
    h = norm_relu(h, "init_inst_norm")

    # Down-sampling layers.
    width = conv_dim
    for idx in range(num_downsample):
        layer_name = "downsample_{}".format(idx)
        width = width * 2
        h = PF.convolution(h, width, kernel=(4, 4), pad=(1, 1), stride=(2, 2),
                           with_bias=False, w_init=w_init, name=layer_name)
        h = norm_relu(h, layer_name)

    # Bottleneck layers.
    for idx in range(repeat_num):
        with nn.parameter_scope("bottleneck_{}".format(idx)):
            h = resblock(h, dim_out=width)

    # Up-sampling layers.
    for idx in range(num_upsample):
        layer_name = "upsample_{}".format(idx)
        width = width // 2
        h = PF.deconvolution(h, width, kernel=(4, 4), pad=(1, 1),
                             stride=(2, 2), w_init=w_init, with_bias=False,
                             name=layer_name)
        h = norm_relu(h, layer_name)

    h = PF.convolution(h, 3, kernel=(7, 7), pad=(3, 3), stride=(1, 1),
                       with_bias=False, w_init=w_init, name="last_conv")
    return F.tanh(h)
def pf_depthwise_deconvolution(x, kernel, stride=(1, 1), pad=(1, 1), dilation=(2, 2), with_bias=False, w_init=None, b_init=None, channel_last=False):
    """Depthwise deconvolution: one group per map, same number of maps out.

    Args:
        x: Input variable; the map count is read from ``x.shape[3]`` when
            ``channel_last`` is set and from ``x.shape[1]`` otherwise.
        kernel, stride, pad, dilation, with_bias, b_init: Forwarded to
            ``PF.deconvolution``.
        w_init: Optional weight initializer. When ``channel_last`` is set and
            a value is given, it is transposed with axes ``(0, 2, 3, 1)``
            before use, so it has to be a 4-D array in that case.
        channel_last: Whether the map axis is the last one.

    Returns:
        The deconvolution output.
    """
    out_map = x.shape[3] if channel_last else x.shape[1]

    # Only a supplied weight array can be re-laid out; with the default
    # w_init=None the transpose used to raise before the layer was built.
    if channel_last and w_init is not None:
        w_init = np.transpose(w_init, (0, 2, 3, 1))

    x = PF.deconvolution(x, out_map, kernel, pad=pad, stride=stride,
                         dilation=dilation, w_init=w_init,
                         with_bias=with_bias, b_init=b_init, group=out_map,
                         channel_last=channel_last)
    return x
def Generator(z, scope_name="Generator", train=True, img_size=1024, ngf=64, big=False):
    """Build the full generator and return the large and the small image.

    Args:
        z: Latent input fed to the initial deconvolution.
        scope_name: Parameter scope enclosing all layers.
        train: Forwarded as ``batch_stat`` to the initial batch normalization
            and as the fourth argument of every ``Upsample`` /
            ``UpsampleComp`` call.
        img_size: Output resolution switch: stages up to 512 are added when it
            exceeds 256, and up to 1024 when it exceeds 512.
        ngf: Base number of maps; per-resolution widths are multiples of it.
        big: Use ``UpsampleComp`` on alternating stages and add the extra
            4->64 ``SLE`` connection.

    Returns:
        ``[img, img_small]``: tanh images computed from the last feature map
        and from the 128-resolution feature map.
    """
    with nn.parameter_scope(scope_name):
        # Get number of channels
        nfc_multi = {
            4: 16,
            8: 8,
            16: 4,
            32: 2,
            64: 2,
            128: 1,
            256: 0.5,
            512: 0.25,
            1024: 0.125
        }
        nfc = {k: int(v * ngf) for k, v in nfc_multi.items()}

        def sn_w(w):
            return PF.spectral_norm(w, dim=0)

        # InitLayer: ConvTranspose + BN + GLU -> 4x4
        with nn.parameter_scope("init"):
            h = PF.deconvolution(z, 2 * 16 * ngf, (4, 4), apply_w=sn_w,
                                 with_bias=False, name="deconv0")
            h = PF.batch_normalization(h, batch_stat=train, name="bn0")
            f_4 = GLU(h)

        # Calc base features
        if big:
            f_8 = UpsampleComp(f_4, nfc[8], "up4->8", train)
        else:
            f_8 = Upsample(f_4, nfc[8], "up4->8", train)
        f_16 = Upsample(f_8, nfc[16], "up8->16", train)
        if big:
            f_32 = UpsampleComp(f_16, nfc[32], "up16->32", train)
        else:
            f_32 = Upsample(f_16, nfc[32], "up16->32", train)

        # Apply SLE
        f_64 = Upsample(f_32, nfc[64], "up32->64", train)
        if big:
            f_64 = SLE(f_64, f_4, "sle4->64")
            f_128 = UpsampleComp(f_64, nfc[128], "up64->128", train)
        else:
            f_128 = Upsample(f_64, nfc[128], "up64->128", train)
        f_128 = SLE(f_128, f_8, "sle8->128")

        # From here on ``train`` was not forwarded, unlike every stage above;
        # pass it explicitly so all stages run in the same mode.
        f_256 = Upsample(f_128, nfc[256], "up128->256", train)
        f_256 = SLE(f_256, f_16, "sle16->256")
        f_last = f_256

        if img_size > 256:
            if big:
                f_512 = UpsampleComp(f_256, nfc[512], "up256->512", train)
            else:
                f_512 = Upsample(f_256, nfc[512], "up256->512", train)
            f_512 = SLE(f_512, f_32, "sle32->512")
            f_last = f_512

        if img_size > 512:
            f_1024 = Upsample(f_512, nfc[1024], "up512->1024", train)
            f_last = f_1024

        # Conv + Tanh -> image
        img = F.tanh(
            PF.convolution(f_last, 3, (3, 3), pad=(1, 1), apply_w=sn_w,
                           with_bias=False, name="conv_last"))
        img_small = F.tanh(
            PF.convolution(f_128, 3, (1, 1), apply_w=sn_w, with_bias=False,
                           name="conv_last_small"))

    return [img, img_small]
def wn_deconv(*args, **kwargs):
    """``PF.deconvolution`` with weight normalization and N(0, 0.02) weight init.

    All positional and keyword arguments are forwarded. ``apply_w`` and
    ``w_init`` are set here, so passing either of them raises ``TypeError``.
    """
    weight_init = NormalInitializer(0.02)
    return PF.deconvolution(
        *args,
        apply_w=PF.weight_normalization,
        w_init=weight_init,
        **kwargs
    )
def conv_bn_relu(inp, maps, size, stride=(1, 1), pad=(0, 0), deconv=False, bn=True, dropout=False, relu=F.relu, test=False, name=''):
    """(De)convolution with optional batch normalization, dropout and activation.

    Args:
        inp: Input variable, or a tuple of two variables. For a tuple, each
            element gets its own layer and the two results are added.
        maps: Number of output maps.
        size: Kernel shape.
        stride, pad: Forwarded to the (de)convolution.
        deconv: Use ``PF.deconvolution`` instead of ``PF.convolution``.
        bn: Apply batch normalization; the (first) layer has a bias only when
            this is False.
        dropout: Apply dropout with p=0.5 when not in test mode.
        relu: Activation callable, or None for no activation.
        test: Batch normalization uses ``batch_stat=not test``.
        name: Prefix of all layer names.

    Returns:
        The output variable.
    """
    if deconv:
        layer, prefix = PF.deconvolution, name + '_deconv'
    else:
        layer, prefix = PF.convolution, name + '_conv'

    def apply(src, with_bias, layer_name):
        return layer(src, maps, kernel=size, stride=stride, pad=pad,
                     with_bias=with_bias, name=layer_name)

    if isinstance(inp, tuple):
        # The second branch never has a bias; one bias is enough for the sum.
        h = F.add2(apply(inp[0], not bn, prefix + '_0'),
                   apply(inp[1], False, prefix + '_1'))
    else:
        h = apply(inp, not bn, prefix)

    if bn:
        h = PF.batch_normalization(h, batch_stat=not test, name=name + '_bn')
    if dropout and not test:
        h = F.dropout(h, 0.5)
    if relu is not None:
        h = relu(h)
    return h
def netG_decoder(x, test=False):
    """Two deconvolution stages followed by an 8-level U-Net with skip connections.

    Shape notes from the original code, for a (1, 15, 64, 64) input:
    the two leading stages give (1, 15, 256, 256); ``down0``..``down7``
    halve the resolution down to (1, 512, 1, 1); ``up0``..``up7`` double it
    again, each ``upN`` (N >= 1) consuming the concatenation of the matching
    ``down`` output and the previous ``up`` output; the result is
    (1, 3, 256, 256).

    Args:
        x: Input variable.
        test: Batch normalization uses ``batch_stat=not test``; dropout after
            ``up1``..``up3`` is only applied when this is False.

    Returns:
        The tanh-activated output variable.
    """
    def bn(v):
        return PF.batch_normalization(v, batch_stat=not test)

    def down(v, maps):
        # LeakyReLU -> bias-free 4x4 stride-2 convolution.
        return PF.convolution(F.leaky_relu(v, alpha=0.2), maps, (4, 4),
                              pad=(1, 1), stride=(2, 2), with_bias=False)

    def up(v, maps):
        # ReLU -> bias-free 4x4 stride-2 deconvolution.
        return PF.deconvolution(F.relu(v), maps, (4, 4),
                                pad=(1, 1), stride=(2, 2), with_bias=False)

    with nn.parameter_scope('ReluDeconvBN1'):
        c0 = bn(up(x, 15))
    with nn.parameter_scope('ReluDeconvBN2'):
        # This stage keeps its bias and has no normalization.
        c1 = F.tanh(
            PF.deconvolution(F.relu(c0), 15, (4, 4), pad=(1, 1), stride=(2, 2)))

    # skips[i] holds the output of scope 'down<i>' for i = 0..6.
    with nn.parameter_scope('down0'):
        skips = [PF.convolution(c1, 64, (4, 4), pad=(1, 1), stride=(2, 2),
                                with_bias=False)]
    for level, maps in enumerate((128, 256, 512, 512, 512, 512), start=1):
        with nn.parameter_scope('down{}'.format(level)):
            skips.append(bn(down(skips[-1], maps)))

    # The innermost level has no batch normalization.
    with nn.parameter_scope('down7'):
        bottom = down(skips[-1], 512)

    with nn.parameter_scope('up0'):
        h = bn(up(bottom, 512))

    # (output maps, dropout?) for up1..up6; up<k> is paired with down<7 - k>.
    plan = ((512, True), (512, True), (512, True),
            (256, False), (128, False), (64, False))
    for level, (maps, use_dropout) in enumerate(plan, start=1):
        with nn.parameter_scope('up{}'.format(level)):
            h = bn(up(F.concatenate(skips[7 - level], h, axis=1), maps))
        if use_dropout and not test:
            h = F.dropout(h, 0.5)

    with nn.parameter_scope('up7'):
        output = F.tanh(
            PF.deconvolution(F.relu(F.concatenate(skips[0], h, axis=1)), 3,
                             (4, 4), pad=(1, 1), stride=(2, 2)))
    return output
def infer(self, mels, sigma=0.9):
    r"""Returns the generated audio.

    Args:
        mels (nn.Variable): Inputs containing mel-spectrograms of shape
            (B, n_mels, Ty).
        sigma (float, optional): Scale applied to the sampled noise.
            Defaults to 0.9.

    Returns:
        nn.Variable: A synthetic audio.
    """
    hp = self.hparams
    kernel, stride = 1024, 256

    with nn.parameter_scope('', self.parameter_scope):
        # Upsample spectrogram to size of audio
        with nn.parameter_scope('upsample'):
            with nn.parameter_scope('deconv'):
                cond = PF.deconvolution(mels, hp.n_mels,
                                        kernel=(kernel, ), stride=(stride, ))

        # cutout conv artifacts (kernel - stride trailing samples)
        cond = cond[..., :-(kernel - stride)]

        # transforming to correct shape
        cond = F.reshape(cond, cond.shape[:2] + (-1, hp.n_samples_per_group))
        cond = F.transpose(cond, (0, 2, 1, 3))
        cond = F.reshape(cond, cond.shape[:2] + (-1, ))
        # (B, n_mels * n_groups, L/n_groups)
        cond = F.transpose(cond, (0, 2, 1))

        noise_shape = (cond.shape[0], self.n_remaining_channels, cond.shape[2])
        wave = F.randn(shape=noise_shape) * sigma

        # Run the flows backwards, inverting each coupling and convolution.
        for k in reversed(range(hp.n_flows)):
            n_half = wave.shape[1] // 2
            audio_0 = wave[:, :n_half, :]
            audio_1 = wave[:, n_half:, :]

            with nn.parameter_scope(f'wn_{k}'):
                output = getattr(self, f'WN_{k}')(audio_0, cond)

            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b) / F.exp(s)
            wave = F.concatenate(audio_0, audio_1, axis=1)

            wave = invertible_conv(wave, reverse=True, rng=self.rng,
                                   scope=f'inv_{k}')

            # Prepend fresh noise channels at the early-output flows.
            if k % hp.n_early_every == 0 and k > 0:
                z = F.randn(shape=(cond.shape[0], hp.n_early_size,
                                   cond.shape[2]))
                wave = F.concatenate(sigma * z, wave, axis=1)

        wave = F.transpose(wave, (0, 2, 1))
        wave = F.reshape(wave, (wave.shape[0], -1))

    return wave
def call(self, wave):
    """Run the flows forward on ``wave``.

    Args:
        wave: Input waveform; ``wave.shape[1]`` is used as its length and it
            is reshaped to ``(hp.batch_size, -1, hp.n_samples_per_group)``.

    Returns:
        Tuple ``(z, log_s_list, log_det_W_list)``: the early outputs and the
        final flow output concatenated along axis 1, the per-flow ``log_s``
        variables, and the per-flow log-determinants returned by
        ``invertible_conv``.
    """
    hp = self.hparams
    batch_size = hp.batch_size
    n_samples = wave.shape[1]

    # compute mel-spectrogram from waveform
    with nn.parameter_scope('stft'):
        cond = self.compute_mel(wave)

    # Upsample spectrogram to the size of audio
    with nn.parameter_scope('upsample'):
        with nn.parameter_scope('deconv'):
            cond = PF.deconvolution(cond, hp.n_mels,
                                    kernel=(1024, ), stride=(256, ))

    # make sure mels having the same length as wave
    if cond.shape[2] > n_samples:
        cond = cond[..., :n_samples]

    # transforming to correct shape
    cond = F.reshape(cond, cond.shape[:2] + (-1, hp.n_samples_per_group))
    cond = F.transpose(cond, (0, 2, 1, 3))
    cond = F.reshape(cond, cond.shape[:2] + (-1, ))
    # (B, n_mels * n_groups, L/n_groups)
    cond = F.transpose(cond, (0, 2, 1))

    # reshape audio
    wave = F.reshape(wave, (batch_size, -1, hp.n_samples_per_group))
    wave = F.transpose(wave, (0, 2, 1))  # (B, n_groups, L/n_groups)

    early_outputs, log_s_list, log_det_W_list = [], [], []

    for k in range(hp.n_flows):
        # Split off the early-output channels at the configured flows.
        if k % hp.n_early_every == 0 and k > 0:
            early_outputs.append(wave[:, :hp.n_early_size, :])
            wave = wave[:, hp.n_early_size:, :]

        # apply invertible convolution
        wave, log_det_W = invertible_conv(wave, reverse=False, rng=self.rng,
                                          scope=f'inv_{k}')
        log_det_W_list.append(log_det_W)

        n_half = wave.shape[1] // 2
        audio_0 = wave[:, :n_half, :]
        audio_1 = wave[:, n_half:, :]

        with nn.parameter_scope(f'wn_{k}'):
            output = getattr(self, f'WN_{k}')(audio_0, cond)

        log_s = output[:, n_half:, :]  # (B, n_half, L/n_groups)
        b = output[:, :n_half, :]  # (B, n_half, L/n_groups)
        audio_1 = F.add2(F.exp(log_s) * audio_1, b, inplace=True)
        log_s_list.append(log_s)

        # (B, n_half*2, L/n_groups)
        wave = F.concatenate(audio_0, audio_1, axis=1)

    early_outputs.append(wave)
    z = F.concatenate(*early_outputs, axis=1)
    return z, log_s_list, log_det_W_list