def training_step(self, batch, optimizer_idx):
    """Build the GAN loss that belongs to the selected optimizer.

    Args:
        batch: ``(z,)`` when ``optimizer_idx`` is 0 and ``(z, images)`` when
            it is 1.
        optimizer_idx: 0 selects the generator objective, 1 selects the
            discriminator objective.

    Returns:
        ``(g_loss, g_out)`` for the generator, the sum of the fake and real
        discriminator losses for the discriminator, and ``None`` for any
        other index.
    """
    bce_with_logits = flow.nn.sigmoid_cross_entropy_with_logits

    if optimizer_idx == 0:
        # Generator: the discriminator is frozen and fakes are labelled 1.
        (noise,) = batch
        fake = self._generator(noise, trainable=True, const_init=True)
        fake_logits = self._discriminator(
            fake, trainable=False, const_init=True
        )
        generator_loss = bce_with_logits(
            flow.ones_like(fake_logits),
            fake_logits,
            name="Gloss_sigmoid_cross_entropy_with_logits",
        )
        return (generator_loss, fake)

    if optimizer_idx == 1:
        # Discriminator: the generator is frozen; fakes are labelled 0 and
        # real images are labelled 1.
        noise, real_images = batch
        fake = self._generator(noise, trainable=False, const_init=True)
        fake_logits = self._discriminator(
            fake, trainable=True, const_init=True
        )
        fake_term = bce_with_logits(
            flow.zeros_like(fake_logits),
            fake_logits,
            name="Dloss_fake_sigmoid_cross_entropy_with_logits",
        )
        real_logits = self._discriminator(
            real_images, trainable=True, reuse=True, const_init=True
        )
        real_term = bce_with_logits(
            flow.ones_like(real_logits),
            real_logits,
            name="Dloss_real_sigmoid_cross_entropy_with_logits",
        )
        return fake_term + real_term

    return None
def forward(self, inputs, targets):
    """Compute the batch-hard triplet ranking loss.

    For every anchor row the largest distance to a sample with the same
    label and the smallest distance to a sample with a different label are
    selected, and the two are passed to ``self.ranking_loss``.

    Args:
        inputs (flow.Tensor): feature matrix with shape
            (batch_size, feat_dim).
        targets (flow.Tensor): ground truth labels with shape (batch_size).

    Returns:
        The value of ``self.ranking_loss(dist_an, dist_ap, y)`` where ``y``
        is a tensor of ones.
    """
    n = inputs.size(0)
    # Build helper tensors on the device of the inputs instead of a
    # hard-coded "cuda", so the loss also works for CPU tensors.
    device = inputs.device

    # Compute pairwise distance, replace by the official when merged:
    # ||a||^2 + ||b||^2 - 2 * a.b, clamped before the square root.
    dist = flow.pow(inputs, 2).sum(dim=1).expand(n, n)
    dist = dist + flow.transpose(dist, dim0=1, dim1=0)
    temp1 = -2 * flow.matmul(inputs, flow.transpose(inputs, dim0=1, dim1=0))
    dist = flow.add(dist, temp1)
    dist = flow.sqrt(flow.clamp(dist, min=1e-12))

    # For each anchor, find the hardest positive and negative.
    mask = targets.expand(n, n).eq(
        flow.transpose(targets.expand(n, n), dim0=1, dim1=0))
    # Loop-invariant: the complement of the mask is the same for every row.
    mask_rev = 1 - mask

    # Fill values for positions excluded by the masks: 0 never wins a max,
    # and the huge value never wins a min.
    # NOTE(review): exp(100) is beyond the float32 range, so this presumably
    # ends up as inf after conversion - confirm that is acceptable.
    y1 = flow.zeros((1, n), dtype=flow.float32).to(device)
    y2 = flow.Tensor(np.exp(100 * np.ones((1, n)))).to(device)

    dist_ap, dist_an = [], []
    for i in range(n):
        row = [(i, i + 1, 1)]
        temp_dist = flow.slice(dist, row)
        temp_mask = flow.slice(mask, row)
        temp_mask_rev = flow.slice(mask_rev, row)
        dist_ap.append(temp_mask.where(temp_dist, y1).max().unsqueeze(0))
        dist_an.append(
            temp_mask_rev.where(temp_dist, y2).min().unsqueeze(0))
    dist_ap = flow.cat(dist_ap)
    dist_an = flow.cat(dist_an)

    # Compute ranking hinge loss.
    y = flow.ones_like(dist_an)
    return self.ranking_loss(dist_an, dist_ap, y)
def test_discriminator(
    z=flow.FixedTensorDef((self.batch_size, 100)),
    images=flow.FixedTensorDef((self.batch_size, 1, 28, 28)),
    label1=flow.FixedTensorDef((self.batch_size, 1)),
    label0=flow.FixedTensorDef((self.batch_size, 1)),
):
    """Build the discriminator loss and register it with ``flow.losses``.

    The generator is run with ``trainable=False``; generated samples are
    scored against zeros and ``images`` against ones. ``label1`` and
    ``label0`` are accepted but not read by the body.

    Returns:
        The sum of the fake and real sigmoid cross-entropy losses.
    """
    bce_with_logits = flow.nn.sigmoid_cross_entropy_with_logits

    # Fake branch.
    fake = self.generator(z, trainable=False, const_init=True)
    fake_logits = self.discriminator(fake, trainable=True, const_init=True)
    fake_term = bce_with_logits(
        flow.zeros_like(fake_logits),
        fake_logits,
        name="Dloss_fake_sigmoid_cross_entropy_with_logits",
    )

    # Real branch, reusing the discriminator variables.
    real_logits = self.discriminator(
        images, trainable=True, reuse=True, const_init=True
    )
    real_term = bce_with_logits(
        flow.ones_like(real_logits),
        real_logits,
        name="Dloss_real_sigmoid_cross_entropy_with_logits",
    )

    total = fake_term + real_term
    flow.losses.add_loss(total)
    return total
def forward(
    self,
    inputs: Dict[str, flow.Tensor],
) -> Dict[str, flow.Tensor]:
    """Run embeddings, encoder and pooler over a dictionary of inputs.

    Args:
        inputs: mapping read with the keys ``"input_ids"``,
            ``"attention_mask"``, ``"token_type_ids"`` and
            ``"position_ids"``. A missing ``"attention_mask"`` is replaced
            by a tensor of ones shaped like ``input_ids``.

    Returns:
        A dictionary with ``"encoder_output"`` and ``"pooled_output"``.
    """
    input_ids = inputs.get("input_ids")
    mask = inputs.get("attention_mask")
    embedded = self.embeddings(
        input_ids,
        inputs.get("token_type_ids"),
        inputs.get("position_ids"),
    )

    if mask is None:
        # No mask supplied: attend to every position.
        mask = flow.ones_like(input_ids, device=input_ids.device)
    extended_mask = self.get_extended_attention_mask(mask, input_ids)

    # The second value returned by the encoder is not used here.
    encoded, _ = self.encoder(embedded, extended_mask)
    pooled = self.pooler(encoded)

    return {"encoder_output": encoded, "pooled_output": pooled}
def _test_autograd_backward(test_case, shape, device):
    """Check ``Tensor.backward`` on ``sum(x ** 2)`` in three scenarios.

    Covers the implicit output gradient, an explicit output gradient of 3,
    and two backward passes with ``retain_graph=True`` whose gradients
    accumulate.
    """
    np_input = np.random.rand(*shape)

    def fresh_leaf():
        # A new leaf per scenario so gradients never leak between them.
        return flow.tensor(np_input,
                           dtype=flow.float32,
                           device=flow.device(device),
                           requires_grad=True)

    def expect_grad(leaf, factor):
        test_case.assertTrue(
            np.allclose(leaf.grad.numpy(), np_input * factor, 0.0001, 0.0001))

    # d/dx sum(x^2) = 2x
    leaf = fresh_leaf()
    total = (leaf**2).sum()
    total.backward()
    expect_grad(leaf, 2)

    # An output gradient of 3 scales the result to 6x.
    leaf = fresh_leaf()
    total = (leaf**2).sum()
    total.backward(flow.ones_like(total) * 3)
    expect_grad(leaf, 6)

    # Two retained backward passes accumulate to 4x.
    leaf = fresh_leaf()
    total = (leaf**2).sum()
    total.backward(retain_graph=True)
    total.backward(retain_graph=True)
    expect_grad(leaf, 4)
def build(self, inputs, targets):
    """Compute a batch-hard triplet loss with a selectable distance.

    Args:
        inputs: feature matrix; the first dimension is read as the batch
            size ``n``.
        targets: labels, tiled to an ``(n, n)`` grid to build the
            same-label mask.

    Returns:
        ``self.ranking_loss(dist_an, dist_ap, y, margin=self.margin)``.

    Raises:
        ValueError: if ``self.distance`` is neither ``'euclidean'`` nor
            ``'cosine'``.

    NOTE(review): this body mixes numpy calls with ``flow`` ops; it
    presumably expects array-like inputs that both accept - confirm
    against the caller.
    """
    n = inputs.shape[0]
    if self.distance == 'euclidean':
        dist = flow.math.pow(inputs, 2)
        dist = flow.math.reduce_sum(dist, axis=1, keepdims=True)
        # (n, 1) -> (n, n): repeat the squared norms along the columns only.
        # Tiling with (n, n) would give an (n*n, n) array.
        dist = np.tile(dist, (1, n))
        dist_t = flow.transpose(dist)
        dist = dist + dist_t
        inputs_t = flow.transpose(inputs)
        dist = addmm(dist, inputs, inputs_t, beta=1, alpha=-2)
        # The clamp needs the tensor itself; it was called without input.
        dist = flow.clamp(dist, min_value=1e-12)
        dist = flow.math.sqrt(dist)
    elif self.distance == 'cosine':
        fnorm = np.linalg.norm(inputs, ord=2, axis=1, keepdims=True)
        # Divide each row by its own L2 norm. The previous code tiled
        # `inputs` and divided by that, leaving `fnorm` unused.
        l2norm = inputs / np.tile(fnorm, (1, inputs.shape[1]))
        l2norm_t = flow.transpose(l2norm)
        dist = -np.matmul(l2norm, l2norm_t)
    else:
        raise ValueError(
            "Unsupported distance: {!r}".format(self.distance))

    target_expand = np.tile(targets, (n, n))
    target_expand_t = flow.transpose(target_expand)
    mask = flow.math.equal(target_expand, target_expand_t)

    dist_ap, dist_an = [], []
    for i in range(n):
        # Hardest positive: the largest distance among same-label samples.
        temp = np.ndarray.max(dist[i][mask[i]])
        temp = flow.expand_dims(temp, axis=0)
        dist_ap.append(temp)
        # Hardest negative: the smallest distance among other-label samples.
        temp = np.ndarray.min(dist[i][mask[i] == 0])
        temp = flow.expand_dims(temp, axis=0)
        dist_an.append(temp)
    dist_ap = flow.concat(dist_ap)
    dist_an = flow.concat(dist_an)

    y = flow.ones_like(dist_an)
    loss = self.ranking_loss(dist_an, dist_ap, y, margin=self.margin)
    return loss
def test_discriminator(
    z: oft.Numpy.Placeholder((self.batch_size, 100)),
    images: oft.Numpy.Placeholder((self.batch_size, 1, 28, 28)),
    label1: oft.Numpy.Placeholder((self.batch_size, 1)),
    label0: oft.Numpy.Placeholder((self.batch_size, 1)),
):
    """Build the discriminator loss and minimise it with plain SGD.

    Generated samples are scored against zeros and ``images`` against ones.
    ``label1`` and ``label0`` are accepted but not read by the body.

    Returns:
        The sum of the fake and real sigmoid cross-entropy losses.
    """
    bce_with_logits = flow.nn.sigmoid_cross_entropy_with_logits

    # Fake branch: the generator is frozen, the discriminator is trainable.
    fake = self.generator(z, trainable=False, const_init=True)
    fake_logits = self.discriminator(fake, trainable=True, const_init=True)
    fake_term = bce_with_logits(
        flow.zeros_like(fake_logits),
        fake_logits,
        name="Dloss_fake_sigmoid_cross_entropy_with_logits",
    )

    # Real branch, reusing the discriminator variables.
    real_logits = self.discriminator(
        images, trainable=True, reuse=True, const_init=True
    )
    real_term = bce_with_logits(
        flow.ones_like(real_logits),
        real_logits,
        name="Dloss_real_sigmoid_cross_entropy_with_logits",
    )

    total = fake_term + real_term

    # Constant learning rate, no momentum.
    schedule = flow.optimizer.PiecewiseConstantScheduler([], [self.lr])
    flow.optimizer.SGD(schedule, momentum=0).minimize(total)
    return total
def _test_ones_like_int(test_case, shape, device):
    """Check ``flow.ones_like`` on an int tensor.

    Verifies that dtype, shape and device follow the source tensor and that
    every element equals one.
    """
    source = flow.tensor(np.random.randn(*shape),
                         dtype=flow.int,
                         device=flow.device(device))
    result = flow.ones_like(source)

    # Metadata must be inherited from the source tensor.
    test_case.assertTrue(result.dtype is flow.int)
    test_case.assertTrue(result.shape == source.shape)
    test_case.assertTrue(result.device == source.device)

    # Values must match numpy's ones_like of the same array.
    expected = np.ones_like(source.numpy())
    test_case.assertTrue(np.array_equal(result.numpy(), expected))
def build(self, inputs, targets):
    """Build the batch-hard triplet ranking loss.

    Args:
        inputs: feature matrix with shape (batch_size, feat_dim).
        targets: ground truth labels, one per row of ``inputs``; they are
            broadcast along axis 1 to a (batch_size, batch_size) grid.

    Returns:
        ``self._MarginRankingLoss(dist_an, dist_ap, y)`` where ``dist_ap`` /
        ``dist_an`` hold, per anchor, the largest same-label distance and
        the smallest different-label distance, and ``y`` is all ones.
    """
    n = inputs.shape[0]

    # Pairwise Euclidean distance: ||a||^2 + ||b||^2 - 2 * a.b
    dist = math.reduce_sum(math.pow(
        inputs, flow.constant_like(inputs, 2, dtype=flow.float32)),
                           axis=1)
    # Zero constant used only as the (n, n) shape reference for broadcasts.
    shape_tensor = flow.constant(value=0.0,
                                 dtype=flow.float32,
                                 shape=(n, n))
    dist = flow.broadcast_like(dist, like=shape_tensor, broadcast_axes=[1])
    dist = math.add(
        dist, flow.transpose(dist, perm=(1, 0), batch_axis_non_change=True))
    temp1 = math.multiply(
        -2,
        flow.matmul(
            inputs,
            flow.transpose(inputs, perm=(1, 0),
                           batch_axis_non_change=True)))
    dist = math.add(dist, temp1)
    # Clamp before the square root to keep it away from zero/negatives.
    dist = math.sqrt(flow.clamp(dist, min_value=1e-12))

    # Broadcast the labels once and reuse the grid (and its transpose) for
    # both masks instead of rebuilding the same ops for each of them.
    targets_grid = flow.broadcast_like(targets,
                                       like=shape_tensor,
                                       broadcast_axes=[1])
    targets_grid_t = flow.transpose(targets_grid,
                                    perm=(1, 0),
                                    batch_axis_non_change=True)
    mask = math.equal(targets_grid, targets_grid_t)
    mask_rev = math.not_equal(targets_grid, targets_grid_t)

    # For each anchor, find the hardest positive and negative.
    dist_ap, dist_an = [], []
    for i in range(n):
        row = [(i, i + 1, 1)]
        temp_dist = flow.slice_v2(dist, row)
        temp_mask = flow.slice_v2(mask, row)
        temp_mask_rev = flow.slice_v2(mask_rev, row)
        dist_ap.append(
            math.reduce_max(
                flow.gather_nd(temp_dist, flow.where(temp_mask))))
        dist_an.append(
            math.reduce_min(
                flow.gather_nd(temp_dist, flow.where(temp_mask_rev))))
    dist_ap = flow.concat(dist_ap, 0)
    dist_an = flow.concat(dist_an, 0)

    y = flow.ones_like(dist_an)
    return self._MarginRankingLoss(dist_an, dist_ap, y)
def forward(self, predicted, target):
    """Compute the additive-margin softmax loss.

    Args:
        predicted: 2-D score tensor indexed as ``[row, class]``.
        target: per-row class indices used to pick the target score.

    Returns:
        The mean over rows of ``-log(e^{s(c_y - m)} / (e^{s(c_y - m)} +
        sum_{j != y} e^{s c_j} + epsilon))``.
    """
    # ------------ AM Softmax ------------ #
    predicted = predicted / (predicted.norm(dim=0) + self.epsilon)
    row_ids = flow.Tensor(range(predicted.size(0))).long().to(
        predicted.device)

    # Score of the target class for every row, before and after the margin.
    target_cos = predicted[row_ids, target]
    margin_cos = target_cos - self.m

    # Exponentials are written as e ** x on a tensor filled with e.
    e_per_row = flow.ones_like(margin_cos) * np.e
    e_per_entry = flow.ones_like(predicted) * np.e

    target_term = e_per_row**(self.s * margin_cos)
    all_classes = (e_per_entry**(predicted * self.s)).sum(dim=1)
    # Remove the un-margined target term so only the other classes remain.
    other_classes = all_classes - e_per_row**(target_cos * self.s)

    ratio = target_term / (target_term + other_classes + self.epsilon)
    return -flow.log(ratio).mean()
def train_generator(
    z=flow.FixedTensorDef((self.batch_size, self.z_dim)),
):
    """Build the generator loss and register it with ``flow.losses``.

    The discriminator is run with ``trainable=False`` and the generated
    samples are scored against a target of ones.

    Returns:
        The tuple ``(g_loss, g_out)``.
    """
    fake = self.generator(z, trainable=True)
    fake_logits = self.discriminator(fake, trainable=False)
    all_real = flow.ones_like(fake_logits)
    generator_loss = flow.nn.sigmoid_cross_entropy_with_logits(
        all_real,
        fake_logits,
        name="Gloss_sigmoid_cross_entropy_with_logits",
    )
    flow.losses.add_loss(generator_loss)
    return generator_loss, fake
def _test_ones_like_int(test_case, placement, sbp, shape, device):
    """Check ``flow.ones_like`` on a global int tensor.

    The source tensor is created on ``device`` and converted with
    ``to_global(placement=..., sbp=...)``; the result must keep the dtype,
    shape and placement and contain only ones.
    """
    local = flow.tensor(np.random.randn(*shape),
                        dtype=flow.int,
                        device=flow.device(device))
    source = local.to_global(placement=placement, sbp=sbp)
    result = flow.ones_like(source)

    # Metadata must be inherited from the global source tensor.
    test_case.assertTrue(result.dtype is flow.int)
    test_case.assertTrue(result.shape == source.shape)
    test_case.assertTrue(result.placement == placement)

    # Values are compared against a numpy array of ones of the same shape.
    expected = np.ones(source.numpy().shape)
    test_case.assertTrue(np.array_equal(result.numpy(), expected))
def test_generator(
    z: oft.Numpy.Placeholder((self.batch_size, self.z_dim)),
    label1: oft.Numpy.Placeholder((self.batch_size, 1)),
):
    """Build the generator loss and minimise it with plain SGD.

    The discriminator is run with ``trainable=False`` and the generated
    samples are scored against ones. ``label1`` is accepted but not read by
    the body.

    Returns:
        The generator sigmoid cross-entropy loss.
    """
    fake = self.generator(z, trainable=True, const_init=True)
    fake_logits = self.discriminator(fake, trainable=False, const_init=True)
    all_real = flow.ones_like(fake_logits)
    generator_loss = flow.nn.sigmoid_cross_entropy_with_logits(
        all_real,
        fake_logits,
        name="Gloss_sigmoid_cross_entropy_with_logits",
    )

    # Constant learning rate, no momentum.
    schedule = flow.optimizer.PiecewiseConstantScheduler([], [self.lr])
    flow.optimizer.SGD(schedule, momentum=0).minimize(generator_loss)
    return generator_loss
def get_target_tensor(self, prediction, target_is_real):
    """Create a label tensor shaped like ``prediction``.

    Parameters:
        prediction (tensor) -- the tensor whose shape the label copies,
            typically a discriminator output
        target_is_real (bool) -- truthy for an all-ones label, falsy for an
            all-zeros label

    Returns:
        A tensor of ones or zeros created with ``flow.ones_like`` /
        ``flow.zeros_like`` from ``prediction``.
    """
    make_label = flow.ones_like if target_is_real else flow.zeros_like
    return make_label(prediction)
def _test_autograd_grad(test_case, shape, device):
    """Check ``flow.autograd.grad`` on ``sum(x ** 2)``.

    Covers the implicit output gradient (which must also leave ``x.grad``
    unset) and an explicit output gradient of 3.
    """
    np_input = np.random.rand(*shape)

    def fresh_leaf():
        # A new leaf per scenario keeps the two checks independent.
        return flow.tensor(np_input,
                           dtype=flow.float32,
                           device=flow.device(device),
                           requires_grad=True)

    # Implicit output gradient: result is 2x and .grad stays untouched.
    leaf = fresh_leaf()
    total = (leaf**2).sum()
    grad = flow.autograd.grad(total, leaf)[0]
    test_case.assertTrue(leaf.grad is None)
    test_case.assertTrue(
        np.allclose(grad.numpy(), np_input * 2, 0.0001, 0.0001))

    # An explicit output gradient of 3 scales the result to 6x.
    leaf = fresh_leaf()
    total = (leaf**2).sum()
    out_grad = flow.ones_like(total) * 3
    grad = flow.autograd.grad(total, leaf, out_grad)[0]
    test_case.assertTrue(
        np.allclose(grad.numpy(), np_input * 6, 0.0001, 0.0001))
def recognize_beam(self, encoder_outputs, char_list, args):
    """Beam search; decodes one utterance at a time.

    Args:
        encoder_outputs: encoder output for a single utterance, T x H
            (a batch dimension is added in front before decoding).
        char_list: list of characters, indexed by token id when printing
            the surviving hypotheses.
        args: options object; ``beam_size``, ``nbest`` and
            ``decode_max_len`` are read. A ``decode_max_len`` of 0 means
            "use the number of encoder frames".

    Returns:
        nbest_hyps: at most ``args.nbest`` ended hypotheses sorted by
        descending score. Each is a dict with ``"score"`` and ``"yseq"``,
        where ``"yseq"`` has been converted to a plain Python list.
    """
    # search params
    beam = args.beam_size
    nbest = args.nbest
    if args.decode_max_len == 0:
        maxlen = encoder_outputs.size(0)
    else:
        maxlen = args.decode_max_len

    # Add the batch dimension expected by the decoder layers.
    encoder_outputs = encoder_outputs.unsqueeze(0)

    # prepare sos
    ys = flow.ones(1, 1).fill_(self.sos_id).type_as(encoder_outputs).long()

    hyp = {"score": 0.0, "yseq": ys}
    hyps = [hyp]
    ended_hyps = []

    for i in range(maxlen):
        hyps_best_kept = []
        for hyp in hyps:
            ys = hyp["yseq"]
            ys = ys.to(device=encoder_outputs.device)

            # -- Prepare masks
            non_pad_mask = flow.ones_like(ys).to(
                dtype=flow.float32).unsqueeze(-1)
            slf_attn_mask = get_subsequent_mask(ys)

            # -- Forward
            dec_output = self.dropout(
                self.tgt_word_emb(ys) * self.x_logit_scale +
                self.positional_encoding(ys))

            for dec_layer in self.layer_stack:
                dec_output, _, _ = dec_layer(
                    dec_output,
                    encoder_outputs,
                    non_pad_mask=non_pad_mask,
                    slf_attn_mask=slf_attn_mask,
                    dec_enc_attn_mask=None,
                )

            # Only the last position is projected to vocabulary logits.
            seq_logit = self.tgt_word_prj(dec_output[:, -1])

            local_logit = F.softmax(seq_logit)
            local_scores = flow.log(local_logit)

            # topk scores
            local_best_scores, local_best_ids = flow.topk(local_scores,
                                                          beam,
                                                          dim=1)

            for j in range(beam):
                # Extend the current hypothesis by the j-th best token.
                new_hyp = {}
                new_hyp["score"] = hyp["score"] + local_best_scores[0, j]
                new_hyp["yseq"] = (flow.ones(
                    1, (1 + ys.size(1))).type_as(encoder_outputs).long())
                new_hyp["yseq"][:, :ys.size(1)] = hyp["yseq"]
                new_hyp["yseq"][:, ys.size(1)] = int(
                    float(local_best_ids[0, j].numpy()))
                hyps_best_kept.append(new_hyp)

            # Keep only the `beam` best candidates collected so far.
            hyps_best_kept = sorted(hyps_best_kept,
                                    key=lambda x: x["score"],
                                    reverse=True)[:beam]
        # end for hyp in hyps
        hyps = hyps_best_kept

        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            for hyp in hyps:
                hyp["yseq"] = flow.cat(
                    [
                        hyp["yseq"],
                        flow.ones(1, 1).fill_(
                            self.eos_id).type_as(encoder_outputs).long(),
                    ],
                    dim=1,
                )

        # add ended hypotheses to a final list, and remove them from the
        # current hypotheses
        # (this will be a problem, number of hyps < beam)
        remained_hyps = []
        for hyp in hyps:
            if hyp["yseq"][0, -1] == self.eos_id:
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)

        hyps = remained_hyps
        if len(hyps) > 0:
            print("remeined hypothes: " + str(len(hyps)))
        else:
            print("no hypothesis. Finish decoding.")
            break

        # Print the surviving hypotheses without the leading sos token.
        for hyp in hyps:
            print("hypo: " + "".join(
                [char_list[int(x.numpy())] for x in hyp["yseq"][0, 1:]]))

    nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"],
                        reverse=True)[:min(len(ended_hyps), nbest)]
    # Convert the token tensors to plain Python lists for the caller.
    for hyp in nbest_hyps:
        hyp["yseq"] = hyp["yseq"][0].cpu().numpy().tolist()
    return nbest_hyps
def Causal_Self_Attention(x, config, name='csa'):
    """Causal multi-head self-attention.

    Input::
        x : embedded words input [B, T, C]
            -- B is the batch size
            -- T is the sequence length (block_size)
            -- C is the dimension of the embedding (n_embd);
               C / n_head is the dimension of each head
        config: configuration object; ``n_embd``, ``n_head``,
            ``attn_pdrop`` and ``resid_pdrop`` are read
        name: prefix for the names of the layers and ops created here
    Output::
        y : tensor of shape [B, T, C], usable as ``x`` for the next layer
    Description::
        Query, key and value are dense projections of ``x`` split into
        heads. Attention scores are scaled by 1/sqrt(head size), positions
        above the diagonal are filled with -inf before the softmax, and
        dropout is applied to both the attention weights and the output.
        Code adapted from:
            https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
        Theory described in: http://jalammar.github.io/illustrated-gpt2/
    """
    assert config.n_embd % config.n_head == 0
    B, T, C = x.shape
    head_dim = C // config.n_head
    kaiming_init_C = flow.kaiming_initializer(shape=(C, C))

    def project_to_heads(suffix):
        # (B, T, C) -> dense -> (B, T, nh, hs) -> (B, nh, T, hs)
        projected = flow.layers.dense(x,
                                      units=config.n_embd,
                                      kernel_initializer=kaiming_init_C,
                                      name=(name + suffix))
        projected = flow.reshape(projected, [B, T, config.n_head, head_dim])
        return flow.transpose(projected, [0, 2, 1, 3])

    # Layers are created in the order query, key, value.
    query = project_to_heads('_query')
    key = project_to_heads('_key')
    value = project_to_heads('value')

    # Scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T)
    # -> (B, nh, T, T)
    scale = 1.0 / math.sqrt(key.shape[-1])
    scores = flow.matmul(query, flow.transpose(key, [0, 1, 3, 2])) * scale

    # tril of a constant -1 keeps -1 on and below the diagonal and 0 above;
    # adding ones turns that into 0 (visible) / 1 (masked future position).
    lower_minus_one = flow.math.tril(
        flow.constant(value=int(-1),
                      dtype=flow.int32,
                      shape=(B, config.n_head, T, T),
                      name=name + "_ConstantLike_tril"))
    future_mask = lower_minus_one + flow.ones_like(like=lower_minus_one,
                                                   dtype=flow.int32)
    scores = flow.masked_fill(scores, future_mask, float('-inf'))
    weights = flow.nn.softmax(scores, name=name + 'att')
    weights = flow.nn.dropout(weights, config.attn_pdrop)

    # Weighted sum of values: (B, nh, T, T) x (B, nh, T, hs)
    # -> (B, nh, T, hs), then merge the heads back to (B, T, C).
    y = flow.matmul(weights, value)
    y = flow.transpose(y, [0, 2, 1, 3])
    y = flow.reshape(y, [B, T, C])
    return flow.nn.dropout(y, config.resid_pdrop)
def validation_for_B_dir(self):
    """Convert every file in ``self.validation_B_dir`` with the B->A generator.

    Each file is loaded, padded, decomposed with WORLD, pushed through
    ``self.generator_B2A`` with an all-ones mask, re-synthesised and written
    to ``self.output_B_dir`` as ``convert_<basename>``.

    NOTE(review): ``dataset_B_mean`` / ``dataset_B_std`` are used both for
    pitch conversion and for normalising the coded spectral envelope -
    confirm that a single set of statistics is intended.
    """
    n_mcep = 80
    sr = 22050
    period = 5.0
    src_dir = self.validation_B_dir
    dst_dir = self.output_B_dir

    os.makedirs(dst_dir, exist_ok=True)

    print("Generating Validation Data A from B...")
    for entry in os.listdir(src_dir):
        # Load and pad the waveform.
        audio, _ = librosa.load(os.path.join(src_dir, entry),
                                sr=sr,
                                mono=True)
        audio = preprocess.wav_padding(wav=audio,
                                       sr=sr,
                                       frame_period=period,
                                       multiple=4)

        # WORLD analysis; the time axis is not needed afterwards.
        f0, _, sp, ap = preprocess.world_decompose(wav=audio,
                                                   fs=sr,
                                                   frame_period=period)
        pitch = preprocess.pitch_conversion(
            f0=f0,
            mean_log_src=self.dataset_B_mean,
            std_log_src=self.dataset_B_std,
            mean_log_target=self.dataset_A_mean,
            std_log_target=self.dataset_A_std,
        )

        # Encode, normalise with the B statistics and add a batch axis.
        encoded = preprocess.world_encode_spectral_envelop(sp=sp,
                                                           fs=sr,
                                                           dim=n_mcep)
        normed = (encoded.T - self.dataset_B_mean) / self.dataset_B_std
        batch = flow.tensor(np.array([normed]))
        if flow.cuda.is_available():
            batch = batch.cuda()
        batch = batch.float()

        # Generator pass with an all-ones mask.
        converted = self.generator_B2A(batch, flow.ones_like(batch))
        converted = np.squeeze(converted.cpu().detach().numpy())

        # De-normalise with the A statistics and decode.
        converted = converted * self.dataset_A_std + self.dataset_A_mean
        converted = np.ascontiguousarray(converted.T).astype(np.double)
        decoded = preprocess.world_decode_spectral_envelop(
            coded_sp=converted, fs=sr)
        result = preprocess.world_speech_synthesis(
            f0=pitch[0],
            decoded_sp=decoded,
            ap=ap,
            fs=sr,
            frame_period=period,
        )

        out_path = os.path.join(dst_dir,
                                "convert_" + os.path.basename(entry))
        sf.write(out_path, result, sr)
def infer(self):
    """Run the inference loop for MaskCycleGAN-VC (A -> B).

    Loads the pretrained models, converts every file in
    ``self.infer_data_dir`` with ``self.generator_A2B`` using an all-ones
    mask, and writes each result back into the same directory as
    ``convert_<basename>``.
    """
    # load pretrain models
    self.loadModel(self.pretrain_models)

    n_mcep = 80
    sr = self.sample_rate
    period = 5.0
    data_dir = self.infer_data_dir

    print("Generating Validation Data B from A...")
    for entry in os.listdir(data_dir):
        # Load and pad the waveform.
        audio, _ = librosa.load(os.path.join(data_dir, entry),
                                sr=sr,
                                mono=True)
        audio = preprocess.wav_padding(wav=audio,
                                       sr=sr,
                                       frame_period=period,
                                       multiple=4)

        # WORLD analysis; the time axis is not needed afterwards.
        f0, _, sp, ap = preprocess.world_decompose(wav=audio,
                                                   fs=sr,
                                                   frame_period=period)
        pitch = preprocess.pitch_conversion(
            f0=f0,
            mean_log_src=self.dataset_A_mean,
            std_log_src=self.dataset_A_std,
            mean_log_target=self.dataset_B_mean,
            std_log_target=self.dataset_B_std,
        )

        # Encode, normalise with the A statistics and add a batch axis.
        encoded = preprocess.world_encode_spectral_envelop(sp=sp,
                                                           fs=sr,
                                                           dim=n_mcep)
        normed = (encoded.T - self.dataset_A_mean) / self.dataset_A_std
        batch = flow.tensor(np.array([normed]))
        if flow.cuda.is_available():
            batch = batch.cuda()
        batch = batch.float()

        # Generator pass with an all-ones mask.
        converted = self.generator_A2B(batch, flow.ones_like(batch))
        converted = np.squeeze(converted.cpu().detach().numpy())

        # De-normalise with the B statistics and decode.
        converted = converted * self.dataset_B_std + self.dataset_B_mean
        converted = np.ascontiguousarray(converted.T).astype(np.double)
        decoded = preprocess.world_decode_spectral_envelop(
            coded_sp=converted, fs=sr)
        result = preprocess.world_speech_synthesis(
            f0=pitch[0],
            decoded_sp=decoded,
            ap=ap,
            fs=sr,
            frame_period=period,
        )

        out_path = os.path.join(data_dir,
                                "convert_" + os.path.basename(entry))
        sf.write(out_path, result, sr)
def train(self):
    """Run the adversarial training loop with a domain classifier.

    Each iteration updates the classifier ``self.C`` on real frames, then
    the discriminator ``self.D``; every ``self.n_critic`` iterations the
    generator ``self.G`` is updated with adversarial, cycle, classification
    and identity losses. The loop also logs every ``self.log_step``
    iterations, writes converted samples every ``self.sample_step``, saves
    checkpoints every ``self.model_save_step`` and linearly decays the
    learning rates over the last ``self.num_iters_decay`` iterations.
    """
    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr

    start_iters = 0
    if self.resume_iters:
        # Nothing is restored here; training always starts at iteration 0.
        pass

    norm = Normalizer()
    data_iter = iter(self.data_loader)

    print("Start training......")
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):
        # Preprocess input data
        # Fetch real images and labels. Only an exhausted iterator restarts
        # the loader; any other error is a real failure and propagates
        # instead of being silently retried.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except StopIteration:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target domain labels randomly.
        rand_idx = flow.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)
        # Original domain one-hot labels.
        label_org = label_org.to(self.device)
        # Target domain one-hot labels.
        label_trg = label_trg.to(self.device)
        speaker_idx_org = speaker_idx_org.to(self.device)
        speaker_idx_trg = speaker_idx_trg.to(self.device)

        # Train the discriminator
        # Compute loss with real audio frame.
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss["C/C_loss"] = cls_loss_real.item()

        out_r = self.D(x_real, label_org)
        # Compute loss with fake audio frame.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        d_loss_t = nn.BCEWithLogitsLoss()(
            input=out_f, target=flow.zeros_like(
                out_f).float()) + nn.BCEWithLogitsLoss()(
                    input=out_r, target=flow.ones_like(out_r).float())

        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Compute loss for gradient penalty.
        alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = ((alpha * x_real +
                  (1 - alpha) * x_fake).detach().requires_grad_(True))
        out_src = self.D(x_hat, label_trg)

        # TODO: Second-order derivation is not currently supported in
        # oneflow, so gradient penalty cannot be used temporarily.
        if self.use_gradient_penalty:
            d_loss_gp = self.gradient_penalty(out_src, x_hat)
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
        else:
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        loss["D/D_loss"] = d_loss.item()

        # Train the generator
        if (i + 1) % self.n_critic == 0:
            # Original-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = nn.BCEWithLogitsLoss()(
                input=g_out_src, target=flow.ones_like(g_out_src).float())

            out_cls = self.C(x_real)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

            # Target-to-original domain.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = nn.L1Loss()(x_reconst, x_real)

            # Original-to-Original domain(identity).
            x_fake_iden = self.G(x_real, label_org)
            id_loss = nn.L1Loss()(x_fake_iden, x_real)

            # Backward and optimize.
            g_loss = (g_loss_fake + self.lambda_cycle * g_loss_rec +
                      self.lambda_cls * g_loss_cls +
                      self.lambda_identity * id_loss)

            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss["G/loss_fake"] = g_loss_fake.item()
            loss["G/loss_rec"] = g_loss_rec.item()
            loss["G/loss_cls"] = g_loss_cls.item()
            loss["G/loss_id"] = id_loss.item()
            loss["G/g_loss"] = g_loss.item()

        # Miscellaneous
        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            # Drop the fractional seconds from the elapsed time.
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(
                et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

        # Translate fixed images for debugging.
        if (i + 1) % self.sample_step == 0:
            with flow.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                # Pick any speaker other than the source as the target.
                target = random.choice(
                    [x for x in speakers if x != speaker])
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                for filename, content in d.items():
                    f0 = content["f0"]
                    ap = content["ap"]
                    sp_norm_pad = self.pad_coded_sp(
                        content["coded_sp_norm"])

                    # Convert the padded features in FRAMES-wide segments.
                    convert_result = []
                    for start_idx in range(
                            0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:,
                                              start_idx:start_idx + FRAMES]

                        one_seg = flow.Tensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0),
                                               one_seg.size(1))
                        label_tensor = flow.Tensor(label_t)
                        one_seg = one_seg.to(self.device)
                        label_tensor = label_tensor.to(self.device)
                        one_set_return = self.G(
                            one_seg, label_tensor).detach().cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(
                            one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    # Trim the padding back to the original frame count.
                    convert_con = convert_con[:, 0:content["coded_sp_norm"].
                                              shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T,
                                                   dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu,
                                                          SAMPLE_RATE,
                                                          fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(
                        f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap,
                                     SAMPLE_RATE)

                    name = f"{speaker}-{target}_iter{i+1}_{filename}"
                    path = os.path.join(self.sample_dir, name)
                    print(f"[save]:{path}")
                    sf.write(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir,
                                  "{}-G".format(i + 1))
            D_path = os.path.join(self.model_save_dir,
                                  "{}-D".format(i + 1))
            C_path = os.path.join(self.model_save_dir,
                                  "{}-C".format(i + 1))
            flow.save(self.G.state_dict(), G_path)
            flow.save(self.D.state_dict(), D_path)
            flow.save(self.C.state_dict(), C_path)
            print("Saved model checkpoints into {}...".format(
                self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                self.num_iters - self.num_iters_decay):
            g_lr -= self.g_lr / float(self.num_iters_decay)
            d_lr -= self.d_lr / float(self.num_iters_decay)
            c_lr -= self.c_lr / float(self.num_iters_decay)
            self.update_lr(g_lr, d_lr, c_lr)
            print("Decayed learning rates, g_lr: {}, d_lr: {}.".format(
                g_lr, d_lr))
def forward(self, inputs, targets):
    """Build the batch-hard triplet ranking loss for the chosen distance.

    Args:
        inputs: feature matrix; ``inputs.shape[0]`` is read as the batch
            size ``n``.
        targets: labels, broadcast along axis 1 to an (n, n) grid to build
            the same-label and different-label masks.

    Returns:
        ``self._MarginRankingLoss(dist_an, dist_ap, y)`` where ``y`` is all
        ones.

    NOTE(review): ``dist`` is only assigned when ``self.distance`` is
    ``'euclidean'`` or ``'cosine'``; any other value fails later on the
    unbound name.
    """
    n = inputs.shape[0]
    # Compute pairwise distance, replace by the official when merged
    # The timestamp makes the name of the variable created below unique.
    tempname = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    # Zero constant used only as the (n, n) shape reference for broadcasts.
    shape_tensor = flow.constant(value=0.0,
                                 dtype=flow.float32,
                                 shape=(n, n))
    if self.distance == 'euclidean':
        # Variable filled with 2, used as the exponent for the square.
        blob_2 = flow.get_variable(
            "blob_2_" + tempname,
            shape=inputs.shape,
            initializer=flow.constant_initializer(2),
            dtype=inputs.dtype)
        dist = flow.math.pow(inputs, blob_2)
        dist = flow.math.reduce_sum(dist, axis=1, keepdims=True)
        dist = flow.broadcast_like(dist, shape_tensor)
        tempdist = flow.transpose(dist)
        dist = dist + tempdist
        inputs_t = flow.transpose(inputs)
        # dist - 2 * inputs @ inputs^T, via the addmm helper.
        dist = addmm(dist, inputs, inputs_t, beta=1, alpha=-2)
        # Clamp before the square root to keep it away from zero/negatives.
        dist = flow.clamp(dist, min_value=1e-12)
        dist = flow.math.sqrt(dist)
    elif self.distance == 'cosine':
        # Previous attempt, kept for reference:
        # fnorm=flow.math.l2_normalize(inputs, axis=1)
        # inputs / l2_normalize(inputs) is averaged over axis 1 and then
        # used as the divisor for the row normalisation below.
        # NOTE(review): this presumably recovers the per-row L2 norm -
        # confirm, including how zero entries behave.
        fnorm = flow.math.reduce_mean(flow.math.divide(
            inputs, flow.math.l2_normalize(inputs, axis=1)),
                                      axis=1,
                                      keepdims=True)
        expand_fnorm = flow.broadcast_like(fnorm,
                                           like=inputs,
                                           broadcast_axes=[1])
        l2norm = flow.math.divide(inputs, expand_fnorm)
        l2norm_t = flow.transpose(l2norm, perm=(1, 0))
        dist = flow.math.negative(flow.matmul(l2norm, l2norm_t))
    # For each anchor, find the hardest positive and negative
    mask = math.equal(
        flow.broadcast_like(targets, like=shape_tensor, broadcast_axes=[1]),
        flow.transpose(flow.broadcast_like(targets,
                                           like=shape_tensor,
                                           broadcast_axes=[1]),
                       perm=(1, 0),
                       batch_axis_non_change=True))
    mask_rev = math.not_equal(
        flow.broadcast_like(targets, like=shape_tensor, broadcast_axes=[1]),
        flow.transpose(flow.broadcast_like(targets,
                                           like=shape_tensor,
                                           broadcast_axes=[1]),
                       perm=(1, 0),
                       batch_axis_non_change=True))
    dist_ap, dist_an = [], []
    for i in range(n):
        # Row i of the distance matrix and of both masks.
        temp_dist = flow.slice_v2(dist, [(i, i + 1, 1)])
        temp_mask = flow.slice_v2(mask, [(i, i + 1, 1)])
        temp_mask_rev = flow.slice_v2(mask_rev, [(i, i + 1, 1)])
        # Hardest positive: the largest distance among same-label samples.
        temp_dist_ap = flow.expand_dims(
            math.reduce_max(
                flow.gather_nd(temp_dist, flow.where(temp_mask))), 0)
        # Hardest negative: the smallest distance among other-label samples.
        temp_dist_an = flow.expand_dims(
            math.reduce_min(
                flow.gather_nd(temp_dist, flow.where(temp_mask_rev))), 0)
        dist_ap.append(temp_dist_ap)
        dist_an.append(temp_dist_an)
    dist_ap = flow.concat(dist_ap, 0)
    dist_an = flow.concat(dist_an, 0)
    y = flow.ones_like(dist_an)
    return self._MarginRankingLoss(dist_an, dist_ap, y)
def train(self):
    """Implements the training loop for MaskCycleGAN-VC.

    For every batch the two generators are updated first (adversarial,
    two-step adversarial, cycle and identity losses), followed by the four
    discriminators (least-squares losses on real, generated and cycled
    samples). The identity loss weight is set to 0 once the iteration count
    exceeds 10000, and the learning rates are adjusted once it exceeds
    ``self.decay_after``. After every ``self.epochs_per_save`` epochs
    (epoch 0 excluded) a checkpoint is saved and both validation directories
    are converted.
    """
    for epoch in range(self.start_epoch, self.num_epochs + 1):

        for i, (real_A, mask_A, real_B,
                mask_B) in enumerate(self.train_dataloader):
            num_iterations = (self.n_samples //
                              self.mini_batch_size) * epoch + i
            if num_iterations > 10000:
                self.identity_loss_lambda = 0
            if num_iterations > self.decay_after:
                self.adjust_lr_rate(self.generator_optimizer,
                                    generator=True)
                # The generator optimizer used to be passed here as well,
                # so the discriminator optimizer was never adjusted.
                self.adjust_lr_rate(self.discriminator_optimizer,
                                    generator=False)

            real_A = real_A.to(self.device, dtype=flow.float)
            mask_A = mask_A.to(self.device, dtype=flow.float)
            real_B = real_B.to(self.device, dtype=flow.float)
            mask_B = mask_B.to(self.device, dtype=flow.float)

            # Train Generator
            self.generator_A2B.train()
            self.generator_B2A.train()
            self.discriminator_A.eval()
            self.discriminator_B.eval()
            self.discriminator_A2.eval()
            self.discriminator_B2.eval()

            # Generator Feed Forward
            # Cycle and identity passes use an all-ones mask.
            fake_B = self.generator_A2B(real_A, mask_A)
            cycle_A = self.generator_B2A(fake_B, flow.ones_like(fake_B))
            fake_A = self.generator_B2A(real_B, mask_B)
            cycle_B = self.generator_A2B(fake_A, flow.ones_like(fake_A))
            identity_A = self.generator_B2A(real_A, flow.ones_like(real_A))
            identity_B = self.generator_A2B(real_B, flow.ones_like(real_B))
            d_fake_A = self.discriminator_A(fake_A)
            d_fake_B = self.discriminator_B(fake_B)

            # For Two Step Adversarial Loss
            d_fake_cycle_A = self.discriminator_A2(cycle_A)
            d_fake_cycle_B = self.discriminator_B2(cycle_B)

            # Generator Cycle Loss
            cycleLoss = flow.mean(flow.abs(real_A - cycle_A)) + flow.mean(
                flow.abs(real_B - cycle_B))

            # Generator Identity Loss
            identityLoss = flow.mean(
                flow.abs(real_A - identity_A)) + flow.mean(
                    flow.abs(real_B - identity_B))

            # Generator Loss
            g_loss_A2B = flow.mean((1 - d_fake_B)**2)
            g_loss_B2A = flow.mean((1 - d_fake_A)**2)

            # Generator Two Step Adversarial Loss
            generator_loss_A2B_2nd = flow.mean((1 - d_fake_cycle_B)**2)
            generator_loss_B2A_2nd = flow.mean((1 - d_fake_cycle_A)**2)

            # Total Generator Loss
            g_loss = (g_loss_A2B + g_loss_B2A + generator_loss_A2B_2nd +
                      generator_loss_B2A_2nd +
                      self.cycle_loss_lambda * cycleLoss +
                      self.identity_loss_lambda * identityLoss)

            # Backprop for Generator
            self.reset_grad()
            g_loss.backward()
            self.generator_optimizer.step()

            # Train Discriminator
            self.generator_A2B.eval()
            self.generator_B2A.eval()
            self.discriminator_A.train()
            self.discriminator_B.train()
            self.discriminator_A2.train()
            self.discriminator_B2.train()

            # Discriminator Feed Forward
            d_real_A = self.discriminator_A(real_A)
            d_real_B = self.discriminator_B(real_B)
            d_real_A2 = self.discriminator_A2(real_A)
            d_real_B2 = self.discriminator_B2(real_B)
            generated_A = self.generator_B2A(real_B, mask_B)
            d_fake_A = self.discriminator_A(generated_A)

            # For Two Step Adversarial Loss A->B
            cycled_B = self.generator_A2B(generated_A,
                                          flow.ones_like(generated_A))
            d_cycled_B = self.discriminator_B2(cycled_B)

            generated_B = self.generator_A2B(real_A, mask_A)
            d_fake_B = self.discriminator_B(generated_B)

            # For Two Step Adversarial Loss B->A
            cycled_A = self.generator_B2A(generated_B,
                                          flow.ones_like(generated_B))
            d_cycled_A = self.discriminator_A2(cycled_A)

            # Loss Functions
            d_loss_A_real = flow.mean((1 - d_real_A)**2)
            d_loss_A_fake = flow.mean((0 - d_fake_A)**2)
            d_loss_A = (d_loss_A_real + d_loss_A_fake) / 2.0

            d_loss_B_real = flow.mean((1 - d_real_B)**2)
            d_loss_B_fake = flow.mean((0 - d_fake_B)**2)
            d_loss_B = (d_loss_B_real + d_loss_B_fake) / 2.0

            # Two Step Adversarial Loss
            d_loss_A_cycled = flow.mean((0 - d_cycled_A)**2)
            d_loss_B_cycled = flow.mean((0 - d_cycled_B)**2)
            d_loss_A2_real = flow.mean((1 - d_real_A2)**2)
            d_loss_B2_real = flow.mean((1 - d_real_B2)**2)
            d_loss_A_2nd = (d_loss_A2_real + d_loss_A_cycled) / 2.0
            d_loss_B_2nd = (d_loss_B2_real + d_loss_B_cycled) / 2.0

            # Final Loss for discriminator with the Two Step Adversarial
            # Loss
            d_loss = (d_loss_A + d_loss_B) / 2.0 + (d_loss_A_2nd +
                                                    d_loss_B_2nd) / 2.0

            # Backprop for Discriminator
            self.reset_grad()
            d_loss.backward()
            self.discriminator_optimizer.step()

            if (i + 1) % 2 == 0:
                # Every value is converted with .item() so the {:.4f}
                # fields always receive plain Python numbers.
                print(
                    "Iter:{} Generator Loss:{:.4f} Discrimator Loss:{:.4f} GA2B:{:.4f} GB2A:{:.4f} G_id:{:.4f} G_cyc:{:.4f} D_A:{:.4f} D_B:{:.4f}"
                    .format(
                        num_iterations,
                        g_loss.item(),
                        d_loss.item(),
                        g_loss_A2B.item(),
                        g_loss_B2A.item(),
                        identityLoss.item(),
                        cycleLoss.item(),
                        d_loss_A.item(),
                        d_loss_B.item(),
                    ))

        # Save each model checkpoint and validation
        if epoch % self.epochs_per_save == 0 and epoch != 0:
            self.saveModelCheckPoint(epoch, PATH="model_checkpoint")
            self.validation_for_A_dir()
            self.validation_for_B_dir()