def test_resize_sequence_without_permutation(self):
  sub_sequence, seq_lengths, _ = utils.resize_sequence(
      sequence=np.array([[1, 1], [2, 2], [3, 3]]),
      cluster_id=np.array([1, 2, 1]),
      num_permutations=None)
  self.assertEqual(len(sub_sequence), 2)
  self.assertTrue((sub_sequence[0] == [[1, 1], [3, 3]]).all())
  self.assertTrue((sub_sequence[1] == [[2, 2]]).all())
  self.assertListEqual(seq_lengths, [3, 2])
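# Illustrative sketch (not part of the original tests): the expected values
# above are consistent with resize_sequence() splitting the sequence by
# cluster_id and reporting each per-cluster length plus one; the extra slot
# is later used as a zero start-of-sequence frame when packing. This reading
# is inferred from the assertions above, not from the utils implementation.
import numpy as np

cluster_id = np.array([1, 2, 1])
_, counts = np.unique(cluster_id, return_counts=True)
expected_seq_lengths = (counts + 1).tolist()
assert expected_seq_lengths == [3, 2]  # matches assertListEqual above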
def fit(self, train_sequence, train_cluster_id, args):
  """Fit UISRNN model.

  Args:
    train_sequence: 2-dim numpy array of real numbers, size: N * D
      - the training observation sequence.
      N - summation of lengths of all utterances.
      D - observation dimension.
      For example,
        train_sequence =
        [[1.2 3.0 -4.1 6.0]    --> an entry of speaker #0 from utterance 'iaaa'
         [0.8 -1.1 0.4 0.5]    --> an entry of speaker #1 from utterance 'iaaa'
         [-0.2 1.0 3.8 5.7]    --> an entry of speaker #0 from utterance 'iaaa'
         [3.8 -0.1 1.5 2.3]    --> an entry of speaker #0 from utterance 'ibbb'
         [1.2 1.4 3.6 -2.7]]   --> an entry of speaker #0 from utterance 'ibbb'
      Here N=5, D=4.
      We concatenate all training utterances into this single sequence.
    train_cluster_id: 1-dim list or numpy array of strings, size: N
      - the speaker id sequence.
      For example,
        train_cluster_id =
        ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
      'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.
      Note that the order of entries within an utterance is preserved,
      and all utterances are simply concatenated together.
    args: Training configurations. See arguments.py for details.

  Raises:
    TypeError: If train_sequence or train_cluster_id is of wrong type.
    ValueError: If train_sequence or train_cluster_id has wrong dimension.
  """
  # check type
  if (not isinstance(train_sequence, np.ndarray) or
      train_sequence.dtype != float):
    raise TypeError('train_sequence should be a numpy array of float type.')
  if isinstance(train_cluster_id, list):
    train_cluster_id = np.array(train_cluster_id)
  if (not isinstance(train_cluster_id, np.ndarray) or
      not train_cluster_id.dtype.name.startswith('str')):
    raise TypeError('train_cluster_id should be a numpy array of strings.')
  # check dimension
  if train_sequence.ndim != 2:
    raise ValueError('train_sequence must be 2-dim array.')
  if train_cluster_id.ndim != 1:
    raise ValueError('train_cluster_id must be 1-dim array.')
  # check length and size
  train_total_length, observation_dim = train_sequence.shape
  if observation_dim != self.observation_dim:
    raise ValueError('train_sequence does not match the dimension specified '
                     'by args.observation_dim.')
  if train_total_length != len(train_cluster_id):
    raise ValueError('train_sequence length is not equal to '
                     'train_cluster_id length.')

  self.rnn_model.train()
  optimizer = self._get_optimizer(optimizer=args.optimizer,
                                  learning_rate=args.learning_rate)

  sub_sequences, seq_lengths, transition_bias = utils.resize_sequence(
      sequence=train_sequence,
      cluster_id=train_cluster_id,
      num_permutations=args.num_permutations)
  if self.transition_bias is None:
    self.transition_bias = transition_bias
  # For batch learning, pack the entire dataset.
  if args.batch_size is None:
    packed_train_sequence, rnn_truth = utils.pack_sequence(
        sub_sequences,
        seq_lengths,
        args.batch_size,
        self.observation_dim,
        self.device)
  train_loss = []
  for t in range(args.train_iteration):
    # Update learning rate if half life is specified.
    if args.learning_rate_half_life > 0:
      if t > 0 and t % args.learning_rate_half_life == 0:
        optimizer.param_groups[0]['lr'] /= 2.0
        print('Changing learning rate to: {}'.format(
            optimizer.param_groups[0]['lr']))
    optimizer.zero_grad()
    # For online learning, pack a subset in each iteration.
    if args.batch_size is not None:
      packed_train_sequence, rnn_truth = utils.pack_sequence(
          sub_sequences,
          seq_lengths,
          args.batch_size,
          self.observation_dim,
          self.device)
    # Batch dimension is the full number of sub-sequences for batch learning
    # (batch_size is None), otherwise it is args.batch_size.
    num_rnn_sequences = (
        len(seq_lengths) if args.batch_size is None else args.batch_size)
    hidden = self.rnn_init_hidden.repeat(1, num_rnn_sequences, 1)
    mean, _ = self.rnn_model(packed_train_sequence, hidden)
    # use mean to predict: running average of the RNN output over time
    mean = torch.cumsum(mean, dim=0)
    mean_size = mean.size()
    mean = torch.mm(
        torch.diag(
            1.0 / torch.arange(1, mean_size[0] + 1).float().to(self.device)),
        mean.view(mean_size[0], -1))
    mean = mean.view(mean_size)

    # Likelihood part.
    loss1 = utils.weighted_mse_loss(
        input_tensor=(rnn_truth != 0).float() * mean[:-1, :, :],
        target_tensor=rnn_truth,
        weight=1 / (2 * self.sigma2))

    weight = (((rnn_truth != 0).float() * mean[:-1, :, :]
               - rnn_truth) ** 2).view(-1, observation_dim)
    num_non_zero = torch.sum((weight != 0).float(), dim=0).squeeze()
    loss2 = ((2 * args.sigma_alpha + num_non_zero + 2) /
             (2 * num_non_zero) * torch.log(self.sigma2)).sum() + (
                 args.sigma_beta / (self.sigma2 * num_non_zero)).sum()

    # regularization
    l2_reg = 0
    for param in self.rnn_model.parameters():
      l2_reg += torch.norm(param)
    loss3 = args.regularization_weight * l2_reg

    loss = loss1 + loss2 + loss3
    loss.backward()
    nn.utils.clip_grad_norm_(self.rnn_model.parameters(), 5.0)
    # nn.utils.clip_grad_norm_(self.sigma2, 1.0)
    optimizer.step()
    # avoid numerical issues
    self.sigma2.data.clamp_(min=1e-6)

    if np.remainder(t, 10) == 0:
      print('Iter: {:d} \t'
            'Training Loss: {:.4f} \n'
            '    Negative Log Likelihood: {:.4f}\t'
            'Sigma2 Prior: {:.4f}\t'
            'Regularization: {:.4f}'.format(
                t,
                float(loss.data),
                float(loss1.data),
                float(loss2.data),
                float(loss3.data)))
    train_loss.append(float(loss1.data))  # only save the likelihood part
  print('Done training with {} iterations'.format(args.train_iteration))
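# A minimal usage sketch for fit() (not from the original source): it builds
# a toy 2-dim float observation sequence with matching string cluster ids and
# a hypothetical args namespace containing only the fields referenced in the
# method above; the values are illustrative assumptions, not recommended
# settings. Real training would use uisrnn's argument parser and d-vector
# features instead.
import argparse

import numpy as np

toy_train_sequence = np.random.randn(5, 4).astype(float)  # N=5, D=4
toy_train_cluster_id = np.array(
    ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0'])
toy_args = argparse.Namespace(
    optimizer='adam',            # assumed value; see arguments.py
    learning_rate=1e-3,
    learning_rate_half_life=0,   # 0 disables the halving schedule
    train_iteration=20,
    batch_size=None,             # None packs the whole dataset once
    num_permutations=None,
    sigma_alpha=1.0,             # assumed prior hyperparameters
    sigma_beta=1.0,
    regularization_weight=1e-5)
# model = UISRNN(model_args)     # model construction is outside this section
# model.fit(toy_train_sequence, toy_train_cluster_id, toy_args)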
def fit(self, train_sequence, train_cluster_id, args):
  """Fit UISRNN model.

  Args:
    train_sequence: (real 2-dim numpy array, size: N by D)
      - the training d_vector sequence.
      N - summation of lengths of all utterances.
      D - observation dimension.
      For example,
        train_sequence =
        [[1.2 3.0 -4.1 6.0]    --> an entry of speaker #0 from utterance 'iaaa'
         [0.8 -1.1 0.4 0.5]    --> an entry of speaker #1 from utterance 'iaaa'
         [-0.2 1.0 3.8 5.7]    --> an entry of speaker #0 from utterance 'iaaa'
         [3.8 -0.1 1.5 2.3]    --> an entry of speaker #0 from utterance 'ibbb'
         [1.2 1.4 3.6 -2.7]]   --> an entry of speaker #0 from utterance 'ibbb'
      Here N=5, D=4.
      We concatenate all training utterances into a single sequence.
    train_cluster_id: (a vector of strings, size: N)
      - the speaker id sequence.
      For example,
        train_cluster_id =
        ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
      'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.
      Note that the order of entries within an utterance is preserved,
      and all utterances are simply concatenated together.
    args: Training configurations. See arguments.py for details.

  Raises:
    TypeError: If train_sequence or train_cluster_id is of wrong type.
    ValueError: If train_sequence has wrong dimension.
  """
  # check type before touching array attributes
  if type(train_sequence).__module__ != np.__name__:
    raise TypeError('train_sequence type should be a numpy array.')
  if type(train_cluster_id).__module__ != np.__name__:
    raise TypeError('train_cluster_id type should be a numpy array.')

  train_total_length, observation_dim = train_sequence.shape
  if observation_dim != self.observation_dim:
    raise ValueError('train_sequence does not match the dimension specified '
                     'by args.observation_dim.')
  if train_total_length != len(train_cluster_id):
    raise ValueError('train_sequence length is not equal to '
                     'train_cluster_id length.')

  self.rnn_model.train()
  optimizer = self._get_optimizer(optimizer=args.optimizer,
                                  learning_rate=args.learning_rate)

  sub_sequences, seq_lengths, transition_bias = utils.resize_sequence(
      sequence=train_sequence,
      cluster_id=train_cluster_id,
      num_permutations=args.num_permutations)
  num_clusters = len(seq_lengths)
  sorted_seq_lengths = np.sort(seq_lengths)[::-1]
  permute_index = np.argsort(seq_lengths)[::-1]
  if self.transition_bias is None:
    self.transition_bias = transition_bias
  if args.batch_size is None:
    # Packing sequences.
    rnn_input = np.zeros(
        (sorted_seq_lengths[0], num_clusters, self.observation_dim))
    for i in range(num_clusters):
      # Row 0 of each column stays zero as a start-of-sequence frame;
      # rows 1:L hold the actual sub-sequence.
      rnn_input[1:sorted_seq_lengths[i], i, :] = (
          sub_sequences[permute_index[i]])
    rnn_input = autograd.Variable(
        torch.from_numpy(rnn_input).float()).to(self.device)
    packed_train_sequence, rnn_truth = utils.pack_seq(
        rnn_input, sorted_seq_lengths)

  train_loss = []
  for t in range(args.train_iteration):
    optimizer.zero_grad()
    if args.batch_size is not None:
      mini_batch = np.sort(np.random.choice(num_clusters, args.batch_size))
      mini_batch_rnn_input = np.zeros(
          (sorted_seq_lengths[mini_batch[0]],
           args.batch_size,
           self.observation_dim))
      for i in range(args.batch_size):
        mini_batch_rnn_input[1:sorted_seq_lengths[mini_batch[i]], i, :] = (
            sub_sequences[permute_index[mini_batch[i]]])
      mini_batch_rnn_input = autograd.Variable(
          torch.from_numpy(mini_batch_rnn_input).float()).to(self.device)
      packed_train_sequence, rnn_truth = utils.pack_seq(
          mini_batch_rnn_input, sorted_seq_lengths[mini_batch])

    # Batch dimension is the full number of sub-sequences for batch learning
    # (batch_size is None), otherwise it is args.batch_size.
    num_rnn_sequences = (
        num_clusters if args.batch_size is None else args.batch_size)
    hidden = self.rnn_init_hidden.repeat(1, num_rnn_sequences, 1)
    mean, _ = self.rnn_model(packed_train_sequence, hidden)
    # use mean to predict: running average of the RNN output over time
    mean = torch.cumsum(mean, dim=0)
    mean_size = mean.size()
    mean = torch.mm(
        torch.diag(
            1.0 / torch.arange(1, mean_size[0] + 1).float().to(self.device)),
        mean.view(mean_size[0], -1))
    mean = mean.view(mean_size)

    # Likelihood part.
    loss1 = utils.weighted_mse_loss(
        input_tensor=(rnn_truth != 0).float() * mean[:-1, :, :],
        target_tensor=rnn_truth,
        weight=1 / (2 * self.sigma2))

    weight = (((rnn_truth != 0).float() * mean[:-1, :, :]
               - rnn_truth) ** 2).view(-1, observation_dim)
    num_non_zero = torch.sum((weight != 0).float(), dim=0).squeeze()
    loss2 = ((2 * args.sigma_alpha + num_non_zero + 2) /
             (2 * num_non_zero) * torch.log(self.sigma2)).sum() + (
                 args.sigma_beta / (self.sigma2 * num_non_zero)).sum()

    # regularization
    l2_reg = 0
    for param in self.rnn_model.parameters():
      l2_reg += torch.norm(param)
    loss3 = args.regularization_weight * l2_reg

    loss = loss1 + loss2 + loss3
    loss.backward()
    nn.utils.clip_grad_norm_(self.rnn_model.parameters(), 5.0)
    # nn.utils.clip_grad_norm_(self.sigma2, 1.0)
    optimizer.step()
    # avoid numerical issues
    self.sigma2.data.clamp_(min=1e-6)

    if np.remainder(t, 10) == 0:
      print('Iter: {:d} \t'
            'Training Loss: {:.4f} \n'
            '    Negative Log Likelihood: {:.4f}\t'
            'Sigma2 Prior: {:.4f}\t'
            'Regularization: {:.4f}'.format(
                t,
                float(loss.data),
                float(loss1.data),
                float(loss2.data),
                float(loss3.data)))
    train_loss.append(float(loss1.data))  # only save the likelihood part
  print('Done training with {} iterations'.format(args.train_iteration))
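# Illustrative sketch of the padding layout built in the older fit() variant
# above (not original code): sub-sequences are placed into a
# (max_length, num_clusters, D) array ordered by decreasing length, with row 0
# of every column left as zeros so the RNN sees a zero start-of-sequence frame
# before the first real observation. The lengths and sub-sequences below are
# toy values, not uisrnn internals.
import numpy as np

toy_sub_sequences = [np.ones((2, 4)), 2 * np.ones((1, 4))]  # lengths 2 and 1
toy_seq_lengths = np.array([3, 2])        # actual length + 1 per cluster
sorted_lengths = np.sort(toy_seq_lengths)[::-1]
order = np.argsort(toy_seq_lengths)[::-1]

toy_rnn_input = np.zeros((sorted_lengths[0], len(toy_sub_sequences), 4))
for i, cluster in enumerate(order):
  toy_rnn_input[1:sorted_lengths[i], i, :] = toy_sub_sequences[cluster]

print(toy_rnn_input[:, 0, 0])  # -> [0. 1. 1.]: zero frame, then the data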