def _update_beam_state(self, beam_state, look_ahead_seq, cluster_seq):
  """Update a beam state given a look ahead sequence and known cluster
  assignments.

  Args:
    beam_state: A BeamState object.
    look_ahead_seq: Look ahead sequence, size: look_ahead*D.
      look_ahead: number of steps to look ahead in the beam search.
      D: observation dimension
    cluster_seq: Cluster assignment sequence for look_ahead_seq.

  Returns:
    new_beam_state: An updated BeamState object.
  """
  loss = 0
  new_beam_state = BeamState(beam_state)
  for sub_idx, cluster in enumerate(cluster_seq):
    if cluster > len(new_beam_state.mean_set):  # invalid trace
      new_beam_state.neg_likelihood = float('inf')
      break
    elif cluster < len(new_beam_state.mean_set):  # existing cluster
      last_cluster = new_beam_state.trace[-1]
      # Gaussian observation term against this cluster's predicted mean.
      loss = loss_func.weighted_mse_loss(
          input_tensor=torch.squeeze(new_beam_state.mean_set[cluster]),
          target_tensor=look_ahead_seq[sub_idx, :],
          weight=1 / (2 * self.sigma2)).cpu().detach().numpy()
      # Speaker-change prior: stay with the same speaker, or switch to an
      # existing cluster weighted by its block count (CRP).
      if cluster == last_cluster:
        loss -= np.log(1 - self.transition_bias)
      else:
        loss -= np.log(self.transition_bias) + np.log(
            new_beam_state.block_counts[cluster]) - np.log(
                sum(new_beam_state.block_counts) + self.crp_alpha)
      # update new mean and new hidden
      mean, hidden = self.rnn_model(
          look_ahead_seq[sub_idx, :].unsqueeze(0).unsqueeze(0),
          new_beam_state.hidden_set[cluster])
      # Running mean of GRU outputs over all observations assigned to this
      # cluster so far; use mean to predict.
      new_beam_state.mean_set[cluster] = (
          new_beam_state.mean_set[cluster] *
          ((np.array(new_beam_state.trace) == cluster).sum() -
           1).astype(float) + mean.clone()) / (
               np.array(new_beam_state.trace) == cluster).sum().astype(float)
      new_beam_state.hidden_set[cluster] = hidden.clone()
      if cluster != last_cluster:
        new_beam_state.block_counts[cluster] += 1
      new_beam_state.trace.append(cluster)
    else:  # new cluster
      init_input = autograd.Variable(
          torch.zeros(self.observation_dim)
      ).unsqueeze(0).unsqueeze(0).to(self.device)
      mean, hidden = self.rnn_model(init_input, self.rnn_init_hidden)
      loss = loss_func.weighted_mse_loss(
          input_tensor=torch.squeeze(mean),
          target_tensor=look_ahead_seq[sub_idx, :],
          weight=1 / (2 * self.sigma2)).cpu().detach().numpy()
      # Prior of opening a new cluster, weighted by crp_alpha.
      loss -= np.log(self.transition_bias) + np.log(
          self.crp_alpha) - np.log(
              sum(new_beam_state.block_counts) + self.crp_alpha)
      # update new mean and new hidden
      mean, hidden = self.rnn_model(
          look_ahead_seq[sub_idx, :].unsqueeze(0).unsqueeze(0), hidden)
      new_beam_state.append(mean, hidden, cluster)
    new_beam_state.neg_likelihood += loss
  return new_beam_state
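
# --- Illustrative sketch, not part of the library ---
# The three branches in _update_beam_state above score each candidate cluster
# assignment with a speaker-change prior based on the Chinese restaurant
# process (CRP): stay with the same speaker, switch to an existing cluster,
# or open a new one. A minimal standalone sketch of just that prior term
# (the helper name and the toy values in the usage comment are hypothetical;
# assumes numpy is imported as np, as elsewhere in this module):
def _example_crp_neg_log_prior(cluster, last_cluster, block_counts,
                               transition_bias, crp_alpha):
  """Negative log prior of assigning the next segment to `cluster`.

  `cluster == len(block_counts)` denotes a brand-new cluster.
  """
  if cluster == last_cluster:
    # No speaker change.
    return -np.log(1 - transition_bias)
  if cluster < len(block_counts):
    # Speaker change to an existing cluster, weighted by its block count.
    return -(np.log(transition_bias) + np.log(block_counts[cluster]) -
             np.log(sum(block_counts) + crp_alpha))
  # Speaker change to a brand-new cluster, weighted by crp_alpha.
  return -(np.log(transition_bias) + np.log(crp_alpha) -
           np.log(sum(block_counts) + crp_alpha))
# With block_counts=[3, 1], transition_bias=0.2, crp_alpha=1.0:
#   _example_crp_neg_log_prior(0, 0, [3, 1], 0.2, 1.0) == -log(0.8)
#   _example_crp_neg_log_prior(1, 0, [3, 1], 0.2, 1.0) == -log(0.2 * 1 / 5)
#   _example_crp_neg_log_prior(2, 0, [3, 1], 0.2, 1.0) == -log(0.2 * 1 / 5)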
def fit_concatenated(self, train_sequence, train_cluster_id, args):
  """Fit UISRNN model to concatenated sequence and cluster_id.

  Args:
    train_sequence: the training observation sequence, which is a
      2-dim numpy array of real numbers, of size `N * D`.

      - `N`: summation of lengths of all utterances.
      - `D`: observation dimension.

      For example,
      ```
      train_sequence =
      [[1.2 3.0 -4.1 6.0]    --> an entry of speaker #0 from utterance 'iaaa'
       [0.8 -1.1 0.4 0.5]    --> an entry of speaker #1 from utterance 'iaaa'
       [-0.2 1.0 3.8 5.7]    --> an entry of speaker #0 from utterance 'iaaa'
       [3.8 -0.1 1.5 2.3]    --> an entry of speaker #0 from utterance 'ibbb'
       [1.2 1.4 3.6 -2.7]]   --> an entry of speaker #0 from utterance 'ibbb'
      ```
      Here `N=5`, `D=4`.

      We concatenate all training utterances into this single sequence.
    train_cluster_id: the speaker id sequence, which is a 1-dim list or
      numpy array of strings, of size `N`.
      For example,
      ```
      train_cluster_id =
        ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
      ```
      'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.

      Note that the order of entries within an utterance is preserved,
      and all utterances are simply concatenated together.
    args: Training configurations. See `arguments.py` for details.

  Raises:
    TypeError: If train_sequence or train_cluster_id is of wrong type.
    ValueError: If train_sequence or train_cluster_id has wrong dimension.
  """
  # check type
  if (not isinstance(train_sequence, np.ndarray) or
      train_sequence.dtype != float):
    raise TypeError('train_sequence should be a numpy array of float type.')
  if isinstance(train_cluster_id, list):
    train_cluster_id = np.array(train_cluster_id)
  if (not isinstance(train_cluster_id, np.ndarray) or
      not train_cluster_id.dtype.name.startswith(('str', 'unicode'))):
    raise TypeError('train_cluster_id should be a numpy array of strings.')
  # check dimension
  if train_sequence.ndim != 2:
    raise ValueError('train_sequence must be 2-dim array.')
  if train_cluster_id.ndim != 1:
    raise ValueError('train_cluster_id must be 1-dim array.')
  # check length and size
  train_total_length, observation_dim = train_sequence.shape
  if observation_dim != self.observation_dim:
    raise ValueError('train_sequence does not match the dimension specified '
                     'by args.observation_dim.')
  if train_total_length != len(train_cluster_id):
    raise ValueError('train_sequence length is not equal to '
                     'train_cluster_id length.')

  self.rnn_model.train()
  optimizer = self._get_optimizer(optimizer=args.optimizer,
                                  learning_rate=args.learning_rate)

  (sub_sequences,
   seq_lengths,
   transition_bias,
   transition_bias_denominator) = utils.resize_sequence(
       sequence=train_sequence,
       cluster_id=train_cluster_id,
       num_permutations=args.num_permutations)
  if self.estimate_transition_bias:
    if self.transition_bias is None:
      self.transition_bias = transition_bias
      self.transition_bias_denominator = transition_bias_denominator
    else:
      # Combine the new estimate with previous ones as a weighted average.
      self.transition_bias = (
          self.transition_bias * self.transition_bias_denominator +
          transition_bias * transition_bias_denominator) / (
              self.transition_bias_denominator + transition_bias_denominator)
      self.transition_bias_denominator += transition_bias_denominator

  # For batch learning, pack the entire dataset.
  if args.batch_size is None:
    packed_train_sequence, rnn_truth = utils.pack_sequence(
        sub_sequences,
        seq_lengths,
        args.batch_size,
        self.observation_dim,
        self.device)
  train_loss = []
  for num_iter in range(args.train_iteration):
    optimizer.zero_grad()
    # For online learning, pack a subset in each iteration.
    if args.batch_size is not None:
      packed_train_sequence, rnn_truth = utils.pack_sequence(
          sub_sequences,
          seq_lengths,
          args.batch_size,
          self.observation_dim,
          self.device)
    hidden = self.rnn_init_hidden.repeat(1, args.batch_size, 1)
    mean, _ = self.rnn_model(packed_train_sequence, hidden)
    # use mean to predict: average the GRU outputs over steps 1..t
    mean = torch.cumsum(mean, dim=0)
    mean_size = mean.size()
    mean = torch.mm(
        torch.diag(1.0 / torch.arange(1, mean_size[0] + 1).float().to(
            self.device)),
        mean.view(mean_size[0], -1))
    mean = mean.view(mean_size)

    # Likelihood part; (rnn_truth != 0) masks the zero-padded entries.
    loss1 = loss_func.weighted_mse_loss(
        input_tensor=(rnn_truth != 0).float() * mean[:-1, :, :],
        target_tensor=rnn_truth,
        weight=1 / (2 * self.sigma2))

    # Sigma2 prior part.
    weight = (((rnn_truth != 0).float() * mean[:-1, :, :] -
               rnn_truth)**2).view(-1, observation_dim)
    num_non_zero = torch.sum((weight != 0).float(), dim=0).squeeze()
    loss2 = loss_func.sigma2_prior_loss(
        num_non_zero, args.sigma_alpha, args.sigma_beta, self.sigma2)

    # Regularization part.
    loss3 = loss_func.regularization_loss(
        self.rnn_model.parameters(), args.regularization_weight)

    loss = loss1 + loss2 + loss3
    loss.backward()
    nn.utils.clip_grad_norm_(self.rnn_model.parameters(), args.grad_max_norm)
    optimizer.step()
    # avoid numerical issues
    self.sigma2.data.clamp_(min=1e-6)

    if (np.remainder(num_iter, 10) == 0 or
        num_iter == args.train_iteration - 1):
      self.logger.print(
          2,
          'Iter: {:d} \t'
          'Training Loss: {:.4f} \n'
          '    Negative Log Likelihood: {:.4f}\t'
          'Sigma2 Prior: {:.4f}\t'
          'Regularization: {:.4f}'.format(
              num_iter,
              float(loss.data),
              float(loss1.data),
              float(loss2.data),
              float(loss3.data)))
    train_loss.append(float(loss1.data))  # only save the likelihood part

  self.logger.print(
      1, 'Done training with {} iterations'.format(args.train_iteration))
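
# --- Illustrative sketch, not part of the library ---
# In the training loop above, the prediction at step t is the running average
# of the GRU outputs over steps 1..t: a cumulative sum along time, divided by
# the step index (implemented there as a diagonal 1/t matrix multiply). A
# minimal standalone equivalent for a (time, batch, dim) tensor; the helper
# name is hypothetical:
def _example_running_mean(outputs):
  """Average outputs[0..t] at each step t; output keeps the same shape."""
  steps = torch.arange(
      1, outputs.size(0) + 1, dtype=outputs.dtype,
      device=outputs.device).view(-1, 1, 1)
  return torch.cumsum(outputs, dim=0) / steps
# e.g. _example_running_mean(torch.arange(3.).view(3, 1, 1)) gives
# [[[0.0]], [[0.5]], [[1.0]]], matching the diag(1/t) @ cumsum formulation.

# A hedged usage sketch for fit_concatenated, following the package-level API
# shown in the project README (uisrnn.parse_arguments, uisrnn.UISRNN); the
# toy data shapes match the docstring example above:
#
#   model_args, training_args, _ = uisrnn.parse_arguments()
#   model = uisrnn.UISRNN(model_args)
#   train_sequence = np.random.randn(5, model_args.observation_dim)
#   train_cluster_id = np.array(
#       ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0'])
#   model.fit_concatenated(train_sequence, train_cluster_id, training_args)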