def evaluate(in_session, in_model, in_dataset, batch_size=64):
    """Run a forward-only pass of a VAE seq2seq model over ``in_dataset``.

    Collects per-batch KL / NLL losses and the KL annealing weight, prints 10
    randomly sampled decoded sequences (truncated at EOS), and returns a dict
    with mean losses, the combined loss ``mean(kl * kl_w + nll)`` and its
    exponential as perplexity.
    """
    batch_gen = batch_generator(in_dataset, batch_size)
    kl_losses = []
    nll_losses = []
    kl_w = []
    outputs = []
    for batch_enc_input, batch_dec_input, batch_dec_output in batch_gen:
        # forward_only=True: no parameter update, just losses + decoded outputs
        loss_dict, batch_outputs = in_model.step(batch_enc_input,
                                                 batch_dec_input,
                                                 batch_dec_output,
                                                 in_session,
                                                 forward_only=True)
        kl_losses.append(loss_dict['kl_loss'])
        nll_losses.append(loss_dict['nll_loss'])
        kl_w.append(loss_dict['kl_w'])
        outputs += list(batch_outputs)
    print('10 random eval sequences:')
    # sampled with replacement; assumes at least one batch was produced
    random_idx = np.random.choice(range(len(outputs)), size=10)
    for idx in random_idx:
        output = list(outputs[idx])
        # cut the decoded sequence at the first end-of-sequence token
        if EOS in output:
            output = output[:output.index(EOS)]
        print(' '.join(output))
    # combined VAE objective: KL term annealed by kl_w plus reconstruction NLL
    loss = np.mean(np.array(kl_losses) * np.array(kl_w) + np.array(nll_losses))
    ppx = np.exp(loss)
    return {
        'nll_loss': np.mean(nll_losses),
        'kl_loss': np.mean(kl_losses),
        'loss': loss,
        'perplexity': ppx,
        # NOTE(review): loss_dict leaks from the loop — this is the LAST
        # batch's kl_w only; presumably fine since kl_w is a global annealing
        # schedule value, but verify against the model implementation.
        'kl_w': loss_dict['kl_w']
    }
def train(in_session, in_model, in_train, in_dev, in_dst_folder, nb_epoch, batch_size, early_stopping_threshold, **kwargs):
    """Train ``in_model`` with per-epoch dev evaluation, checkpointing on new
    best dev loss, and early stopping after ``early_stopping_threshold``
    consecutive epochs without improvement."""
    lowest_dev_loss = np.inf
    stale_epochs = 0
    for epoch_idx in range(nb_epoch):
        # one optimizer step per batch; collect the reported training losses
        epoch_losses = [
            in_model.step(enc_inp, dec_out, in_session)
            for enc_inp, dec_out in batch_generator(in_train, batch_size)
        ]
        print('Epoch {} out of {} results'.format(epoch_idx, nb_epoch))
        print('train loss: {:.3f}'.format(np.mean(epoch_losses)))
        dev_eval = evaluate(in_session, in_model, in_dev)
        print('; '.join('dev {}: {:.3f}'.format(key, value)
                        for key, value in dev_eval.items()))
        if dev_eval['loss'] < lowest_dev_loss:
            # new best model: persist a checkpoint and reset the patience counter
            lowest_dev_loss = dev_eval['loss']
            in_model.save(in_dst_folder, in_session)
            print('New best loss. Saving checkpoint')
            stale_epochs = 0
        else:
            stale_epochs += 1
            if early_stopping_threshold < stale_epochs:
                print('Early stopping after {} epochs'.format(epoch_idx))
                break
    print('Optimization Finished!')
def train(self):
    """Train ``self.net`` with random-input (OOD) augmentation.

    With probability ``random_input_prob`` per turn, replaces the first
    dialog's turn inputs with a randomly drawn pre-built "random input" and
    relabels the target action as UNK, teaching the model a backoff behavior.
    Evaluates on train/dev each epoch, checkpoints on best dev accuracy and
    early-stops on a patience threshold.
    """
    print('\n:: training started\n')
    epochs = self.config['epochs']
    best_dev_accuracy = 0.0
    epochs_without_improvement = 0
    random_input_prob = self.config.get('random_input_prob', 0.0)
    unk_action_id = self.action_templates.index(UNK)
    for j in range(epochs):
        batch_gen = batch_generator(self.data_train, self.batch_size)
        for batch in batch_gen:
            # copy so augmentation never mutates the underlying dataset
            batch_copy = [np.copy(elem) for elem in batch]
            enc_inp, dec_inp, dec_out, bow_out, context_features, action_masks, y = batch_copy
            # number of real (non-padding) turns: labels equal to 0 are padding
            num_turns = np.sum(np.vectorize(lambda x: x != 0)(y))
            for idx in range(num_turns):
                if np.random.random() < random_input_prob:
                    random_input_idx = np.random.choice(
                        range(self.random_input[0].shape[0]))
                    random_input = [
                        random_input_i[random_input_idx]
                        for random_input_i in self.random_input
                    ]
                    # NOTE(review): only element [0] of each batch component is
                    # augmented — presumably batch size is 1 dialog; confirm.
                    enc_inp[0][idx], dec_inp[0][idx], dec_out[0][
                        idx], bow_out[0][idx] = random_input
                    y[0][idx] = unk_action_id
            batch_loss_dict, lr = self.net.train_step(
                enc_inp, dec_inp, dec_out, bow_out, context_features,
                action_masks, y)
        # evaluate every epoch
        train_accuracy, train_mean_losses = evaluate(self.net,
                                                     self.data_train,
                                                     runs_number=1)
        train_loss_report = ' '.join([
            '{}: {:.3f}'.format(key, value)
            for key, value in train_mean_losses.items()
        ])
        dev_accuracy, dev_mean_losses = evaluate(self.net,
                                                 self.data_dev,
                                                 runs_number=1)
        dev_loss_report = ' '.join([
            '{}: {:.3f}'.format(key, value)
            for key, value in dev_mean_losses.items()
        ])
        print(
            ':: {}@lr={:.5f} || trn accuracy {:.3f} {} || dev accuracy {:.3f} {}'
            .format(j + 1, lr, train_accuracy, train_loss_report, dev_accuracy,
                    dev_loss_report))
        if best_dev_accuracy < dev_accuracy:
            print('New best dev accuracy. Saving checkpoint')
            self.net.save(self.model_folder)
            best_dev_accuracy = dev_accuracy
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            # early stopping on patience exhaustion
            if self.config[
                    'early_stopping_threshold'] < epochs_without_improvement:
                print(
                    'Finished after {} epochs due to early stopping'.format(j))
                break
def evaluate(in_session, in_model, in_dataset, batch_size=64):
    """Predict over (X, masks), print the confusion matrix against ``labels``
    and return the accuracy score."""
    X, masks, labels = in_dataset
    # gather per-batch predictions into one flat list
    predictions = [
        pred
        for batch in batch_generator((X, masks), batch_size)
        for pred in in_model.predict(batch, in_session).tolist()
    ]
    print(confusion_matrix(labels, predictions))
    return accuracy_score(labels, predictions)
def evaluate(in_session, in_model, in_dataset, batch_size=64):
    """Batch-predict over ``eval_data``, print the confusion matrix against
    ``labels`` and return the accuracy score."""
    eval_data, labels = in_dataset
    all_predictions = []
    for chunk in batch_generator(eval_data, batch_size):
        all_predictions.extend(in_model.predict(chunk, in_session))
    print(confusion_matrix(labels, all_predictions))
    return accuracy_score(labels, all_predictions)
def evaluate(in_model, in_dataset, batch_size=64):
    """Predict over all components of ``in_dataset`` except the last (the
    labels), print the confusion matrix and return accuracy."""
    data = in_dataset[:-1]
    labels = in_dataset[-1]
    collected = []
    for chunk in batch_generator(data, batch_size):
        collected.extend(in_model.predict(chunk))
    print(confusion_matrix(labels, collected))
    return accuracy_score(labels, collected)
def predict(in_session, in_model, in_dataset, batch_size=64):
    """Return per-example (losses, predictions) from ``predict_loss``;
    ``labels`` from the dataset are unpacked but not used here."""
    X, masks, labels = in_dataset
    losses = []
    predictions = []
    for batch_x, batch_masks in batch_generator((X, masks), batch_size):
        # predict_loss returns a (losses, predictions) pair of arrays
        batch_losses, batch_preds = in_model.predict_loss(
            batch_x, batch_masks, in_session)
        losses.extend(batch_losses.tolist())
        predictions.extend(batch_preds.tolist())
    return losses, predictions
def evaluate_advanced(in_model, in_dataset, in_dialog_indices, in_action_templates, ignore_ood_accuracy=False):
    """Evaluate per-turn accuracy, overall and restricted to turns occurring
    after the first OOD (backoff) action within each dialog.

    Returns a dict with the average loss, correct/total turn counts, and
    correct/total counts for turns after the first OOD turn. When
    ``ignore_ood_accuracy`` is True, OOD (backoff-action) turns are excluded
    from the counts.
    """
    if BABI_CONFIG['backoff_utterance'].lower() in in_action_templates:
        # BUGFIX: was `babi_config[...]` — an undefined name (NameError); the
        # module-level constant is BABI_CONFIG, as used in the test above.
        backoff_action = in_action_templates.index(
            BABI_CONFIG['backoff_utterance'].lower())
    else:
        backoff_action = UNK_ID
    X, action_masks, sequence_masks, y = in_dataset
    losses, predictions = [], []
    batch_gen = batch_generator([X, action_masks, sequence_masks, y],
                                64,
                                verbose=False)
    for batch_x, batch_action_masks, batch_sequence_masks, batch_y in batch_gen:
        batch_predictions, batch_losses = in_model.forward(
            batch_x, batch_action_masks, batch_sequence_masks, batch_y)
        losses += list(batch_losses)
        predictions += batch_predictions.tolist()
    # strip padding turns, keeping gold/predicted sequences aligned per dialog
    y_true_dialog, y_pred_dialog = [], []
    for y_true_i, y_pred_i in zip(y, predictions):
        y_true_dialog_i, y_pred_dialog_i = [], []
        for y_true_i_j, y_pred_i_j in zip(y_true_i, y_pred_i):
            if y_true_i_j != PAD_ID:
                y_true_dialog_i.append(y_true_i_j)
                y_pred_dialog_i.append(y_pred_i_j)
        y_true_dialog.append(y_true_dialog_i)
        y_pred_dialog.append(y_pred_dialog_i)
    total_turns = 0
    correct_turns = 0
    total_turns_after_ood = 0
    correct_turns_after_ood = 0
    for y_true, y_pred in zip(y_true_dialog, y_pred_dialog):
        ood_occurred = False
        for y_i_true, y_i_pred in zip(y_true, y_pred):
            # an OOD turn only counts when ignore_ood_accuracy is False
            current_turn_counts = not (y_i_true == backoff_action
                                       and ignore_ood_accuracy)
            total_turns += int(current_turn_counts)
            correct_turns += int(y_i_true == y_i_pred and current_turn_counts)
            if ood_occurred:
                total_turns_after_ood += int(current_turn_counts)
                correct_turns_after_ood += int(y_i_true == y_i_pred
                                               and current_turn_counts)
            # flag set AFTER counting, so the first OOD turn itself is not
            # included in the "after OOD" statistics
            if y_i_true == backoff_action:
                ood_occurred = True
    return {
        'avg_loss': np.mean(losses),
        'correct_turns': correct_turns,
        'total_turns': total_turns,
        'correct_turns_after_ood': correct_turns_after_ood,
        'total_turns_after_ood': total_turns_after_ood
    }
def train(self):
    """Train ``self.net`` on dropped-out batches with per-epoch evaluation.

    Each epoch: applies ``self.drop_out_batch`` to a copy of every training
    batch and takes one optimizer step per batch; evaluates on train/dev and
    on the noisy test set (overall / after-first-OOD / post-OOD / OOD
    accuracies); checkpoints on a new best dev accuracy and early-stops after
    ``early_stopping_threshold`` epochs without improvement.
    """
    print('\n:: training started\n')
    epochs = self.config['epochs']
    best_dev_accuracy = 0.0
    epochs_without_improvement = 0
    for j in range(epochs):
        losses = []
        batch_gen = batch_generator(self.data_train, self.batch_size)
        for batch in batch_gen:
            # copy so the dropout augmentation never mutates the dataset
            batch_copy = [np.copy(elem) for elem in batch]
            dropped_out_batch = self.drop_out_batch(batch_copy)
            batch_loss_dict, lr = self.net.train_step(*dropped_out_batch)
        # evaluate every epoch
        train_accuracy, train_loss_dict = evaluate(self.net, self.data_train)
        train_loss_report = ' '.join(['{}: {:.3f}'.format(key, value)
                                      for key, value in train_loss_dict.items()])
        dev_accuracy, dev_loss_dict = evaluate(self.net, self.data_dev, runs_number=3)
        dev_loss_report = ' '.join(['{}: {:.3f}'.format(key, value)
                                    for key, value in dev_loss_dict.items()])
        print(':: {}@lr={:.5f} || trn accuracy {:.3f} {} || dev accuracy {:.3f} {}'.format(j + 1, lr, train_accuracy, train_loss_report, dev_accuracy, dev_loss_report))
        eval_stats_noisy = evaluate_advanced(self.net,
                                             self.data_test,
                                             self.action_templates,
                                             BABI_CONFIG['backoff_utterance'].lower(),
                                             post_ood_turns=self.post_ood_turns_noisy,
                                             runs_number=1)
        print('\n\n')
        print('Noisy dataset: {} turns overall, {} turns after the first OOD'.format(eval_stats_noisy['total_turns'], eval_stats_noisy['total_turns_after_ood']))
        print('Accuracy:')
        # guard each ratio against empty denominators
        accuracy = eval_stats_noisy['correct_turns'] / eval_stats_noisy['total_turns']
        accuracy_after_ood = eval_stats_noisy['correct_turns_after_ood'] / eval_stats_noisy['total_turns_after_ood'] \
            if eval_stats_noisy['total_turns_after_ood'] != 0 \
            else 0
        accuracy_post_ood = eval_stats_noisy['correct_post_ood_turns'] / eval_stats_noisy['total_post_ood_turns'] \
            if eval_stats_noisy['total_post_ood_turns'] != 0 \
            else 0
        accuracy_ood = eval_stats_noisy['correct_ood_turns'] / eval_stats_noisy['total_ood_turns'] \
            if eval_stats_noisy['total_ood_turns'] != 0 \
            else 0
        # BUGFIX: this literal was broken in two by a stray newline, which is a
        # syntax error in a single-quoted string; restored from the identical
        # report line in the sibling trainer.
        print('overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}; OOD: {:.3f}'.format(accuracy, accuracy_after_ood, accuracy_post_ood, accuracy_ood))
        if best_dev_accuracy < dev_accuracy:
            print('New best dev accuracy. Saving checkpoint')
            self.net.save(self.model_folder)
            best_dev_accuracy = dev_accuracy
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if self.config['early_stopping_threshold'] < epochs_without_improvement:
                print('Finished after {} epochs due to early stopping'.format(j))
                break
def get_loss_stats(in_model, in_dataset, in_session, batch_size=64):
    """Collect per-example forward-only losses over the dataset and return
    their max / min / mean as a dict."""
    all_losses = []
    for minibatch in batch_generator(in_dataset, batch_size):
        # forward_only=True: evaluation pass, no parameter update
        minibatch_losses, _outputs = in_model.step(*minibatch,
                                                   in_session,
                                                   forward_only=True)
        all_losses.extend(minibatch_losses)
    return {
        'max': np.max(all_losses),
        'min': np.min(all_losses),
        'avg': np.mean(all_losses)
    }
def train(in_session, in_model, in_train, in_dev, in_dst_folder, nb_epoch, batch_size, early_stopping_threshold, dropout_keep_prob, **kwargs):
    """Train a VAE seq2seq model with decoder-input word dropout.

    Each epoch shuffles the training components, replaces decoder-input tokens
    (except the first/GO token of each sequence) with UNK_ID with probability
    ``1 - dropout_keep_prob``, tracks NLL/KL losses and the KL annealing
    weight, evaluates on dev, checkpoints on best dev loss and early-stops.
    """
    best_dev_loss = np.inf
    epochs_without_improvement = 0
    for epoch_counter in range(nb_epoch):
        # NOTE(review): each component of in_train is shuffled independently,
        # which would destroy example alignment unless seeding keeps them in
        # sync — verify against random.shuffle usage elsewhere in the project.
        [random.shuffle(train_i) for train_i in in_train]
        batch_gen = batch_generator(in_train, batch_size, verbose=False)
        train_nll_losses = []
        train_kl_losses = []
        train_kl_w = []
        for idx, (batch_enc_input, batch_dec_input,
                  batch_dec_output) in enumerate(batch_gen):
            # word dropout on all decoder-input positions except column 0
            dec_input_dropped = [
                token if random.random() < dropout_keep_prob else UNK_ID
                for token in batch_dec_input[:, 1:].flatten()
            ]
            # re-attach the untouched first column and restore the 2-D shape
            batch_dec_input_dropped = np.concatenate([
                np.expand_dims(batch_dec_input[:, 0], axis=-1),
                np.array(dec_input_dropped).reshape(
                    batch_dec_input.shape[0], batch_dec_input.shape[1] - 1)
            ],
                                                     axis=1)
            loss_dict = in_model.step(batch_enc_input,
                                      batch_dec_input_dropped,
                                      batch_dec_output, in_session)
            train_nll_losses.append(loss_dict['nll_loss'])
            train_kl_losses.append(loss_dict['kl_loss'])
            train_kl_w.append(loss_dict['kl_w'])
        # combined objective: reconstruction NLL plus annealed KL term
        train_loss = np.mean(
            np.array(train_nll_losses) +
            np.array(train_kl_losses) * np.array(train_kl_w))
        print('Epoch {} out of {} results'.format(epoch_counter, nb_epoch))
        print(
            'train loss: {:.3f} | nll_loss: {:.3f} | kl_loss: {:.3f} | kl_w: {:.5f}'
            .format(train_loss, np.mean(train_nll_losses),
                    np.mean(train_kl_losses), loss_dict['kl_w']))
        dev_eval = evaluate(in_session, in_model, in_dev)
        print(
            'dev loss: {:.3f} | nll_loss: {:.3f} | kl_loss: {:.3f} | kl_w: {:.5f}'
            .format(dev_eval['loss'], dev_eval['nll_loss'],
                    dev_eval['kl_loss'], dev_eval['kl_w']))
        if dev_eval['loss'] < best_dev_loss:
            # new best model: persist and reset the patience counter
            best_dev_loss = dev_eval['loss']
            in_model.save(in_dst_folder, in_session)
            print('New best loss. Saving checkpoint')
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if early_stopping_threshold < epochs_without_improvement:
                print('Early stopping after {} epochs'.format(epoch_counter))
                break
    print('Optimization Finished!')
def get_kl_loss_stats(in_model, in_dataset, in_session, batch_size=64):
    """Run forward-only evaluation and summarize the per-example KL losses
    (max / min / mean)."""
    kl_values = []
    for enc_batch, dec_in_batch, dec_out_batch in batch_generator(in_dataset, batch_size):
        # evaluation pass only — no weight updates
        step_losses, _ = in_model.step(enc_batch,
                                       dec_in_batch,
                                       dec_out_batch,
                                       in_session,
                                       forward_only=True)
        kl_values.extend(step_losses['kl_loss'].tolist())
    return {
        'max': np.max(kl_values),
        'min': np.min(kl_values),
        'avg': np.mean(kl_values)
    }
def evaluate(in_model, in_dataset):
    """Return (per-turn accuracy over non-padding turns, mean loss).

    Accuracy counts a turn as correct when the prediction matches gold and the
    gold label is not PAD_ID; the denominator is the number of non-zero gold
    labels (padding is assumed to be 0 — TODO confirm PAD_ID == 0).
    """
    X, action_masks, sequence_masks, y = in_dataset
    losses, predictions = [], []
    batch_gen = batch_generator([X, action_masks, sequence_masks, y],
                                64,
                                verbose=False)
    for batch_x, batch_action_masks, batch_sequence_masks, batch_y in batch_gen:
        batch_predictions, batch_losses = in_model.forward(
            batch_x, batch_action_masks, batch_sequence_masks, batch_y)
        losses += batch_losses.tolist()
        predictions += batch_predictions.flatten().tolist()
    # BUGFIX: was `np.sum(map(...))` — in Python 3 np.sum does not consume a
    # map iterator (it becomes a 0-d object array), so the denominator was
    # never the intended count. Vectorized comparison gives it directly; the
    # sibling evaluate_single_run wraps the map in list() for the same reason.
    num_real_turns = np.sum(y.flatten() != 0)
    correct = sum([
        gold_i == pred_i and gold_i != PAD_ID
        for gold_i, pred_i in zip(y.flatten(), predictions)
    ])
    return (correct / num_real_turns, np.mean(losses))
def get_loss_stats(in_model, in_dataset, loss_components=('kl_loss', 'nll_loss'), batch_size=64):
    """Summarize the summed loss components over all batches.

    For every batch, runs a forward-only step and sums the values of the
    requested ``loss_components`` from the returned loss dict; returns the
    max / min / mean of those per-batch sums.

    BUGFIX: the default for ``loss_components`` was a mutable list — replaced
    with a tuple (backward compatible; any iterable of keys still works).
    """
    batch_gen = batch_generator(in_dataset, batch_size)
    losses = []
    for batch in batch_gen:
        # evaluation pass only — no parameter update
        loss_dict = in_model.step(*batch, forward_only=True)
        cumulative_loss = np.sum(
            [loss_dict[component] for component in loss_components])
        losses.append(cumulative_loss)
    return {
        'max': np.max(losses),
        'min': np.min(losses),
        'avg': np.mean(losses)
    }
def evaluate_single_run(in_model, in_dataset):
    """Single evaluation pass: returns (per-turn accuracy over non-padding
    turns, dict of mean loss components)."""
    loss_accumulator = defaultdict(list)
    flat_predictions = []
    for batch in batch_generator(in_dataset, 64):
        batch_predictions, batch_loss_dict = in_model.forward(*batch)
        # accumulate every reported loss component separately
        for name, values in batch_loss_dict.items():
            loss_accumulator[name].extend(values.tolist())
        flat_predictions.extend(batch_predictions.flatten().tolist())
    gold = in_dataset[-2]
    gold_flat = gold.flatten()
    # correct = matching prediction on a non-padding turn
    num_correct = sum(g == p and g != PAD_ID
                      for g, p in zip(gold_flat, flat_predictions))
    # denominator: non-zero gold labels (padding assumed to be label 0)
    num_real = np.sum(list(map(lambda x: int(x != 0), gold_flat)))
    mean_losses = {name: np.mean(values)
                   for name, values in loss_accumulator.items()}
    return (num_correct / num_real, mean_losses)
def evaluate(in_model, in_dataset, batch_size=64):
    """Forward-only evaluation of a VAE model with a bag-of-words loss.

    Collects per-example kl / nll / bow losses and the KL annealing weight
    across all batches; returns their means plus the combined loss
    ``mean(kl * kl_w + nll)``.
    """
    batch_gen = batch_generator(in_dataset, batch_size)
    kl_losses = []
    nll_losses = []
    kl_w = []
    bow_losses = []
    for batch in batch_gen:
        # evaluation pass only — no parameter update
        loss_dict = in_model.step(*batch, forward_only=True)
        kl_losses += loss_dict['kl_loss'].tolist()
        nll_losses += loss_dict['nll_loss'].tolist()
        kl_w += loss_dict['kl_w'].tolist()
        bow_losses += loss_dict['bow_loss'].tolist()
    # combined objective: annealed KL term plus reconstruction NLL
    loss = np.mean(np.array(kl_losses) * np.array(kl_w) + np.array(nll_losses))
    return {
        'loss': loss,
        'nll_loss': np.mean(nll_losses),
        'kl_loss': np.mean(kl_losses),
        # BUGFIX: previously np.mean(loss_dict['kl_w']) — the loop-leaked
        # loss_dict covered only the LAST batch, while the kl_w list collected
        # above covers every batch (and is what the combined loss uses).
        'kl_w': np.mean(kl_w),
        'bow_loss': np.mean(bow_losses)
    }
def evaluate(in_session, in_model, in_dataset, batch_size=64):
    """Forward-only evaluation: prints 10 random decoded sequences (truncated
    at EOS) and returns {'loss': mean loss, 'perplexity': exp(loss)}."""
    collected_losses = []
    collected_outputs = []
    for encoder_batch, decoder_batch in batch_generator(in_dataset, batch_size):
        # evaluation pass only — no weight update
        step_losses, step_outputs = in_model.step(encoder_batch,
                                                  decoder_batch,
                                                  in_session,
                                                  forward_only=True)
        collected_losses.extend(step_losses)
        collected_outputs.extend(step_outputs)
    print('10 random eval sequences:')
    for sample_idx in np.random.choice(range(len(collected_outputs)), size=10):
        sequence = list(collected_outputs[sample_idx])
        # truncate the decoded sequence at the first end-of-sequence marker
        if EOS in sequence:
            sequence = sequence[:sequence.index(EOS)]
        print(' '.join(sequence))
    mean_loss = np.mean(collected_losses)
    return {'loss': mean_loss, 'perplexity': np.exp(mean_loss)}
def train(in_model, in_train, in_dev, in_config, in_model_folder):
    """Train a VAE model, printing train/dev loss components each epoch plus
    5 sample reconstructions from the dev set; checkpoints on new best dev
    loss and early-stops on ``early_stopping_threshold`` stale epochs."""
    best_dev_loss = np.inf
    epochs_without_improvement = 0
    for epoch in range(in_config['num_epoch']):
        print("Epoch [%d/%d]" % (epoch + 1, in_config['num_epoch']))
        batch_gen = batch_generator(in_train, in_config['batch_size'])
        for batch in batch_gen:
            in_model.step(*batch)
        trn_loss_dict = evaluate(in_model, in_train)
        print(
            "trn results | nll_loss:%.3f | kl_w:%.3f | kl_loss:%.3f | bow_loss:%.3f"
            % (trn_loss_dict['nll_loss'], trn_loss_dict['kl_w'],
               trn_loss_dict['kl_loss'], trn_loss_dict['bow_loss']))
        dev_loss_dict = evaluate(in_model, in_dev)
        print(
            "dev results | nll_loss:%.3f | kl_w:%.3f | kl_loss:%.3f | bow_loss:%.3f"
            % (dev_loss_dict['nll_loss'], dev_loss_dict['kl_w'],
               dev_loss_dict['kl_loss'], dev_loss_dict['bow_loss']))
        # BUGFIX: was np.arange(len(in_dev)) — in_dev is a tuple of 4 parallel
        # arrays (see the unpack below), so len(in_dev) == 4 and only samples
        # 0..3 could ever be shown; sample over the number of examples instead.
        for idx in np.random.choice(np.arange(len(in_dev[0])), size=5):
            encoder_input, decoder_input, decoder_output, bow_output = [
                dev_i[idx] for dev_i in in_dev
            ]
            encoder_input = ' '.join(
                [in_model.rev_vocab[word] for word in encoder_input])
            in_model.customized_reconstruct(encoder_input)
        resulting_dev_loss = dev_loss_dict['loss']
        if resulting_dev_loss < best_dev_loss:
            print('New best dev loss! Saving checkpoint')
            best_dev_loss = resulting_dev_loss
            in_model.save(in_model_folder)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement == in_config['early_stopping_threshold']:
                print('Stopped after {} epochs due to no loss improvement'.format(
                    epoch + 1))
                break
def train(self):
    """Train ``self.net`` with random-input (OOD) augmentation of X / y /
    prev_action.

    With probability ``random_input_prob`` per turn, swaps in a random
    pre-built input, relabels the action as UNK and propagates UNK into the
    next turn's prev_action feature. Evaluates on train/dev and the noisy
    test set each epoch; checkpoints on best dev accuracy, early-stops on a
    patience threshold.
    """
    print('\n:: training started\n')
    epochs = self.config['epochs']
    best_dev_accuracy = 0.0
    epochs_without_improvement = 0
    random_input_prob = self.config.get('random_input_prob', 0.0)
    unk_action_id = self.action_templates.index(UNK)
    for j in range(epochs):
        losses = []
        batch_gen = batch_generator([self.X_train, self.context_features_train, self.action_masks_train, self.prev_action_train, self.y_train], self.batch_size)
        for batch in batch_gen:
            # copy so augmentation never mutates the underlying dataset
            batch_copy = [np.copy(elem) for elem in batch]
            X, context_features, action_masks, prev_action, y = batch_copy
            # number of real (non-padding) turns: labels equal to 0 are padding
            num_turns = np.sum(np.vectorize(lambda x: x != 0)(y))
            for idx in range(num_turns):
                if np.random.random() < random_input_prob:
                    random_input_idx = np.random.choice(range(self.random_input[0].shape[0]))
                    random_input = [random_input_i[random_input_idx] for random_input_i in self.random_input]
                    X[0][idx] = random_input[0]
                    y[0][idx] = unk_action_id
                    # next turn sees UNK as the previous action too
                    if idx + 1 < num_turns:
                        prev_action[0][idx + 1] = unk_action_id
            batch_loss_dict, lr = self.net.train_step(X, context_features, action_masks, prev_action, y)
        # evaluate every epoch
        train_accuracy, train_loss_dict = evaluate(self.net, (self.X_train, self.context_features_train, self.action_masks_train, self.prev_action_train, self.y_train))
        train_loss_report = ' '.join(['{}: {:.3f}'.format(key, value) for key, value in train_loss_dict.items()])
        dev_accuracy, dev_loss_dict = evaluate(self.net, (self.X_dev, self.context_features_dev, self.action_masks_dev, self.prev_action_dev, self.y_dev))
        dev_loss_report = ' '.join(['{}: {:.3f}'.format(key, value) for key, value in dev_loss_dict.items()])
        print(':: {}@lr={:.5f} || trn accuracy {:.3f} {} || dev accuracy {:.3f} {}'.format(j + 1, lr, train_accuracy, train_loss_report, dev_accuracy, dev_loss_report))
        eval_stats_noisy = evaluate_advanced(self.net, (self.X_test, self.context_features_test, self.action_masks_test, self.prev_action_test, self.y_test), self.action_templates, BABI_CONFIG['backoff_utterance'].lower(), post_ood_turns=self.post_ood_turns_noisy, runs_number=1)
        print('\n\n')
        print('Noisy dataset: {} turns overall, {} turns after the first OOD'.format(eval_stats_noisy['total_turns'], eval_stats_noisy['total_turns_after_ood']))
        print('Accuracy:')
        # guard each ratio against empty denominators
        accuracy = eval_stats_noisy['correct_turns'] / eval_stats_noisy['total_turns']
        accuracy_after_ood = eval_stats_noisy['correct_turns_after_ood'] / eval_stats_noisy['total_turns_after_ood'] \
            if eval_stats_noisy['total_turns_after_ood'] != 0 \
            else 0
        accuracy_post_ood = eval_stats_noisy['correct_post_ood_turns'] / eval_stats_noisy['total_post_ood_turns'] \
            if eval_stats_noisy['total_post_ood_turns'] != 0 \
            else 0
        accuracy_ood = eval_stats_noisy['correct_ood_turns'] / eval_stats_noisy['total_ood_turns'] \
            if eval_stats_noisy['total_ood_turns'] != 0 \
            else 0
        print('overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}; OOD: {:.3f}'.format(accuracy, accuracy_after_ood, accuracy_post_ood, accuracy_ood))
        if best_dev_accuracy < dev_accuracy:
            # BUGFIX: message said 'New best dev loss' although the tracked
            # metric is dev accuracy (sibling trainers print accuracy here).
            print('New best dev accuracy. Saving checkpoint')
            self.net.save(self.model_folder)
            best_dev_accuracy = dev_accuracy
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if self.config['early_stopping_threshold'] < epochs_without_improvement:
                print('Finished after {} epochs due to early stopping'.format(j))
                break
def evaluate_advanced(in_model, in_dataset, in_action_templates, post_ood_turns=None, ignore_ood_accuracy=False):
    """Per-turn evaluation with OOD-aware statistics.

    Computes overall correct/total turn counts, counts restricted to turns
    after the first OOD (backoff) action, counts for turns directly following
    an OOD turn, and — when ``post_ood_turns`` (a collection of global
    non-padding turn indices) is given — accuracy on exactly those turns.
    Returns a dict of all counters plus the mean loss.
    """
    if BABI_CONFIG['backoff_utterance'].lower() in in_action_templates:
        backoff_action = in_action_templates.index(
            BABI_CONFIG['backoff_utterance'].lower())
    else:
        # backoff utterance not among templates: fall back to the UNK action
        backoff_action = UNK_ID
    X, action_masks, sequence_masks, y = in_dataset
    losses, predictions = [], []
    batch_gen = batch_generator([X, action_masks, sequence_masks, y],
                                64,
                                verbose=False)
    for batch_x, batch_action_masks, batch_sequence_masks, batch_y in batch_gen:
        batch_predictions, batch_losses = in_model.forward(
            batch_x, batch_action_masks, batch_sequence_masks, batch_y)
        losses += list(batch_losses)
        predictions += batch_predictions.tolist()
    # strip padding turns, keeping gold/predicted sequences aligned per dialog
    y_true_dialog, y_pred_dialog = [], []
    for y_true_i, y_pred_i in zip(y, predictions):
        y_true_dialog_i, y_pred_dialog_i = [], []
        for y_true_i_j, y_pred_i_j in zip(y_true_i, y_pred_i):
            if y_true_i_j != PAD_ID:
                y_true_dialog_i.append(y_true_i_j)
                y_pred_dialog_i.append(y_pred_i_j)
        y_true_dialog.append(y_true_dialog_i)
        y_pred_dialog.append(y_pred_dialog_i)
    total_turns = 0
    correct_turns = 0
    total_turns_after_ood = 0
    total_post_ood_turns = 0
    correct_post_ood_turns = 0
    correct_turns_after_ood = 0
    for y_true, y_pred in zip(y_true_dialog, y_pred_dialog):
        ood_occurred = False
        prev_action = None
        for y_i_true, y_i_pred in zip(y_true, y_pred):
            # OOD turns are excluded from counts when ignore_ood_accuracy=True
            current_turn_counts = not (y_i_true == backoff_action
                                       and ignore_ood_accuracy)
            total_turns += int(current_turn_counts)
            correct_turns += int(y_i_true == y_i_pred and current_turn_counts)
            # a non-OOD turn immediately preceded by an OOD turn
            if prev_action == backoff_action and y_i_true != backoff_action:
                total_post_ood_turns += 1
                correct_post_ood_turns += int(y_i_true == y_i_pred)
            if ood_occurred:
                total_turns_after_ood += int(current_turn_counts)
                correct_turns_after_ood += int(y_i_true == y_i_pred
                                               and current_turn_counts)
            # set AFTER counting, so the first OOD turn itself is excluded
            # from the "after OOD" statistics
            if y_i_true == backoff_action:
                ood_occurred = True
            prev_action = y_i_true
    post_ood_correct, post_ood_total = 0, 0
    if post_ood_turns is not None:
        # walk all non-padding turns in global flattened order; pick out the
        # ones whose index is listed in post_ood_turns
        non_pad_counter = 0
        post_ood_true, post_ood_pred = [], []
        for y_true_i, y_pred_i in zip(
                np.array(y).flatten(), np.array(predictions).flatten()):
            if y_true_i == PAD_ID:
                continue
            if non_pad_counter in post_ood_turns:
                post_ood_true.append(y_true_i)
                post_ood_pred.append(y_pred_i)
            non_pad_counter += 1
        post_ood_correct = sum([
            int(true_i == pred_i)
            for true_i, pred_i in zip(post_ood_true, post_ood_pred)
        ])
        # NOTE(review): the explicit post_ood_turns counters overwrite the
        # prev_action-based ones computed above in the returned dict.
        post_ood_total = len(post_ood_turns)
    return {
        'avg_loss': np.mean(losses),
        'correct_turns': correct_turns,
        'total_turns': total_turns,
        'correct_turns_after_ood': correct_turns_after_ood,
        'total_turns_after_ood': total_turns_after_ood,
        'total_post_ood_turns': post_ood_total,
        'correct_post_ood_turns': post_ood_correct
    }
def evaluate_advanced_single(in_model, in_dataset, in_action_templates, in_fallback_utterance, post_ood_turns=None, ignore_ood_accuracy=False):
    """Single evaluation run with OOD (fallback-action) statistics.

    Computes: per-turn accuracy, binary F1 of OOD detection (fallback vs.
    non-fallback), count of turns correct with no earlier error in the dialog
    ("continuous" turns), OOD-turn accuracy, and — when ``post_ood_turns`` is
    given — accuracy on those specific non-padding turn indices. Returns a
    dict of all counters plus the mean loss.
    """
    if in_fallback_utterance in in_action_templates:
        fallback_action = in_action_templates.index(in_fallback_utterance)
    else:
        # fallback utterance missing: assume the last template is the fallback
        fallback_action = len(in_action_templates) - 1
    losses, predictions = [], []
    batch_gen = batch_generator(in_dataset, 64)
    for batch in batch_gen:
        batch_predictions, batch_loss_dict = in_model.forward(*batch)
        losses += batch_loss_dict['loss'].tolist()
        predictions += batch_predictions.tolist()
    # gold labels are the second-to-last dataset component
    y = in_dataset[-2]
    # strip padding turns, keeping gold/predicted sequences aligned per dialog
    y_true_dialog, y_pred_dialog = [], []
    y_true_all, y_pred_all = [], []
    y_true_binary, y_pred_binary = [], []
    for y_true_i, y_pred_i in zip(y, predictions):
        y_true_dialog_i, y_pred_dialog_i = [], []
        for y_true_i_j, y_pred_i_j in zip(y_true_i, y_pred_i):
            if y_true_i_j != PAD_ID:
                y_true_dialog_i.append(y_true_i_j)
                y_pred_dialog_i.append(y_pred_i_j)
        y_true_dialog.append(y_true_dialog_i)
        y_pred_dialog.append(y_pred_dialog_i)
        y_true_all += y_true_dialog_i
        y_pred_all += y_pred_dialog_i
    # binarize to fallback / non-fallback for OOD-detection F1
    y_true_binary = [int(action == fallback_action) for action in y_true_all]
    y_pred_binary = [int(action == fallback_action) for action in y_pred_all]
    ood_f1 = f1_score(y_true_binary, y_pred_binary, average='binary')
    acc = accuracy_score(y_true_all, y_pred_all)
    total_turns = 0
    correct_turns = 0
    correct_continuous_turns = 0
    total_ood_turns = 0
    correct_ood_turns = 0
    for y_true, y_pred in zip(y_true_dialog, y_pred_dialog):
        error_occurred = False
        for y_i_true, y_i_pred in zip(y_true, y_pred):
            # once any turn in the dialog is wrong, later turns can no longer
            # count as "continuous"
            error_occurred = error_occurred or y_i_true != y_i_pred
            # OOD turns are excluded from counts when ignore_ood_accuracy=True
            current_turn_counts = not (y_i_true == fallback_action
                                       and ignore_ood_accuracy)
            total_turns += int(current_turn_counts)
            correct_turns += int(y_i_true == y_i_pred and current_turn_counts)
            correct_continuous_turns += int(not error_occurred)
            if y_i_true == fallback_action:
                total_ood_turns += 1
                correct_ood_turns += int(y_i_true == y_i_pred)
    post_ood_correct, post_ood_total = 0, 0
    if post_ood_turns is not None:
        # walk non-padding turns in global flattened order; keep the ones
        # whose index is listed in post_ood_turns
        non_pad_counter = 0
        post_ood_true, post_ood_pred = [], []
        for y_true_i, y_pred_i in zip(
                np.array(y).flatten(), np.array(predictions).flatten()):
            if y_true_i == PAD_ID:
                continue
            if non_pad_counter in post_ood_turns:
                post_ood_true.append(y_true_i)
                post_ood_pred.append(y_pred_i)
            non_pad_counter += 1
        post_ood_correct = sum([
            int(true_i == pred_i)
            for true_i, pred_i in zip(post_ood_true, post_ood_pred)
        ])
        post_ood_total = len(post_ood_turns)
    # sanity check: manual accuracy must agree with sklearn's
    assert abs(acc - correct_turns / total_turns) < 1e-7
    return {
        'avg_loss': np.mean(losses),
        'correct_turns': correct_turns,
        'total_turns': total_turns,
        'total_ood_turns': total_ood_turns,
        'correct_ood_turns': correct_ood_turns,
        'total_post_ood_turns': post_ood_total,
        'correct_post_ood_turns': post_ood_correct,
        'correct_continuous_turns': correct_continuous_turns,
        'ood_f1': ood_f1
    }