Example #1
def write_dict_to_tensorboard(writer, val_dict, base_name, iteration):
	for name, val in val_dict.items():
		if isinstance(val, dict):
			write_dict_to_tensorboard(writer, val, base_name=base_name+"/"+name, iteration=iteration)
		elif isinstance(val, (list, np.ndarray)):
			continue
		elif isinstance(val, (int, float)):
			writer.add_scalar(base_name + "/" + name, val, iteration)
		else:
			if debug_level() == 0:
				print("Skipping output \""+str(name) + "\" of value " + str(val) + "(%s)" % (val.__class__.__name__))
Example #2
    def eval(self, dataset=None, batch_size=64):
        if dataset is None:
            assert self.val_dataset is not None, "[!] ERROR: Validation dataset not loaded. Please load the dataset beforehand for evaluation."
            dataset = self.val_dataset

        self.model.eval()

        # Prepare metrics
        number_batches = int(
            math.ceil(dataset.get_num_examples() * 1.0 / batch_size))
        eval_metrics = None
        eval_loss = []
        num_counter = []

        # Evaluation loop
        with torch.no_grad():
            for batch_ind in range(number_batches):
                if debug_level() == 0:
                    print("Evaluation process: %4.2f%%" %
                          (100.0 * batch_ind / number_batches),
                          end="\r")
                # Evaluate single batch
                batch = dataset.get_batch(batch_size,
                                          loop_dataset=False,
                                          toTorch=True)
                batch_loss, additional_metrics = self._eval_batch(batch)
                if eval_metrics is None:
                    eval_metrics = {
                        metric_name: [metric_val.item()]
                        for metric_name, metric_val in
                        additional_metrics.items()
                    }
                else:
                    for metric_name, metric_val in additional_metrics.items():
                        eval_metrics[metric_name].append(metric_val.item())
                eval_loss.append(batch_loss.item())
                num_counter.append(batch[0].size(0))

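        # Weight each batch by its size: the last batch can be smaller than
        # batch_size, so an unweighted mean over batches would skew the result.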
        mean_loss = sum(n * l for n, l in zip(num_counter, eval_loss)) / sum(num_counter)
        detailed_metrics = {
            metric_name: sum(n * v for n, v in zip(num_counter, metric_vals)) / sum(num_counter)
            for metric_name, metric_vals in eval_metrics.items()
        }

        detailed_metrics["eval_loss"] = mean_loss

        self.model.train()

        return mean_loss, detailed_metrics
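
The loop above expects self._eval_batch to return a scalar loss tensor plus a dict of scalar metric tensors, and reads the batch size from batch[0]. A minimal hypothetical sketch of that contract for a classification task (the loss_module attribute is an assumption, not taken from the source):

    def _eval_batch(self, batch):
        # Hypothetical sketch: unpack inputs and labels, run the model, and
        # return (scalar loss tensor, dict of scalar metric tensors).
        inputs, labels = batch
        logits = self.model(inputs)
        loss = self.loss_module(logits, labels)  # assumed criterion attribute
        accuracy = (logits.argmax(dim=-1) == labels).float().mean()
        return loss, {"accuracy": accuracy}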
Example #3
def run_inference(model,
                  input_file,
                  output_file=None,
                  batch_size=64,
                  load_file=True):
    infer_dataset = create_dataset_from_file(input_file, load_file=load_file)

    num_batches = int(
        math.ceil(infer_dataset.get_num_examples() * 1.0 / batch_size))
    predictions = list()
    for batch_index in range(num_batches):

        if debug_level() == 0:
            print("Inference process: %4.2f%%" %
                  (100.0 * batch_index / num_batches),
                  end="\r")

        embeds, lengths, _ = infer_dataset.get_batch(batch_size,
                                                     loop_dataset=False,
                                                     toTorch=True)
        preds = model(words_s1=embeds[0],
                      lengths_s1=lengths[0],
                      words_s2=embeds[1],
                      lengths_s2=lengths[1],
                      applySoftmax=True)
        _, pred_labels = torch.max(preds, dim=-1)
        out = torch.squeeze(pred_labels).tolist()
        predictions += out if isinstance(out, list) else [out]
        print(preds)

    out_s = ""
    for i in range(len(infer_dataset.data_list)):
        out_s += "=" * 100 + "\n"
        out_s += " Example %i\n" % (i + 1)
        out_s += "-" * 100 + "\n"
        out_s += " Premise: " + infer_dataset.data_list[i].get_premise() + "\n"
        out_s += " Hypothesis: " + infer_dataset.data_list[i].get_hypothesis(
        ) + "\n"
        out_s += " Prediction: " + NLIData.label_to_string(
            predictions[i]) + "\n"
        out_s += "=" * 100 + "\n\n"

    if output_file is not None:
        with open(output_file, "w") as f:
            f.write(out_s)

    print(out_s)
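
A possible invocation, with placeholder paths and a hypothetical load_model helper (both are assumptions, not part of the snippet above):

# Hypothetical usage; the checkpoint path, input file and load_model helper are placeholders.
model = load_model("checkpoints/nli_model.pt")
run_inference(model, input_file="premise_hypothesis_pairs.txt", output_file="predictions.txt")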
Example #4
    def eval(self, dataset=None, batch_size=64):
        # Default: if no dataset is specified, we use validation dataset
        if dataset is None:
            assert self.val_dataset is not None, "[!] ERROR: Validation dataset not loaded. Please load the dataset beforehand for evaluation."
            dataset = self.val_dataset

        self.model.eval()

        # Prepare metrics
        number_batches = int(
            math.ceil(dataset.get_num_examples() * 1.0 / batch_size))
        perplexity = []
        diversity_unigram, diversity_bigram = None, None

        # Evaluation loop
        for batch_ind in range(number_batches):
            if debug_level() == 0:
                print("Evaluation process: %4.2f%%" %
                      (100.0 * batch_ind / number_batches),
                      end="\r")
            # Evaluate single batch
            batch = dataset.get_batch(batch_size,
                                      loop_dataset=False,
                                      toTorch=True)
            batch_labels, perplexity_logits, generated_words, generated_lengths = self._eval_batch(
                batch)
            # Perplexity calculation
            perplexity += TaskTemplate._eval_preplexity(
                perplexity_logits, batch_labels).cpu().numpy().tolist()
            loc_div_uni, loc_div_bi = TaskTemplate._eval_diversity(
                generated_words,
                generated_lengths,
                num_classes=perplexity_logits.shape[-1])
            if diversity_unigram is None or diversity_bigram is None:
                diversity_unigram, diversity_bigram = loc_div_uni, loc_div_bi
            else:
                diversity_unigram += loc_div_uni
                diversity_bigram += loc_div_bi

        diversity_unigram = diversity_unigram.cpu().numpy()
        diversity_bigram = diversity_bigram.cpu().numpy()
        # Metric output
        avg_perplexity = sum(perplexity) / len(perplexity)
        div_uni_probs = diversity_unigram / max(np.sum(diversity_unigram),
                                                1e-5)
        div_bi_probs = diversity_bigram / max(np.sum(diversity_bigram), 1e-5)
        unigram_entropy = -(div_uni_probs *
                            np.log(np.maximum(div_uni_probs, 1e-10))).sum()
        bigram_entropy = -(div_bi_probs *
                           np.log(np.maximum(div_bi_probs, 1e-10))).sum()
        unigram_variety = int(np.sum(diversity_unigram > 0))
        bigram_variety = int(np.sum(diversity_bigram > 0))

        detailed_metrics = {
            "perplexity": avg_perplexity,
            "unigram_entropy": unigram_entropy,
            "bigram_entropy": bigram_entropy,
            "unigram_variety": unigram_variety,
            "bigram_variety": bigram_variety
        }

        self.model.train()

        return avg_perplexity, detailed_metrics
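
For reference, the entropy/variety computation above reduces to a normalized-count entropy over n-gram counts; a standalone sketch with made-up counts:

import numpy as np

counts = np.array([10.0, 5.0, 0.0, 1.0])    # toy unigram counts per vocabulary entry
probs = counts / max(np.sum(counts), 1e-5)  # normalize; the max() guards against all-zero counts
entropy = -(probs * np.log(np.maximum(probs, 1e-10))).sum()
variety = int(np.sum(counts > 0))           # number of distinct tokens that occurred
print("entropy=%.3f, variety=%d" % (entropy, variety))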
Example #5
    def export_best_results(self, checkpoint_path):
        self.model.eval()

        # Prepare metrics
        batch_size = 64
        number_batches = int(
            math.ceil(self.val_dataset.get_num_examples() * 1.0 / batch_size))
        number_batches = min(5, number_batches)
        true_positive_sents = list()
        false_positive_sents = list()
        true_negative_sents = list()
        false_negative_sents = list()

        # Evaluation loop
        with torch.no_grad():
            for batch_ind in range(number_batches):
                if debug_level() == 0:
                    print("Evaluation process: %4.2f%%" %
                          (100.0 * batch_ind / number_batches),
                          end="\r")
                # Evaluate single batch
                batch = self.val_dataset.get_batch(batch_size,
                                                   loop_dataset=False,
                                                   toTorch=True)
                discriminator_predictions, labels, _, _ = self.model(
                    _input=batch,
                    use_VAE=self.use_VAE,
                    use_semantic_specific_attn=self.use_semantic_specific_attn)
                positive_predictions = (discriminator_predictions >
                                        0.5).float().cpu().numpy()
                labels = labels.cpu().numpy()

                batch = tuple([tensor.cpu().numpy() for tensor in batch])
                (par_1_words, par_1_lengths, par_2_words, par_2_lengths,
                 par_1_slots, par_1_slot_lengths, par_2_slots, par_2_slot_lengths,
                 contexts_1_words, contexts_1_lengths,
                 contexts_2_words, contexts_2_lengths) = batch
                reconstructed_sents_1 = reconstruct_sentences(
                    par_1_words,
                    par_1_lengths,
                    slot_vals=par_1_slots,
                    slot_lengths=par_1_slot_lengths)
                reconstructed_sents_2 = reconstruct_sentences(
                    par_2_words,
                    par_2_lengths,
                    slot_vals=par_2_slots,
                    slot_lengths=par_2_slot_lengths)
                reconstructed_contexts_1 = reconstruct_sentences(
                    contexts_1_words, contexts_1_lengths)
                reconstructed_contexts_2 = reconstruct_sentences(
                    contexts_2_words, contexts_2_lengths)

                loc_batch_size = par_1_words.shape[0]

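                # The index arithmetic below assumes the discriminator returns four
                # stacked groups of loc_batch_size predictions, paired as
                # (sents_1, contexts_1), (sents_2, contexts_1),
                # (sents_2, contexts_2) and (sents_1, contexts_2).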
                for b in range(positive_predictions.shape[0]):
                    if b < loc_batch_size or b >= loc_batch_size * 3:
                        semantic_sents = reconstructed_sents_1[b % loc_batch_size]
                    else:
                        semantic_sents = reconstructed_sents_2[b % loc_batch_size]
                    if b < loc_batch_size * 2:
                        context_sents = reconstructed_contexts_1[b % loc_batch_size]
                    else:
                        context_sents = reconstructed_contexts_2[b % loc_batch_size]
                    s = ("\n" + "=" * 100 + "\n" + context_sents + "\n" +
                         "-" * 100 + "\nResponse: " + semantic_sents + "\n" +
                         "=" * 100 + "\n")
                    if positive_predictions[b] == 1 and labels[b] == 1:
                        true_positive_sents.append(s)
                    elif positive_predictions[b] == 1 and labels[b] == 0:
                        false_positive_sents.append(s)
                    elif positive_predictions[b] == 0 and labels[b] == 1:
                        false_negative_sents.append(s)
                    elif positive_predictions[b] == 0 and labels[b] == 0:
                        true_negative_sents.append(s)
                    else:
                        print(
                            "[!] ERROR: Something went wrong. Prediction is not any of TP, FP, FN, and TN..."
                        )
                        sys.exit(1)

        for sents, filename in zip([
                true_positive_sents, false_positive_sents,
                false_negative_sents, true_negative_sents
        ], [
                "true_positives", "false_positives", "false_negatives",
                "true_negatives"
        ]):
            sents = list(set(sents))
            with open(
                    os.path.join(checkpoint_path,
                                 "%s_%s.txt" % (self.name, filename)),
                    "w") as f:
                f.write("\n".join(sents))

        self.model.train()
Example #6
    def eval(self, dataset=None, batch_size=64):
        # Default: if no dataset is specified, we use validation dataset
        if dataset is None:
            assert self.val_dataset is not None, "[!] ERROR: Validation dataset not loaded. Please load the dataset beforehand for evaluation."
            dataset = self.val_dataset

        self.model.eval()
        self.classifier.eval()
        # Prepare metrics
        number_batches = int(
            math.ceil(dataset.get_num_examples() * 1.0 / batch_size))
        label_list = []
        preds_list = []

        # Evaluation loop
        for batch_ind in range(number_batches):
            if debug_level() == 0:
                print("Evaluation process: %4.2f%%" %
                      (100.0 * batch_ind / number_batches),
                      end="\r")
            # Evaluate single batch
            batch = dataset.get_batch(batch_size,
                                      loop_dataset=False,
                                      toTorch=True)
            pred_labels, batch_labels = self._eval_batch(batch)
            preds_list += torch.squeeze(pred_labels).tolist()
            label_list += torch.squeeze(batch_labels).tolist()

        # to_remove = [i for i, l in enumerate(label_list) if l < 0]
        # for r_index in sorted(to_remove)[::-1]:
        # 	del preds_list[r_index]
        # 	del label_list[r_index]

        # Metric output
        preds_list = np.array(preds_list)
        label_list = np.array(label_list)
        preds_list = preds_list[label_list >= 0]
        label_list = label_list[label_list >= 0]
        accuracy = np.sum(preds_list == label_list) * 1.0 / preds_list.shape[0]
        detailed_acc = {
            "accuracy": accuracy,
            "predictions": preds_list,
            "labels": label_list,
            "class_scores": dict()
        }

        print("-" * 75)
        print("Evaluation accuracy: %4.2f%%" % (accuracy * 100.0))
        print("Accuracy per class: ")
        for c in list(set(label_list)):
            TP = np.sum(np.logical_and(preds_list == c, label_list == c))
            FP = np.sum(np.logical_and(preds_list == c, label_list != c))
            FN = np.sum(np.logical_and(preds_list != c, label_list == c))
            recall = TP * 1.0 / max(1e-5, TP + FN)
            precision = TP * 1.0 / max(1e-5, TP + FP)
            F1_score = 2.0 * TP / max(1e-5, 2 * TP + FP + FN)
            print(
                "\t- Class %s: Recall=%4.2f%%, Precision=%4.2f%%, F1 score=%4.2f%%"
                % (dataset.label_to_string(c), recall * 100.0,
                   precision * 100.0, F1_score * 100.0))
            detailed_acc["class_scores"][dataset.label_to_string(c)] = {
                "recall": recall,
                "precision": precision,
                "f1": F1_score
            }
        print("-" * 75)

        self.classifier.train()

        return accuracy, detailed_acc
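
As a sanity check on the per-class metrics above, a toy example with hand-checkable counts (the arrays are made up):

import numpy as np

preds = np.array([0, 0, 1, 1, 1, 2])
labels = np.array([0, 1, 1, 1, 2, 2])
c = 1
TP = np.sum(np.logical_and(preds == c, labels == c))  # 2
FP = np.sum(np.logical_and(preds == c, labels != c))  # 1
FN = np.sum(np.logical_and(preds != c, labels == c))  # 1
recall = TP * 1.0 / max(1e-5, TP + FN)                # 2/3
precision = TP * 1.0 / max(1e-5, TP + FP)             # 2/3
F1_score = 2.0 * TP / max(1e-5, 2 * TP + FP + FN)     # 2/3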