Example #1
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:
        """
        Determine the majority class.
        """
        if train_input.multilabel:
            train_labels: List[str] = list(
                itertools.chain.from_iterable(train_input.y_train_multilabel))
        else:
            train_labels = train_input.y_train_multiclass

        unique_values, value_counts = np.unique(train_labels,
                                                return_counts=True)
        self.majority_class = unique_values[value_counts.argmax(axis=0)]

        labels = train_input.labels()
        y_train_pred_proba = self._make_pred_df(labels,
                                                len(train_input.y_train))
        y_valid_pred_proba = self._make_pred_df(labels,
                                                len(train_input.y_valid))

        if train_input.multilabel:
            y_train_indicator = multilabel_to_indicator_df(
                train_input.y_train_multilabel, labels)
            train_loss = ((y_train_pred_proba.subtract(y_train_indicator)
                           ).abs().to_numpy().sum())

            y_valid_indicator = multilabel_to_indicator_df(
                train_input.y_valid_multilabel, labels)
            valid_loss = ((y_valid_pred_proba.subtract(y_valid_indicator)
                           ).abs().to_numpy().sum())
            # Accuracy here is the complement of the mean absolute difference
            # between the predicted probabilities and the gold indicator matrix
            valid_accuracy = 1 - valid_loss / (y_valid_pred_proba.shape[0] *
                                               y_valid_pred_proba.shape[1])
        else:
            y_train_pred = pred_prob_to_pred_label(y_train_pred_proba)
            # Convert to numpy arrays so the comparison is elementwise even when
            # the labels are plain Python lists
            train_loss = np.sum(
                np.asarray(y_train_pred) != np.asarray(
                    train_input.y_train_multiclass))

            y_valid_pred = pred_prob_to_pred_label(y_valid_pred_proba)
            valid_loss = np.sum(
                np.asarray(y_valid_pred) != np.asarray(
                    train_input.y_valid_multiclass))
            # Accuracy is the fraction of correct predictions, not the error count
            valid_accuracy = 1 - valid_loss / len(y_valid_pred)

        return gobbli.io.TrainOutput(
            valid_loss=valid_loss,
            valid_accuracy=valid_accuracy,
            train_loss=train_loss,
            labels=train_input.labels(),
            multilabel=train_input.multilabel,
        )
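
Every example on this page routes a DataFrame of predicted probabilities through pred_prob_to_pred_label. For reference, here is a minimal sketch of what such a helper could look like, consistent with how it is called in these snippets; the implementation below is an illustration, not taken from gobbli itself.

from typing import List

import pandas as pd


def pred_prob_to_pred_label(pred_prob_df: pd.DataFrame) -> List[str]:
    """
    For each row of a (n_samples, n_classes) probability DataFrame, return the
    name of the column (label) with the highest predicted probability.
    """
    return pred_prob_df.idxmax(axis=1).tolist()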
Example #2
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:
        self._write_input(
            train_input.X_train,
            train_input.y_train_multilabel,
            context.host_input_dir / FastText._TRAIN_INPUT_FILE,
        )
        self._write_input(
            train_input.X_valid,
            train_input.y_valid_multilabel,
            context.host_input_dir / FastText._VALID_INPUT_FILE,
        )

        container_validation_input_path = (context.container_input_dir /
                                           FastText._VALID_INPUT_FILE)
        train_logs, train_loss = self._run_supervised(
            train_input.checkpoint,
            context.container_input_dir / FastText._TRAIN_INPUT_FILE,
            context.container_output_dir / FastText._CHECKPOINT_BASE,
            context,
            train_input.num_train_epochs,
            autotune_validation_file_path=container_validation_input_path,
        )

        host_checkpoint_path = context.host_output_dir / FastText._CHECKPOINT_BASE

        labels = train_input.labels()

        # Calculate validation accuracy on our own, since the CLI only provides
        # precision/recall
        predict_logs, pred_prob_df = self._run_predict_prob(
            host_checkpoint_path, labels, container_validation_input_path,
            context)

        if train_input.multilabel:
            pred_labels = pred_prob_to_pred_multilabel(pred_prob_df)
            gold_labels = multilabel_to_indicator_df(
                train_input.y_valid_multilabel, labels)
        else:
            pred_labels = pred_prob_to_pred_label(pred_prob_df)
            gold_labels = train_input.y_valid_multiclass

        valid_accuracy = accuracy_score(gold_labels, pred_labels)

        # Not ideal, but fastText doesn't provide a way to get validation loss,
        # so negate the validation accuracy instead
        valid_loss = -valid_accuracy

        return gobbli.io.TrainOutput(
            train_loss=train_loss,
            valid_loss=valid_loss,
            valid_accuracy=valid_accuracy,
            labels=labels,
            multilabel=train_input.multilabel,
            checkpoint=host_checkpoint_path,
            _console_output="\n".join((train_logs, predict_logs)),
        )
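
Examples #1 and #2 also rely on two multilabel helpers, pred_prob_to_pred_multilabel and multilabel_to_indicator_df. The sketches below are consistent with how those helpers are called above, but the 0.5 threshold and the exact return types are assumptions rather than gobbli's confirmed implementation.

from typing import List

import pandas as pd


def pred_prob_to_pred_multilabel(pred_prob_df: pd.DataFrame,
                                 threshold: float = 0.5) -> pd.DataFrame:
    """
    Threshold each predicted probability to get a 0/1 indicator DataFrame of
    shape (n_samples, n_classes).
    """
    return (pred_prob_df > threshold).astype(int)


def multilabel_to_indicator_df(y_multilabel: List[List[str]],
                               labels: List[str]) -> pd.DataFrame:
    """
    Convert lists of assigned labels into a 0/1 indicator DataFrame with one
    column per label.
    """
    return pd.DataFrame(
        [[int(label in row) for label in labels] for row in y_multilabel],
        columns=labels,
    )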
Example #3
    def y_pred(self) -> List[str]:
        """
        Returns:
          The predicted class for each observation.
        """
        return pred_prob_to_pred_label(self.y_pred_proba)
Example #4
    escape_line_delimited_text,
    pred_prob_to_pred_label,
    truncate_text,
)

MetricFunc = Callable[[Sequence[str], pd.DataFrame], float]
"""
A function used to calculate some metric.  It accepts a sequence of true labels (y_true)
and a DataFrame of shape (n_samples, n_classes) containing predicted probabilities, and
returns a real number.
"""

DEFAULT_METRICS: Dict[str, MetricFunc] = {
    "Weighted F1 Score":
    lambda y_true, y_pred_proba: f1_score(
        y_true, pred_prob_to_pred_label(y_pred_proba), average="weighted"),
    "Weighted Precision Score":
    lambda y_true, y_pred_proba: precision_score(
        y_true, pred_prob_to_pred_label(y_pred_proba), average="weighted"),
    "Weighted Recall Score":
    lambda y_true, y_pred_proba: recall_score(
        y_true, pred_prob_to_pred_label(y_pred_proba), average="weighted"),
    "Accuracy":
    lambda y_true, y_pred_proba: accuracy_score(
        y_true, pred_prob_to_pred_label(y_pred_proba)),
}
"""
The default set of metrics to be reported in experiment results.  Users may want to extend
this.
"""
Example #5
    def y_pred(self) -> List[str]:
        """
        Returns:
          The most likely predicted label for each observation.
        """
        return pred_prob_to_pred_label(self.y_pred_proba)
Example #6
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        majority, minority = ClassImbalanceScenario.find_majority_minority_classes(
            y_test)
        majority_df, minority_df = ClassImbalanceScenario.split_dataset(
            X_train_valid_preprocessed, y_train_valid, majority, minority)

        for proportion in self.params["imbalance_proportions"]:
            # Downsample the minority class so the final dataset contains the desired
            # proportion of the minority
            # Keeping proportion / (1 - proportion) of the minority class yields
            # the desired overall proportion when the two classes start out the
            # same size (as in the balanced IMDB training set)
            downsample_proportion = proportion / (1 - proportion)
            minority_sample = minority_df.sample(
                frac=downsample_proportion).reset_index()
            sampled_df = pd.concat([majority_df, minority_sample])

            X = sampled_df["X"].tolist()
            y = sampled_df["y"].tolist()

            LOGGER.info(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating proportion {round(proportion, 3)} ({len(X)} obs)")

            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X,
                y,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                run_kwargs=run.run_kwargs,
            )
            all_results.append(results)

        minority_f1_scores = []
        majority_f1_scores = []
        for result in all_results:
            majority_f1, minority_f1 = f1_score(
                result.y_true,
                pred_prob_to_pred_label(result.y_pred_proba),
                average=None,
                labels=[majority, minority],
            )
            minority_f1_scores.append(minority_f1)
            majority_f1_scores.append(majority_f1)

        all_metrics = pd.DataFrame([{
            "imbalance_proportion": p,
            **r.metrics()
        } for p, r in zip(self.params["imbalance_proportions"], all_results)])

        all_metrics["Minority Class F1 Score"] = minority_f1_scores
        all_metrics["Majority Class F1 Score"] = majority_f1_scores

        fig = plt.figure(figsize=(10, 10))
        # Plot both F1 series on a single axes so neither line is hidden
        ax = fig.add_subplot()
        all_metrics.plot(x="imbalance_proportion",
                         y="Minority Class F1 Score",
                         ax=ax)
        all_metrics.plot(x="imbalance_proportion",
                         y="Majority Class F1 Score",
                         ax=ax)

        plt.xlabel("Prevalence of Minority Class")
        plt.title(
            f"Model Performance by Prevalence of Minority Class - {model_cls.__name__}"
        )
        plt.xlim(0, 0.5)
        plt.ylim(0, 1)

        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
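
The per-class scores in Example #6 depend on f1_score returning one value per entry of labels, in that exact order, when average=None. A small self-contained check of that sklearn behavior:

from sklearn.metrics import f1_score

y_true = ["pos", "pos", "neg", "neg", "neg"]
y_pred = ["pos", "neg", "neg", "neg", "pos"]

# With average=None, f1_score returns an array of per-label F1 scores ordered
# the same way as the labels argument, so unpacking preserves that order
neg_f1, pos_f1 = f1_score(y_true, y_pred, average=None, labels=["neg", "pos"])
print(neg_f1, pos_f1)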
Example #7
    def y_pred_multiclass(self) -> List[str]:
        """
        Returns:
          Predicted class for each observation (assuming multiclass context).
        """
        return pred_prob_to_pred_label(self.y_pred_proba)
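
Examples #3, #5, and #7 are all thin wrappers around the same conversion. A toy standalone illustration of what that conversion produces, using the sketch of pred_prob_to_pred_label given after Example #1 (the label names below are made up):

import pandas as pd

# Toy predicted probabilities for three observations over two labels
pred_prob_df = pd.DataFrame({
    "negative": [0.9, 0.2, 0.5],
    "positive": [0.1, 0.8, 0.5],
})

# Taking the highest-probability column per row gives the predicted labels:
# ["negative", "positive", "negative"] (ties resolve to the first column)
print(pred_prob_to_pred_label(pred_prob_df))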