Example #1
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        For more info on fastText parameter semantics, see
        `the docs <https://fasttext.cc/docs/en/options.html>`__.  The fastText
        `supervised tutorial <https://fasttext.cc/docs/en/supervised-tutorial.html>`__ has
        some more detailed explanation.

        fastText parameters:

        - ``word_ngrams`` (:obj:`int`): Max length of word n-grams.
        - ``lr`` (:obj:`float`): Learning rate.
        - ``dim`` (:obj:`int`): Dimension of learned vectors.
        - ``ws`` (:obj:`int`): Context window size.
        - ``fasttext_model`` (:obj:`str`): Name of a pretrained fastText model to use.
          See :obj:`FASTTEXT_VECTOR_ARCHIVES` for a listing of available pretrained models.
        """
        self.word_ngrams = 1
        self.lr = 0.1
        self.ws = 5
        self.fasttext_model = None
        # Default to dimensionality of the passed model, if any;
        # otherwise, default to 100
        if "fasttext_model" in params:
            self.dim = _parse_dim(params["fasttext_model"])
        else:
            self.dim = 100

        for name, value in params.items():
            if name == "word_ngrams":
                assert_type(name, value, int)
                self.word_ngrams = value
            elif name == "lr":
                assert_type(name, value, float)
                self.lr = value
            elif name == "dim":
                assert_type(name, value, int)
                self.dim = value
            elif name == "ws":
                assert_type(name, value, int)
                self.ws = value
            elif name == "fasttext_model":
                assert_in(name, value, set(FASTTEXT_VECTOR_ARCHIVES.keys()))
                self.fasttext_model = value
            else:
                raise ValueError(f"Unknown param '{name}'")

        if (self.fasttext_model is not None
                and f"{self.dim}d" not in self.fasttext_model):
            raise ValueError(
                "When using pretrained vectors, 'dim' must match the"
                f" dimensionality of the vectors; 'dim' value of {self.dim}"
                f" is incompatible with vectors {self.fasttext_model}.")
Example #2
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = self.dataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        stdout_catcher = StdoutCatcher()
        with stdout_catcher:
            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X_train_valid_preprocessed,
                y_train_valid,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                worker_log_level=logging.INFO,
                run_kwargs=run.run_kwargs,
            )
            # Sleep a few seconds to let logs from the worker catch up
            time.sleep(3)

        # Sample the observations if there are more than 1,000 in the test set, since we
        # need to save the chart, and trying to save large charts can cause Selenium timeouts
        # when they're rendered to PNG
        sample_size = 1000
        chart = results.plot(sample_size=sample_size).properties(
            title=f"Predicted Probability (Sampled Test Set Observations, n={sample_size})"
        )
        plot_path = run_output_dir / "plot.png"
        # Longer driver timeout needed since these images can be very big
        chart.save(str(plot_path), driver_timeout=600)

        md = f"# Results: {run.key}\n"
        md += f"```\n{stdout_catcher.get_logs()}\n```\n"
        md += tabulate(pd.DataFrame(results.training_results),
                       tablefmt="pipe",
                       headers="keys")
        md += f"\n```\n{results.metrics_report()}\n```\n"
        md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
Example #3
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        USE parameters:

        - ``use_model`` (:obj:`str`): Name of a USE model to use.
          See :obj:`USE_MODEL_ARCHIVES` for a listing of available USE models.
        """
        self.use_model = "universal-sentence-encoder"

        for name, value in params.items():
            if name == "use_model":
                assert_in(name, value, set(USE_MODEL_ARCHIVES.keys()))
                self.use_model = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example #4
    def _validate_params(self):
        assert_param_required("percent_multipliers", self.params)
        percent_multipliers = self.params["percent_multipliers"]
        assert_type("percent_multipliers", percent_multipliers, list)
        for (p, m) in percent_multipliers:
            assert_type("percent", p, float)
            assert_proportion("percent", p)
            assert_type("multiplier", m, (int, float))

        assert_type("param_grid", self.params.get("param_grid", {}), dict)

        assert_param_required("model_name", self.params)
        assert_type("model_name", self.params["model_name"], str)
        assert_valid_model(self.params["model_name"])

        assert_param_required("augment_probability", self.params)
        assert_type("augment_probability", p, float)
        assert_proportion("augment_probability", p)

        assert_param_required("preprocess_func", self.params)
        assert_in("preprocess_func", self.params["preprocess_func"],
                  PREPROCESS_FUNCS)
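A params dict that would pass this validation, sketched directly from the asserts above (the model and preprocessing names are placeholders for whatever assert_valid_model and PREPROCESS_FUNCS actually accept):

params = {
    "percent_multipliers": [(0.1, 2), (0.5, 1.5)],  # (proportion, multiplier) pairs
    "param_grid": {},                               # optional; defaults to an empty dict
    "model_name": "FastText",                       # placeholder model name
    "augment_probability": 0.25,                    # float between 0 and 1
    "preprocess_func": "lowercase",                 # placeholder key in PREPROCESS_FUNCS
}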
Example #5
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        BERT parameters:

        - ``max_seq_length`` (:obj:`int`): The maximum total input sequence length after
          WordPiece tokenization.  Sequences longer than this will be truncated,
          and sequences shorter than this will be padded.  Default: 128
        - ``bert_model`` (:obj:`str`): Name of a pretrained BERT model to use.
          See :obj:`BERT_MODEL_ARCHIVES` for a listing of available BERT models.
        """
        self.max_seq_length = 128
        self.bert_model = "bert-base-uncased"

        for name, value in params.items():
            if name == "max_seq_length":
                assert_type(name, value, int)
                self.max_seq_length = value
            elif name == "bert_model":
                assert_in(name, value, set(BERT_MODEL_ARCHIVES.keys()))
                self.bert_model = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example #6
    def __init__(
        self,
        # Can't make this type more restrictive since gensim might not be
        # available, and we need to make the union include a gensim type
        model: Any,
        tokenizer: Union[str, TokenizeMethod,
                         Callable[[List[str]],
                                  List[List[str]]]] = TokenizeMethod.SPLIT,
        n_similar: int = 10,
        diversity: float = 0.8,
    ):
        try:
            import gensim
            from gensim.scripts.glove2word2vec import glove2word2vec
        except ImportError:
            raise ImportError(
                "word2vec-based data augmentation requires gensim to be installed."
            )

        if isinstance(model, str):
            # Download and extract pretrained weights from a public source
            assert_in("word2vec model", model, set(WORD2VEC_MODELS.keys()))
            archive_name, filename = WORD2VEC_MODELS[model]
            archive_url = _WORD2VEC_MODEL_ARCHIVES[archive_name]

            LOGGER.debug(f"Downloading word2vec model '{model}'")
            # Some downloads aren't contained in archives
            if is_archive(Path(archive_url)):
                extract_dir = download_archive(archive_url,
                                               self.data_dir(),
                                               junk_paths=True)
                model_file = extract_dir / filename
            else:
                model_file = download_file(archive_url)

            if model.startswith("glove"):
                LOGGER.debug(f"Converting GloVe format to word2vec format")
                # Need to convert the downloaded file to word2vec format,
                # since GloVe vectorsr are formatted slightly differently
                with tempfile.NamedTemporaryFile() as f:
                    tempfile_path = Path(f.name)
                    glove2word2vec(model_file, tempfile_path)
                    shutil.copy2(tempfile_path, model_file)

            LOGGER.debug(f"Loading word2vec model '{model}'")
            self._model = gensim.models.KeyedVectors.load_word2vec_format(
                model_file)
            LOGGER.debug(f"word2vec model loaded")
        elif isinstance(model, Path):
            LOGGER.debug(f"Loading word2vec model from path '{model}'")
            self._model = gensim.models.KeyedVectors.load_word2vec_format(
                str(model))
            LOGGER.debug(f"word2vec model loaded")
        elif isinstance(model,
                        (gensim.models.Word2Vec, gensim.models.KeyedVectors)):
            self._model = model
        else:
            raise TypeError(
                f"unsupported type for initializing word2vec model: '{type(model)}'"
            )

        assert_type("n_similar", n_similar, int)
        if n_similar <= 0:
            raise ValueError("n_similar must be > 0")
        self.n_similar = n_similar

        assert_type("diversity", diversity, float)
        if not 0 < diversity <= 1:
            raise ValueError("diversity must be > 0 and <= 1")
        self.diversity = diversity

        if isinstance(tokenizer, str):
            tokenizer = TokenizeMethod[tokenizer]

        if isinstance(tokenizer, TokenizeMethod):
            # Avoid mypy error when passing a partially-applied function created by
            # functools.partial
            self.tokenizer = cast(
                Callable[[List[str]], List[List[str]]],
                functools.partial(tokenize, tokenizer),
            )
        elif callable(tokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                f"unsupported type for tokenizer: '{type(tokenizer)}'")
Example #7
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        for window_len, pooling in self.params["window_len_poolings"]:

            if window_len is not None and pooling is not None:
                with tempfile.TemporaryDirectory() as tmpdir:
                    tokenizer_path = Path(tmpdir) / "tokenizer"

                    X_windowed, _, y_windowed = make_document_windows(
                        X_train_valid_preprocessed,
                        window_len=window_len,
                        y=y_train_valid,
                        tokenize_method=TokenizeMethod.SENTENCEPIECE,
                        vocab_size=self.params["vocab_size"],
                        model_path=tokenizer_path,
                    )
                    (
                        X_test_windowed,
                        X_test_windowed_indices,
                        y_test_windowed,
                    ) = make_document_windows(
                        X_test_preprocessed,
                        window_len=window_len,
                        y=y_test,
                        tokenize_method=TokenizeMethod.SENTENCEPIECE,
                        vocab_size=self.params["vocab_size"],
                        model_path=tokenizer_path,
                    )
            else:
                X_windowed, y_windowed = X_train_valid_preprocessed, y_train_valid
                X_test_windowed, y_test_windowed = X_test_preprocessed, y_test

            print(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating window: Length {window_len}, pooling {pooling} ({len(X_windowed)} obs)"
            )
            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X_windowed,
                y_windowed,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_windowed, y_test_windowed),
                run_kwargs=run.run_kwargs,
            )

            if window_len is not None:
                pooled_output = PredictOutput(
                    y_pred_proba=results.y_pred_proba.copy())

                pool_document_windows(
                    pooled_output,
                    X_test_windowed_indices,
                    pooling=WindowPooling(pooling),
                )

            all_results.append(results.metrics())

        all_metrics = pd.DataFrame([{
            "Window Config": f"Length {window_len}, pooling {pooling}",
            **r
        } for (window_len, pooling), r in zip(self.params["window_len_poolings"],
                                              all_results)])

        fig = plt.figure(figsize=(10, 10))

        acc_ax = fig.add_subplot()
        all_metrics.plot(x="Window Config",
                         y="Accuracy",
                         ax=acc_ax,
                         kind="bar")

        plt.xlabel("Document Windowing")
        plt.title(
            f"Model Performance by Document Windowing - {model_cls.__name__}")
        plt.ylim(0, 1)

        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
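To make the windowing step concrete, here is a standalone sketch of the idea behind make_document_windows rather than gobbli's implementation: each document is split into fixed-length token windows, every window inherits the document's label, and the index of the originating document is kept so window-level predictions can later be pooled back per document.

from typing import List, Tuple

def window_documents(
    docs: List[str], labels: List[str], window_len: int
) -> Tuple[List[str], List[int], List[str]]:
    """Split each whitespace-tokenized document into windows of window_len tokens."""
    X_windows: List[str] = []
    doc_indices: List[int] = []  # which original document each window came from
    y_windows: List[str] = []
    for i, (doc, label) in enumerate(zip(docs, labels)):
        tokens = doc.split()
        for start in range(0, len(tokens), window_len):
            X_windows.append(" ".join(tokens[start:start + window_len]))
            doc_indices.append(i)
            y_windows.append(label)  # every window inherits the document label
    return X_windows, doc_indices, y_windows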
Example #8
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        # Train on progressively smaller samples of the training set
        for proportion in self.params["data_proportions"]:
            X_sampled, _, y_sampled, _ = train_test_split(
                X_train_valid_preprocessed,
                y_train_valid,
                train_size=proportion,
                random_state=1,
            )
            LOGGER.info(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating proportion {round(proportion, 3)} ({len(X_sampled)} obs)"
            )
            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X_sampled,
                y_sampled,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                run_kwargs=run.run_kwargs,
            )
            all_results.append(results)

        all_metrics = pd.DataFrame([{
            "data_proportion":
            p,
            "num_documents":
            int(p * len(X_train_valid)),
            **r.metrics(),
        } for p, r in zip(self.params["data_proportions"], all_results)])

        fig = plt.figure(figsize=(10, 10))
        # Plot both metrics on a single axes; calling fig.add_subplot() twice
        # with no arguments would create a second, overlapping axes.
        ax = fig.add_subplot()
        all_metrics.plot(x="num_documents", y="Weighted F1 Score", ax=ax)
        all_metrics.plot(x="num_documents", y="Accuracy", ax=ax)

        plt.xlabel("Number of Documents Used for Training/Validation")
        plt.title(
            f"Model Performance by Number of Documents Used for Training/Validation - {model_cls.__name__}"
        )
        plt.xlim(0, int(all_metrics["num_documents"].max() * 1.1))
        plt.ylim(0, 1)
        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
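Note that train_test_split is used above purely as a reproducible sampler: only the "train" portion is kept and the remainder is discarded. A standalone sketch with toy data:

from sklearn.model_selection import train_test_split

X = [f"document {i}" for i in range(100)]
y = ["pos" if i % 2 else "neg" for i in range(100)]

# Keep a 25% sample; the discarded split is ignored.
X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=0.25, random_state=1)
print(len(X_sampled))  # 25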
Example #9
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        majority, minority = ClassImbalanceScenario.find_majority_minority_classes(
            y_test)
        majority_df, minority_df = ClassImbalanceScenario.split_dataset(
            X_train_valid_preprocessed, y_train_valid, majority, minority)

        for proportion in self.params["imbalance_proportions"]:
            # Downsample the minority class so the final dataset contains the
            # desired proportion of the minority.  The expression below reduces
            # to proportion / (1 - proportion), which is the correct sampling
            # fraction when both classes start out the same size.
            orig_len = majority_df.shape[0]
            downsample_proportion = -orig_len / (orig_len -
                                                 orig_len / proportion)
            minority_sample = minority_df.sample(
                frac=downsample_proportion).reset_index()
            sampled_df = pd.concat([majority_df, minority_sample])

            X = sampled_df["X"].tolist()
            y = sampled_df["y"].tolist()

            LOGGER.info(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating proportion {round(proportion, 3)} ({len(X)} obs)")

            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X,
                y,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                run_kwargs=run.run_kwargs,
            )
            all_results.append(results)

        minority_f1_scores = []
        majority_f1_scores = []
        for result in all_results:
            majority_f1, minority_f1 = f1_score(
                result.y_true,
                pred_prob_to_pred_label(result.y_pred_proba),
                average=None,
                labels=[majority, minority],
            )
            minority_f1_scores.append(minority_f1)
            majority_f1_scores.append(majority_f1)

        all_metrics = pd.DataFrame([{
            "imbalance_proportion": p,
            **r.metrics()
        } for p, r in zip(self.params["imbalance_proportions"], all_results)])

        all_metrics["Minority Class F1 Score"] = minority_f1_scores
        all_metrics["Majority Class F1 Score"] = majority_f1_scores

        fig = plt.figure(figsize=(10, 10))
        # Plot both F1 series on a single axes; a second no-argument
        # add_subplot() call would create an overlapping axes.
        ax = fig.add_subplot()
        all_metrics.plot(x="imbalance_proportion",
                         y="Minority Class F1 Score",
                         ax=ax)
        all_metrics.plot(x="imbalance_proportion",
                         y="Majority Class F1 Score",
                         ax=ax)

        plt.xlabel("Prevalence of Minority Class")
        plt.title(
            f"Model Performance by Prevalence of Minority Class - {model_cls.__name__}"
        )
        plt.xlim(0, 0.5)
        plt.ylim(0, 1)

        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
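The downsampling fraction above looks odd at first glance; this quick standalone check (with made-up numbers, assuming both classes start out the same size, as in IMDB) shows it yields the intended minority share:

orig_len = 10_000  # documents in each class before downsampling
proportion = 0.1   # desired minority share of the final dataset

frac = -orig_len / (orig_len - orig_len / proportion)  # 0.1 / 0.9 ~= 0.111
minority_sampled = int(frac * orig_len)                # ~1,111 documents
final_share = minority_sampled / (orig_len + minority_sampled)
print(round(frac, 4), minority_sampled, round(final_share, 3))  # 0.1111 1111 0.1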
Example #10
    def _do_run(self, run: ModelEmbeddingRun, run_output_dir: Path) -> str:
        ds = self.dataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)
        X_embed = X_train_valid + X_test
        labels = y_train_valid + y_test

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_embed_preprocessed = preprocess_func(X_embed)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        stdout_catcher = StdoutCatcher()
        with stdout_catcher:
            # Construct the dict of kwargs up-front so each run can override the "use_gpu"
            # option if necessary using its model params -- ex. for models like spaCy
            # which have trouble controlling memory usage on the GPU and don't gain
            # much benefit from it
            model_kwargs = {**get_model_run_params(), **run.model_params}
            model = model_cls(**model_kwargs)
            model.build()

            embed_input = EmbedInput(X=X_embed_preprocessed,
                                     embed_batch_size=run.batch_size)
            embed_output = model.embed(embed_input)

        X_embedded = pd.DataFrame(embed_output.X_embedded)
        umap = UMAP(random_state=1)
        umap_data = umap.fit_transform(X_embedded)
        umap_df = pd.DataFrame(
            umap_data, columns=["UMAP Component 1", "UMAP Component 2"])
        umap_df["Label"] = labels
        groups = umap_df.groupby("Label")

        fig = plt.figure(figsize=(15, 15))
        ax = fig.add_subplot()
        cmap = plt.cm.get_cmap("tab20")

        for (name, group), c in zip(groups, cmap.colors):
            ax.plot(
                group["UMAP Component 1"],
                group["UMAP Component 2"],
                marker="o",
                linestyle="",
                ms=6,
                label=name,
                color=c,
                alpha=0.5,
            )
        ax.legend()
        ax.axis("off")
        ax.set_title(f"Embeddings by Label - {run.key}")
        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += f"```\n{stdout_catcher.get_logs()}\n```\n"
        md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md