def init(self, params: Dict[str, Any]):
    """
    See :meth:`gobbli.model.base.BaseModel.init`.

    For more info on fastText parameter semantics, see
    `the docs <https://fasttext.cc/docs/en/options.html>`__.  The fastText
    `supervised tutorial <https://fasttext.cc/docs/en/supervised-tutorial.html>`__
    has some more detailed explanation.

    fastText parameters:

    - ``word_ngrams`` (:obj:`int`): Max length of word n-grams.
    - ``lr`` (:obj:`float`): Learning rate.
    - ``dim`` (:obj:`int`): Dimension of learned vectors.
    - ``ws`` (:obj:`int`): Context window size.
    - ``fasttext_model`` (:obj:`str`): Name of a pretrained fastText model to use.
      See :obj:`FASTTEXT_VECTOR_ARCHIVES` for a listing of available pretrained
      models.
    """
    self.word_ngrams = 1
    self.lr = 0.1
    self.ws = 5
    self.fasttext_model = None

    # Default to the dimensionality of the passed model, if any;
    # otherwise, default to 100
    if "fasttext_model" in params:
        self.dim = _parse_dim(params["fasttext_model"])
    else:
        self.dim = 100

    for name, value in params.items():
        if name == "word_ngrams":
            assert_type(name, value, int)
            self.word_ngrams = value
        elif name == "lr":
            assert_type(name, value, float)
            self.lr = value
        elif name == "dim":
            assert_type(name, value, int)
            self.dim = value
        elif name == "ws":
            assert_type(name, value, int)
            self.ws = value
        elif name == "fasttext_model":
            assert_in(name, value, set(FASTTEXT_VECTOR_ARCHIVES.keys()))
            self.fasttext_model = value
        else:
            raise ValueError(f"Unknown param '{name}'")

    if (self.fasttext_model is not None
            and f"{self.dim}d" not in self.fasttext_model):
        raise ValueError(
            "When using pretrained vectors, 'dim' must match the"
            f" dimensionality of the vectors; 'dim' value of {self.dim}"
            f" is incompatible with vectors {self.fasttext_model}.")
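# Usage sketch for the fastText params above -- hedged: it assumes, as with
# gobbli models generally, that constructor kwargs are forwarded to init().
# The parameter values are illustrative, not recommendations.
from gobbli.model import FastText

# Any 'dim' is accepted when no pretrained vectors are used.
clf = FastText(word_ngrams=2, lr=0.5, dim=50, ws=5)

# With pretrained vectors, 'dim' must match the archive's dimensionality
# (e.g. an archive whose name contains "300d" requires dim=300); otherwise
# init() raises ValueError.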
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = self.dataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
        self.dataset_limit)

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    stdout_catcher = StdoutCatcher()
    with stdout_catcher:
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X_train_valid_preprocessed,
            y_train_valid,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_preprocessed, y_test),
            worker_log_level=logging.INFO,
            run_kwargs=run.run_kwargs,
        )
    # Sleep a few seconds to let logs from the worker catch up
    time.sleep(3)

    # Sample the observations if there are more than 1,000 in the test set,
    # since we need to save the chart, and trying to save large charts can
    # cause Selenium timeouts when they're rendered to PNG
    sample_size = 1000
    chart = results.plot(sample_size=sample_size).properties(
        title=f"Predicted Probability (Sampled Test Set Observations, n={sample_size})"
    )
    plot_path = run_output_dir / "plot.png"
    # Longer driver timeout needed since these images can be very big
    chart.save(str(plot_path), driver_timeout=600)

    md = f"# Results: {run.key}\n"
    md += f"```\n{stdout_catcher.get_logs()}\n```\n"
    md += tabulate(pd.DataFrame(results.training_results),
                   tablefmt="pipe", headers="keys")
    md += f"\n```\n{results.metrics_report()}\n```\n"
    md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"
    return md
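# Self-contained sketch of the report-building step above: tabulate's "pipe"
# format emits a GitHub-flavored markdown table from a DataFrame. The values
# here are placeholders, not real benchmark results.
import pandas as pd
from tabulate import tabulate

df = pd.DataFrame([{"valid_loss": 0.35, "valid_accuracy": 0.91}])
report = "# Results: example_run\n"
report += tabulate(df, tablefmt="pipe", headers="keys")
print(report)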
def init(self, params: Dict[str, Any]):
    """
    See :meth:`gobbli.model.base.BaseModel.init`.

    USE parameters:

    - ``use_model`` (:obj:`str`): Name of a USE model to use.
      See :obj:`USE_MODEL_ARCHIVES` for a listing of available USE models.
    """
    self.use_model = "universal-sentence-encoder"

    for name, value in params.items():
        if name == "use_model":
            assert_in(name, value, set(USE_MODEL_ARCHIVES.keys()))
            self.use_model = value
        else:
            raise ValueError(f"Unknown param '{name}'")
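# Hedged usage sketch, assuming constructor kwargs reach init(). The model
# name must be a key of USE_MODEL_ARCHIVES; "universal-sentence-encoder" is
# the documented default, and any unknown key fails the assert_in check.
from gobbli.model import USE

use = USE(use_model="universal-sentence-encoder")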
def _validate_params(self):
    assert_param_required("percent_multipliers", self.params)
    percent_multipliers = self.params["percent_multipliers"]
    assert_type("percent_multipliers", percent_multipliers, list)
    for (p, m) in percent_multipliers:
        assert_type("percent", p, float)
        assert_proportion("percent", p)
        assert_type("multiplier", m, (int, float))

    assert_type("param_grid", self.params.get("param_grid", {}), dict)

    assert_param_required("model_name", self.params)
    assert_type("model_name", self.params["model_name"], str)
    assert_valid_model(self.params["model_name"])

    assert_param_required("augment_probability", self.params)
    augment_probability = self.params["augment_probability"]
    assert_type("augment_probability", augment_probability, float)
    assert_proportion("augment_probability", augment_probability)

    assert_param_required("preprocess_func", self.params)
    assert_in("preprocess_func", self.params["preprocess_func"],
              PREPROCESS_FUNCS)
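# An example params dict that would satisfy every check above. All values
# are illustrative, and "preprocess_func" is a placeholder for one of the
# actual PREPROCESS_FUNCS keys.
example_params = {
    "percent_multipliers": [(0.1, 1), (0.5, 2.0)],  # (proportion, multiplier) pairs
    "param_grid": {},                                # optional; defaults to {}
    "model_name": "FastText",
    "augment_probability": 0.2,                      # float proportion
    "preprocess_func": "...",                        # must be in PREPROCESS_FUNCS
}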
def init(self, params: Dict[str, Any]):
    """
    See :meth:`gobbli.model.base.BaseModel.init`.

    BERT parameters:

    - ``max_seq_length`` (:obj:`int`): The maximum total input sequence length
      after WordPiece tokenization.  Sequences longer than this will be
      truncated, and sequences shorter than this will be padded.
      Default: 128
    - ``bert_model`` (:obj:`str`): Name of a pretrained BERT model to use.
      See :obj:`BERT_MODEL_ARCHIVES` for a listing of available BERT models.
    """
    self.max_seq_length = 128
    self.bert_model = "bert-base-uncased"

    for name, value in params.items():
        if name == "max_seq_length":
            assert_type(name, value, int)
            self.max_seq_length = value
        elif name == "bert_model":
            assert_in(name, value, set(BERT_MODEL_ARCHIVES.keys()))
            self.bert_model = value
        else:
            raise ValueError(f"Unknown param '{name}'")
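# Hedged usage sketch, assuming constructor kwargs reach init().
# "bert-base-uncased" is the documented default; any other key of
# BERT_MODEL_ARCHIVES can be substituted.
from gobbli.model import BERT

bert = BERT(max_seq_length=256, bert_model="bert-base-uncased")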
def __init__(
    self,
    # Can't make this type more restrictive since gensim might not be
    # available, and we need to make the union include a gensim type
    model: Any,
    tokenizer: Union[str, TokenizeMethod,
                     Callable[[List[str]], List[List[str]]]] = TokenizeMethod.SPLIT,
    n_similar: int = 10,
    diversity: float = 0.8,
):
    try:
        import gensim
        from gensim.scripts.glove2word2vec import glove2word2vec
    except ImportError:
        raise ImportError(
            "word2vec-based data augmentation requires gensim to be installed."
        )

    if isinstance(model, str):
        # Download and extract pretrained weights from a public source
        assert_in("word2vec model", model, set(WORD2VEC_MODELS.keys()))
        archive_name, filename = WORD2VEC_MODELS[model]
        archive_url = _WORD2VEC_MODEL_ARCHIVES[archive_name]
        LOGGER.debug(f"Downloading word2vec model '{model}'")

        # Some downloads aren't contained in archives
        if is_archive(Path(archive_url)):
            extract_dir = download_archive(archive_url, self.data_dir(),
                                           junk_paths=True)
            model_file = extract_dir / filename
        else:
            model_file = download_file(archive_url)

        if model.startswith("glove"):
            LOGGER.debug("Converting GloVe format to word2vec format")
            # Need to convert the downloaded file to word2vec format,
            # since GloVe vectors are formatted slightly differently
            with tempfile.NamedTemporaryFile() as f:
                tempfile_path = Path(f.name)
                glove2word2vec(model_file, tempfile_path)
                shutil.copy2(tempfile_path, model_file)

        LOGGER.debug(f"Loading word2vec model '{model}'")
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            model_file)
        LOGGER.debug("word2vec model loaded")
    elif isinstance(model, Path):
        LOGGER.debug(f"Loading word2vec model from path '{model}'")
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            str(model))
        LOGGER.debug("word2vec model loaded")
    elif isinstance(model, (gensim.models.Word2Vec,
                            gensim.models.KeyedVectors)):
        self._model = model
    else:
        raise TypeError(
            f"unsupported type for initializing word2vec model: '{type(model)}'"
        )

    assert_type("n_similar", n_similar, int)
    if n_similar <= 0:
        raise ValueError("n_similar must be > 0")
    self.n_similar = n_similar

    assert_type("diversity", diversity, float)
    if not 0 < diversity <= 1:
        raise ValueError("diversity must be > 0 and <= 1")
    self.diversity = diversity

    if isinstance(tokenizer, str):
        tokenizer = TokenizeMethod[tokenizer]

    if isinstance(tokenizer, TokenizeMethod):
        # Avoid mypy error when passing a partially-applied function created by
        # functools.partial
        self.tokenizer = cast(
            Callable[[List[str]], List[List[str]]],
            functools.partial(tokenize, tokenizer),
        )
    elif callable(tokenizer):
        self.tokenizer = tokenizer
    else:
        raise TypeError(f"unsupported type for tokenizer: '{type(tokenizer)}'")
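# Sketch of the in-memory path above: the constructor accepts an already
# loaded gensim Word2Vec/KeyedVectors object, so a toy model trained on a
# couple of sentences is enough to exercise it. The import path for the
# augmenter is an assumption; the corpus is illustrative; gensim 4.x is
# assumed (where the dimension kwarg is `vector_size`).
import gensim
from gobbli.augment import Word2Vec as Word2VecAugmenter

toy_model = gensim.models.Word2Vec(
    [["the", "quick", "brown", "fox"], ["a", "lazy", "dog"]],
    vector_size=16,
    min_count=1,
)
augmenter = Word2VecAugmenter(model=toy_model, n_similar=5, diversity=0.8)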
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
        self.dataset_limit)

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    all_results = []
    for window_len, pooling in self.params["window_len_poolings"]:
        if window_len is not None and pooling is not None:
            with tempfile.TemporaryDirectory() as tmpdir:
                tokenizer_path = Path(tmpdir) / "tokenizer"
                X_windowed, _, y_windowed = make_document_windows(
                    X_train_valid_preprocessed,
                    window_len=window_len,
                    y=y_train_valid,
                    tokenize_method=TokenizeMethod.SENTENCEPIECE,
                    vocab_size=self.params["vocab_size"],
                    model_path=tokenizer_path,
                )
                (
                    X_test_windowed,
                    X_test_windowed_indices,
                    y_test_windowed,
                ) = make_document_windows(
                    X_test_preprocessed,
                    window_len=window_len,
                    y=y_test,
                    tokenize_method=TokenizeMethod.SENTENCEPIECE,
                    vocab_size=self.params["vocab_size"],
                    model_path=tokenizer_path,
                )
        else:
            X_windowed, y_windowed = X_train_valid_preprocessed, y_train_valid
            X_test_windowed, y_test_windowed = X_test_preprocessed, y_test

        print(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating window: Length {window_len}, pooling {pooling} "
            f"({len(X_windowed)} obs)"
        )
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X_windowed,
            y_windowed,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_windowed, y_test_windowed),
            run_kwargs=run.run_kwargs,
        )
        if window_len is not None:
            pooled_output = PredictOutput(
                y_pred_proba=results.y_pred_proba.copy())
            pool_document_windows(
                pooled_output,
                X_test_windowed_indices,
                pooling=WindowPooling(pooling),
            )

        all_results.append(results.metrics())

    # Label each row using its own (window_len, pooling) config; the loop
    # variables above would otherwise hold only their final values here
    all_metrics = pd.DataFrame([{
        "Window Config": f"Length {w_len}, pooling {w_pooling}",
        **r
    } for (w_len, w_pooling), r in zip(self.params["window_len_poolings"],
                                       all_results)])

    fig = plt.figure(figsize=(10, 10))
    acc_ax = fig.add_subplot()
    all_metrics.plot(x="Window Config", y="Accuracy", ax=acc_ax, kind="bar")
    plt.xlabel("Document Windowing")
    plt.title(
        f"Model Performance by Document Windowing - {model_cls.__name__}")
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"
    return md
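# Windowing round-trip sketch using the call signatures visible above. The
# import paths are assumptions, as is the WindowPooling.MEAN member; the data
# and window parameters are illustrative.
import tempfile
from pathlib import Path

from gobbli.util import TokenizeMethod, make_document_windows  # paths assumed

X_docs = ["a long document " * 50, "another long document " * 50]
y_labels = ["pos", "neg"]

with tempfile.TemporaryDirectory() as tmpdir:
    X_windowed, X_indices, y_windowed = make_document_windows(
        X_docs,
        window_len=64,
        y=y_labels,
        tokenize_method=TokenizeMethod.SENTENCEPIECE,
        vocab_size=100,
        model_path=Path(tmpdir) / "tokenizer",
    )

# After prediction, window-level probabilities are pooled back up to one row
# per original document:
#     pool_document_windows(predict_output, X_indices,
#                           pooling=WindowPooling.MEAN)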
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
        self.dataset_limit)

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    all_results = []
    for proportion in self.params["data_proportions"]:
        X_sampled, _, y_sampled, _ = train_test_split(
            X_train_valid_preprocessed,
            y_train_valid,
            train_size=proportion,
            random_state=1,
        )
        LOGGER.info(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating proportion {round(proportion, 3)} ({len(X_sampled)} obs)"
        )
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X_sampled,
            y_sampled,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_preprocessed, y_test),
            run_kwargs=run.run_kwargs,
        )
        all_results.append(results)

    all_metrics = pd.DataFrame([{
        "data_proportion": p,
        "num_documents": int(p * len(X_train_valid)),
        **r.metrics(),
    } for p, r in zip(self.params["data_proportions"], all_results)])

    # Plot both metrics on a single set of axes so they share the x/y limits
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot()
    all_metrics.plot(x="num_documents", y="Weighted F1 Score", ax=ax)
    all_metrics.plot(x="num_documents", y="Accuracy", ax=ax)
    plt.xlabel("Number of Documents Used for Training/Validation")
    plt.title(
        "Model Performance by Number of Documents Used for"
        f" Training/Validation - {model_cls.__name__}"
    )
    plt.xlim(0, int(all_metrics["num_documents"].max() * 1.1))
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"
    return md
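# Standalone sketch of the subsampling step above: passing a float
# `train_size` to train_test_split keeps that proportion of the data.
# The toy data here is illustrative.
from sklearn.model_selection import train_test_split

X = [f"doc {i}" for i in range(100)]
y = ["pos" if i % 2 == 0 else "neg" for i in range(100)]

X_sampled, _, y_sampled, _ = train_test_split(
    X, y, train_size=0.25, random_state=1)
assert len(X_sampled) == 25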
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
        self.dataset_limit)

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    all_results = []
    majority, minority = ClassImbalanceScenario.find_majority_minority_classes(
        y_test)
    majority_df, minority_df = ClassImbalanceScenario.split_dataset(
        X_train_valid_preprocessed, y_train_valid, majority, minority)

    for proportion in self.params["imbalance_proportions"]:
        # Downsample the minority class so the final dataset contains the
        # desired proportion of the minority.  Since the majority and minority
        # splits start out the same size, sampling a fraction p / (1 - p) of
        # the minority yields a combined minority prevalence of p.
        downsample_proportion = proportion / (1 - proportion)
        minority_sample = minority_df.sample(
            frac=downsample_proportion).reset_index()
        sampled_df = pd.concat([majority_df, minority_sample])
        X = sampled_df["X"].tolist()
        y = sampled_df["y"].tolist()

        LOGGER.info(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating proportion {round(proportion, 3)} ({len(X)} obs)")
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X,
            y,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_preprocessed, y_test),
            run_kwargs=run.run_kwargs,
        )
        all_results.append(results)

    minority_f1_scores = []
    majority_f1_scores = []
    for result in all_results:
        majority_f1, minority_f1 = f1_score(
            result.y_true,
            pred_prob_to_pred_label(result.y_pred_proba),
            average=None,
            labels=[majority, minority],
        )
        minority_f1_scores.append(minority_f1)
        majority_f1_scores.append(majority_f1)

    all_metrics = pd.DataFrame([{
        "imbalance_proportion": p,
        **r.metrics()
    } for p, r in zip(self.params["imbalance_proportions"], all_results)])
    all_metrics["Minority Class F1 Score"] = minority_f1_scores
    all_metrics["Majority Class F1 Score"] = majority_f1_scores

    # Plot both F1 curves on a single set of axes so they share the x/y limits
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot()
    all_metrics.plot(x="imbalance_proportion",
                     y="Minority Class F1 Score", ax=ax)
    all_metrics.plot(x="imbalance_proportion",
                     y="Majority Class F1 Score", ax=ax)
    plt.xlabel("Prevalence of Minority Class")
    plt.title(
        f"Model Performance by Prevalence of Minority Class - {model_cls.__name__}"
    )
    plt.xlim(0, 0.5)
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"
    return md
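# Quick check of the downsampling arithmetic above: assuming the majority
# and minority splits start at equal size n, sampling frac = p / (1 - p)
# of the minority yields a combined minority prevalence of exactly p.
n = 10_000
for p in (0.05, 0.1, 0.25, 0.5):
    minority_count = n * p / (1 - p)
    prevalence = minority_count / (n + minority_count)
    assert abs(prevalence - p) < 1e-9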
def _do_run(self, run: ModelEmbeddingRun, run_output_dir: Path) -> str:
    ds = self.dataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
        self.dataset_limit)
    X_embed = X_train_valid + X_test
    labels = y_train_valid + y_test

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_embed_preprocessed = preprocess_func(X_embed)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    stdout_catcher = StdoutCatcher()
    with stdout_catcher:
        # Construct the dict of kwargs up-front so each run can override the
        # "use_gpu" option if necessary using its model params -- ex. for
        # models like spaCy which have trouble controlling memory usage on
        # the GPU and don't gain much benefit from it
        model_kwargs = {**get_model_run_params(), **run.model_params}
        model = model_cls(**model_kwargs)
        model.build()

        embed_input = EmbedInput(X=X_embed_preprocessed,
                                 embed_batch_size=run.batch_size)
        embed_output = model.embed(embed_input)

    X_embedded = pd.DataFrame(embed_output.X_embedded)

    umap = UMAP(random_state=1)
    umap_data = umap.fit_transform(X_embedded)
    umap_df = pd.DataFrame(
        umap_data, columns=["UMAP Component 1", "UMAP Component 2"])
    umap_df["Label"] = labels

    groups = umap_df.groupby("Label")
    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot()
    cmap = plt.cm.get_cmap("tab20")
    for (name, group), c in zip(groups, cmap.colors):
        ax.plot(
            group["UMAP Component 1"],
            group["UMAP Component 2"],
            marker="o",
            linestyle="",
            ms=6,
            label=name,
            color=c,
            alpha=0.5,
        )
    ax.legend()
    ax.axis("off")
    ax.set_title(f"Embeddings by Label - {run.key}")

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += f"```\n{stdout_catcher.get_logs()}\n```\n"
    md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"
    return md
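# Minimal standalone sketch of the projection step above (umap-learn's UMAP;
# random vectors stand in for real model embeddings).
import numpy as np
import pandas as pd
from umap import UMAP

X_embedded = np.random.default_rng(1).normal(size=(200, 64))
umap_df = pd.DataFrame(
    UMAP(random_state=1).fit_transform(X_embedded),
    columns=["UMAP Component 1", "UMAP Component 2"],
)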