Example #1
def test_tokenize_detokenize_sentencepiece(tmpdir, model_path):
    texts = ["a b c", "a ab c", "a b ac"]

    # Model should be trained
    if model_path is not None:
        model_path = Path(tmpdir) / model_path
    tokens = tokenize(TokenizeMethod.SENTENCEPIECE,
                      texts,
                      model_path=model_path,
                      vocab_size=7)

    # Control sequence indicating whitespace
    _ = "▁"
    expected_tokens = [
        [_, "a", _, "b", _, "c"],
        [_, "a", _, "a", "b", _, "c"],
        [_, "a", _, "b", _, "a", "c"],
    ]
    assert tokens == expected_tokens

    # Can't detokenize if we didn't give a persistent model path to the tokenize
    # function
    if model_path is not None:
        assert detokenize(TokenizeMethod.SENTENCEPIECE, tokens,
                          model_path) == texts

        # Previously trained model should be reused with the old vocab size,
        # and a new model shouldn't be trained
        tokens = tokenize(TokenizeMethod.SENTENCEPIECE,
                          texts,
                          model_path=model_path)
        assert tokens == expected_tokens
Example #2
def test_tokenize_detokenize_sentencepiece(tmpdir):
    texts = ["a b c", "a ab c", "a b ac"]

    # Model should be trained
    model_path = Path(tmpdir) / "spm"
    tokens = tokenize(TokenizeMethod.SENTENCEPIECE,
                      texts,
                      model_path=model_path,
                      vocab_size=7)

    # Control sequence indicating whitespace
    _ = "▁"
    expected_tokens = [
        [_, "a", _, "b", _, "c"],
        [_, "a", _, "a", "b", _, "c"],
        [_, "a", _, "b", _, "a", "c"],
    ]
    assert tokens == expected_tokens

    assert detokenize(TokenizeMethod.SENTENCEPIECE, tokens,
                      model_path) == texts

    # Previously trained model should be reused with the old vocab size,
    # and a new model shouldn't be trained
    tokens = tokenize(TokenizeMethod.SENTENCEPIECE,
                      texts,
                      model_path=model_path)
    assert tokens == expected_tokens
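
A minimal sketch (not part of the original examples) of what the "▁" control sequence in the expected tokens means: SentencePiece encodes whitespace as "▁", so joining the tokens and swapping "▁" back to spaces recovers the input text.

# Hypothetical illustration, not from the gobbli tests: SentencePiece marks
# word boundaries with the "▁" control character.
tokens = ["▁", "a", "▁", "a", "b", "▁", "c"]

# Joining the tokens and replacing "▁" with spaces reconstructs "a ab c".
text = "".join(tokens).replace("▁", " ").strip()
assert text == "a ab c"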
Example #3
def make_document_windows(
    X: List[str],
    window_len: int,
    y: Optional[List[T]] = None,
    tokenize_method: TokenizeMethod = TokenizeMethod.SPLIT,
    model_path: Optional[Path] = None,
    vocab_size: Optional[int] = None,
) -> Tuple[List[str], List[int], Optional[List[T]]]:
    """
    This is a helper for datasets with long documents that will be passed through a model
    with a fixed max sequence length.  If you don't have enough memory to raise the max
    sequence length but don't want to lose the information in longer documents, you can use
    this helper to generate a dataset that splits each document into windows roughly the
    size of your ``max_seq_len``.  The resulting dataset can then be used to train your
    model.  You should then use :func:`pool_document_windows` to pool the results from
    downstream tasks (e.g. predictions, embeddings).

    Note there may still be some mismatch between the window size and the size as tokenized
    by your model, since some models use custom tokenization methods.

    Args:
      X: List of texts to make windows out of.
      window_len: The maximum length of each window.  This should roughly correspond to
        the ``max_seq_len`` of your model.
      y: Optional list of classes (or list of list of labels).  If passed, a corresponding
        list of targets for each window (the target(s) associated with the window's document)
        will be returned.
      tokenize_method: :class:`gobbli.util.TokenizeMethod` corresponding to the tokenization
        method to use for determining windows.
      model_path: Path for a tokenization model.  This argument is only used if the
        tokenization method requires training a model; otherwise, it's ignored.
        If no model exists at the given path, a new tokenization model will be trained
        and saved there.  If one does exist, it will be reused.  If no path is given,
        a temporary directory will be created, used, and then discarded.
      vocab_size: Number of terms in the vocabulary for tokenization. May be ignored depending
        on the tokenization method and whether a model is already trained.

    Returns:
      A 3-tuple containing a new list of texts split into windows, a corresponding list
      containing the index of each original document for each window, and (optionally)
      a list containing a target per window.  The indices should be used to pool the
      output from the windowed text (see :func:`pool_document_windows`).
    """
    X_windowed: List[str] = []
    X_windowed_indices: List[int] = []
    y_windowed: List[T] = []

    # Create a temp dir in case it's needed
    with tempfile.TemporaryDirectory() as tmpdir:
        tokenize_kwargs: Dict[str, Any] = {}

        if model_path is None:
            model_path = Path(tmpdir) / "tokenizer"

        tokenize_kwargs["model_path"] = model_path

        detokenize_kwargs = tokenize_kwargs.copy()

        if vocab_size is not None:
            tokenize_kwargs["vocab_size"] = vocab_size

        for i, tokens in enumerate(
                tokenize(tokenize_method, X, **tokenize_kwargs)):
            for window in detokenize(tokenize_method,
                                     _chunk_tokens(tokens, window_len),
                                     **detokenize_kwargs):
                X_windowed.append(window)
                X_windowed_indices.append(i)
                if y is not None:
                    y_windowed.append(y[i])

    if y is not None:
        return X_windowed, X_windowed_indices, y_windowed
    else:
        return X_windowed, X_windowed_indices, None
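
A usage sketch for the helper above; the documents, labels, and window length are illustrative assumptions, not taken from the original examples.

docs = ["a long document " * 100, "a short one"]
labels = ["spam", "ham"]

# Split each document into windows of at most 50 whitespace-delimited tokens.
X_win, win_indices, y_win = make_document_windows(
    docs,
    window_len=50,
    y=labels,
    tokenize_method=TokenizeMethod.SPLIT,
)

# Each window keeps the label of its source document, and win_indices maps each
# window back to that document so downstream outputs (e.g. predictions) can be
# pooled per document with pool_document_windows.
assert len(X_win) == len(win_indices) == len(y_win)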
Example #4
def test_detokenize_split_spacy(text, tokens, tokenize_method):
    assert detokenize(tokenize_method, [tokens]) == [text]
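
The text, tokens, and tokenize_method arguments come from pytest parametrization not shown here. A hedged sketch, in plain Python, of the round trip the SPLIT case exercises:

# Illustrative values only: whitespace splitting and rejoining with single
# spaces round-trips the text.
text = "a b c"
tokens = text.split()
assert " ".join(tokens) == text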