Example 1
def compute_surprisals(model: Model, suite):
    """
    Compute per-region surprisals for a language model on the given suite.

    Args:
        model: An LM Zoo ``Model``.
        suite: A path or open file stream to a suite JSON file, or an
            already loaded suite dict.

    Returns:
        An evaluated test suite dict: a copy of the data from
        ``suite``, now including per-region surprisal data.
    """
    suite = _load_suite(suite)
    image_spec = spec(model)

    # Convert to sentences
    suite_sentences = get_sentences(suite)

    # First compute surprisals
    surprisals_df = get_surprisals(model, suite_sentences)

    # Track tokens+unks
    tokens = tokenize(model, suite_sentences)
    unks = unkify(model, suite_sentences)

    # Now aggregate over regions and get result df
    result = aggregate_surprisals(surprisals_df, tokens, unks, suite,
                                  image_spec)

    return result
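A minimal usage sketch for ``compute_surprisals``, assuming a registry lookup such as ``lm_zoo.get_registry()`` is available; the model name "GRNN" (used in the tests below) and the suite path are illustrative, not part of the original example:

import lm_zoo as Z

# Assumed registry lookup; "GRNN" is one of the models exercised in the tests.
model = Z.get_registry()["GRNN"]

# ``suite`` may be a path, an open file stream, or an already loaded dict.
evaluated_suite = compute_surprisals(model, "my_suite.json")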
Example 2
def test_tokenize_two(registry):
    result = Z.tokenize(
        registry["GRNN"],
        ['This is a test sentence', "This is a second test sentence"])
    assert len(result) == 2
    assert result[0] == "This is a test sentence <eos>".split()
    assert result[1] == "This is a second test sentence <eos>".split()
Example 3
def test_tokenize(registry, runner, any_model, test_file):
    result = invoke(runner, ["tokenize", any_model, test_file])

    assert result.output.endswith("\n"), "Should have final trailing newline"
    output = result.output[:-1]
    lines = [line.strip().split(" ") for line in output.split("\n")]

    # API as ground truth
    with open(test_file) as test_f:
        test_text = test_f.read()
    API_result = Z.tokenize(registry[any_model], test_text.strip().split("\n"))
    assert lines == API_result
Example 4
def test_checkpoint_mounting(template_model):
    """
    We should be able to mount a "checkpoint" with a custom vocabulary in the
    LM Zoo template image, and see tokenization output vary accordingly.
    """

    dummy_vocab = "This is test".split()
    with TemporaryDirectory() as checkpoint_dir:
        with (Path(checkpoint_dir) / "vocab.txt").open("w") as vocab_f:
            vocab_f.write("\n".join(dummy_vocab))

        custom_model = template_model.with_checkpoint(checkpoint_dir)
        tokenized = Z.tokenize(custom_model, ["This is a test sentence"])
        assert len(tokenized) == 1
        assert tokenized[0] == "This is <unk> test <unk>".split()
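A hedged sketch of the same checkpoint-mounting flow outside a test harness; ``template_model`` is assumed to be the LM Zoo template ``Model`` provided by the fixture above, and the checkpoint directory name is hypothetical:

from pathlib import Path

import lm_zoo as Z

# Hypothetical local checkpoint directory; here only ``vocab.txt`` is
# provided, as in the test above (one in-vocabulary token per line).
checkpoint_dir = Path("my_checkpoint")
checkpoint_dir.mkdir(exist_ok=True)
(checkpoint_dir / "vocab.txt").write_text("\n".join("This is test".split()))

custom_model = template_model.with_checkpoint(checkpoint_dir)
print(Z.tokenize(custom_model, ["This is a test sentence"]))
# Out-of-vocabulary tokens come back as <unk>:
# [['This', 'is', '<unk>', 'test', '<unk>']]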
Example 5
def tokenize(state, model, in_file):
    """
    Tokenize natural-language text according to a model's preprocessing
    standards.

    FILE should be a raw natural language text file with one sentence per line.

    This command prints one tokenized sentence per line, with tokens separated
    by single spaces. For each sentence, there is a one-to-one mapping between
    the tokens printed by this command and the tokens used by the
    ``get-surprisals`` command.
    """
    model = _prepare_model(model, state)
    sentences = read_lines(in_file)
    sentences = Z.tokenize(model, sentences, backend=state.requested_backend)
    print("\n".join(" ".join(sentence) for sentence in sentences))
Example 6
def test_tokenize_single(registry):
    result = Z.tokenize(registry["GRNN"], ['This is a test sentence'])
    assert len(result) == 1
    assert result[0] == "This is a test sentence <eos>".split()
Example 7
def test_singularity(registry, singularity_local_model):
    result = Z.tokenize(registry["singularity://%s" % singularity_local_model],
                        ["This is a test sentence"])
    assert result == ["This is a test sentence".split()]