def compute_surprisals(model: Model, suite):
    """
    Evaluate a language model on a test suite, producing per-region
    surprisals.

    Args:
        model: An LM Zoo ``Model``.
        suite: A path or open file stream to a suite JSON file, or an
            already loaded suite dict. It is passed through ``_load_suite``
            before use.

    Returns:
        An evaluated test suite dict --- a copy of the data from ``suite``,
        now including per-region surprisal data (the value returned by
        ``aggregate_surprisals``).
    """
    loaded_suite = _load_suite(suite)
    model_spec = spec(model)

    # The model-facing helpers all operate on the suite's sentences.
    sentences = get_sentences(loaded_suite)

    # Surprisals are computed first ...
    sentence_surprisals = get_surprisals(model, sentences)

    # ... followed by the token and unk information for the same sentences.
    sentence_tokens = tokenize(model, sentences)
    sentence_unks = unkify(model, sentences)

    # Aggregate everything over the suite's regions and hand back the result.
    return aggregate_surprisals(
        sentence_surprisals,
        sentence_tokens,
        sentence_unks,
        loaded_suite,
        model_spec,
    )
def test_tokenize_two(registry):
    """
    Tokenizing two sentences with the ``GRNN`` model should return two token
    lists, each ending in ``<eos>``.
    """
    sentences = ['This is a test sentence', "This is a second test sentence"]
    expected_first = "This is a test sentence <eos>".split()
    expected_second = "This is a second test sentence <eos>".split()

    tokenized = Z.tokenize(registry["GRNN"], sentences)

    assert len(tokenized) == 2
    assert tokenized[0] == expected_first
    assert tokenized[1] == expected_second
def test_tokenize(registry, runner, any_model, test_file):
    """
    The CLI ``tokenize`` command should print exactly the tokenization that
    the Python API produces for the same input file.
    """
    cli_result = invoke(runner, ["tokenize", any_model, test_file])
    cli_output = cli_result.output
    assert cli_output.endswith("\n"), "Should have final trailing newline"

    # Drop the trailing newline, then break the output into lines of
    # space-separated tokens.
    cli_tokens = [
        row.strip().split(" ")
        for row in cli_output[:-1].split("\n")
    ]

    # API as ground truth: feed it the same sentences, one per line of the file.
    with open(test_file) as source_f:
        raw_text = source_f.read()
    input_sentences = raw_text.strip().split("\n")
    api_tokens = Z.tokenize(registry[any_model], input_sentences)

    assert cli_tokens == api_tokens
def test_checkpoint_mounting(template_model):
    """
    Mounting a "checkpoint" that carries a custom vocabulary into the LM Zoo
    template image should change the tokenization output accordingly.
    """
    vocab_entries = "This is test".split()
    test_sentence = "This is a test sentence"
    expected_tokens = "This is <unk> test <unk>".split()

    with TemporaryDirectory() as checkpoint_dir:
        # Write the vocabulary, one entry per line, and close the file before
        # the checkpoint directory is handed to the model.
        vocab_path = Path(checkpoint_dir) / "vocab.txt"
        with vocab_path.open("w") as vocab_f:
            vocab_f.write("\n".join(vocab_entries))

        mounted_model = template_model.with_checkpoint(checkpoint_dir)
        token_lists = Z.tokenize(mounted_model, [test_sentence])

        assert len(token_lists) == 1
        assert token_lists[0] == expected_tokens
def tokenize(state, model, in_file):
    """
    Tokenize natural-language text according to a model's preprocessing
    standards.

    FILE should be a raw natural language text file with one sentence per
    line.

    This command returns a text file with one tokenized sentence per line,
    with tokens separated by single spaces. For each sentence, there is a
    one-to-one mapping between the tokens output by this command and the
    tokens used by the ``get-surprisals`` command.
    """
    resolved_model = _prepare_model(model, state)
    raw_sentences = read_lines(in_file)
    tokenized_sentences = Z.tokenize(
        resolved_model,
        raw_sentences,
        backend=state.requested_backend,
    )

    # One sentence per output line, tokens joined by single spaces.
    output_lines = [" ".join(tokens) for tokens in tokenized_sentences]
    print("\n".join(output_lines))
def test_tokenize_single(registry):
    """
    Tokenizing a single sentence with the ``GRNN`` model should return one
    token list ending in ``<eos>``.
    """
    expected_tokens = "This is a test sentence <eos>".split()

    tokenized = Z.tokenize(registry["GRNN"], ['This is a test sentence'])

    assert len(tokenized) == 1
    assert tokenized[0] == expected_tokens
def test_singularity(registry, singularity_local_model):
    """
    A model looked up in the registry through a ``singularity://`` reference
    should tokenize the test sentence into its whitespace-separated words.
    """
    model_reference = "singularity://%s" % singularity_local_model
    singularity_model = registry[model_reference]

    tokenized = Z.tokenize(singularity_model, ["This is a test sentence"])
    expected = ["This is a test sentence".split()]

    assert tokenized == expected