Exemple #1
0
def test_explicit_subword_indices():
    """Check subword lookups on an ExplicitIndexer built from fixed n-grams."""
    known_ngrams = [
        "<Test>", "<Test", "<Tes", "<Te", "Test>", "Test", "Tes", "est>"
    ]
    indexer = ExplicitIndexer(known_ngrams)

    # With brackets and n-grams requested, the result is compared against
    # every n-gram paired with its position in the list given to the indexer.
    found = indexer.subword_indices("Test", bracket=True, with_ngrams=True)
    assert found == [(ngram, pos) for pos, ngram in enumerate(known_ngrams)]

    # Neither the empty string nor "oov" is expected to yield any indices.
    assert indexer.subword_indices("") == []
    assert indexer.subword_indices("oov") == []

    # An n-gram that was not passed to the constructor is neither contained
    # in the indexer nor retrievable through subscription.
    assert "st>" not in indexer
    with pytest.raises(KeyError):
        _ = indexer["st>"]
Exemple #2
0
def test_explicit_vocab_roundtrip(tmp_path):
    """Write an ExplicitVocab to a file and check the reloaded vocab is equal."""
    target = tmp_path / "write_explicit_vocab.fifu"
    ngrams = [str(n) for n in range(10)]
    words = [str(n) for n in range(10, 100)]
    original = ExplicitVocab(words, indexer=ExplicitIndexer(ngrams))
    original.write(target)
    reloaded = load_vocab(target)
    assert original == reloaded
Exemple #3
0
 def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
     """
     Read an ExplicitVocab from ``file``.

     The header is fetched with ``_read_required_binary`` using the format
     ``"<QQII"`` and unpacked into the word count, the n-gram count and the
     n-gram length bounds. The words are read next, followed by the n-grams
     together with their indices.
     """
     header = _read_required_binary(file, "<QQII")
     n_words, n_ngrams, min_n, max_n = header
     words = _read_items(file, n_words)
     ngrams, ngram_index = _read_items_with_indices(file, n_ngrams)
     return ExplicitVocab(
         words, ExplicitIndexer(ngrams, min_n, max_n, ngram_index))
Exemple #4
0
def test_explicit_with_ngram_index():
    """Check an ExplicitIndexer constructed with a caller-supplied index."""
    digits = [str(n) for n in range(10)]
    positions = {ngram: pos for pos, ngram in enumerate(digits)}
    indexer = ExplicitIndexer(digits, ngram_index=positions)

    # The n-gram list and the supplied mapping are exposed as given.
    assert indexer.ngrams == digits
    assert indexer.ngram_index == positions

    # Subscription and calling both resolve "0" to index 0.
    assert indexer["0"] == 0
    assert indexer.ngrams[0] == "0"
    assert indexer("0") == 0
def test_explicit():
    """Exercise ExplicitIndexer attributes, repr, lookup and containment."""
    digits = [str(n) for n in range(10)]
    indexer = ExplicitIndexer(digits)

    # Without an explicit index, each n-gram is expected at its list position.
    assert indexer.ngrams == digits
    assert indexer.ngram_index == {
        ngram: pos for pos, ngram in enumerate(digits)
    }
    assert repr(indexer) == ("ExplicitIndexer(min_n=3, max_n=6, "
                             "n_ngrams=10, n_indices=10)")

    # Lookup by subscription and by call; calling with "" gives None.
    assert indexer["0"] == 0
    assert indexer.ngrams[0] == "0"
    assert indexer("0") == 0
    assert indexer("") is None

    # Containment is checked for another indexer, a list of n-grams and a
    # single n-gram; an unknown n-gram and a non-string are not contained.
    first_five = [str(n) for n in range(5)]
    assert ExplicitIndexer(first_five) in indexer
    assert first_five in indexer
    assert "0" in indexer
    assert "01" not in indexer
    assert 0 not in indexer
Exemple #6
0
def test_explicit_constructor():
    """Check construction, size, comparisons and repr of an ExplicitVocab."""
    indexer = ExplicitIndexer([str(n) for n in range(10)])
    vocab = ExplicitVocab([str(n) for n in range(10, 100)], indexer=indexer)

    # The words "10".."99" are expected at indices 0..89, in order.
    looked_up = [vocab[str(n)] for n in range(10, 100)]
    assert looked_up == list(range(90))

    # Passing a FinalfusionHashIndexer instead is expected to raise
    # AssertionError.
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(vocab.words, FinalfusionHashIndexer(21))

    # 90 words; the upper bound is asserted to exceed the word count by 10
    # (the indexer above was given 10 n-grams).
    assert len(vocab) == 90
    assert vocab.upper_bound == len(vocab) + 10

    # Equal to and contained in itself, unequal to other vocab types built
    # from the same words.
    assert vocab == vocab
    assert vocab in vocab
    assert vocab != SimpleVocab(vocab.words)
    assert vocab != FastTextVocab(vocab.words, FastTextIndexer(20))

    actual_repr = repr(vocab)
    # Only the indexer line is an f-string; the last piece is a plain
    # literal, so its doubled braces are compared verbatim.
    expected_repr = ("ExplicitVocab(\n"
                     f"\tindexer={vocab.subword_indexer!r}\n"
                     "\twords=[...]\n"
                     "\tword_index={{...}})")
    assert actual_repr == expected_repr
Exemple #7
0
def _bucket_to_explicit(vocab: Union[FinalfusionBucketVocab, FastTextVocab]
                        ) -> 'ExplicitVocab':
    """
    Build an ExplicitVocab from the n-grams of ``vocab``'s words.

    Every distinct n-gram returned by ``vocab.subwords`` for the words in
    ``vocab.words`` is collected in first-seen order. The indices returned by
    ``vocab.subword_indexer`` are renumbered consecutively from 0 in the
    order they first occur, so n-grams that share an original index also
    share the new one. The resulting vocab keeps ``vocab.words`` and the
    ``min_n`` / ``max_n`` of ``vocab``.
    """
    compact_ids = dict()  # type: Dict[int, int]
    ngram_to_id = dict()
    seen_ngrams = []
    for token in vocab.words:
        for ngram in vocab.subwords(token):
            if ngram in ngram_to_id:
                continue
            seen_ngrams.append(ngram)
            original_idx = vocab.subword_indexer(ngram)
            # setdefault hands out the next free id the first time an
            # original index is seen and returns the stored id afterwards.
            ngram_to_id[ngram] = compact_ids.setdefault(
                original_idx, len(compact_ids))
    explicit_indexer = ExplicitIndexer(seen_ngrams, vocab.min_n, vocab.max_n,
                                       ngram_to_id)
    return ExplicitVocab(vocab.words, explicit_indexer)
Exemple #8
0
def test_explicit_assertions():
    """Check that inconsistent ExplicitIndexer arguments raise AssertionError."""
    # A repeated n-gram in the list, with no explicit index supplied.
    with pytest.raises(AssertionError):
        ExplicitIndexer(["a"] * 2)

    # (ngrams, ngram_index) combinations that are each expected to be
    # rejected.
    rejected = [
        (["a"], {"b": 0}),
        (["a"], {"a": 1}),
        (["a"], {"a": 0, "b": 1}),
        (["a", "b"], {"a": 0, "b": 2}),
        # NOTE(review): repeats the second case; kept so that the same
        # sequence of constructor calls is made.
        (["a"], {"a": 1}),
    ]
    for ngrams, index in rejected:
        with pytest.raises(AssertionError):
            ExplicitIndexer(ngrams, ngram_index=index)