def __init__(self, words: List[str], indexer: Optional[FinalfusionHashIndexer] = None):
    """
    Initialize a FinalfusionBucketVocab.

    Initializes the vocabulary with the given words.

    If no indexer is passed, a FinalfusionHashIndexer with bucket exponent
    21 is used.

    The word list cannot contain duplicate entries.

    Parameters
    ----------
    words : List[str]
        List of unique words
    indexer : FinalfusionHashIndexer, optional
        Subword indexer to use for the vocabulary. Defaults to an indexer
        with 2^21 buckets and n-gram range 3-6.

    Raises
    ------
    AssertionError
        If the indexer is not a FinalfusionHashIndexer or ``words`` contains
        duplicate entries.
    """
    if indexer is None:
        indexer = FinalfusionHashIndexer(21)
    assert isinstance(indexer, FinalfusionHashIndexer), \
        f"indexer needs to be FinalfusionHashIndexer, not {type(indexer)}"
    super().__init__()
    self._index = _validate_items_and_create_index(words)
    self._words = words
    self._indexer = indexer
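# Usage sketch (illustrative, not taken from the library's documentation;
# assumes FinalfusionBucketVocab and FinalfusionHashIndexer are imported from
# the finalfusion package):
#
#     >>> vocab = FinalfusionBucketVocab(["the", "quick", "fox"])
#     >>> vocab["the"]                  # known words map to their list index
#     0
#     >>> vocab.upper_bound             # len(vocab) + 2 ** 21 default buckets
#     2097155
#     >>> FinalfusionBucketVocab(["the", "quick", "fox"],
#     ...                        FinalfusionHashIndexer(20)).upper_bound
#     1048579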
def test_unknown_embeddings(embeddings_fifu, bucket_vocab_embeddings_fifu):
    # Lookups of unknown words without a usable default return None.
    assert embeddings_fifu.embedding(
        "OOV") is None, "Unknown lookup with no default failed"
    assert embeddings_fifu.embedding(
        "OOV", default=None) is None, "Unknown lookup with 'None' default failed"
    # An array-like default is returned for unknown words.
    assert np.allclose(
        embeddings_fifu.embedding("OOV",
                                  default=np.zeros(10, dtype=np.float32)),
        np.array([0.] * 10)), "Unknown lookup with 'list' default failed"
    # With an output buffer, the default is written into `out` and `out` is
    # returned.
    out = np.zeros(10, dtype=np.float32)
    default = np.ones(10, dtype=np.float32)
    out2 = embeddings_fifu.embedding("OOV", default=default, out=out)
    assert out is out2
    assert np.allclose(out, default)
    out2 = embeddings_fifu.embedding("OOV", default=0, out=out)
    assert np.allclose(out2, 0)
    with pytest.raises(TypeError):
        _ = bucket_vocab_embeddings_fifu.embedding(None)
    assert bucket_vocab_embeddings_fifu.embedding("") is None
    assert bucket_vocab_embeddings_fifu.embedding("", default=1) == 1
    # For a bucket vocab, an unknown word's embedding is the normalized sum of
    # its subword bucket rows, with the offset shifting bucket indices past the
    # in-vocabulary rows.
    oov_indices = FinalfusionHashIndexer(10).subword_indices("OOV", offset=2)
    summed_rows = bucket_vocab_embeddings_fifu.storage[oov_indices].sum(axis=0)
    summed_rows /= np.linalg.norm(summed_rows)
    assert np.allclose(
        bucket_vocab_embeddings_fifu.embedding("OOV", default=1), summed_rows)
def test_subword_indices_finalfusion():
    # Expected bucket indices for the 26 n-grams (lengths 3-6) of "<tübingen>"
    # under an indexer with 2^10 buckets.
    tuebingen_buckets = [
        12, 67, 72, 122, 166, 179, 195, 244, 248, 274, 298, 306, 323, 414,
        547, 588, 646, 649, 705, 715, 759, 815, 818, 855, 858, 1005
    ]
    idx = FinalfusionHashIndexer(10)
    assert idx == idx
    assert idx != FinalfusionHashIndexer(21)
    assert idx != FinalfusionHashIndexer(min_n=2)
    assert idx != FinalfusionHashIndexer(max_n=5)
    assert sorted(idx.subword_indices("tübingen")) == tuebingen_buckets
    with pytest.raises(TypeError):
        _ = idx.subword_indices(None)
    assert idx("<tü") == 818
    assert idx.buckets_exp == 10
    assert idx.upper_bound == 2**10
    assert idx.subword_indices("") == []
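# A minimal sketch of the n-gram extraction that the subword indexers operate
# on, written here for illustration only (the `ngrams` helper below is not part
# of the library): the word is bracketed with '<' and '>' and all substrings
# with lengths in [min_n, max_n] are taken. The bucket assignment itself is
# done by the indexer's hash function and is not reproduced here.
def ngrams(word: str, min_n: int = 3, max_n: int = 6):
    bracketed = f"<{word}>"
    return [
        bracketed[i:i + n]
        # n-gram lengths from min_n to max_n, inclusive
        for n in range(min_n, max_n + 1)
        for i in range(len(bracketed) - n + 1)
    ]

# "<tübingen>" yields 26 such n-grams, matching the 26 expected bucket indices
# above; "<tü" is one of its 3-grams, consistent with idx("<tü") == 818
# appearing in the list.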
def test_explicit_constructor():
    i = ExplicitIndexer([str(i) for i in range(10)])
    v = ExplicitVocab([str(i) for i in range(10, 100)], indexer=i)
    assert [v[str(i)] for i in range(10, 100)] == [i for i in range(90)]
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 90
    assert v.upper_bound == len(v) + 10
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == f"ExplicitVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"
def test_fasttext_constructor():
    v = FastTextVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == [i for i in range(10)]
    with pytest.raises(AssertionError):
        v = FastTextVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FastTextVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 10
    assert v.upper_bound == len(v) + 2_000_000
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == f"FastTextVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"
def test_fifu_buckets_constructor():
    v = FinalfusionBucketVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == [i for i in range(10)]
    with pytest.raises(AssertionError):
        v = FinalfusionBucketVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FinalfusionBucketVocab(v.words, FastTextIndexer(21))
    assert len(v) == 10
    assert v.upper_bound == len(v) + pow(2, 21)
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FinalfusionBucketVocab(v.words, FinalfusionHashIndexer(20))
    assert repr(v) == "FinalfusionBucketVocab(\n" \
                      f"\tn_words={len(v)},\n" \
                      f"\tupper_bound={len(v) + pow(2, 21)},\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      f")"
def read_chunk(file: BinaryIO) -> 'FinalfusionBucketVocab':
    # Chunk header: little-endian u64 word count followed by u32 min_n,
    # u32 max_n and u32 bucket exponent.
    length, min_n, max_n, buckets = _read_required_binary(file, "<QIII")
    words = _read_items(file, length)
    indexer = FinalfusionHashIndexer(buckets, min_n, max_n)
    return FinalfusionBucketVocab(words, indexer)
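# Illustrative companion to read_chunk above: a standalone sketch that spells
# out the "<QIII" header layout using struct directly. The _read_required_binary
# and _read_items helpers do the real work in the library; the function below
# is hypothetical and not part of it.
import struct
from typing import BinaryIO, Tuple

def _sketch_read_bucket_vocab_header(file: BinaryIO) -> Tuple[int, int, int, int]:
    # little-endian: u64 word count, u32 minimum n-gram length,
    # u32 maximum n-gram length, u32 bucket exponent (2 ** buckets_exp buckets)
    data = file.read(struct.calcsize("<QIII"))  # 8 + 3 * 4 = 20 bytes
    return struct.unpack("<QIII", data)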