コード例 #1
0
    def __init__(self,
                 words: List[str],
                 indexer: Optional[FinalfusionHashIndexer] = None):
        """
        Build a FinalfusionBucketVocab from a word list and a subword indexer.

        When ``indexer`` is omitted, ``FinalfusionHashIndexer(21)`` is
        created and used in its place.

        Parameters
        ----------
        words : List[str]
            The in-vocabulary words. Entries must be unique. The list object
            itself is stored on the vocabulary; no copy is made here.
        indexer : FinalfusionHashIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2^21 buckets with range 3-6.

        Raises
        ------
        AssertionError
            If the indexer is not a FinalfusionHashIndexer or ``words``
            contains duplicate entries.
        """
        indexer = FinalfusionHashIndexer(21) if indexer is None else indexer
        # The type check runs before any attribute is set, so a wrong indexer
        # is reported ahead of any problem with the word list.
        assert isinstance(indexer, FinalfusionHashIndexer), \
            f"indexer needs to be FinalfusionHashIndexer, not {type(indexer)}"
        super().__init__()
        # Building the word -> index mapping also validates ``words``.
        self._index = _validate_items_and_create_index(words)
        self._words = words
        self._indexer = indexer
コード例 #2
0
def test_unknown_embeddings(embeddings_fifu, bucket_vocab_embeddings_fifu):
    """Exercise ``embedding`` lookups for words missing from the vocabulary."""
    oov = "OOV"

    # With ``embeddings_fifu`` an unknown word resolves to the given default.
    no_default = embeddings_fifu.embedding(oov)
    assert no_default is None, "Unknown lookup with no default failed"
    none_default = embeddings_fifu.embedding(oov, default=None)
    assert none_default is None, "Unknown lookup with 'None' default failed"
    zeros_default = embeddings_fifu.embedding(
        oov, default=np.zeros(10, dtype=np.float32))
    assert np.allclose(zeros_default, np.array([0.] * 10)), \
        "Unknown lookup with 'list' default failed"

    # When ``out`` is supplied, the default ends up in that buffer and the
    # very same array object is handed back.
    buffer = np.zeros(10, dtype=np.float32)
    ones = np.ones(10, dtype=np.float32)
    returned = embeddings_fifu.embedding(oov, default=ones, out=buffer)
    assert returned is buffer
    assert np.allclose(buffer, ones)
    returned = embeddings_fifu.embedding(oov, default=0, out=buffer)
    assert np.allclose(returned, 0)

    # With ``bucket_vocab_embeddings_fifu``, ``None`` is rejected and the
    # empty string falls back to the default.
    with pytest.raises(TypeError):
        _ = bucket_vocab_embeddings_fifu.embedding(None)
    assert bucket_vocab_embeddings_fifu.embedding("") is None
    assert bucket_vocab_embeddings_fifu.embedding("", default=1) == 1

    # A non-empty unknown word must equal the sum of the storage rows
    # selected by its subword indices, divided by that sum's norm; the
    # ``default`` argument is not used in this case.
    subword_rows = FinalfusionHashIndexer(10).subword_indices(oov, offset=2)
    expected = bucket_vocab_embeddings_fifu.storage[subword_rows].sum(axis=0)
    expected /= np.linalg.norm(expected)
    assert np.allclose(
        bucket_vocab_embeddings_fifu.embedding(oov, default=1), expected)
コード例 #3
0
ファイル: test_subwords.py プロジェクト: finalfusion/ffp
def test_subword_indices_finalfusion():
    """Check equality, hashing and bounds of ``FinalfusionHashIndexer(10)``."""
    # Sorted bucket indices expected for the subwords of "tübingen".
    expected_buckets = [
        12, 67, 72, 122, 166, 179, 195, 244, 248, 274, 298, 306, 323, 414, 547,
        588, 646, 649, 705, 715, 759, 815, 818, 855, 858, 1005
    ]
    indexer = FinalfusionHashIndexer(10)

    # Equality is reflexive and sensitive to every constructor argument.
    assert indexer == indexer
    assert indexer != FinalfusionHashIndexer(21)
    assert indexer != FinalfusionHashIndexer(min_n=2)
    assert indexer != FinalfusionHashIndexer(max_n=5)

    actual_buckets = sorted(indexer.subword_indices("tübingen"))
    assert actual_buckets == expected_buckets
    with pytest.raises(TypeError):
        _ = indexer.subword_indices(None)

    # Calling the indexer directly hashes a single n-gram.
    assert indexer("<tü") == 818
    assert indexer.buckets_exp == 10
    assert indexer.upper_bound == 2**10
    assert indexer.subword_indices("") == []
コード例 #4
0
def test_explicit_constructor():
    """Check construction, lookup, bounds, equality and repr of ExplicitVocab."""
    ngrams = [str(n) for n in range(10)]
    indexer = ExplicitIndexer(ngrams)
    words = [str(n) for n in range(10, 100)]
    vocab = ExplicitVocab(words, indexer=indexer)

    # Words are numbered in the order they were passed in.
    assert [vocab[word] for word in words] == list(range(90))

    # An ExplicitVocab refuses a hash-based indexer.
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(vocab.words, FinalfusionHashIndexer(21))

    assert len(vocab) == 90
    # The upper bound covers the 90 words plus the 10 explicit n-grams.
    assert vocab.upper_bound == len(vocab) + 10
    assert vocab == vocab
    assert vocab in vocab
    assert vocab != SimpleVocab(vocab.words)
    assert vocab != FastTextVocab(vocab.words, FastTextIndexer(20))

    actual_repr = repr(vocab)
    # Only the f-prefixed pieces are interpolated; the last two literals are
    # plain strings, so their braces are compared verbatim (doubled).
    expected_repr = (f"ExplicitVocab(\n"
                     f"\tindexer={vocab.subword_indexer!r}\n"
                     "\twords=[...]\n"
                     "\tword_index={{...}})")
    assert actual_repr == expected_repr
コード例 #5
0
def test_fasttext_constructor():
    """Check construction, lookup, bounds, equality and repr of FastTextVocab."""
    words = [str(n) for n in range(10)]
    vocab = FastTextVocab(words)

    # Words are numbered in the order they were passed in.
    assert [vocab[word] for word in words] == list(range(10))

    # Duplicate words and a non-fastText indexer are both rejected.
    with pytest.raises(AssertionError):
        _ = FastTextVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FastTextVocab(vocab.words, FinalfusionHashIndexer(21))

    assert len(vocab) == 10
    # The default indexer contributes 2,000,000 indices on top of the words.
    assert vocab.upper_bound == len(vocab) + 2_000_000
    assert vocab == vocab
    assert vocab in vocab
    assert vocab != SimpleVocab(vocab.words)
    assert vocab != FastTextVocab(vocab.words, FastTextIndexer(20))

    actual_repr = repr(vocab)
    # Only the f-prefixed pieces are interpolated; the last two literals are
    # plain strings, so their braces are compared verbatim (doubled).
    expected_repr = (f"FastTextVocab(\n"
                     f"\tindexer={vocab.subword_indexer!r}\n"
                     "\twords=[...]\n"
                     "\tword_index={{...}})")
    assert actual_repr == expected_repr
コード例 #6
0
def test_fifu_buckets_constructor():
    """Check construction, lookup, bounds, equality and repr of FinalfusionBucketVocab."""
    words = [str(n) for n in range(10)]
    vocab = FinalfusionBucketVocab(words)

    # Words are numbered in the order they were passed in.
    assert [vocab[word] for word in words] == list(range(10))

    # Duplicate words and a fastText indexer are both rejected.
    with pytest.raises(AssertionError):
        _ = FinalfusionBucketVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FinalfusionBucketVocab(vocab.words, FastTextIndexer(21))

    assert len(vocab) == 10
    # The default indexer contributes 2^21 bucket indices on top of the words.
    assert vocab.upper_bound == len(vocab) + 2 ** 21
    assert vocab == vocab
    assert vocab in vocab
    assert vocab != SimpleVocab(vocab.words)
    assert vocab != FinalfusionBucketVocab(vocab.words,
                                           FinalfusionHashIndexer(20))

    actual_repr = repr(vocab)
    expected_repr = ("FinalfusionBucketVocab(\n"
                     f"\tn_words={len(vocab)},\n"
                     f"\tupper_bound={len(vocab) + 2 ** 21},\n"
                     f"\tindexer={vocab.subword_indexer!r}\n"
                     ")")
    assert actual_repr == expected_repr
コード例 #7
0
 def read_chunk(file: BinaryIO) -> 'FinalfusionBucketVocab':
     """
     Read a FinalfusionBucketVocab from ``file``.

     A four-field header is read first, in the order word count, ``min_n``,
     ``max_n`` and bucket exponent, followed by that many words.

     Parameters
     ----------
     file : BinaryIO
         Binary stream to read from.

     Returns
     -------
     FinalfusionBucketVocab
         Vocabulary over the words read, with a FinalfusionHashIndexer built
         from the header values.
     """
     # NOTE(review): "<QIII" is presumably a ``struct`` format (little-endian
     # u64 followed by three u32) -- confirm in ``_read_required_binary``.
     n_words, min_n, max_n, buckets_exp = _read_required_binary(file, "<QIII")
     return FinalfusionBucketVocab(
         _read_items(file, n_words),
         FinalfusionHashIndexer(buckets_exp, min_n, max_n))