Example no. 1
0
def test_slicing():
    """Slicing a Norms chunk mirrors slicing the backing ndarray.

    Random bounds and steps (including out-of-range and negative values)
    must produce identical results; a zero step must raise ValueError.
    """
    data = np.float32(np.random.random_sample(10))
    wrapped = Norms(data)
    assert np.allclose(data[:], wrapped[:])
    assert np.allclose(data, wrapped)

    bound = len(data) * 3
    for _ in range(250):
        upper = np.random.randint(-bound, bound)
        lower = np.random.randint(-bound, bound)
        step = np.random.randint(-bound, bound)
        # Only a zero step is illegal; every other slice is valid.
        if step == 0:
            ctx = pytest.raises(ValueError)
        else:
            ctx = contextlib.suppress()

        assert np.allclose(data[:upper], wrapped[:upper])
        assert np.allclose(data[lower:upper], wrapped[lower:upper])
        for key in (slice(lower, upper, step),
                    slice(None, upper, step),
                    slice(None, None, step)):
            with ctx:
                chunk = wrapped[key]
            with ctx:
                assert np.allclose(data[key], chunk)
Example no. 2
0
def test_set_norms(embeddings_fifu):
    """Assigning norms validates type and length; ``None`` clears them.

    A Norms of the right length is accepted; a raw ndarray, a non-array
    value, and Norms of the wrong length are all rejected with
    AssertionError and leave the norms unset.
    """
    vocab_len = len(embeddings_fifu.vocab)
    good = Norms(np.ones(vocab_len, dtype=np.float32))
    embeddings_fifu.norms = good
    assert np.allclose(good, embeddings_fifu.norms)

    embeddings_fifu.norms = None
    assert embeddings_fifu.norms is None

    rejected = [
        "bla",
        np.ones(vocab_len, dtype=np.float32),
        Norms(np.ones(vocab_len - 1, dtype=np.float32)),
        Norms(np.ones(vocab_len + 1, dtype=np.float32)),
    ]
    for bad in rejected:
        with pytest.raises(AssertionError):
            embeddings_fifu.norms = bad
    assert embeddings_fifu.norms is None
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    The expected chunk sequence is: header, optional metadata, vocab,
    storage, optional norms.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.

    Raises
    ------
    FinalfusionFormatError
        If the file does not contain the expected chunk sequence.
    """
    # Dispatch table: which reader handles each vocab chunk type.
    vocab_readers = {
        ChunkIdentifier.SimpleVocab: SimpleVocab.read_chunk,
        ChunkIdentifier.BucketSubwordVocab: FinalfusionBucketVocab.read_chunk,
        ChunkIdentifier.FastTextSubwordVocab: FastTextVocab.read_chunk,
        ChunkIdentifier.ExplicitSubwordVocab: ExplicitVocab.read_chunk,
    }
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None

        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)

        vocab_reader = vocab_readers.get(chunk_id)
        if vocab_reader is None:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')
        vocab = vocab_reader(inf)  # type: Vocab

        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')

        # The norms chunk is optional: EOF here is fine.
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                # Bug fix: report the chunk actually found here, not the
                # previously-read storage chunk id.
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')

        return Embeddings(storage, vocab, norms, metadata, inf.name)
Example no. 4
0
def test_iter_sliced():
    """Iterating a sliced Norms yields the same rows as slicing the array.

    Bug fix: the original loop unpacked both zip items into the same name
    (``norms_row, norms_row``), so the assertion compared a value with
    itself and could never fail. The two sides now have distinct names.
    """
    norms = np.float32(np.random.random_sample(10))
    s = Norms(norms)
    for _ in range(250):
        upper = np.random.randint(-len(norms) * 3, len(norms) * 3)
        lower = np.random.randint(-len(norms) * 3, len(norms) * 3)
        step = np.random.randint(-len(norms) * 3, len(norms) * 3)
        if step == 0:
            # Zero step is invalid for slicing; skip it.
            continue
        for s_row, np_row in zip(s[lower:upper:step],
                                 norms[lower:upper:step]):
            assert np.allclose(s_row, np_row)
Example no. 5
0
def test_write_sliced(tmp_path):
    """Sliced Norms survive a write/load round-trip unchanged."""
    out_path = tmp_path / "write_sliced.fifu"
    data = np.float32(np.random.random_sample(10))
    wrapped = Norms(data)
    limit = len(data) * 3
    for _ in range(250):
        upper = np.random.randint(-limit, limit)
        lower = np.random.randint(-limit, limit)
        step = np.random.randint(-limit, limit)
        if step == 0:
            # A zero step cannot produce a slice to write.
            continue
        wrapped[lower:upper:step].write(out_path)
        reloaded = load_norms(out_path)
        assert np.allclose(data[lower:upper:step], reloaded)
Example no. 6
0
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    """A vocab word that is a non-breaking space survives a text round-trip."""
    words = ["\u00A0"]
    matrix = np.ones((1, 5), dtype=np.float32)
    row_norms = np.linalg.norm(matrix, axis=1)
    matrix /= row_norms[:, None]
    original = Embeddings(NdArray(matrix),
                          SimpleVocab(words),
                          norms=Norms(row_norms))
    path = tmp_path / "non-ascii.txt"
    write_text(path, original)
    reread = load_text(path)
    assert original.vocab == reread.vocab, \
        f'{original.vocab.words}{reread.vocab.words}'
    assert np.allclose(original.storage, reread.storage)
    assert np.allclose(original.norms, reread.norms)
Example no. 7
0
def test_slice_slice():
    """Repeatedly re-slicing a Norms stays in sync with ndarray slicing.

    Both the ndarray and the Norms are narrowed with the same random
    slice several times in a row; a zero step must raise ValueError on
    both sides and leave both values untouched.
    """
    for _ in range(250):
        reference = np.float32(np.random.random_sample(100))
        sliced = Norms(reference)
        assert np.allclose(reference[:], sliced[:])
        assert np.allclose(reference, sliced)
        for _ in range(5):
            if not len(reference):
                # Nothing left to slice.
                break
            span = len(reference) * 2
            upper = np.random.randint(-span, span)
            lower = np.random.randint(-span, span)
            step = np.random.randint(-span, span)
            if step == 0:
                ctx = pytest.raises(ValueError)
            else:
                ctx = contextlib.suppress()
            with ctx:
                reference = reference[lower:upper:step]
            with ctx:
                sliced = sliced[lower:upper:step]
            assert np.allclose(reference, sliced)
Example no. 8
0
def _normalize_matrix(storage: np.ndarray) -> Norms:
    """L2-normalize each row of *storage* in place and return the row norms."""
    row_norms = np.linalg.norm(storage, axis=1)
    # Divide in place so callers observe the normalized matrix.
    np.divide(storage, row_norms[:, np.newaxis], out=storage)
    return Norms(row_norms)
Example no. 9
0
def _normalize_ndarray_storage(storage: NdArray) -> Norms:
    """L2-normalize the rows of an NdArray storage in place; return the norms."""
    lengths = np.linalg.norm(storage, axis=1)
    # In-place division keeps the NdArray wrapper's buffer updated.
    storage /= lengths[:, np.newaxis]
    return Norms(lengths)