def test_slicing():
    """Slicing a Norms must mirror slicing its backing ndarray, and a
    zero step must raise ValueError just as it does for ndarrays."""
    data = np.float32(np.random.random_sample(10))
    wrapped = Norms(data)
    assert np.allclose(data[:], wrapped[:])
    assert np.allclose(data, wrapped)
    bound = len(data) * 3
    for _ in range(250):
        stop = np.random.randint(-bound, bound)
        start = np.random.randint(-bound, bound)
        stride = np.random.randint(-bound, bound)
        # A zero step is invalid for both sides; otherwise nothing raises.
        guard = contextlib.suppress() if stride != 0 else pytest.raises(
            ValueError)
        assert np.allclose(data[:stop], wrapped[:stop])
        assert np.allclose(data[start:stop], wrapped[start:stop])
        with guard:
            sliced = wrapped[start:stop:stride]
        with guard:
            assert np.allclose(data[start:stop:stride], sliced)
        with guard:
            sliced = wrapped[:stop:stride]
        with guard:
            assert np.allclose(data[:stop:stride], sliced)
        with guard:
            sliced = wrapped[::stride]
        with guard:
            assert np.allclose(data[::stride], sliced)
def test_set_norms(embeddings_fifu):
    """The Embeddings.norms setter accepts a correctly sized Norms or None
    and rejects everything else with an AssertionError."""
    vocab_len = len(embeddings_fifu.vocab)
    replacement = Norms(np.ones(vocab_len, dtype=np.float32))
    embeddings_fifu.norms = replacement
    assert np.allclose(replacement, embeddings_fifu.norms)
    # Clearing the norms entirely is allowed.
    embeddings_fifu.norms = None
    assert embeddings_fifu.norms is None
    # Wrong type: an arbitrary string.
    with pytest.raises(AssertionError):
        embeddings_fifu.norms = "bla"
    # Wrong type: a bare ndarray instead of a Norms wrapper.
    with pytest.raises(AssertionError):
        embeddings_fifu.norms = np.ones(vocab_len, dtype=np.float32)
    # Wrong length: one entry short.
    with pytest.raises(AssertionError):
        embeddings_fifu.norms = Norms(
            np.ones(vocab_len - 1, dtype=np.float32))
    # Wrong length: one entry too many.
    with pytest.raises(AssertionError):
        embeddings_fifu.norms = Norms(
            np.ones(vocab_len + 1, dtype=np.float32))
    # None of the failed assignments may have taken effect.
    assert embeddings_fifu.norms is None
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.

    Raises
    ------
    FinalfusionFormatError
        If a required chunk is missing or an unexpected chunk is found.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None

        # An optional metadata chunk may precede the vocab chunk.
        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)

        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')

        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')

        # An optional norms chunk may trail the storage chunk.
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                # BUG FIX: the original interpolated `chunk_id`, which still
                # held the *storage* chunk id here, so the error reported the
                # wrong chunk. Report the chunk that was actually read.
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')

        return Embeddings(storage, vocab, norms, metadata, inf.name)
def test_iter_sliced():
    """Iterating a sliced Norms yields the same rows as slicing the ndarray.

    BUG FIX: the original unpacked the zip into ``norms_row, norms_row`` —
    both loop targets were the same name, so the assertion compared a value
    with itself and could never fail. The loop now binds the Norms row and
    the ndarray row to distinct names.
    """
    norms = np.float32(np.random.random_sample(10))
    s = Norms(norms)
    for _ in range(250):
        upper = np.random.randint(-len(norms) * 3, len(norms) * 3)
        lower = np.random.randint(-len(norms) * 3, len(norms) * 3)
        step = np.random.randint(-len(norms) * 3, len(norms) * 3)
        if step == 0:
            # A zero step cannot produce a slice to iterate.
            continue
        for s_row, norms_row in zip(s[lower:upper:step],
                                    norms[lower:upper:step]):
            assert np.allclose(s_row, norms_row)
def test_write_sliced(tmp_path):
    """Randomly sliced Norms survive a write/load round-trip intact."""
    filename = tmp_path / "write_sliced.fifu"
    data = np.float32(np.random.random_sample(10))
    wrapped = Norms(data)
    bound = len(data) * 3
    for _ in range(250):
        stop = np.random.randint(-bound, bound)
        start = np.random.randint(-bound, bound)
        stride = np.random.randint(-bound, bound)
        if stride == 0:
            # A zero step is not a valid slice, so there is nothing to write.
            continue
        wrapped[start:stop:stride].write(filename)
        reloaded = load_norms(filename)
        assert np.allclose(data[start:stop:stride], reloaded)
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    """A vocab word that is non-ASCII whitespace (NO-BREAK SPACE) must
    survive a round-trip through the text format unchanged."""
    words = ["\u00A0"]
    matrix = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(matrix, axis=1)
    matrix /= norms[:, None]
    embeds = Embeddings(NdArray(matrix),
                        SimpleVocab(words),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words}{text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
def test_slice_slice():
    """Repeatedly re-slicing a Norms stays in lockstep with re-slicing
    the underlying ndarray, including ValueError on a zero step."""
    for _ in range(250):
        data = np.float32(np.random.random_sample(100))
        view = Norms(data)
        assert np.allclose(data[:], view[:])
        assert np.allclose(data, view)
        for _ in range(5):
            if len(data) == 0:
                # Nothing left to slice (and randint needs a nonempty range).
                break
            span = len(data) * 2
            stop = np.random.randint(-span, span)
            start = np.random.randint(-span, span)
            stride = np.random.randint(-span, span)
            guard = contextlib.suppress() if stride != 0 else pytest.raises(
                ValueError)
            with guard:
                data = data[start:stop:stride]
            with guard:
                view = view[start:stop:stride]
            assert np.allclose(data, view)
def _normalize_matrix(storage: np.ndarray) -> Norms:
    """L2-normalize each row of *storage* in place and return the row norms.

    NOTE(review): a row with zero norm would divide by zero here —
    presumably callers guarantee nonzero rows; confirm at call sites.
    """
    row_norms = np.linalg.norm(storage, axis=1)
    storage /= row_norms[:, None]
    return Norms(row_norms)
def _normalize_ndarray_storage(storage: NdArray) -> Norms:
    """Normalize the rows of an NdArray storage in place; return their norms.

    NOTE(review): like ``_normalize_matrix``, a zero-norm row would divide
    by zero — presumably callers guarantee nonzero rows.
    """
    lengths = np.linalg.norm(storage, axis=1)
    storage /= lengths[:, None]
    return Norms(lengths)