def test_from_matrix():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    assert np.allclose(matrix, s)
    assert s.shape == matrix.shape
    with pytest.raises(AttributeError):
        _ = NdArray(None)
    with pytest.raises(TypeError):
        # 1-d input is rejected, storage must be a matrix
        _ = NdArray(np.arange(0, 10, dtype=np.float32))
    with pytest.raises(TypeError):
        # the default integer dtype is rejected
        _ = NdArray(np.tile(np.arange(0, 10), (10, 1)))
    with pytest.raises(TypeError):
        # float64 is rejected, only float32 is accepted
        _ = NdArray(np.tile(np.arange(0, 10, dtype=np.float64), (10, 1)))
    assert np.allclose(matrix, s)

def test_slicing():
    matrix = np.float32(np.random.random_sample((10, 10)))
    s = NdArray(matrix)
    assert np.allclose(matrix[:], s[:])
    assert np.allclose(matrix, s)
    for _ in range(250):
        upper = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        lower = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        step = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        # slicing with step 0 must raise ValueError, everything else passes
        ctx = pytest.raises(ValueError) if step == 0 else contextlib.suppress()
        assert np.allclose(matrix[:upper], s[:upper])
        assert np.allclose(matrix[lower:upper], s[lower:upper])
        with ctx:
            val = s[lower:upper:step]
        with ctx:
            assert np.allclose(matrix[lower:upper:step], val)
        with ctx:
            val = s[:upper:step]
        with ctx:
            assert np.allclose(matrix[:upper:step], val)
        with ctx:
            val = s[::step]
        with ctx:
            assert np.allclose(matrix[::step], val)

def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        # vocab and storage need to have matching lengths
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)

def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms
    chunk. The storage is l2-normalized by default and the corresponding
    norms are stored in the Norms.

    Files are expected to start with a line containing rows and cols in
    utf-8. Words are encoded in utf-8 followed by a single whitespace. After
    the whitespace, the embedding components are expected as little-endian
    single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))

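# Hedged usage sketch for load_word2vec; "vectors.bin" is a placeholder path,
# not a file shipped with this project. It shows that the returned storage is
# l2-normalized while the original vector lengths are kept in the Norms chunk.
def _example_load_word2vec() -> None:
    embeddings = load_word2vec("vectors.bin")
    vector = embeddings["word"]  # normalized embedding for an in-vocab word
    print(vector.shape, embeddings.norms[0])
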
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None
        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')
        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')
        return Embeddings(storage, vocab, norms, metadata, inf.name)

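# Hedged usage sketch for load_finalfusion; "embeddings.fifu" is a placeholder
# path. With mmap=True the storage chunk is memory-mapped rather than read
# into memory, which keeps start-up cheap for large matrices.
def _example_load_finalfusion() -> None:
    embeddings = load_finalfusion("embeddings.fifu", mmap=True)
    print(type(embeddings.vocab).__name__, embeddings.storage.shape)
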
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        # split on ASCII whitespace only so words containing non-ASCII
        # whitespace (e.g. non-breaking spaces) survive the roundtrip
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))

def test_iter_sliced():
    matrix = np.float32(np.random.random_sample((10, 10)))
    s = NdArray(matrix)
    for _ in range(250):
        upper = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        lower = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        step = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        if step == 0:
            continue
        for storage_row, matrix_row in zip(s[lower:upper:step],
                                           matrix[lower:upper:step]):
            assert np.allclose(storage_row, matrix_row)

def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]  # single word consisting of a non-breaking space
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage),
                        SimpleVocab(vocab),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, \
        f'{embeds.vocab.words} != {text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)

def test_indexing():
    matrix = np.float32(
        np.random.random_sample(sorted(np.random.randint(10, 100, 2))))
    s = NdArray(matrix)
    assert np.allclose(matrix, s)
    for _ in range(1000):
        idx = np.random.randint(-len(s) * 2, len(s) * 2)
        if idx >= len(s) or idx < -len(s):
            ctx = pytest.raises(IndexError)
        else:
            ctx = contextlib.suppress()
        with ctx:
            val = s[idx]
        with ctx:
            assert np.allclose(val, matrix[idx])

def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,
        metavar="WORDS",
        help="List of words to include in the embeddings. One word per line. "
        "Spaces permitted. Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help="Print to stderr which tokens are skipped because they can't "
        "be represented.")
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
    matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                      dtype=np.float32)
    vocab = SimpleVocab(list(unique_words))
    for i, word in enumerate(vocab):
        try:
            matrix[i] = embeds[word]
        except KeyError:
            if args.verbose or not args.ignore_unk:
                print(f"Cannot represent '{word}'.", file=sys.stderr)
            if not args.ignore_unk:
                sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)

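# Hedged CLI sketch for ffp-select; the file names are placeholders and the
# INPUT/OUTPUT positionals are assumed to come from add_input_output_args:
#
#     ffp-select -f finalfusion big.fifu small.fifu words.txt
#     cat words.txt | ffp-select big.fifu small.fifu
#
# Without --ignore_unk the tool aborts on the first unrepresentable word;
# with --ignore_unk it skips such words, and --verbose reports each skipped
# word on stderr.
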
def test_write_sliced(tmp_path):
    matrix = np.float32(np.random.random_sample((10, 10)))
    s = NdArray(matrix)
    filename = tmp_path / "write_sliced.fifu"
    for _ in range(250):
        upper = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        lower = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        step = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        # toggle memory mapping; randint's upper bound is exclusive
        mmap = np.random.randint(0, 2)
        if mmap and sys.byteorder == "big":
            continue
        if step == 0:
            continue
        s[lower:upper:step].write(filename)
        s2 = load_ndarray(filename, bool(mmap))
        assert np.allclose(matrix[lower:upper:step], s2)

def _read_ft_storage(file: BinaryIO, vocab: Vocab) -> NdArray:
    """
    Helper method to read fastText storage.

    If vocab is a SimpleVocab, the matrix is read and returned as is.
    If vocab is a FastTextVocab, the word representations are precomputed
    based on the vocab.
    """
    quantized = _read_required_binary(file, "?")[0]
    if quantized:
        raise NotImplementedError(
            "Quantized storage is not supported for fastText models")
    rows, cols = _read_required_binary(file, "<qq")
    matrix = np.fromfile(file=file, count=rows * cols,
                         dtype=np.float32).reshape((rows, cols))
    if sys.byteorder == 'big':
        matrix.byteswap(inplace=True)
    if isinstance(vocab, FastTextVocab):
        _precompute_word_vecs(vocab, matrix)
    return NdArray(matrix)

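# For context, a hedged sketch of the precomputation step: in fastText, an
# in-vocabulary word's vector is the mean of its own row and its subword
# n-gram rows, so _precompute_word_vecs presumably folds the n-gram rows into
# the word rows once, making later lookups a single row read. Assuming a
# hypothetical indices(word) that yields the word's row plus its n-gram rows:
#
#     for i, word in enumerate(vocab):
#         matrix[i] = matrix[indices(word)].mean(0)
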
def test_slice_slice():
    for _ in range(250):
        matrix = np.float32(np.random.random_sample((100, 10)))
        s = NdArray(matrix)
        assert np.allclose(matrix[:], s[:])
        assert np.allclose(matrix, s)
        for _ in range(5):
            if len(matrix) == 0:
                break
            upper = np.random.randint(-len(matrix) * 2, len(matrix) * 2)
            lower = np.random.randint(-len(matrix) * 2, len(matrix) * 2)
            step = np.random.randint(-len(matrix) * 2, len(matrix) * 2)
            ctx = pytest.raises(
                ValueError) if step == 0 else contextlib.suppress()
            with ctx:
                matrix = matrix[lower:upper:step]
            with ctx:
                s = s[lower:upper:step]
                assert isinstance(s, np.ndarray)
                assert isinstance(s, Storage)
                assert isinstance(s, NdArray)
                assert np.allclose(matrix, s)

def bucket_to_explicit(self) -> 'Embeddings':
    """
    Bucket to explicit Embeddings conversion.

    Multiple embeddings can still map to the same bucket, but all buckets
    that are not indexed by in-vocabulary n-grams are eliminated. This can
    have a big impact on the size of the embedding matrix.

    Metadata is **not** copied to the new embeddings since it doesn't
    reflect the changes. You can manually set the metadata and update the
    values accordingly.

    Returns
    -------
    embeddings : Embeddings
        Embeddings with an ExplicitVocab instead of a hash-based vocabulary.

    Raises
    ------
    TypeError
        If the current vocabulary is not a hash-based vocabulary
        (FinalfusionBucketVocab or FastTextVocab)
    """
    bucket_vocabs = (FastTextVocab, FinalfusionBucketVocab)
    if not isinstance(self.vocab, bucket_vocabs):
        raise TypeError(
            "Only bucketed embeddings can be converted to explicit.")
    vocab = self.vocab.to_explicit()
    storage = np.zeros((vocab.upper_bound, self._storage.shape[1]),
                       dtype=np.float32)
    # copy the word rows as-is, then copy each n-gram's bucket row to its
    # explicit index
    storage[:len(vocab)] = self._storage[:len(vocab)]
    for ngram in vocab.subword_indexer:
        storage[len(vocab) + vocab.subword_indexer[ngram]] = self._storage[
            len(vocab) + self.vocab.subword_indexer(ngram)]
    return Embeddings(vocab=vocab,
                      storage=NdArray(storage),
                      norms=self.norms)

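# Hedged usage sketch for bucket_to_explicit; "bucketed.fifu" is a placeholder
# for embeddings with a FinalfusionBucketVocab or FastTextVocab. The result
# swaps the hash-based subword indexer for an explicit n-gram table.
def _example_bucket_to_explicit() -> None:
    embeds = load_finalfusion("bucketed.fifu")
    explicit = embeds.bucket_to_explicit()
    explicit.write("explicit.fifu")
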
def test_iter():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    for storage_row, matrix_row in zip(s, matrix):
        assert np.allclose(storage_row, matrix_row)

def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")