def test_simple_constructor():
    v = SimpleVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == list(range(10))
    with pytest.raises(AssertionError):
        SimpleVocab(["a"] * 2)
    assert len(v) == 10
    assert v.upper_bound == len(v)
def test_simple_eq():
    v = SimpleVocab([str(i) for i in range(10)])
    assert v == v
    with pytest.raises(TypeError):
        _ = v > v
    with pytest.raises(TypeError):
        _ = v >= v
    with pytest.raises(TypeError):
        _ = v <= v
    with pytest.raises(TypeError):
        _ = v < v
    v2 = SimpleVocab([str(i + 1) for i in range(10)])
    assert v != v2
    assert v in v
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms
    chunk. The storage is l2-normalized by default and the corresponding
    norms are stored in the Norms.

    Files are expected to start with a line containing rows and cols in
    utf-8. Words are encoded in utf-8 followed by a single whitespace. After
    the whitespace, the embedding components are expected as little-endian
    single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            # word2vec stores little-endian floats; swap on big-endian hosts.
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
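# Usage sketch for load_word2vec; "vectors.bin" is a placeholder path, not a
# file shipped with this project. Since the storage is l2-normalized on load,
# the original vector can be recovered by scaling with the stored norm.
def _example_load_word2vec() -> None:
    embeddings = load_word2vec("vectors.bin")
    word = embeddings.vocab.words[0]
    unit_vector = embeddings[word]
    # Undo the normalization applied at load time.
    original_vector = unit_vector * embeddings.norms[embeddings.vocab[word]]
    print(original_vector[:5])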
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None

        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)

        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')

        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')

        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')

        return Embeddings(storage, vocab, norms, metadata, inf.name)
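# Usage sketch for load_finalfusion; "embeddings.fifu" is a placeholder path.
# Passing mmap=True memory-maps the storage chunk rather than reading the
# full matrix into memory, which keeps start-up cheap for large files.
def _example_load_finalfusion() -> None:
    embeddings = load_finalfusion("embeddings.fifu", mmap=True)
    if embeddings.metadata is not None:
        print(embeddings.metadata)
    print(len(embeddings.vocab), embeddings.storage.shape)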
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    """Read `rows` lines of "word c1 ... cn" into normalized Embeddings."""
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
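# Input sketch for _load_text: each line holds a word followed by its
# components, separated by ASCII whitespace. io.StringIO stands in for a
# real text file here.
def _example_load_text() -> None:
    import io
    buf = io.StringIO("hello 0.1 0.2\nworld 0.3 0.4\n")
    embeddings = _load_text(buf, rows=2, cols=2)
    assert embeddings.vocab.words == ["hello", "world"]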
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage),
                        SimpleVocab(vocab),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words}{text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,  # 0 is the stdin file descriptor for open()
        metavar="WORDS",
        help="List of words to include in the embeddings. One word per "
        "line. Spaces permitted. Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help="Print to stderr which tokens are skipped because they can't "
        "be represented.")
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
    matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                      dtype=np.float32)
    vocab = SimpleVocab(list(unique_words))
    for i, word in enumerate(vocab):
        try:
            matrix[i] = embeds[word]
        except KeyError:
            if args.verbose or not args.ignore_unk:
                print(f"Cannot represent '{word}'.", file=sys.stderr)
            if not args.ignore_unk:
                sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
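# Invocation sketch for the ffp-select entry point above (all paths are
# placeholders; this assumes add_input_output_args registers INPUT and OUTPUT
# as positional arguments). Reading the word list from a file:
#
#   ffp-select big.fifu small.fifu words.txt
#
# Reading words from stdin and skipping unrepresentable ones:
#
#   printf 'door\nwindow\n' | ffp-select --ignore_unk big.fifu small.fifu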
def test_explicit_constructor():
    indexer = ExplicitIndexer([str(i) for i in range(10)])
    v = ExplicitVocab([str(i) for i in range(10, 100)], indexer=indexer)
    assert [v[str(i)] for i in range(10, 100)] == list(range(90))
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 90
    assert v.upper_bound == len(v) + 10
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == "ExplicitVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"
def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)
def test_fasttext_constructor():
    v = FastTextVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == list(range(10))
    with pytest.raises(AssertionError):
        FastTextVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FastTextVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 10
    assert v.upper_bound == len(v) + 2_000_000
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == "FastTextVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"
def test_fifu_buckets_constructor():
    v = FinalfusionBucketVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == list(range(10))
    with pytest.raises(AssertionError):
        FinalfusionBucketVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FinalfusionBucketVocab(v.words, FastTextIndexer(21))
    assert len(v) == 10
    assert v.upper_bound == len(v) + pow(2, 21)
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FinalfusionBucketVocab(v.words, FinalfusionHashIndexer(20))
    assert repr(v) == "FinalfusionBucketVocab(\n" \
                      f"\tn_words={len(v)},\n" \
                      f"\tupper_bound={len(v) + pow(2, 21)},\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      ")"
def _read_ft_vocab(file: BinaryIO, buckets: int, min_n: int,
                   max_n: int) -> Union[FastTextVocab, SimpleVocab]:
    """
    Helper method to read a vocab from a fastText file.

    Returns a SimpleVocab if min_n is 0, otherwise a FastTextVocab is
    returned.
    """
    # discard n_words
    vocab_size, _n_words, n_labels = _read_required_binary(file, "<iii")
    if n_labels:
        raise NotImplementedError(
            "fastText prediction models are not supported")
    # discard n_tokens
    _read_required_binary(file, "<q")
    prune_idx_size = _read_required_binary(file, "<q")[0]
    if prune_idx_size > 0:
        raise NotImplementedError("Pruned vocabs are not supported")
    if min_n:
        return _read_ft_subwordvocab(file, buckets, min_n, max_n, vocab_size)
    return SimpleVocab([_read_binary_word(file) for _ in range(vocab_size)])
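# Caller-side sketch (hypothetical helper, not part of the loader): only a
# FastTextVocab returned by _read_ft_vocab carries subword indices, so its
# upper_bound exceeds the number of in-vocab words by the bucket count.
def _example_vocab_kind(vocab: Union[FastTextVocab, SimpleVocab]) -> int:
    if isinstance(vocab, FastTextVocab):
        return vocab.upper_bound  # words + subword buckets
    return len(vocab)  # plain vocab: no subword range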
def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")