def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)
def write(self, path: str, embeddings: Embeddings):
    """
    Write embeddings to path in this format.
    """
    if self == Format.finalfusion:
        embeddings.write(path)
    elif self == Format.word2vec:
        write_word2vec(path, embeddings)
    elif self == Format.text:
        write_text(path, embeddings)
    elif self == Format.textdims:
        write_text_dims(path, embeddings)
    elif self == Format.fasttext:
        write_fasttext(path, embeddings)
    else:
        raise ValueError(f"Unknown format {self}")
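# Usage sketch (assumptions: Format is a str-valued Enum so Format("word2vec")
# selects the matching member, `embeds` holds already-loaded Embeddings, and
# the output paths are hypothetical). The write method above dispatches to
# the format-specific writer.
fmt = Format("word2vec")
fmt.write("vectors.w2v", embeds)                     # delegates to write_word2vec
Format("finalfusion").write("vectors.fifu", embeds)  # uses Embeddings.write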
def load_fasttext(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings from a file in fastText format.

    The returned embeddings have a FastTextVocab, NdArray storage and a
    Norms chunk.

    Loading embeddings with this method will precompute an embedding for
    each word by averaging all of its subword embeddings together with its
    distinct word vector. Additionally, all precomputed vectors are
    l2-normalized and the corresponding norms are stored in the Norms. The
    subword embeddings are **not** l2-normalized.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in fastText binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _read_ft_header(inf)
        metadata = _read_ft_cfg(inf)
        vocab = _read_ft_vocab(inf, metadata['buckets'], metadata['min_n'],
                               metadata['max_n'])
        storage = _read_ft_storage(inf, vocab)
        norms = _normalize_ndarray_storage(storage[:len(vocab)])
    return Embeddings(storage=storage,
                      vocab=vocab,
                      norms=norms,
                      metadata=metadata)
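# Usage sketch for load_fasttext (the model path is hypothetical). In-vocab
# lookups return the precomputed, l2-normalized vectors; out-of-vocabulary
# words are assembled from their subword n-gram embeddings.
embeds = load_fasttext("cc.en.300.bin")
in_vocab_vec = embeds.embedding("berlin")
oov_vec = embeds.embedding("berlinification")  # backed by subword lookups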
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms
    chunk. The storage is l2-normalized by default and the corresponding
    norms are stored in the Norms.

    Files are expected to start with a line containing rows and cols in
    utf-8. Words are encoded in utf-8 followed by a single whitespace. After
    the whitespace, the embedding components are expected as little-endian
    single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
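# Usage sketch for load_word2vec (the file path is hypothetical). Since the
# storage is l2-normalized on load, the original vector can be recovered by
# scaling with the stored norm.
embeds = load_word2vec("vectors.bin")
unit_vec = embeds.embedding("berlin")             # l2-normalized row
vec, norm = embeds.embedding_with_norm("berlin")  # vector plus stored norm
original_vec = vec * norm                         # undo the normalization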
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
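# In-module sketch of the layout _load_text expects: one word per line,
# followed by its whitespace-separated vector components. The returned
# storage is l2-normalized, with the original norms kept in the Norms chunk.
from io import StringIO

_example = StringIO("hello 0.1 0.2 0.3\nworld 0.4 0.5 0.6\n")
_embeds = _load_text(_example, rows=2, cols=3)
assert _embeds.vocab.words == ["hello", "world"]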
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage), SimpleVocab(vocab), norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words} != {text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,
        metavar="WORDS",
        help="List of words to include in the embeddings. One word per "
        "line. Spaces permitted. Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help="Print which tokens are skipped because they can't be "
        "represented to stderr.")
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
    matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                      dtype=np.float32)
    vocab = SimpleVocab(list(unique_words))
    for i, word in enumerate(vocab):
        try:
            matrix[i] = embeds[word]
        except KeyError:
            if args.verbose or not args.ignore_unk:
                print(f"Cannot represent '{word}'.", file=sys.stderr)
            if not args.ignore_unk:
                sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
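# Programmatic sketch of the selection step performed by main() above, using
# only the calls it already makes (assumes `embeds` was loaded beforehand and
# every requested word is representable; names here are hypothetical).
def select_embeddings(embeds, words):
    vocab = SimpleVocab(words)
    matrix = np.zeros((len(words), embeds.storage.shape[1]), dtype=np.float32)
    for i, word in enumerate(vocab):  # iterating a vocab yields its words
        matrix[i] = embeds[word]      # raises KeyError if unrepresentable
    return Embeddings(storage=NdArray(matrix), vocab=vocab)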
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    for line in sys.stdin:
        print(" ".join(map(str, embeds.embedding(line.strip()))))
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    for line in sys.stdin:
        for result in embeds.similarity(line.strip()):
            print("%s\t%.2f" % (result.word, result.similarity))
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])
    print(embeds.metadata)
def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")
def write_fifu(path: Union[str, bytes, int, PathLike],
               embeddings: Embeddings):
    """
    Write embeddings to the given path in finalfusion format.
    """
    embeddings.write(path)