Beispiel #1
0
def test_embeddings_from_vocab_and_storage():
    """Check construction of Embeddings from a storage and a vocab.

    A storage and a vocab of equal length must be accepted; a missing
    component or a length mismatch is expected to raise AssertionError.
    """
    expected = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    storage = NdArray(expected)
    vocab = SimpleVocab([str(idx) for idx in range(len(storage))])
    embeddings = Embeddings(storage=storage, vocab=vocab)
    assert np.allclose(embeddings.storage, expected)
    assert np.allclose(storage, expected)

    # Leaving out either component is expected to fail.
    with pytest.raises(AssertionError):
        Embeddings(storage=storage, vocab=None)
    with pytest.raises(AssertionError):
        Embeddings(storage=None, vocab=vocab)

    # Storage with one row fewer than the vocab has words.
    with pytest.raises(AssertionError):
        Embeddings(storage=storage[:-1], vocab=vocab)

    # Storage with one row more than the vocab has words.
    with pytest.raises(AssertionError):
        oversized = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        Embeddings(storage=NdArray(oversized), vocab=vocab)
Beispiel #2
0
 def write(self, path: str, embeddings: Embeddings):
     """
     Write ``embeddings`` to ``path`` using the writer for this format.

     Parameters
     ----------
     path : str
         Destination handed to the format-specific writer.
     embeddings : Embeddings
         The embeddings to write.

     Raises
     ------
     ValueError
         If this value matches none of the formats handled here.
     """
     if self == Format.finalfusion:
         embeddings.write(path)
         return
     if self == Format.word2vec:
         write_word2vec(path, embeddings)
         return
     if self == Format.text:
         write_text(path, embeddings)
         return
     if self == Format.textdims:
         write_text_dims(path, embeddings)
         return
     if self == Format.fasttext:
         write_fasttext(path, embeddings)
         return
     raise ValueError(f"Unknown format {str(self)}")
Beispiel #3
0
def load_fasttext(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Load embeddings from a file in fastText format.

    The result carries a FastTextVocab, an NdArray storage and a Norms chunk.

    While loading, an embedding is precomputed for every in-vocabulary word by
    averaging all of its subword embeddings together with the distinct word
    vector. These precomputed vectors are l2-normalized and their norms are
    kept in the Norms chunk. The subword embeddings are **not** l2-normalized.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in fastText format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as stream:
        # The sections are read sequentially from the same stream, so the
        # order of these calls must not change.
        _read_ft_header(stream)
        config = _read_ft_cfg(stream)
        vocab = _read_ft_vocab(stream,
                               config['buckets'],
                               config['min_n'],
                               config['max_n'])
        storage = _read_ft_storage(stream, vocab)
        # Only the leading len(vocab) rows (the word vectors) are normalized.
        word_rows = storage[:len(vocab)]
        norms = _normalize_ndarray_storage(word_rows)
    return Embeddings(storage=storage,
                      vocab=vocab,
                      norms=norms,
                      metadata=config)
Beispiel #4
0
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Load embeddings from a file in word2vec binary format.

    The result carries a SimpleVocab, an NdArray storage and a Norms chunk.
    The storage is l2-normalized and the original norms are kept in the Norms
    chunk.

    The file has to start with a header line holding the number of rows and
    columns (decoded as ASCII). Each entry consists of the word followed by a
    single space and then the embedding components as little-endian
    single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    # Components are stored little-endian; a big-endian host has to swap them.
    swap_bytes = sys.byteorder == "big"
    with open(file, 'rb') as stream:
        header = stream.readline().decode("ascii")
        rows, cols = (int(field) for field in header.split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        vocab_words = []
        for target in matrix:
            word = _read_binary_word(stream, b' ')
            vocab_words.append(word.strip())
            values = np.fromfile(file=stream, count=cols, dtype=np.float32)
            if swap_bytes:
                values.byteswap(inplace=True)
            # ``target`` is a view of one matrix row, so this fills the matrix.
            target[:] = values
    storage = NdArray(matrix)
    norms = _normalize_ndarray_storage(storage)
    return Embeddings(storage=storage,
                      norms=norms,
                      vocab=SimpleVocab(vocab_words))
Beispiel #5
0
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    """
    Read embeddings from the lines of an open text file.

    Each line is split on ``_ASCII_WHITESPACE_PAT``; the first field is the
    word and the remaining fields are its embedding components. At most
    ``rows`` lines are consumed.

    Parameters
    ----------
    file : TextIO
        Text stream positioned at the first embedding line.
    rows : int
        Number of rows allocated for the embedding matrix.
    cols : int
        Number of components per embedding.

    Returns
    -------
    embeddings : Embeddings
        Embeddings with a SimpleVocab, an NdArray storage and the norms
        returned by ``_normalize_ndarray_storage``.
    """
    matrix = np.zeros((rows, cols), dtype=np.float32)
    vocab_words = []
    # ``matrix`` comes first in the zip so that no extra line is pulled from
    # ``file`` once all rows have been filled.
    for target, line in zip(matrix, file):
        word, *components = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        vocab_words.append(word)
        # Assigning the string fields converts them to float32 in place.
        target[:] = components
    storage = NdArray(matrix)
    norms = _normalize_ndarray_storage(storage)
    return Embeddings(storage=storage,
                      norms=norms,
                      vocab=SimpleVocab(vocab_words))
Beispiel #6
0
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    """A vocab entry consisting of U+00A0 must survive a text roundtrip."""
    words = ["\u00A0"]
    matrix = np.ones((1, 5), dtype=np.float32)
    row_norms = np.linalg.norm(matrix, axis=1)
    # Normalize in place so the stored vectors match the recorded norms.
    matrix /= row_norms[:, None]
    original = Embeddings(NdArray(matrix),
                          SimpleVocab(words),
                          norms=Norms(row_norms))

    target = tmp_path / "non-ascii.txt"
    write_text(target, original)
    reloaded = load_text(target)

    assert original.vocab == reloaded.vocab, f'{original.vocab.words}{reloaded.vocab.words}'
    assert np.allclose(original.storage, reloaded.storage)
    assert np.allclose(original.norms, reloaded.norms)
Beispiel #7
0
def main() -> None:  # pylint: disable=missing-function-docstring
    """
    Entry point of ``ffp-select``.

    Loads the input embeddings, reads a list of words (one per line, from the
    ``WORDS`` file or from stdin), looks up each distinct word and writes the
    selected embeddings to the output file.

    A word whose lookup raises ``KeyError`` aborts the program with exit
    status 1, unless ``--ignore_unk`` is given; in that case the word is left
    out of the output entirely.
    """
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        # File descriptor 0: ``open`` below then reads from stdin.
        default=0,
        metavar="WORDS",
        help=
        "List of words to include in the embeddings. One word per line. Spaces permitted. "
        "Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help=
        "Print which tokens are skipped because they can't be represented to stderr."
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        # De-duplicate while keeping first-occurrence order, so the output
        # does not depend on set iteration order.
        unique_words = list(dict.fromkeys(word.strip() for word in inp))

    # Allocate for the worst case (every word representable); rows are filled
    # densely and the unused tail is cut off afterwards.
    matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                      dtype=np.float32)
    selected = []
    for word in unique_words:
        try:
            matrix[len(selected)] = embeds[word]
        except KeyError:
            if args.verbose or not args.ignore_unk:
                print(f"Cannot represent '{word}'.", file=sys.stderr)
            if not args.ignore_unk:
                sys.exit(1)
            # Skipped words must not end up in the vocab with a zero vector.
            continue
        selected.append(word)
    matrix = matrix[:len(selected)]
    vocab = SimpleVocab(selected)

    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    # Exactly one command-line argument (the embeddings file) is expected.
    argument_count = len(sys.argv) - 1
    if argument_count != 1:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    # One query per stdin line; print its embedding components separated by
    # single spaces.
    for line in sys.stdin:
        query = line.strip()
        components = [str(value) for value in embeds.embedding(query)]
        print(" ".join(components))
Beispiel #9
0
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    # Exactly one command-line argument (the embeddings file) is expected.
    argument_count = len(sys.argv) - 1
    if argument_count != 1:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    # One query per stdin line; print each result as "<word>\t<similarity>"
    # with the similarity rounded to two decimals.
    for line in sys.stdin:
        query = line.strip()
        for result in embeds.similarity(query):
            row = "%s\t%.2f" % (result.word, result.similarity)
            print(row)
Beispiel #10
0
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    # Exactly one command-line argument (the embeddings file) is expected.
    argument_count = len(sys.argv) - 1
    if argument_count != 1:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    # Print the metadata attached to the embeddings.
    print(embeds.metadata)
Beispiel #11
0
def test_no_norms(vocab_array_tuple):
    """Without norms, ``embedding_with_norm`` is expected to raise TypeError."""
    words, array = vocab_array_tuple
    without_norms = Embeddings(vocab=SimpleVocab(words),
                               storage=NdArray(array))
    with pytest.raises(TypeError):
        without_norms.embedding_with_norm("bla")
Beispiel #12
0
 def write_fifu(
         path: Union[str, bytes, int, PathLike],
         embeddings: Embeddings,
 ):
     """
     Write ``embeddings`` to ``path`` by delegating to ``Embeddings.write``.

     Takes ``(path, embeddings)`` in that order, the same order the other
     ``write_*`` helpers in this package are called with.
     """
     embeddings.write(path)