Code example #1
def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)
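The constructor checks exercised above boil down to: storage and vocab must both be present and agree on the number of rows. A minimal construction-and-lookup sketch, with import paths assumed from finalfusion-python (they may differ in ffp):

import numpy as np
# Import paths assumed; adjust to the installed package layout.
from finalfusion import Embeddings
from finalfusion.storage import NdArray
from finalfusion.vocab import SimpleVocab

matrix = np.arange(12, dtype=np.float32).reshape(3, 4)
embeds = Embeddings(storage=NdArray(matrix), vocab=SimpleVocab(["a", "b", "c"]))

assert np.allclose(embeds["b"], matrix[1])             # lookup via indexing
assert np.allclose(embeds.embedding("c"), matrix[2])   # lookup via embedding()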
Code example #2
 def write(self, path: str, embeddings: Embeddings):
     """
     Helper to write different Formats
     """
     if self == Format.finalfusion:
         embeddings.write(path)
     elif self == Format.word2vec:
         write_word2vec(path, embeddings)
     elif self == Format.text:
         write_text(path, embeddings)
     elif self == Format.textdims:
         write_text_dims(path, embeddings)
     elif self == Format.fasttext:
         write_fasttext(path, embeddings)
     else:
         raise ValueError(f"Unknown format {str(self)}")
Code example #3
File: fasttext.py  Project: finalfusion/ffp
def load_fasttext(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings from a file in fastText format.

    The returned embeddings have a FastTextVocab, NdArray storage and a Norms chunk.

    Loading embeddings with this method will precompute embeddings for each word by averaging all
    of its subword embeddings together with the distinct word vector. Additionally, all precomputed
    vectors are l2-normalized and the corresponding norms are stored in the Norms. The subword
    embeddings are **not** l2-normalized.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in fastText binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _read_ft_header(inf)
        metadata = _read_ft_cfg(inf)
        vocab = _read_ft_vocab(inf, metadata['buckets'], metadata['min_n'],
                               metadata['max_n'])
        storage = _read_ft_storage(inf, vocab)
        norms = _normalize_ndarray_storage(storage[:len(vocab)])
    return Embeddings(storage=storage,
                      vocab=vocab,
                      norms=norms,
                      metadata=metadata)
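A small sketch of what the normalization described in the docstring means in practice. The file name is a placeholder, the import path is assumed, and the tuple-unpacking of embedding_with_norm plus vocab.words on a FastTextVocab are assumptions carried over from the other examples:

import numpy as np
# Import path assumed; adjust to the installed package layout.
from finalfusion.fasttext import load_fasttext

embeds = load_fasttext("model.bin")       # placeholder fastText binary model
word = embeds.vocab.words[0]              # any in-vocabulary word
vec, norm = embeds.embedding_with_norm(word)

# Precomputed in-vocabulary vectors are l2-normalized on load ...
assert np.isclose(np.linalg.norm(vec), 1.0, atol=1e-5)
# ... and multiplying by the stored norm recovers the original magnitude.
original = vec * norm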
Code example #4
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    Files are expected to start with a line containing rows and cols in ASCII. Words are encoded
    in utf-8 followed by a single whitespace. After the whitespace, the embedding components are
    expected as little-endian single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
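To make the format description above concrete, here is a minimal sketch that writes a tiny file in that layout with plain numpy and reads it back. The file name and the import path of load_word2vec are assumptions:

import numpy as np
# Import path assumed; adjust to the installed package layout.
from finalfusion.word2vec import load_word2vec

words = ["hello", "world"]
vectors = np.arange(10, dtype="<f4").reshape(2, 5)  # little-endian float32

with open("tiny.w2v", "wb") as out:
    # Header line: rows and cols.
    out.write(f"{len(words)} {vectors.shape[1]}\n".encode("ascii"))
    for word, vec in zip(words, vectors):
        # UTF-8 word, a single space, then the raw little-endian floats.
        out.write(word.encode("utf-8") + b" " + vec.tobytes() + b"\n")

embeds = load_word2vec("tiny.w2v")
# Per the docstring, the storage is l2-normalized and the norms are kept.
assert np.allclose(np.linalg.norm(embeds.storage, axis=1), 1.0)
assert np.allclose(embeds["world"] * np.asarray(embeds.norms)[1], vectors[1])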
Code example #5
File: text.py  Project: finalfusion/ffp
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
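_load_text expects one word per line followed by its whitespace-separated components; rows and cols are supplied by the caller (in the library, presumably parsed from the file or its dims header). A minimal sketch feeding it an in-memory file:

import io
import numpy as np

text = "foo 0.0 1.0 2.0\nbar 3.0 4.0 5.0\n"
embeds = _load_text(io.StringIO(text), rows=2, cols=3)

assert embeds.vocab.words == ["foo", "bar"]
# Rows are l2-normalized on load; the original lengths live in the norms.
assert np.allclose(np.linalg.norm(embeds.storage, axis=1), 1.0)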
Code example #6
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage),
                        SimpleVocab(vocab),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words}{text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
Code example #7
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,
        metavar="WORDS",
        help=
        "List of words to include in the embeddings. One word per line. Spaces permitted."
        "Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help=
        "Print which tokens are skipped because they can't be represented to stderr."
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
        matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                          dtype=np.float32)
        vocab = SimpleVocab(list(unique_words))
        for i, word in enumerate(vocab):
            try:
                matrix[i] = embeds[word]
            except KeyError:
                if args.verbose or not args.ignore_unk:
                    print(f"Cannot represent '{word}'.", file=sys.stderr)
                if not args.ignore_unk:
                    sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
Code example #8
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    for line in sys.stdin:
        print(" ".join(map(lambda v: str(v), embeds.embedding(line.strip()))))
Code example #9
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    for line in sys.stdin:
        for result in embeds.similarity(line.strip()):
            print("%s\t%.2f" % (result.word, result.similarity))
Code example #10
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    print(embeds.metadata)
Code example #11
def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")
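For contrast with the failure case above, a sketch of the successful path when norms are supplied at construction time (mirroring the non-ASCII roundtrip test earlier). The import paths and the tuple return of embedding_with_norm are assumptions based on the other examples:

import numpy as np
# Import paths assumed; adjust to the installed package layout.
from finalfusion import Embeddings
from finalfusion.norms import Norms
from finalfusion.storage import NdArray
from finalfusion.vocab import SimpleVocab

matrix = np.ones((2, 4), dtype=np.float32)
norms = np.linalg.norm(matrix, axis=1)
matrix /= norms[:, None]

embeds = Embeddings(storage=NdArray(matrix),
                    vocab=SimpleVocab(["foo", "bar"]),
                    norms=Norms(norms))
# With a Norms chunk present, no TypeError is raised.
vec, norm = embeds.embedding_with_norm("foo")
assert np.isclose(np.linalg.norm(vec), 1.0)
assert np.isclose(norm, norms[0])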
Code example #12
 def write_fifu(path: Union[str, bytes, int, PathLike],
                embeddings: Embeddings):
     embeddings.write(path)