Code example #1
def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)
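The constructor checks exercised above boil down to: storage and vocab must both be present and agree on the number of rows. A minimal construction-and-lookup sketch, with import paths assumed from finalfusion-python (they may differ in ffp):

import numpy as np
# Import paths assumed; adjust to the installed package layout.
from finalfusion import Embeddings
from finalfusion.storage import NdArray
from finalfusion.vocab import SimpleVocab

matrix = np.arange(12, dtype=np.float32).reshape(3, 4)
embeds = Embeddings(storage=NdArray(matrix), vocab=SimpleVocab(["a", "b", "c"]))

assert np.allclose(embeds["b"], matrix[1])             # lookup via indexing
assert np.allclose(embeds.embedding("c"), matrix[2])   # lookup via embedding()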
Code example #2
 def write(self, path: str, embeddings: Embeddings):
     """
     Helper to write different Formats
     """
     if self == Format.finalfusion:
         embeddings.write(path)
     elif self == Format.word2vec:
         write_word2vec(path, embeddings)
     elif self == Format.text:
         write_text(path, embeddings)
     elif self == Format.textdims:
         write_text_dims(path, embeddings)
     elif self == Format.fasttext:
         write_fasttext(path, embeddings)
     else:
         raise ValueError(f"Unknown format {str(self)}")
Code example #3
File: fasttext.py  Project: finalfusion/ffp
def load_fasttext(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings from a file in fastText format.

    The returned embeddings have a FastTextVocab, NdArray storage and a Norms chunk.

    Loading embeddings with this method will precompute embeddings for each word by averaging all
    of its subword embeddings together with the distinct word vector. Additionally, all precomputed
    vectors are l2-normalized and the corresponding norms are stored in the Norms. The subword
    embeddings are **not** l2-normalized.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in fastText binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _read_ft_header(inf)
        metadata = _read_ft_cfg(inf)
        vocab = _read_ft_vocab(inf, metadata['buckets'], metadata['min_n'],
                               metadata['max_n'])
        storage = _read_ft_storage(inf, vocab)
        norms = _normalize_ndarray_storage(storage[:len(vocab)])
    return Embeddings(storage=storage,
                      vocab=vocab,
                      norms=norms,
                      metadata=metadata)
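A small sketch of what the normalization described in the docstring means in practice. The file name is a placeholder, the import path is assumed, and the tuple-unpacking of embedding_with_norm plus vocab.words on a FastTextVocab are assumptions carried over from the other examples:

import numpy as np
# Import path assumed; adjust to the installed package layout.
from finalfusion.fasttext import load_fasttext

embeds = load_fasttext("model.bin")       # placeholder fastText binary model
word = embeds.vocab.words[0]              # any in-vocabulary word
vec, norm = embeds.embedding_with_norm(word)

# Precomputed in-vocabulary vectors are l2-normalized on load ...
assert np.isclose(np.linalg.norm(vec), 1.0, atol=1e-5)
# ... and multiplying by the stored norm recovers the original magnitude.
original = vec * norm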
Code example #4
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    Files are expected to start with a line containing rows and cols in ASCII. Words are encoded
    in utf-8 followed by a single whitespace. After the whitespace, the embedding components are
    expected as little-endian single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
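To make the format description above concrete, here is a minimal sketch that writes a tiny file in that layout with plain numpy and reads it back. The file name and the import path of load_word2vec are assumptions:

import numpy as np
# Import path assumed; adjust to the installed package layout.
from finalfusion.word2vec import load_word2vec

words = ["hello", "world"]
vectors = np.arange(10, dtype="<f4").reshape(2, 5)  # little-endian float32

with open("tiny.w2v", "wb") as out:
    # Header line: rows and cols.
    out.write(f"{len(words)} {vectors.shape[1]}\n".encode("ascii"))
    for word, vec in zip(words, vectors):
        # UTF-8 word, a single space, then the raw little-endian floats.
        out.write(word.encode("utf-8") + b" " + vec.tobytes() + b"\n")

embeds = load_word2vec("tiny.w2v")
# Per the docstring, the storage is l2-normalized and the norms are kept.
assert np.allclose(np.linalg.norm(embeds.storage, axis=1), 1.0)
assert np.allclose(embeds["world"] * np.asarray(embeds.norms)[1], vectors[1])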
Code example #5
File: text.py  Project: finalfusion/ffp
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
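_load_text expects one word per line followed by its whitespace-separated components; rows and cols are supplied by the caller (in the library, presumably parsed from the file or its dims header). A minimal sketch feeding it an in-memory file:

import io
import numpy as np

text = "foo 0.0 1.0 2.0\nbar 3.0 4.0 5.0\n"
embeds = _load_text(io.StringIO(text), rows=2, cols=3)

assert embeds.vocab.words == ["foo", "bar"]
# Rows are l2-normalized on load; the original lengths live in the norms.
assert np.allclose(np.linalg.norm(embeds.storage, axis=1), 1.0)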
Code example #6
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage),
                        SimpleVocab(vocab),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words}{text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
Code example #7
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,
        metavar="WORDS",
        help=
        "List of words to include in the embeddings. One word per line. Spaces permitted."
        "Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help=
        "Print which tokens are skipped because they can't be represented to stderr."
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
        matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                          dtype=np.float32)
        vocab = SimpleVocab(list(unique_words))
        for i, word in enumerate(vocab):
            try:
                matrix[i] = embeds[word]
            except KeyError:
                if args.verbose or not args.ignore_unk:
                    print(f"Cannot represent '{word}'.", file=sys.stderr)
                if not args.ignore_unk:
                    sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
Code example #8
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    for line in sys.stdin:
        print(" ".join(map(lambda v: str(v), embeds.embedding(line.strip()))))
Code example #9
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    for line in sys.stdin:
        for result in embeds.similarity(line.strip()):
            print("%s\t%.2f" % (result.word, result.similarity))
Code example #10
#!/usr/bin/env python3

import sys

from finalfusion import Embeddings

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s embeddings\n" % sys.argv[0])
        sys.exit(1)

    embeds = Embeddings(sys.argv[1])

    print(embeds.metadata)
Code example #11
def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")
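For contrast with the failure case above, a sketch of the successful path when norms are supplied at construction time (mirroring the non-ASCII roundtrip test earlier). The import paths and the tuple return of embedding_with_norm are assumptions based on the other examples:

import numpy as np
# Import paths assumed; adjust to the installed package layout.
from finalfusion import Embeddings
from finalfusion.norms import Norms
from finalfusion.storage import NdArray
from finalfusion.vocab import SimpleVocab

matrix = np.ones((2, 4), dtype=np.float32)
norms = np.linalg.norm(matrix, axis=1)
matrix /= norms[:, None]

embeds = Embeddings(storage=NdArray(matrix),
                    vocab=SimpleVocab(["foo", "bar"]),
                    norms=Norms(norms))
# With a Norms chunk present, no TypeError is raised.
vec, norm = embeds.embedding_with_norm("foo")
assert np.isclose(np.linalg.norm(vec), 1.0)
assert np.isclose(norm, norms[0])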
Code example #12
 def write_fifu(path: Union[str, bytes, int, PathLike],
                embeddings: Embeddings):
     embeddings.write(path)