Code example #1
def test_from_matrix():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    assert np.allclose(matrix, s)
    assert s.shape == matrix.shape
    with pytest.raises(AttributeError):
        _ = NdArray(None)
    with pytest.raises(TypeError):
        _ = NdArray(np.arange(0, 10, dtype=np.float32))
    with pytest.raises(TypeError):
        _ = NdArray(np.tile(np.arange(0, 10), (10, 1)))
    with pytest.raises(TypeError):
        # np.float was removed in NumPy 1.24; float64 input must still raise TypeError
        _ = NdArray(np.tile(np.arange(0, 10, dtype=np.float64), (10, 1)))
    assert np.allclose(matrix, s)
Code example #2
def test_slicing():
    matrix = np.float32(np.random.random_sample((10, 10)))
    s = NdArray(matrix)
    assert np.allclose(matrix[:], s[:])
    assert np.allclose(matrix, s)

    for _ in range(250):
        upper = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        lower = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        step = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        ctx = pytest.raises(ValueError) if step == 0 else contextlib.suppress()

        assert np.allclose(matrix[:upper], s[:upper])
        assert np.allclose(matrix[lower:upper], s[lower:upper])
        with ctx:
            val = s[lower:upper:step]
        with ctx:
            assert np.allclose(matrix[lower:upper:step], val)
        with ctx:
            val = s[:upper:step]
        with ctx:
            assert np.allclose(matrix[:upper:step], val)
        with ctx:
            val = s[::step]
        with ctx:
            assert np.allclose(matrix[::step], val)
Code example #3
def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)
Code example #4
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized by default and the corresponding norms are stored in the Norms chunk.

    Files are expected to start with a line containing the number of rows and cols in utf-8.
    Words are encoded in utf-8, followed by a single whitespace. After the whitespace, the
    embedding components are expected as little-endian single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
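
A minimal usage sketch of load_word2vec. "vectors.bin" is a placeholder path, and the import path is an assumption, since this listing does not show the module layout of finalfusion/ffp:

# Hypothetical usage; the path and the import path are assumptions.
from ffp.embeddings import load_word2vec

embeds = load_word2vec("vectors.bin")
print(len(embeds.vocab.words), "embeddings loaded")  # vocab.words as in code example #8
print(embeds.storage.shape)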
Code example #5
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None

        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)

        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')

        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')

        return Embeddings(storage, vocab, norms, metadata, inf.name)
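
Under the same assumptions (placeholder path, assumed import path), a sketch of reading a finalfusion file with memory mapping enabled:

# Hypothetical usage; "vectors.fifu" is a placeholder path.
from ffp.embeddings import load_finalfusion

embeds = load_finalfusion("vectors.fifu", mmap=True)  # storage buffer is memory-mapped
print(type(embeds.vocab).__name__, type(embeds.storage).__name__)
if embeds.norms is not None:
    print("norms chunk present")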
Code example #6
File: text.py Project: finalfusion/ffp
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
Code example #7
def test_iter_sliced():
    matrix = np.float32(np.random.random_sample((10, 10)))
    s = NdArray(matrix)
    for _ in range(250):
        upper = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        lower = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        step = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        if step == 0:
            continue
        for storage_row, matrix_row in zip(s[lower:upper:step],
                                           matrix[lower:upper:step]):
            assert np.allclose(storage_row, matrix_row)
Code example #8
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage),
                        SimpleVocab(vocab),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words} != {text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
Code example #9
def test_indexing():
    matrix = np.float32(
        np.random.random_sample(sorted(np.random.randint(10, 100, 2))))
    s = NdArray(matrix)
    assert np.allclose(matrix, s)
    val = None  # avoid UnboundLocalError if the first sampled index is out of range
    for _ in range(1000):
        idx = np.random.randint(-len(s) * 2, len(s) * 2)
        if idx >= len(s) or idx < -len(s):
            ctx = pytest.raises(IndexError)
        else:
            ctx = contextlib.suppress()
        with ctx:
            val = s[idx]
        with ctx:
            assert np.allclose(val, matrix[idx])
Code example #10
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,
        metavar="WORDS",
        help=
        "List of words to include in the embeddings. One word per line. Spaces permitted."
        "Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help=
        "Print which tokens are skipped because they can't be represented to stderr."
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
        matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                          dtype=np.float32)
        vocab = SimpleVocab(list(unique_words))
        for i, word in enumerate(vocab):
            try:
                matrix[i] = embeds[word]
            except KeyError:
                if args.verbose or not args.ignore_unk:
                    print(f"Cannot represent '{word}'.", file=sys.stderr)
                if not args.ignore_unk:
                    sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
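
Assuming add_input_output_args registers positional INPUT and OUTPUT arguments (the helper is not shown in this listing), an invocation might look like the line below; words.txt holds one word per line, and omitting it falls back to stdin (the default of 0 is the stdin file descriptor):

ffp-select -f word2vec vectors.bin subset.fifu words.txt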
Code example #11
def test_write_sliced(tmp_path):
    matrix = np.float32(np.random.random_sample((10, 10)))
    s = NdArray(matrix)
    filename = tmp_path / "write_sliced.fifu"
    for _ in range(250):
        upper = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        lower = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        step = np.random.randint(-len(matrix) * 3, len(matrix) * 3)
        mmap = np.random.randint(0, 2)  # upper bound is exclusive; (0, 1) always yields 0
        if mmap and sys.byteorder == "big":
            continue
        if step == 0:
            continue
        s[lower:upper:step].write(filename)
        s2 = load_ndarray(filename, bool(mmap))
        assert np.allclose(matrix[lower:upper:step], s2)
Code example #12
File: fasttext.py Project: finalfusion/ffp
def _read_ft_storage(file: BinaryIO, vocab: Vocab) -> NdArray:
    """
    Helper method to read fastText storage.

    If vocab is a SimpleVocab, the matrix is read and returned as is.
    If vocab is a FastTextVocab, the word representations are precomputed based
    on the vocab.
    """
    quantized = _read_required_binary(file, "?")[0]
    if quantized:
        raise NotImplementedError(
            "Quantized storage is not supported for fastText models")
    rows, cols = _read_required_binary(file, "<qq")
    matrix = np.fromfile(file=file, count=rows * cols,
                         dtype=np.float32).reshape((rows, cols))
    if sys.byteorder == 'big':
        matrix.byteswap(inplace=True)
    if isinstance(vocab, FastTextVocab):
        _precompute_word_vecs(vocab, matrix)
    return NdArray(matrix)
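
_precompute_word_vecs itself is not shown in this listing. Below is a minimal sketch of the usual fastText scheme, in which each word row becomes the mean of its own vector and its subword n-gram vectors (stored after the len(vocab) word rows); subword_indices is a hypothetical stand-in for whatever lookup FastTextVocab actually provides:

# Sketch only: subword_indices() is hypothetical; the real FastTextVocab
# API for mapping a word to its n-gram rows may differ.
import numpy as np

def _precompute_word_vecs_sketch(vocab, matrix: np.ndarray) -> None:
    for i, word in enumerate(vocab):
        # n-gram vectors live after the len(vocab) word rows
        rows = [i] + [len(vocab) + idx for idx in vocab.subword_indices(word)]
        matrix[i] = matrix[rows].mean(axis=0, dtype=np.float32)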
Code example #13
def test_slice_slice():
    for _ in range(250):
        matrix = np.float32(np.random.random_sample((100, 10)))
        s = NdArray(matrix)
        assert np.allclose(matrix[:], s[:])
        assert np.allclose(matrix, s)
        for _ in range(5):
            if len(matrix) == 0:
                break
            upper = np.random.randint(-len(matrix) * 2, len(matrix) * 2)
            lower = np.random.randint(-len(matrix) * 2, len(matrix) * 2)
            step = np.random.randint(-len(matrix) * 2, len(matrix) * 2)
            ctx = pytest.raises(
                ValueError) if step == 0 else contextlib.suppress()
            with ctx:
                matrix = matrix[lower:upper:step]
            with ctx:
                s = s[lower:upper:step]
                assert isinstance(s, np.ndarray)
                assert isinstance(s, Storage)
                assert isinstance(s, NdArray)
            assert np.allclose(matrix, s)
Code example #14
    def bucket_to_explicit(self) -> 'Embeddings':
        """
        Bucket to explicit Embeddings conversion.

        Multiple embeddings can still map to the same bucket, but all buckets that are not
        indexed by in-vocabulary n-grams are eliminated. This can have a big impact on the
        size of the embedding matrix.

        Metadata is **not** copied to the new embeddings since it doesn't reflect the
        changes. You can manually set the metadata and update the values accordingly.

        Returns
        -------
        embeddings : Embeddings
            Embeddings with an ExplicitVocab instead of a hash-based vocabulary.

        Raises
        ------
        TypeError
            If the current vocabulary is not a hash-based vocabulary
            (FinalfusionBucketVocab or FastTextVocab)
        """
        bucket_vocabs = (FastTextVocab, FinalfusionBucketVocab)
        if not isinstance(self.vocab, bucket_vocabs):
            raise TypeError(
                "Only bucketed embeddings can be converted to explicit.")
        vocab = self.vocab.to_explicit()
        storage = np.zeros((vocab.upper_bound, self._storage.shape[1]),
                           dtype=np.float32)
        storage[:len(vocab)] = self._storage[:len(vocab)]
        for ngram in vocab.subword_indexer:
            storage[len(vocab) + vocab.subword_indexer[ngram]] = self._storage[
                len(vocab) + self.vocab.subword_indexer(ngram)]
        return Embeddings(vocab=vocab,
                          storage=NdArray(storage),
                          norms=self.norms)
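
A usage sketch, assuming embeddings with a bucketed vocabulary were loaded as in the load_finalfusion example above; both paths are placeholders:

# Hypothetical usage; paths are placeholders.
embeds = load_finalfusion("bucketed.fifu")
explicit = embeds.bucket_to_explicit()  # raises TypeError for non-bucket vocabs
explicit.write("explicit.fifu")         # write() as used in the ffp-select example
# Metadata is not copied (see the docstring above); re-attach it manually if needed.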
Code example #15
def test_iter():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    for storage_row, matrix_row in zip(s, matrix):
        assert np.allclose(storage_row, matrix_row)
Code example #16
def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")