Example #1
def test_simple_constructor():
    v = SimpleVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == list(range(10))
    with pytest.raises(AssertionError):
        SimpleVocab(["a"] * 2)
    assert len(v) == 10
    assert v.upper_bound == len(v)
Example #2
def test_simple_eq():
    v = SimpleVocab([str(i) for i in range(10)])
    assert v == v
    with pytest.raises(TypeError):
        _ = v > v
    with pytest.raises(TypeError):
        _ = v >= v
    with pytest.raises(TypeError):
        _ = v <= v
    with pytest.raises(TypeError):
        _ = v < v
    v2 = SimpleVocab([str(i + 1) for i in range(10)])
    assert v != v2
    assert v in v
Example #3
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage
    is l2-normalized by default and the corresponding norms are stored in the Norms chunk.

    Files are expected to start with a line containing rows and cols in utf-8. Each word is
    encoded in utf-8 and followed by a single space. After the space, the embedding components
    are expected as little-endian single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            words.append(_read_binary_word(inf, b' ').strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
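To make that layout concrete, here is a minimal sketch that writes a hypothetical two-word, three-dimensional file in this binary format and reads it back with load_word2vec; the path toy.w2v and the toy data are assumptions for illustration.

import numpy as np

# Hypothetical toy data written in the layout load_word2vec expects:
# a header line with rows and cols, then per word the utf-8 token,
# a single space, and cols little-endian single-precision floats.
vectors = {"hello": np.arange(3, dtype="<f4"), "world": np.ones(3, dtype="<f4")}
with open("toy.w2v", "wb") as out:
    out.write(f"{len(vectors)} 3\n".encode("utf-8"))
    for word, vec in vectors.items():
        out.write(word.encode("utf-8") + b" ")
        out.write(vec.tobytes())

embeddings = load_word2vec("toy.w2v")  # vectors come back l2-normalized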
Example #4
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None

        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)

        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')

        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')

        return Embeddings(storage, vocab, norms, metadata, inf.name)
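Read this way, a finalfusion file consists of the Header, an optional Metadata chunk, exactly one vocab chunk, exactly one storage chunk and an optional Norms chunk, in that order. A short usage sketch, assuming an existing file at a hypothetical path:

# Hypothetical path; mmap=True memory-maps the storage instead of reading it into memory.
embeddings = load_finalfusion("corpus.fifu", mmap=True)
vector = embeddings["berlin"]  # hypothetical query word, looked up through the vocab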
Example #5
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    """
    Read embeddings from a text file with one word per line followed by its embedding.
    """
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
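A minimal sketch of the input _load_text parses, using hypothetical toy data: one word per line followed by its whitespace-separated components, with rows and cols supplied by the caller.

import io

# Two words, three dimensions; the caller passes the dimensions explicitly.
buf = io.StringIO("hello 1.0 0.0 0.0\nworld 0.0 1.0 0.0\n")
embeddings = _load_text(buf, rows=2, cols=3)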
Example #6
def test_nonascii_whitespace_text_roundtrip(tmp_path):
    vocab = ["\u00A0"]
    storage = np.ones((1, 5), dtype=np.float32)
    norms = np.linalg.norm(storage, axis=1)
    storage /= norms[:, None]
    embeds = Embeddings(NdArray(storage),
                        SimpleVocab(vocab),
                        norms=Norms(norms))
    filename = tmp_path / "non-ascii.txt"
    write_text(filename, embeds)
    text = load_text(filename)
    assert embeds.vocab == text.vocab, f'{embeds.vocab.words} != {text.vocab.words}'
    assert np.allclose(embeds.storage, text.storage)
    assert np.allclose(embeds.norms, text.norms)
Example #7
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,
        metavar="WORDS",
        help=
        "List of words to include in the embeddings. One word per line. Spaces permitted. "
        "Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help=
        "Print to stderr which tokens are skipped because they cannot be represented."
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
        matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                          dtype=np.float32)
        vocab = SimpleVocab(list(unique_words))
        for i, word in enumerate(vocab):
            try:
                matrix[i] = embeds[word]
            except KeyError:
                if args.verbose or not args.ignore_unk:
                    print(f"Cannot represent '{word}'.", file=sys.stderr)
                if not args.ignore_unk:
                    sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
Example #8
def test_explicit_constructor():
    indexer = ExplicitIndexer([str(i) for i in range(10)])
    v = ExplicitVocab([str(i) for i in range(10, 100)], indexer=indexer)
    assert [v[str(i)] for i in range(10, 100)] == list(range(90))
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 90
    assert v.upper_bound == len(v) + 10
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == f"ExplicitVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"
Example #9
def test_embeddings_from_vocab_and_storage():
    matrix = np.tile(np.arange(0, 10, dtype=np.float32), (10, 1))
    s = NdArray(matrix)
    v = SimpleVocab([str(i) for i in range(len(s))])
    e = Embeddings(storage=s, vocab=v)
    assert np.allclose(e.storage, matrix)
    assert np.allclose(s, matrix)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s, vocab=None)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=None, vocab=v)
    with pytest.raises(AssertionError):
        _ = Embeddings(storage=s[:-1], vocab=v)
    with pytest.raises(AssertionError):
        matrix = np.tile(np.arange(0, 10, dtype=np.float32), (11, 1))
        _ = Embeddings(storage=NdArray(matrix), vocab=v)
Example #10
def test_fasttext_constructor():
    v = FastTextVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == list(range(10))
    with pytest.raises(AssertionError):
        v = FastTextVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FastTextVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 10
    assert v.upper_bound == len(v) + 2_000_000
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == f"FastTextVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"
Example #11
def test_fifu_buckets_constructor():
    v = FinalfusionBucketVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == list(range(10))
    with pytest.raises(AssertionError):
        v = FinalfusionBucketVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FinalfusionBucketVocab(v.words, FastTextIndexer(21))
    assert len(v) == 10
    assert v.upper_bound == len(v) + pow(2, 21)
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FinalfusionBucketVocab(v.words, FinalfusionHashIndexer(20))
    assert repr(v) == "FinalfusionBucketVocab(\n" \
                      f"\tn_words={len(v)},\n" \
                      f"\tupper_bound={len(v) + pow(2, 21)},\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      f")"
Example #12
def _read_ft_vocab(file: BinaryIO, buckets: int, min_n: int,
                   max_n: int) -> Union[FastTextVocab, SimpleVocab]:
    """
    Helper method to read a vocab from a fastText file.

    Returns a SimpleVocab if min_n is 0, otherwise a FastTextVocab.
    """
    # discard n_words
    vocab_size, _n_words, n_labels = _read_required_binary(file, "<iii")
    if n_labels:
        raise NotImplementedError(
            "fastText prediction models are not supported")
    # discard n_tokens
    _read_required_binary(file, "<q")

    prune_idx_size = _read_required_binary(file, "<q")[0]
    if prune_idx_size > 0:
        raise NotImplementedError("Pruned vocabs are not supported")

    if min_n:
        return _read_ft_subwordvocab(file, buckets, min_n, max_n, vocab_size)
    return SimpleVocab([_read_binary_word(file) for _ in range(vocab_size)])
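For reference, a sketch of the raw header fields this helper consumes, written out with the same little-endian struct formats; it assumes a hypothetical fastText file already positioned at the start of the vocab section.

import struct

with open("model.bin", "rb") as inf:  # hypothetical file, seeked to the vocab section
    vocab_size, n_words, n_labels = struct.unpack("<iii", inf.read(12))
    n_tokens, = struct.unpack("<q", inf.read(8))        # discarded above
    prune_idx_size, = struct.unpack("<q", inf.read(8))  # must be <= 0 here
    # vocab_size utf-8 encoded words follow.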
Example #13
def test_no_norms(vocab_array_tuple):
    vocab, matrix = vocab_array_tuple
    embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
    with pytest.raises(TypeError):
        _ = embeddings.embedding_with_norm("bla")