コード例 #1
0
ファイル: fasttext.py プロジェクト: finalfusion/ffp
def _read_ft_subwordvocab(file: BinaryIO, buckets: int, min_n: int, max_n: int,
                          vocab_size: int) -> FastTextVocab:
    """
    Build a FastTextVocab from a fastText file.

    ``vocab_size`` words are read from ``file`` with ``_read_binary_word``;
    the subword indexer is a ``FastTextIndexer`` constructed from
    ``buckets``, ``min_n`` and ``max_n``.
    """
    # Arguments are evaluated left to right, so all words are consumed from
    # the file before the indexer is constructed.
    return FastTextVocab(
        [_read_binary_word(file) for _ in range(vocab_size)],
        FastTextIndexer(buckets, min_n, max_n),
    )
コード例 #2
0
def test_fasttext_constructor():
    """
    Check construction, lookup, bounds, equality and repr of FastTextVocab.
    """
    v = FastTextVocab([str(n) for n in range(10)])

    # Each word maps to its position in the word list.
    looked_up = [v[str(n)] for n in range(10)]
    assert looked_up == list(range(10))

    # Duplicate words are expected to be rejected.
    with pytest.raises(AssertionError):
        _ = FastTextVocab(["a", "a"])
    # An indexer of a different kind is expected to be rejected as well.
    with pytest.raises(AssertionError):
        _ = FastTextVocab(v.words, FinalfusionHashIndexer(21))

    assert len(v) == 10
    assert v.upper_bound == len(v) + 2_000_000

    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))

    # Only the indexer line is interpolated; the remaining pieces are plain
    # literals, so the doubled braces are compared verbatim.
    expected_repr = "".join([
        "FastTextVocab(\n",
        f"\tindexer={repr(v.subword_indexer)}\n",
        "\twords=[...]\n",
        "\tword_index={{...}})",
    ])
    assert repr(v) == expected_repr
コード例 #3
0
def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Chunks are consumed in a fixed order: header, optional metadata, vocab,
    storage and, if any chunk follows the storage, norms.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.

    Raises
    ------
    FinalfusionFormatError
        If the chunk found at the vocab, storage or norms position carries
        an identifier that is not valid for that position.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None

        # Metadata is optional; when present it precedes the vocab chunk.
        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)

        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')

        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')

        # Norms are optional: a missing header (None) means there is no
        # further chunk to read.
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            trailing_chunk_id = maybe_chunk_id[0]
            if trailing_chunk_id == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                # Report the identifier that was just read, not the storage
                # chunk identifier still held in ``chunk_id``.
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(trailing_chunk_id)}')

        return Embeddings(storage, vocab, norms, metadata, inf.name)
コード例 #4
0
ファイル: fasttext.py プロジェクト: finalfusion/ffp
def _precompute_word_vecs(vocab: FastTextVocab, matrix: np.ndarray):
    """
    Precompute word vectors in place.

    For every word in ``vocab``, the row of ``matrix`` at the word's position
    is overwritten with the mean of that row and the rows selected by the
    word's subword indices.
    """
    # The vocab type does not change while iterating, so check it once.
    has_subwords = isinstance(vocab, FastTextVocab)
    for row, token in enumerate(vocab):
        rows = [row]
        if has_subwords:
            rows.extend(
                cast(List[int],
                     vocab.subword_indices(token, with_ngrams=False)))
        matrix[row] = matrix[rows].mean(axis=0, keepdims=False)
コード例 #5
0
def test_explicit_constructor():
    """
    Check construction, lookup, bounds, equality and repr of ExplicitVocab.
    """
    ngram_indexer = ExplicitIndexer([str(n) for n in range(10)])
    v = ExplicitVocab([str(n) for n in range(10, 100)], indexer=ngram_indexer)

    # Words "10".."99" map to positions 0..89.
    looked_up = [v[str(n)] for n in range(10, 100)]
    assert looked_up == list(range(90))

    # An indexer of a different kind is expected to be rejected.
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(v.words, FinalfusionHashIndexer(21))

    assert len(v) == 90
    assert v.upper_bound == len(v) + 10

    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))

    # Only the indexer line is interpolated; the remaining pieces are plain
    # literals, so the doubled braces are compared verbatim.
    expected_repr = "".join([
        "ExplicitVocab(\n",
        f"\tindexer={repr(v.subword_indexer)}\n",
        "\twords=[...]\n",
        "\tword_index={{...}})",
    ])
    assert repr(v) == expected_repr
コード例 #6
0
def _read_ft_vocab(file: BinaryIO, buckets: int, min_n: int, max_n: int,
                   lossy: bool) -> Union[FastTextVocab, SimpleVocab]:
    """
    Read the vocab of a fastText file and return it as a FastTextVocab.

    Raises NotImplementedError if the label count is nonzero or if the
    prune index size read from the file is not negative.
    """
    # Three little-endian ints: vocab size, n_words (unused), n_labels.
    vocab_size, _, n_labels = _read_required_binary(file, "<iii")
    if n_labels:
        raise NotImplementedError(
            "fastText prediction models are not supported")

    # n_tokens is read and thrown away.
    _read_required_binary(file, "<q")

    prune_idx_size = _read_required_binary(file, "<q")[0]
    if not prune_idx_size < 0:
        raise NotImplementedError("Pruned vocabs are not supported")

    # Arguments are evaluated left to right, so all words are consumed from
    # the file before the indexer is constructed.
    return FastTextVocab(
        [_read_binary_word(file, lossy) for _ in range(vocab_size)],
        FastTextIndexer(buckets, min_n, max_n),
    )
コード例 #7
0
def test_fasttext_vocab_roundtrip(tmp_path):
    """
    Write a FastTextVocab to disk and check that loading it back yields an
    equal vocab.
    """
    target = tmp_path / "write_ft_vocab.fifu"
    written = FastTextVocab([str(n) for n in range(10)])
    written.write(target)
    restored = load_vocab(target)
    assert written == restored