Code example #1
 def write_chunk(self, file: BinaryIO):
     # Write the chunk identifier, then the chunk header (chunk length,
     # number of float32 values, type id), padded so that the array data
     # starts on a float32 boundary, followed by the data in little-endian
     # byte order.
     _write_binary(file, "<I", int(self.chunk_identifier()))
     padding = _pad_float32(file.tell())
     chunk_len = struct.calcsize("<QI") + padding + struct.calcsize(
         f"<{self.size}f")
     _write_binary(file, f"<QQI{padding}x", chunk_len, self.size,
                   int(TypeId.f32))
     _serialize_array_as_le(file, self)
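
The chunk writers in these examples rely on a few small helpers (`_write_binary`, `_pad_float32`, `_serialize_array_as_le`) that are not shown on this page. The following is only a minimal sketch of how such helpers could look, inferred from how they are used above; the actual implementations in finalfusion/ffp may differ.

import struct
import sys
from typing import BinaryIO

import numpy as np


def _write_binary(file: BinaryIO, fmt: str, *values) -> None:
    # Pack the values with the given struct format and write them out.
    file.write(struct.pack(fmt, *values))


def _pad_float32(pos: int) -> int:
    # Padding bytes required to reach the next float32 (4-byte) boundary.
    float_size = struct.calcsize("<f")
    return (float_size - pos % float_size) % float_size


def _serialize_array_as_le(file: BinaryIO, array: np.ndarray) -> None:
    # Write the array in little-endian byte order regardless of platform.
    if sys.byteorder == "big":
        array = array.byteswap()
    array.tofile(file)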
Code example #2
 def write_chunk(self, file: BinaryIO):
     _write_binary(file, "<I", int(self.chunk_identifier()))
     padding = _pad_float32(file.tell())
     chunk_len = struct.calcsize("<QII") + padding + struct.calcsize(
         f'<{self.size}f')
     # pylint: disable=unpacking-non-sequence
     rows, cols = self.shape
     _write_binary(file, "<QQII", chunk_len, rows, cols, int(TypeId.f32))
     _write_binary(file, f"{padding}x")
     _serialize_array_as_le(file, self)
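
To make the header layout concrete, here is a sketch of reading back the body of such a chunk, derived purely from the pack formats used in the writer above. The chunk identifier is assumed to have been consumed already; this is not the authoritative finalfusion reader.

import struct
from typing import BinaryIO

import numpy as np


def read_ndarray_chunk_body(file: BinaryIO) -> np.ndarray:
    # Header as written above: chunk length (u64), rows (u64), cols (u32)
    # and the type id (u32), all little-endian.
    header_fmt = "<QQII"
    _chunk_len, rows, cols, _type_id = struct.unpack(
        header_fmt, file.read(struct.calcsize(header_fmt)))
    # Skip the alignment padding that precedes the float32 data.
    padding = (4 - file.tell() % 4) % 4
    file.read(padding)
    data = np.frombuffer(file.read(4 * rows * cols), dtype="<f4")
    return data.reshape(rows, cols)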
Code example #3
File: fasttext.py Project: finalfusion/ffp
def write_fasttext(file: Union[str, bytes, int, PathLike], embeds: Embeddings):
    """
    Write embeddings in fastText format.

    fastText requires Metadata with all expected keys for fastText configs:
        * dims: int (inferred from model)
        * window_size: int (default -1)
        * min_count: int (default -1)
        * ns: int (default -1)
        * word_ngrams: int (default 1)
        * loss: one of ``['HierarchicalSoftmax', 'NegativeSampling', 'Softmax']`` (default Softmax)
        * model: one of ``['CBOW', 'SkipGram', 'Supervised']`` (default SkipGram)
        * buckets: int (inferred from model)
        * min_n: int (inferred from model)
        * max_n: int (inferred from model)
        * lr_update_rate: int (default -1)
        * sampling_threshold: float (default -1)

    ``dims``, ``buckets``, ``min_n`` and ``max_n`` are inferred from the model. If any of
    the other numerical fields are unspecified, a default value of ``-1`` is used. ``loss``
    defaults to ``Softmax`` and ``model`` to ``SkipGram``. Unknown values for ``loss`` and
    ``model`` are overwritten with these defaults, since the model would otherwise be
    incompatible with fastText.

    Some information from the original fastText model is lost, e.g.:
        * word frequencies
        * n_tokens

    Embeddings are un-normalized before serialization: if norms are present, each embedding
    is scaled by its associated norm. Additionally, the original state of the embedding
    matrix is restored: the precomputation and l2-normalization of word embeddings are undone.

    Only embeddings with a FastTextVocab or SimpleVocab can be serialized to this format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeds : Embeddings
        Embeddings to write
    """
    with open(file, 'wb') as outf:
        if not isinstance(embeds.vocab, (FastTextVocab, SimpleVocab)):
            raise ValueError(
                f'Expected FastTextVocab or SimpleVocab, not: {type(embeds.vocab).__name__}'
            )
        _write_binary(outf, "<ii", _FT_MAGIC, 12)
        _write_ft_cfg(outf, embeds)
        _write_ft_vocab(outf, embeds.vocab)
        _write_binary(outf, "<?QQ", 0, *embeds.storage.shape)
        if isinstance(embeds.vocab, SimpleVocab):
            _write_ft_storage_simple(outf, embeds)
        else:
            _write_ft_storage_subwords(outf, embeds)
        _serialize_array_as_le(outf, embeds.storage)
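
A minimal usage sketch for the fastText config handling described in the docstring above. The ``Metadata`` import path, its dict-style constructor and the ``embeds.metadata`` assignment are assumptions about the ffp API; the helper name and the key values are illustrative only, and unspecified keys fall back to the documented defaults.

from os import PathLike
from typing import Union

from ffp.metadata import Metadata  # import path is an assumption


def export_with_cfg(embeds, out: Union[str, bytes, int, PathLike]) -> None:
    # Hypothetical helper: override some of the fastText config keys listed
    # in the docstring above (values here are examples), then write.
    embeds.metadata = Metadata({
        "window_size": 5,
        "min_count": 5,
        "ns": 5,
        "word_ngrams": 1,
        "loss": "NegativeSampling",
        "model": "SkipGram",
        "lr_update_rate": 100,
        "sampling_threshold": 1e-4,
    })
    write_fasttext(out, embeds)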
Code example #4
def write_fasttext(file: Union[str, bytes, int, PathLike], embeds: Embeddings):
    """
    Write embeddings in fastText format.

    Only embeddings with fastText vocabulary can be written to fastText format.

    fastText models require values for all config keys; some of these can be inferred from
    finalfusion models, while the others are assigned default values:

        * dims: inferred from model
        * window_size: 0
        * min_count: 0
        * ns: 0
        * word_ngrams: 1
        * loss: HierarchicalSoftmax
        * model: CBOW
        * buckets: inferred from model
        * min_n: inferred from model
        * max_n: inferred from model
        * lr_update_rate: 0
        * sampling_threshold: 0

    Some information from the original fastText model is lost, e.g.:
        * word frequencies
        * n_tokens

    Embeddings are un-normalized before serialization: if norms are present, each embedding
    is scaled by its associated norm. Additionally, the original state of the embedding
    matrix is restored: the precomputation and l2-normalization of word embeddings are undone.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeds : Embeddings
        Embeddings to write
    """
    with open(file, 'wb') as outf:
        vocab = embeds.vocab
        if not isinstance(vocab, FastTextVocab):
            raise ValueError(
                f'Expected FastTextVocab, not: {type(embeds.vocab).__name__}')
        _write_binary(outf, "<ii", _FT_MAGIC, 12)
        _write_ft_cfg(outf, embeds.dims, vocab.subword_indexer.n_buckets,
                      vocab.min_n, vocab.max_n)
        _write_ft_vocab(outf, embeds.vocab)
        _write_binary(outf, "<?QQ", 0, *embeds.storage.shape)
        if isinstance(embeds.vocab, SimpleVocab):
            _write_ft_storage_simple(outf, embeds)
        else:
            _write_ft_storage_subwords(outf, embeds)
        _serialize_array_as_le(outf, embeds.storage)
Code example #5
File: fasttext.py Project: finalfusion/ffp
def _write_ft_storage_simple(outf: BinaryIO, embeds: Embeddings):
    """
    Helper method to write storage of a simple vocab model.

    Unnormalizes embeddings.
    """
    storage = embeds.storage
    norms = embeds.norms
    for i in range(storage.shape[0]):
        embed = storage[i]
        if norms is not None:
            embed = norms[i] * embed
        _serialize_array_as_le(outf, embed)
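
The per-row loop above keeps memory usage low; for a simple vocab the same un-normalization can also be expressed on the whole matrix at once. A sketch of that alternative, assuming ``embeds.storage`` and ``embeds.norms`` behave like numpy arrays (the function name is illustrative, not ffp API):

import numpy as np


def _write_ft_storage_simple_vectorized(outf, embeds):
    # Same effect as the loop above: scale every row by its norm (if norms
    # are present) and serialize the whole matrix in one call.
    matrix = np.asarray(embeds.storage)
    if embeds.norms is not None:
        matrix = matrix * np.asarray(embeds.norms)[:, None]
    _serialize_array_as_le(outf, matrix)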
Code example #6
File: fasttext.py Project: finalfusion/ffp
def _write_ft_storage_subwords(outf: BinaryIO, embeds: Embeddings):
    """
    Helper method to write a storage with subwords.

    Restores the original embedding format of fastText, i.e. the precomputation is undone
    and the embeddings are un-normalized.
    """
    vocab = embeds.vocab
    assert isinstance(vocab, FastTextVocab)
    storage = embeds.storage
    norms = embeds.norms
    for i, word in enumerate(vocab):
        indices = vocab.subword_indices(word)
        embed = storage[i] * (len(indices) + 1)
        if norms is not None:
            embed *= norms[i]
        embed -= storage[indices].sum(0, keepdims=False)
        _serialize_array_as_le(outf, embed)

    _serialize_array_as_le(outf, storage[len(vocab):])
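
For reference, the precomputation that the loop above inverts: finalfusion stores a known word's embedding as the average of the word vector and its subword vectors, so multiplying by ``len(indices) + 1`` and subtracting the subword sum recovers the original fastText word vector. A small illustrative sketch (not ffp API):

import numpy as np


def precompute_word_vector(word_vec: np.ndarray,
                           subword_vecs: np.ndarray) -> np.ndarray:
    # finalfusion-style precomputation: the stored word vector is the mean
    # of the raw word vector and all of its subword n-gram vectors. The
    # write loop above applies exactly the inverse of this.
    return (word_vec + subword_vecs.sum(0)) / (len(subword_vecs) + 1)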
Code example #7
def write_word2vec(file: Union[str, bytes, int, PathLike],
                   embeddings: Embeddings):
    r"""
    Write embeddings in word2vec binary format.

    If the embeddings are not compatible with the w2v format (e.g. include a SubwordVocab), only
    the known words and embeddings are serialized. I.e. the subword matrix is discarded.

    Embeddings are un-normalized before serialization: if norms are present, each embedding
    is scaled by its associated norm.

    The output file will contain the shape encoded in utf-8 on the first line as ``rows columns``.
    This is followed by the embeddings.

    Each embedding consists of:

    * utf-8 encoded word
    * single space ``' '`` following the word
    * ``cols`` single-precision floating point numbers
    * ``'\n'`` newline at the end of each line.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        The embeddings to serialize.
    """
    vocab = embeddings.vocab
    matrix = embeddings.storage[:len(vocab)]
    with open(file, 'wb') as outf:
        outf.write(f'{matrix.shape[0]} {matrix.shape[1]}\n'.encode('ascii'))
        for idx, word in enumerate(vocab):
            row = matrix[idx]  # type: np.ndarray
            if embeddings.norms is not None:
                row = row * embeddings.norms[idx]
            b_word = word.encode('utf-8')
            outf.write(b_word)
            outf.write(b' ')
            _serialize_array_as_le(outf, row)
            outf.write(b'\n')
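
To make the layout concrete, a minimal sketch of a reader for the format written above (plain numpy, independent of ffp; assumes well-formed input and words that contain no spaces; the function name is illustrative):

import numpy as np


def read_word2vec_binary(path):
    # Parse the "rows cols\n" header, then for each word: the utf-8 word,
    # a single space, cols little-endian float32 values and a newline.
    words, vectors = [], []
    with open(path, 'rb') as inf:
        rows, cols = map(int, inf.readline().split())
        for _ in range(rows):
            word = bytearray()
            while True:
                byte = inf.read(1)
                if byte == b' ':
                    break
                word += byte
            words.append(word.decode('utf-8'))
            vectors.append(np.frombuffer(inf.read(4 * cols), dtype='<f4'))
            inf.read(1)  # consume the trailing newline
    return words, np.vstack(vectors)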
Code example #8
 def write_chunk(self, file: BinaryIO):
     # Write the chunk identifier, then a header describing the quantized
     # storage: chunk length, flags for projection and norms, matrix
     # dimensions, number of centroids, and the type ids of the quantized
     # (u8) and reconstructed (f32) values.
     _write_binary(file, "<I", int(self.chunk_identifier()))
     padding = _pad_float32(file.tell())
     chunk_len = struct.calcsize("<IIIIIQII") + padding
     proj = self._quantizer.projection is not None
     if proj:
         chunk_len += struct.calcsize(
             f"<{pow(self._quantizer.reconstructed_len, 2)}f")
     chunk_len += struct.calcsize(f"<{self._quantizer.subquantizers.size}f")
     norms = self._norms is not None
     if norms:
         chunk_len += struct.calcsize(f"<{self._norms.size}f")
     chunk_len += self._quantized_embeddings.size  # u8: one byte per element
     chunk_header = (chunk_len, proj, norms, self.quantized_len,
                     self.shape[1], self.quantizer.n_centroids,
                     self.shape[0], int(TypeId.u8), int(TypeId.f32))
     _write_binary(file, "<QIIIIIQII", *chunk_header)
     file.write(struct.pack(f"{padding}x"))
     if proj:
         _serialize_array_as_le(file, self.quantizer.projection)
     _serialize_array_as_le(file, self.quantizer.subquantizers)
     if norms:
         _serialize_array_as_le(file, self._norms)
     self._quantized_embeddings.tofile(file)
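
For reference, a sketch of parsing just the header written above, mirroring the "<QIIIIIQII" pack format. The field names are descriptive guesses based on the writer, not names from the finalfusion spec.

import struct
from typing import BinaryIO


def read_quantized_header(file: BinaryIO) -> dict:
    # Mirror of the header written above, in the same order it was packed.
    fmt = "<QIIIIIQII"
    fields = struct.unpack(fmt, file.read(struct.calcsize(fmt)))
    header = dict(zip(
        ("chunk_len", "has_projection", "has_norms", "quantized_len",
         "reconstructed_len", "n_centroids", "n_embeddings",
         "quantized_type_id", "reconstructed_type_id"), fields))
    # Skip the alignment padding that precedes the float32 data.
    file.read((4 - file.tell() % 4) % 4)
    return header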