def write_chunk(self, file: BinaryIO): _write_binary(file, "<I", int(self.chunk_identifier())) padding = _pad_float32(file.tell()) chunk_len = struct.calcsize("QI") + padding + struct.calcsize( f"<{self.size}f") _write_binary(file, f"<QQI{padding}x", chunk_len, self.size, int(TypeId.f32)) _serialize_array_as_le(file, self)
def write_chunk(self, file: BinaryIO): _write_binary(file, "<I", int(self.chunk_identifier())) padding = _pad_float32(file.tell()) chunk_len = struct.calcsize("<QII") + padding + struct.calcsize( f'<{self.size}f') # pylint: disable=unpacking-non-sequence rows, cols = self.shape _write_binary(file, "<QQII", chunk_len, rows, cols, int(TypeId.f32)) _write_binary(file, f"{padding}x") _serialize_array_as_le(file, self)
def write_fasttext(file: Union[str, bytes, int, PathLike],
                   embeds: Embeddings):
    """
    Write embeddings in fastText format.

    fastText requires Metadata with all expected keys for fastText configs:

        * dims: int (inferred from model)
        * window_size: int (default -1)
        * min_count: int (default -1)
        * ns: int (default -1)
        * word_ngrams: int (default 1)
        * loss: one of ``['HierarchicalSoftmax', 'NegativeSampling', 'Softmax']`` (default Softmax)
        * model: one of ``['CBOW', 'SkipGram', 'Supervised']`` (default SkipGram)
        * buckets: int (inferred from model)
        * min_n: int (inferred from model)
        * max_n: int (inferred from model)
        * lr_update_rate: int (default -1)
        * sampling_threshold: float (default -1)

    ``dims``, ``buckets``, ``min_n`` and ``max_n`` are inferred from the model.
    If other values are unspecified, a default value of ``-1`` is used for all
    numerical fields. Loss defaults to ``Softmax``, model to ``SkipGram``.
    Unknown values for ``loss`` and ``model`` are overwritten with defaults
    since the models are incompatible with fastText otherwise.

    Some information from original fastText models gets lost, e.g.:

        * word frequencies
        * n_tokens

    Embeddings are un-normalized before serialization: if norms are present,
    each embedding is scaled by the associated norm. Additionally, the
    original state of the embedding matrix is restored; precomputation and
    l2-normalization of word embeddings are undone.

    Only embeddings with a FastTextVocab or SimpleVocab can be serialized to
    this format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeds : Embeddings
        Embeddings to write
    """
    with open(file, 'wb') as outf:
        if not isinstance(embeds.vocab, (FastTextVocab, SimpleVocab)):
            raise ValueError(
                f'Expected FastTextVocab or SimpleVocab, not: {type(embeds.vocab).__name__}'
            )
        _write_binary(outf, "<ii", _FT_MAGIC, 12)
        _write_ft_cfg(outf, embeds)
        _write_ft_vocab(outf, embeds.vocab)
        _write_binary(outf, "<?QQ", 0, *embeds.storage.shape)
        if isinstance(embeds.vocab, SimpleVocab):
            _write_ft_storage_simple(outf, embeds)
        else:
            _write_ft_storage_subwords(outf, embeds)
        _serialize_array_as_le(outf, embeds.storage)
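# Usage sketch for the metadata-driven variant above (assumptions: the file
# names are made up, ``load_finalfusion`` and ``Metadata`` are importable from
# the top-level ``finalfusion`` package, and ``Embeddings.metadata`` is
# assignable; unspecified config keys fall back to the defaults listed in the
# docstring):
#
#     >>> from finalfusion import load_finalfusion, Metadata
#     >>> embeds = load_finalfusion("embeddings.fifu")
#     >>> embeds.metadata = Metadata({"loss": "Softmax", "model": "SkipGram"})
#     >>> write_fasttext("embeddings.bin", embeds)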
def write_fasttext(file: Union[str, bytes, int, PathLike],
                   embeds: Embeddings):
    """
    Write embeddings in fastText format.

    Only embeddings with a fastText vocabulary can be written to fastText
    format.

    fastText models require values for all config keys; some of these can be
    inferred from finalfusion models, the others are assigned default values:

        * dims: inferred from model
        * window_size: 0
        * min_count: 0
        * ns: 0
        * word_ngrams: 1
        * loss: HierarchicalSoftmax
        * model: CBOW
        * buckets: inferred from model
        * min_n: inferred from model
        * max_n: inferred from model
        * lr_update_rate: 0
        * sampling_threshold: 0

    Some information from original fastText models gets lost, e.g.:

        * word frequencies
        * n_tokens

    Embeddings are un-normalized before serialization: if norms are present,
    each embedding is scaled by the associated norm. Additionally, the
    original state of the embedding matrix is restored; precomputation and
    l2-normalization of word embeddings are undone.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeds : Embeddings
        Embeddings to write
    """
    with open(file, 'wb') as outf:
        vocab = embeds.vocab
        if not isinstance(vocab, FastTextVocab):
            raise ValueError(
                f'Expected FastTextVocab, not: {type(embeds.vocab).__name__}')
        _write_binary(outf, "<ii", _FT_MAGIC, 12)
        _write_ft_cfg(outf, embeds.dims, vocab.subword_indexer.n_buckets,
                      vocab.min_n, vocab.max_n)
        _write_ft_vocab(outf, embeds.vocab)
        _write_binary(outf, "<?QQ", 0, *embeds.storage.shape)
        if isinstance(embeds.vocab, SimpleVocab):
            _write_ft_storage_simple(outf, embeds)
        else:
            _write_ft_storage_subwords(outf, embeds)
        _serialize_array_as_le(outf, embeds.storage)
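# Sketch of the header this variant emits (derived from the calls above, not
# from the fastText sources): the file starts with the magic number and the
# format version 12 ("<ii", 8 bytes); after the config and vocabulary, a
# one-byte quantization flag (0, i.e. not quantized) and the storage shape
# (rows, dims) precede the raw vectors:
#
#     >>> struct.calcsize("<?QQ")
#     17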
def _write_ft_storage_simple(outf: BinaryIO, embeds: Embeddings):
    """
    Helper method to write the storage of a simple vocab model.

    Un-normalizes embeddings.
    """
    storage = embeds.storage
    norms = embeds.norms
    for i in range(storage.shape[0]):
        embed = storage[i]
        if norms is not None:
            embed = norms[i] * embed
        _serialize_array_as_le(outf, embed)
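# Equivalent vectorized view (illustration only, numpy semantics assumed): when
# norms are present, the loop above writes the same bytes as serializing
# ``storage * norms[:, None]`` row by row; iterating row-wise presumably avoids
# materializing a second full-size matrix.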
def _write_ft_storage_subwords(outf: BinaryIO, embeds: Embeddings):
    """
    Helper method to write a storage with subwords.

    Restores the original embedding format of fastText, i.e. precomputation
    is undone and the embeddings are un-normalized.
    """
    vocab = embeds.vocab
    assert isinstance(vocab, FastTextVocab)
    storage = embeds.storage
    norms = embeds.norms
    for i, word in enumerate(vocab):
        indices = vocab.subword_indices(word)
        embed = storage[i] * (len(indices) + 1)
        if norms is not None:
            embed *= norms[i]
        embed -= storage[indices].sum(0, keepdims=False)
        _serialize_array_as_le(outf, embed)
    _serialize_array_as_le(outf, storage[len(vocab):])
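# Reasoning sketch for the inversion above: for a known word, finalfusion
# stores the (optionally l2-normalized) mean of the word vector and its n
# subword vectors,
#
#     storage[i] = (v_word + sum(v_subwords)) / (n + 1)    # then scaled to unit norm
#
# so the original fastText word vector is recovered as
#
#     v_word = storage[i] * norms[i] * (n + 1) - storage[indices].sum(0)
#
# The bucket rows past ``len(vocab)`` are written unchanged.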
def write_word2vec(file: Union[str, bytes, int, PathLike],
                   embeddings: Embeddings):
    r"""
    Write embeddings in word2vec binary format.

    If the embeddings are not compatible with the w2v format (e.g. they
    include a SubwordVocab), only the known words and their embeddings are
    serialized, i.e. the subword matrix is discarded.

    Embeddings are un-normalized before serialization: if norms are present,
    each embedding is scaled by the associated norm.

    The output file will contain the shape encoded in utf-8 on the first line
    as ``rows columns``. This is followed by the embeddings. Each embedding
    consists of:

        * utf-8 encoded word
        * single space ``' '`` following the word
        * ``cols`` single-precision floating point numbers
        * ``'\n'`` newline at the end of each line.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        The embeddings to serialize.
    """
    vocab = embeddings.vocab
    matrix = embeddings.storage[:len(vocab)]
    with open(file, 'wb') as outf:
        outf.write(f'{matrix.shape[0]} {matrix.shape[1]}\n'.encode('ascii'))
        for idx, word in enumerate(vocab):
            row = matrix[idx]  # type: np.ndarray
            if embeddings.norms is not None:
                row = row * embeddings.norms[idx]
            b_word = word.encode('utf-8')
            outf.write(b_word)
            outf.write(b' ')
            _serialize_array_as_le(outf, row)
            outf.write(b'\n')
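# Usage sketch (the file names are made up; ``load_finalfusion`` is assumed to
# be importable from the top-level ``finalfusion`` package):
#
#     >>> from finalfusion import load_finalfusion
#     >>> embeds = load_finalfusion("embeddings.fifu")
#     >>> write_word2vec("embeddings.w2v", embeds)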
def write_chunk(self, file: BinaryIO): _write_binary(file, "<I", int(self.chunk_identifier())) padding = _pad_float32(file.tell()) chunk_len = struct.calcsize("<IIIIIQII") + padding proj = self._quantizer.projection is not None if proj: chunk_len += struct.calcsize( f"<{pow(self._quantizer.reconstructed_len, 2)}f") chunk_len += struct.calcsize(f"<{self._quantizer.subquantizers.size}f") norms = self._norms is not None if self._norms is not None: chunk_len += struct.calcsize(f"<{self._norms.size}f") chunk_len += self._quantized_embeddings.size chunk_header = (chunk_len, proj, norms, self.quantized_len, self.shape[1], self.quantizer.n_centroids, self.shape[0], int(TypeId.u8), int(TypeId.f32)) _write_binary(file, "<QIIIIIQII", *chunk_header) file.write(struct.pack(f"{padding}x")) if proj: _serialize_array_as_le(file, self.quantizer.projection) _serialize_array_as_le(file, self.quantizer.subquantizers) if norms: _serialize_array_as_le(file, self._norms) self._quantized_embeddings.tofile(file)