Example #1
def write_chunk(self, file: BinaryIO):
    # Chunk identifier first, then compute how many padding bytes are
    # needed so the float32 data ends up 4-byte aligned.
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    # The chunk length covers the remaining header fields (size and type id),
    # the alignment padding and the float32 payload.
    chunk_len = struct.calcsize(
        "QI") + padding + self.size * struct.calcsize("f")
    _write_binary(file, f"<QQI{padding}x", chunk_len, self.size,
                  int(TypeId.f32))
    self.tofile(file)
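
Note: these snippets rely on two small helpers, _write_binary and _pad_float32, that are not shown in the examples. As a minimal sketch, assuming _write_binary is a thin wrapper around struct.pack and _pad_float32 returns the number of padding bytes needed to 4-byte-align the following float32 data (an assumption, not the library's actual implementation), they could look like this:

import struct
from typing import BinaryIO


def _write_binary(file: BinaryIO, fmt: str, *values):
    # Assumed helper: pack the values with the given struct format and
    # write the result to the file.
    file.write(struct.pack(fmt, *values))


def _pad_float32(pos: int) -> int:
    # Assumed helper: number of padding bytes required so that float32
    # data written after position pos starts at a 4-byte boundary.
    float_size = struct.calcsize("<f")
    return (float_size - pos % float_size) % float_size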
Example #2
def _write_words_binary(b_words: Iterable[bytes], file: BinaryIO):
    """
    Helper function to write an iterable of byte strings, each prefixed
    with its length as a little-endian u32.
    """
    for word in b_words:
        b_len = len(word)
        _write_binary(file, "<I", b_len)
        file.write(word)
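
For context, the read side of this length-prefixed layout is symmetric. The following reader is purely illustrative (the name read_words_binary is hypothetical, not part of the library): it consumes a little-endian u32 length before each UTF-8 word.

import struct
from typing import BinaryIO, List


def read_words_binary(file: BinaryIO, n_words: int) -> List[str]:
    # Hypothetical counterpart to _write_words_binary: read n_words
    # length-prefixed UTF-8 strings.
    words = []
    for _ in range(n_words):
        b_len, = struct.unpack("<I", file.read(struct.calcsize("<I")))
        words.append(file.read(b_len).decode("utf-8"))
    return words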
Example #3
def _write_bucket_vocab(vocab: Union[FinalfusionBucketVocab, FastTextVocab],
                        file: BinaryIO):
    # Chunk length: the serialized words plus min_n/max_n and the bucket field.
    min_n_max_n_size = struct.calcsize("<II")
    buckets_size = struct.calcsize("<I")
    chunk_length = _calculate_serialized_size(vocab.words)
    chunk_length += min_n_max_n_size
    chunk_length += buckets_size

    # fastText vocabs store the number of buckets (idx_bound), finalfusion
    # bucket vocabs store the bucket exponent.
    chunk_id = vocab.chunk_identifier()
    if chunk_id == ChunkIdentifier.FastTextSubwordVocab:
        buckets = vocab.subword_indexer.idx_bound
    else:
        buckets = vocab.subword_indexer.buckets_exp

    chunk_header = (int(chunk_id), chunk_length, len(vocab.words), vocab.min_n,
                    vocab.max_n, buckets)
    _write_binary(file, "<IQQIII", *chunk_header)
    _write_words_binary((bytes(word, "utf-8") for word in vocab.words), file)
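
To make the header layout concrete, this illustrative snippet (not the library's reader) unpacks the fields exactly as the "<IQQIII" format above packs them: chunk id, chunk length, word count, min_n, max_n and the bucket field.

import struct
from typing import BinaryIO, Tuple


def read_bucket_vocab_header(file: BinaryIO) -> Tuple[int, ...]:
    # Illustrative only: mirror of the header written in Example #3.
    fmt = "<IQQIII"
    return struct.unpack(fmt, file.read(struct.calcsize(fmt)))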
Example #4
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    # Header fields that follow the chunk length, plus the alignment padding.
    chunk_len = struct.calcsize("<IIIIIQII") + padding
    # Optional projection matrix: reconstructed_len ** 2 float32 values.
    proj = self._quantizer.projection is not None
    if proj:
        chunk_len += struct.calcsize("<f") * pow(
            self._quantizer.reconstructed_len, 2)
    chunk_len += struct.calcsize("<f") * self._quantizer.subquantizers.size
    # Optional norms, stored as float32.
    norms = self._norms is not None
    if norms:
        chunk_len += struct.calcsize("<f") * self._norms.size
    # Quantized embeddings are u8, so the element count equals the byte count.
    chunk_len += self._quantized_embeddings.size
    chunk_header = (chunk_len, proj, norms, self.quantized_len,
                    self.shape[1], self.quantizer.n_centroids,
                    self.shape[0], int(TypeId.u8), int(TypeId.f32))
    _write_binary(file, "<QIIIIIQII", *chunk_header)
    file.write(struct.pack("x" * padding))
    if proj:
        self.quantizer.projection.tofile(file)
    self.quantizer.subquantizers.tofile(file)
    if norms:
        self._norms.tofile(file)
    self._quantized_embeddings.tofile(file)
Example #5
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    # Chunk length: rows/cols/type-id header fields, alignment padding
    # and the float32 matrix data.
    chunk_len = struct.calcsize("<QII") + padding + struct.calcsize(
        f'<{self.size}f')
    # pylint: disable=unpacking-non-sequence
    rows, cols = self.shape
    _write_binary(file, "<QQII", chunk_len, rows, cols, int(TypeId.f32))
    _write_binary(file, f"{padding}x")
    self.tofile(file)
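
As an illustration of how the alignment padding is consumed again when reading, here is a hypothetical reader for this chunk (not the library's API; it assumes the chunk identifier has already been read and that numpy backs the array):

import struct
from typing import BinaryIO

import numpy as np


def read_float32_matrix_chunk(file: BinaryIO) -> np.ndarray:
    # Hypothetical reader mirroring the writer above: header fields,
    # alignment padding, then rows * cols little-endian float32 values.
    chunk_len, rows, cols, type_id = struct.unpack(
        "<QQII", file.read(struct.calcsize("<QQII")))
    float_size = struct.calcsize("<f")
    padding = (float_size - file.tell() % float_size) % float_size
    file.read(padding)
    data = np.fromfile(file, dtype="<f4", count=rows * cols)
    return data.reshape(rows, cols)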
Example #6
def write_chunk(self, file) -> None:
    # Chunk length: serialized words, serialized ngrams and min_n/max_n.
    chunk_length = _calculate_serialized_size(self.words)
    chunk_length += _calculate_serialized_size(self.subword_indexer.ngrams)
    min_n_max_n_size = struct.calcsize("<II")
    chunk_length += min_n_max_n_size
    chunk_header = (int(self.chunk_identifier()), chunk_length,
                    len(self.words), len(self.subword_indexer.ngrams),
                    self.min_n, self.max_n)
    _write_binary(file, "<IQQQII", *chunk_header)
    _write_words_binary((bytes(word, "utf-8") for word in self.words),
                        file)
    # Each ngram is written as a length-prefixed UTF-8 string followed by
    # its u64 index.
    for ngram in self.subword_indexer.ngrams:
        b_ngram = ngram.encode("utf-8")
        _write_binary(file, "<I", len(b_ngram))
        file.write(b_ngram)
        _write_binary(file, "<Q", self.subword_indexer.ngram_index[ngram])
Example #7
def write_chunk(self, file: BinaryIO):
    # Metadata is serialized as a length-prefixed UTF-8 TOML document.
    b_data = bytes(toml.dumps(self), "utf-8")
    _write_binary(file, "<IQ", int(self.chunk_identifier()), len(b_data))
    file.write(b_data)
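
The metadata chunk is simply a length-prefixed TOML payload, so a hypothetical reader (again, not the library's API) can recover it as follows:

import struct
from typing import Any, BinaryIO, Dict

import toml


def read_metadata_chunk(file: BinaryIO) -> Dict[str, Any]:
    # Hypothetical reader mirroring the writer above: chunk id (u32),
    # payload length (u64), then that many bytes of UTF-8 TOML.
    chunk_id, data_len = struct.unpack(
        "<IQ", file.read(struct.calcsize("<IQ")))
    return toml.loads(file.read(data_len).decode("utf-8"))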
Example #8
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    # Chunk length and word count, then the words themselves as
    # length-prefixed UTF-8 strings.
    chunk_length = _calculate_serialized_size(self.words)
    _write_binary(file, "<QQ", chunk_length, len(self.words))
    _write_words_binary((bytes(word, "utf-8") for word in self.words),
                        file)