def write_chunk(self, file: BinaryIO):
    """
    Write this chunk's float32 values to ``file``.

    Layout: chunk identifier (u32), chunk length (u64), number of values
    (u64), type id (u32), zero padding up to float32 alignment, then the
    raw float32 data written via ``tofile``.
    """
    _write_binary(file, "<I", int(self.chunk_identifier()))
    # Padding aligns the float32 payload that follows the header.
    padding = _pad_float32(file.tell())
    # Chunk length covers everything after the length field itself: the
    # size (u64) and type-id (u32) fields, the padding, and the data.
    # Use explicit little-endian formats ("<QI", "<f") so the computed
    # size can never depend on native struct alignment — this matches
    # the "<..." formats used for the actual writes below.
    chunk_len = struct.calcsize("<QI") + padding + self.size * struct.calcsize(
        "<f")
    _write_binary(file, f"<QQI{padding}x", chunk_len, self.size,
                  int(TypeId.f32))
    self.tofile(file)
def _write_words_binary(b_words: Iterable[bytes], file: BinaryIO):
    """
    Serialize each byte string in ``b_words`` to ``file`` as a
    little-endian u32 length prefix followed by the raw bytes.
    """
    for encoded in b_words:
        _write_binary(file, "<I", len(encoded))
        file.write(encoded)
def _write_bucket_vocab(vocab: Union[FinalfusionBucketVocab, FastTextVocab],
                        file: BinaryIO):
    """
    Serialize a bucketed subword vocabulary (finalfusion or fastText)
    to ``file``: the chunk header followed by the length-prefixed
    UTF-8 words.
    """
    # Chunk length: serialized words + min_n/max_n (two u32) + buckets (u32).
    chunk_length = (_calculate_serialized_size(vocab.words) +
                    struct.calcsize("<II") + struct.calcsize("<I"))
    chunk_id = vocab.chunk_identifier()
    # fastText vocabularies store the bucket bound directly; finalfusion
    # vocabularies store the bucket exponent instead.
    buckets = (vocab.subword_indexer.idx_bound
               if chunk_id == ChunkIdentifier.FastTextSubwordVocab else
               vocab.subword_indexer.buckets_exp)
    _write_binary(file, "<IQQIII", int(chunk_id), chunk_length,
                  len(vocab.words), vocab.min_n, vocab.max_n, buckets)
    _write_words_binary((word.encode("utf-8") for word in vocab.words), file)
def write_chunk(self, file: BinaryIO):
    """
    Write this quantized storage chunk to ``file``.

    Layout: chunk identifier (u32), chunk length (u64), a fixed header
    (projection flag, norms flag, quantized length, reconstructed
    columns, centroid count, row count, and the u8/f32 type ids),
    padding to float32 alignment, then the optional projection matrix,
    the subquantizers, the optional norms, and the quantized embeddings.
    """
    _write_binary(file, "<I", int(self.chunk_identifier()))
    # Padding aligns the float32 payloads that follow the header.
    padding = _pad_float32(file.tell())
    # Start with the fixed-size header fields that follow the length
    # field (see the "<QIIIIIQII" write below, minus the leading Q).
    chunk_len = struct.calcsize("<IIIIIQII") + padding
    proj = self._quantizer.projection is not None
    if proj:
        # The projection is a square reconstructed_len x reconstructed_len
        # float32 matrix.
        chunk_len += struct.calcsize("<f") * pow(
            self._quantizer.reconstructed_len, 2)
    chunk_len += struct.calcsize("<f") * self._quantizer.subquantizers.size
    norms = self._norms is not None
    if norms:
        chunk_len += struct.calcsize("<f") * self._norms.size
    # Quantized embeddings are stored as u8, i.e. one byte per element.
    chunk_len += self._quantized_embeddings.size
    chunk_header = (chunk_len, proj, norms, self.quantized_len, self.shape[1],
                    self.quantizer.n_centroids, self.shape[0], int(TypeId.u8),
                    int(TypeId.f32))
    _write_binary(file, "<QIIIIIQII", *chunk_header)
    # Emit the alignment padding as zero bytes.
    file.write(struct.pack("x" * padding))
    # Payloads must be written in the same order the reader consumes them:
    # projection (optional), subquantizers, norms (optional), embeddings.
    if proj:
        self.quantizer.projection.tofile(file)
    self.quantizer.subquantizers.tofile(file)
    if norms:
        self._norms.tofile(file)
    self._quantized_embeddings.tofile(file)
def write_chunk(self, file: BinaryIO):
    """
    Write this array to ``file`` as an NdArray chunk: identifier,
    header, alignment padding, then the raw float32 matrix.
    """
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    # Chunk length: rows/cols/type-id header fields, the padding, and
    # the float32 data itself.
    data_size = struct.calcsize(f'<{self.size}f')
    chunk_len = struct.calcsize("<QII") + padding + data_size
    # pylint: disable=unpacking-non-sequence
    rows, cols = self.shape
    header = (chunk_len, rows, cols, int(TypeId.f32))
    _write_binary(file, "<QQII", *header)
    _write_binary(file, f"{padding}x")
    self.tofile(file)
def write_chunk(self, file) -> None:
    """
    Write this explicit-ngram vocabulary chunk to ``file``: the header,
    the length-prefixed words, then each ngram followed by its u64 index.
    """
    # Chunk length: serialized words + serialized ngrams + min_n/max_n.
    chunk_length = (_calculate_serialized_size(self.words) +
                    _calculate_serialized_size(self.subword_indexer.ngrams) +
                    struct.calcsize("<II"))
    _write_binary(file, "<IQQQII", int(self.chunk_identifier()), chunk_length,
                  len(self.words), len(self.subword_indexer.ngrams),
                  self.min_n, self.max_n)
    _write_words_binary((word.encode("utf-8") for word in self.words), file)
    # Unlike words, each ngram also carries its explicit index.
    for ngram in self.subword_indexer.ngrams:
        encoded = ngram.encode("utf-8")
        _write_binary(file, "<I", len(encoded))
        file.write(encoded)
        _write_binary(file, "<Q", self.subword_indexer.ngram_index[ngram])
def write_chunk(self, file: BinaryIO):
    """
    Serialize this metadata to ``file`` as a TOML chunk: identifier
    (u32), payload length (u64), then the UTF-8 encoded TOML text.
    """
    payload = toml.dumps(self).encode("utf-8")
    header = (int(self.chunk_identifier()), len(payload))
    _write_binary(file, "<IQ", *header)
    file.write(payload)
def write_chunk(self, file: BinaryIO):
    """
    Write this vocabulary chunk to ``file``: identifier (u32), chunk
    length (u64), word count (u64), then the length-prefixed UTF-8 words.
    """
    _write_binary(file, "<I", int(self.chunk_identifier()))
    words_size = _calculate_serialized_size(self.words)
    _write_binary(file, "<QQ", words_size, len(self.words))
    _write_words_binary((word.encode("utf-8") for word in self.words), file)