def _read_items(file: BinaryIO, length: int, indices=False) -> Tuple[List[str], Dict[str, int]]:
    """
    Read `length` items from a vocabulary chunk.

    Each item is stored as a little-endian u32 byte length followed by
    that many UTF-8 bytes. When `indices` is set, a u64 index follows
    each item; otherwise items are indexed by insertion order.

    Parameters
    ----------
    file : BinaryIO
        input file
    length : int
        number of items to read
    indices : bool
        Toggles reading an int after each item specifying its index.

    Returns
    -------
    (words, word_index) : (List[str], Dict[str, int])
        Tuple containing the word list and the word index.
    """
    words = []
    word_index = {}
    for _ in range(length):
        n_bytes = _read_binary(file, "<I")[0]
        token = file.read(n_bytes).decode("utf-8")
        words.append(token)
        word_index[token] = _read_binary(
            file, "<Q")[0] if indices else len(word_index)
    return words, word_index
def _read_array_header(file: BinaryIO) -> Tuple[int, int]:
    """
    Read the header of an NdArray chunk.

    Reads the shape tuple, verifies that the stored TypeId is f32 and
    seeks the file past the alignment padding to the start of the
    array data.

    Parameters
    ----------
    file : BinaryIO
        finalfusion file with a storage at the start of a NdArray chunk.

    Returns
    -------
    shape : Tuple[int, int]
        Shape of the storage.

    Raises
    ------
    FinalfusionFormatError
        If the TypeId does not match TypeId.f32
    """
    rows, cols = _read_binary(file, "<QI")
    type_id = _read_binary(file, "<I")[0]
    if type_id != int(TypeId.f32):
        raise FinalfusionFormatError(
            f"Invalid Type, expected {TypeId.f32}, got {type_id}")
    # Skip padding so the f32 data that follows is properly aligned.
    padding = _pad_float32(file.tell())
    file.seek(padding, 1)
    return rows, cols
def read_chunk(file: BinaryIO) -> 'Norms':
    """
    Read a Norms chunk body from `file`.

    Verifies that the stored TypeId is f32, skips the alignment
    padding and reads the norms as a float32 array.

    Raises
    ------
    FinalfusionFormatError
        If the TypeId does not match TypeId.f32
    """
    n_norms, type_id = _read_binary(file, "<QI")
    if type_id != int(TypeId.f32):
        raise FinalfusionFormatError(
            f"Invalid Type, expected {TypeId.f32}, got {type_id}")
    # Skip padding so the f32 data that follows is properly aligned.
    file.seek(_pad_float32(file.tell()), 1)
    norms = np.fromfile(file=file, count=n_norms, dtype=np.float32)
    return Norms(norms)
def read_chunk(file: BinaryIO) -> 'Metadata':
    """
    Read a Metadata chunk from `file`.

    The chunk length stored in the chunk header is the number of bytes
    of TOML payload, so the file is seeked back to re-read the header
    and recover that length before reading the payload.

    Raises
    ------
    FinalfusionFormatError
        If the chunk identifier is not the Metadata identifier or the
        payload cannot be read in full.
    """
    chunk_header_size = struct.calcsize("<IQ")
    # place the file before the chunk header since the chunk size for
    # metadata the number of bytes that we need to read
    file.seek(-chunk_header_size, 1)
    chunk_id, chunk_len = _read_binary(file, "<IQ")
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, and a mismatched identifier is a format error.
    if ChunkIdentifier(chunk_id) != Metadata.chunk_identifier():
        raise FinalfusionFormatError(
            f"Expected Metadata chunk, got {ChunkIdentifier(chunk_id)}")
    buf = file.read(chunk_len)
    if len(buf) != chunk_len:
        raise FinalfusionFormatError(
            f'Could not read {chunk_len} bytes from file')
    return Metadata(toml.loads(buf.decode("utf-8")))
def _read_quantized_header(
        file: BinaryIO
) -> Tuple[PQ, Tuple[int, int], Optional[np.ndarray]]:
    """
    Helper method to read the header of a quantized array chunk.

    Returns a tuple containing PQ, quantized_shape and optional norms.
    """
    # Flags and sizes are read in their fixed on-disk order; do not
    # reorder these reads.
    projection = _read_binary(file, '<I')[0] != 0
    read_norms = _read_binary(file, '<I')[0] != 0
    quantized_len = _read_binary(file, '<I')[0]
    reconstructed_len = _read_binary(file, '<I')[0]
    n_centroids = _read_binary(file, '<I')[0]
    n_embeddings = _read_binary(file, '<Q')[0]
    # Each subquantizer covers reconstructed_len // quantized_len
    # dimensions, so the reconstructed length must divide evenly.
    assert reconstructed_len % quantized_len == 0
    # First TypeId: the quantized indices must be stored as u8.
    type_id = _read_binary(file, '<I')[0]
    if int(TypeId.u8) != type_id:
        raise FinalfusionFormatError(
            f"Invalid Type, expected {str(TypeId.u8)}, got {type_id}")
    # Second TypeId: centroids/projection/norms must be stored as f32.
    type_id = _read_binary(file, '<I')[0]
    if int(TypeId.f32) != type_id:
        raise FinalfusionFormatError(
            f"Invalid Type, expected {str(TypeId.f32)}, got {type_id}")
    # Skip alignment padding before the f32 data.
    file.seek(_pad_float32(file.tell()), 1)
    if projection:
        # Square projection matrix over the reconstructed space.
        projection = np.fromfile(file,
                                 count=reconstructed_len *
                                 reconstructed_len,
                                 dtype=np.float32)
        projection_shape = (reconstructed_len, reconstructed_len)
        projection = projection.reshape(projection_shape)
    else:
        projection = None
    # Codebooks: one per subquantizer, each with n_centroids centroids
    # of reconstructed_len // quantized_len dimensions.
    quantizer_shape = (quantized_len, n_centroids,
                       reconstructed_len // quantized_len)
    quantizers = np.fromfile(file,
                             count=quantized_len * n_centroids *
                             (reconstructed_len // quantized_len),
                             dtype=np.float32)
    quantizers = quantizers.reshape(quantizer_shape)
    if read_norms:
        # One norm per embedding.
        norms = np.fromfile(file, count=n_embeddings, dtype=np.float32)
    else:
        norms = None
    quantizer = PQ(quantizers, projection)
    return quantizer, (n_embeddings, quantized_len), norms
def read_chunk(file: BinaryIO) -> 'SimpleVocab':
    """
    Read a SimpleVocab chunk body: a u64 word count followed by the
    words themselves.
    """
    n_words = _read_binary(file, "<Q")[0]
    words, word_index = _read_items(file, n_words)
    return SimpleVocab(words, word_index)
def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
    """
    Read an ExplicitVocab chunk body: word and ngram counts plus the
    ngram length range, followed by the words and the explicitly
    indexed ngrams.
    """
    n_words, n_ngrams, min_n, max_n = _read_binary(file, "<QQII")
    words, word_index = _read_items(file, n_words)
    # Ngrams carry explicit indices, hence indices=True.
    ngrams, ngram_index = _read_items(file, n_ngrams, indices=True)
    indexer = ExplicitIndexer(ngrams, (min_n, max_n), ngram_index)
    return ExplicitVocab(words, indexer, word_index)
def read_chunk(file: BinaryIO) -> 'FastTextVocab':
    """
    Read a FastTextVocab chunk body: word count, ngram length range
    and bucket count, followed by the words.
    """
    n_words, min_n, max_n, buckets = _read_binary(file, "<QIII")
    words, word_index = _read_items(file, n_words)
    return FastTextVocab(words,
                         FastTextIndexer(buckets, min_n, max_n),
                         word_index)
def read_chunk(file: BinaryIO) -> 'FinalfusionBucketVocab':
    """
    Read a FinalfusionBucketVocab chunk body: word count, ngram length
    range and bucket exponent, followed by the words.
    """
    n_words, min_n, max_n, buckets = _read_binary(file, "<QIII")
    words, word_index = _read_items(file, n_words)
    return FinalfusionBucketVocab(
        words, FinalfusionHashIndexer(buckets, min_n, max_n), word_index)