Example #1
def _read_array_header(file: BinaryIO) -> Tuple[int, int]:
    """
    Helper method to read the header of an NdArray chunk.

    The method reads the shape tuple, verifies the TypeId and seeks the file
    to the start of the array. The shape tuple is returned.

    Parameters
    ----------
    file : BinaryIO
        finalfusion file positioned at the start of an NdArray chunk's storage.

    Returns
    -------
    shape : Tuple[int, int]
        Shape of the storage.

    Raises
    ------
    FinalfusionFormatError
        If the TypeId does not match TypeId.f32.
    """
    rows, cols = _read_required_binary(file, "<QI")
    type_id = TypeId(_read_required_binary(file, "<I")[0])
    if TypeId.f32 != type_id:
        raise FinalfusionFormatError(
            f"Invalid Type, expected {TypeId.f32}, got {type_id}")
    file.seek(_pad_float32(file.tell()), 1)
    return rows, cols
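All of the examples in this listing lean on two helpers that are not shown here, _read_required_binary and _pad_float32. A minimal sketch of what they plausibly look like, assuming the former wraps struct.unpack with an end-of-file check and the latter computes the byte padding needed to align the read position to a float32 (4-byte) boundary:

import struct
from typing import BinaryIO, Tuple

def _read_required_binary(file: BinaryIO, fmt: str) -> Tuple:
    # Read exactly struct.calcsize(fmt) bytes and unpack them, failing
    # loudly on a truncated file instead of returning a partial result.
    size = struct.calcsize(fmt)
    buf = file.read(size)
    if len(buf) != size:
        raise EOFError(f"Could not read {size} bytes from file")
    return struct.unpack(fmt, buf)

def _pad_float32(pos: int) -> int:
    # Padding in bytes needed to advance `pos` to the next float32 boundary.
    return (4 - pos % 4) % 4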
Example #2
def _read_items_with_indices(file: BinaryIO,
                             length: int) -> Tuple[List[str], Dict[str, int]]:
    """
    Helper method to read items and their indices from a vocabulary chunk.

    Parameters
    ----------
    file : BinaryIO
        input file
    length : int
        number of items to read

    Returns
    -------
    items : List[str]
        The item list.
    index : Dict[str, int]
        Mapping from each item to its index as stored in the chunk.
    """
    items = []
    index = dict()
    for _ in range(length):
        item_length = _read_required_binary(file, "<I")[0]
        item = file.read(item_length).decode("utf-8")
        idx = _read_required_binary(file, "<Q")[0]
        items.append(item)
        index[item] = idx
    return items, index
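One way to exercise this helper is to assemble a chunk body in memory; each item is a little-endian u32 length, the UTF-8 bytes, and a u64 index:

import io
import struct

payload = b"".join(
    struct.pack("<I", len(word)) + word + struct.pack("<Q", i)
    for i, word in enumerate((b"foo", b"bar")))
items, index = _read_items_with_indices(io.BytesIO(payload), 2)
assert items == ["foo", "bar"]
assert index == {"foo": 0, "bar": 1}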
Example #3
def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
    length, ngram_length, min_n, max_n = _read_required_binary(
        file, "<QQII")
    words = _read_items(file, length)
    ngram_list, ngram_index = _read_items_with_indices(file, ngram_length)
    indexer = ExplicitIndexer(ngram_list, min_n, max_n, ngram_index)
    return ExplicitVocab(words, indexer)
Example #4
def _read_ft_header(file: BinaryIO):
    """
    Helper method to verify version and magic.
    """
    magic, version = _read_required_binary(file, "<ii")
    if magic != _FT_MAGIC:
        raise ValueError(f"Magic should be 793_712_314, not: {magic}")
    if version != 12:
        raise ValueError(f"Expected version 12, not: {version}")
Example #5
def _read_ft_cfg(file: BinaryIO) -> Metadata:
    """
    Constructs metadata from fastText config.
    """
    cfg = list(_read_required_binary(file, "<12id"))
    losses = ['HierarchicalSoftmax', 'NegativeSampling', 'Softmax']
    cfg[6] = losses[cfg[6] - 1]
    models = ['CBOW', 'SkipGram', 'Supervised']
    cfg[7] = models[cfg[7] - 1]
    return Metadata(dict(zip(_FT_REQUIRED_CFG_KEYS, cfg)))
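_FT_REQUIRED_CFG_KEYS is not part of this listing. Given the "<12id" format (twelve i32 fields plus one double) and the loss and model fields at indices 6 and 7, the keys plausibly follow the serialization order of fastText's Args; the names below are fastText's own and are an assumption about this module:

_FT_REQUIRED_CFG_KEYS = [
    'dim', 'ws', 'epoch', 'minCount', 'neg', 'wordNgrams', 'loss',
    'model', 'bucket', 'minn', 'maxn', 'lrUpdateRate', 't'
]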
Example #6
def _read_binary_word(file: BinaryIO, lossy: bool) -> str:
    """
    Helper method to read null-terminated binary strings.
    """
    word = bytearray()
    while True:
        byte = file.read(1)
        if byte == b'\x00':
            break
        if byte == b'':
            raise EOFError
        word.extend(byte)
    # discard frequency
    _ = _read_required_binary(file, "<q")
    entry_type = _read_required_binary(file, "b")[0]
    if entry_type != 0:
        raise ValueError(f'Non word entry: {word}')

    return word.decode('utf8', errors='replace' if lossy else 'strict')
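A self-contained check of the wire format this function expects: a null-terminated UTF-8 word, followed by a little-endian i64 frequency and a single entry-type byte:

import io
import struct

buf = io.BytesIO(b"hello\x00" + struct.pack("<qb", 42, 0))
assert _read_binary_word(buf, lossy=False) == "hello"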
Example #7
def _read_ft_storage(file: BinaryIO, vocab: Vocab) -> NdArray:
    """
    Helper method to read fastText storage.

    If vocab is a SimpleVocab, the matrix is read and returned as is.
    If vocab is a FastTextVocab, the word representations are precomputed based
    on the vocab.
    """
    quantized = _read_required_binary(file, "?")[0]
    if quantized:
        raise NotImplementedError(
            "Quantized storage is not supported for fastText models")
    rows, cols = _read_required_binary(file, "<qq")
    matrix = np.fromfile(file=file, count=rows * cols,
                         dtype=np.float32).reshape((rows, cols))
    if sys.byteorder == 'big':
        matrix.byteswap(inplace=True)
    if isinstance(vocab, FastTextVocab):
        _precompute_word_vecs(vocab, matrix)
    return NdArray(matrix)
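_precompute_word_vecs is not shown in this listing. A sketch of what it plausibly does, assuming the vocab iterates over its words and exposes a subword_indices method (both are assumptions about the vocab API): each word row is averaged with the rows of its subword ngrams, mirroring how fastText composes word representations.

import numpy as np

def _precompute_word_vecs(vocab: FastTextVocab, matrix: np.ndarray):
    # Hypothetical sketch: average each word's row with its subword rows.
    # The storage layout (word rows first, then bucket rows) and the
    # subword_indices method are assumptions.
    for i, word in enumerate(vocab):
        indices = [i] + [idx + len(vocab) for idx in vocab.subword_indices(word)]
        matrix[i] = matrix[indices].mean(0)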
Example #8
def _read_binary_word(file: BinaryIO) -> str:
    """
    Helper method to read null-terminated binary strings.
    """
    word = bytearray()
    while True:
        byte = file.read(1)
        if byte == b'\x00':
            break
        if byte == b'':
            raise EOFError
        word.extend(byte)
    # discard frequency
    _ = _read_required_binary(file, "<q")
    entry_type = _read_required_binary(file, "b")[0]
    if entry_type != 0:
        raise ValueError(f'Non word entry: {word}')

    # pylint: disable=fixme # XXX handle unicode errors
    return word.decode("utf8")
def read_chunk(file: BinaryIO) -> 'Norms':
    n_norms, dtype = _read_required_binary(file, "<QI")
    type_id = TypeId(dtype)
    if TypeId.f32 != type_id:
        raise FinalfusionFormatError(
            f"Invalid Type, expected {TypeId.f32}, got {str(type_id)}")
    padding = _pad_float32(file.tell())
    file.seek(padding, 1)
    array = np.fromfile(file=file, count=n_norms, dtype=np.float32)
    if sys.byteorder == "big":
        array.byteswap(inplace=True)
    return Norms(array)
Example #10
def read_chunk(file: BinaryIO) -> 'Metadata':
    chunk_header_size = struct.calcsize("<IQ")
    # seek back to before the chunk header, since for metadata the chunk
    # size is the number of bytes that we need to read
    file.seek(-chunk_header_size, 1)
    chunk_id, chunk_len = _read_required_binary(file, "<IQ")
    assert ChunkIdentifier(chunk_id) == Metadata.chunk_identifier()
    buf = file.read(chunk_len)
    if len(buf) != chunk_len:
        raise FinalfusionFormatError(
            f'Could not read {chunk_len} bytes from file')
    return Metadata(toml.loads(buf.decode("utf-8")))
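To exercise read_chunk in isolation, a chunk can be faked in memory. The method expects the chunk header to have been consumed already, since it first seeks back to re-read it; the snippet below positions the file accordingly (and assumes Metadata.chunk_identifier() is convertible to int, as the assert above suggests):

import io
import struct
import toml

payload = toml.dumps({"corpus": "wiki"}).encode("utf-8")
header = struct.pack("<IQ", int(Metadata.chunk_identifier()), len(payload))
chunk = io.BytesIO(header + payload)
chunk.seek(struct.calcsize("<IQ"))  # position the file just past the header
metadata = Metadata.read_chunk(chunk)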
Example #11
def _read_ft_vocab(file: BinaryIO, buckets: int, min_n: int,
                   max_n: int) -> Union[FastTextVocab, SimpleVocab]:
    """
    Helper method to read a vocab from a fastText file.

    Returns a SimpleVocab if min_n is 0, otherwise FastTextVocab is returned.
    """
    # discard n_words
    vocab_size, _n_words, n_labels = _read_required_binary(file, "<iii")
    if n_labels:
        raise NotImplementedError(
            "fastText prediction models are not supported")
    # discard n_tokens
    _read_required_binary(file, "<q")

    prune_idx_size = _read_required_binary(file, "<q")[0]
    if prune_idx_size > 0:
        raise NotImplementedError("Pruned vocabs are not supported")

    if min_n:
        return _read_ft_subwordvocab(file, buckets, min_n, max_n, vocab_size)
    return SimpleVocab([_read_binary_word(file) for _ in range(vocab_size)])
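_read_ft_subwordvocab is not included in this listing, but Example #12 below reads the same data inline; a sketch consistent with that code:

def _read_ft_subwordvocab(file: BinaryIO, buckets: int, min_n: int,
                          max_n: int, vocab_size: int) -> FastTextVocab:
    # Read the words, then pair them with a bucket-hashing subword indexer.
    words = [_read_binary_word(file) for _ in range(vocab_size)]
    return FastTextVocab(words, FastTextIndexer(buckets, min_n, max_n))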
Example #12
def _read_ft_vocab(file: BinaryIO, buckets: int, min_n: int, max_n: int,
                   lossy: bool) -> FastTextVocab:
    """
    Helper method to read a vocab from a fastText file.

    Returns a FastTextVocab.
    """
    # discard n_words
    vocab_size, _n_words, n_labels = _read_required_binary(file, "<iii")
    if n_labels:
        raise NotImplementedError(
            "fastText prediction models are not supported")
    # discard n_tokens
    _read_required_binary(file, "<q")

    prune_idx_size = _read_required_binary(file, "<q")[0]
    if prune_idx_size >= 0:
        raise NotImplementedError("Pruned vocabs are not supported")

    words = [_read_binary_word(file, lossy) for _ in range(vocab_size)]
    indexer = FastTextIndexer(buckets, min_n, max_n)
    return FastTextVocab(words, indexer)
Example #13
def _read_quantized_header(
        file: BinaryIO
) -> Tuple[PQ, Tuple[int, int], Optional[np.ndarray]]:
    """
    Helper method to read the header of a quantized array chunk.

    Returns a tuple containing PQ, quantized_shape and optional norms.
    """
    projection = _read_required_binary(file, '<I')[0] != 0
    read_norms = _read_required_binary(file, '<I')[0] != 0
    quantized_len = _read_required_binary(file, '<I')[0]
    reconstructed_len = _read_required_binary(file, '<I')[0]
    n_centroids = _read_required_binary(file, '<I')[0]
    n_embeddings = _read_required_binary(file, '<Q')[0]
    assert reconstructed_len % quantized_len == 0
    type_id = _read_required_binary(file, '<I')[0]
    if int(TypeId.u8) != type_id:
        raise FinalfusionFormatError(
            f"Invalid Type, expected {str(TypeId.u8)}, got {type_id}")
    type_id = _read_required_binary(file, '<I')[0]
    if int(TypeId.f32) != type_id:
        raise FinalfusionFormatError(
            f"Invalid Type, expected {str(TypeId.f32)}, got {type_id}")
    file.seek(_pad_float32(file.tell()), 1)
    if projection:
        projection = _read_array_as_native(file, np.float32,
                                           reconstructed_len**2)
        projection_shape = (reconstructed_len, reconstructed_len)
        projection = projection.reshape(projection_shape)
    else:
        projection = None
    quantizer_shape = (quantized_len, n_centroids,
                       reconstructed_len // quantized_len)
    quantizers_size = quantized_len * n_centroids * (reconstructed_len //
                                                     quantized_len)
    quantizers = _read_array_as_native(file, np.float32, quantizers_size)
    quantizers = quantizers.reshape(quantizer_shape)
    if read_norms:
        norms = _read_array_as_native(file, np.float32, n_embeddings)
    else:
        norms = None
    quantizer = PQ(quantizers, projection)
    return quantizer, (n_embeddings, quantized_len), norms
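The shapes read above imply the usual product-quantization layout: quantized_len subquantizers, each holding n_centroids centroids of dimension reconstructed_len // quantized_len. A minimal illustration of reconstructing one embedding from its u8 codes, ignoring the optional projection (this is a sketch, not the PQ class's actual method):

import numpy as np

def reconstruct(quantizers: np.ndarray, codes: np.ndarray) -> np.ndarray:
    # quantizers: (quantized_len, n_centroids, reconstructed_len // quantized_len)
    # codes: (quantized_len,) centroid indices for one quantized embedding
    return np.concatenate([quantizers[m, c] for m, c in enumerate(codes)])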
Example #14
def _read_items(file: BinaryIO, length: int) -> List[str]:
    """
    Helper method to read items from a vocabulary chunk.

    Parameters
    ----------
    file : BinaryIO
        input file
    length : int
        number of items to read

    Returns
    -------
    words : List[str]
        The word list
    """
    items = []
    for _ in range(length):
        item_length = _read_required_binary(file, "<I")[0]
        word = file.read(item_length).decode("utf-8")
        items.append(word)
    return items
Example #15
def read_chunk(file: BinaryIO) -> 'FastTextVocab':
    length, min_n, max_n, buckets = _read_required_binary(file, "<QIII")
    words = _read_items(file, length)
    indexer = FastTextIndexer(buckets, min_n, max_n)
    return FastTextVocab(words, indexer)
Example #16
def read_chunk(file: BinaryIO) -> 'SimpleVocab':
    length = _read_required_binary(file, "<Q")[0]
    words = _read_items(file, length)
    return SimpleVocab(words)