Example 1
def test_open_file(tmp_path, encoding, compression):
    path = get_file_path(tmp_path, 'file.txt', compression)
    # We can't use "wt" mode because of a bug in io.TextIOWrapper with xz and bz2.
    # See: https://stackoverflow.com/questions/55171439/python-bz2-and-lzma-in-mode-wt-dont-write-the-bom-while-gzip-does-why  # noqa
    with open_file(path, 'wb') as f:
        f.write(TEXT_CONTENT.encode(encoding))
    assert path.exists()
    with open_file(path, 'rt', encoding=encoding) as f:
        assert f.read() == TEXT_CONTENT
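The bug referenced in the comment is easy to reproduce without any of the helpers above: with a BOM-writing encoding such as utf-16, gzip.open in 'wt' mode emits the BOM while bz2.open (and lzma.open) does not, which is why the test writes pre-encoded bytes in 'wb' mode. A minimal, self-contained check:

import bz2
import codecs
import gzip

# With gzip, 'wt' mode writes the utf-16 BOM as expected...
with gzip.open('demo.gz', 'wt', encoding='utf-16') as f:
    f.write('hello')
with gzip.open('demo.gz', 'rb') as f:
    print(f.read().startswith(codecs.BOM_UTF16))  # True

# ...but with bz2 (and lzma) the BOM is silently dropped on affected
# Python versions.
with bz2.open('demo.bz2', 'wt', encoding='utf-16') as f:
    f.write('hello')
with bz2.open('demo.bz2', 'rb') as f:
    print(f.read().startswith(codecs.BOM_UTF16))  # False on affected versions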
Example 2
def test_extract_file_decompress_monofile_archives(tmp_path, compression):
    path = get_file_path(tmp_path, 'file.txt', compression)
    with open_file(path, 'wb') as f:
        f.write(TEXT_CONTENT.encode())

    out_path = extract_file(path, dest_dir=tmp_path)
    text = out_path.read_text()
    assert text == TEXT_CONTENT
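Both tests receive `encoding` and `compression` as pytest fixtures that are not shown here. A plausible conftest.py sketch, with parameter values that are assumptions rather than part of the original suite:

import pytest

@pytest.fixture(params=['utf-8', 'utf-16'])  # assumed parameter values
def encoding(request):
    return request.param

@pytest.fixture(params=[None, 'gz', 'bz2', 'xz'])  # assumed parameter values
def compression(request):
    return request.param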
Example 3
    def from_path(cls,
                  path: PathType,
                  encoding: str = DEFAULT_ENCODING,
                  dtype: DType = DEFAULT_DTYPE,
                  out_dtype: Optional[DType] = None):
        return cls(open_file(path, 'rb'),
                   encoding=encoding,
                   dtype=dtype,
                   out_dtype=out_dtype)
Example 4
    def from_path(cls,
                  path: PathType,
                  encoding: str = DEFAULT_ENCODING,
                  out_dtype: DType = DEFAULT_OUT_DTYPE,
                  vocab_size: Optional[int] = None) -> 'TextEmbFileReader':
        """
        Returns a `TextEmbFileReader` from the path of a (possibly compressed) text file.
        """
        return cls(open_file(path, 'rt', encoding=encoding),
                   out_dtype=out_dtype,
                   vocab_size=vocab_size)
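A usage sketch (the file name is hypothetical); open_file is what makes the same call work for plain and compressed paths alike:

reader = TextEmbFileReader.from_path('embeddings.txt.gz')  # hypothetical path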
Example 5
    def _create(cls,
                out_path: Path,
                word_vectors: Iterable[Tuple[str, VectorType]],
                vector_size: int,
                vocab_size: int,
                compression: Optional[str] = None,
                verbose: bool = True,
                encoding: str = DEFAULT_ENCODING,
                precision: int = 5) -> Path:

        number_fmt = '%.{}f'.format(precision)

        if not vocab_size:
            raise ValueError(
                'unable to infer vocab_size; you must manually provide it')

        # io.TextIOWrapper has a bug when combined with bz2 and lzma: in 'wt'
        # mode it doesn't write the BOM. For more info about the bug (discovered
        # while testing this code), see:
        # https://stackoverflow.com/questions/55171439/python-bz2-and-lzma-in-mode-wt-dont-write-the-bom-while-gzip-does-why
        # As a workaround, open the file in binary mode and encode the text with
        # an IncrementalEncoder, which writes the BOM only at the beginning.
        encoder = codecs.getincrementalencoder(encoding)()
        encode = encoder.encode
        with open_file(out_path, 'wb', compression=compression) as fout:

            fout.write(encode('%d %d\n' % (vocab_size, vector_size)))

            i = -1  # guard: 'i' is used after the loop even when word_vectors is empty
            for i, (word, vector) in progbar(enumerate(word_vectors),
                                             enable=verbose,
                                             desc='Writing',
                                             total=vocab_size):
                if ' ' in word:
                    raise ValueError(
                        "the word number %d contains one or more spaces: %r" %
                        (i, word))
                check_vector_size(i, vector, vector_size)

                fout.write(encode(word + ' '))
                vector_string = ' '.join(number_fmt % num for num in vector)
                fout.write(encode(vector_string))
                fout.write(encode('\n'))

        warn_if_wrong_vocab_size(
            vocab_size,
            actual_size=i + 1,
            extra_info='As a consequence, the header of the file has a wrong '
            'vocab_size. You can change it by editing the file.')

        return out_path
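The IncrementalEncoder workaround is worth seeing in isolation: the encoder keeps state across calls, so the BOM comes out exactly once, on the first encode. A minimal check:

import codecs

encoder = codecs.getincrementalencoder('utf-16')()
first = encoder.encode('first line\n')
second = encoder.encode('second line\n')
assert first.startswith(codecs.BOM_UTF16)       # BOM only on the first call
assert not second.startswith(codecs.BOM_UTF16)  # never again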
Example 6
    def _create(cls,
                out_path: Path,
                word_vectors: Iterable[Tuple[str, VectorType]],
                vector_size: int,
                vocab_size: Optional[int],
                compression: Optional[str] = None,
                verbose: bool = True,
                encoding: str = DEFAULT_ENCODING,
                dtype: Optional[DType] = None) -> Path:

        echo = print if verbose else noop
        encoding = _bom_free_version(encoding)
        if not dtype:
            (_,
             first_vector), word_vectors = glance_first_element(word_vectors)
            dtype = first_vector.dtype
        else:
            dtype = numpy.dtype(dtype)

        if not vocab_size:
            raise ValueError(
                'unable to infer vocab_size; you must manually provide it')

        with open_file(out_path, 'wb', compression=compression) as file:
            header_line = '%d %d\n' % (vocab_size, vector_size)
            echo('Writing the header: %s' % header_line)
            header_bytes = header_line.encode(encoding)
            file.write(header_bytes)

            i = -1  # guard: 'i' is used after the loop even when word_vectors is empty
            for i, (word, vector) in progbar(enumerate(word_vectors),
                                             verbose,
                                             total=vocab_size):
                if ' ' in word:
                    raise ValueError(
                        "the word number %d contains one or more spaces: %r" %
                        (i, word))
                file.write((word + ' ').encode(encoding))

                check_vector_size(i, vector, vector_size)
                file.write(numpy.asarray(vector, dtype).tobytes())

        warn_if_wrong_vocab_size(
            vocab_size,
            actual_size=i + 1,
            extra_info='As a consequence, the header of the file has a wrong '
            'vocab_size')
        return out_path
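The helper `_bom_free_version` is not shown here. With a BOM-writing codec such as utf-16, every `str.encode` call emits a fresh BOM, so writing word-by-word as above would sprinkle BOMs through the file; the helper presumably maps such encodings to byte-order-explicit variants. A hypothetical sketch (the mapping is an assumption):

import codecs

def _bom_free_version(encoding: str) -> str:
    # Hypothetical sketch: normalize the name, then swap BOM-writing
    # codecs for variants that never emit a BOM.
    name = codecs.lookup(encoding).name
    return {
        'utf-8-sig': 'utf-8',
        'utf-16': 'utf-16-le',
        'utf-32': 'utf-32-le',
    }.get(name, name)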
Example 7
    def _create(cls,
                out_path: Path,
                word_vectors: Iterable[Tuple[str, VectorType]],
                vector_size: int,
                vocab_size: Optional[int],
                compression: Optional[str] = None,
                verbose: bool = True,
                encoding: str = DEFAULT_ENCODING,
                dtype: Optional[DType] = None) -> Path:

        echo = print if verbose else noop
        if not dtype:
            (_, vector), word_vectors = glance_first_element(word_vectors)
            dtype = vector.dtype
        else:
            dtype = numpy.dtype(dtype)

        # Write everything to a temporary directory, then pack the files into a tar archive
        tempdir = Path(tempfile.mkdtemp())
        vocab_tmp_path = tempdir / VOCAB_FILENAME
        vectors_tmp_path = tempdir / VECTORS_FILENAME
        meta_tmp_path = tempdir / META_FILENAME

        with open(vocab_tmp_path, 'wt', encoding=encoding) as vocab_file, \
            open(vectors_tmp_path, 'wb') as vectors_file:  # noqa

            desc = 'Generating {} and {} files'.format(VOCAB_FILENAME,
                                                       VECTORS_FILENAME)
            i = -1
            for i, (word, vector) in progbar(enumerate(word_vectors),
                                             verbose,
                                             desc=desc,
                                             total=vocab_size):
                if '\n' in word:
                    raise ValueError(
                        "the word number %d contains one or more newline characters: "
                        "%r" % (i, word))
                vocab_file.write(word)
                vocab_file.write('\n')

                check_vector_size(i, vector, vector_size)
                vectors_file.write(numpy.asarray(vector, dtype).tobytes())

        actual_vocab_size = i + 1
        warn_if_wrong_vocab_size(
            vocab_size,
            actual_vocab_size,
            extra_info='the actual size will be written in meta.json')
        vocab_size = actual_vocab_size

        echo('Writing {}...'.format(META_FILENAME))
        metadata = {
            "vocab_size": vocab_size,
            "vector_size": vector_size,
            "dtype": dtype.str,
            "encoding": encoding
        }
        with open(meta_tmp_path, 'w') as meta_file:
            json.dump(metadata, meta_file, indent=2)

        if not compression:
            tar_path = out_path
            tar_mode = 'w'
        elif compression in _TAR_COMPRESSIONS:
            tar_path = out_path
            tar_mode = 'w:' + compression
        else:
            warnings.warn(
                'A VVM file is just a TAR file; you should compress it using '
                'one of the formats directly supported by tarfile ({}). '
                'Using another compression format requires creating a '
                'temporary uncompressed TAR file first, doubling the required '
                'time!'.format(', '.join(_TAR_COMPRESSIONS)))
            tar_path = out_path.with_suffix(out_path.suffix + '.tmp')
            tar_mode = 'w'

        echo('Packing all the files together')
        with tarfile.open(tar_path, tar_mode) as tar_file:
            tar_file.add(str(vocab_tmp_path), VOCAB_FILENAME)
            tar_file.add(str(vectors_tmp_path), VECTORS_FILENAME)
            tar_file.add(str(meta_tmp_path), META_FILENAME)

        shutil.rmtree(tempdir)

        if compression and compression not in _TAR_COMPRESSIONS:
            echo("Compressing to %s file: %s" % (compression, out_path))
            with open_file(out_path, 'wb',
                           compression=compression) as compressed_file:
                with open(tar_path, 'rb') as non_compressed_file:
                    shutil.copyfileobj(non_compressed_file, compressed_file)

            os.remove(tar_path)

        return out_path
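A hedged reader sketch for the archive produced above, to make the layout concrete. META_FILENAME is 'meta.json' per the code; the other two member names are assumptions. Mode 'r' lets tarfile autodetect the tar-native compressions:

import json
import tarfile

import numpy

VOCAB_FILENAME = 'vocab.txt'      # assumed value of the module constant
VECTORS_FILENAME = 'vectors.bin'  # assumed value of the module constant
META_FILENAME = 'meta.json'       # named in the code above

def read_vvm(path):
    with tarfile.open(path, 'r') as tar:
        meta = json.load(tar.extractfile(META_FILENAME))
        vocab_bytes = tar.extractfile(VOCAB_FILENAME).read()
        words = vocab_bytes.decode(meta['encoding']).splitlines()
        raw = tar.extractfile(VECTORS_FILENAME).read()
        vectors = numpy.frombuffer(raw, dtype=meta['dtype'])
        vectors = vectors.reshape(meta['vocab_size'], meta['vector_size'])
    return words, vectors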