Example 1
    def __init__(self,
                 words: Iterable[str],
                 word2vec: Word2Vector,
                 word2index: Optional[Callable[[str], int]] = None,
                 missing_ok: bool = True,
                 verbose: bool = False,
                 close_hook: Optional[Callable] = None):
        """
        Args:
            words:
                the words to load the vectors of
            word2vec:
                object that implements ``word2vec[word]`` and ``word in word2vec``
            word2index:
                function that returns the index (position) of a word inside the file;
                this enables an optimization for formats like VVM that store vectors
                sequentially in the same file.
            missing_ok:
                if True, words that are not in ``word2vec`` are silently
                skipped; if False, a KeyError is raised for the first
                missing word
            verbose:
                if True, print progress information
            close_hook:
                function to call when closing this loader
        """
        super().__init__(words, missing_ok)
        self.verbose = verbose
        self.word2vec = word2vec
        self.word2pos = word2index

        echo = print if verbose else noop
        missing_words = set(words)
        num_words = len(missing_words)
        available_words = []
        for word in progbar(self.words,
                            enable=verbose,
                            desc='Collecting all available words'):
            if word in word2vec:
                available_words.append(word)
                missing_words.remove(word)
            elif not missing_ok:
                raise KeyError('word not found in the file: ' + word)

        if word2index:
            echo('Sorting words based on their position in the file')
            available_words.sort(key=self.word2pos)

        self._missing_words = missing_words
        self._available_words = available_words
        self._iterator = (WordVector(word, word2vec[word])
                          for word in available_words)

        desc = 'Loading available vectors (%d of %d)' % (len(available_words),
                                                         num_words)
        self._progbar = progbar(enable=verbose,
                                total=len(available_words),
                                desc=desc)
        self.close_hook = close_hook
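A minimal sketch of the optimization enabled by word2index, with a dict-backed word2vec and a made-up position mapping (both assumptions, not the library's objects): sorting the requested words by their position in the file lets a sequential reader fetch every vector in a single forward pass.

import numpy

# Made-up stand-ins for the real file-backed objects: word -> vector
# and word -> position in the file.
word2vec = {'sun': numpy.zeros(3), 'cat': numpy.arange(3), 'dog': numpy.ones(3)}
word2index = {'sun': 0, 'cat': 1, 'dog': 2}.__getitem__

requested = ['dog', 'cat', 'moon']
available = [w for w in requested if w in word2vec]
available.sort(key=word2index)   # file order: the reader never seeks backwards
print(available)                 # ['cat', 'dog']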
Example 2
    def __init__(self,
                 file: 'EmbFile',
                 words: Iterable[str],
                 missing_ok: bool = True,
                 verbose: bool = False):
        super().__init__(words, missing_ok)
        self.file = file
        self.verbose = verbose
        # Words still to be found in the file
        self._missing_words = set(words)
        self._reader = file.reader()
        self._progbar = progbar(enable=verbose,
                                total=self.file.vocab_size,
                                desc='Reading')
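The __init__ above only prepares the state; as a generic illustration (assuming nothing about the real reader, just a plain textual file named embeddings.txt with a header line and one "word vector..." line per entry), the sequential scan it sets up typically looks like this:

requested = {'cat', 'dog'}
missing_words = set(requested)
with open('embeddings.txt', encoding='utf-8') as f:
    next(f)                           # skip the "vocab_size vector_size" header
    for line in f:
        word, _, vector_str = line.partition(' ')
        if word in missing_words:
            missing_words.remove(word)
            print('found:', word)
        if not missing_words:         # stop early once every word is found
            break
print('not in file:', missing_words)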
Example 3
    def _create(cls,
                out_path: Path,
                word_vectors: Iterable[Tuple[str, VectorType]],
                vector_size: int,
                vocab_size: int,
                compression: Optional[str] = None,
                verbose: bool = True,
                encoding: str = DEFAULT_ENCODING,
                precision: int = 5) -> Path:

        number_fmt = '%.{}f'.format(precision)

        if not vocab_size:
            raise ValueError(
                'unable to infer vocab_size; you must manually provide it')

        # io.TextIOWrapper has a bug when used in combination with bz2 and
        # lzma: in 'wt' mode it doesn't write the BOM (while gzip does), so we
        # have to complicate things a little bit. For more info about the bug
        # (which I discovered while testing this code) see:
        # https://stackoverflow.com/questions/55171439/python-bz2-and-lzma-in-mode-wt-dont-write-the-bom-while-gzip-does-why
        # The workaround: open the file in binary mode and encode the text
        # with an IncrementalEncoder, which writes the BOM only at the beginning.
        encoder = codecs.getincrementalencoder(encoding)()
        encode = encoder.encode
        with open_file(out_path, 'wb', compression=compression) as fout:

            fout.write(encode('%d %d\n' % (vocab_size, vector_size)))

            i = -1  # so that i is defined even if word_vectors is empty
            for i, (word, vector) in progbar(enumerate(word_vectors),
                                             enable=verbose,
                                             desc='Writing',
                                             total=vocab_size):
                if ' ' in word:
                    raise ValueError(
                        "the word number %d contains one or more spaces: %r" %
                        (i, word))
                check_vector_size(i, vector, vector_size)

                fout.write(encode(word + ' '))
                vector_string = ' '.join(number_fmt % num for num in vector)
                fout.write(encode(vector_string))
                fout.write(encode('\n'))

        warn_if_wrong_vocab_size(
            vocab_size,
            actual_size=i + 1,
            extra_info='As a consequence, the header of the file has a wrong '
            'vocab_size. You can change it editing the file.')

        return out_path
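A minimal, standalone check of the workaround described in the comment above (using utf-16, an encoding with a BOM): the incremental encoder emits the BOM on the first encode() call only, so writing through it in binary mode keeps the BOM at the start of the file regardless of the compression wrapper.

import codecs

encode = codecs.getincrementalencoder('utf-16')().encode
first = encode('hello ')
second = encode('world')
print(first[:2] == codecs.BOM_UTF16)   # True: the BOM is emitted once, up front
print(codecs.BOM_UTF16 in second)      # False: later chunks carry no BOM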
Example 4
    def _create(cls,
                out_path: Path,
                word_vectors: Iterable[Tuple[str, VectorType]],
                vector_size: int,
                vocab_size: Optional[int],
                compression: Optional[str] = None,
                verbose: bool = True,
                encoding: str = DEFAULT_ENCODING,
                dtype: Optional[DType] = None) -> Path:

        echo = print if verbose else noop
        encoding = _bom_free_version(encoding)
        if not dtype:
            (_,
             first_vector), word_vectors = glance_first_element(word_vectors)
            dtype = first_vector.dtype
        else:
            dtype = numpy.dtype(dtype)

        if not vocab_size:
            raise ValueError(
                'unable to infer vocab_size; you must manually provide it')

        with open_file(out_path, 'wb', compression=compression) as file:
            header_line = '%d %d\n' % (vocab_size, vector_size)
            echo('Writing the header: %s' % header_line)
            header_bytes = header_line.encode(encoding)
            file.write(header_bytes)

            i = -1  # so that i is defined even if word_vectors is empty
            for i, (word, vector) in progbar(enumerate(word_vectors),
                                             verbose,
                                             total=vocab_size):
                if ' ' in word:
                    raise ValueError(
                        "the word number %d contains one or more spaces: %r" %
                        (i, word))
                file.write((word + ' ').encode(encoding))

                check_vector_size(i, vector, vector_size)
                file.write(numpy.asarray(vector, dtype).tobytes())

        warn_if_wrong_vocab_size(
            vocab_size,
            actual_size=i + 1,
            extra_info='As a consequence, the header of the file has a wrong '
            'vocab_size')
        return out_path
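For illustration, a hedged sketch of how the records written above could be read back (this reader is made up, not the library's API): parse the textual header, then for each entry read bytes up to the space separator as the word and the next vector_size * itemsize bytes as the vector.

import numpy

def read_word2vec_bin(path, encoding='utf-8', dtype=numpy.float32):
    # Illustrative reader for the layout above: a textual header line,
    # then for each entry the word, a space, and the raw vector bytes.
    dtype = numpy.dtype(dtype)
    with open(path, 'rb') as f:
        vocab_size, vector_size = map(int, f.readline().split())
        for _ in range(vocab_size):
            word = bytearray()
            while (ch := f.read(1)) and ch != b' ':
                word.extend(ch)
            vector = numpy.frombuffer(f.read(vector_size * dtype.itemsize),
                                      dtype=dtype)
            yield word.decode(encoding), vector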
Example 5
    def _create(cls,
                out_path: Path,
                word_vectors: Iterable[Tuple[str, VectorType]],
                vector_size: int,
                vocab_size: Optional[int],
                compression: Optional[str] = None,
                verbose: bool = True,
                encoding: str = DEFAULT_ENCODING,
                dtype: Optional[DType] = None) -> Path:

        echo = print if verbose else noop
        if not dtype:
            (_, vector), word_vectors = glance_first_element(word_vectors)
            dtype = vector.dtype
        else:
            dtype = numpy.dtype(dtype)

        # Write everything in a temporary directory and then pack them into a tar file
        tempdir = Path(tempfile.mkdtemp())
        vocab_tmp_path = tempdir / VOCAB_FILENAME
        vectors_tmp_path = tempdir / VECTORS_FILENAME
        meta_tmp_path = tempdir / META_FILENAME

        with open(vocab_tmp_path, 'wt', encoding=encoding) as vocab_file, \
            open(vectors_tmp_path, 'wb') as vectors_file:  # noqa

            desc = 'Generating {} and {} files'.format(VOCAB_FILENAME,
                                                       VECTORS_FILENAME)
            i = -1
            for i, (word, vector) in progbar(enumerate(word_vectors),
                                             verbose,
                                             desc=desc,
                                             total=vocab_size):
                if '\n' in word:
                    raise ValueError(
                        "the word number %d contains one or more newline characters: "
                        "%r" % (i, word))
                vocab_file.write(word)
                vocab_file.write('\n')

                check_vector_size(i, vector, vector_size)
                vectors_file.write(numpy.asarray(vector, dtype).tobytes())

        actual_vocab_size = i + 1
        warn_if_wrong_vocab_size(
            vocab_size,
            actual_vocab_size,
            extra_info='the actual size will be written in meta.json')
        vocab_size = actual_vocab_size

        echo('Writing {}...'.format(META_FILENAME))
        metadata = {
            "vocab_size": vocab_size,
            "vector_size": vector_size,
            "dtype": dtype.str,
            "encoding": encoding
        }
        with open(meta_tmp_path, 'w') as meta_file:
            json.dump(metadata, meta_file, indent=2)

        if not compression:
            tar_path = out_path
            tar_mode = 'w'
        elif compression in _TAR_COMPRESSIONS:
            tar_path = out_path
            tar_mode = 'w:' + compression
        else:
            warnings.warn(
                'A VVM file is just a TAR file; you should compress it using '
                'one of the formats directly supported by tarfile ({}). '
                'Using another compression format requires creating a '
                'temporary uncompressed TAR file first, doubling the required '
                'time!'.format(', '.join(_TAR_COMPRESSIONS)))
            tar_path = out_path.with_suffix(out_path.suffix + '.tmp')
            tar_mode = 'w'

        echo('Packing all the files together')
        with tarfile.open(tar_path, tar_mode) as tar_file:
            tar_file.add(str(vocab_tmp_path), VOCAB_FILENAME)
            tar_file.add(str(vectors_tmp_path), VECTORS_FILENAME)
            tar_file.add(str(meta_tmp_path), META_FILENAME)

        shutil.rmtree(tempdir)

        if compression and compression not in _TAR_COMPRESSIONS:
            echo("Compressing to %s file: %s" % (compression, out_path))
            with open_file(out_path, 'wb',
                           compression=compression) as compressed_file:
                with open(tar_path, 'rb') as non_compressed_file:
                    shutil.copyfileobj(non_compressed_file, compressed_file)

            os.remove(tar_path)

        return out_path
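To close the loop, a hedged sketch of reading the three members back (an illustration, not the library's reader; it assumes the VOCAB_FILENAME, VECTORS_FILENAME and META_FILENAME constants are 'vocab.txt', 'vectors.bin' and 'meta.json'):

import json
import tarfile
import numpy

def read_vvm(path):
    # Illustrative reader for the vocab/vectors/meta TAR layout above.
    with tarfile.open(path) as tar:
        meta = json.load(tar.extractfile('meta.json'))
        words = (tar.extractfile('vocab.txt').read()
                 .decode(meta['encoding']).splitlines())
        vectors = numpy.frombuffer(tar.extractfile('vectors.bin').read(),
                                   dtype=numpy.dtype(meta['dtype']))
        vectors = vectors.reshape(meta['vocab_size'], meta['vector_size'])
    return dict(zip(words, vectors))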