def __init__(self,
             file_uri: str,
             encoding: str = DEFAULT_ENCODING,
             cache_dir: str = None) -> None:
    """
    Open the embeddings text file referred to by ``file_uri`` and set up line iteration.

    Parameters
    ----------
    file_uri : str
        URI of the embeddings file. ``parse_embeddings_file_uri`` splits it into the URI
        of the main file and an (optional) path of a file inside an archive.
    encoding : str, (optional, default=DEFAULT_ENCODING)
        Text encoding passed to ``open`` when the main file is not a ZIP/TAR archive.
    cache_dir : str, (optional, default=None)
        Forwarded to ``cached_path`` when resolving the main file to a local path.

    Raises
    ------
    ValueError
        If a path inside an archive was given, but the main file is neither a ZIP nor
        a TAR archive.
    """
    self.uri = file_uri
    self._encoding = encoding
    self._cache_dir = cache_dir
    self._archive_handle: Any = None  # only if the file is inside an archive

    main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
    main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

    if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
        # NOTE(review): presumably this sets ``self._handle`` -- confirm in the helper.
        self._open_inside_zip(main_file_uri, path_inside_archive)

    elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
        # NOTE(review): presumably this sets ``self._handle`` -- confirm in the helper.
        self._open_inside_tar(main_file_uri, path_inside_archive)

    else:  # all the other supported formats, including uncompressed files
        if path_inside_archive:
            # Interpolate the URI into the message; concatenating it with ``+`` would
            # leave a literal "%s" in the error text.
            raise ValueError('Unsupported archive format: %s' % main_file_uri)

        # All the python packages for compressed files share the same interface of io.open
        extension = get_file_extension(main_file_uri)
        package = {
            '.txt': io,
            '.vec': io,
            '.gz': gzip,
            '.bz2': bz2,
            '.lzma': lzma,
        }.get(extension, None)

        if package is None:
            logger.warning('The embeddings file has an unknown file extension "%s". '
                           'We will assume the file is an (uncompressed) text file', extension)
            package = io

        self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

    # To use this with tqdm we'd like to know the number of tokens. It's possible that the
    # first line of the embeddings file contains this: if it does, we want to start iteration
    # from the 2nd line, otherwise we want to start from the 1st.
    # Unfortunately, once we read the first line, we cannot move back the file iterator
    # because the underlying file may be "not seekable"; we use itertools.chain instead.
    # NOTE: ``next`` raises StopIteration here if the handle yields no lines at all.
    first_line = next(self._handle)  # this moves the iterator forward
    self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
    if self.num_tokens:
        # the first line is a header line: start iterating from the 2nd line
        self._iterator = self._handle
    else:
        # the first line is not a header line: start iterating from the 1st line
        self._iterator = itertools.chain([first_line], self._handle)
def _read_pretrained_embeddings_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens",
        min_pretrained_embeddings: int = None) -> torch.FloatTensor:
    """
    Returns an embedding matrix for the given vocabulary using the pretrained embeddings
    contained in the given file. Embeddings for tokens not found in the pretrained embedding
    file are randomly initialized using a normal distribution with mean and standard
    deviation equal to those of the pretrained embeddings.

    We support two file formats:

        * text format - utf-8 encoded text file with space separated fields:
          [word] [dim 1] [dim 2] ...
          The text file can optionally be compressed, and can even reside in an archive
          with multiple files. If the file resides in an archive with other files, then
          ``file_uri`` must be a URI "(archive_uri)#file_path_inside_the_archive"

        * hdf5 format - hdf5 file containing an embedding matrix in the form of a
          torch.Tensor.

    If the filename ends with '.hdf5' or '.h5' then we load from hdf5, otherwise we assume
    text format.

    Parameters
    ----------
    file_uri : str, required.
        It can be:

        * a file system path or a URL of an optionally compressed text file or a zip/tar
          archive containing a single file.
        * URI of the type ``(archive_path_or_url)#file_path_inside_archive`` if the text
          file is contained in a multi-file archive.

    embedding_dim : int, required.
        Number of columns of the returned weight matrix.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    min_pretrained_embeddings : int, (optional, default=None):
        If given, will keep at least this number of embeddings from the start of the
        pretrained embedding text file (typically the most common words). It is only
        forwarded to the text-file reader; the hdf5 reader does not receive it.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file. The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words
    appearing in the pretrained embedding file are initialized to the pretrained embedding
    value.
    """
    is_hdf5 = get_file_extension(file_uri) in ('.h5', '.hdf5')

    # Anything that is not explicitly hdf5 is treated as a text file.
    if not is_hdf5:
        return _read_embeddings_from_text_file(file_uri,
                                               embedding_dim,
                                               vocab,
                                               namespace,
                                               min_pretrained_embeddings)

    return _read_embeddings_from_hdf5(file_uri, embedding_dim, vocab, namespace)
def _read_pretrained_embeddings_file(file_uri: str,
                                     embedding_dim: int,
                                     vocab: Vocabulary,
                                     namespace: str = "tokens") -> torch.FloatTensor:
    """
    Returns an embedding matrix for the given vocabulary using the pretrained embeddings
    contained in the given file. Embeddings for tokens not found in the pretrained embedding
    file are randomly initialized using a normal distribution with mean and standard
    deviation equal to those of the pretrained embeddings.

    We support two file formats:

        * text format - utf-8 encoded text file with space separated fields:
          [word] [dim 1] [dim 2] ...
          The text file can optionally be compressed, and can even reside in an archive
          with multiple files. If the file resides in an archive with other files, then
          ``file_uri`` must be a URI "(archive_uri)#file_path_inside_the_archive"

        * hdf5 format - hdf5 file containing an embedding matrix in the form of a
          torch.Tensor.

    If the filename ends with '.hdf5' or '.h5' then we load from hdf5, otherwise we assume
    text format.

    Parameters
    ----------
    file_uri : str, required.
        It can be:

        * a file system path or a URL of an optionally compressed text file or a zip/tar
          archive containing a single file.
        * URI of the type ``(archive_path_or_url)#file_path_inside_archive`` if the text
          file is contained in a multi-file archive.

    embedding_dim : int, required.
        Number of columns of the returned weight matrix.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file. The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words
    appearing in the pretrained embedding file are initialized to the pretrained embedding
    value.
    """
    extension = get_file_extension(file_uri)

    # Both readers take the same arguments, so pick the reader first and call it once.
    read_embeddings = (_read_embeddings_from_hdf5
                       if extension in ('.h5', '.hdf5')
                       else _read_embeddings_from_text_file)
    return read_embeddings(file_uri, embedding_dim, vocab, namespace)
def _read_pretrained_embeddings_file(
    file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Returns an embedding matrix for the given vocabulary using the pretrained embeddings
    contained in the given file. Embeddings for tokens not found in the pretrained embedding
    file are randomly initialized using a normal distribution with mean and standard
    deviation equal to those of the pretrained embeddings.

    We support two file formats:

        * text format - utf-8 encoded text file with space separated fields:
          [word] [dim 1] [dim 2] ...
          The text file can optionally be compressed, and can even reside in an archive
          with multiple files. If the file resides in an archive with other files, then
          `file_uri` must be a URI "(archive_uri)#file_path_inside_the_archive"

        * hdf5 format - hdf5 file containing an embedding matrix in the form of a
          torch.Tensor.

    If the filename ends with '.hdf5' or '.h5' then we load from hdf5, otherwise we assume
    text format.

    # Parameters

    file_uri : `str`, required.
        It can be:

        * a file system path or a URL of an optionally compressed text file or a zip/tar
          archive containing a single file.
        * URI of the type `(archive_path_or_url)#file_path_inside_archive` if the text
          file is contained in a multi-file archive.

    embedding_dim : `int`, required.
        Number of columns of the returned weight matrix.
    vocab : `Vocabulary`, required.
        A Vocabulary object.
    namespace : `str`, (optional, default=`"tokens"`)
        The namespace of the vocabulary to find pretrained embeddings for.

    # Returns

    A weight matrix with embeddings initialized from the read file. The matrix has shape
    `(vocab.get_vocab_size(namespace), embedding_dim)`, where the indices of words
    appearing in the pretrained embedding file are initialized to the pretrained embedding
    value.
    """
    hdf5_extensions = (".h5", ".hdf5")

    # Text format is the fallback for every extension that is not hdf5.
    if get_file_extension(file_uri) not in hdf5_extensions:
        return _read_embeddings_from_text_file(file_uri, embedding_dim, vocab, namespace)

    return _read_embeddings_from_hdf5(file_uri, embedding_dim, vocab, namespace)
def __init__(self, file_uri: str, encoding: str = DEFAULT_ENCODING, cache_dir: str = None) -> None:
    """
    Resolve ``file_uri`` to a local file, open it for text reading and prepare the
    line iterator, detecting an optional header line with the number of tokens.

    Parameters
    ----------
    file_uri : str
        URI of the embeddings file; it is parsed by ``parse_embeddings_file_uri`` into
        the main file URI and an (optional) path inside an archive.
    encoding : str, (optional, default=DEFAULT_ENCODING)
        Encoding used to open the file in text mode when it is not a ZIP/TAR archive.
    cache_dir : str, (optional, default=None)
        Passed through to ``cached_path``.

    Raises
    ------
    ValueError
        If ``file_uri`` names a path inside an archive, but the main file is not
        recognized as a ZIP or TAR archive.
    """
    self.uri = file_uri
    self._encoding = encoding
    self._cache_dir = cache_dir
    self._archive_handle: Any = None  # only if the file is inside an archive

    main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
    main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

    if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
        # NOTE(review): the helper is expected to set ``self._handle`` -- confirm.
        self._open_inside_zip(main_file_uri, path_inside_archive)

    elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
        # NOTE(review): the helper is expected to set ``self._handle`` -- confirm.
        self._open_inside_tar(main_file_uri, path_inside_archive)

    else:  # all the other supported formats, including uncompressed files
        if path_inside_archive:
            # Use ``%`` interpolation: the previous ``+`` concatenation produced a
            # message containing a literal "%s" followed by the URI.
            raise ValueError('Unsupported archive format: %s' % main_file_uri)

        # All the python packages for compressed files share the same interface of io.open
        extension = get_file_extension(main_file_uri)
        package = {
            '.txt': io,
            '.vec': io,
            '.gz': gzip,
            '.bz2': bz2,
            '.lzma': lzma,
        }.get(extension, None)

        if package is None:
            logger.warning('The embeddings file has an unknown file extension "%s". '
                           'We will assume the file is an (uncompressed) text file', extension)
            package = io

        self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

    # To use this with tqdm we'd like to know the number of tokens. It's possible that the
    # first line of the embeddings file contains this: if it does, we want to start iteration
    # from the 2nd line, otherwise we want to start from the 1st.
    # Unfortunately, once we read the first line, we cannot move back the file iterator
    # because the underlying file may be "not seekable"; we use itertools.chain instead.
    # NOTE: ``next`` raises StopIteration here if the handle yields no lines at all.
    first_line = next(self._handle)  # this moves the iterator forward
    self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
    if self.num_tokens:
        # the first line is a header line: start iterating from the 2nd line
        self._iterator = self._handle
    else:
        # the first line is not a header line: start iterating from the 1st line
        self._iterator = itertools.chain([first_line], self._handle)