def cache_data(self, collection: Any, pack: PackType, append: bool):
    r"""Serialize ``pack`` and store it in this reader's cache file.

    The cache file lives inside ``self._cache_directory`` (created on
    demand); its name is whatever ``self._get_cache_location(collection)``
    returns. Each call writes exactly one line: the string produced by
    ``pack.serialize()`` followed by a newline.

    Args:
        collection: The piece of data the pack was produced from. It is
            only used here to derive the cache file name via
            ``self._get_cache_location``.
        pack: The data pack to be cached.
        append: If true, the serialized pack is appended to the cache
            file; otherwise the file is truncated and will contain only
            this pack.

    Raises:
        ValueError: If no cache directory has been configured.
    """
    cache_root = self._cache_directory
    if not cache_root:
        raise ValueError("Can not cache without a cache_directory!")

    os.makedirs(cache_root, exist_ok=True)

    location = self._get_cache_location(collection)
    cache_filename = os.path.join(cache_root, location)

    logger.info("Caching pack to %s", cache_filename)

    # The two write paths differ only in the file mode.
    file_mode = 'a' if append else 'w'
    with open(cache_filename, file_mode) as cache_file:
        cache_file.write(pack.serialize() + "\n")
def _process(self, input_pack: PackType):
    """Serialize ``input_pack`` and write it below ``self.root_output_dir``.

    The relative location comes from ``self.sub_output_path(input_pack)``.
    When ``self.zip_pack`` is set, the output is gzip-compressed text
    written to the resolved path with a ``.gz`` suffix; otherwise it is
    written as plain text to the resolved path itself.

    Args:
        input_pack: The pack to serialize and write.

    Raises:
        ValueError: If ``sub_output_path`` returns an empty string.
    """
    relative_path = self.sub_output_path(input_pack)
    if relative_path == '':
        raise ValueError("No concrete path provided from sub_output_path.")

    target = os.path.join(self.root_output_dir, relative_path)
    # NOTE(review): ensure_dir presumably creates the parent directory of
    # the given path -- confirm against its definition.
    ensure_dir(target)

    if not self.zip_pack:
        with open(target, 'w') as plain_out:
            plain_out.write(input_pack.serialize())
        return

    with gzip.open(target + '.gz', 'wt') as zipped_out:
        zipped_out.write(input_pack.serialize())
def write_pack(input_pack: PackType, output_dir: str, sub_path: str,
               indent: Optional[int] = None, zip_pack: bool = False,
               overwrite: bool = False) -> str:
    """
    Write a pack to a path.

    The output file is ``<output_dir>/<sub_path>.json``, with an extra
    ``.gz`` suffix when ``zip_pack`` is set. The existence check used for
    ``overwrite`` is performed on that final file name, so an existing
    ``.json.gz`` file is protected when ``zip_pack`` is true and an
    unrelated plain ``.json`` file does not block writing the zipped one.

    Args:
        input_pack: A Pack to be written.
        output_dir: The output directory.
        sub_path: The file name for this pack (without the ``.json``
            extension, which is appended here).
        indent: Indentation passed to ``json.dumps`` to pretty-print the
            serialized pack. A falsy value (``None`` or ``0``) leaves the
            serialized string untouched.
        zip_pack: Whether to gzip the output JSON.
        overwrite: Whether to overwrite the file if it already exists.

    Returns:
        The path of the output file. The path is returned both when the
        pack was written and when writing was skipped because the file
        already exists and ``overwrite`` is false.
    """
    output_path = os.path.join(output_dir, sub_path) + '.json'
    if zip_pack:
        # Resolve the final file name *before* the existence check;
        # otherwise the check looks at the wrong file.
        output_path = output_path + '.gz'

    if not overwrite and os.path.exists(output_path):
        logging.info(
            "Pack already exists at %s, skip writing.", output_path)
        return output_path

    ensure_dir(output_path)

    out_str: str = input_pack.serialize()
    if indent:
        # Round-trip through the json module to re-format with indentation.
        out_str = json.dumps(json.loads(out_str), indent=indent)

    if zip_pack:
        with gzip.open(output_path, 'wt') as out:
            out.write(out_str)
    else:
        with open(output_path, 'w') as out:
            out.write(out_str)

    logging.info("Writing a pack to %s", output_path)
    return output_path
def serialize_instance(instance: PackType) -> str:
    """
    Produce the string form of a pack.

    Args:
        instance: The pack to serialize; its ``serialize()`` method is
            called with no arguments.

    Returns:
        The value returned by ``instance.serialize()``.
    """
    serialized = instance.serialize()
    return serialized