Ejemplo n.º 1
0
    def cache_data(self, collection: Any, pack: PackType, append: bool):
        r"""Specify the path to the cache directory.

        After you call this method, the dataset reader will use its
        ``cache_directory`` to store a cache of :class:`BasePack` read
        from every document passed to :func:`read`, serialized as one
        string-formatted :class:`BasePack`. If the cache file for a given
        ``file_path`` exists, we read the :class:`BasePack` from the cache.
        If the cache file does not exist, we will `create` it on our first
        pass through the data.

        Args:
            collection: The collection is a piece of data from the
                :meth:`_collect` function, to be read to produce DataPack(s).
                During caching, a cache key is computed based on the data in
                this collection.
            pack: The data pack to be cached.
            append: Whether to allow appending to the cache.
        """
        if not self._cache_directory:
            raise ValueError("Can not cache without a cache_directory!")

        os.makedirs(self._cache_directory, exist_ok=True)

        cache_filename = os.path.join(self._cache_directory,
                                      self._get_cache_location(collection))

        logger.info("Caching pack to %s", cache_filename)
        if append:
            with open(cache_filename, 'a') as cache:
                cache.write(pack.serialize() + "\n")
        else:
            with open(cache_filename, 'w') as cache:
                cache.write(pack.serialize() + "\n")
Ejemplo n.º 2
0
    def _process(self, input_pack: PackType):
        sub_path = self.sub_output_path(input_pack)
        if sub_path == '':
            raise ValueError("No concrete path provided from sub_output_path.")

        p = os.path.join(self.root_output_dir, sub_path)
        ensure_dir(p)

        if self.zip_pack:
            with gzip.open(p + '.gz', 'wt') as out:
                out.write(input_pack.serialize())
        else:
            with open(p, 'w') as out:
                out.write(input_pack.serialize())
Ejemplo n.º 3
0
def write_pack(input_pack: PackType,
               output_dir: str,
               sub_path: str,
               indent: Optional[int] = None,
               zip_pack: bool = False,
               overwrite: bool = False) -> str:
    """
    Write a pack to a path.

    Args:
        input_pack: A Pack to be written.
        output_dir: The output directory.
        sub_path: The file name for this pack.
        indent: Whether to format JSON with an indent.
        zip_pack: Whether to zip the output JSON.
        overwrite: Whether to overwrite the file if already exists.

    Returns:
        If successfully written, will return the path of the output file.
        otherwise, will return None.

    """
    output_path = os.path.join(output_dir, sub_path) + '.json'
    if overwrite or not os.path.exists(output_path):
        if zip_pack:
            output_path = output_path + '.gz'

        ensure_dir(output_path)

        out_str: str = input_pack.serialize()

        if indent:
            out_str = json.dumps(json.loads(out_str), indent=indent)

        if zip_pack:
            with gzip.open(output_path, 'wt') as out:
                out.write(out_str)
        else:
            with open(output_path, 'w') as out:
                out.write(out_str)

    logging.info("Writing a pack to %s", output_path)
    return output_path
Ejemplo n.º 4
0
 def serialize_instance(instance: PackType) -> str:
     """
     Serialize a pack to a string.
     """
     return instance.serialize()