def to_json(content, filename, indent=2, overwrite=True, warn=True,
            sort_keys=False):
    """Write `content` to a JSON file at `filename`.

    Uses a custom encoder that automatically converts numpy arrays to lists.

    If `filename` has a ".bz2" extension, the contents will be compressed
    (using bz2 and highest-level of compression, i.e., -9).

    If `filename` has a ".xor" extension, the contents will be xor-scrambled
    to make them human-unreadable (this is useful for, e.g., blind fits).

    Parameters
    ----------
    content : obj
        Object to be written to file. Tries making use of the object's own
        `to_json` method if it exists.

    filename : str
        Name of the file to be written to. Extension has to be 'json', 'bz2'
        or 'xor'.

    indent : int
        Pretty-printing. Cf. documentation of json.dump() or json.dumps()

    overwrite : bool
        Set to `True` (default) to allow overwriting existing file. Raise
        exception and quit otherwise.

    warn : bool
        Issue a warning message if a file is being overwritten (`True`,
        default). Suppress warning by setting to `False` (e.g. when
        overwriting is the desired behaviour).

    sort_keys : bool
        Output of dictionaries will be sorted by key if set to `True`.
        Default is `False`. Cf. json.dump() or json.dumps().

    """
    if hasattr(content, 'to_json'):
        return content.to_json(filename, indent=indent, overwrite=overwrite,
                               warn=warn, sort_keys=sort_keys)

    # Import here to avoid circular imports
    from pisa.utils.fileio import check_file_exists
    from pisa.utils.log import logging

    check_file_exists(fname=filename, overwrite=overwrite, warn=warn)

    _, ext = os.path.splitext(filename)
    ext = ext.replace('.', '').lower()
    assert ext == 'json' or ext in ZIP_EXTS + XOR_EXTS

    # Serialize once; `NumpyEncoder` converts numpy arrays to lists
    json_string = json.dumps(
        content, indent=indent, cls=NumpyEncoder, sort_keys=sort_keys,
        allow_nan=True, ignore_nan=False
    )

    # bz2 and xor outputs are raw bytes; plain JSON is text
    mode = 'w' if ext == 'json' else 'wb'
    with open(filename, mode) as outfile:
        if ext == 'bz2':
            # Compress with the highest compression level (9)
            outfile.write(bz2.compress(json_string.encode(), 9))
        elif ext == 'xor':
            # Scramble each byte with key 42; xor is its own inverse, so
            # applying the same operation again recovers the plain JSON
            outfile.write(bytes(b ^ 42 for b in json_string.encode()))
        else:
            outfile.write(json_string)
        logging.debug('Wrote %.2f kB to %s', outfile.tell() / 1024., filename)
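
# The following is an illustrative, hypothetical usage sketch and not part of
# the original module: the helper name `_example_to_json_roundtrip` and the
# file names under `tmpdir` are made up for demonstration. It assumes the
# module-level imports used by `to_json` (simplejson as `json`, `bz2`,
# `NumpyEncoder`) are available, and shows that numpy arrays are written out
# as lists, that a '.bz2' target holds bz2-compressed JSON, and that a '.xor'
# target holds the JSON text with every byte xor'ed with key 42 (xor being
# its own inverse, the same operation recovers the plain text).
def _example_to_json_roundtrip(tmpdir='/tmp'):
    """Hypothetical demo of `to_json` with plain, bz2 and xor targets."""
    import os
    import bz2
    import json as stdlib_json
    import numpy as np

    content = {'a': np.arange(5), 'b': 3.14}

    # Plain JSON: the numpy array is serialized as a list
    to_json(content, os.path.join(tmpdir, 'state.json'), warn=False)

    # bz2: decompressing the raw bytes yields the same JSON text
    to_json(content, os.path.join(tmpdir, 'state.bz2'), warn=False)
    with open(os.path.join(tmpdir, 'state.bz2'), 'rb') as f:
        plain = bz2.decompress(f.read()).decode()
    assert stdlib_json.loads(plain)['a'] == list(range(5))

    # xor: applying the same xor-with-42 un-scrambles the contents
    to_json(content, os.path.join(tmpdir, 'state.xor'), warn=False)
    with open(os.path.join(tmpdir, 'state.xor'), 'rb') as f:
        unscrambled = bytes(b ^ 42 for b in f.read()).decode()
    assert stdlib_json.loads(unscrambled)['b'] == 3.14
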

def to_hdf(data_dict, tgt, attrs=None, overwrite=True, warn=True):
    """Store a (possibly nested) dictionary to an HDF5 file or branch node
    within an HDF5 file (an h5py Group).

    This creates hardlinks for duplicate non-trivial leaf nodes (h5py
    Datasets) to minimize storage space required for redundant datasets.
    Duplication is detected via object hashing.

    NOTE: Branch nodes are sorted before storing (by name) for consistency in
    the generated file despite Python dictionaries having no defined ordering
    among keys.

    Parameters
    ----------
    data_dict : Mapping
        Dictionary, OrderedDict, or other Mapping to be stored

    tgt : str or h5py.Group
        Target for storing data. If `tgt` is a str, it is interpreted as a
        filename; a file is created with that name (overwriting an existing
        file, if present). After writing, the file is closed. If `tgt` is an
        h5py.Group, the data is simply written to that Group and it is left
        open at function return.

    attrs : Mapping
        Attributes to apply to the top-level entity being written. See
        http://docs.h5py.org/en/latest/high/attr.html

    overwrite : bool
        Set to `True` (default) to allow overwriting existing file. Raise
        exception and quit otherwise.

    warn : bool
        Issue a warning message if a file is being overwritten. Suppress
        warning by setting to `False` (e.g. when overwriting is the desired
        behaviour).

    """
    if not isinstance(data_dict, Mapping):
        raise TypeError('`data_dict` only accepts top-level'
                        ' dict/OrderedDict/etc.')

    def store_recursively(fhandle, node, path=None, attrs=None,
                          node_hashes=None):
        """Function for iteratively doing the work"""
        path = [] if path is None else path
        full_path = '/' + '/'.join(path)
        node_hashes = OrderedDict() if node_hashes is None else node_hashes

        if attrs is None:
            sorted_attr_keys = []
        else:
            if isinstance(attrs, OrderedDict):
                sorted_attr_keys = attrs.keys()
            else:
                sorted_attr_keys = sorted(attrs.keys())

        if isinstance(node, Mapping):
            logging.trace('  creating Group "%s"', full_path)
            try:
                dset = fhandle.create_group(full_path)
                for key in sorted_attr_keys:
                    dset.attrs[key] = attrs[key]
            except ValueError:
                pass

            for key in sorted(node.keys()):
                if isinstance(key, str):
                    key_str = key
                else:
                    key_str = str(key)
                    logging.warning(
                        'Making string from key "%s", %s for use as'
                        ' name in HDF5 file', key_str, type(key)
                    )
                val = node[key]
                new_path = path + [key_str]
                store_recursively(fhandle=fhandle, node=val, path=new_path,
                                  node_hashes=node_hashes)
        else:
            # Check for existing node
            node_hash = hash_obj(node)
            if node_hash in node_hashes:
                logging.trace('  creating hardlink for Dataset: "%s" -> "%s"',
                              full_path, node_hashes[node_hash])
                # Hardlink the matching existing dataset
                fhandle[full_path] = fhandle[node_hashes[node_hash]]
                return

            # For now, convert None to np.nan since h5py appears to not
            # handle None
            if node is None:
                node = np.nan
                logging.warning(
                    '  encountered `None` at node "%s"; converting to'
                    ' np.nan', full_path
                )

            # "Scalar datasets don't support chunk/filter options". Shuffling
            # is a good idea otherwise since subsequent compression will
            # generally benefit; shuffling requires chunking. Compression is
            # not done here since it is slow, but can be done by
            # post-processing the generated file(s).
            if np.isscalar(node):
                shuffle = False
                chunks = None
            else:
                shuffle = True
                chunks = True
                # Store the node_hash for linking to later if this is more
                # than a scalar datatype
Assumed that "None" has node_hashes[node_hash] = full_path # -- Handle special types -- # # See h5py docs at # # https://docs.h5py.org/en/stable/strings.html#how-to-store-text-strings # # where using `bytes` objects (i.e., in numpy, np.string_) is # deemed the most compatible way to encode objects, but apparently # we don't have pytables compatibility right now. # # For boolean support, see # # https://docs.h5py.org/en/stable/faq.html#faq # TODO: make written hdf5 files compatible with pytables # see docs at https://www.pytables.org/usersguide/datatypes.html if isinstance(node, string_types): node = np.string_(node) elif isinstance(node, bool): # includes np.bool node = np.bool_(node) # same as np.bool8 elif isinstance(node, np.ndarray): if issubclass(node.dtype.type, string_types): node = node.astype(np.string_) elif node.dtype.type in (bool, np.bool): node = node.astype(np.bool_) logging.trace(' creating dataset at path "%s", hash %s', full_path, node_hash) try: dset = fhandle.create_dataset( name=full_path, data=node, chunks=chunks, compression=None, shuffle=shuffle, fletcher32=False ) except TypeError: try: shuffle = False chunks = None dset = fhandle.create_dataset( name=full_path, data=node, chunks=chunks, compression=None, shuffle=shuffle, fletcher32=False ) except Exception: logging.error(' full_path: "%s"', full_path) logging.error(' chunks : %s', str(chunks)) logging.error(' shuffle : %s', str(shuffle)) logging.error(' node : "%s"', str(node)) raise for key in sorted_attr_keys: dset.attrs[key] = attrs[key] # Perform the actual operation using the dict passed in by user if isinstance(tgt, str): from pisa.utils.fileio import check_file_exists fpath = check_file_exists(fname=tgt, overwrite=overwrite, warn=warn) h5file = h5py.File(fpath, 'w') try: if attrs is not None: h5file.attrs.update(attrs) store_recursively(fhandle=h5file, node=data_dict) finally: h5file.close() elif isinstance(tgt, h5py.Group): store_recursively(fhandle=tgt, node=data_dict, attrs=attrs) else: raise TypeError('to_hdf: Invalid `tgt` type: %s' % type(tgt))