def convert(self, output_limit=0, rchunks=None, wchunks=None):
    """
    Method to perform the Reshaper's designated operation.

    In this case, convert a list of time-slice files to time-series files.

    Parameters:
        output_limit (int): Limit on the number of output (time-series)
            files to write during the convert() operation.  If set to 0,
            no limit is placed.  This limits the number of output files
            produced by each processor in a parallel run.
        rchunks (dict): A dictionary of dimension names mapped to reading
            chunk sizes along that named dimension
        wchunks (dict): A dictionary of dimension names mapped to writing
            chunk sizes along that named dimension
    """
    iobackend.set_backend(self._backend)

    # Type checking input
    if type(output_limit) is not int:
        err_msg = 'Output limit must be an integer'
        raise TypeError(err_msg)

    # Start the total convert process timer
    self._timer.start('Complete Conversion Process')

    # Validate the input files themselves
    if self._simplecomm.is_manager():
        self._vprint('Inspecting input files...', verbosity=0)
    self._timer.start('Inspect Input Files')
    self._inspect_input_files()
    self._timer.stop('Inspect Input Files')
    if self._simplecomm.is_manager():
        self._vprint('...Input files inspected.', verbosity=0)

    # Validate the output files
    if self._simplecomm.is_manager():
        self._vprint('Inspecting output files...', verbosity=0)
    self._timer.start('Inspect Output Files')
    self._inspect_output_files()
    self._timer.stop('Inspect Output Files')
    if self._simplecomm.is_manager():
        self._vprint('...Output files inspected.', verbosity=0)

    # Check the read chunking
    if rchunks is None:
        # Default chunking is over 1 time-step at a time
        rchunks = {self._unlimited_dim: 1}
    if not isinstance(rchunks, dict):
        err_msg = 'Chunks must be specified with a dictionary'
        raise TypeError(err_msg)
    for key, value in rchunks.iteritems():
        if not isinstance(key, basestring):
            err_msg = 'Chunks dictionary must have string-type keys'
            raise TypeError(err_msg)
        if not isinstance(value, int):
            err_msg = 'Chunks dictionary must have integer chunk sizes'
            raise TypeError(err_msg)

    # Debugging output
    if self._simplecomm.is_manager():
        if len(rchunks) > 0:
            self._vprint('Read chunk sizes:', verbosity=1)
            for dname in rchunks:
                self._vprint(' {!s}: {}'.format(
                    dname, rchunks[dname]), verbosity=1)
        else:
            self._vprint('No read chunking specified.', verbosity=1)
        self._vprint(
            'Converting time-slices to time-series...', verbosity=0)
    self._simplecomm.sync()

    # Partition the time-series variables across all processors
    tsv_names_loc = self._time_series_variables
    if output_limit > 0:
        tsv_names_loc = tsv_names_loc[0:output_limit]

    # Print partitions for all ranks
    dbg_msg = 'Converting time-series variables: {0}'.format(
        ', '.join(tsv_names_loc))
    self._vprint(dbg_msg, header=True, verbosity=1)

    # Reset all of the timer values (as it is possible that there are no
    # time-series variables in the local list produced above)
    self._timer.reset('Open Output Files')
    self._timer.reset('Close Output Files')
    self._timer.reset('Open Input Files')
    self._timer.reset('Close Input Files')
    self._timer.reset('Create Time-Invariant Metadata')
    self._timer.reset('Create Time-Variant Metadata')
    self._timer.reset('Create Time-Series Variables')
    self._timer.reset('Read Time-Invariant Metadata')
    self._timer.reset('Read Time-Variant Metadata')
    self._timer.reset('Read Time-Series Variables')
    self._timer.reset('Write Time-Invariant Metadata')
    self._timer.reset('Write Time-Variant Metadata')
    self._timer.reset('Write Time-Series Variables')

    # Initialize the byte count dictionary
    self._byte_counts['Requested Data'] = 0
    self._byte_counts['Actual Data'] = 0

    #===== LOOP OVER TIME-SERIES VARIABLES =====

    if len(self._time_invariant_metafile_vars) > 0:
        metafile = iobackend.NCFile(self._metadata_filename)
    else:
        metafile = None

    # Loop over all time-series variables
    for out_name in tsv_names_loc:

        # Once-file data, for convenience
        is_once_file = (out_name == 'once')
        write_meta_data = not (self._use_once_file and not is_once_file)
        write_tser_data = not (self._use_once_file and is_once_file)

        # Determine the output file name for this variable
        out_filename = self._time_series_filenames[out_name]
        dbg_msg = 'Opening output file for variable: {0}'.format(out_name)
        if out_name == 'once':
            dbg_msg = 'Opening "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Open the output file
        self._timer.start('Open Output Files')
        temp_filename = out_filename + '_temp_.nc'
        if exists(temp_filename):
            remove(temp_filename)
        if self._write_mode == 'a' and out_name in self._existing:
            rename(out_filename, temp_filename)
            out_file = iobackend.NCFile(
                temp_filename, mode='a', ncfmt=self._netcdf_format,
                compression=self._netcdf_compression,
                least_significant_digit=self._netcdf_least_significant_digit)
            appending = True
        else:
            out_file = iobackend.NCFile(
                temp_filename, mode='w', ncfmt=self._netcdf_format,
                compression=self._netcdf_compression,
                least_significant_digit=self._netcdf_least_significant_digit)
            appending = False
        self._timer.stop('Open Output Files')

        # Start the loop over input files (i.e., time-slices)
        offsets = {
            self._unlimited_dim: self._time_series_step_index[out_name]}
        for in_filename in self._input_filenames:

            # Open the input file (and metadata file, if necessary)
            self._timer.start('Open Input Files')
            in_file = iobackend.NCFile(in_filename)
            self._timer.stop('Open Input Files')

            # Create header info, if this is the first input file
            if in_filename == self._input_filenames[0] and not appending:

                # Copy file attributes and dimensions to output file
                for name in in_file.ncattrs:
                    out_file.setncattr(name, in_file.getncattr(name))
                for name, val in in_file.dimensions.iteritems():
                    if name == self._unlimited_dim:
                        out_file.create_dimension(name)
                    else:
                        out_file.create_dimension(name, val)

                # Create the metadata variables
                if write_meta_data:

                    # Time-invariant metadata variables
                    self._timer.start('Create Time-Invariant Metadata')
                    for name in self._time_invariant_metadata:
                        self._create_var(in_file, out_file, name)
                    for name in self._time_invariant_metafile_vars:
                        self._create_var(metafile, out_file, name)
                    self._timer.stop('Create Time-Invariant Metadata')

                    # Time-variant metadata variables
                    self._timer.start('Create Time-Variant Metadata')
                    for name in self._time_variant_metadata:
                        self._create_var(in_file, out_file, name)
                    self._timer.stop('Create Time-Variant Metadata')

                # Create the time-series variable
                if write_tser_data:

                    # Time-series variable
                    self._timer.start('Create Time-Series Variables')
                    self._create_var(in_file, out_file, out_name,
                                     chunks=wchunks)
                    self._timer.stop('Create Time-Series Variables')

                dbg_msg = 'Writing output file for variable: {0}'.format(
                    out_name)
                if out_name == 'once':
                    dbg_msg = 'Writing "once" file.'
                self._vprint(dbg_msg, header=True, verbosity=1)

            # Copy the time-invariant metadata
            if write_meta_data:
                for name in self._time_invariant_metadata:
                    in_var = in_file.variables[name]
                    out_var = out_file.variables[name]
                    self._copy_var('Time-Invariant Metadata', in_var,
                                   out_var, chunks=rchunks)
                for name in self._time_invariant_metafile_vars:
                    in_var = metafile.variables[name]
                    out_var = out_file.variables[name]
                    self._copy_var('Time-Invariant Metadata', in_var,
                                   out_var, chunks=rchunks)

            # Copy the time-variant metadata
            if write_meta_data:
                for name in self._time_variant_metadata:
                    in_var = in_file.variables[name]
                    out_var = out_file.variables[name]
                    self._copy_var('Time-Variant Metadata', in_var, out_var,
                                   chunks=rchunks, offsets=offsets)

            # Copy the time-series variables
            if write_tser_data:
                in_var = in_file.variables[out_name]
                out_var = out_file.variables[out_name]
                self._copy_var('Time-Series Variables', in_var, out_var,
                               chunks=rchunks, offsets=offsets)

            # Increment the time-series index offset
            offsets[self._unlimited_dim] += \
                in_file.dimensions[self._unlimited_dim]

            # Close the input file
            self._timer.start('Close Input Files')
            in_file.close()
            self._timer.stop('Close Input Files')

        # Close the output file
        self._timer.start('Close Output Files')
        out_file.close()
        rename(temp_filename, out_filename)
        self._timer.stop('Close Output Files')

        # Output message to user
        dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
        if out_name == 'once':
            dbg_msg = 'Closed "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

    # Close the metadata file, if necessary
    if metafile:
        metafile.close()

    # Information
    self._simplecomm.sync()
    if self._simplecomm.is_manager():
        self._vprint(
            '...Finished converting time-slices to time-series.',
            verbosity=0)

    # Finish clocking the entire convert procedure
    self._timer.stop('Complete Conversion Process')
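
# A minimal usage sketch for convert() (a hypothetical driver script, not
# part of this module).  The create_specifier()/create_reshaper() entry
# points, their keyword names, and the 'time' dimension name below are
# assumptions about the surrounding PyReshaper package, not definitions
# made in this file:
#
#     from pyreshaper.specification import create_specifier
#     from pyreshaper.reshaper import create_reshaper
#
#     spec = create_specifier(infiles=['slice1.nc', 'slice2.nc'],
#                             prefix='myrun.', suffix='.nc')
#     rshpr = create_reshaper(spec, serial=False, verbosity=1)
#
#     # Read one time-step at a time (the default) and write the
#     # time-series variable in 100-step chunks along 'time'
#     rshpr.convert(rchunks={'time': 1}, wchunks={'time': 100})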

def _inspect_input_files(self):
    """
    Inspect the input data files themselves.

    We check the file contents here, which means opening and reading
    header information from the files.
    """
    # Set the I/O backend according to what is specified
    iobackend.set_backend(self._backend)

    # Initialize the list of variable names for each category
    udim = None
    timeta = []
    xtra_timeta = []
    tvmeta = []

    # Initialize the local dictionary of time-series variables and sizes
    all_tsvars = {}
    file_times = {}

    #===== INSPECT FIRST INPUT FILE (ON MASTER PROCESS ONLY) =====

    # Open first file
    if self._simplecomm.is_manager():
        ifile = iobackend.NCFile(self._input_filenames[0])

        # Look for the 'unlimited' dimension
        try:
            udim = next(
                dim for dim in ifile.dimensions if ifile.unlimited(dim))
        except StopIteration:
            err_msg = 'Unlimited dimension not found.'
            raise LookupError(err_msg)

        # Get the first file's time values
        file_times[self._input_filenames[0]] = ifile.variables[udim][:]

        # Categorize each variable (only looking at first file)
        for var_name, var in ifile.variables.iteritems():
            if udim not in var.dimensions:
                if var_name not in self._exclude_list:
                    timeta.append(var_name)
            elif var_name in self._metadata_names or (
                    self._1d_metadata and len(var.dimensions) == 1):
                tvmeta.append(var_name)
            elif (self._time_series_names is None or
                  var_name in self._time_series_names):
                all_tsvars[var_name] = var.datatype.itemsize * var.size

        # Close the first file
        ifile.close()

        # Find variables only in the metadata file
        if self._metadata_filename is not None:
            ifile = iobackend.NCFile(self._metadata_filename)
            for var_name, var in ifile.variables.iteritems():
                if udim not in var.dimensions and var_name not in timeta:
                    xtra_timeta.append(var_name)
            ifile.close()

    self._simplecomm.sync()

    # Send information to worker processes
    self._unlimited_dim = self._simplecomm.partition(
        udim, func=Duplicate(), involved=True)
    self._time_invariant_metadata = self._simplecomm.partition(
        timeta, func=Duplicate(), involved=True)
    self._time_invariant_metafile_vars = self._simplecomm.partition(
        xtra_timeta, func=Duplicate(), involved=True)
    self._time_variant_metadata = self._simplecomm.partition(
        tvmeta, func=Duplicate(), involved=True)
    all_tsvars = self._simplecomm.partition(
        all_tsvars, func=Duplicate(), involved=True)
    self._simplecomm.sync()

    if self._simplecomm.is_manager():
        self._vprint(' First input file inspected.', verbosity=2)

    #===== INSPECT REMAINING INPUT FILES (IN PARALLEL) =====

    # Get the list of variable names and missing variables
    var_names = set(
        all_tsvars.keys() + self._time_invariant_metadata +
        self._time_invariant_metafile_vars + self._time_variant_metadata)
    missing_vars = set()

    # Partition the remaining filenames to inspect
    input_filenames = self._simplecomm.partition(
        self._input_filenames[1:], func=EqualStride(), involved=True)

    # Make a pass through the remaining files and:
    # (1) Make sure each has the 'unlimited' dimension
    # (2) Make sure this dimension is truly 'unlimited'
    # (3) Check that this dimension has a corresponding variable
    # (4) Check if there are any missing variables
    # (5) Get the time values from the files
    for ifilename in input_filenames:
        ifile = iobackend.NCFile(ifilename)

        # Determine the unlimited dimension
        if self._unlimited_dim not in ifile.dimensions:
            err_msg = 'Unlimited dimension not found in file "{0}"'.format(
                ifilename)
            raise LookupError(err_msg)
        if not ifile.unlimited(self._unlimited_dim):
            err_msg = 'Dimension "{0}" not unlimited in file "{1}"'.format(
                self._unlimited_dim, ifilename)
            raise LookupError(err_msg)
        if self._unlimited_dim not in ifile.variables:
            err_msg = ('Unlimited dimension variable not found in '
                       'file "{0}"').format(ifilename)
            raise LookupError(err_msg)

        # Get the time values (list of NDArrays)
        file_times[ifilename] = ifile.variables[self._unlimited_dim][:]

        # Get the missing variables
        var_names_next = set(ifile.variables.keys())
        missing_vars.update(var_names - var_names_next)

        # Close the file
        ifile.close()

    self._simplecomm.sync()
    if self._simplecomm.is_manager():
        self._vprint(' Remaining input files inspected.', verbosity=2)

    #===== CHECK FOR MISSING VARIABLES =====

    # Gather all missing variables on the master process
    if self._simplecomm.get_size() > 1:
        if self._simplecomm.is_manager():
            for _ in range(1, self._simplecomm.get_size()):
                missing_vars.update(self._simplecomm.collect()[1])
        else:
            self._simplecomm.collect(missing_vars)
    self._simplecomm.sync()

    # Check for missing variables only on master process
    if self._simplecomm.is_manager():

        # Remove metafile variables from missing vars set
        missing_vars -= set(self._time_invariant_metafile_vars)

        # Make sure that the list of variables in each file is the same
        if len(missing_vars) != 0:
            warning = ('WARNING: Some variables are not in all input '
                       'files:{0} {1}').format(
                linesep, ', '.join(sorted(missing_vars)))
            self._vprint(warning, header=False, verbosity=0)

        self._vprint(' Checked for missing variables.', verbosity=2)

    #===== SORT INPUT FILES BY TIME =====

    # Gather the file time values onto the master process
    if self._simplecomm.get_size() > 1:
        if self._simplecomm.is_manager():
            for _ in range(1, self._simplecomm.get_size()):
                file_times.update(self._simplecomm.collect()[1])
        else:
            self._simplecomm.collect(file_times)
    self._simplecomm.sync()

    # Check the order of the input files based on the time values
    if self._simplecomm.is_manager():

        # Determine the sort order based on the first time in the time
        # values
        old_order = range(len(self._input_filenames))
        new_order = sorted(
            old_order,
            key=lambda i: file_times[self._input_filenames[i]][0])

        # Re-order the list of input filenames and time values
        new_filenames = [self._input_filenames[i] for i in new_order]
        new_values = [file_times[self._input_filenames[i]]
                      for i in new_order]

        # Now, check that the largest time in each file is less than the
        # smallest time in the next file (so that the time spans of each
        # file do not overlap)
        for i in xrange(1, len(new_values)):
            if new_values[i - 1][-1] >= new_values[i][0]:
                err_msg = ('Times in input files {0} and {1} appear to '
                           'overlap').format(new_filenames[i - 1],
                                             new_filenames[i])
                raise ValueError(err_msg)

    else:
        new_filenames = None

    # Now that this is validated, save the time values and filenames in
    # the new order
    self._input_filenames = self._simplecomm.partition(
        new_filenames, func=Duplicate(), involved=True)

    if self._simplecomm.is_manager():
        self._vprint(' Input files sorted by time.', verbosity=2)

    #===== FINALIZING OUTPUT =====

    self._simplecomm.sync()

    # Debug output
    if self._simplecomm.is_manager():
        self._vprint(' Time-Invariant Metadata: {0}'.format(
            ', '.join(self._time_invariant_metadata)), verbosity=1)
        if len(self._time_invariant_metafile_vars) > 0:
            self._vprint(' Additional Time-Invariant Metadata: {0}'.format(
                ', '.join(self._time_invariant_metafile_vars)), verbosity=1)
        self._vprint(' Time-Variant Metadata: {0}'.format(
            ', '.join(self._time_variant_metadata)), verbosity=1)
        self._vprint(' Time-Series Variables: {0}'.format(
            ', '.join(all_tsvars.keys())), verbosity=1)

    # Add 'once' variable if writing to a once file
    # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
    # a catch for all metadata IFF the 'once-file' is enabled.
    if self._use_once_file:
        all_tsvars['once'] = max(all_tsvars.values())

    # Partition the time-series variables across processors
    self._time_series_variables = self._simplecomm.partition(
        all_tsvars.items(), func=WeightBalanced(), involved=True)
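
# Sketch of the WeightBalanced partitioning used above (a standalone,
# hypothetical example; the variable names and byte weights are invented,
# and the ASAPTools import paths are assumptions based on the calls made
# in this module):
#
#     from asaptools.simplecomm import create_comm
#     from asaptools.partition import WeightBalanced
#
#     simplecomm = create_comm(serial=False)
#     weighted = [('T', 800), ('U', 800), ('PS', 100)]  # (name, bytes)
#     local = simplecomm.partition(weighted, func=WeightBalanced(),
#                                  involved=True)
#     # Each rank receives a subset of the variable names chosen so that
#     # the total byte weight is approximately balanced across ranks.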

def _inspect_output_files(self):
    """
    Perform inspection of the output data files themselves.

    We compute the output file name from the prefix and suffix, and then
    we check whether the output files exist.  By default, if the output
    file exists, then the job is stopped.
    """
    iobackend.set_backend(self._backend)

    # Loop through the time-series variables and generate output filenames
    self._time_series_filenames = dict(
        [(variable, self._output_prefix + variable + self._output_suffix)
         for variable in self._time_series_variables])

    # Find which files already exist
    self._existing = [v for (v, f)
                      in self._time_series_filenames.iteritems()
                      if isfile(f)]

    # Set the starting step index for each variable
    self._time_series_step_index = dict(
        [(variable, 0) for variable in self._time_series_variables])

    # If overwrite is enabled, delete all existing files first
    if self._write_mode == 'o':
        if self._simplecomm.is_manager() and len(self._existing) > 0:
            self._vprint('WARNING: Deleting existing output files for '
                         'time-series variables: {0}'.format(
                             ', '.join(sorted(self._existing))),
                         verbosity=0)
        for variable in self._existing:
            remove(self._time_series_filenames[variable])
        self._existing = []

    # Or, if skip existing is set, remove the existing time-series
    # variables from the list of time-series variables to convert
    elif self._write_mode == 's':
        if self._simplecomm.is_manager() and len(self._existing) > 0:
            self._vprint('WARNING: Skipping time-series variables with '
                         'existing output files: {0}'.format(
                             ', '.join(sorted(self._existing))),
                         verbosity=0)
        for variable in self._existing:
            self._time_series_variables.remove(variable)

    # Or, if appending, check that the existing output files conform
    # to the expected pattern
    elif self._write_mode == 'a':

        # Check each existing time-series file
        for variable in self._existing:

            # Get the matching filename
            filename = self._time_series_filenames[variable]

            # Open the time-series file for inspection
            tsfile = iobackend.NCFile(filename)

            # Check that the file has the unlimited dim and var
            if not tsfile.unlimited(self._unlimited_dim):
                err_msg = ('Cannot append to time-series file with '
                           'missing unlimited dimension '
                           '{0!r}').format(self._unlimited_dim)
                raise RuntimeError(err_msg)

            # Check for once file
            is_once_file = (variable == 'once')
            needs_meta_data = not (
                self._use_once_file and not is_once_file)
            needs_tser_data = not (self._use_once_file and is_once_file)

            # Look for metadata
            if needs_meta_data:

                # Check that the time-variant metadata are all present
                for metavar in self._time_variant_metadata:
                    if metavar not in tsfile.variables:
                        err_msg = ('Cannot append to time-series file '
                                   'with missing time-variant metadata '
                                   "'{0}'").format(metavar)
                        raise RuntimeError(err_msg)

            # Check that the time-series variable is present
            if needs_tser_data and variable not in tsfile.variables:
                err_msg = ('Cannot append to time-series file with '
                           "missing time-series variable "
                           "'{0}'").format(variable)
                raise RuntimeError(err_msg)

            # Get the starting step index to start writing from
            self._time_series_step_index[variable] = \
                tsfile.dimensions[self._unlimited_dim]

            # Close the time-series file
            tsfile.close()

    # Otherwise, throw an exception if any existing output files are found
    elif len(self._existing) > 0:
        err_msg = ('Found existing output files for time-series '
                   'variables: {0}').format(
            ', '.join(sorted(self._existing)))
        raise RuntimeError(err_msg)
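
# For reference, a summary of how the branches above behave for each write
# mode, using invented example names.  With prefix 'myrun.' and suffix
# '.nc', variable 'T' maps to output file 'myrun.T.nc':
#
#     # default     : RuntimeError if 'myrun.T.nc' already exists
#     # mode 'o'    : existing 'myrun.T.nc' is deleted, then rewritten
#     # mode 's'    : variable 'T' is skipped if 'myrun.T.nc' exists
#     # mode 'a'    : new time steps are appended to 'myrun.T.nc',
#     #               starting at the current length of the unlimited
#     #               dimension recorded in _time_series_step_index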

def _inspect_input_files(self):
    """
    Inspect the input data files themselves.

    We check the file contents here.
    """
    # Set the I/O backend according to what is specified
    iobackend.set_backend(self._backend)

    # Initialize the list of variable names for each category
    self._time_variant_metadata = []
    self._time_invariant_metadata = []

    # Initialize the local dictionary of time-series variables and sizes
    all_tsvars = {}

    #===== INSPECT FIRST INPUT FILE =====

    # Open first file
    ifile = iobackend.NCFile(self._input_filenames[0])

    # Look for the 'unlimited' dimension
    try:
        self._unlimited_dim = next(dim for dim in ifile.dimensions
                                   if ifile.unlimited(dim))
    except StopIteration:
        err_msg = 'Unlimited dimension not found.'
        raise LookupError(err_msg)

    # Get the time values
    time_values = [ifile.variables[self._unlimited_dim][:]]

    # Categorize each variable (only looking at first file)
    for var_name, var in ifile.variables.iteritems():
        if self._unlimited_dim not in var.dimensions:
            self._time_invariant_metadata.append(var_name)
        elif (var_name in self._metadata_names or
              (self._1d_metadata and len(var.dimensions) == 1)):
            self._time_variant_metadata.append(var_name)
        elif (self._time_series_names is None or
              var_name in self._time_series_names):
            all_tsvars[var_name] = var.datatype.itemsize * var.size

    # Get the list of variable names and missing variables
    var_names = set(all_tsvars.keys() +
                    self._time_invariant_metadata +
                    self._time_variant_metadata)
    missing_vars = set()

    # Close the first file
    ifile.close()

    if self._simplecomm.is_manager():
        self._vprint(' First input file inspected.', verbosity=2)

    #===== INSPECT REMAINING INPUT FILES =====

    # Make a pass through the remaining files and:
    # (1) Make sure each has the 'unlimited' dimension
    # (2) Make sure this dimension is truly 'unlimited'
    # (3) Check that this dimension has a corresponding variable
    # (4) Check if there are any missing variables
    # (5) Get the time values from the files
    for ifilename in self._input_filenames[1:]:
        ifile = iobackend.NCFile(ifilename)

        # Determine the unlimited dimension
        if self._unlimited_dim not in ifile.dimensions:
            err_msg = ('Unlimited dimension not found '
                       'in file "{0}"').format(ifilename)
            raise LookupError(err_msg)
        if not ifile.unlimited(self._unlimited_dim):
            err_msg = ('Dimension "{0}" not unlimited in file '
                       '"{1}"').format(self._unlimited_dim, ifilename)
            raise LookupError(err_msg)
        if self._unlimited_dim not in ifile.variables:
            err_msg = ('Unlimited dimension variable not found in file '
                       '"{0}"').format(ifilename)
            raise LookupError(err_msg)

        # Get the time values (list of NDArrays)
        time_values.append(ifile.variables[self._unlimited_dim][:])

        # Get the missing variables
        var_names_next = set(ifile.variables.keys())
        missing_vars.update(var_names - var_names_next)

        # Close the file
        ifile.close()

    if self._simplecomm.is_manager():
        self._vprint(' Remaining input files inspected.', verbosity=2)

    #===== CHECK FOR MISSING VARIABLES =====

    # Make sure that the list of variables in each file is the same
    if len(missing_vars) != 0:
        warning = ('WARNING: The first input file has variables '
                   'that are not in all input files:{0} '
                   '{1}').format(linesep, ', '.join(sorted(missing_vars)))
        self._vprint(warning, header=True, verbosity=0)

    if self._simplecomm.is_manager():
        self._vprint(' Checked for missing variables.', verbosity=2)

    #===== SORT INPUT FILES BY TIME =====

    # Determine the sort order based on the first time in the time values
    old_order = range(len(self._input_filenames))
    new_order = sorted(old_order, key=lambda i: time_values[i][0])

    # Re-order the list of input filenames and time values
    new_filenames = [self._input_filenames[i] for i in new_order]
    new_values = [time_values[i] for i in new_order]

    # Now, check that the largest time in each file is less than the
    # smallest time in the next file (so that the time spans of each file
    # do not overlap)
    for i in xrange(1, len(new_values)):
        if new_values[i - 1][-1] >= new_values[i][0]:
            err_msg = ('Times in input files {0} and {1} appear '
                       'to overlap').format(new_filenames[i - 1],
                                            new_filenames[i])
            raise ValueError(err_msg)

    # Now that this is validated, save the time values and filenames in
    # the new order
    self._input_filenames = new_filenames

    if self._simplecomm.is_manager():
        self._vprint(' Input files sorted by time.', verbosity=2)

    #===== FINALIZING OUTPUT =====

    # Debug output
    if self._simplecomm.is_manager():
        self._vprint(' Time-Invariant Metadata: '
                     '{0}'.format(self._time_invariant_metadata),
                     verbosity=1)
        self._vprint(' Time-Variant Metadata: '
                     '{0}'.format(self._time_variant_metadata),
                     verbosity=1)
        self._vprint(' Time-Series Variables: '
                     '{0}'.format(all_tsvars.keys()), verbosity=1)

    # Add 'once' variable if writing to a once file
    # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
    # a catch for all metadata IFF the 'once-file' is enabled.
    if self._use_once_file:
        all_tsvars['once'] = max(all_tsvars.values())

    # Partition the time-series variables across processors
    self._time_series_variables = self._simplecomm.partition(
        all_tsvars.items(), func=WeightBalanced(), involved=True)
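
# Illustration of the sorting and overlap check above, with invented time
# values (each inner list is one file's time coordinate):
#
#     time_values = [[10.0, 11.0], [0.0, 1.0], [5.0, 6.0]]
#     new_order = sorted(range(3), key=lambda i: time_values[i][0])
#     # new_order == [1, 2, 0]; files are reordered so their first times
#     # increase, and adjacent files must satisfy last(i-1) < first(i),
#     # i.e., [0.0, 1.0] < [5.0, 6.0] < [10.0, 11.0] -- no overlap.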