def _inspect_input_files(self):
    """
    Inspect the input data files themselves.

    We check the file contents here, which means opening and reading heading
    information from the files.

    The first input file (and the metadata file, when one is given) is read
    on the manager rank only, and the resulting variable categories are
    duplicated to every rank.  The remaining input files are partitioned
    across the ranks and checked against the first file.  Missing variable
    names and per-file time values are then gathered on the manager rank,
    which sorts the input files by their first time value and checks that
    the time spans of consecutive files do not overlap.

    Attributes set by this method:
        _unlimited_dim, _time_invariant_metadata,
        _time_invariant_metafile_vars, _time_variant_metadata,
        _input_filenames (re-ordered by time), _time_series_variables

    Raises:
        LookupError: If no unlimited dimension is found in the first input
            file, or if that dimension is missing, not unlimited, or has no
            variable of the same name in one of the remaining input files
        ValueError: If the time values of two input files overlap
    """
    # Set the I/O backend according to what is specified
    iobackend.set_backend(self._backend)

    # Initialize the list of variable names for each category
    udim = None
    timeta = []
    xtra_timeta = []
    tvmeta = []

    # Initialize the local dictionary of time-series variables and sizes
    all_tsvars = {}
    file_times = {}

    #===== INSPECT FIRST INPUT FILE (ON MASTER PROCESS ONLY) =====

    # NOTE(review): an exception raised on the manager rank here happens
    # before the first sync(); how the other ranks react is not visible
    # from this method -- confirm against the communicator in use.
    if self._simplecomm.is_manager():
        first_filename = self._input_filenames[0]
        ifile = iobackend.NCFile(first_filename)
        try:
            # Look for the 'unlimited' dimension
            try:
                udim = next(dim for dim in ifile.dimensions
                            if ifile.unlimited(dim))
            except StopIteration:
                raise LookupError('Unlimited dimension not found.')

            # Get the first file's time values
            file_times[first_filename] = ifile.variables[udim][:]

            # Categorize each variable (only looking at first file)
            for var_name, var in ifile.variables.items():
                if udim not in var.dimensions:
                    if var_name not in self._exclude_list:
                        timeta.append(var_name)
                elif (var_name in self._metadata_names or
                      (self._1d_metadata and len(var.dimensions) == 1)):
                    tvmeta.append(var_name)
                elif (self._time_series_names is None or
                      var_name in self._time_series_names):
                    all_tsvars[var_name] = var.datatype.itemsize * var.size
        finally:
            # Close the first file, even if the inspection failed
            ifile.close()

        # Find variables only in the metadata file
        if self._metadata_filename is not None:
            ifile = iobackend.NCFile(self._metadata_filename)
            try:
                for var_name, var in ifile.variables.items():
                    if udim not in var.dimensions and var_name not in timeta:
                        xtra_timeta.append(var_name)
            finally:
                ifile.close()
    self._simplecomm.sync()

    # Send information to worker processes
    self._unlimited_dim = self._simplecomm.partition(
        udim, func=Duplicate(), involved=True)
    self._time_invariant_metadata = self._simplecomm.partition(
        timeta, func=Duplicate(), involved=True)
    self._time_invariant_metafile_vars = self._simplecomm.partition(
        xtra_timeta, func=Duplicate(), involved=True)
    self._time_variant_metadata = self._simplecomm.partition(
        tvmeta, func=Duplicate(), involved=True)
    all_tsvars = self._simplecomm.partition(
        all_tsvars, func=Duplicate(), involved=True)
    self._simplecomm.sync()
    if self._simplecomm.is_manager():
        self._vprint(' First input file inspected.', verbosity=2)

    #===== INSPECT REMAINING INPUT FILES (IN PARALLEL) =====

    # Get the list of variable names and missing variables
    var_names = set(all_tsvars).union(self._time_invariant_metadata,
                                      self._time_invariant_metafile_vars,
                                      self._time_variant_metadata)
    missing_vars = set()

    # Partition the remaining filenames to inspect
    input_filenames = self._simplecomm.partition(
        self._input_filenames[1:], func=EqualStride(), involved=True)

    # Make a pass through remaining files and:
    # (1) Make sure it has the 'unlimited' dimension
    # (2) Make sure this dimension is truly 'unlimited'
    # (3) Check that this dimension has a corresponding variable
    # (4) Check if there are any missing variables
    # (5) Get the time values from the files
    for ifilename in input_filenames:
        ifile = iobackend.NCFile(ifilename)
        try:
            # Determine the unlimited dimension
            if self._unlimited_dim not in ifile.dimensions:
                err_msg = 'Unlimited dimension not found in file "{0}"'.format(
                    ifilename)
                raise LookupError(err_msg)
            if not ifile.unlimited(self._unlimited_dim):
                err_msg = 'Dimension "{0}" not unlimited in file "{1}"'.format(
                    self._unlimited_dim, ifilename)
                raise LookupError(err_msg)
            if self._unlimited_dim not in ifile.variables:
                err_msg = 'Unlimited dimension variable not found in file "{0}"'.format(
                    ifilename)
                raise LookupError(err_msg)

            # Get the time values (list of NDArrays)
            file_times[ifilename] = ifile.variables[self._unlimited_dim][:]

            # Get the missing variables
            missing_vars.update(var_names - set(ifile.variables.keys()))
        finally:
            # Close the file, even if one of the checks above failed
            ifile.close()
    self._simplecomm.sync()
    if self._simplecomm.is_manager():
        self._vprint(' Remaining input files inspected.', verbosity=2)

    #===== CHECK FOR MISSING VARIABLES =====

    # Gather all missing variables on the master process.  Item [1] of each
    # collect() result on the manager is used as the data sent by a worker.
    if self._simplecomm.get_size() > 1:
        if self._simplecomm.is_manager():
            for _ in range(1, self._simplecomm.get_size()):
                missing_vars.update(self._simplecomm.collect()[1])
        else:
            self._simplecomm.collect(missing_vars)
    self._simplecomm.sync()

    # Check for missing variables only on master process
    if self._simplecomm.is_manager():
        # Remove metafile variables from missing vars set
        missing_vars -= set(self._time_invariant_metafile_vars)

        # Make sure that the list of variables in each file is the same
        if missing_vars:
            warning = ("WARNING: Some variables are not in all input files:{0} "
                       "{1}").format(linesep, ', '.join(sorted(missing_vars)))
            self._vprint(warning, header=False, verbosity=0)
        self._vprint(' Checked for missing variables.', verbosity=2)

    #===== SORT INPUT FILES BY TIME =====

    # Gather the file time values onto the master process
    if self._simplecomm.get_size() > 1:
        if self._simplecomm.is_manager():
            for _ in range(1, self._simplecomm.get_size()):
                file_times.update(self._simplecomm.collect()[1])
        else:
            self._simplecomm.collect(file_times)
    self._simplecomm.sync()

    # Check the order of the input files based on the time values
    if self._simplecomm.is_manager():
        # Determine the sort order based on the first time in the time
        # values
        old_order = range(len(self._input_filenames))
        new_order = sorted(
            old_order, key=lambda i: file_times[self._input_filenames[i]][0])

        # Re-order the list of input filenames and time values
        new_filenames = [self._input_filenames[i] for i in new_order]
        new_values = [file_times[fname] for fname in new_filenames]

        # Now, check that the largest time in each file is less than the
        # smallest time in the next file (so that the time spans of each
        # file do not overlap)
        for i in range(1, len(new_values)):
            if new_values[i - 1][-1] >= new_values[i][0]:
                err_msg = ('Times in input files {0} and {1} appear to '
                           'overlap').format(new_filenames[i - 1],
                                             new_filenames[i])
                raise ValueError(err_msg)
    else:
        new_filenames = None

    # Now that this is validated, save the time values and filename in the
    # new order
    self._input_filenames = self._simplecomm.partition(
        new_filenames, func=Duplicate(), involved=True)
    if self._simplecomm.is_manager():
        self._vprint(' Input files sorted by time.', verbosity=2)

    #===== FINALIZING OUTPUT =====
    self._simplecomm.sync()

    # Debug output
    if self._simplecomm.is_manager():
        self._vprint(' Time-Invariant Metadata: {0}'.format(
            ', '.join(self._time_invariant_metadata)), verbosity=1)
        if len(self._time_invariant_metafile_vars) > 0:
            self._vprint(' Additional Time-Invariant Metadata: {0}'.format(
                ', '.join(self._time_invariant_metafile_vars)), verbosity=1)
        self._vprint(' Time-Variant Metadata: {0}'.format(
            ', '.join(self._time_variant_metadata)), verbosity=1)
        self._vprint(
            ' Time-Series Variables: {0}'.format(', '.join(all_tsvars.keys())),
            verbosity=1)

    # Add 'once' variable if writing to a once file
    # NOTE: This is a "cheat"! There is no 'once' variable. It's just
    # a catch for all metadata IFF the 'once-file' is enabled.
    if self._use_once_file:
        # max() of an empty sequence raises ValueError, so fall back to a
        # zero weight when no time-series variables were found.
        all_tsvars['once'] = max(all_tsvars.values()) if all_tsvars else 0

    # Partition the time-series variables across processors
    self._time_series_variables = self._simplecomm.partition(
        list(all_tsvars.items()), func=WeightBalanced(), involved=True)
def execute(self, chunks=None, serial=False, history=False, scomm=None, deflate=None):
    """
    Execute the Data Flow

    Parameters:
        chunks (dict): A dictionary of output dimension names and chunk
            sizes for each dimension given. Output dimensions not included
            in the dictionary will not be chunked. (Use OrderedDict to
            preserve order of dimensions, where the first dimension will be
            assumed to correspond to the fastest-varying index and the last
            dimension will be assumed to correspond to the slowest-varying
            index.)  None (the default) is treated as an empty dictionary.
        serial (bool): Whether to run in serial (True) or parallel (False)
        history (bool): Whether to write a history attribute generated
            during execution for each variable in the file
        scomm (SimpleComm): An externally created SimpleComm object to use
            for managing parallel operation
        deflate (int): Override all output file deflate levels with given
            value

    Raises:
        TypeError: If chunks is not a dictionary, if a chunk size is not an
            int, or if scomm is neither None nor a SimpleComm
        ValueError: If chunks names an unknown output dimension or a
            "sum-like" dimension
    """
    # Use a fresh dictionary per call: a mutable default would be one object
    # shared by every call and by every write node it is handed to below.
    if chunks is None:
        chunks = {}

    # Check chunks type
    if not isinstance(chunks, dict):
        raise TypeError('Chunks must be specified with a dictionary')

    # Make sure that the specified chunking dimensions are valid
    for odname, odsize in chunks.items():
        if odname not in self._o2imap:
            raise ValueError(
                'Cannot chunk over unknown output dimension {!r}'.format(
                    odname))
        if not isinstance(odsize, int):
            raise TypeError(
                ('Chunk size invalid for output dimension {!r}: '
                 '{}').format(odname, odsize))

    # Check that we are not chunking over any "sum-like" dimensions
    sumlike_chunk_dims = sorted(
        d for d in chunks if d in self._sumlike_dimensions)
    if sumlike_chunk_dims:
        raise ValueError((
            'Cannot chunk over dimensions that are summed over (or "sum-like")'
            ': {}'.format(', '.join(sumlike_chunk_dims))))

    # Create the simple communicator, if necessary
    # (Each print call below takes exactly one argument, so it produces the
    # same text whether print is a statement or a function.)
    if scomm is None:
        scomm = create_comm(serial=bool(serial))
    elif isinstance(scomm, SimpleComm):
        if scomm.is_manager():
            print('Inheriting SimpleComm object from parent. (Ignoring serial argument.)')
    else:
        raise TypeError('Communication object is not a SimpleComm!')

    # Start general output
    prefix = '[{}/{}]'.format(scomm.get_rank(), scomm.get_size())
    if scomm.is_manager():
        print('Beginning execution of data flow...')
        print('Mapping Input Dimensions to Output Dimensions:')
        for d in sorted(self._i2omap):
            print(' {} --> {}'.format(d, self._i2omap[d]))
        if len(chunks) > 0:
            print('Chunking over Output Dimensions:')
            for d in chunks:
                print(' {}: {}'.format(d, chunks[d]))
        else:
            print('Not chunking output.')

    # Partition the output files/variables over available parallel (MPI)
    # ranks, using the values of self._filesizes as the balancing weights
    fnames = scomm.partition(list(self._filesizes.items()),
                             func=WeightBalanced(), involved=True)
    if scomm.is_manager():
        print('Writing {} files across {} MPI processes.'.format(
            len(self._filesizes), scomm.get_size()))
    scomm.sync()

    # Standard output
    print('{}: Writing {} files: {}'.format(
        prefix, len(fnames), ', '.join(fnames)))
    scomm.sync()

    # Loop over output files and write using given chunking
    for fname in fnames:
        print('{}: Writing file: {}'.format(prefix, fname))
        wnode = self._writenodes[fname]
        if history:
            wnode.enable_history()
        else:
            wnode.disable_history()
        wnode.execute(chunks=chunks, deflate=deflate)
        print('{}: Finished writing file: {}'.format(prefix, fname))

    scomm.sync()
    if scomm.is_manager():
        print('All output variables written.')
        print('')
def convert(self, output_limit=0):
    """
    Method to perform the Reshaper's designated operation.

    In this case, convert a list of time-slice files to time-series files.
    One output file is created per locally-assigned time-series variable
    (plus, when the once-file is enabled, a "once" file that holds only the
    metadata), and the input files are then copied into the output files
    one time step at a time.

    Keyword Arguments:
        output_limit (int): Limit on the number of output (time-series)
            files to write during the convert() operation. If set to 0, no
            limit is placed. This limits the number of output files
            produced by each processor in a parallel run.

    Raises:
        TypeError: If output_limit is not an integer
        OSError: If an output file to be created already exists
    """
    # Type checking input
    if not isinstance(output_limit, int):
        err_msg = 'Output limit must be an integer'
        raise TypeError(err_msg)

    # Start the total convert process timer
    self._simplecomm.sync()
    self._timer.start('Complete Conversion Process')

    # Debugging output
    if self._simplecomm.is_manager():
        self._vprint('Converting time-slices to time-series', verbosity=1)

    # For data common to all input files, we reference only the first
    ref_infile = self._input_files[0]

    # Store the common dimensions and attributes for each file
    # (taken from the first input file in the list)
    common_dims = ref_infile.dimensions
    common_atts = ref_infile.attributes

    # Partition the time-series variables across all processors
    tsv_names_loc = self._simplecomm.partition(
        list(self._time_series_variables.items()),
        func=WeightBalanced(), involved=True)
    if output_limit > 0:
        tsv_names_loc = tsv_names_loc[0:output_limit]

    # Print partitions for all ranks
    dbg_msg = 'Local time-series variables are {0}'.format(tsv_names_loc)
    self._vprint(dbg_msg, header=True, verbosity=2)

    # Reset all of the timer values (as it is possible that there are no
    # time-series variables in the local list produced above)
    for timer_name in ('Open Output Files',
                       'Create Time-Invariant Metadata',
                       'Create Time-Variant Metadata',
                       'Create Time-Series Variables',
                       'Write Time-Invariant Metadata',
                       'Write Time-Variant Metadata',
                       'Write Time-Series Variables',
                       'Close Output Files'):
        self._timer.reset(timer_name)

    # Initialize the byte count dictionary
    self._byte_counts['Requested Data'] = 0
    self._byte_counts['Actual Data'] = 0

    # Defining a simple helper function to determine whether to
    # write time-series data and/or write metadata. This is useful
    # for adding the ability to write a "once" file
    def _get_once_info(vname):
        is_once_file = (vname == 'once')
        write_meta = True
        write_tser = True
        if self._use_once_file:
            write_meta = is_once_file
            write_tser = not is_once_file
        return is_once_file, write_meta, write_tser

    # Copy every attribute in a name-to-value mapping onto a target object
    def _copy_attributes(source_atts, target):
        for att_name, att_val in source_atts.items():
            setattr(target, att_name, att_val)

    # Copy one step along the unlimited dimension from an input variable to
    # an output variable, and return the number of bytes in the data read
    def _copy_step(src_var, dst_var, src_step, dst_step):
        ndims = len(src_var.dimensions)
        udidx = src_var.dimensions.index(self._unlimited_dim)
        src_slice = [slice(None)] * ndims
        src_slice[udidx] = src_step
        dst_slice = [slice(None)] * ndims
        dst_slice[udidx] = dst_step
        # Read the slice once and reuse it for the byte count, rather than
        # re-reading the whole variable just to measure it
        data = src_var[tuple(src_slice)]
        dst_var[tuple(dst_slice)] = data
        return numpy.asarray(data).nbytes

    # Add to the byte counters; 'Actual Data' rounds the request up to a
    # whole number of blocks of size self.assumed_block_size
    def _count_bytes(requested_nbytes):
        # Force true division so the ceil() is not defeated by integer
        # division when both operands are ints
        block_size = float(self.assumed_block_size)
        self._byte_counts['Requested Data'] += requested_nbytes
        actual_nbytes = block_size * numpy.ceil(requested_nbytes / block_size)
        self._byte_counts['Actual Data'] += actual_nbytes

    # NOTE: In the prototype, we check for the existence of the output
    # directory at this point. If it does not exist, we create it (but
    # only from the master rank). This requires synchronization with
    # the decomp utility. Instead, we assume the output directory
    # already exists (and is checked by the Specifier's validation). No
    # synchronization is needed.

    # For each time-series variable, create the corresponding output file
    # (Also defines the header info for each output file)
    out_files = {}
    for out_name in tsv_names_loc:
        is_once_file, write_meta, write_tser = _get_once_info(out_name)

        # Determine the output file name for this variable
        out_filename = self._time_series_filenames[out_name]
        dbg_msg = 'Creating output file for variable: {0}'.format(out_name)
        if is_once_file:
            dbg_msg = 'Creating "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Open each output file and create the dimensions and attributes
        # NOTE: If the output file already exists, abort!
        self._timer.start('Open Output Files')
        if os.path.exists(out_filename):
            err_msg = 'Found existing output file: {0}'.format(
                out_filename)
            raise OSError(err_msg)
        out_file = Nio.open_file(out_filename, 'w',
                                 options=self._nio_options)
        _copy_attributes(common_atts, out_file)
        for dim_name, dim_val in common_dims.items():
            if dim_name == self._unlimited_dim:
                # The unlimited dimension is created with no fixed size
                out_file.create_dimension(dim_name, None)
            else:
                out_file.create_dimension(dim_name, dim_val)
        self._timer.stop('Open Output Files')

        if write_meta:
            # Create the time-invariant metadata variables
            self._timer.start('Create Time-Invariant Metadata')
            for name in self._time_invariant_metadata:
                in_var = ref_infile.variables[name]
                out_var = out_file.create_variable(
                    name, in_var.typecode(), in_var.dimensions)
                _copy_attributes(in_var.attributes, out_var)
            self._timer.stop('Create Time-Invariant Metadata')

            # Create the time-variant metadata variables
            self._timer.start('Create Time-Variant Metadata')
            for name in self._time_variant_metadata:
                in_var = ref_infile.variables[name]
                out_var = out_file.create_variable(
                    name, in_var.typecode(), in_var.dimensions)
                _copy_attributes(in_var.attributes, out_var)
            self._timer.stop('Create Time-Variant Metadata')

        # Create the time-series variable itself
        if write_tser:
            self._timer.start('Create Time-Series Variables')
            in_var = ref_infile.variables[out_name]
            out_file.create_variable(
                out_name, in_var.typecode(), in_var.dimensions)
            self._timer.stop('Create Time-Series Variables')

        # Append the output file to list
        out_files[out_name] = out_file

    # Now that each output file has been created, start writing the data
    # (Looping over output file index, which is common in name lists)
    for out_name, out_file in out_files.items():
        is_once_file, write_meta, write_tser = _get_once_info(out_name)
        dbg_msg = 'Writing output file for variable: {0}'.format(out_name)
        if is_once_file:
            dbg_msg = 'Writing "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Create the attributes of the time-series variable
        out_var = None
        if write_tser:
            in_var = ref_infile.variables[out_name]
            out_var = out_file.variables[out_name]
            _copy_attributes(in_var.attributes, out_var)

        # Write the time-invariant metadata
        if write_meta:
            self._timer.start('Write Time-Invariant Metadata')
            for name in self._time_invariant_metadata:
                in_meta = ref_infile.variables[name]
                out_meta = out_file.variables[name]
                if in_meta.rank > 0:
                    out_meta[:] = in_meta[:]
                else:
                    # Rank-0 variables are copied by value, not by slice
                    out_meta.assign_value(in_meta.get_value())
            self._timer.stop('Write Time-Invariant Metadata')

        # Write each time-variant variable
        series_step_index = 0
        for in_file in self._input_files:
            # Get the number of time steps in this slice file
            num_steps = in_file.dimensions[self._unlimited_dim]

            # Loop over the time steps in this slice file
            for slice_step_index in range(num_steps):
                # Write the time-variant metadata
                if write_meta:
                    self._timer.start('Write Time-Variant Metadata')
                    for name in self._time_variant_metadata:
                        _count_bytes(_copy_step(in_file.variables[name],
                                                out_file.variables[name],
                                                slice_step_index,
                                                series_step_index))
                    self._timer.stop('Write Time-Variant Metadata')

                # Write the time-series variables
                if write_tser:
                    self._timer.start('Write Time-Series Variables')
                    _count_bytes(_copy_step(in_file.variables[out_name],
                                            out_var,
                                            slice_step_index,
                                            series_step_index))
                    self._timer.stop('Write Time-Series Variables')

                # Increment the time-series step index
                series_step_index += 1

        # Close the output file
        self._timer.start('Close Output Files')
        out_file.close()
        self._timer.stop('Close Output Files')
        dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
        if is_once_file:
            dbg_msg = 'Closed "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

    # Information
    self._simplecomm.sync()
    if self._simplecomm.is_manager():
        self._vprint('Finished converting time-slices to time-series.',
                     verbosity=1)

    # Finish clocking the entire convert procedure
    self._timer.stop('Complete Conversion Process')
def _inspect_input_files(self):
    """
    Inspect the input data files themselves.

    We check the file contents here.  The first input file determines the
    unlimited dimension and the category of every variable; each remaining
    input file is then checked against it.  Finally, the input files are
    sorted by their first time value and checked for overlapping time
    spans.

    Attributes set by this method:
        _unlimited_dim, _time_invariant_metadata, _time_variant_metadata,
        _input_filenames (re-ordered by time), _time_series_variables

    Raises:
        LookupError: If no unlimited dimension is found in the first input
            file, or if that dimension is missing, not unlimited, or has no
            variable of the same name in one of the remaining input files
        ValueError: If the time values of two input files overlap
    """
    # Set the I/O backend according to what is specified
    iobackend.set_backend(self._backend)

    # Initialize the list of variable names for each category
    self._time_variant_metadata = []
    self._time_invariant_metadata = []

    # Initialize the local dictionary of time-series variables and sizes
    all_tsvars = {}

    #===== INSPECT FIRST INPUT FILE =====

    # Open first file
    ifile = iobackend.NCFile(self._input_filenames[0])
    try:
        # Look for the 'unlimited' dimension
        try:
            self._unlimited_dim = next(dim for dim in ifile.dimensions
                                       if ifile.unlimited(dim))
        except StopIteration:
            raise LookupError('Unlimited dimension not found.')

        # Get the time values
        time_values = [ifile.variables[self._unlimited_dim][:]]

        # Categorize each variable (only looking at first file)
        for var_name, var in ifile.variables.items():
            if self._unlimited_dim not in var.dimensions:
                self._time_invariant_metadata.append(var_name)
            elif (var_name in self._metadata_names or
                  (self._1d_metadata and len(var.dimensions) == 1)):
                self._time_variant_metadata.append(var_name)
            elif (self._time_series_names is None or
                  var_name in self._time_series_names):
                all_tsvars[var_name] = var.datatype.itemsize * var.size
    finally:
        # Close the first file, even if the inspection failed
        ifile.close()

    # Get the list of variable names and missing variables
    var_names = set(all_tsvars).union(self._time_invariant_metadata,
                                      self._time_variant_metadata)
    missing_vars = set()

    if self._simplecomm.is_manager():
        self._vprint(' First input file inspected.', verbosity=2)

    #===== INSPECT REMAINING INPUT FILES =====

    # Make a pass through remaining files and:
    # (1) Make sure it has the 'unlimited' dimension
    # (2) Make sure this dimension is truly 'unlimited'
    # (3) Check that this dimension has a corresponding variable
    # (4) Check if there are any missing variables
    # (5) Get the time values from the files
    for ifilename in self._input_filenames[1:]:
        ifile = iobackend.NCFile(ifilename)
        try:
            # Determine the unlimited dimension
            if self._unlimited_dim not in ifile.dimensions:
                err_msg = ('Unlimited dimension not found '
                           'in file "{0}"').format(ifilename)
                raise LookupError(err_msg)
            if not ifile.unlimited(self._unlimited_dim):
                err_msg = ('Dimension "{0}" not unlimited in file '
                           '"{1}"').format(self._unlimited_dim, ifilename)
                raise LookupError(err_msg)
            if self._unlimited_dim not in ifile.variables:
                err_msg = ('Unlimited dimension variable not found in file '
                           '"{0}"').format(ifilename)
                raise LookupError(err_msg)

            # Get the time values (list of NDArrays)
            time_values.append(ifile.variables[self._unlimited_dim][:])

            # Get the missing variables
            missing_vars.update(var_names - set(ifile.variables.keys()))
        finally:
            # Close the file, even if one of the checks above failed
            ifile.close()

    if self._simplecomm.is_manager():
        self._vprint(' Remaining input files inspected.', verbosity=2)

    #===== CHECK FOR MISSING VARIABLES =====

    # Make sure that the list of variables in each file is the same
    if missing_vars:
        warning = ("WARNING: The first input file has variables "
                   "that are not in all input files:{0} "
                   "{1}").format(linesep, ', '.join(sorted(missing_vars)))
        self._vprint(warning, header=True, verbosity=0)

    if self._simplecomm.is_manager():
        self._vprint(' Checked for missing variables.', verbosity=2)

    #===== SORT INPUT FILES BY TIME =====

    # Determine the sort order based on the first time in the time values
    old_order = range(len(self._input_filenames))
    new_order = sorted(old_order, key=lambda i: time_values[i][0])

    # Re-order the list of input filenames and time values
    new_filenames = [self._input_filenames[i] for i in new_order]
    new_values = [time_values[i] for i in new_order]

    # Now, check that the largest time in each file is less than the
    # smallest time in the next file (so that the time spans of each file
    # do not overlap)
    for i in range(1, len(new_values)):
        if new_values[i - 1][-1] >= new_values[i][0]:
            err_msg = ('Times in input files {0} and {1} appear '
                       'to overlap').format(new_filenames[i - 1],
                                            new_filenames[i])
            raise ValueError(err_msg)

    # Now that this is validated, save the time values and filename in
    # the new order
    self._input_filenames = new_filenames

    if self._simplecomm.is_manager():
        self._vprint(' Input files sorted by time.', verbosity=2)

    #===== FINALIZING OUTPUT =====

    # Debug output (the variable names are printed as lists)
    if self._simplecomm.is_manager():
        self._vprint(' Time-Invariant Metadata: '
                     '{0}'.format(self._time_invariant_metadata),
                     verbosity=1)
        self._vprint(' Time-Variant Metadata: '
                     '{0}'.format(self._time_variant_metadata),
                     verbosity=1)
        self._vprint(' Time-Series Variables: '
                     '{0}'.format(list(all_tsvars.keys())),
                     verbosity=1)

    # Add 'once' variable if writing to a once file
    # NOTE: This is a "cheat"! There is no 'once' variable. It's just
    # a catch for all metadata IFF the 'once-file' is enabled.
    if self._use_once_file:
        # max() of an empty sequence raises ValueError, so fall back to a
        # zero weight when no time-series variables were found.
        all_tsvars['once'] = max(all_tsvars.values()) if all_tsvars else 0

    # Partition the time-series variables across processors
    self._time_series_variables = self._simplecomm.partition(
        list(all_tsvars.items()), func=WeightBalanced(), involved=True)