Example #1
0
    def __init__(self, specifier, serial=False, verbosity=1, wmode='w', once=False, simplecomm=None):
        """
        Constructor

        Checks the argument types, creates (or adopts) the communicator, sets
        up the timer and the verbose printer, validates the Specifier, and
        copies the Specifier settings onto this instance.  The constructor
        ends with a sync on the communicator.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).  Only used
                when no simplecomm is given.
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            wmode (str): The mode to use for writing output.  Can be 'w' for
                normal write operation, 's' to skip the output generation for
                existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series files.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary

        Raises:
            TypeError: If any of the arguments has the wrong type.
            ValueError: If wmode is not one of 'w', 's', 'o' or 'a'.
        """

        # Type checking (or double-checking)
        # NOTE: The flag checks compare the exact type (not isinstance), so
        #       a bool is not accepted as the integer verbosity.
        if not isinstance(specifier, Specifier):
            err_msg = "Input must be given in the form of a Specifier object"
            raise TypeError(err_msg)
        if type(serial) is not bool:
            err_msg = "Serial indicator must be True or False."
            raise TypeError(err_msg)
        if type(verbosity) is not int:
            err_msg = "Verbosity level must be an integer."
            raise TypeError(err_msg)
        if type(wmode) is not str:
            err_msg = "Write mode flag must be a str."
            raise TypeError(err_msg)
        if type(once) is not bool:
            err_msg = "Once-file indicator must be True or False."
            raise TypeError(err_msg)
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            err_msg = "Simple communicator object is not a SimpleComm"
            raise TypeError(err_msg)
        if wmode not in ['w', 's', 'o', 'a']:
            err_msg = "Write mode '{0}' not recognized".format(wmode)
            raise ValueError(err_msg)

        # Whether to write a once file
        self._use_once_file = once

        # The output write mode to use
        self._write_mode = wmode

        # Internal timer data
        self._timer = TimeKeeper()

        # Create a communicator only if the caller did not supply one
        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)

        # Reference to the simple communicator
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Dictionary storing read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Construct the print header, of the form "[rank/size] "
        header = ''.join(['[', str(self._simplecomm.get_rank()),
                          '/', str(self._simplecomm.get_size()), '] '])

        # Reference to the verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper...', verbosity=0)
            self._vprint('  MPI Communicator Size: {}'.format(
                self._simplecomm.get_size()), verbosity=1)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('  Specifier validated', verbosity=1)

        # The I/O backend to use (fall back to the backend reported by
        # iobackend.get_backend() if the requested one is not available)
        if iobackend.is_available(specifier.io_backend):
            self._backend = specifier.io_backend
        else:
            self._backend = iobackend.get_backend()
            # NOTE(review): Unlike the other messages, this one is not
            #               restricted to the manager process -- confirm
            #               that this is intended.
            self._vprint(('  I/O Backend {0} not available.  Using {1} '
                          'instead').format(specifier.io_backend, self._backend), verbosity=1)

        # Store the input file names
        self._input_filenames = specifier.input_file_list

        # Store the time-series variable names (None means no restriction
        # is reported here)
        self._time_series_names = specifier.time_series
        if self._time_series_names is not None:
            vnames = ', '.join(self._time_series_names)
            if self._simplecomm.is_manager():
                self._vprint('WARNING: Extracting only variables: {0}'.format(
                    vnames), verbosity=-1)

        # Store the list of metadata names
        self._metadata_names = specifier.time_variant_metadata

        # Store whether to treat 1D time-variant variables as metadata
        self._1d_metadata = specifier.assume_1d_time_variant_metadata

        # Store the metadata filename
        self._metadata_filename = specifier.metadata_filename

        # Store time invariant variables that should be excluded from the timeseries files
        self._exclude_list = specifier.exclude_list

        # Store the output file prefix and suffix
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # Setup NetCDF file options
        self._netcdf_format = specifier.netcdf_format
        self._netcdf_compression = specifier.compression_level
        self._netcdf_least_significant_digit = specifier.least_significant_digit
        if self._simplecomm.is_manager():
            self._vprint(
                '  NetCDF I/O Backend: {0}'.format(self._backend), verbosity=1)
            self._vprint('  NetCDF Output Format: {0}'.format(
                self._netcdf_format), verbosity=1)
            self._vprint('  NetCDF Compression: {0}'.format(
                self._netcdf_compression), verbosity=1)

            # Test against None explicitly: a value of 0 is falsy, but it
            # must not be reported as 'Disabled'.
            if self._netcdf_least_significant_digit is not None:
                trunc_str = '{} decimal places'.format(
                    self._netcdf_least_significant_digit)
            else:
                trunc_str = 'Disabled'
            self._vprint('  NetCDF Truncation: {0}'.format(
                trunc_str), verbosity=1)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('...Reshaper initialized.', verbosity=0)

        # Sync before continuing..
        self._simplecomm.sync()
Example #2
0
    def __init__(self,
                 specifier,
                 serial=False,
                 verbosity=1,
                 skip_existing=False,
                 overwrite=False,
                 once=False,
                 simplecomm=None):
        """
        Constructor

        Checks the argument types, creates (or adopts) the communicator, sets
        up the timer and the verbose printer, validates the Specifier,
        configures the PyNIO options, opens every input file, and then runs
        the input-file validation, time sorting, variable sorting and
        output-file validation steps.  The constructor ends with a sync on
        the communicator.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.

        Keyword Arguments:
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).  Only used
                when no simplecomm is given.
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            skip_existing (bool): Flag specifying whether to skip the generation
                of time-series for variables with time-series files that already
                exist.  Default is False.
            overwrite (bool): Flag specifying whether to forcefully overwrite
                output files if they already exist.  Default is False.
                (This flag is passed on unchecked to the output file
                validation.)
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary

        Raises:
            TypeError: If specifier, serial, verbosity, skip_existing, once
                or simplecomm has the wrong type.
        """

        # Type checking (or double-checking)
        # NOTE: The flag checks compare the exact type (not isinstance), so
        #       a bool is not accepted as the integer verbosity.
        if not isinstance(specifier, Specifier):
            err_msg = "Input must be given in the form of a Specifier object"
            raise TypeError(err_msg)
        if type(serial) is not bool:
            err_msg = "Serial indicator must be True or False."
            raise TypeError(err_msg)
        if type(verbosity) is not int:
            err_msg = "Verbosity level must be an integer."
            raise TypeError(err_msg)
        if type(skip_existing) is not bool:
            err_msg = "Skip_existing flag must be True or False."
            raise TypeError(err_msg)
        if type(once) is not bool:
            err_msg = "Once-file indicator must be True or False."
            raise TypeError(err_msg)
        if simplecomm is not None:
            if not isinstance(simplecomm, (SimpleComm, SimpleCommMPI)):
                # Adjacent string literals (no comma!) are joined into one
                # string, so the exception carries a single readable message
                # instead of a tuple of fragments.
                err_msg = ("Simple communicator object is not a SimpleComm or "
                           "SimpleCommMPI")
                raise TypeError(err_msg)

        # Whether to write a once file
        self._use_once_file = once

        # Internal timer data
        self._timer = TimeKeeper()

        # Dictionary storing read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Create a communicator only if the caller did not supply one
        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)
        # Reference to the simple communicator
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Construct the print header, of the form "[rank/size] "
        header = ''.join([
            '[',
            str(self._simplecomm.get_rank()), '/',
            str(self._simplecomm.get_size()), '] '
        ])

        # Reference to the verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper', verbosity=1)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('Specifier validated', verbosity=1)

        # Setup PyNIO options (including disabling the default PreFill option)
        opt = Nio.options()
        opt.PreFill = False

        # Determine the Format and CompressionLevel options
        # from the NetCDF format string in the Specifier
        # NOTE: Any other format string leaves Format/CompressionLevel unset
        #       here (presumably excluded by specifier.validate() -- confirm).
        if specifier.netcdf_format == 'netcdf':
            opt.Format = 'Classic'
        elif specifier.netcdf_format == 'netcdf4':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = 0
        elif specifier.netcdf_format == 'netcdf4c':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = specifier.netcdf_deflate
            if self._simplecomm.is_manager():
                self._vprint('PyNIO compression level: {0}'.format(
                    specifier.netcdf_deflate), verbosity=2)

        self._nio_options = opt
        if self._simplecomm.is_manager():
            self._vprint('PyNIO options set', verbosity=2)

        # Open all of the input files (the handles are kept on the instance)
        self._timer.start('Open Input Files')
        self._input_files = []
        for filename in specifier.input_file_list:
            self._input_files.append(Nio.open_file(filename, "r"))
        self._timer.stop('Open Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files opened', verbosity=2)

        # Validate the input files themselves
        self._timer.start('Input File Validation')
        self._validate_input_files(specifier)
        self._timer.stop('Input File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Input files validated', verbosity=2)

        # Sort the input files by time
        self._timer.start('Sort Input Files')
        self._sort_input_files_by_time(specifier)
        self._timer.stop('Sort Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files sorted', verbosity=2)

        # Retrieve and sort the variables in each time-slice file
        # (To determine if it is time-invariant metadata, time-variant
        # metadata, or if it is a time-series variable)
        self._timer.start('Sort Variables')
        self._sort_variables(specifier)
        self._timer.stop('Sort Variables')
        if self._simplecomm.is_manager():
            self._vprint('Variables sorted', verbosity=2)

        # Validate the output files
        self._timer.start('Output File Validation')
        self._validate_output_files(specifier, skip_existing, overwrite)
        self._timer.stop('Output File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Output files validated', verbosity=2)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=1)

        # Sync before continuing..
        self._simplecomm.sync()
Example #3
0
class Reshaper(object):

    """
    The time-slice to time-series Reshaper class

    This is the class that defines how the time-slice to time-series
    reshaping operation is to be performed.
    """

    def __init__(self, specifier, serial=False, verbosity=1, wmode='w', once=False, simplecomm=None):
        """
        Constructor

        Checks the argument types, creates (or adopts) the communicator, sets
        up the timer and the verbose printer, validates the Specifier, and
        copies the Specifier settings onto this instance.  The constructor
        ends with a sync on the communicator.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).  Only used
                when no simplecomm is given.
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            wmode (str): The mode to use for writing output.  Can be 'w' for
                normal write operation, 's' to skip the output generation for
                existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series files.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary

        Raises:
            TypeError: If any of the arguments has the wrong type.
            ValueError: If wmode is not one of 'w', 's', 'o' or 'a'.
        """

        # Type checking (or double-checking)
        # NOTE: The flag checks compare the exact type (not isinstance), so
        #       a bool is not accepted as the integer verbosity.
        if not isinstance(specifier, Specifier):
            err_msg = "Input must be given in the form of a Specifier object"
            raise TypeError(err_msg)
        if type(serial) is not bool:
            err_msg = "Serial indicator must be True or False."
            raise TypeError(err_msg)
        if type(verbosity) is not int:
            err_msg = "Verbosity level must be an integer."
            raise TypeError(err_msg)
        if type(wmode) is not str:
            err_msg = "Write mode flag must be a str."
            raise TypeError(err_msg)
        if type(once) is not bool:
            err_msg = "Once-file indicator must be True or False."
            raise TypeError(err_msg)
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            err_msg = "Simple communicator object is not a SimpleComm"
            raise TypeError(err_msg)
        if wmode not in ['w', 's', 'o', 'a']:
            err_msg = "Write mode '{0}' not recognized".format(wmode)
            raise ValueError(err_msg)

        # Whether to write a once file
        self._use_once_file = once

        # The output write mode to use
        self._write_mode = wmode

        # Internal timer data
        self._timer = TimeKeeper()

        # Create a communicator only if the caller did not supply one
        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)

        # Reference to the simple communicator
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Dictionary storing read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Construct the print header, of the form "[rank/size] "
        header = ''.join(['[', str(self._simplecomm.get_rank()),
                          '/', str(self._simplecomm.get_size()), '] '])

        # Reference to the verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper...', verbosity=0)
            self._vprint('  MPI Communicator Size: {}'.format(
                self._simplecomm.get_size()), verbosity=1)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('  Specifier validated', verbosity=1)

        # The I/O backend to use (fall back to the backend reported by
        # iobackend.get_backend() if the requested one is not available)
        if iobackend.is_available(specifier.io_backend):
            self._backend = specifier.io_backend
        else:
            self._backend = iobackend.get_backend()
            # NOTE(review): Unlike the other messages, this one is not
            #               restricted to the manager process -- confirm
            #               that this is intended.
            self._vprint(('  I/O Backend {0} not available.  Using {1} '
                          'instead').format(specifier.io_backend, self._backend), verbosity=1)

        # Store the input file names
        self._input_filenames = specifier.input_file_list

        # Store the time-series variable names (None means all time-series
        # variables found in the input are used)
        self._time_series_names = specifier.time_series
        if self._time_series_names is not None:
            vnames = ', '.join(self._time_series_names)
            if self._simplecomm.is_manager():
                self._vprint('WARNING: Extracting only variables: {0}'.format(
                    vnames), verbosity=-1)

        # Store the list of metadata names
        self._metadata_names = specifier.time_variant_metadata

        # Store whether to treat 1D time-variant variables as metadata
        self._1d_metadata = specifier.assume_1d_time_variant_metadata

        # Store the metadata filename
        self._metadata_filename = specifier.metadata_filename

        # Store time invariant variables that should be excluded from the timeseries files
        self._exclude_list = specifier.exclude_list

        # Store the output file prefix and suffix
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # Setup NetCDF file options
        self._netcdf_format = specifier.netcdf_format
        self._netcdf_compression = specifier.compression_level
        self._netcdf_least_significant_digit = specifier.least_significant_digit
        if self._simplecomm.is_manager():
            self._vprint(
                '  NetCDF I/O Backend: {0}'.format(self._backend), verbosity=1)
            self._vprint('  NetCDF Output Format: {0}'.format(
                self._netcdf_format), verbosity=1)
            self._vprint('  NetCDF Compression: {0}'.format(
                self._netcdf_compression), verbosity=1)

            # Test against None explicitly: a value of 0 is falsy, but it
            # must not be reported as 'Disabled'.
            if self._netcdf_least_significant_digit is not None:
                trunc_str = '{} decimal places'.format(
                    self._netcdf_least_significant_digit)
            else:
                trunc_str = 'Disabled'
            self._vprint('  NetCDF Truncation: {0}'.format(
                trunc_str), verbosity=1)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('...Reshaper initialized.', verbosity=0)

        # Sync before continuing..
        self._simplecomm.sync()

    def _inspect_input_files(self):
        """
        Inspect the input data files themselves.

        We check the file contents here, which means opening and reading
        header information from the files.

        The manager process inspects the first input file (and the metadata
        file, if one is given) to find the unlimited dimension and to
        categorize the variables.  These results are duplicated to all
        processes.  The remaining input files are then checked in parallel,
        the missing variables and the time values are collected on the
        manager process, and the input files are sorted by their first time
        value.  Finally, the time-series variables are partitioned across
        the processes.

        On return, the following attributes have been set:
        _unlimited_dim, _time_invariant_metadata,
        _time_invariant_metafile_vars, _time_variant_metadata,
        _input_filenames (re-ordered by time) and _time_series_variables.

        Raises:
            LookupError: If the first file has no unlimited dimension, or if
                one of the other files does not contain that dimension, does
                not have it as unlimited, or has no variable of that name.
            ValueError: If the time values of two consecutive input files
                (after sorting) overlap.
        """
        # Set the I/O backend according to what is specified
        iobackend.set_backend(self._backend)

        # Initialize the list of variable names for each category
        udim = None
        timeta = []
        xtra_timeta = []
        tvmeta = []

        # Initialize the local dictionary of time-series variables and sizes
        all_tsvars = {}
        file_times = {}

        #===== INSPECT FIRST INPUT FILE (ON MASTER PROCESS ONLY) =====

        # Open first file
        if self._simplecomm.is_manager():
            ifile = iobackend.NCFile(self._input_filenames[0])

            # Always close the file, even if the inspection fails
            try:
                # Look for the 'unlimited' dimension
                try:
                    udim = next(
                        dim for dim in ifile.dimensions if ifile.unlimited(dim))
                except StopIteration:
                    err_msg = 'Unlimited dimension not found.'
                    raise LookupError(err_msg)

                # Get the first file's time values
                file_times[self._input_filenames[0]] = ifile.variables[udim][:]

                # Categorize each variable (only looking at first file)
                for var_name, var in ifile.variables.iteritems():
                    if udim not in var.dimensions:
                        # Time-invariant metadata (unless explicitly excluded)
                        if var_name not in self._exclude_list:
                            timeta.append(var_name)
                    elif var_name in self._metadata_names or (self._1d_metadata and len(var.dimensions) == 1):
                        # Time-variant metadata
                        tvmeta.append(var_name)
                    elif self._time_series_names is None or var_name in self._time_series_names:
                        # Time-series variable, stored with its size in bytes
                        # (used below as the partitioning weight)
                        all_tsvars[var_name] = var.datatype.itemsize * var.size
            finally:
                # Close the first file
                ifile.close()

            # Find variables only in the metadata file
            if self._metadata_filename is not None:
                ifile = iobackend.NCFile(self._metadata_filename)
                try:
                    for var_name, var in ifile.variables.iteritems():
                        if udim not in var.dimensions and var_name not in timeta:
                            xtra_timeta.append(var_name)
                finally:
                    ifile.close()

        self._simplecomm.sync()

        # Send information to worker processes
        self._unlimited_dim = self._simplecomm.partition(
            udim, func=Duplicate(), involved=True)
        self._time_invariant_metadata = self._simplecomm.partition(
            timeta, func=Duplicate(), involved=True)
        self._time_invariant_metafile_vars = self._simplecomm.partition(
            xtra_timeta, func=Duplicate(), involved=True)
        self._time_variant_metadata = self._simplecomm.partition(
            tvmeta, func=Duplicate(), involved=True)
        all_tsvars = self._simplecomm.partition(
            all_tsvars, func=Duplicate(), involved=True)

        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint('  First input file inspected.', verbosity=2)

        #===== INSPECT REMAINING INPUT FILES (IN PARALLEL) =====

        # Get the list of variable names and missing variables
        var_names = set(
            all_tsvars.keys() + self._time_invariant_metadata + self._time_invariant_metafile_vars + self._time_variant_metadata)
        missing_vars = set()

        # Partition the remaining filenames to inspect
        input_filenames = self._simplecomm.partition(
            self._input_filenames[1:], func=EqualStride(), involved=True)

        # Make a pass through remaining files and:
        # (1) Make sure it has the 'unlimited' dimension
        # (2) Make sure this dimension is truly 'unlimited'
        # (3) Check that this dimension has a corresponding variable
        # (4) Check if there are any missing variables
        # (5) Get the time values from the files
        for ifilename in input_filenames:
            ifile = iobackend.NCFile(ifilename)

            # Always close the file, even if one of the checks fails
            try:
                # Determine the unlimited dimension
                if self._unlimited_dim not in ifile.dimensions:
                    err_msg = 'Unlimited dimension not found in file "{0}"'.format(
                        ifilename)
                    raise LookupError(err_msg)
                if not ifile.unlimited(self._unlimited_dim):
                    err_msg = 'Dimension "{0}" not unlimited in file "{1}"'.format(
                        self._unlimited_dim, ifilename)
                    raise LookupError(err_msg)
                if self._unlimited_dim not in ifile.variables:
                    err_msg = 'Unlimited dimension variable not found in file "{0}"'.format(
                        ifilename)
                    raise LookupError(err_msg)

                # Get the time values (list of NDArrays)
                file_times[ifilename] = ifile.variables[self._unlimited_dim][:]

                # Get the missing variables
                var_names_next = set(ifile.variables.keys())
                missing_vars.update(var_names - var_names_next)
            finally:
                # Close the file
                ifile.close()

        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint('  Remaining input files inspected.', verbosity=2)

        #===== CHECK FOR MISSING VARIABLES =====

        # Gather all missing variables on the master process
        # (the manager calls collect() once per worker and merges element [1]
        # of each result; the workers send their local set)
        if self._simplecomm.get_size() > 1:
            if self._simplecomm.is_manager():
                for _ in range(1, self._simplecomm.get_size()):
                    missing_vars.update(self._simplecomm.collect()[1])
            else:
                self._simplecomm.collect(missing_vars)
        self._simplecomm.sync()

        # Check for missing variables only on master process
        if self._simplecomm.is_manager():

            # Remove metafile variables from missing vars set
            missing_vars -= set(self._time_invariant_metafile_vars)

            # Make sure that the list of variables in each file is the same
            # (a mismatch only produces a warning, not an error)
            if len(missing_vars) != 0:
                warning = ("WARNING: Some variables are not in all input files:{0}   "
                           "{1}").format(linesep, ', '.join(sorted(missing_vars)))
                self._vprint(warning, header=False, verbosity=0)

            self._vprint('  Checked for missing variables.', verbosity=2)

        #===== SORT INPUT FILES BY TIME =====

        # Gather the file time values onto the master process
        if self._simplecomm.get_size() > 1:
            if self._simplecomm.is_manager():
                for _ in range(1, self._simplecomm.get_size()):
                    file_times.update(self._simplecomm.collect()[1])
            else:
                self._simplecomm.collect(file_times)
        self._simplecomm.sync()

        # Check the order of the input files based on the time values
        if self._simplecomm.is_manager():

            # Determine the sort order based on the first time in the time
            # values
            old_order = range(len(self._input_filenames))
            new_order = sorted(
                old_order, key=lambda i: file_times[self._input_filenames[i]][0])

            # Re-order the list of input filenames and time values
            new_filenames = [self._input_filenames[i] for i in new_order]
            new_values = [file_times[self._input_filenames[i]]
                          for i in new_order]

            # Now, check that the largest time in each file is less than the smallest time
            # in the next file (so that the time spans of each file do not
            # overlap)
            for i in xrange(1, len(new_values)):
                if new_values[i - 1][-1] >= new_values[i][0]:
                    err_msg = ('Times in input files {0} and {1} appear to '
                               'overlap').format(new_filenames[i - 1], new_filenames[i])
                    raise ValueError(err_msg)

        else:
            new_filenames = None

        # Now that this is validated, save the time values and filename in the
        # new order
        self._input_filenames = self._simplecomm.partition(
            new_filenames, func=Duplicate(), involved=True)

        if self._simplecomm.is_manager():
            self._vprint('  Input files sorted by time.', verbosity=2)

        #===== FINALIZING OUTPUT =====
        self._simplecomm.sync()

        # Debug output
        if self._simplecomm.is_manager():
            self._vprint('  Time-Invariant Metadata: {0}'.format(
                ', '.join(self._time_invariant_metadata)), verbosity=1)
            if len(self._time_invariant_metafile_vars) > 0:
                self._vprint('  Additional Time-Invariant Metadata: {0}'.format(
                    ', '.join(self._time_invariant_metafile_vars)), verbosity=1)
            self._vprint('  Time-Variant Metadata: {0}'.format(
                ', '.join(self._time_variant_metadata)), verbosity=1)
            self._vprint(
                '  Time-Series Variables: {0}'.format(', '.join(all_tsvars.keys())), verbosity=1)

        # Add 'once' variable if writing to a once file
        # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
        #       a catch for all metadata IFF the 'once-file' is enabled.
        #       It is given the largest weight of all time-series variables,
        #       or a weight of 0 if there are no time-series variables at all
        #       (max() of an empty sequence would raise a ValueError).
        if self._use_once_file:
            all_tsvars['once'] = max(all_tsvars.values()) if all_tsvars else 0

        # Partition the time-series variables across processors
        self._time_series_variables = self._simplecomm.partition(
            all_tsvars.items(), func=WeightBalanced(), involved=True)

    def _inspect_output_files(self):
        """
        Perform inspection of the output data files themselves.

        We compute the output file name from the prefix and suffix, and then
        we check whether the output files exist.  By default, if the output
        file exists, then the job is stopped.
        """
        iobackend.set_backend(self._backend)

        # Loop through the time-series variables and generate output filenames
        self._time_series_filenames = \
            dict([(variable, self._output_prefix + variable + self._output_suffix)
                  for variable in self._time_series_variables])

        # Find which files already exist
        self._existing = [v for (v, f) in self._time_series_filenames.iteritems()
                          if isfile(f)]

        # Set the starting step index for each variable
        self._time_series_step_index = dict([(variable, 0) for variable in
                                             self._time_series_variables])

        # If overwrite is enabled, delete all existing files first
        if self._write_mode == 'o':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Deleting existing output files for time-series '
                             'variables: {0}'.format(', '.join(sorted(self._existing))), verbosity=0)
            for variable in self._existing:
                remove(self._time_series_filenames[variable])
            self._existing = []

        # Or, if skip existing is set, remove the existing time-series
        # variables from the list of time-series variables to convert
        elif self._write_mode == 's':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Skipping time-series variables with '
                             'existing output files: {0}'.format(', '.join(sorted(self._existing))), verbosity=0)
            for variable in self._existing:
                self._time_series_variables.remove(variable)

        # Or, if appending, check that the existing output files conform
        # to the expected pattern
        elif self._write_mode == 'a':

            # Check each existing time-series file
            for variable in self._existing:

                # Get the matching filename
                filename = self._time_series_filenames[variable]

                # Open the time-series file for inspection
                tsfile = iobackend.NCFile(filename)

                # Check that the file has the unlimited dim and var
                if not tsfile.unlimited(self._unlimited_dim):
                    err_msg = ('Cannot append to time-series file with missing unlimited '
                               'dimension {0!r}').format(self._unlimited_dim)
                    raise RuntimeError(err_msg)

                # Check for once file
                is_once_file = (variable == 'once')
                needs_meta_data = not (
                    self._use_once_file and not is_once_file)
                needs_tser_data = not (self._use_once_file and is_once_file)

                # Look for metadata
                if needs_meta_data:

                    # Check that the time-variant metadata are all present
                    for metavar in self._time_variant_metadata:
                        if metavar not in tsfile.variables:
                            err_msg = ("Cannot append to time-series file with missing time-variant metadata "
                                       "'{0}'").format(metavar)
                            raise RuntimeError(err_msg)

                # Check that the time-series variable is present
                if needs_tser_data and variable not in tsfile.variables:
                    err_msg = ("Cannot append to time-series file with missing time-series variable "
                               "'{0}'").format(variable)
                    raise RuntimeError(err_msg)

                # Get the starting step index to start writing from
                self._time_series_step_index[variable] = tsfile.dimensions[self._unlimited_dim]

                # Close the time-series file
                tsfile.close()

        # Otherwise, throw an exception if any existing output files are found
        elif len(self._existing) > 0:
            err_msg = "Found existing output files for time-series variables: {0}".format(
                ', '.join(sorted(self._existing)))
            raise RuntimeError(err_msg)

    def _create_var(self, in_file, out_file, vname, chunks=None):
        in_var = in_file.variables[vname]
        fill_value = in_var.fill_value
        if in_var.chunk_sizes is not None and chunks is not None:
            chunksizes = [chunks[d] if d in chunks else c
                          for d, c in zip(in_var.dimensions, in_var.chunk_sizes)]
        else:
            chunksizes = None
        out_var = out_file.create_variable(
            vname, in_var.datatype, in_var.dimensions, fill_value=fill_value, chunksizes=chunksizes)
        for att_name in in_var.ncattrs:
            att_value = in_var.getncattr(att_name)
            out_var.setncattr(att_name, att_value)

    def _chunk_iter(self, vobj, chunks={}, corder=True):
        """
        This is a generator function to iterator over chunks of arrays with named dimensions

        Parameters:
            vobj: A NetCDF file variable object with dimensions and shape attributes
            chunks (dict): A dictionary of dimension names mapped to chunk sizes along that
                named dimension
            corder (bool): Whether to assume the array has C-style axis ordering, where the
                fastest changing dimension is assumed to be the first axis.  If False, then
                the fastest changing dimension is assumed to be the last.
        """
        dimensions = vobj.dimensions
        shape = vobj.shape

        nchunks = 1
        dchunks = []
        for dname, dlen in zip(dimensions, shape):
            if dname in chunks:
                clen = chunks[dname]
                cnum = dlen // clen
                if dlen % clen > 0:
                    cnum += 1
                nchunks *= cnum
            else:
                clen = dlen
                cnum = 1
            dchunks.append((dlen, clen, cnum))

        for n in xrange(nchunks):
            cidx = []
            nidx = n
            nstride = nchunks
            if corder:
                diter = reversed(dchunks)
            else:
                diter = iter(dchunks)
            for dlen, clen, cnum in diter:
                nstride = nstride // cnum
                cidx.append(nidx // nstride)
                nidx = nidx % nstride
            if corder:
                cidx.reverse()

            cslice = []
            for d in xrange(len(shape)):
                ic = cidx[d]
                dlen, clen, cnum = dchunks[d]

                ibeg = ic * clen
                iend = (ic + 1) * clen
                if iend >= dlen:
                    iend = dlen

                cslice.append(slice(ibeg, iend))

            yield tuple(cslice)

    def _offset_chunk(self, chunk, vobj, offset):
        """
        Compute a new chunk/slice for a variable with a given offset

        Parameters:
            chunk (tuple): A tuple of slices across each dimension
            vobj: A NetCDF file variable object with dimensions and shape attributes
            offset (dict): Offsets for each dimension (if any)

        Returns:
            tuple: A tuple of slices across each dimension with offsets added
        """
        new_chunk = []
        for i, d in enumerate(vobj.dimensions):
            if d in offset:
                o = offset[d]
            else:
                o = 0
            new_chunk.append(slice(chunk[i].start + o, chunk[i].stop + o))
        return tuple(new_chunk)

    def _copy_var(self, kind, in_var, out_var, chunks={}, offsets={}):
        """
        Copy variable data from one variable object to another via chunking

        Parameters:
            kind (str): A string describing the kind of variable being copied
            in_var: A NetCDF variable object to read data from
            out_var: A NetCDF variable object to write data to
            chunks (dict): A dictionary of dimension names mapped to chunk sizes along that named dimension
            offsets (dict): Integer offsets along each dimension
        """
        for rslice in self._chunk_iter(in_var, chunks=chunks):

            self._timer.start('Read {0}'.format(kind))
            tmp_data = in_var[rslice]
            self._timer.stop('Read {0}'.format(kind))
            wslice = self._offset_chunk(rslice, out_var, offsets)
            self._timer.start('Write {0}'.format(kind))
            out_var[wslice] = tmp_data
            self._timer.stop('Write {0}'.format(kind))

            requested_nbytes = tmp_data.nbytes if hasattr(
                tmp_data, 'nbytes') else 0
            self._byte_counts['Requested Data'] += requested_nbytes
            actual_nbytes = (self.assumed_block_size * numpy.ceil(requested_nbytes / self.assumed_block_size))
            self._byte_counts['Actual Data'] += actual_nbytes

    def convert(self, output_limit=0, rchunks=None, wchunks=None):
        """
        Method to perform the Reshaper's designated operation.

        In this case, convert a list of time-slice files to time-series files.

        The input and output files are inspected first.  Then, for every
        time-series variable handled by this process, an output file is written
        under a temporary name (the final name with '_temp_.nc' appended),
        filled by looping over all input files, closed, and finally renamed to
        its final name.  When appending to an existing output file, that file
        is first renamed to the temporary name and opened in append mode.

        Parameters:
            output_limit (int): Limit on the number of output (time-series) files to write during the
                convert() operation.  If set to 0, no limit is placed.  This limits the number of output files
                produced by each processor in a parallel run.
            rchunks (dict): A dictionary of dimension names mapped to reading chunk sizes along that named
                dimension.  If None, data is read one step at a time along the unlimited dimension.
            wchunks (dict): A dictionary of dimension names mapped to writing chunk sizes along that named
                dimension.  It is only used when creating the time-series variable in a new output file,
                and it is not validated in this method.

        Raises:
            TypeError: If output_limit is not an int, if rchunks is not a dictionary, or if rchunks
                has a key that is not a string or a value that is not an integer
        """
        iobackend.set_backend(self._backend)

        # Type checking input
        if type(output_limit) is not int:
            err_msg = 'Output limit must be an integer'
            raise TypeError(err_msg)

        # Start the total convert process timer
        self._timer.start('Complete Conversion Process')

        # Validate the input files themselves
        if self._simplecomm.is_manager():
            self._vprint('Inspecting input files...', verbosity=0)
        self._timer.start('Inspect Input Files')
        self._inspect_input_files()
        self._timer.stop('Inspect Input Files')
        if self._simplecomm.is_manager():
            self._vprint('...Input files inspected.', verbosity=0)

        # Validate the output files
        if self._simplecomm.is_manager():
            self._vprint('Inspecting output files...', verbosity=0)
        self._timer.start('Inspect Output Files')
        self._inspect_output_files()
        self._timer.stop('Inspect Output Files')
        if self._simplecomm.is_manager():
            self._vprint('...Output files inspected.', verbosity=0)

        # Check the read chunking
        if rchunks is None:
            # Default chunking is over 1 time-step at a time
            rchunks = {self._unlimited_dim: 1}
        if not isinstance(rchunks, dict):
            err_msg = 'Chunks must be specified with a dictionary'
            raise TypeError(err_msg)
        for key, value in rchunks.iteritems():
            if not isinstance(key, basestring):
                err_msg = 'Chunks dictionary must have string-type keys'
                raise TypeError(err_msg)
            if not isinstance(value, int):
                err_msg = 'Chunks dictionary must have integer chunk sizes'
                raise TypeError(err_msg)

        # Debugging output
        if self._simplecomm.is_manager():
            if len(rchunks) > 0:
                self._vprint('Read chunk sizes:', verbosity=1)
                for dname in rchunks:
                    self._vprint('  {!s}: {}'.format(
                        dname, rchunks[dname]), verbosity=1)
            else:
                self._vprint('No read chunking specified.', verbosity=1)
            self._vprint(
                'Converting time-slices to time-series...', verbosity=0)
        self._simplecomm.sync()

        # Take the list of time-series variables for this process, truncated to
        # the output limit (if any).  No partitioning is done in this method.
        # NOTE(review): presumably the list was already partitioned across
        # processors during input-file inspection -- confirm
        tsv_names_loc = self._time_series_variables
        if output_limit > 0:
            tsv_names_loc = tsv_names_loc[0:output_limit]

        # Print partitions for all ranks
        dbg_msg = 'Converting time-series variables: {0}'.format(
            ', '.join(tsv_names_loc))
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Reset all of the timer values (as it is possible that there are no
        # time-series variables in the local list produced above)
        self._timer.reset('Open Output Files')
        self._timer.reset('Close Output Files')
        self._timer.reset('Open Input Files')
        self._timer.reset('Close Input Files')
        self._timer.reset('Create Time-Invariant Metadata')
        self._timer.reset('Create Time-Variant Metadata')
        self._timer.reset('Create Time-Series Variables')
        self._timer.reset('Read Time-Invariant Metadata')
        self._timer.reset('Read Time-Variant Metadata')
        self._timer.reset('Read Time-Series Variables')
        self._timer.reset('Write Time-Invariant Metadata')
        self._timer.reset('Write Time-Variant Metadata')
        self._timer.reset('Write Time-Series Variables')

        # Initialize the byte count dictionary
        self._byte_counts['Requested Data'] = 0
        self._byte_counts['Actual Data'] = 0

        #===== LOOP OVER TIME_SERIES VARIABLES =====

        # Open the separate metadata file once, but only if some time-invariant
        # metadata variables have to be read from it
        if len(self._time_invariant_metafile_vars) > 0:
            metafile = iobackend.NCFile(self._metadata_filename)
        else:
            metafile = None

        # Loop over all time-series variables
        for out_name in tsv_names_loc:

            # Once-file data, for convenience: when a once file is used, the
            # metadata is written only to the 'once' file and the time-series
            # data only to the other files; otherwise both are written everywhere
            is_once_file = (out_name == 'once')
            write_meta_data = not (self._use_once_file and not is_once_file)
            write_tser_data = not (self._use_once_file and is_once_file)

            # Determine the output file name for this variable
            out_filename = self._time_series_filenames[out_name]
            dbg_msg = 'Opening output file for variable: {0}'.format(out_name)
            if out_name == 'once':
                dbg_msg = 'Opening "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Open the output file under a temporary name (any leftover
            # temporary file with that name is deleted first)
            self._timer.start('Open Output Files')
            temp_filename = out_filename + '_temp_.nc'
            if exists(temp_filename):
                remove(temp_filename)
            if self._write_mode == 'a' and out_name in self._existing:
                # Appending: move the existing output file to the temporary
                # name and extend it in place
                rename(out_filename, temp_filename)
                out_file = iobackend.NCFile(temp_filename, mode='a',
                                            ncfmt=self._netcdf_format,
                                            compression=self._netcdf_compression,
                                            least_significant_digit=self._netcdf_least_significant_digit)
                appending = True
            else:
                out_file = iobackend.NCFile(temp_filename, mode='w',
                                            ncfmt=self._netcdf_format,
                                            compression=self._netcdf_compression,
                                            least_significant_digit=self._netcdf_least_significant_digit)
                appending = False
            self._timer.stop('Open Output Files')

            # Start the loop over input files (i.e., time-slices), writing
            # from the starting step index recorded for this variable
            offsets = {
                self._unlimited_dim: self._time_series_step_index[out_name]}
            for in_filename in self._input_filenames:

                # Open the input file
                self._timer.start('Open Input Files')
                in_file = iobackend.NCFile(in_filename)
                self._timer.stop('Open Input Files')

                # Create header info, if this is the first input file
                # (skipped when appending to an existing output file)
                if in_filename == self._input_filenames[0] and not appending:

                    # Copy file attributes and dimensions to output file
                    for name in in_file.ncattrs:
                        out_file.setncattr(name, in_file.getncattr(name))
                    for name, val in in_file.dimensions.iteritems():
                        if name == self._unlimited_dim:
                            # Created without a size
                            out_file.create_dimension(name)
                        else:
                            out_file.create_dimension(name, val)

                    # Create the metadata variables
                    if write_meta_data:

                        # Time-invariant metadata variables
                        self._timer.start('Create Time-Invariant Metadata')
                        for name in self._time_invariant_metadata:
                            self._create_var(in_file, out_file, name)
                        for name in self._time_invariant_metafile_vars:
                            self._create_var(metafile, out_file, name)
                        self._timer.stop('Create Time-Invariant Metadata')

                        # Time-variant metadata variables
                        self._timer.start('Create Time-Variant Metadata')
                        for name in self._time_variant_metadata:
                            self._create_var(in_file, out_file, name)
                        self._timer.stop('Create Time-Variant Metadata')

                    # Create the time-series variable
                    if write_tser_data:

                        # Time-series variable (the only place wchunks is used)
                        self._timer.start('Create Time-Series Variables')
                        self._create_var(in_file, out_file,
                                         out_name, chunks=wchunks)
                        self._timer.stop('Create Time-Series Variables')

                    dbg_msg = 'Writing output file for variable: {0}'.format(
                        out_name)
                    if out_name == 'once':
                        dbg_msg = 'Writing "once" file.'
                    self._vprint(dbg_msg, header=True, verbosity=1)

                    # Copy the time-invariant metadata (done only once, from
                    # the first input file and the metadata file)
                    if write_meta_data:
                        for name in self._time_invariant_metadata:
                            in_var = in_file.variables[name]
                            out_var = out_file.variables[name]
                            self._copy_var('Time-Invariant Metadata',
                                           in_var, out_var, chunks=rchunks)
                        for name in self._time_invariant_metafile_vars:
                            in_var = metafile.variables[name]
                            out_var = out_file.variables[name]
                            self._copy_var('Time-Invariant Metadata',
                                           in_var, out_var, chunks=rchunks)

                # Copy the time-variant metadata
                if write_meta_data:
                    for name in self._time_variant_metadata:
                        in_var = in_file.variables[name]
                        out_var = out_file.variables[name]
                        self._copy_var('Time-Variant Metadata', in_var,
                                       out_var, chunks=rchunks, offsets=offsets)

                # Copy the time-series variables
                if write_tser_data:
                    in_var = in_file.variables[out_name]
                    out_var = out_file.variables[out_name]
                    self._copy_var('Time-Series Variables', in_var,
                                   out_var, chunks=rchunks, offsets=offsets)

                # Increment the time-series index offset by the length of the
                # unlimited dimension in the input file just processed
                offsets[self._unlimited_dim] += in_file.dimensions[self._unlimited_dim]

                # Close the input file
                self._timer.start('Close Input Files')
                in_file.close()
                self._timer.stop('Close Input Files')

            # Close the output file and move it to its final name
            self._timer.start('Close Output Files')
            out_file.close()
            rename(temp_filename, out_filename)
            self._timer.stop('Close Output Files')

            # Output message to user
            dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
            if out_name == 'once':
                dbg_msg = 'Closed "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

        # Close the metadata file, if necessary
        if metafile:
            metafile.close()

        # Information
        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint(
                '...Finished converting time-slices to time-series.', verbosity=0)

        # Finish clocking the entire convert procedure
        self._timer.stop('Complete Conversion Process')

    def print_diagnostics(self):
        """
        Print out timing and I/O information collected up to this point

        The timing data and the memory use are reduced with 'max' over the
        communicator, and the byte counts are reduced with 'sum'.  The three
        resulting tables are formatted on every rank, but they are printed by
        the manager rank only.
        """

        def report(table_str):
            # Only the manager rank prints
            if self._simplecomm.is_manager():
                self._vprint(table_str, verbosity=-1)

        # Reduce the timings, the memory use and the byte counts
        peak_times = self._simplecomm.allreduce(
            self._timer.get_all_times(), op='max')
        local_memory = {'Maximum Memory Use': _get_memory_usage_MB_()}
        peak_memory = self._simplecomm.allreduce(local_memory, op='max')
        summed_bytes = self._simplecomm.allreduce(self._byte_counts, op='sum')

        # Synchronize
        self._simplecomm.sync()

        # Timing maxima, listed in the order of the timer names
        report(_pprint_dictionary('TIMING DATA', peak_times,
                                  order=self._timer.get_names()))

        # Byte count totals, converted from bytes to MB in place
        megabyte = float(1024 * 1024)
        for label in summed_bytes:
            summed_bytes[label] = summed_bytes[label] / megabyte
        report(_pprint_dictionary('BYTE COUNTS (MB)', summed_bytes))

        # Maximum memory use in MB
        report(_pprint_dictionary('MEMORY USAGE (MB)', peak_memory))
Example #4
0
    def __init__(self,
                 specifier,
                 serial=False,
                 verbosity=1,
                 wmode='w',
                 once=False,
                 simplecomm=None):
        """
        Constructor

        Checks the argument types, sets up the timer, the communicator and the
        verbose printer, validates the Specifier, and stores the file names and
        the PyNIO options derived from it.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            wmode (str): The mode to use for writing output.  Can be 'w' for
                normal write operation, 's' to skip the output generation for
                existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series files.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary.  If None, one is created from
                the serial flag.

        Raises:
            TypeError: If any of the arguments has the wrong type
            ValueError: If wmode is not one of 'w', 's', 'o' or 'a'
        """

        # Type checking (or double-checking), in the order of the arguments
        if not isinstance(specifier, Specifier):
            raise TypeError("Input must be given in the form of a Specifier object")
        exact_type_checks = (
            (serial, bool, "Serial indicator must be True or False."),
            (verbosity, int, "Verbosity level must be an integer."),
            (wmode, str, "Write mode flag must be a str."),
            (once, bool, "Once-file indicator must be True or False."),
        )
        for value, expected, message in exact_type_checks:
            # Exact type match: subclasses are rejected on purpose
            if type(value) is not expected:
                raise TypeError(message)
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            raise TypeError("Simple communicator object is not a SimpleComm")
        if wmode not in ('w', 's', 'o', 'a'):
            raise ValueError("Write mode '{}' not recognized".format(wmode))

        # Once-file flag and output write mode
        self._use_once_file = once
        self._write_mode = wmode

        # Internal timer data
        self._timer = TimeKeeper()

        # Assumed I/O block size and the read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Create a communicator if none was given, and keep a reference to it
        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Verbose printer tool, with a '[rank/size] ' print header
        rank = str(self._simplecomm.get_rank())
        size = str(self._simplecomm.get_size())
        header = '[' + rank + '/' + size + '] '
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper...', verbosity=0)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('  Specifier validated', verbosity=1)

        # Input file names, metadata names, and output file prefix and suffix
        self._input_filenames = specifier.input_file_list
        self._metadata_names = specifier.time_variant_metadata
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # PyNIO options (with the default PreFill option disabled); the Format
        # and CompressionLevel options follow from the NetCDF format string in
        # the Specifier and are left untouched for any other format string
        nio_opts = nio_options()
        nio_opts.PreFill = False
        ncfmt = specifier.netcdf_format
        if ncfmt == 'netcdf':
            nio_opts.Format = 'Classic'
        elif ncfmt in ('netcdf4', 'netcdf4c'):
            nio_opts.Format = 'NetCDF4Classic'
            nio_opts.CompressionLevel = specifier.compression_level
        self._nio_options = nio_opts
        if self._simplecomm.is_manager():
            self._vprint('  PyNIO options set', verbosity=1)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=0)

        # Sync before continuing..
        self._simplecomm.sync()
Example #5
0
class Slice2SeriesReshaper(Reshaper):
    """
    The time-slice to time-series Reshaper class

    This is the class that defines how the time-slice to time-series 
    reshaping operation is to be performed.
    """
    def __init__(self,
                 specifier,
                 serial=False,
                 verbosity=1,
                 skip_existing=False,
                 overwrite=False,
                 once=False,
                 simplecomm=None):
        """
        Constructor

        Checks the argument types, sets up the timer, the communicator and the
        verbose printer, validates the Specifier, opens all of the input files,
        and then validates and sorts the input files, sorts the variables, and
        validates the output files.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.

        Keyword Arguments:
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            skip_existing (bool): Flag specifying whether to skip the generation
                of time-series for variables with time-series files that already
                exist.  Default is False.
            overwrite (bool): Flag specifying whether to forcefully overwrite
                output files if they already exist.  Default is False.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary.  If None, one is created from
                the serial flag.

        Raises:
            TypeError: If specifier, serial, verbosity, skip_existing, once or
                simplecomm has the wrong type
        """

        # Type checking (or double-checking)
        # NOTE(review): 'overwrite' is not type-checked here, unlike
        # 'skip_existing'; it is passed on to _validate_output_files as given
        if not isinstance(specifier, Specifier):
            err_msg = "Input must be given in the form of a Specifier object"
            raise TypeError(err_msg)
        if type(serial) is not bool:
            err_msg = "Serial indicator must be True or False."
            raise TypeError(err_msg)
        if type(verbosity) is not int:
            err_msg = "Verbosity level must be an integer."
            raise TypeError(err_msg)
        if type(skip_existing) is not bool:
            err_msg = "Skip_existing flag must be True or False."
            raise TypeError(err_msg)
        if type(once) is not bool:
            err_msg = "Once-file indicator must be True or False."
            raise TypeError(err_msg)
        if simplecomm is not None:
            if not (isinstance(simplecomm, SimpleComm) or
                    isinstance(simplecomm, SimpleCommMPI)):
                # Adjacent literals are concatenated into one message string
                # (a comma between them would build a tuple instead)
                err_msg = ("Simple communicator object is not a SimpleComm or "
                           "SimpleCommMPI")
                raise TypeError(err_msg)

        # Whether to write a once file
        self._use_once_file = once

        # Internal timer data
        self._timer = TimeKeeper()

        # Dictionary storing read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)
        # Reference to the simple communicator
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Construct the print header, of the form '[rank/size] '
        header = ''.join([
            '[',
            str(self._simplecomm.get_rank()), '/',
            str(self._simplecomm.get_size()), '] '
        ])

        # Reference to the verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper', verbosity=1)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('Specifier validated', verbosity=1)

        # Setup PyNIO options (including disabling the default PreFill option)
        opt = Nio.options()
        opt.PreFill = False

        # Determine the Format and CompressionLevel options
        # from the NetCDF format string in the Specifier
        # (any other format string leaves these options untouched)
        if specifier.netcdf_format == 'netcdf':
            opt.Format = 'Classic'
        elif specifier.netcdf_format == 'netcdf4':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = 0
        elif specifier.netcdf_format == 'netcdf4c':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = specifier.netcdf_deflate
            if self._simplecomm.is_manager():
                self._vprint('PyNIO compression level: {0}'.format(
                    specifier.netcdf_deflate), verbosity=2)

        self._nio_options = opt
        if self._simplecomm.is_manager():
            self._vprint('PyNIO options set', verbosity=2)

        # Open all of the input files (read-only); they are kept open
        # in self._input_files
        self._timer.start('Open Input Files')
        self._input_files = []
        for filename in specifier.input_file_list:
            self._input_files.append(Nio.open_file(filename, "r"))
        self._timer.stop('Open Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files opened', verbosity=2)

        # Validate the input files themselves
        self._timer.start('Input File Validation')
        self._validate_input_files(specifier)
        self._timer.stop('Input File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Input files validated', verbosity=2)

        # Sort the input files by time
        self._timer.start('Sort Input Files')
        self._sort_input_files_by_time(specifier)
        self._timer.stop('Sort Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files sorted', verbosity=2)

        # Retrieve and sort the variables in each time-slice file
        # (To determine if it is time-invariant metadata, time-variant
        # metadata, or if it is a time-series variable)
        self._timer.start('Sort Variables')
        self._sort_variables(specifier)
        self._timer.stop('Sort Variables')
        if self._simplecomm.is_manager():
            self._vprint('Variables sorted', verbosity=2)

        # Validate the output files
        self._timer.start('Output File Validation')
        self._validate_output_files(specifier, skip_existing, overwrite)
        self._timer.stop('Output File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Output files validated', verbosity=2)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=1)

        # Sync before continuing..
        self._simplecomm.sync()

    def _validate_input_files(self, specifier):
        """
        Perform validation of input data files themselves.

        We check the file contents here, assuming that the files are already
        open and stored in self._input_files.  The name of the unlimited
        dimension found in the first input file is saved in
        self._unlimited_dim, and every input file is then checked against it.

        Parameters:
            specifier (Specifier): The reshaper specifier object.  Its
                input_file_list is only used to name the offending file in
                error messages.

        Raises:
            LookupError: If the first file has no unlimited dimension, or if
                any file lacks that dimension, declares it as a limited
                dimension, or has no variable with the same name.
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Validating input files', verbosity=1)

        # In the first file, look for the 'unlimited' dimension
        first_file = self._input_files[0]
        self._unlimited_dim = None
        for dim in first_file.dimensions:
            if first_file.unlimited(dim):
                self._unlimited_dim = dim
                break  # There can only be 1!
        if self._unlimited_dim is None:
            raise LookupError('Unlimited dimension not identified.')

        # Make a pass through each file and:
        # (1) Make sure it has the 'unlimited' dimension
        # (2) Make sure this dimension is truly 'unlimited'
        # (3) Check that this dimension has a corresponding variable
        for i, ifile in enumerate(self._input_files):
            if self._unlimited_dim not in ifile.dimensions:
                err_msg = 'Unlimited dimension not found in file ({0})'
            elif not ifile.unlimited(self._unlimited_dim):
                err_msg = 'Unlimited dimension not unlimited in file ({0})'
            elif self._unlimited_dim not in ifile.variables:
                err_msg = 'Unlimited dimension variable not found in file ({0})'
            else:
                continue
            # The filename is only looked up when there is something to report
            raise LookupError(err_msg.format(specifier.input_file_list[i]))

        # Make sure that the list of variables in each file is the same
        # (only variables of the first file that are absent elsewhere are
        # reported; extra variables in later files are not checked)
        var_names = set(self._input_files[0].variables.keys())
        missing_vars = set()
        for ifile in self._input_files[1:]:
            missing_vars.update(var_names - set(ifile.variables.keys()))
        if missing_vars:
            # Sort the names so that the warning text is deterministic
            warning = "WARNING: The first input file has variables that are " \
                + "not in all input files:" + os.linesep + '   '
            warning += " ".join(sorted(missing_vars))
            self._vprint(warning, header=True, verbosity=1)

    def _sort_input_files_by_time(self, specifier):
        """
        Put the open input files into chronological order.

        The files are ranked by the first value of their unlimited-dimension
        variable.  The re-ordered file objects are stored in
        self._input_files and the matching file names in
        self._input_filenames.  Once ordered, the last time in each file must
        be strictly less than the first time in the following file; when that
        holds, the times of all files are concatenated into the numpy array
        self._all_time_values.

        Only the raw time values are compared.  This is valid as long as all
        input files share the same 'time:units' attribute, so that every
        value is measured from the same reference date-time.  If that is not
        the case, the units of each file would have to be taken into account
        (e.g., with UDUNITS), which is not done here.

        Parameters:
            specifier (Specifier): The reshaper specifier object

        Raises:
            ValueError: If the time ranges of two consecutive files overlap
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Sorting input files', verbosity=1)

        # One array of time values per input file, in the current file order
        times_per_file = [
            infile.variables[self._unlimited_dim].get_value()
            for infile in self._input_files]

        # Rank the file indices by the first time value found in each file
        nfiles = len(self._input_files)
        ranking = sorted(range(nfiles),
                         key=lambda idx: times_per_file[idx][0])

        # Apply the ranking to the files, their names and their times
        sorted_files = [self._input_files[idx] for idx in ranking]
        sorted_names = [specifier.input_file_list[idx] for idx in ranking]
        sorted_times = [times_per_file[idx] for idx in ranking]

        # Keep the re-ordered files and names
        self._input_files = sorted_files
        self._input_filenames = sorted_names

        # Consecutive files must not overlap in time: the final time of one
        # file has to come before the initial time of the next one
        for pos in range(nfiles - 1):
            if sorted_times[pos][-1] >= sorted_times[pos + 1][0]:
                err_msg = 'Times in input files {0} and {1} appear to overlap'
                raise ValueError(err_msg.format(sorted_names[pos],
                                                sorted_names[pos + 1]))

        # Everything is in order, so chain the per-file times together
        all_times = itertools.chain.from_iterable(sorted_times)
        self._all_time_values = numpy.fromiter(all_times, dtype='float')

    def _sort_variables(self, specifier):
        """
        Classify the variables found in the first input file.

        Each variable ends up in exactly one of three dictionaries, which map
        the variable name to its size in bytes (item size times the product
        of its shape):

            self._time_invariant_metadata: variables that do not have the
                unlimited dimension
            self._time_variant_metadata: variables with the unlimited
                dimension that are named in Specifier.time_variant_metadata
            self._time_series_variables: all remaining variables with the
                unlimited dimension

        When a 'once' file is requested, an extra entry named 'once' (with
        size 1) is added to the time-series variables.

        Parameters:
            specifier (Specifier): The reshaper specifier object
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Sorting variables', verbosity=1)

        # One dictionary per category (variable name -> size in bytes)
        variant_meta = {}
        invariant_meta = {}
        series_vars = {}
        self._time_variant_metadata = variant_meta
        self._time_invariant_metadata = invariant_meta
        self._time_series_variables = series_vars

        # Only the first input file is inspected
        first_file_vars = self._input_files[0].variables
        for var_name in first_file_vars.keys():
            var = first_file_vars[var_name]
            nbytes = numpy.dtype(var.typecode()).itemsize \
                * numpy.prod(var.shape)

            # Pick the category this variable belongs to
            if self._unlimited_dim not in var.dimensions:
                category = invariant_meta
            elif var_name in specifier.time_variant_metadata:
                category = variant_meta
            else:
                category = series_vars
            category[var_name] = nbytes

        # Debug output
        if self._simplecomm.is_manager():
            report = (('Time-Invariant Metadata: ', invariant_meta),
                      ('Time-Variant Metadata: ', variant_meta),
                      ('Time-Series Variables: ', series_vars))
            for label, category in report:
                self._vprint(label + str(category.keys()), verbosity=2)

        # NOTE: 'once' is not a real variable.  It is a placeholder entry
        #       that stands for the file receiving all of the metadata,
        #       and it is only added when the 'once' file is enabled.
        if self._use_once_file:
            self._time_series_variables['once'] = 1

    def _validate_output_files(self,
                               specifier,
                               skip_existing=False,
                               overwrite=False):
        """
        Perform validation of output data files themselves.

        We compute the output file name of every time-series variable from
        the prefix and suffix (stored in self._time_series_filenames), and
        then we check whether the output files exist.  By default, if any
        output file already exists, an exception is raised.  This can be
        changed with the keyword arguments below; if both are set, overwrite
        takes precedence over skip_existing.

        Parameters:
            specifier (Specifier): The reshaper specifier object

        Keyword Arguments:
            skip_existing (bool): Flag specifying whether to skip the generation
                of time-series for variables with time-series files that already
                exist.  Default is False.
            overwrite (bool): Flag specifying whether to forcefully overwrite
                output files if they already exist.  Default is False.

        Raises:
            RuntimeError: If output files already exist and neither
                skip_existing nor overwrite is set
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Validating output files', verbosity=1)

        # Loop through the time-series variables and generate output filenames
        prefix = specifier.output_file_prefix
        suffix = specifier.output_file_suffix
        self._time_series_filenames = dict(
            (variable, prefix + variable + suffix)
            for variable in self._time_series_variables)

        # Find which files already exist
        existing = [
            variable
            for variable, filename in self._time_series_filenames.items()
            if os.path.isfile(filename)]

        # If overwrite is enabled, delete all existing files first
        if overwrite:
            if self._simplecomm.is_manager():
                self._vprint('WARNING: Deleting existing output files for '
                             'time-series variables: {0}'.format(existing),
                             verbosity=1)
            for variable in existing:
                filename = self._time_series_filenames[variable]
                try:
                    os.remove(filename)
                except OSError:
                    # This loop is not restricted to the manager rank, so in
                    # a parallel run another rank may have removed the file
                    # between our existence check and this call.  Only fail
                    # if the file is really still there.
                    if os.path.isfile(filename):
                        raise

        # Or, if skip_existing is set, remove the existing time-series
        # variables from the list of time-series variables to convert
        elif skip_existing:
            if self._simplecomm.is_manager():
                self._vprint('WARNING: Skipping time-series variables with '
                             'existing output files: {0}'.format(existing),
                             verbosity=1)
            for variable in existing:
                self._time_series_variables.pop(variable)

        # Otherwise, throw an exception if any existing output files are found
        elif existing:
            err_msg = ("Found existing output files for time-series "
                       "variables: {0}").format(existing)
            raise RuntimeError(err_msg)

    def convert(self, output_limit=0):
        """
        Method to perform the Reshaper's designated operation.

        In this case, convert a list of time-slice files to time-series files.
        The time-series variables are partitioned across the communicator.
        For its share of the variables, each rank first creates every output
        file (dimensions, attributes and variable definitions, all taken from
        the first input file), and then writes and closes the output files
        one at a time, copying the data one time step at a time from each
        input file.

        Timings are accumulated in self._timer and the amount of data read
        from the input files is accumulated in self._byte_counts.

        Keyword Arguments:
            output_limit (int): Limit on the number of output (time-series)
                files to write during the convert() operation.  If set
                to 0, no limit is placed.  This limits the number
                of output files produced by each processor in a
                parallel run.

        Raises:
            TypeError: If output_limit is not an int
            OSError: If an output file that is about to be created already
                exists
        """
        # Type checking input
        if type(output_limit) is not int:
            err_msg = 'Output limit must be an integer'
            raise TypeError(err_msg)

        # Start the total convert process timer
        self._simplecomm.sync()
        self._timer.start('Complete Conversion Process')

        # Debugging output
        if self._simplecomm.is_manager():
            self._vprint('Converting time-slices to time-series', verbosity=1)

        # For data common to all input files, we reference only the first
        ref_infile = self._input_files[0]

        # Store the common dimensions and attributes for each file
        # (taken from the first input file in the list)
        common_dims = ref_infile.dimensions
        common_atts = ref_infile.attributes

        # Partition the time-series variables across all processors
        tsv_names_loc = self._simplecomm.partition(
            self._time_series_variables.items(),
            func=WeightBalanced(),
            involved=True)
        if output_limit > 0:
            tsv_names_loc = tsv_names_loc[0:output_limit]

        # Print partitions for all ranks
        dbg_msg = 'Local time-series variables are {0}'.format(tsv_names_loc)
        self._vprint(dbg_msg, header=True, verbosity=2)

        # Reset all of the timer values (as it is possible that there are no
        # time-series variables in the local list produced above)
        self._timer.reset('Open Output Files')
        self._timer.reset('Create Time-Invariant Metadata')
        self._timer.reset('Create Time-Variant Metadata')
        self._timer.reset('Create Time-Series Variables')
        self._timer.reset('Write Time-Invariant Metadata')
        self._timer.reset('Write Time-Variant Metadata')
        self._timer.reset('Write Time-Series Variables')
        self._timer.reset('Close Output Files')

        # Initialize the byte count dictionary
        self._byte_counts['Requested Data'] = 0
        self._byte_counts['Actual Data'] = 0

        # Defining a simple helper function to determine whether to
        # write time-series data and/or write metadata.  This is useful
        # for adding the ability to write a "once" file
        def _get_once_info(vname):
            is_once_file = (vname == 'once')
            write_meta = True
            write_tser = True
            if self._use_once_file:
                write_meta = is_once_file
                write_tser = not is_once_file
            return is_once_file, write_meta, write_tser

        # Helper that copies a single time step of one variable from an input
        # file to an output file, and accounts for the data that was read
        def _copy_time_step(in_var, out_var, in_step, out_step):
            # Select everything along all dimensions, except for a single
            # index along the unlimited dimension
            ndims = len(in_var.dimensions)
            udidx = in_var.dimensions.index(self._unlimited_dim)
            in_slice = [slice(None)] * ndims
            in_slice[udidx] = in_step
            out_slice = [slice(None)] * ndims
            out_slice[udidx] = out_step

            # Read the step only once, and count the bytes of what was
            # actually read (instead of reading the whole variable a second
            # time just to obtain its size)
            step_data = in_var[tuple(in_slice)]
            out_var[tuple(out_slice)] = step_data

            # asarray() makes sure a scalar result also has an nbytes value
            requested_nbytes = numpy.asarray(step_data).nbytes
            self._byte_counts['Requested Data'] += requested_nbytes
            # Round the request up to a whole number of assumed blocks
            actual_nbytes = self.assumed_block_size \
                * numpy.ceil(requested_nbytes / self.assumed_block_size)
            self._byte_counts['Actual Data'] += actual_nbytes

        # NOTE: In the prototype, we check for the existence of the output
        # directory at this point.  If it does not exist, we create it (but
        # only from the master rank).  This requires synchronization with
        # the decomp utility.  Instead, we assume the output directory
        # already exists (and is checked by the Specifier's validation).  No
        # synchronization is needed.

        # For each time-series variable, create the corresponding output file
        # (Also defines the header info for each output file)
        out_files = {}
        for out_name in tsv_names_loc:
            is_once_file, write_meta, write_tser = _get_once_info(out_name)

            # Determine the output file name for this variable
            out_filename = self._time_series_filenames[out_name]
            dbg_msg = 'Creating output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Creating "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Open each output file and create the dimensions and attributes
            # NOTE: If the output file already exists, abort!
            self._timer.start('Open Output Files')
            if os.path.exists(out_filename):
                err_msg = 'Found existing output file: {0}'.format(
                    out_filename)
                raise OSError(err_msg)
            out_file = Nio.open_file(out_filename,
                                     'w',
                                     options=self._nio_options)
            # items() (rather than iteritems()) works on Python 2 and 3
            for att_name, att_val in common_atts.items():
                setattr(out_file, att_name, att_val)
            for dim_name, dim_val in common_dims.items():
                if dim_name == self._unlimited_dim:
                    out_file.create_dimension(dim_name, None)
                else:
                    out_file.create_dimension(dim_name, dim_val)
            self._timer.stop('Open Output Files')

            # Create the time-invariant metadata variables
            if write_meta:
                self._timer.start('Create Time-Invariant Metadata')
                for name in self._time_invariant_metadata:
                    in_var = ref_infile.variables[name]
                    out_var = out_file.create_variable(name, in_var.typecode(),
                                                       in_var.dimensions)
                    for att_name, att_val in in_var.attributes.items():
                        setattr(out_var, att_name, att_val)
                self._timer.stop('Create Time-Invariant Metadata')

            # Create the time-variant metadata variables
            if write_meta:
                self._timer.start('Create Time-Variant Metadata')
                for name in self._time_variant_metadata:
                    in_var = ref_infile.variables[name]
                    out_meta = out_file.create_variable(
                        name, in_var.typecode(), in_var.dimensions)
                    for att_name, att_val in in_var.attributes.items():
                        setattr(out_meta, att_name, att_val)
                self._timer.stop('Create Time-Variant Metadata')

            # Create the time-series variable itself
            # (its attributes are set later, just before it is written)
            if write_tser:
                self._timer.start('Create Time-Series Variables')
                in_var = ref_infile.variables[out_name]
                out_var = out_file.create_variable(out_name, in_var.typecode(),
                                                   in_var.dimensions)
                self._timer.stop('Create Time-Series Variables')

            # Remember the output file, keyed by its variable name
            out_files[out_name] = out_file

        # Now that each output file has been created, start writing the data
        for out_name, out_file in out_files.items():
            is_once_file, write_meta, write_tser = _get_once_info(out_name)

            dbg_msg = 'Writing output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Writing "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Create the attributes of the time-series variable
            if write_tser:
                in_var = ref_infile.variables[out_name]
                out_var = out_file.variables[out_name]
                for att_name, att_val in in_var.attributes.items():
                    setattr(out_var, att_name, att_val)

            # Write the time-invariant metadata
            if write_meta:
                self._timer.start('Write Time-Invariant Metadata')
                for name in self._time_invariant_metadata:
                    in_meta = ref_infile.variables[name]
                    out_meta = out_file.variables[name]
                    if in_meta.rank > 0:
                        out_meta[:] = in_meta[:]
                    else:
                        # Scalar variables cannot be sliced
                        out_meta.assign_value(in_meta.get_value())
                self._timer.stop('Write Time-Invariant Metadata')

            # Write each time-variant variable
            # (series_step_index is the position along the unlimited
            # dimension of the output file, running across all input files)
            series_step_index = 0
            for in_file in self._input_files:

                # Get the number of time steps in this slice file
                num_steps = in_file.dimensions[self._unlimited_dim]

                # Loop over the time steps in this slice file
                for slice_step_index in range(num_steps):

                    # Write the time-variant metadata
                    if write_meta:
                        self._timer.start('Write Time-Variant Metadata')
                        for name in self._time_variant_metadata:
                            _copy_time_step(in_file.variables[name],
                                            out_file.variables[name],
                                            slice_step_index,
                                            series_step_index)
                        self._timer.stop('Write Time-Variant Metadata')

                    # Write the time-series variables
                    if write_tser:
                        self._timer.start('Write Time-Series Variables')
                        _copy_time_step(in_file.variables[out_name],
                                        out_var,
                                        slice_step_index,
                                        series_step_index)
                        self._timer.stop('Write Time-Series Variables')

                    # Increment the time-series step index
                    series_step_index += 1

            # Close the output file
            self._timer.start('Close Output Files')
            out_file.close()
            self._timer.stop('Close Output Files')
            dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Closed "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

        # Information
        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint('Finished converting time-slices to time-series.',
                         verbosity=1)

        # Finish clocking the entire convert procedure
        self._timer.stop('Complete Conversion Process')

    def print_diagnostics(self):
        """
        Print out timing and I/O information collected up to this point

        The timer values are reduced across the communicator with the 'max'
        operation, and the byte counts with the 'sum' operation.  Both tables
        are then printed by the manager rank only.  The byte counts are
        reported in MB (1 MB = 1024 * 1024 bytes).

        All ranks must call this method, since it performs reductions and a
        synchronization on the communicator.
        """

        # Get all totals and maxima
        my_times = self._timer.get_all_times()
        max_times = self._simplecomm.allreduce(my_times, op='max')
        my_bytes = self._byte_counts
        total_bytes = self._simplecomm.allreduce(my_bytes, op='sum')

        # Synchronize
        self._simplecomm.sync()

        # Print timing maxima
        o = self._timer.get_names()
        time_table_str = _pprint_dictionary('TIMING DATA', max_times, order=o)
        if self._simplecomm.is_manager():
            self._vprint(time_table_str, verbosity=0)

        # Convert byte count to MB
        # NOTE(review): the converted values go into a new dictionary rather
        # than being written back into total_bytes.  If allreduce can return
        # its input object (e.g., in serial runs -- to be confirmed), an
        # in-place conversion would silently rescale self._byte_counts.
        bytes_per_mb = float(1024 * 1024)
        total_mbytes = dict((name, total_bytes[name] / bytes_per_mb)
                            for name in total_bytes)

        # Print byte count totals
        byte_count_str = _pprint_dictionary('BYTE COUNTS (MB)', total_mbytes)
        if self._simplecomm.is_manager():
            self._vprint(byte_count_str, verbosity=0)
Example #6
0
    def __init__(self, specifier, serial=False, verbosity=1, wmode='w',
                 once=False, simplecomm=None):
        """
        Constructor

        Checks the arguments, sets up the timer, the communicator and the
        verbose printer, validates the Specifier, and stores the settings
        taken from it (file names, metadata names, PyNIO options).

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel (False).
                Only used to create a communicator when simplecomm is not
                given.
            verbosity (int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            wmode (str): The mode to use for writing output.  Can be 'w' for
                normal write operation, 's' to skip the output generation for
                existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series files.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary.  When None, a new communicator
                is created.

        Raises:
            TypeError: If an argument does not have the expected type
            ValueError: If wmode is not one of 'w', 's', 'o' or 'a'
        """

        # Type checking (or double-checking), in the order of the arguments
        if not isinstance(specifier, Specifier):
            raise TypeError(
                "Input must be given in the form of a Specifier object")
        if type(serial) is not bool:
            raise TypeError("Serial indicator must be True or False.")
        if type(verbosity) is not int:
            raise TypeError("Verbosity level must be an integer.")
        if type(wmode) is not str:
            raise TypeError("Write mode flag must be a str.")
        if type(once) is not bool:
            raise TypeError("Once-file indicator must be True or False.")
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            raise TypeError("Simple communicator object is not a SimpleComm")
        if wmode not in ('w', 's', 'o', 'a'):
            raise ValueError("Write mode '{}' not recognized".format(wmode))

        # Settings taken directly from the arguments
        self._use_once_file = once
        self._write_mode = wmode

        # Internal timer data
        self._timer = TimeKeeper()

        # Assumed I/O block size (4 MB, in bytes) and the dictionary storing
        # read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Use the given communicator, or create one if none was given
        self._timer.start('Initializing Simple Communicator')
        if simplecomm is not None:
            self._simplecomm = simplecomm
        else:
            self._simplecomm = create_comm(serial=serial)
        self._timer.stop('Initializing Simple Communicator')

        # Verbose printer tool, with a "[rank/size] " print header
        rank = str(self._simplecomm.get_rank())
        size = str(self._simplecomm.get_size())
        self._vprint = VPrinter(header='[' + rank + '/' + size + '] ',
                                verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper...', verbosity=0)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('  Specifier validated', verbosity=1)

        # Settings taken from the (validated) Specifier: the input file
        # names, the time-variant metadata names, and the output file
        # prefix and suffix
        self._input_filenames = specifier.input_file_list
        self._metadata_names = specifier.time_variant_metadata
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # PyNIO options (with the default PreFill option disabled).  The
        # Format and CompressionLevel options follow from the NetCDF format
        # string in the Specifier
        nio_opts = nio_options()
        nio_opts.PreFill = False
        if specifier.netcdf_format == 'netcdf':
            nio_opts.Format = 'Classic'
        elif specifier.netcdf_format in ('netcdf4', 'netcdf4c'):
            nio_opts.Format = 'NetCDF4Classic'
            nio_opts.CompressionLevel = specifier.compression_level
        self._nio_options = nio_opts
        if self._simplecomm.is_manager():
            self._vprint('  PyNIO options set', verbosity=1)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=0)

        # Sync before continuing..
        self._simplecomm.sync()
Example #7
0
class Slice2SeriesReshaper(Reshaper):
    """
    The time-slice to time-series Reshaper class

    This is the class that defines how the time-slice to time-series
    reshaping operation is to be performed.
    """
    def __init__(self, specifier, serial=False, verbosity=1, wmode='w',
                 once=False, simplecomm=None):
        """
        Build a time-slice to time-series Reshaper.

        The arguments are checked, the communicator and the verbose printer
        are set up, the Specifier is validated, and the settings needed
        later on are copied out of the Specifier.

        Parameters:
            specifier (Specifier): The input specification for this
                reshaper operation.  Its validate() method is called here.
            serial (bool): Forwarded to create_comm() when no communicator
                is supplied; True requests serial operation and False
                (the default) requests parallel operation.
            verbosity (int): Verbosity level given to the VPrinter that
                handles all printed output.  The default value is 1.
            wmode (str): The mode to use for writing output.  Must be 'w'
                (normal write), 's' (skip variables whose time-series file
                already exists), 'o' (overwrite existing time-series
                files) or 'a' (append to existing time-series files).
            once (bool): Whether all metadata should be written to a
                separate 'once' file.
            simplecomm (SimpleComm): An existing SimpleComm object to use
                for parallel communication.  If None, one is created.

        Raises:
            TypeError: If an argument does not have the required type.
            ValueError: If wmode is not one of the recognized modes.
        """

        # ----- Argument checks (the first failing check raises) -----

        if not isinstance(specifier, Specifier):
            raise TypeError(
                "Input must be given in the form of a Specifier object")

        # These arguments must have exactly the listed type, so the types
        # are compared directly (a bool is not accepted in place of an int)
        exact_types = (
            (serial, bool, "Serial indicator must be True or False."),
            (verbosity, int, "Verbosity level must be an integer."),
            (wmode, str, "Write mode flag must be a str."),
            (once, bool, "Once-file indicator must be True or False."),
        )
        for value, required, complaint in exact_types:
            if type(value) is not required:
                raise TypeError(complaint)

        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            raise TypeError("Simple communicator object is not a SimpleComm")

        if wmode not in ('w', 's', 'o', 'a'):
            raise ValueError("Write mode '{}' not recognized".format(wmode))

        # ----- Plain settings and bookkeeping -----

        # Once-file switch and output write mode
        self._use_once_file = once
        self._write_mode = wmode

        # Timer plus the read/write byte counters (with the block size
        # assumed when estimating the amount of data actually moved)
        self._timer = TimeKeeper()
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # ----- Communicator -----

        self._timer.start('Initializing Simple Communicator')
        self._simplecomm = (create_comm(serial=serial)
                            if simplecomm is None else simplecomm)
        self._timer.stop('Initializing Simple Communicator')
        comm = self._simplecomm

        # ----- Verbose printer, with a "[rank/size] " header -----

        rank_str = str(comm.get_rank())
        size_str = str(comm.get_size())
        self._vprint = VPrinter(header='[' + rank_str + '/' + size_str + '] ',
                                verbosity=verbosity)

        if comm.is_manager():
            self._vprint('Initializing Reshaper...', verbosity=0)

        # ----- Specifier validation -----

        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if comm.is_manager():
            self._vprint('  Specifier validated', verbosity=1)

        # ----- Settings copied from the Specifier -----

        # Input file names, names of the metadata variables, and the two
        # pieces that surround the variable name in an output file name
        self._input_filenames = specifier.input_file_list
        self._metadata_names = specifier.time_variant_metadata
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # ----- PyNIO options -----

        # The default PreFill option is switched off
        nio_opts = nio_options()
        nio_opts.PreFill = False

        # Translate the Specifier's NetCDF format string into the Format
        # (and, for NetCDF4, the CompressionLevel) option; any other format
        # string leaves these options untouched
        netcdf_format = specifier.netcdf_format
        if netcdf_format == 'netcdf':
            nio_opts.Format = 'Classic'
        elif netcdf_format in ('netcdf4', 'netcdf4c'):
            nio_opts.Format = 'NetCDF4Classic'
            nio_opts.CompressionLevel = specifier.compression_level
        self._nio_options = nio_opts
        if comm.is_manager():
            self._vprint('  PyNIO options set', verbosity=1)

        # ----- Done -----

        if comm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=0)

        # All ranks wait here before continuing
        comm.sync()

    def _inspect_input_files(self):
        """
        Inspect the input (time-slice) files and plan the conversion.

        The first input file is used to find the unlimited dimension and to
        sort every variable into one of three categories: time-invariant
        metadata (no unlimited dimension), time-variant metadata (named in
        the Specifier's metadata list), or time-series variable (everything
        else).  The remaining files are checked for the same unlimited
        dimension and a variable of the same name, the list of input files
        is re-ordered by the first time value in each file, and the
        time-series variables are partitioned across the communicator.

        Every input file opened here is closed again before this method
        returns or raises.

        Raises:
            LookupError: If no unlimited dimension is found in the first
                file, or if another file lacks that dimension, does not
                have it as unlimited, or has no variable of that name.
            ValueError: If the time values of two input files appear to
                overlap.
        """

        # Initialize the list of variable names for each category
        self._time_variant_metadata = []
        self._time_invariant_metadata = []

        # Local dictionary of time-series variable name -> size (item size
        # times number of elements, as found in the first file)
        all_tsvars = {}

        #===== INSPECT FIRST INPUT FILE =====

        # Open first file
        ifile = nio_open_file(self._input_filenames[0])
        try:
            # Look for the 'unlimited' dimension
            try:
                self._unlimited_dim = next(dim for dim in ifile.dimensions
                                           if ifile.unlimited(dim))
            except StopIteration:
                err_msg = 'Unlimited dimension not found.'
                raise LookupError(err_msg)

            # Get the time values
            time_values = [ifile.variables[self._unlimited_dim].get_value()]

            # Names of the variables in the first file, kept for the
            # comparison with the remaining files
            var_names = set(ifile.variables.keys())

            # Categorize each variable (only looking at first file)
            for var_name, var in ifile.variables.items():
                if self._unlimited_dim not in var.dimensions:
                    self._time_invariant_metadata.append(var_name)
                elif var_name in self._metadata_names:
                    self._time_variant_metadata.append(var_name)
                else:
                    size = numpy.dtype(var.typecode()).itemsize
                    size = size * numpy.prod(var.shape)
                    all_tsvars[var_name] = size
        finally:
            # Close the first file, even if the inspection failed
            ifile.close()

        if self._simplecomm.is_manager():
            self._vprint('  First input file inspected.', verbosity=2)

        #===== INSPECT REMAINING INPUT FILES =====

        # Variables of the first file that are absent from a later file
        missing_vars = set()

        # Make a pass through remaining files and:
        # (1) Make sure it has the 'unlimited' dimension
        # (2) Make sure this dimension is truly 'unlimited'
        # (3) Check that this dimension has a corresponding variable
        # (4) Check if there are any missing variables
        # (5) Get the time values from the files
        for ifilename in self._input_filenames[1:]:
            ifile = nio_open_file(ifilename)
            try:
                # Determine the unlimited dimension
                if self._unlimited_dim not in ifile.dimensions:
                    err_msg = ('Unlimited dimension not found '
                               'in file "{}"').format(ifilename)
                    raise LookupError(err_msg)
                if not ifile.unlimited(self._unlimited_dim):
                    err_msg = ('Dimension "{}" not unlimited in file '
                               '"{}"').format(self._unlimited_dim, ifilename)
                    raise LookupError(err_msg)
                if self._unlimited_dim not in ifile.variables:
                    err_msg = ('Unlimited dimension variable not found in '
                               'file "{}"').format(ifilename)
                    raise LookupError(err_msg)

                # Get the time values (list of NDArrays)
                time_values.append(
                    ifile.variables[self._unlimited_dim].get_value())

                # Get the missing variables
                var_names_next = set(ifile.variables.keys())
                missing_vars.update(var_names - var_names_next)
            finally:
                # Close the file, even if one of the checks failed
                ifile.close()

        if self._simplecomm.is_manager():
            self._vprint('  Remaining input files inspected.', verbosity=2)

        #===== CHECK FOR MISSING VARIABLES =====

        # Warn (but carry on) if a variable of the first file is not
        # present in every other file.  This message is not restricted to
        # the manager rank.
        if len(missing_vars) != 0:
            warning = ("WARNING: The first input file has variables that are "
                       "not in all input files:{}{}").format(linesep, '   ')
            for var in missing_vars:
                warning += ' {}'.format(var)
            self._vprint(warning, header=True, verbosity=0)

        if self._simplecomm.is_manager():
            self._vprint('  Checked for missing variables.', verbosity=2)

        #===== SORT INPUT FILES BY TIME =====

        # Determine the sort order based on the first time in the time values
        old_order = range(len(self._input_filenames))
        new_order = sorted(old_order, key=lambda i: time_values[i][0])

        # Re-order the list of input filenames and time values
        new_filenames = [self._input_filenames[i] for i in new_order]
        new_values = [time_values[i] for i in new_order]

        # Now, check that the largest time in each file is less than the
        # smallest time in the next file (so that the time spans of each file
        # do not overlap)
        for i in range(1, len(new_values)):
            if new_values[i - 1][-1] >= new_values[i][0]:
                err_msg = ('Times in input files {} and {} appear '
                           'to overlap').format(new_filenames[i - 1],
                                                new_filenames[i])
                raise ValueError(err_msg)

        # Now that this is validated, save the filenames in the new order
        # (the time values themselves are not kept)
        self._input_filenames = new_filenames

        if self._simplecomm.is_manager():
            self._vprint('  Input files sorted by time.', verbosity=2)

        #===== FINALIZING OUTPUT =====

        # Debug output
        if self._simplecomm.is_manager():
            self._vprint('  Time-Invariant Metadata: '
                         '{0}'.format(self._time_invariant_metadata),
                         verbosity=1)
            self._vprint('  Time-Variant Metadata: '
                         '{0}'.format(self._time_variant_metadata),
                         verbosity=1)
            self._vprint('  Time-Series Variables: '
                         '{0}'.format(list(all_tsvars.keys())),
                         verbosity=1)

        # Add 'once' variable if writing to a once file
        # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
        #       a catch for all metadata IFF the 'once-file' is enabled.
        #       It is given the size of the largest time-series variable.
        # NOTE(review): max() raises ValueError when no time-series
        #       variables were found -- confirm whether that can happen.
        if self._use_once_file:
            all_tsvars['once'] = max(all_tsvars.values())

        # Partition the time-series variables across processors, handing
        # the (name, size) pairs to the partitioner as an explicit list
        self._time_series_variables = self._simplecomm.partition(
            list(all_tsvars.items()), func=WeightBalanced(), involved=True)

    def _inspect_output_files(self):
        """
        Perform inspection of the output data files themselves.

        The output file name of each time-series variable in this rank's
        list is computed as prefix + variable name + suffix, and the
        variables whose output file already exists are recorded.  What
        happens to existing output files depends on the write mode:

            'o': the existing files are deleted
            's': the variables with existing files are removed from the
                 list of time-series variables to convert
            'a': each existing file is checked for the unlimited dimension
                 and the expected variables, and the step index at which
                 writing starts is set to the current length of the
                 unlimited dimension in that file
            any other mode: an existing output file is an error

        Raises:
            RuntimeError: If an existing file cannot be appended to in
                append mode, or if existing output files are found in a
                mode that does not allow them.
        """

        # Loop through the time-series variables and generate output filenames
        self._time_series_filenames = {
            variable: self._output_prefix + variable + self._output_suffix
            for variable in self._time_series_variables
        }

        # Find which files already exist
        self._existing = [
            v for (v, f) in self._time_series_filenames.items()
            if isfile(f)
        ]

        # Set the starting step index for each variable
        self._time_series_step_index = {
            variable: 0 for variable in self._time_series_variables
        }

        # If overwrite is enabled, delete all existing files first
        if self._write_mode == 'o':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Deleting existing output files for '
                             'time-series variables: {}'.format(
                                 self._existing),
                             verbosity=0)
            for variable in self._existing:
                remove(self._time_series_filenames[variable])

        # Or, if skip existing is set, remove the existing time-series
        # variables from the list of time-series variables to convert
        elif self._write_mode == 's':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Skipping time-series variables with '
                             'existing output files: {}'.format(
                                 self._existing),
                             verbosity=0)
            for variable in self._existing:
                self._time_series_variables.remove(variable)

        # Or, if appending, check that the existing output files conform
        # to the expected pattern
        elif self._write_mode == 'a':

            # Check each existing time-series file
            for variable in self._existing:

                # Get the matching filename
                filename = self._time_series_filenames[variable]

                # Open the time-series file for inspection
                tsfile = nio_open_file(filename, 'r')
                try:
                    # Check that the file has the unlimited dim and var
                    if not tsfile.unlimited(self._unlimited_dim):
                        err_msg = ("Cannot append to time-series file with "
                                   "missing unlimited dimension "
                                   "'{}'").format(self._unlimited_dim)
                        raise RuntimeError(err_msg)

                    # With a once file, metadata is expected only in the
                    # 'once' file and time-series data only in the others;
                    # without one, every file needs both
                    is_once_file = (variable == 'once')
                    needs_meta_data = not (self._use_once_file
                                           and not is_once_file)
                    needs_tser_data = not (self._use_once_file
                                           and is_once_file)

                    # Look for metadata
                    if needs_meta_data:

                        # Check that the time-variant metadata are all present
                        for metavar in self._time_variant_metadata:
                            if metavar not in tsfile.variables:
                                err_msg = ("Cannot append to time-series file "
                                           "with missing time-variant metadata "
                                           "'{}'").format(metavar)
                                raise RuntimeError(err_msg)

                    # Check that the time-series variable is present
                    if needs_tser_data and variable not in tsfile.variables:
                        err_msg = ("Cannot append to time-series file with "
                                   "missing time-series variable "
                                   "'{}'").format(variable)
                        raise RuntimeError(err_msg)

                    # Get the starting step index to start writing from
                    self._time_series_step_index[variable] = \
                        tsfile.dimensions[self._unlimited_dim]
                finally:
                    # Close the time-series file, even if a check failed
                    tsfile.close()

        # Otherwise, throw an exception if any existing output files are found
        elif len(self._existing) > 0:
            err_msg = ("Found existing output files for time-series "
                       "variables: {}").format(self._existing)
            raise RuntimeError(err_msg)

    def convert(self, output_limit=0):
        """
        Method to perform the Reshaper's designated operation.

        In this case, convert a list of time-slice files to time-series files.
        The input and output files are inspected first, then one output
        file is written for each time-series variable in this rank's list.

        Parameters:
            output_limit (int): Limit on the number of output (time-series)
                files to write during the convert() operation.  If set
                to 0 (or any value that is not positive), no limit is
                placed.  This limits the number of output files produced
                by each processor in a parallel run.

        Raises:
            TypeError: If output_limit is not an int.
        """
        # Type checking input
        if type(output_limit) is not int:
            err_msg = 'Output limit must be an integer'
            raise TypeError(err_msg)

        # Start the total convert process timer
        self._simplecomm.sync()
        self._timer.start('Complete Conversion Process')

        # Validate the input files themselves
        if self._simplecomm.is_manager():
            self._vprint('Inspecting input files...', verbosity=0)
        self._timer.start('Inspect Input Files')
        self._inspect_input_files()
        self._timer.stop('Inspect Input Files')
        if self._simplecomm.is_manager():
            self._vprint('...Input files inspected.', verbosity=0)

        # Validate the output files
        if self._simplecomm.is_manager():
            self._vprint('Inspecting output files...', verbosity=0)
        self._timer.start('Inspect Output Files')
        self._inspect_output_files()
        self._timer.stop('Inspect Output Files')
        if self._simplecomm.is_manager():
            self._vprint('...Output files inspected.', verbosity=0)

        # Debugging output
        if self._simplecomm.is_manager():
            self._vprint('Converting time-slices to time-series...',
                         verbosity=0)

        # Take this rank's list of time-series variables (partitioned while
        # inspecting the input files) and apply the output limit, if any
        tsv_names_loc = self._time_series_variables
        if output_limit > 0:
            tsv_names_loc = tsv_names_loc[0:output_limit]

        # Print partitions for all ranks
        dbg_msg = 'Converting time-series variables: {0}'.format(tsv_names_loc)
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Reset all of the timer values (as it is possible that there are no
        # time-series variables in the local list produced above)
        for timer_name in ('Open Output Files',
                           'Close Output Files',
                           'Open Input Files',
                           'Close Input Files',
                           'Create Time-Invariant Metadata',
                           'Create Time-Variant Metadata',
                           'Create Time-Series Variables',
                           'Read Time-Invariant Metadata',
                           'Read Time-Variant Metadata',
                           'Read Time-Series Variables',
                           'Write Time-Invariant Metadata',
                           'Write Time-Variant Metadata',
                           'Write Time-Series Variables'):
            self._timer.reset(timer_name)

        # Initialize the byte count dictionary
        self._byte_counts['Requested Data'] = 0
        self._byte_counts['Actual Data'] = 0

        #===== LOOP OVER TIME_SERIES VARIABLES =====

        # Write one output file per time-series variable
        for out_name in tsv_names_loc:
            self._convert_variable(out_name)

        # Information
        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint(('Finished converting time-slices '
                          'to time-series.'),
                         verbosity=0)

        # Finish clocking the entire convert procedure
        self._timer.stop('Complete Conversion Process')

    def _tally_bytes(self, data):
        """
        Add the size of one piece of copied data to the byte counters.

        'Requested Data' grows by the number of bytes in data (0 if data
        has no nbytes attribute); 'Actual Data' grows by that number
        rounded up to a whole multiple of the assumed block size.
        """
        requested_nbytes = data.nbytes if hasattr(data, 'nbytes') else 0
        self._byte_counts['Requested Data'] += requested_nbytes
        actual_nbytes = self.assumed_block_size \
            * numpy.ceil(requested_nbytes / self.assumed_block_size)
        self._byte_counts['Actual Data'] += actual_nbytes

    def _clone_variable(self, in_file, out_file, name):
        """
        Create variable name in out_file with the type code, dimensions
        and attributes that it has in in_file (no data is copied).
        """
        in_var = in_file.variables[name]
        out_var = out_file.create_variable(
            name, in_var.typecode(), in_var.dimensions)
        for att_name, att_val in in_var.attributes.items():
            setattr(out_var, att_name, att_val)

    def _write_output_header(self, in_file, out_file, out_name,
                             write_meta_data, write_tser_data):
        """
        Define the contents of a new output file from an input file.

        Copies the file attributes and dimensions, creates the metadata
        variables and/or the time-series variable as requested by the two
        flags, and copies the values of the time-invariant metadata.
        """
        # Copy file attributes and dimensions to output file; the unlimited
        # dimension is created with a size of None
        for name, val in in_file.attributes.items():
            setattr(out_file, name, val)
        for name, val in in_file.dimensions.items():
            if name == self._unlimited_dim:
                out_file.create_dimension(name, None)
            else:
                out_file.create_dimension(name, val)

        # Create the metadata variables
        if write_meta_data:

            # Time-invariant metadata variables
            self._timer.start('Create Time-Invariant Metadata')
            for name in self._time_invariant_metadata:
                self._clone_variable(in_file, out_file, name)
            self._timer.stop('Create Time-Invariant Metadata')

            # Time-variant metadata variables
            self._timer.start('Create Time-Variant Metadata')
            for name in self._time_variant_metadata:
                self._clone_variable(in_file, out_file, name)
            self._timer.stop('Create Time-Variant Metadata')

        # Create the time-series variable
        if write_tser_data:
            self._timer.start('Create Time-Series Variables')
            self._clone_variable(in_file, out_file, out_name)
            self._timer.stop('Create Time-Series Variables')

        dbg_msg = ('Writing output file for variable: '
                   '{0}').format(out_name)
        if out_name == 'once':
            dbg_msg = 'Writing "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Copy the time-invariant metadata
        if write_meta_data:
            for name in self._time_invariant_metadata:
                in_var = in_file.variables[name]
                out_var = out_file.variables[name]
                self._timer.start('Read Time-Invariant Metadata')
                tmp_data = in_var.get_value()
                self._timer.stop('Read Time-Invariant Metadata')
                self._timer.start('Write Time-Invariant Metadata')
                out_var.assign_value(tmp_data)
                self._timer.stop('Write Time-Invariant Metadata')
                self._tally_bytes(tmp_data)

    def _copy_time_step(self, in_file, out_file, name, slice_step_index,
                        series_step_index, read_timer, write_timer):
        """
        Copy one step of variable name from in_file to out_file.

        The step at index slice_step_index along the unlimited dimension
        of the input variable is written at index series_step_index along
        the unlimited dimension of the output variable.  The read and the
        write are clocked under the two given timer names.
        """
        in_var = in_file.variables[name]
        out_var = out_file.variables[name]

        # Select everything along all dimensions except the unlimited one,
        # where a single index is selected
        ndims = len(in_var.dimensions)
        udidx = in_var.dimensions.index(self._unlimited_dim)
        in_slice = [slice(None)] * ndims
        in_slice[udidx] = slice_step_index
        out_slice = [slice(None)] * ndims
        out_slice[udidx] = series_step_index

        self._timer.start(read_timer)
        tmp_data = in_var[tuple(in_slice)]
        self._timer.stop(read_timer)
        self._timer.start(write_timer)
        out_var[tuple(out_slice)] = tmp_data
        self._timer.stop(write_timer)

        self._tally_bytes(tmp_data)

    def _convert_variable(self, out_name):
        """
        Write the output file for one entry of the time-series variable
        list (a time-series variable, or 'once' for the once file).

        The data is written to a temporary file next to the final output
        file, which is renamed to the final name after it has been closed.
        Input files, and the output file, are closed even when an error
        interrupts the copy.
        """
        # With a once file, metadata goes only into the 'once' file and
        # time-series data only into the other files; without one, every
        # file gets both
        is_once_file = (out_name == 'once')
        write_meta_data = not (self._use_once_file and not is_once_file)
        write_tser_data = not (self._use_once_file and is_once_file)

        # Determine the output file name for this variable
        out_filename = self._time_series_filenames[out_name]
        dbg_msg = 'Opening output file for variable: {0}'.format(out_name)
        if is_once_file:
            dbg_msg = 'Opening "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Open the output file (under its temporary name)
        # NOTE(review): when appending, the existing output file is moved
        # to the temporary name, and a leftover temporary file is removed
        # on the next call -- so after a failed append the previous data
        # exists only in the temporary file.
        self._timer.start('Open Output Files')
        temp_filename = out_filename + '_temp_.nc'
        if exists(temp_filename):
            remove(temp_filename)
        appending = (self._write_mode == 'a' and out_name in self._existing)
        if appending:
            rename(out_filename, temp_filename)
        out_file = nio_open_file(temp_filename,
                                 'a' if appending else 'w',
                                 options=self._nio_options)
        self._timer.stop('Open Output Files')

        try:
            # Start the loop over input files (i.e., time-steps)
            series_step_index = self._time_series_step_index[out_name]
            for in_filename in self._input_filenames:

                # Open the input file
                self._timer.start('Open Input Files')
                in_file = nio_open_file(in_filename, 'r')
                self._timer.stop('Open Input Files')

                try:
                    # Create header info, if this is the first input file
                    # (an appended-to file already has it)
                    if (in_filename == self._input_filenames[0]
                            and not appending):
                        self._write_output_header(in_file, out_file,
                                                  out_name, write_meta_data,
                                                  write_tser_data)

                    # Get the number of time steps in this slice file
                    num_steps = in_file.dimensions[self._unlimited_dim]

                    # Explicitly loop over time steps (to control memory use)
                    for slice_step_index in range(num_steps):

                        # Copy the time-variant metadata
                        if write_meta_data:
                            for name in self._time_variant_metadata:
                                self._copy_time_step(
                                    in_file, out_file, name,
                                    slice_step_index, series_step_index,
                                    'Read Time-Variant Metadata',
                                    'Write Time-Variant Metadata')

                        # Copy the time-series variables
                        if write_tser_data:
                            self._copy_time_step(
                                in_file, out_file, out_name,
                                slice_step_index, series_step_index,
                                'Read Time-Series Variables',
                                'Write Time-Series Variables')

                        # Increment the time-series step index
                        series_step_index += 1
                finally:
                    # Close the input file, even if the copy failed
                    self._timer.start('Close Input Files')
                    in_file.close()
                    self._timer.stop('Close Input Files')
        except Exception:
            # Do not leak the output handle; the temporary file is left
            # where it is and is not renamed to the final name
            out_file.close()
            raise

        # Close the output file and move it to its final name
        self._timer.start('Close Output Files')
        out_file.close()
        rename(temp_filename, out_filename)
        self._timer.stop('Close Output Files')

        # Output message to user
        dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
        if is_once_file:
            dbg_msg = 'Closed "once" file.'
        self._vprint(dbg_msg, header=True, verbosity=1)

    def print_diagnostics(self):
        """
        Print out timing and I/O information collected up to this point

        The timer values are reduced over the communicator with 'max' and
        the byte counts with 'sum'.  Both tables are formatted on every
        rank, but only the manager rank prints them.  The byte counts
        stored on this object are left unchanged.
        """

        # Get all totals and maxima
        my_times = self._timer.get_all_times()
        max_times = self._simplecomm.allreduce(my_times, op='max')
        total_bytes = self._simplecomm.allreduce(self._byte_counts, op='sum')

        # Synchronize
        self._simplecomm.sync()

        # Print timing maxima
        o = self._timer.get_names()
        time_table_str = _pprint_dictionary('TIMING DATA', max_times, order=o)
        if self._simplecomm.is_manager():
            self._vprint(time_table_str, verbosity=-1)

        # Convert byte count to MB, in a separate dictionary
        # NOTE(review): whether allreduce can hand back the very dictionary
        # it was given is not visible here; converting into a new dictionary
        # guarantees that self._byte_counts is never rescaled by printing.
        bytes_per_mb = float(1024 * 1024)
        total_mbytes = {
            name: total_bytes[name] / bytes_per_mb for name in total_bytes
        }

        # Print byte count totals
        byte_count_str = _pprint_dictionary('BYTE COUNTS (MB)', total_mbytes)
        if self._simplecomm.is_manager():
            self._vprint(byte_count_str, verbosity=-1)
Example #8
0
class Slice2SeriesReshaper(Reshaper):

    """
    The time-slice to time-series Reshaper class

    This is the class that defines how the time-slice to time-series
    reshaping operation is to be performed.
    """

    def __init__(self, specifier, serial=False, verbosity=1, wmode='w',
                 once=False, simplecomm=None):
        """
        Constructor

        Checks the arguments, sets up the timer, communicator and verbose
        printer, validates the Specifier, and stores the file names and
        PyNIO options derived from it.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.
            serial (bool): True for serial operation, False for parallel
                operation.  Only used to create a communicator when no
                simplecomm is supplied.
            verbosity (int): Level of printed output (stdout).  A higher
                value means more output.  The default value is 1.
            wmode (str): The mode to use for writing output.  Can be 'w' for
                normal write operation, 's' to skip the output generation for
                existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series files.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication.  If None, one is created.

        Raises:
            TypeError: If an argument does not have the expected type.
            ValueError: If wmode is not one of 'w', 's', 'o' or 'a'.
        """

        # Argument checks: types first (in signature order, with the
        # communicator last), then the value of the write mode
        if not isinstance(specifier, Specifier):
            raise TypeError(
                "Input must be given in the form of a Specifier object")
        if type(serial) is not bool:
            raise TypeError("Serial indicator must be True or False.")
        if type(verbosity) is not int:
            raise TypeError("Verbosity level must be an integer.")
        if type(wmode) is not str:
            raise TypeError("Write mode flag must be a str.")
        if type(once) is not bool:
            raise TypeError("Once-file indicator must be True or False.")
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            raise TypeError("Simple communicator object is not a SimpleComm")
        if wmode not in ('w', 's', 'o', 'a'):
            raise ValueError("Write mode '{}' not recognized".format(wmode))

        # Once-file flag and output write mode
        self._use_once_file = once
        self._write_mode = wmode

        # Internal timer data
        self._timer = TimeKeeper()

        # Assumed I/O block size (in bytes) and the dictionary storing
        # read/write data amounts
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Keep the supplied communicator, or create one if none was given
        self._timer.start('Initializing Simple Communicator')
        self._simplecomm = (create_comm(serial=serial)
                            if simplecomm is None else simplecomm)
        self._timer.stop('Initializing Simple Communicator')

        # Print header of the form "[rank/size] "
        rank_text = str(self._simplecomm.get_rank())
        size_text = str(self._simplecomm.get_size())
        header = '[' + rank_text + '/' + size_text + '] '

        # Verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper...', verbosity=0)

        # Validate the user input data (timed)
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('  Specifier validated', verbosity=1)

        # Copy what is needed later out of the specifier: input file names,
        # metadata names, and the output file prefix and suffix
        self._input_filenames = specifier.input_file_list
        self._metadata_names = specifier.time_variant_metadata
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # PyNIO options, with the default PreFill option disabled
        nio_opts = nio_options()
        nio_opts.PreFill = False

        # Map the Specifier's NetCDF format string onto the Format (and,
        # for NetCDF4, CompressionLevel) options; any other format string
        # leaves both options untouched
        netcdf_format = specifier.netcdf_format
        if netcdf_format == 'netcdf':
            nio_opts.Format = 'Classic'
        elif netcdf_format in ('netcdf4', 'netcdf4c'):
            nio_opts.Format = 'NetCDF4Classic'
            nio_opts.CompressionLevel = specifier.compression_level
        self._nio_options = nio_opts
        if self._simplecomm.is_manager():
            self._vprint('  PyNIO options set', verbosity=1)

        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=0)

        # Sync before continuing
        self._simplecomm.sync()

    def _inspect_input_files(self):
        """
        Inspect the input data files themselves.

        The first input file determines the unlimited dimension and the
        category of every variable: time-invariant metadata (variables
        without the unlimited dimension), time-variant metadata (variables
        named in the Specifier's metadata list) or time-series variable
        (everything else).  Each remaining file is checked for the same
        unlimited dimension and for a variable of that name, and variables
        of the first file that are absent from a later file are reported in
        a warning.  The input filenames are then sorted by the first time
        value of each file, and the time-series variables are partitioned
        through the simple communicator.

        Every input file is closed again, including when an inspection
        error is raised.

        Raises:
            LookupError: If the first file has no unlimited dimension, or if
                a later file lacks that dimension, does not have it as an
                unlimited dimension, or has no variable named after it.
            ValueError: If the time values of two input files overlap.
        """

        # Initialize the list of variable names for each category
        self._time_variant_metadata = []
        self._time_invariant_metadata = []

        # Initialize the local dictionary of time-series variables and sizes
        all_tsvars = {}

        # Variables of the first file that are absent from a later file
        missing_vars = set()

        #===== INSPECT FIRST INPUT FILE =====

        # Open first file; the 'finally' clause guarantees it is closed
        # even if the inspection raises
        ifile = nio_open_file(self._input_filenames[0])
        try:
            # Look for the 'unlimited' dimension
            try:
                self._unlimited_dim = next(dim for dim in ifile.dimensions
                                           if ifile.unlimited(dim))
            except StopIteration:
                err_msg = 'Unlimited dimension not found.'
                raise LookupError(err_msg)

            # Get the time values
            time_values = [ifile.variables[self._unlimited_dim].get_value()]

            # Get the list of variable names
            var_names = set(ifile.variables.keys())

            # Categorize each variable (only looking at first file)
            for var_name, var in ifile.variables.iteritems():
                if self._unlimited_dim not in var.dimensions:
                    self._time_invariant_metadata.append(var_name)
                elif var_name in self._metadata_names:
                    self._time_variant_metadata.append(var_name)
                else:
                    # Size in bytes: item size times number of elements
                    size = numpy.dtype(var.typecode()).itemsize
                    size = size * numpy.prod(var.shape)
                    all_tsvars[var_name] = size
        finally:
            # Close the first file
            ifile.close()

        if self._simplecomm.is_manager():
            self._vprint('  First input file inspected.', verbosity=2)

        #===== INSPECT REMAINING INPUT FILES =====

        # Make a pass through remaining files and:
        # (1) Make sure it has the 'unlimited' dimension
        # (2) Make sure this dimension is truly 'unlimited'
        # (3) Check that this dimension has a corresponding variable
        # (4) Check if there are any missing variables
        # (5) Get the time values from the files
        for ifilename in self._input_filenames[1:]:
            ifile = nio_open_file(ifilename)
            try:
                # Determine the unlimited dimension
                if self._unlimited_dim not in ifile.dimensions:
                    err_msg = ('Unlimited dimension not found '
                               'in file "{}"').format(ifilename)
                    raise LookupError(err_msg)
                if not ifile.unlimited(self._unlimited_dim):
                    err_msg = ('Dimension "{}" not unlimited in file '
                               '"{}"').format(self._unlimited_dim, ifilename)
                    raise LookupError(err_msg)
                if self._unlimited_dim not in ifile.variables:
                    err_msg = ('Unlimited dimension variable not found in '
                               'file "{}"').format(ifilename)
                    raise LookupError(err_msg)

                # Get the time values (list of NDArrays)
                time_values.append(
                    ifile.variables[self._unlimited_dim].get_value())

                # Get the missing variables
                var_names_next = set(ifile.variables.keys())
                missing_vars.update(var_names - var_names_next)
            finally:
                # Close the file, whether or not the checks passed
                ifile.close()

        if self._simplecomm.is_manager():
            self._vprint('  Remaining input files inspected.', verbosity=2)

        #===== CHECK FOR MISSING VARIABLES =====

        # Warn (without failing) if a variable of the first file is not
        # present in every input file
        if len(missing_vars) != 0:
            warning = ("WARNING: The first input file has variables that are "
                       "not in all input files:{}{}").format(linesep, '   ')
            for var in missing_vars:
                warning += ' {}'.format(var)
            self._vprint(warning, header=True, verbosity=0)

        if self._simplecomm.is_manager():
            self._vprint('  Checked for missing variables.', verbosity=2)

        #===== SORT INPUT FILES BY TIME =====

        # Determine the sort order based on the first time in the time values
        old_order = range(len(self._input_filenames))
        new_order = sorted(old_order, key=lambda i: time_values[i][0])

        # Re-order the list of input filenames and time values
        new_filenames = [self._input_filenames[i] for i in new_order]
        new_values = [time_values[i] for i in new_order]

        # Now, check that the largest time in each file is less than the
        # smallest time in the next file (so that the time spans of each file
        # do not overlap)
        for i in xrange(1, len(new_values)):
            if new_values[i - 1][-1] >= new_values[i][0]:
                err_msg = ('Times in input files {} and {} appear '
                           'to overlap').format(new_filenames[i - 1],
                                                new_filenames[i])
                raise ValueError(err_msg)

        # Now that this is validated, save the filenames in the new order
        # (the sorted time values are only needed for the check above)
        self._input_filenames = new_filenames

        if self._simplecomm.is_manager():
            self._vprint('  Input files sorted by time.', verbosity=2)

        #===== FINALIZING OUTPUT =====

        # Debug output
        if self._simplecomm.is_manager():
            self._vprint('  Time-Invariant Metadata: '
                         '{}'.format(self._time_invariant_metadata), verbosity=1)
            self._vprint('  Time-Variant Metadata: '
                         '{}'.format(self._time_variant_metadata), verbosity=1)
            self._vprint('  Time-Series Variables: '
                         '{}'.format(all_tsvars.keys()), verbosity=1)

        # Add 'once' variable if writing to a once file
        # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
        #       a catch for all metadata IFF the 'once-file' is enabled.
        # NOTE(review): max() raises ValueError when there are no
        #       time-series variables at all -- confirm that cannot happen.
        if self._use_once_file:
            all_tsvars['once'] = max(all_tsvars.values())

        # Partition the time-series variables across processors, passing
        # the (name, size) pairs to the weight-balanced partitioner
        self._time_series_variables = self._simplecomm.partition(
            all_tsvars.items(), func=WeightBalanced(), involved=True)

    def _inspect_output_files(self):
        """
        Perform inspection of the output data files themselves.

        We compute the output file name of each local time-series variable
        from the prefix and suffix, and then we check whether the output
        files exist.  What is done with existing output files depends on the
        write mode:

            'o': the existing output files are deleted
            's': variables with an existing output file are removed from
                 the list of time-series variables to convert
            'a': each existing output file is checked for what appending
                 needs (unlimited dimension, metadata, time-series
                 variable), and its current length along the unlimited
                 dimension becomes the step index to start writing from
            any other mode: an error is raised if an output file exists

        Sets the _time_series_filenames, _existing and
        _time_series_step_index attributes.  Files opened for inspection in
        append mode are closed again, including when a check fails.

        Raises:
            RuntimeError: If an existing file cannot be appended to (append
                mode), or if any output file already exists in a mode that
                does not skip, overwrite or append.
        """

        # Loop through the time-series variables and generate output filenames
        self._time_series_filenames = {
            variable: self._output_prefix + variable + self._output_suffix
            for variable in self._time_series_variables}

        # Find which files already exist
        self._existing = [v for (v, f) in self._time_series_filenames.iteritems()
                          if isfile(f)]

        # Set the starting step index for each variable
        self._time_series_step_index = {
            variable: 0 for variable in self._time_series_variables}

        # If overwrite is enabled, delete all existing files first
        if self._write_mode == 'o':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Deleting existing output files for '
                             'time-series variables: {}'.format(self._existing),
                             verbosity=0)
            for variable in self._existing:
                remove(self._time_series_filenames[variable])

        # Or, if skip existing is set, remove the existing time-series
        # variables from the list of time-series variables to convert
        elif self._write_mode == 's':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Skipping time-series variables with '
                             'existing output files: {}'.format(self._existing),
                             verbosity=0)
            for variable in self._existing:
                self._time_series_variables.remove(variable)

        # Or, if appending, check that the existing output files conform
        # to the expected pattern
        elif self._write_mode == 'a':

            # Check each existing time-series file
            for variable in self._existing:

                # Get the matching filename
                filename = self._time_series_filenames[variable]

                # Open the time-series file for inspection; the 'finally'
                # clause closes it even when one of the checks raises
                tsfile = nio_open_file(filename, 'r')
                try:
                    # Check that the file has the dimension as unlimited
                    if not tsfile.unlimited(self._unlimited_dim):
                        err_msg = ("Cannot append to time-series file with "
                                   "missing unlimited dimension "
                                   "'{}'").format(self._unlimited_dim)
                        raise RuntimeError(err_msg)

                    # With a once file, metadata is only expected in the
                    # 'once' file and time-series data only in the others;
                    # without one, every file needs both
                    is_once_file = (variable == 'once')
                    needs_meta_data = not (
                        self._use_once_file and not is_once_file)
                    needs_tser_data = not (
                        self._use_once_file and is_once_file)

                    # Look for metadata
                    if needs_meta_data:

                        # Check that the time-variant metadata are all present
                        for metavar in self._time_variant_metadata:
                            if metavar not in tsfile.variables:
                                err_msg = ("Cannot append to time-series file "
                                           "with missing time-variant metadata "
                                           "'{}'").format(metavar)
                                raise RuntimeError(err_msg)

                    # Check that the time-series variable is present
                    if needs_tser_data and variable not in tsfile.variables:
                        err_msg = ("Cannot append to time-series file with "
                                   "missing time-series variable "
                                   "'{}'").format(variable)
                        raise RuntimeError(err_msg)

                    # Get the starting step index to start writing from
                    self._time_series_step_index[variable] = \
                        tsfile.dimensions[self._unlimited_dim]
                finally:
                    # Close the time-series file
                    tsfile.close()

        # Otherwise, throw an exception if any existing output files are found
        elif len(self._existing) > 0:
            err_msg = ("Found existing output files for time-series "
                       "variables: {}").format(self._existing)
            raise RuntimeError(err_msg)

    def convert(self, output_limit=0):
        """
        Method to perform the Reshaper's designated operation.

        In this case, convert a list of time-slice files to time-series files.

        The input and output files are inspected first.  Then, for each
        local time-series variable, an output file is written under a
        temporary name (the output name plus '_temp_.nc') by looping over
        all input files and all of their time steps, and is renamed to the
        output name once it has been closed.  Timing and byte-count
        information is accumulated along the way.

        Parameters:
            output_limit (int): Limit on the number of output (time-series)
                files to write during the convert() operation.  If set
                to 0, no limit is placed.  This limits the number
                of output files produced by each processor in a
                parallel run.

        Raises:
            TypeError: If output_limit is not an int.
        """
        # Type checking input
        if type(output_limit) is not int:
            err_msg = 'Output limit must be an integer'
            raise TypeError(err_msg)

        # Start the total convert process timer
        self._simplecomm.sync()
        self._timer.start('Complete Conversion Process')

        # Validate the input files themselves
        if self._simplecomm.is_manager():
            self._vprint('Inspecting input files...', verbosity=0)
        self._timer.start('Inspect Input Files')
        self._inspect_input_files()
        self._timer.stop('Inspect Input Files')
        if self._simplecomm.is_manager():
            self._vprint('...Input files inspected.', verbosity=0)

        # Validate the output files
        if self._simplecomm.is_manager():
            self._vprint('Inspecting output files...', verbosity=0)
        self._timer.start('Inspect Output Files')
        self._inspect_output_files()
        self._timer.stop('Inspect Output Files')
        if self._simplecomm.is_manager():
            self._vprint('...Output files inspected.', verbosity=0)

        # Debugging output
        if self._simplecomm.is_manager():
            self._vprint('Converting time-slices to time-series...', verbosity=0)

        # Take the local list of time-series variables (partitioned during
        # input file inspection) and keep only the first 'output_limit'
        # entries if a limit is set
        tsv_names_loc = self._time_series_variables
        if output_limit > 0:
            tsv_names_loc = tsv_names_loc[0:output_limit]

        # Print partitions for all ranks
        dbg_msg = 'Converting time-series variables: {}'.format(tsv_names_loc)
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Reset all of the timer values (as it is possible that there are no
        # time-series variables in the local list produced above)
        self._timer.reset('Open Output Files')
        self._timer.reset('Close Output Files')
        self._timer.reset('Open Input Files')
        self._timer.reset('Close Input Files')
        self._timer.reset('Create Time-Invariant Metadata')
        self._timer.reset('Create Time-Variant Metadata')
        self._timer.reset('Create Time-Series Variables')
        self._timer.reset('Read Time-Invariant Metadata')
        self._timer.reset('Read Time-Variant Metadata')
        self._timer.reset('Read Time-Series Variables')
        self._timer.reset('Write Time-Invariant Metadata')
        self._timer.reset('Write Time-Variant Metadata')
        self._timer.reset('Write Time-Series Variables')

        # Initialize the byte count dictionary
        self._byte_counts['Requested Data'] = 0
        self._byte_counts['Actual Data'] = 0

        # Defining a simple helper function to determine the bytes size of
        # a variable given to it, whether an NDArray or not (anything
        # without an 'nbytes' attribute counts as 0 bytes)
        def _get_bytesize(data):
            return data.nbytes if hasattr(data, 'nbytes') else 0

        #===== LOOP OVER TIME_SERIES VARIABLES =====

        # Loop over all time-series variables
        for out_name in tsv_names_loc:

            # Once-file data, for convenience: with a once file enabled,
            # metadata is written only to the 'once' file and time-series
            # data only to the other files; otherwise both are written
            is_once_file = (out_name == 'once')
            write_meta_data = not (self._use_once_file and not is_once_file)
            write_tser_data = not (self._use_once_file and is_once_file)

            # Determine the output file name for this variable
            out_filename = self._time_series_filenames[out_name]
            dbg_msg = 'Opening output file for variable: {}'.format(out_name)
            if out_name == 'once':
                dbg_msg = 'Opening "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Open the output file under a temporary name; it is renamed to
            # the output name after it has been closed.  Any temporary file
            # already present under that name is deleted first.
            self._timer.start('Open Output Files')
            temp_filename = out_filename + '_temp_.nc'
            if exists(temp_filename):
                remove(temp_filename)
            if self._write_mode == 'a' and out_name in self._existing:
                # Appending: move the existing output file to the temporary
                # name and open it in append mode
                rename(out_filename, temp_filename)
                out_file = nio_open_file(temp_filename, 'a',
                                         options=self._nio_options)
                appending = True
            else:
                out_file = nio_open_file(temp_filename, 'w',
                                         options=self._nio_options)
                appending = False
            self._timer.stop('Open Output Files')

            # Start the loop over input files (i.e., time-steps), writing
            # from the step index set during output file inspection
            series_step_index = self._time_series_step_index[out_name]
            for in_filename in self._input_filenames:

                # Open the input file
                self._timer.start('Open Input Files')
                in_file = nio_open_file(in_filename, 'r')
                self._timer.stop('Open Input Files')

                # Create header info, if this is the first input file
                # (skipped entirely when appending to an existing file)
                if in_filename == self._input_filenames[0] and not appending:

                    # Copy file attributes and dimensions to output file
                    for name, val in in_file.attributes.iteritems():
                        setattr(out_file, name, val)
                    for name, val in in_file.dimensions.iteritems():
                        if name == self._unlimited_dim:
                            out_file.create_dimension(name, None)
                        else:
                            out_file.create_dimension(name, val)

                    # Create the metadata variables
                    if write_meta_data:

                        # Time-invariant metadata variables
                        self._timer.start('Create Time-Invariant Metadata')
                        for name in self._time_invariant_metadata:
                            in_var = in_file.variables[name]
                            out_var = out_file.create_variable(
                                name, in_var.typecode(), in_var.dimensions)
                            for att_name, att_val in in_var.attributes.iteritems():
                                setattr(out_var, att_name, att_val)
                        self._timer.stop('Create Time-Invariant Metadata')

                        # Time-variant metadata variables
                        self._timer.start('Create Time-Variant Metadata')
                        for name in self._time_variant_metadata:
                            in_var = in_file.variables[name]
                            out_var = out_file.create_variable(
                                name, in_var.typecode(), in_var.dimensions)
                            for att_name, att_val in in_var.attributes.iteritems():
                                setattr(out_var, att_name, att_val)
                        self._timer.stop('Create Time-Variant Metadata')

                    # Create the time-series variable
                    if write_tser_data:

                        # Time-series variable
                        self._timer.start('Create Time-Series Variables')
                        in_var = in_file.variables[out_name]
                        out_var = out_file.create_variable(
                            out_name, in_var.typecode(), in_var.dimensions)
                        for att_name, att_val in in_var.attributes.iteritems():
                            setattr(out_var, att_name, att_val)
                        self._timer.stop('Create Time-Series Variables')

                    dbg_msg = ('Writing output file for variable: '
                               '{}').format(out_name)
                    if out_name == 'once':
                        dbg_msg = 'Writing "once" file.'
                    self._vprint(dbg_msg, header=True, verbosity=1)

                    # Copy the time-invariant metadata (values are taken
                    # from the first input file only)
                    if write_meta_data:

                        for name in self._time_invariant_metadata:
                            in_var = in_file.variables[name]
                            out_var = out_file.variables[name]
                            self._timer.start('Read Time-Invariant Metadata')
                            tmp_data = in_var.get_value()
                            self._timer.stop('Read Time-Invariant Metadata')
                            self._timer.start('Write Time-Invariant Metadata')
                            out_var.assign_value(tmp_data)
                            self._timer.stop('Write Time-Invariant Metadata')

                            # 'Actual Data' is the requested size rounded up
                            # to a whole number of assumed blocks
                            requested_nbytes = _get_bytesize(tmp_data)
                            self._byte_counts[
                                'Requested Data'] += requested_nbytes
                            actual_nbytes = self.assumed_block_size \
                                * numpy.ceil(requested_nbytes / self.assumed_block_size)
                            self._byte_counts['Actual Data'] += actual_nbytes

                # Get the number of time steps in this slice file
                num_steps = in_file.dimensions[self._unlimited_dim]

                # Explicitly loop over time steps (to control memory use)
                for slice_step_index in xrange(num_steps):

                    # Copy the time-variant metadata
                    if write_meta_data:

                        for name in self._time_variant_metadata:
                            in_var = in_file.variables[name]
                            out_var = out_file.variables[name]
                            # Select everything along each dimension except
                            # the unlimited one, where a single step is
                            # read from the input and written at the
                            # running output step index
                            ndims = len(in_var.dimensions)
                            udidx = in_var.dimensions.index(
                                self._unlimited_dim)
                            in_slice = [slice(None)] * ndims
                            in_slice[udidx] = slice_step_index
                            out_slice = [slice(None)] * ndims
                            out_slice[udidx] = series_step_index
                            self._timer.start('Read Time-Variant Metadata')
                            tmp_data = in_var[tuple(in_slice)]
                            self._timer.stop('Read Time-Variant Metadata')
                            self._timer.start('Write Time-Variant Metadata')
                            out_var[tuple(out_slice)] = tmp_data
                            self._timer.stop('Write Time-Variant Metadata')

                            requested_nbytes = _get_bytesize(tmp_data)
                            self._byte_counts[
                                'Requested Data'] += requested_nbytes
                            actual_nbytes = self.assumed_block_size \
                                * numpy.ceil(requested_nbytes / self.assumed_block_size)
                            self._byte_counts['Actual Data'] += actual_nbytes

                    # Copy the time-series variables (same single-step
                    # slicing along the unlimited dimension as above)
                    if write_tser_data:

                        in_var = in_file.variables[out_name]
                        out_var = out_file.variables[out_name]
                        ndims = len(in_var.dimensions)
                        udidx = in_var.dimensions.index(self._unlimited_dim)
                        in_slice = [slice(None)] * ndims
                        in_slice[udidx] = slice_step_index
                        out_slice = [slice(None)] * ndims
                        out_slice[udidx] = series_step_index
                        self._timer.start('Read Time-Series Variables')
                        tmp_data = in_var[tuple(in_slice)]
                        self._timer.stop('Read Time-Series Variables')
                        self._timer.start('Write Time-Series Variables')
                        out_var[tuple(out_slice)] = tmp_data
                        self._timer.stop('Write Time-Series Variables')

                        requested_nbytes = _get_bytesize(tmp_data)
                        self._byte_counts['Requested Data'] += requested_nbytes
                        actual_nbytes = self.assumed_block_size \
                            * numpy.ceil(requested_nbytes / self.assumed_block_size)
                        self._byte_counts['Actual Data'] += actual_nbytes

                    # Increment the time-series step index
                    series_step_index += 1

                # Close the input file
                self._timer.start('Close Input Files')
                in_file.close()
                self._timer.stop('Close Input Files')

            # Close the output file and move it from the temporary name to
            # the output name
            self._timer.start('Close Output Files')
            out_file.close()
            rename(temp_filename, out_filename)
            self._timer.stop('Close Output Files')

            # Output message to user
            dbg_msg = 'Closed output file for variable: {}'.format(out_name)
            if out_name == 'once':
                dbg_msg = 'Closed "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

        # Information
        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint(('Finished converting time-slices '
                          'to time-series.'), verbosity=0)

        # Finish clocking the entire convert procedure
        self._timer.stop('Complete Conversion Process')

    def print_diagnostics(self):
        """
        Report the timing and I/O statistics gathered so far

        Timer values are reduced with 'max' and byte counts with 'sum'
        through the simple communicator.  Both tables are built on every
        rank, but only the manager rank prints them.
        """

        # Reduce the local timers (max) and the local byte counters (sum)
        local_times = self._timer.get_all_times()
        slowest_times = self._simplecomm.allreduce(local_times, op='max')
        local_bytes = self._byte_counts
        summed_bytes = self._simplecomm.allreduce(local_bytes, op='sum')

        # Synchronize before printing
        self._simplecomm.sync()

        # Timing table, using the timer's list of names as the row order
        timer_order = self._timer.get_names()
        timing_table = _pprint_dictionary('TIMING DATA', slowest_times,
                                          order=timer_order)
        if self._simplecomm.is_manager():
            self._vprint(timing_table, verbosity=-1)

        # Rescale the reduced byte counts from bytes to MB (in place)
        bytes_per_mb = float(1024 * 1024)
        for label in summed_bytes:
            summed_bytes[label] = summed_bytes[label] / bytes_per_mb

        # Byte-count table
        bytes_table = _pprint_dictionary('BYTE COUNTS (MB)', summed_bytes)
        if self._simplecomm.is_manager():
            self._vprint(bytes_table, verbosity=-1)
Example #9
0
    def __init__(self, specifier, serial=False, verbosity=1,
                 skip_existing=False, overwrite=False,
                 once=False, simplecomm=None):
        """
        Constructor

        Checks the argument types, sets up the communicator, the timer and
        the verbose printer, configures the PyNIO output options, opens every
        input file, and then runs (in this order) the input-file validation,
        the input-file sorting, the variable sorting and the output-file
        validation steps.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.

        Keyword Arguments:
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            skip_existing (bool): Flag specifying whether to skip the generation
                of time-series for variables with time-series files that already
                exist.  Default is False.
            overwrite (bool): Flag specifying whether to forcefully overwrite
                output files if they already exist.  Default is False.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary

        Raises:
            TypeError: If specifier, serial, verbosity, skip_existing, once
                or simplecomm does not have the expected type.
        """

        # Type checking (or double-checking)
        # NOTE: 'overwrite' is not type-checked here; it is passed through
        # unchanged to _validate_output_files.
        if not isinstance(specifier, Specifier):
            err_msg = "Input must be given in the form of a Specifier object"
            raise TypeError(err_msg)
        if type(serial) is not bool:
            err_msg = "Serial indicator must be True or False."
            raise TypeError(err_msg)
        if type(verbosity) is not int:
            err_msg = "Verbosity level must be an integer."
            raise TypeError(err_msg)
        if type(skip_existing) is not bool:
            err_msg = "Skip_existing flag must be True or False."
            raise TypeError(err_msg)
        if type(once) is not bool:
            err_msg = "Once-file indicator must be True or False."
            raise TypeError(err_msg)
        if simplecomm is not None:
            if not (isinstance(simplecomm, SimpleComm) or
                    isinstance(simplecomm, SimpleCommMPI)):
                # Adjacent string literals (no comma between them) so that
                # the message is one string rather than a tuple of two.
                err_msg = ("Simple communicator object is not a SimpleComm or "
                           "SimpleCommMPI")
                raise TypeError(err_msg)

        # Whether to write a once file
        self._use_once_file = once

        # Internal timer data
        self._timer = TimeKeeper()

        # Dictionary storing read/write data amounts
        # (assumed_block_size is 4 * 1024 * 1024 -- presumably a size in
        # bytes; confirm against where it is consumed)
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)
        # Reference to the simple communicator
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Construct the print header, of the form "[rank/size] "
        header = ''.join(['[', str(self._simplecomm.get_rank()),
                          '/', str(self._simplecomm.get_size()), '] '])

        # Reference to the verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper', verbosity=1)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('Specifier validated', verbosity=1)

        # Setup PyNIO options (including disabling the default PreFill option)
        opt = Nio.options()
        opt.PreFill = False

        # Determine the Format and CompressionLevel options
        # from the NetCDF format string in the Specifier.  Any other format
        # string leaves Format/CompressionLevel at whatever Nio.options()
        # provided.
        if specifier.netcdf_format == 'netcdf':
            opt.Format = 'Classic'
        elif specifier.netcdf_format == 'netcdf4':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = 0
        elif specifier.netcdf_format == 'netcdf4c':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = specifier.netcdf_deflate
            if self._simplecomm.is_manager():
                self._vprint('PyNIO compression level: {0}'.format(
                    specifier.netcdf_deflate), verbosity=2)

        self._nio_options = opt
        if self._simplecomm.is_manager():
            self._vprint('PyNIO options set', verbosity=2)

        # Open all of the input files (read-only), in Specifier order
        self._timer.start('Open Input Files')
        self._input_files = []
        for filename in specifier.input_file_list:
            self._input_files.append(Nio.open_file(filename, "r"))
        self._timer.stop('Open Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files opened', verbosity=2)

        # Validate the input files themselves
        self._timer.start('Input File Validation')
        self._validate_input_files(specifier)
        self._timer.stop('Input File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Input files validated', verbosity=2)

        # Sort the input files by time
        self._timer.start('Sort Input Files')
        self._sort_input_files_by_time(specifier)
        self._timer.stop('Sort Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files sorted', verbosity=2)

        # Retrieve and sort the variables in each time-slice file
        # (To determine if it is time-invariant metadata, time-variant
        # metadata, or if it is a time-series variable)
        self._timer.start('Sort Variables')
        self._sort_variables(specifier)
        self._timer.stop('Sort Variables')
        if self._simplecomm.is_manager():
            self._vprint('Variables sorted', verbosity=2)

        # Validate the output files
        self._timer.start('Output File Validation')
        self._validate_output_files(specifier, skip_existing, overwrite)
        self._timer.stop('Output File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Output files validated', verbosity=2)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=1)

        # Sync before continuing..
        self._simplecomm.sync()
Example #10
0
class Slice2SeriesReshaper(Reshaper):

    """
    The time-slice to time-series Reshaper class

    This is the class that defines how the time-slice to time-series 
    reshaping operation is to be performed.
    """

    def __init__(self, specifier, serial=False, verbosity=1,
                 skip_existing=False, overwrite=False,
                 once=False, simplecomm=None):
        """
        Constructor

        Checks the argument types, sets up the communicator, the timer and
        the verbose printer, configures the PyNIO output options, opens every
        input file, and then runs (in this order) the input-file validation,
        the input-file sorting, the variable sorting and the output-file
        validation steps.

        Parameters:
            specifier (Specifier): An instance of the Specifier class,
                defining the input specification for this reshaper operation.

        Keyword Arguments:
            serial (bool): True or False, indicating whether the operation
                should be performed in serial (True) or parallel
                (False).  The default is to assume parallel operation
                (but serial will be chosen if the mpi4py cannot be
                found when trying to initialize decomposition).
            verbosity(int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.  The
                default value is 1.
            skip_existing (bool): Flag specifying whether to skip the generation
                of time-series for variables with time-series files that already
                exist.  Default is False.
            overwrite (bool): Flag specifying whether to forcefully overwrite
                output files if they already exist.  Default is False.
            once (bool): True or False, indicating whether the Reshaper should
                write all metadata to a 'once' file (separately).
            simplecomm (SimpleComm): A SimpleComm object to handle the parallel
                communication, if necessary

        Raises:
            TypeError: If specifier, serial, verbosity, skip_existing, once
                or simplecomm does not have the expected type.
        """

        # Type checking (or double-checking)
        # NOTE: 'overwrite' is not type-checked here; it is passed through
        # unchanged to _validate_output_files.
        if not isinstance(specifier, Specifier):
            err_msg = "Input must be given in the form of a Specifier object"
            raise TypeError(err_msg)
        if type(serial) is not bool:
            err_msg = "Serial indicator must be True or False."
            raise TypeError(err_msg)
        if type(verbosity) is not int:
            err_msg = "Verbosity level must be an integer."
            raise TypeError(err_msg)
        if type(skip_existing) is not bool:
            err_msg = "Skip_existing flag must be True or False."
            raise TypeError(err_msg)
        if type(once) is not bool:
            err_msg = "Once-file indicator must be True or False."
            raise TypeError(err_msg)
        if simplecomm is not None:
            if not (isinstance(simplecomm, SimpleComm) or
                    isinstance(simplecomm, SimpleCommMPI)):
                # Adjacent string literals (no comma between them) so that
                # the message is one string rather than a tuple of two.
                err_msg = ("Simple communicator object is not a SimpleComm or "
                           "SimpleCommMPI")
                raise TypeError(err_msg)

        # Whether to write a once file
        self._use_once_file = once

        # Internal timer data
        self._timer = TimeKeeper()

        # Dictionary storing read/write data amounts
        # (assumed_block_size is the granularity that convert() rounds
        # requested byte counts up to when tallying 'Actual Data')
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        self._timer.start('Initializing Simple Communicator')
        if simplecomm is None:
            simplecomm = create_comm(serial=serial)
        # Reference to the simple communicator
        self._simplecomm = simplecomm
        self._timer.stop('Initializing Simple Communicator')

        # Construct the print header, of the form "[rank/size] "
        header = ''.join(['[', str(self._simplecomm.get_rank()),
                          '/', str(self._simplecomm.get_size()), '] '])

        # Reference to the verbose printer tool
        self._vprint = VPrinter(header=header, verbosity=verbosity)

        # Debug output starting
        if self._simplecomm.is_manager():
            self._vprint('Initializing Reshaper', verbosity=1)

        # Validate the user input data
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if self._simplecomm.is_manager():
            self._vprint('Specifier validated', verbosity=1)

        # Setup PyNIO options (including disabling the default PreFill option)
        opt = Nio.options()
        opt.PreFill = False

        # Determine the Format and CompressionLevel options
        # from the NetCDF format string in the Specifier.  Any other format
        # string leaves Format/CompressionLevel at whatever Nio.options()
        # provided.
        if specifier.netcdf_format == 'netcdf':
            opt.Format = 'Classic'
        elif specifier.netcdf_format == 'netcdf4':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = 0
        elif specifier.netcdf_format == 'netcdf4c':
            opt.Format = 'NetCDF4Classic'
            opt.CompressionLevel = specifier.netcdf_deflate
            if self._simplecomm.is_manager():
                self._vprint('PyNIO compression level: {0}'.format(
                    specifier.netcdf_deflate), verbosity=2)

        self._nio_options = opt
        if self._simplecomm.is_manager():
            self._vprint('PyNIO options set', verbosity=2)

        # Open all of the input files (read-only), in Specifier order
        self._timer.start('Open Input Files')
        self._input_files = []
        for filename in specifier.input_file_list:
            self._input_files.append(Nio.open_file(filename, "r"))
        self._timer.stop('Open Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files opened', verbosity=2)

        # Validate the input files themselves
        self._timer.start('Input File Validation')
        self._validate_input_files(specifier)
        self._timer.stop('Input File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Input files validated', verbosity=2)

        # Sort the input files by time
        self._timer.start('Sort Input Files')
        self._sort_input_files_by_time(specifier)
        self._timer.stop('Sort Input Files')
        if self._simplecomm.is_manager():
            self._vprint('Input files sorted', verbosity=2)

        # Retrieve and sort the variables in each time-slice file
        # (To determine if it is time-invariant metadata, time-variant
        # metadata, or if it is a time-series variable)
        self._timer.start('Sort Variables')
        self._sort_variables(specifier)
        self._timer.stop('Sort Variables')
        if self._simplecomm.is_manager():
            self._vprint('Variables sorted', verbosity=2)

        # Validate the output files
        self._timer.start('Output File Validation')
        self._validate_output_files(specifier, skip_existing, overwrite)
        self._timer.stop('Output File Validation')
        if self._simplecomm.is_manager():
            self._vprint('Output files validated', verbosity=2)

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Reshaper initialized.', verbosity=1)

        # Sync before continuing..
        self._simplecomm.sync()

    def _validate_input_files(self, specifier):
        """
        Perform validation of input data files themselves.  

        We check the file contents here, assuming that the files are already 
        open.

        Parameters:
            specifier (Specifier): The reshaper specifier object
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Validating input files', verbosity=1)

        # In the first file, look for the 'unlimited' dimension
        ifile = self._input_files[0]
        self._unlimited_dim = None
        for dim in ifile.dimensions:
            if ifile.unlimited(dim):
                self._unlimited_dim = dim
                break  # There can only be 1!
        if self._unlimited_dim == None:
            err_msg = 'Unlimited dimension not identified.'
            raise LookupError(err_msg)

        # Make a pass through each file and:
        # (1) Make sure it has the 'unlimited' dimension
        # (2) Make sure this dimension is truely 'unlimited'
        # (3) Check that this dimension has a corresponding variable
        for i in range(len(self._input_files)):
            ifile = self._input_files[i]
            if self._unlimited_dim not in ifile.dimensions:
                err_msg = 'Unlimited dimension not found in file ({0})'.\
                          format(specifier.input_file_list[i])
                raise LookupError(err_msg)
            if not ifile.unlimited(self._unlimited_dim):
                err_msg = 'Unlimited dimension not unlimited in file ({0})'.\
                          format(specifier.input_file_list[i])
                raise LookupError(err_msg)
            if self._unlimited_dim not in ifile.variables:
                err_msg = 'Unlimited dimension variable not found in file ({0})'.\
                          format(specifier.input_file_list[i])
                raise LookupError(err_msg)

        # Make sure that the list of variables in each file is the same
        variables = self._input_files[0].variables
        var_names = set(variables.keys())
        missing_vars = set()
        for ifile in self._input_files[1:]:
            var_names_next = set(ifile.variables.keys())
            missing_vars.update(var_names - var_names_next)
        if len(missing_vars) != 0:
            warning = "WARNING: The first input file has variables that are " \
                + "not in all input files:" + os.linesep + '   '
            warning += " ".join(missing_vars)
            self._vprint(warning, header=True, verbosity=1)

    def _sort_input_files_by_time(self, specifier):
        """
        Internal method for sorting the input files by time

        This assumes that 'time' is the unlimited dimension, and it checks
        to make sure that all of the times spanning across each file do not 
        overlap with each other (i.e., that the times across all files are 
        monotonicly increasing).

        Currently, this method assumes that all of the input files
        have the same 'time:units' attribute, such that all time variable
        values are measured from the same date-time.  When this is true,
        we do not need to consider the value of the 'time:units'
        attribute itself.  If this assumption is not true, then we need
        to consider the 'time:units" attribute of each file, together
        with that file's time variable values.  To do that properly,
        however, one should use UDUNITS to do the comparisons.

        Parameters:
            specifier (Specifier): The reshaper specifier object
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Sorting input files', verbosity=1)

        # Get the time attributes (for convenience) and, for each file,
        # add the times to a list.  (Each file will have an array of times
        # associated with it.  Each array will be added to a list, such
        # that the outer-most list contains an array for each input file)
        time_values = []
        for ifile in self._input_files:
            time_values.append(
                ifile.variables[self._unlimited_dim].get_value())

        # Determine the sort order based on the first time in the time values
        order = range(len(self._input_files))
        new_order = sorted(order, key=lambda i: time_values[i][0])

        # Re-order the list of input files and filenames
        new_file_list = [None] * len(new_order)
        new_filenames = [None] * len(new_order)
        new_values = [None] * len(new_order)
        for i in order:
            new_file_list[i] = self._input_files[new_order[i]]
            new_filenames[i] = specifier.input_file_list[new_order[i]]
            new_values[i] = time_values[new_order[i]]

        # Save this data in the new orders
        self._input_files = new_file_list
        self._input_filenames = new_filenames

        # Now, check that the largest time in each file is less than the
        # smallest time in the next file (so that the time spans of each file
        # do not overlap)
        for i in order[:-1]:
            if new_values[i][-1] >= new_values[i + 1][0]:
                err_msg = 'Times in input files {0} and {1} appear to overlap'
                err_msg = err_msg.format(new_filenames[i], new_filenames[i+1])
                raise ValueError(err_msg)

        # Now that this is validated, let's string together the numpy array
        # of all times (using the new_values array)
        self._all_time_values = \
            numpy.fromiter(itertools.chain.from_iterable(new_values),
                           dtype='float')

    def _sort_variables(self, specifier):
        """
        Internal method for sorting the variables in each time-slice file

        This method determines if each variable is to be treated as 
        time-invariant metadata, time-variant metadata (user defined), or 
        time-series variables.  All metadata is written to every time-series 
        file, and any time-series variable is written to its own file.  
        The time-variant metadata variables are determined by user input, 
        and are contained in the Specifier data member:

            Specifier.time_variant_metadata.

        Parameters:
            specifier (Specifier): The reshaper specifier object
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Sorting variables', verbosity=1)

        # Initialize the dictionary of variable names for each category
        # (Keys are variable names, Values are variable sizes)
        self._time_variant_metadata = {}
        self._time_invariant_metadata = {}
        self._time_series_variables = {}

        # Categorize each variable (only looking at first file)
        variables = self._input_files[0].variables
        for var_name in variables.keys():
            var = variables[var_name]
            size = numpy.dtype(var.typecode()).itemsize
            size = size * numpy.prod(var.shape)
            if self._unlimited_dim not in var.dimensions:
                self._time_invariant_metadata[var_name] = size
            elif var_name in specifier.time_variant_metadata:
                self._time_variant_metadata[var_name] = size
            else:
                self._time_series_variables[var_name] = size

        # Debug output
        if self._simplecomm.is_manager():
            self._vprint('Time-Invariant Metadata: ' +
                         str(self._time_invariant_metadata.keys()), verbosity=2)
            self._vprint('Time-Variant Metadata: ' +
                         str(self._time_variant_metadata.keys()), verbosity=2)
            self._vprint('Time-Series Variables: ' +
                         str(self._time_series_variables.keys()), verbosity=2)

        # Add 'once' variable if writing to a once file
        # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
        #       a catch for all metadata IFF the 'once-file' is enabled.
        if self._use_once_file:
            self._time_series_variables['once'] = 1

    def _validate_output_files(self, specifier,
                               skip_existing=False, overwrite=False):
        """
        Perform validation of output data files themselves.  

        We compute the output file name from the prefix and suffix, and then
        we check whether the output files exist.  By default, if the output
        file

        Parameters:
            specifier (Specifier): The reshaper specifier object

        Keyword Arguments:
            skip_existing (bool): Flag specifying whether to skip the generation
                of time-series for variables with time-series files that already
                exist.  Default is False.
            overwrite (bool): Flag specifying whether to forcefully overwrite
                output files if they already exist.  Default is False.
        """

        # Helpful debugging message
        if self._simplecomm.is_manager():
            self._vprint('Validating output files', verbosity=1)

        # Loop through the time-series variables and generate output filenames
        prefix = specifier.output_file_prefix
        suffix = specifier.output_file_suffix
        self._time_series_filenames = \
            dict([(variable, prefix + variable + suffix)
                  for variable in self._time_series_variables])

        # Find which files already exist
        existing = []
        for variable, filename in self._time_series_filenames.items():
            if os.path.isfile(filename):
                existing.append(variable)

        # If overwrite is enabled, delete all existing files first
        if overwrite:
            if self._simplecomm.is_manager():
                self._vprint('WARNING: Deleting existing output files for '
                             'time-series variables: {0}'.format(existing),
                             verbosity=1)
            for variable in existing:
                os.remove(self._time_series_filenames[variable])

        # Or, if skip_existing is set, remove the existing time-series
        # variables from the list of time-series variables to convert
        elif skip_existing:
            if self._simplecomm.is_manager():
                self._vprint('WARNING: Skipping time-series variables with '
                             'existing output files: {0}'.format(existing),
                             verbosity=1)
            for variable in existing:
                self._time_series_variables.pop(variable)

        # Otherwise, throw an exception if any existing output files are found
        elif len(existing) > 0:
            err_msg = ("Found existing output files for time-series "
                       "variables: {0}").format(existing)
            raise RuntimeError(err_msg)

    def convert(self, output_limit=0):
        """
        Method to perform the Reshaper's designated operation.

        In this case, convert a list of time-slice files to time-series files.

        Keyword Arguments:
            output_limit (int): Limit on the number of output (time-series) 
                files to write during the convert() operation.  If set
                to 0, no limit is placed.  This limits the number
                of output files produced by each processor in a
                parallel run.
        """
        # Type checking input
        if type(output_limit) is not int:
            err_msg = 'Output limit must be an integer'
            raise TypeError(err_msg)

        # Start the total convert process timer
        self._simplecomm.sync()
        self._timer.start('Complete Conversion Process')

        # Debugging output
        if self._simplecomm.is_manager():
            self._vprint('Converting time-slices to time-series', verbosity=1)

        # For data common to all input files, we reference only the first
        ref_infile = self._input_files[0]

        # Store the common dimensions and attributes for each file
        # (taken from the first input file in the list)
        common_dims = ref_infile.dimensions
        common_atts = ref_infile.attributes

        # Partition the time-series variables across all processors
        tsv_names_loc = self._simplecomm.partition(self._time_series_variables.items(),
                                                   func=WeightBalanced(),
                                                   involved=True)
        if output_limit > 0:
            tsv_names_loc = tsv_names_loc[0:output_limit]

        # Print partitions for all ranks
        dbg_msg = 'Local time-series variables are {0}'.format(tsv_names_loc)
        self._vprint(dbg_msg, header=True, verbosity=2)

        # Reset all of the timer values (as it is possible that there are no
        # time-series variables in the local list procuded above)
        self._timer.reset('Open Output Files')
        self._timer.reset('Create Time-Invariant Metadata')
        self._timer.reset('Create Time-Variant Metadata')
        self._timer.reset('Create Time-Series Variables')
        self._timer.reset('Write Time-Invariant Metadata')
        self._timer.reset('Write Time-Variant Metadata')
        self._timer.reset('Write Time-Series Variables')
        self._timer.reset('Close Output Files')

        # Initialize the byte count dictionary
        self._byte_counts['Requested Data'] = 0
        self._byte_counts['Actual Data'] = 0

        # Defining a simple helper function to determine whether to
        # write time-series data and/or write metadata.  This is useful
        # for adding the ability to write a "once" file
        def _get_once_info(vname):
            is_once_file = (vname == 'once')
            write_meta = True
            write_tser = True
            if self._use_once_file:
                write_meta = is_once_file
                write_tser = not is_once_file
            return is_once_file, write_meta, write_tser

        # NOTE: In the prototype, we check for the existance of the output
        # directory at this point.  If it does not exist, we create it (but
        # only from the master rank).  This requires synchronization with
        # the decomp utility.  Instead, we assume the output directory
        # already exists (and is checked by the Specifier's validation).  No
        # synchronization is needed.

        # For each time-series variable, create the corresponding output file
        # (Also defines the header info for each output file)
        out_files = {}
        out_tvm_vars = {}
        for out_name in tsv_names_loc:
            is_once_file, write_meta, write_tser = _get_once_info(out_name)

            # Determine the output file name for this variable
            out_filename = self._time_series_filenames[out_name]
            dbg_msg = 'Creating output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Creating "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Open each output file and create the dimensions and attributes
            # NOTE: If the output file already exists, abort!
            self._timer.start('Open Output Files')
            if os.path.exists(out_filename):
                err_msg = 'Found existing output file: {0}'.format(out_filename)
                raise OSError(err_msg)
            out_file = Nio.open_file(out_filename, 'w',
                                     options=self._nio_options)
            for att_name, att_val in common_atts.iteritems():
                setattr(out_file, att_name, att_val)
            for dim_name, dim_val in common_dims.iteritems():
                if dim_name == self._unlimited_dim:
                    out_file.create_dimension(dim_name, None)
                else:
                    out_file.create_dimension(dim_name, dim_val)
            self._timer.stop('Open Output Files')

            # Create the time-invariant metadata variables
            if (write_meta):
                self._timer.start('Create Time-Invariant Metadata')
                for name in self._time_invariant_metadata:
                    in_var = ref_infile.variables[name]
                    out_var = out_file.create_variable(name,
                                                       in_var.typecode(),
                                                       in_var.dimensions)
                    for att_name, att_val in in_var.attributes.iteritems():
                        setattr(out_var, att_name, att_val)
                self._timer.stop('Create Time-Invariant Metadata')

            # Create the time-variant metadata variables
            if write_meta:
                self._timer.start('Create Time-Variant Metadata')
                for name in self._time_variant_metadata:
                    in_var = ref_infile.variables[name]
                    out_tvm_vars[name] = out_file.create_variable(name,
                                                                  in_var.typecode(), in_var.dimensions)
                    for att_name, att_val in in_var.attributes.iteritems():
                        setattr(out_tvm_vars[name], att_name, att_val)
                self._timer.stop('Create Time-Variant Metadata')

            # Create the time-series variable itself
            if write_tser:
                self._timer.start('Create Time-Series Variables')
                in_var = ref_infile.variables[out_name]
                out_var = out_file.create_variable(out_name,
                                                   in_var.typecode(), in_var.dimensions)
                self._timer.stop('Create Time-Series Variables')

            # Append the output file to list
            out_files[out_name] = out_file

        # Now that each output file has been created, start writing the data
        # (Looping over output file index, which is common in name lists)
        for out_name, out_file in out_files.iteritems():
            is_once_file, write_meta, write_tser = _get_once_info(out_name)

            dbg_msg = 'Writing output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Writing "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Create the attributes of the time-series variable
            if write_tser:
                in_var = ref_infile.variables[out_name]
                out_var = out_file.variables[out_name]
                for att_name, att_val in in_var.attributes.iteritems():
                    setattr(out_var, att_name, att_val)

            # Write the time-invariant metadata
            if write_meta:
                self._timer.start('Write Time-Invariant Metadata')
                for name in self._time_invariant_metadata:
                    in_meta = ref_infile.variables[name]
                    out_meta = out_file.variables[name]
                    if in_meta.rank > 0:
                        out_meta[:] = in_meta[:]
                    else:
                        out_meta.assign_value(in_meta.get_value())
                self._timer.stop('Write Time-Invariant Metadata')

            # Write each time-variant variable
            series_step_index = 0
            for in_file in self._input_files:

                # Get the number of time steps in this slice file
                num_steps = in_file.dimensions[self._unlimited_dim]

                # Loop over the time steps in this slice file
                for slice_step_index in range(num_steps):

                    # Write the time-varient metadata
                    if write_meta:
                        self._timer.start('Write Time-Variant Metadata')
                        for name in self._time_variant_metadata:
                            in_meta = in_file.variables[name]
                            out_meta = out_file.variables[name]
                            ndims = len(in_meta.dimensions)
                            udidx = in_meta.dimensions.index(
                                self._unlimited_dim)
                            in_slice = [slice(None)] * ndims
                            in_slice[udidx] = slice_step_index
                            out_slice = [slice(None)] * ndims
                            out_slice[udidx] = series_step_index
                            out_meta[tuple(out_slice)] = in_meta[
                                tuple(in_slice)]

                            requested_nbytes = in_meta[:].nbytes
                            self._byte_counts[
                                'Requested Data'] += requested_nbytes
                            actual_nbytes = self.assumed_block_size \
                                * numpy.ceil(requested_nbytes / self.assumed_block_size)
                            self._byte_counts['Actual Data'] += actual_nbytes
                        self._timer.stop('Write Time-Variant Metadata')

                    # Write the time-series variables
                    if write_tser:
                        self._timer.start('Write Time-Series Variables')
                        in_var = in_file.variables[out_name]
                        ndims = len(in_var.dimensions)
                        udidx = in_var.dimensions.index(self._unlimited_dim)
                        in_slice = [slice(None)] * ndims
                        in_slice[udidx] = slice_step_index
                        out_slice = [slice(None)] * ndims
                        out_slice[udidx] = series_step_index
                        out_var[tuple(out_slice)] = in_var[tuple(in_slice)]

                        requested_nbytes = in_file.variables[
                            out_name][:].nbytes
                        self._byte_counts['Requested Data'] += requested_nbytes
                        actual_nbytes = self.assumed_block_size \
                            * numpy.ceil(requested_nbytes / self.assumed_block_size)
                        self._byte_counts['Actual Data'] += actual_nbytes
                        self._timer.stop('Write Time-Series Variables')

                    # Increment the time-series step index
                    series_step_index += 1

            # Close the output file
            self._timer.start('Close Output Files')
            out_file.close()
            self._timer.stop('Close Output Files')
            dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Closed "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

        # Information
        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint(
                'Finished converting time-slices to time-series.', verbosity=1)

        # Finish clocking the entire convert procedure
        self._timer.stop('Complete Conversion Process')

    def print_diagnostics(self):
        """
        Report the timing and I/O statistics gathered so far

        The timer readings of all ranks are reduced with 'max' and the byte
        counters with 'sum'.  Both tables are formatted on every rank, but
        only the manager rank prints them (at verbosity 0).  The byte totals
        are divided by 1024 * 1024 so that they are reported in MB.
        """
        comm = self._simplecomm

        # Reductions first (every rank takes part), then synchronize
        max_times = comm.allreduce(self._timer.get_all_times(), op='max')
        total_bytes = comm.allreduce(self._byte_counts, op='sum')
        comm.sync()

        # Timing table, passing the timer's names as the row ordering
        time_table_str = _pprint_dictionary(
            'TIMING DATA', max_times, order=self._timer.get_names())
        if comm.is_manager():
            self._vprint(time_table_str, verbosity=0)

        # Byte-count table: rescale each summed counter from bytes to MB
        bytes_per_mb = float(1024 * 1024)
        for name in total_bytes:
            total_bytes[name] = total_bytes[name] / bytes_per_mb
        byte_count_str = _pprint_dictionary('BYTE COUNTS (MB)', total_bytes)
        if comm.is_manager():
            self._vprint(byte_count_str, verbosity=0)
Example #11
0
    def __init__(self, specifier, serial=False, verbosity=1, wmode='w', once=False, simplecomm=None):
        """
        Set up a Reshaper from an input specification

        Parameters:
            specifier (Specifier): The input specification for this
                reshaper operation.  It is validated here.
            serial (bool): Run in serial (True) or in parallel (False).
                Only consulted when no ``simplecomm`` is supplied, in which
                case it is forwarded to ``create_comm``.
            verbosity (int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.
            wmode (str): Output write mode: 'w' for a normal write, 's' to
                skip existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series
                files.
            once (bool): Whether all metadata should be written to a
                separate 'once' file.
            simplecomm (SimpleComm): Communicator used for the parallel
                communication.  A new one is created when this is None.

        Raises:
            TypeError: If an argument does not have the required type.
            ValueError: If ``wmode`` is not one of 'w', 's', 'o' or 'a'.
        """

        # Argument validation (the flags must be of the exact builtin type)
        if not isinstance(specifier, Specifier):
            raise TypeError("Input must be given in the form of a Specifier object")
        if type(serial) is not bool:
            raise TypeError("Serial indicator must be True or False.")
        if type(verbosity) is not int:
            raise TypeError("Verbosity level must be an integer.")
        if type(wmode) is not str:
            raise TypeError("Write mode flag must be a str.")
        if type(once) is not bool:
            raise TypeError("Once-file indicator must be True or False.")
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            raise TypeError("Simple communicator object is not a SimpleComm")
        if wmode not in ('w', 's', 'o', 'a'):
            raise ValueError("Write mode '{0}' not recognized".format(wmode))

        # Once-file switch and output write mode
        self._use_once_file = once
        self._write_mode = wmode

        # Internal timer; setting up the communicator is the first timed step
        self._timer = TimeKeeper()
        self._timer.start('Initializing Simple Communicator')
        self._simplecomm = create_comm(serial=serial) if simplecomm is None else simplecomm
        self._timer.stop('Initializing Simple Communicator')
        comm = self._simplecomm

        # Bookkeeping for the amounts of data read/written
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Verbose printer, given a "[rank/size] " header
        rank_str = str(comm.get_rank())
        size_str = str(comm.get_size())
        self._vprint = VPrinter(
            header='[' + rank_str + '/' + size_str + '] ', verbosity=verbosity)
        vprint = self._vprint

        # Announce the start of the initialization
        if comm.is_manager():
            vprint('Initializing Reshaper...', verbosity=0)
            vprint('  MPI Communicator Size: {}'.format(comm.get_size()), verbosity=1)

        # Validate the user input data (timed)
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if comm.is_manager():
            vprint('  Specifier validated', verbosity=1)

        # Use the requested I/O backend, or fall back to the one iobackend
        # reports.  NOTE: the fallback message is not limited to the manager.
        requested_backend = specifier.io_backend
        if not iobackend.is_available(requested_backend):
            self._backend = iobackend.get_backend()
            vprint('  I/O Backend {0} not available.  Using {1} instead'.format(
                requested_backend, self._backend), verbosity=1)
        else:
            self._backend = requested_backend

        # Input file names
        self._input_filenames = specifier.input_file_list

        # Optional restriction to an explicit set of time-series variables
        self._time_series_names = specifier.time_series
        if self._time_series_names is not None:
            vnames = ', '.join(self._time_series_names)
            if comm.is_manager():
                vprint('WARNING: Extracting only variables: {0}'.format(vnames),
                       verbosity=-1)

        # Metadata handling: explicit time-variant metadata names, the
        # "1D time-variant variables are metadata" switch, the separate
        # metadata file, and the time-invariant variables to exclude
        self._metadata_names = specifier.time_variant_metadata
        self._1d_metadata = specifier.assume_1d_time_variant_metadata
        self._metadata_filename = specifier.metadata_filename
        self._exclude_list = specifier.exclude_list

        # Output file prefix and suffix
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # NetCDF file options
        self._netcdf_format = specifier.netcdf_format
        self._netcdf_compression = specifier.compression_level
        self._netcdf_least_significant_digit = specifier.least_significant_digit
        if comm.is_manager():
            vprint('  NetCDF I/O Backend: {0}'.format(self._backend), verbosity=1)
            vprint('  NetCDF Output Format: {0}'.format(self._netcdf_format), verbosity=1)
            vprint('  NetCDF Compression: {0}'.format(self._netcdf_compression), verbosity=1)
            if self._netcdf_least_significant_digit:
                trunc_str = '{} decimal places'.format(
                    self._netcdf_least_significant_digit)
            else:
                trunc_str = 'Disabled'
            vprint('  NetCDF Truncation: {0}'.format(trunc_str), verbosity=1)

        # Announce the end of the initialization
        if comm.is_manager():
            vprint('...Reshaper initialized.', verbosity=0)

        # Sync before continuing
        comm.sync()
Example #12
0
class Reshaper(object):

    """
    The time-slice to time-series Reshaper class

    This is the class that defines how the time-slice to time-series
    reshaping operation is to be performed.
    """

    def __init__(self, specifier, serial=False, verbosity=1, wmode='w', once=False, simplecomm=None):
        """
        Set up a Reshaper from an input specification

        Parameters:
            specifier (Specifier): The input specification for this
                reshaper operation.  It is validated here.
            serial (bool): Run in serial (True) or in parallel (False).
                Only consulted when no ``simplecomm`` is supplied, in which
                case it is forwarded to ``create_comm``.
            verbosity (int): Level of printed output (stdout).  A value of 0
                means no output, and a higher value means more output.
            wmode (str): Output write mode: 'w' for a normal write, 's' to
                skip existing time-series files, 'o' to overwrite existing
                time-series files, 'a' to append to existing time-series
                files.
            once (bool): Whether all metadata should be written to a
                separate 'once' file.
            simplecomm (SimpleComm): Communicator used for the parallel
                communication.  A new one is created when this is None.

        Raises:
            TypeError: If an argument does not have the required type.
            ValueError: If ``wmode`` is not one of 'w', 's', 'o' or 'a'.
        """

        # Argument validation (the flags must be of the exact builtin type)
        if not isinstance(specifier, Specifier):
            raise TypeError("Input must be given in the form of a Specifier object")
        if type(serial) is not bool:
            raise TypeError("Serial indicator must be True or False.")
        if type(verbosity) is not int:
            raise TypeError("Verbosity level must be an integer.")
        if type(wmode) is not str:
            raise TypeError("Write mode flag must be a str.")
        if type(once) is not bool:
            raise TypeError("Once-file indicator must be True or False.")
        if simplecomm is not None and not isinstance(simplecomm, SimpleComm):
            raise TypeError("Simple communicator object is not a SimpleComm")
        if wmode not in ('w', 's', 'o', 'a'):
            raise ValueError("Write mode '{0}' not recognized".format(wmode))

        # Once-file switch and output write mode
        self._use_once_file = once
        self._write_mode = wmode

        # Internal timer; setting up the communicator is the first timed step
        self._timer = TimeKeeper()
        self._timer.start('Initializing Simple Communicator')
        self._simplecomm = create_comm(serial=serial) if simplecomm is None else simplecomm
        self._timer.stop('Initializing Simple Communicator')
        comm = self._simplecomm

        # Bookkeeping for the amounts of data read/written
        self.assumed_block_size = float(4 * 1024 * 1024)
        self._byte_counts = {}

        # Verbose printer, given a "[rank/size] " header
        rank_str = str(comm.get_rank())
        size_str = str(comm.get_size())
        self._vprint = VPrinter(
            header='[' + rank_str + '/' + size_str + '] ', verbosity=verbosity)
        vprint = self._vprint

        # Announce the start of the initialization
        if comm.is_manager():
            vprint('Initializing Reshaper...', verbosity=0)
            vprint('  MPI Communicator Size: {}'.format(comm.get_size()), verbosity=1)

        # Validate the user input data (timed)
        self._timer.start('Specifier Validation')
        specifier.validate()
        self._timer.stop('Specifier Validation')
        if comm.is_manager():
            vprint('  Specifier validated', verbosity=1)

        # Use the requested I/O backend, or fall back to the one iobackend
        # reports.  NOTE: the fallback message is not limited to the manager.
        requested_backend = specifier.io_backend
        if not iobackend.is_available(requested_backend):
            self._backend = iobackend.get_backend()
            vprint('  I/O Backend {0} not available.  Using {1} instead'.format(
                requested_backend, self._backend), verbosity=1)
        else:
            self._backend = requested_backend

        # Input file names
        self._input_filenames = specifier.input_file_list

        # Optional restriction to an explicit set of time-series variables
        self._time_series_names = specifier.time_series
        if self._time_series_names is not None:
            vnames = ', '.join(self._time_series_names)
            if comm.is_manager():
                vprint('WARNING: Extracting only variables: {0}'.format(vnames),
                       verbosity=-1)

        # Metadata handling: explicit time-variant metadata names, the
        # "1D time-variant variables are metadata" switch, the separate
        # metadata file, and the time-invariant variables to exclude
        self._metadata_names = specifier.time_variant_metadata
        self._1d_metadata = specifier.assume_1d_time_variant_metadata
        self._metadata_filename = specifier.metadata_filename
        self._exclude_list = specifier.exclude_list

        # Output file prefix and suffix
        self._output_prefix = specifier.output_file_prefix
        self._output_suffix = specifier.output_file_suffix

        # NetCDF file options
        self._netcdf_format = specifier.netcdf_format
        self._netcdf_compression = specifier.compression_level
        self._netcdf_least_significant_digit = specifier.least_significant_digit
        if comm.is_manager():
            vprint('  NetCDF I/O Backend: {0}'.format(self._backend), verbosity=1)
            vprint('  NetCDF Output Format: {0}'.format(self._netcdf_format), verbosity=1)
            vprint('  NetCDF Compression: {0}'.format(self._netcdf_compression), verbosity=1)
            if self._netcdf_least_significant_digit:
                trunc_str = '{} decimal places'.format(
                    self._netcdf_least_significant_digit)
            else:
                trunc_str = 'Disabled'
            vprint('  NetCDF Truncation: {0}'.format(trunc_str), verbosity=1)

        # Announce the end of the initialization
        if comm.is_manager():
            vprint('...Reshaper initialized.', verbosity=0)

        # Sync before continuing
        comm.sync()

    def _inspect_input_files(self):
        """
        Inspect the input data files themselves.

        We check the file contents here, which means opening and reading
        header information from the files.  The work is done in stages:

        1. On the manager rank only, the first input file is opened to find
           the unlimited dimension and to sort each variable into one of
           three categories:

           * time-invariant metadata: the variable does not use the
             unlimited dimension (and is not in the exclude list),
           * time-variant metadata: the variable uses the unlimited
             dimension and is either named in the metadata names or is 1D
             while 1D variables are being treated as metadata,
           * time-series variable: anything else, restricted to the
             requested time-series names when such a list was given.  Its
             size (item size times element count) is recorded.

           If a metadata file was given, its variables that do not use the
           unlimited dimension and are not already listed are recorded as
           additional time-invariant metadata.
        2. These results are duplicated to all ranks, the remaining input
           files are divided among the ranks, and each of them is checked
           for the unlimited dimension and its variable.
        3. Missing variables and the time values of all files are collected
           on the manager rank, which prints a warning about missing
           variables, sorts the input files by their first time value and
           verifies that the time ranges of consecutive files do not overlap.
        4. The time-series variables (plus the 'once' pseudo-variable when a
           once-file is written) are partitioned across the ranks, with the
           recorded sizes passed along as weights.

        Every file opened here is closed again, even when a check fails.

        Raises:
            LookupError: If the unlimited dimension (or its variable) cannot
                be found in an input file, or is not unlimited there.
            ValueError: If the time values of two input files overlap.
        """
        # Set the I/O backend according to what is specified
        iobackend.set_backend(self._backend)

        # Initialize the list of variable names for each category
        udim = None
        timeta = []
        xtra_timeta = []
        tvmeta = []

        # Initialize the local dictionary of time-series variables and sizes
        all_tsvars = {}
        file_times = {}

        #===== INSPECT FIRST INPUT FILE (ON MASTER PROCESS ONLY) =====

        if self._simplecomm.is_manager():

            # Open first file
            first_filename = self._input_filenames[0]
            ifile = iobackend.NCFile(first_filename)
            try:
                # Look for the 'unlimited' dimension
                udim = next(
                    (dim for dim in ifile.dimensions if ifile.unlimited(dim)), None)
                if udim is None:
                    err_msg = 'Unlimited dimension not found.'
                    raise LookupError(err_msg)

                # Get the first file's time values
                file_times[first_filename] = ifile.variables[udim][:]

                # Categorize each variable (only looking at first file)
                for var_name, var in ifile.variables.items():
                    if udim not in var.dimensions:
                        if var_name not in self._exclude_list:
                            timeta.append(var_name)
                    elif var_name in self._metadata_names or (self._1d_metadata and len(var.dimensions) == 1):
                        tvmeta.append(var_name)
                    elif self._time_series_names is None or var_name in self._time_series_names:
                        all_tsvars[var_name] = var.datatype.itemsize * var.size
            finally:
                # Close the first file, also when the inspection failed
                ifile.close()

            # Find variables only in the metadata file
            if self._metadata_filename is not None:
                ifile = iobackend.NCFile(self._metadata_filename)
                try:
                    for var_name, var in ifile.variables.items():
                        if udim not in var.dimensions and var_name not in timeta:
                            xtra_timeta.append(var_name)
                finally:
                    ifile.close()

        self._simplecomm.sync()

        # Send information to worker processes
        self._unlimited_dim = self._simplecomm.partition(
            udim, func=Duplicate(), involved=True)
        self._time_invariant_metadata = self._simplecomm.partition(
            timeta, func=Duplicate(), involved=True)
        self._time_invariant_metafile_vars = self._simplecomm.partition(
            xtra_timeta, func=Duplicate(), involved=True)
        self._time_variant_metadata = self._simplecomm.partition(
            tvmeta, func=Duplicate(), involved=True)
        all_tsvars = self._simplecomm.partition(
            all_tsvars, func=Duplicate(), involved=True)

        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint('  First input file inspected.', verbosity=2)

        #===== INSPECT REMAINING INPUT FILES (IN PARALLEL) =====

        # Get the list of variable names and missing variables
        # (list() keeps the concatenation valid where keys() is a view)
        var_names = set(
            list(all_tsvars.keys()) + self._time_invariant_metadata +
            self._time_invariant_metafile_vars + self._time_variant_metadata)
        missing_vars = set()

        # Partition the remaining filenames to inspect
        input_filenames = self._simplecomm.partition(
            self._input_filenames[1:], func=EqualStride(), involved=True)

        # Make a pass through remaining files and:
        # (1) Make sure it has the 'unlimited' dimension
        # (2) Make sure this dimension is truly 'unlimited'
        # (3) Check that this dimension has a corresponding variable
        # (4) Check if there are any missing variables
        # (5) Get the time values from the files
        for ifilename in input_filenames:
            ifile = iobackend.NCFile(ifilename)
            try:
                # Determine the unlimited dimension
                if self._unlimited_dim not in ifile.dimensions:
                    err_msg = 'Unlimited dimension not found in file "{0}"'.format(
                        ifilename)
                    raise LookupError(err_msg)
                if not ifile.unlimited(self._unlimited_dim):
                    err_msg = 'Dimension "{0}" not unlimited in file "{1}"'.format(
                        self._unlimited_dim, ifilename)
                    raise LookupError(err_msg)
                if self._unlimited_dim not in ifile.variables:
                    err_msg = 'Unlimited dimension variable not found in file "{0}"'.format(
                        ifilename)
                    raise LookupError(err_msg)

                # Get the time values (list of NDArrays)
                file_times[ifilename] = ifile.variables[self._unlimited_dim][:]

                # Get the missing variables
                var_names_next = set(ifile.variables.keys())
                missing_vars.update(var_names - var_names_next)
            finally:
                # Close the file, also when one of the checks failed
                ifile.close()

        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint('  Remaining input files inspected.', verbosity=2)

        #===== CHECK FOR MISSING VARIABLES =====

        # Gather all missing variables on the master process
        if self._simplecomm.get_size() > 1:
            if self._simplecomm.is_manager():
                for _ in range(1, self._simplecomm.get_size()):
                    missing_vars.update(self._simplecomm.collect()[1])
            else:
                self._simplecomm.collect(missing_vars)
        self._simplecomm.sync()

        # Check for missing variables only on master process
        if self._simplecomm.is_manager():

            # Remove metafile variables from missing vars set
            missing_vars -= set(self._time_invariant_metafile_vars)

            # Make sure that the list of variables in each file is the same
            if len(missing_vars) != 0:
                warning = ("WARNING: Some variables are not in all input files:{0}   "
                           "{1}").format(linesep, ', '.join(sorted(missing_vars)))
                self._vprint(warning, header=False, verbosity=0)

            self._vprint('  Checked for missing variables.', verbosity=2)

        #===== SORT INPUT FILES BY TIME =====

        # Gather the file time values onto the master process
        if self._simplecomm.get_size() > 1:
            if self._simplecomm.is_manager():
                for _ in range(1, self._simplecomm.get_size()):
                    file_times.update(self._simplecomm.collect()[1])
            else:
                self._simplecomm.collect(file_times)
        self._simplecomm.sync()

        # Check the order of the input files based on the time values
        if self._simplecomm.is_manager():

            # Determine the sort order based on the first time in the time
            # values
            old_order = range(len(self._input_filenames))
            new_order = sorted(
                old_order, key=lambda i: file_times[self._input_filenames[i]][0])

            # Re-order the list of input filenames and time values
            new_filenames = [self._input_filenames[i] for i in new_order]
            new_values = [file_times[self._input_filenames[i]]
                          for i in new_order]

            # Now, check that the largest time in each file is less than the smallest time
            # in the next file (so that the time spans of each file do not
            # overlap)
            for i in range(1, len(new_values)):
                if new_values[i - 1][-1] >= new_values[i][0]:
                    err_msg = ('Times in input files {0} and {1} appear to '
                               'overlap').format(new_filenames[i - 1], new_filenames[i])
                    raise ValueError(err_msg)

        else:
            new_filenames = None

        # Now that this is validated, save the time values and filename in the
        # new order
        self._input_filenames = self._simplecomm.partition(
            new_filenames, func=Duplicate(), involved=True)

        if self._simplecomm.is_manager():
            self._vprint('  Input files sorted by time.', verbosity=2)

        #===== FINALIZING OUTPUT =====
        self._simplecomm.sync()

        # Debug output
        if self._simplecomm.is_manager():
            self._vprint('  Time-Invariant Metadata: {0}'.format(
                ', '.join(self._time_invariant_metadata)), verbosity=1)
            if len(self._time_invariant_metafile_vars) > 0:
                self._vprint('  Additional Time-Invariant Metadata: {0}'.format(
                    ', '.join(self._time_invariant_metafile_vars)), verbosity=1)
            self._vprint('  Time-Variant Metadata: {0}'.format(
                ', '.join(self._time_variant_metadata)), verbosity=1)
            self._vprint(
                '  Time-Series Variables: {0}'.format(', '.join(all_tsvars.keys())), verbosity=1)

        # Add 'once' variable if writing to a once file
        # NOTE: This is a "cheat"!  There is no 'once' variable.  It's just
        #       a catch for all metadata IFF the 'once-file' is enabled.
        #       It is given the largest weight of all time-series variables,
        #       or 0 when there are none (max() of an empty sequence raises).
        if self._use_once_file:
            all_tsvars['once'] = max(all_tsvars.values()) if all_tsvars else 0

        # Partition the time-series variables across processors
        self._time_series_variables = self._simplecomm.partition(
            list(all_tsvars.items()), func=WeightBalanced(), involved=True)

    def _inspect_output_files(self):
        """
        Perform inspection of the output data files themselves.

        We compute the output file name of each time-series variable from
        the prefix and suffix, and then we check whether the output files
        exist.  What happens to existing files depends on the write mode:

        * 'o': the existing files are deleted,
        * 's': the variables with existing files are removed from the list
          of time-series variables to convert,
        * 'a': each existing file is opened and checked for the unlimited
          dimension, the time-variant metadata and the time-series variable
          it is expected to hold; the current length of its unlimited
          dimension becomes the starting step index for that variable,
        * otherwise: the job is stopped if any output file exists.

        Raises:
            RuntimeError: If, in append mode, an existing file lacks the
                unlimited dimension, a time-variant metadata variable or its
                time-series variable; or if, in the default mode, any output
                file already exists.
        """
        iobackend.set_backend(self._backend)

        # Loop through the time-series variables and generate output filenames
        self._time_series_filenames = {
            variable: self._output_prefix + variable + self._output_suffix
            for variable in self._time_series_variables}

        # Find which files already exist
        self._existing = [v for (v, f) in self._time_series_filenames.items()
                          if isfile(f)]

        # Set the starting step index for each variable
        self._time_series_step_index = {
            variable: 0 for variable in self._time_series_variables}

        # If overwrite is enabled, delete all existing files first
        if self._write_mode == 'o':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Deleting existing output files for time-series '
                             'variables: {0}'.format(', '.join(sorted(self._existing))), verbosity=0)
            for variable in self._existing:
                remove(self._time_series_filenames[variable])
            self._existing = []

        # Or, if skip existing is set, remove the existing time-series
        # variables from the list of time-series variables to convert
        elif self._write_mode == 's':
            if self._simplecomm.is_manager() and len(self._existing) > 0:
                self._vprint('WARNING: Skipping time-series variables with '
                             'existing output files: {0}'.format(', '.join(sorted(self._existing))), verbosity=0)
            for variable in self._existing:
                self._time_series_variables.remove(variable)

        # Or, if appending, check that the existing output files conform
        # to the expected pattern
        elif self._write_mode == 'a':

            # Check each existing time-series file
            for variable in self._existing:

                # Get the matching filename
                filename = self._time_series_filenames[variable]

                # Open the time-series file for inspection
                tsfile = iobackend.NCFile(filename)
                try:
                    # Check that the file has the unlimited dim and var
                    # (membership is tested first so that a file without
                    # the dimension at all is reported the same way)
                    if (self._unlimited_dim not in tsfile.dimensions
                            or not tsfile.unlimited(self._unlimited_dim)):
                        err_msg = ('Cannot append to time-series file with missing unlimited '
                                   'dimension {0!r}').format(self._unlimited_dim)
                        raise RuntimeError(err_msg)

                    # Check for once file: with a once-file in use, only the
                    # 'once' file holds metadata and only the other files
                    # hold a time-series variable
                    is_once_file = (variable == 'once')
                    needs_meta_data = not (
                        self._use_once_file and not is_once_file)
                    needs_tser_data = not (self._use_once_file and is_once_file)

                    # Look for metadata
                    if needs_meta_data:

                        # Check that the time-variant metadata are all present
                        for metavar in self._time_variant_metadata:
                            if metavar not in tsfile.variables:
                                err_msg = ("Cannot append to time-series file with missing time-variant metadata "
                                           "'{0}'").format(metavar)
                                raise RuntimeError(err_msg)

                    # Check that the time-series variable is present
                    if needs_tser_data and variable not in tsfile.variables:
                        err_msg = ("Cannot append to time-series file with missing time-series variable "
                                   "'{0}'").format(variable)
                        raise RuntimeError(err_msg)

                    # Get the starting step index to start writing from
                    self._time_series_step_index[variable] = tsfile.dimensions[self._unlimited_dim]
                finally:
                    # Close the time-series file, also when a check failed
                    tsfile.close()

        # Otherwise, throw an exception if any existing output files are found
        elif self._existing:
            err_msg = "Found existing output files for time-series variables: {0}".format(
                ', '.join(sorted(self._existing)))
            raise RuntimeError(err_msg)

    def _create_var(self, in_file, out_file, vname, chunks=None):
        in_var = in_file.variables[vname]
        fill_value = in_var.fill_value
        if in_var.chunk_sizes is not None and chunks is not None:
            chunksizes = [chunks[d] if d in chunks else c
                          for d, c in zip(in_var.dimensions, in_var.chunk_sizes)]
        else:
            chunksizes = None
        out_var = out_file.create_variable(
            vname, in_var.datatype, in_var.dimensions, fill_value=fill_value, chunksizes=chunksizes)
        for att_name in in_var.ncattrs:
            att_value = in_var.getncattr(att_name)
            out_var.setncattr(att_name, att_value)

    def _chunk_iter(self, vobj, chunks={}, corder=True):
        """
        This is a generator function to iterator over chunks of arrays with named dimensions

        Parameters:
            vobj: A NetCDF file variable object with dimensions and shape attributes
            chunks (dict): A dictionary of dimension names mapped to chunk sizes along that
                named dimension
            corder (bool): Whether to assume the array has C-style axis ordering, where the
                fastest changing dimension is assumed to be the first axis.  If False, then
                the fastest changing dimension is assumed to be the last.
        """
        dimensions = vobj.dimensions
        shape = vobj.shape

        nchunks = 1
        dchunks = []
        for dname, dlen in zip(dimensions, shape):
            if dname in chunks:
                clen = chunks[dname]
                cnum = dlen // clen
                if dlen % clen > 0:
                    cnum += 1
                nchunks *= cnum
            else:
                clen = dlen
                cnum = 1
            dchunks.append((dlen, clen, cnum))

        for n in xrange(nchunks):
            cidx = []
            nidx = n
            nstride = nchunks
            if corder:
                diter = reversed(dchunks)
            else:
                diter = iter(dchunks)
            for dlen, clen, cnum in diter:
                nstride = nstride // cnum
                cidx.append(nidx // nstride)
                nidx = nidx % nstride
            if corder:
                cidx.reverse()

            cslice = []
            for d in xrange(len(shape)):
                ic = cidx[d]
                dlen, clen, cnum = dchunks[d]

                ibeg = ic * clen
                iend = (ic + 1) * clen
                if iend >= dlen:
                    iend = dlen

                cslice.append(slice(ibeg, iend))

            yield tuple(cslice)

    def _offset_chunk(self, chunk, vobj, offset):
        """
        Compute a new chunk/slice for a variable with a given offset

        Parameters:
            chunk (tuple): A tuple of slices across each dimension
            vobj: A NetCDF file variable object with dimensions and shape attributes
            offset (dict): Offsets for each dimension (if any)

        Returns:
            tuple: A tuple of slices across each dimension with offsets added
        """
        new_chunk = []
        for i, d in enumerate(vobj.dimensions):
            if d in offset:
                o = offset[d]
            else:
                o = 0
            new_chunk.append(slice(chunk[i].start + o, chunk[i].stop + o))
        return tuple(new_chunk)

    def _copy_var(self, kind, in_var, out_var, chunks={}, offsets={}):
        """
        Copy variable data from one variable object to another via chunking

        Parameters:
            kind (str): A string describing the kind of variable being copied
            in_var: A NetCDF variable object to read data from
            out_var: A NetCDF variable object to write data to
            chunks (dict): A dictionary of dimension names mapped to chunk sizes along that named dimension
            offsets (dict): Integer offsets along each dimension
        """
        for rslice in self._chunk_iter(in_var, chunks=chunks):

            self._timer.start('Read {0}'.format(kind))
            tmp_data = in_var[rslice]
            self._timer.stop('Read {0}'.format(kind))
            wslice = self._offset_chunk(rslice, out_var, offsets)
            self._timer.start('Write {0}'.format(kind))
            out_var[wslice] = tmp_data
            self._timer.stop('Write {0}'.format(kind))

            requested_nbytes = tmp_data.nbytes if hasattr(
                tmp_data, 'nbytes') else 0
            self._byte_counts['Requested Data'] += requested_nbytes
            actual_nbytes = (self.assumed_block_size *
                             numpy.ceil(requested_nbytes / self.assumed_block_size))
            self._byte_counts['Actual Data'] += actual_nbytes

    def convert(self, output_limit=0, rchunks=None, wchunks=None):
        """
        Method to perform the Reshaper's designated operation.

        In this case, convert a list of time-slice files to time-series files.

        Every output file is written under a temporary name (the output file name with
        '_temp_.nc' appended) and renamed to its final name only after it has been closed.
        When appending, the existing output file is first renamed to the temporary name
        and then extended in place.

        Parameters:
            output_limit (int): Limit on the number of output (time-series) files to write during the
                convert() operation.  If set to 0, no limit is placed.  This limits the number of output files
                produced by each processor in a parallel run.
            rchunks (dict): A dictionary of dimension names mapped to reading chunk sizes along that named
                dimension.  If None, data is read one step of the unlimited dimension at a time.
            wchunks (dict): A dictionary of dimension names mapped to writing chunk sizes along that named
                dimension.  It is only used when the time-series variable is created in a new output file.

        Raises:
            TypeError: If output_limit is not an int, or if rchunks is not a dictionary with string-type
                keys and integer chunk sizes
            ValueError: If any of the read chunk sizes is not positive
        """
        iobackend.set_backend(self._backend)

        # Type checking input
        if type(output_limit) is not int:
            err_msg = 'Output limit must be an integer'
            raise TypeError(err_msg)

        # Start the total convert process timer
        self._timer.start('Complete Conversion Process')

        # Validate the input files themselves
        if self._simplecomm.is_manager():
            self._vprint('Inspecting input files...', verbosity=0)
        self._timer.start('Inspect Input Files')
        self._inspect_input_files()
        self._timer.stop('Inspect Input Files')
        if self._simplecomm.is_manager():
            self._vprint('...Input files inspected.', verbosity=0)

        # Validate the output files
        if self._simplecomm.is_manager():
            self._vprint('Inspecting output files...', verbosity=0)
        self._timer.start('Inspect Output Files')
        self._inspect_output_files()
        self._timer.stop('Inspect Output Files')
        if self._simplecomm.is_manager():
            self._vprint('...Output files inspected.', verbosity=0)

        # Check the read chunking
        if rchunks is None:
            # Default chunking is over 1 time-step at a time
            rchunks = {self._unlimited_dim: 1}
        if not isinstance(rchunks, dict):
            err_msg = 'Chunks must be specified with a dictionary'
            raise TypeError(err_msg)
        for key, value in rchunks.iteritems():
            if not isinstance(key, basestring):
                err_msg = 'Chunks dictionary must have string-type keys'
                raise TypeError(err_msg)
            if not isinstance(value, int):
                err_msg = 'Chunks dictionary must have integer chunk sizes'
                raise TypeError(err_msg)
            if value <= 0:
                # Reject this now, before any output file is opened or renamed: a zero
                # chunk size cannot be iterated and a negative one would copy no data
                err_msg = 'Chunks dictionary must have positive chunk sizes'
                raise ValueError(err_msg)

        # Debugging output
        if self._simplecomm.is_manager():
            if len(rchunks) > 0:
                self._vprint('Read chunk sizes:', verbosity=1)
                for dname in rchunks:
                    self._vprint('  {!s}: {}'.format(
                        dname, rchunks[dname]), verbosity=1)
            else:
                self._vprint('No read chunking specified.', verbosity=1)
            self._vprint(
                'Converting time-slices to time-series...', verbosity=0)
        self._simplecomm.sync()

        # Select the time-series variables to convert, honoring the output limit
        # NOTE(review): no partitioning happens here; the list is presumably already
        # specific to this rank by the time convert() reads it -- confirm
        tsv_names_loc = self._time_series_variables
        if output_limit > 0:
            tsv_names_loc = tsv_names_loc[0:output_limit]

        # Print the selected variables
        dbg_msg = 'Converting time-series variables: {0}'.format(
            ', '.join(tsv_names_loc))
        self._vprint(dbg_msg, header=True, verbosity=1)

        # Reset all of the timer values (as it is possible that there are no
        # time-series variables in the local list produced above)
        for timer_name in ('Open Output Files',
                           'Close Output Files',
                           'Open Input Files',
                           'Close Input Files',
                           'Create Time-Invariant Metadata',
                           'Create Time-Variant Metadata',
                           'Create Time-Series Variables',
                           'Read Time-Invariant Metadata',
                           'Read Time-Variant Metadata',
                           'Read Time-Series Variables',
                           'Write Time-Invariant Metadata',
                           'Write Time-Variant Metadata',
                           'Write Time-Series Variables'):
            self._timer.reset(timer_name)

        # Initialize the byte count dictionary
        self._byte_counts['Requested Data'] = 0
        self._byte_counts['Actual Data'] = 0

        #===== LOOP OVER TIME_SERIES VARIABLES =====

        # The separate metadata file is only opened if variables are taken from it
        if len(self._time_invariant_metafile_vars) > 0:
            metafile = iobackend.NCFile(self._metadata_filename)
        else:
            metafile = None

        # Loop over all time-series variables
        for out_name in tsv_names_loc:

            # Once-file data, for convenience.  With a "once" file in use, the
            # metadata goes only into the "once" file and the time-series data only
            # into the other files; without one, every file gets both.
            is_once_file = (out_name == 'once')
            write_meta_data = not (self._use_once_file and not is_once_file)
            write_tser_data = not (self._use_once_file and is_once_file)

            # Determine the output file name for this variable
            out_filename = self._time_series_filenames[out_name]
            dbg_msg = 'Opening output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Opening "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

            # Open the output file (under its temporary name)
            self._timer.start('Open Output Files')
            temp_filename = out_filename + '_temp_.nc'
            if exists(temp_filename):
                remove(temp_filename)
            appending = (self._write_mode == 'a' and
                         out_name in self._existing)
            if appending:
                # Continue writing into the existing file, under the temporary name
                rename(out_filename, temp_filename)
            out_file = iobackend.NCFile(temp_filename,
                                        mode='a' if appending else 'w',
                                        ncfmt=self._netcdf_format,
                                        compression=self._netcdf_compression,
                                        least_significant_digit=self._netcdf_least_significant_digit)
            self._timer.stop('Open Output Files')

            # Start the loop over input files (i.e., time-slices), writing from
            # the step index recorded for this variable
            offsets = {
                self._unlimited_dim: self._time_series_step_index[out_name]}
            for in_filename in self._input_filenames:

                # Open the input file
                self._timer.start('Open Input Files')
                in_file = iobackend.NCFile(in_filename)
                self._timer.stop('Open Input Files')

                # Create header info, if this is the first input file
                # (not needed when appending to an existing file)
                if in_filename == self._input_filenames[0] and not appending:

                    # Copy file attributes and dimensions to output file
                    for name in in_file.ncattrs:
                        out_file.setncattr(name, in_file.getncattr(name))
                    for name, val in in_file.dimensions.iteritems():
                        if name == self._unlimited_dim:
                            # No size is given for the unlimited dimension
                            out_file.create_dimension(name)
                        else:
                            out_file.create_dimension(name, val)

                    # Create the metadata variables
                    if write_meta_data:

                        # Time-invariant metadata variables
                        self._timer.start('Create Time-Invariant Metadata')
                        for name in self._time_invariant_metadata:
                            self._create_var(in_file, out_file, name)
                        for name in self._time_invariant_metafile_vars:
                            self._create_var(metafile, out_file, name)
                        self._timer.stop('Create Time-Invariant Metadata')

                        # Time-variant metadata variables
                        self._timer.start('Create Time-Variant Metadata')
                        for name in self._time_variant_metadata:
                            self._create_var(in_file, out_file, name)
                        self._timer.stop('Create Time-Variant Metadata')

                    # Create the time-series variable
                    if write_tser_data:

                        # Time-series variable
                        self._timer.start('Create Time-Series Variables')
                        self._create_var(in_file, out_file,
                                         out_name, chunks=wchunks)
                        self._timer.stop('Create Time-Series Variables')

                    dbg_msg = 'Writing output file for variable: {0}'.format(
                        out_name)
                    if is_once_file:
                        dbg_msg = 'Writing "once" file.'
                    self._vprint(dbg_msg, header=True, verbosity=1)

                    # Copy the time-invariant metadata (done once, with no offsets)
                    if write_meta_data:
                        for name in self._time_invariant_metadata:
                            in_var = in_file.variables[name]
                            out_var = out_file.variables[name]
                            self._copy_var('Time-Invariant Metadata',
                                           in_var, out_var, chunks=rchunks)
                        for name in self._time_invariant_metafile_vars:
                            in_var = metafile.variables[name]
                            out_var = out_file.variables[name]
                            self._copy_var('Time-Invariant Metadata',
                                           in_var, out_var, chunks=rchunks)

                # Copy the time-variant metadata
                if write_meta_data:
                    for name in self._time_variant_metadata:
                        in_var = in_file.variables[name]
                        out_var = out_file.variables[name]
                        self._copy_var('Time-Variant Metadata', in_var,
                                       out_var, chunks=rchunks, offsets=offsets)

                # Copy the time-series variables
                if write_tser_data:
                    in_var = in_file.variables[out_name]
                    out_var = out_file.variables[out_name]
                    self._copy_var('Time-Series Variables', in_var,
                                   out_var, chunks=rchunks, offsets=offsets)

                # Increment the time-series index offset by the number of steps
                # in this input file, so the next file is written after it
                offsets[self._unlimited_dim] += in_file.dimensions[self._unlimited_dim]

                # Close the input file
                self._timer.start('Close Input Files')
                in_file.close()
                self._timer.stop('Close Input Files')

            # Close the output file and move it to its final name
            self._timer.start('Close Output Files')
            out_file.close()
            rename(temp_filename, out_filename)
            self._timer.stop('Close Output Files')

            # Output message to user
            dbg_msg = 'Closed output file for variable: {0}'.format(out_name)
            if is_once_file:
                dbg_msg = 'Closed "once" file.'
            self._vprint(dbg_msg, header=True, verbosity=1)

        # Close the metadata file, if necessary
        if metafile:
            metafile.close()

        # Information
        self._simplecomm.sync()
        if self._simplecomm.is_manager():
            self._vprint(
                '...Finished converting time-slices to time-series.', verbosity=0)

        # Finish clocking the entire convert procedure
        self._timer.stop('Complete Conversion Process')

    def print_diagnostics(self):
        """
        Print out timing and I/O information collected up to this point

        The timings and the memory use are reduced with a maximum, and the byte counts
        with a sum, through the communicator.  The reductions and the formatting of the
        tables are performed unconditionally; the tables themselves are only printed
        where the communicator reports being the manager.
        """
        comm = self._simplecomm

        # Reduce the local measurements: maxima for time and memory, totals for bytes
        worst_times = comm.allreduce(self._timer.get_all_times(), op='max')
        local_memory = {'Maximum Memory Use': _get_memory_usage_MB_()}
        worst_memory = comm.allreduce(local_memory, op='max')
        summed_bytes = comm.allreduce(self._byte_counts, op='sum')

        # Synchronize
        comm.sync()

        # Timing table, with its rows ordered by the timer's own list of names
        timing_table = _pprint_dictionary(
            'TIMING DATA', worst_times, order=self._timer.get_names())
        if comm.is_manager():
            self._vprint(timing_table, verbosity=-1)

        # Byte-count table, rescaled in place from bytes to MB
        bytes_per_mb = float(1024 * 1024)
        for key in summed_bytes:
            summed_bytes[key] = summed_bytes[key] / bytes_per_mb
        bytes_table = _pprint_dictionary('BYTE COUNTS (MB)', summed_bytes)
        if comm.is_manager():
            self._vprint(bytes_table, verbosity=-1)

        # Memory table (already expressed in MB)
        memory_table = _pprint_dictionary('MEMORY USAGE (MB)', worst_memory)
        if comm.is_manager():
            self._vprint(memory_table, verbosity=-1)