def main(argv=None):
    """Build, validate, and write a PyReshaper specifier file from CLI options.

    Arguments:
    argv (list or None) - command-line argument list passed through to cli();
        None lets cli() fall back to sys.argv.

    Side effects:
    Writes the validated specifier to the path given by opts.specfile.
    """
    opts, args = cli(argv)

    # Create the input object for the Reshaper
    spec = specification.create_specifier()

    # Generate the input file list from (potentially) globs/wildcards.
    # NOTE: a pattern that matches nothing contributes no files (glob returns []).
    full_input_file_list = [fname for infile in args
                            for fname in glob.glob(infile)]

    # Add input to the specifier
    spec.input_file_list = full_input_file_list
    spec.io_backend = opts.backend
    spec.compression_level = opts.compression_level
    spec.least_significant_digit = opts.least_significant_digit
    spec.netcdf_format = opts.netcdf_format
    spec.output_file_prefix = opts.output_prefix
    spec.output_file_suffix = opts.output_suffix
    spec.time_series = opts.time_series
    spec.time_variant_metadata = opts.metadata
    spec.metadata_filename = opts.metafile
    spec.exclude_list = opts.exclude
    spec.assume_1d_time_variant_metadata = opts.meta1d

    # Validate before saving
    spec.validate()

    # Write the specfile
    spec.write(opts.specfile)
def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename,
                   standalone, completechunk, debug, debugMsg):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list
    of reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True. Otherwise, do not
        create incomplete chunks if False
    debug (boolean) - if True, pretty-print each specifier as it is built
    debugMsg (callable) - diagnostic message printer taking (msg, header=...)

    Returns:
    (specifiers, log) - list of pyreshaper specifications, one per output chunk,
        and the updated chunking status log dictionary
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files,
    # if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if (not os.path.isfile(env_timeseries)):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(
            env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries
        # related elements
        for comp_archive_spec in xml_tree.findall(
                "components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text
            default_calendar = comp_archive_spec.find("default_calendar").text
            debugMsg("default_calendar = {0}".format(default_calendar),
                     header=True)

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    if tseries_create.upper() in ["T", "TRUE"]:

                        # check if tseries_format is an element for this
                        # file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find(
                                "tseries_output_format").text
                            if tseries_output_format not in [
                                    "netcdf", "netcdf4", "netcdf4c",
                                    "netcdfLarge"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        # (initialized unconditionally so an absent XML element
                        # yields an empty list, not an unbound name)
                        variable_list = list()
                        if comp_archive_spec.find(
                                "tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall(
                                    "tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # get a list of all the input files for this stream
                        # from the archive location
                        history_files = list()
                        in_file_path = '/'.join(
                            [input_rootdir, rootdir, subdir])

                        # get XML tseries elements for chunking
                        # NOTE(review): tper and size are only bound when their
                        # XML elements exist, yet are passed to get_chunks()
                        # unconditionally below — a missing tseries_filecat_tper
                        # or tseries_filecat_n element would raise NameError (or
                        # silently reuse the previous stream's value); confirm
                        # the XML schema guarantees these elements.
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                        if file_spec.find("tseries_filecat_tper") is not None:
                            tper = file_spec.find("tseries_filecat_tper").text
                        if file_spec.find("tseries_filecat_n") is not None:
                            size = file_spec.find("tseries_filecat_n").text
                        comp_name = comp
                        stream = file_extension.split('.[')[0]

                        # scan the input files for their time slices, calendar,
                        # units, and time_period_freq metadata
                        stream_dates, file_slices, cal, units, time_period_freq = chunking.get_input_dates(
                            in_file_path + '/*' + file_extension + '*.nc')

                        # check if the calendar attribute was read or not
                        if cal is None or cal == "none":
                            cal = default_calendar
                        debugMsg("calendar = {0}".format(cal), header=True)

                        # the tseries_tper should be set in using the
                        # time_period_freq global file attribute if it exists
                        if time_period_freq is not None:
                            tseries_tper = time_period_freq

                        tseries_output_dir = '/'.join([
                            output_rootdir, rootdir, 'proc/tseries',
                            tseries_tper
                        ])
                        debugMsg("tseries_output_dir = {0}".format(
                            tseries_output_dir), header=True)

                        if not os.path.exists(tseries_output_dir):
                            os.makedirs(tseries_output_dir)

                        # seed the status log entry for this stream if this is
                        # the first time we have seen it
                        if comp + stream not in log.keys():
                            log[comp + stream] = {'slices': [], 'index': 0}
                        ts_log_dates = log[comp + stream]['slices']
                        index = log[comp + stream]['index']

                        # partition the not-yet-converted time slices into
                        # output chunks; index tracks where we left off
                        files, dates, index = chunking.get_chunks(
                            tper, index, size, stream_dates, ts_log_dates,
                            cal, units, completechunk)

                        # record the newly claimed slices in the status log
                        for d in dates:
                            log[comp + stream]['slices'].append(float(d))
                        log[comp + stream]['index'] = index

                        # one specifier per chunk
                        # NOTE: dict.iteritems() is Python 2 only
                        for cn, cf in files.iteritems():
                            history_files = cf['fn']
                            start_time_parts = cf['start']
                            last_time_parts = cf['end']

                            # create the tseries output prefix needs to end
                            # with a "."
                            tseries_output_prefix = "{0}/{1}.{2}{3}.".format(
                                tseries_output_dir, casename, comp_name,
                                stream)
                            debugMsg("tseries_output_prefix = {0}".format(
                                tseries_output_prefix), header=True)

                            # format the time series variable output suffix
                            # based on the tseries_tper setting; suffix needs
                            # to start with a "."
                            freq_array = ["week", "day", "hour", "min"]
                            if "year" in tseries_tper:
                                tseries_output_suffix = "." + start_time_parts[0] + "-" + last_time_parts[0] + ".nc"
                            elif "month" in tseries_tper:
                                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + "-" + last_time_parts[0] + last_time_parts[1] + ".nc"
                            elif "day" in tseries_tper:
                                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + start_time_parts[2] + "-" + last_time_parts[0] + last_time_parts[1] + last_time_parts[2] + ".nc"
                            elif any(freq_string in tseries_tper
                                     for freq_string in freq_array):
                                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + start_time_parts[2] + start_time_parts[3] + "-" + last_time_parts[0] + last_time_parts[1] + last_time_parts[2] + last_time_parts[3] + ".nc"
                            else:
                                err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(
                                    tseries_tper)
                                raise TypeError(err_msg)
                            debugMsg("tseries_output_suffix = {0}".format(
                                tseries_output_suffix), header=True)

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this
                            # history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list

                            # print the specifier
                            if debug:
                                dbg = list()
                                pp = pprint.PrettyPrinter(indent=5)
                                dbg = [
                                    comp_name, spec.input_file_list,
                                    spec.netcdf_format,
                                    spec.output_file_prefix,
                                    spec.output_file_suffix,
                                    spec.time_variant_metadata
                                ]
                                pp.pprint(dbg)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)
    return specifiers, log
def readArchiveXML(cesmEnv):
    ''' returns a fully defined list of reshaper specifications

    Arguments:
    cesmEnv (dict) - CESM case environment; keys "DOUT_S_ROOT" and "CASE" are read

    Returns:
    specifiers (list) - one pyreshaper specification per tseries-enabled history
        stream found in ./env_archive.xml that has at least one input file

    Raises:
    OSError - if ./env_archive.xml does not exist
    TypeError - if a required tseries XML element is missing or invalid
    '''
    from pyreshaper import specification

    specifiers = list()
    xml_tree = ET.ElementTree()

    # check if the env_archive.xml file exists
    if not os.path.isfile('./env_archive.xml'):
        err_msg = "cesm_tseries_generator.py ERROR: env_archive.xml does not exist."
        raise OSError(err_msg)

    # parse the xml
    xml_tree.parse('./env_archive.xml')

    # loop through all the comp_archive_spec elements to find the tseries related elements
    for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
        comp = comp_archive_spec.get("name")
        rootdir = comp_archive_spec.find("rootdir").text
        multi_instance = comp_archive_spec.find("multi_instance").text

        # for now, set instance value to empty string implying only 1 instance
        instance = ""

        # loop through all the files/file_spec elements
        for file_spec in comp_archive_spec.findall("files/file_extension"):
            file_extension = file_spec.get("suffix")
            subdir = file_spec.find("subdir").text

            # check if tseries_create is an element for this file_spec
            if file_spec.find("tseries_create") is None:
                continue
            tseries_create = file_spec.find("tseries_create").text

            # check if the tseries_create element is set to TRUE
            if tseries_create.upper() not in ["T", "TRUE"]:
                continue

            # check if tseries_format is an element for this file_spec and if it is valid
            if file_spec.find("tseries_output_format") is not None:
                tseries_output_format = file_spec.find("tseries_output_format").text
                if tseries_output_format not in ["netcdf", "netcdf4", "netcdf4c"]:
                    err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp, file_extension)
                    raise TypeError(err_msg)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp, file_extension)
                raise TypeError(err_msg)

            # check if the tseries_output_subdir is specified and create the tseries_output_dir
            if file_spec.find("tseries_output_subdir") is not None:
                tseries_output_subdir = file_spec.find("tseries_output_subdir").text
                tseries_output_dir = '/'.join([cesmEnv["DOUT_S_ROOT"], rootdir, tseries_output_subdir])
                if not os.path.exists(tseries_output_dir):
                    os.makedirs(tseries_output_dir)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_output_subdir undefined for data stream {0}.*.{1}".format(comp, file_extension)
                raise TypeError(err_msg)

            # check if tseries_tper is specified and is valid
            if file_spec.find("tseries_tper") is not None:
                tseries_tper = file_spec.find("tseries_tper").text
                if tseries_tper not in ["yearly", "monthly", "weekly", "daily",
                                        "hourly6", "hourly3", "hourly1", "min30"]:
                    err_msg = "cesm_tseries_generator.py error: tseries_tper invalid for data stream {0}.*.{1}".format(comp, file_extension)
                    raise TypeError(err_msg)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_tper undefined for data stream {0}.*.{1}".format(comp, file_extension)
                raise TypeError(err_msg)

            # load the tseries_time_variant_variables into a list
            # BUGFIX: initialize unconditionally so an absent XML element yields
            # an empty list instead of an unbound name (or a stale list carried
            # over from a previous comp_archive_spec)
            variable_list = list()
            if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                    variable_list.append(variable.text)

            # get a list of all the input files for this stream from the archive location
            history_files = list()
            in_file_path = '/'.join([cesmEnv["DOUT_S_ROOT"], rootdir, subdir])
            all_in_files = os.listdir(in_file_path)

            # check that there are actually a list of history files to work with
            for in_file in all_in_files:
                if re.search(file_extension, in_file):
                    history_files.append(in_file_path + "/" + in_file)

            # BUGFIX: skip this stream when no history files exist - otherwise
            # the output prefix/suffix below are never bound and the specifier
            # would be built from undefined names
            if len(history_files) == 0:
                continue

            # sort the list of input history files in order to get the output
            # suffix from the first and last file
            history_files.sort()
            start_file = history_files[0]
            start_file_parts = start_file.split(".")
            start_file_time = start_file_parts[-2]
            last_file = history_files[-1]
            last_file_parts = last_file.split(".")
            last_file_time = last_file_parts[-2]

            # get the actual component name from the history file - will also need
            # to deal with the instance numbers based on the comp_name
            comp_name = last_file_parts[-4]
            stream = last_file_parts[-3]

            # check for pop.h nday1 and nyear1 history streams
            if last_file_parts[-3] in ["nday1", "nyear1"]:
                comp_name = last_file_parts[-5]
                stream = last_file_parts[-4] + "." + last_file_parts[-3]

            # create the tseries output prefix - needs to end with a "."
            tseries_output_prefix = tseries_output_dir + "/" + cesmEnv["CASE"] + "." + comp_name + "." + stream + "."

            # format the time series variable output suffix based on the
            # tseries_tper setting - needs to start with a "."
            if tseries_tper == "yearly":
                tseries_output_suffix = "." + start_file_time + "-" + last_file_time + ".nc"
            elif tseries_tper == "monthly":
                start_time_parts = start_file_time.split("-")
                last_time_parts = last_file_time.split("-")
                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + "-" + last_time_parts[0] + last_time_parts[1] + ".nc"
            elif tseries_tper in ["weekly", "daily", "hourly6", "hourly3", "hourly1", "min30"]:
                start_time_parts = start_file_time.split("-")
                last_time_parts = last_file_time.split("-")
                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + start_time_parts[2] + "-" + last_time_parts[0] + last_time_parts[1] + last_time_parts[2] + ".nc"

            # get a reshaper specification object
            spec = specification.create_specifier()

            # populate the spec object with data for this history stream
            spec.input_file_list = history_files
            spec.netcdf_format = tseries_output_format
            spec.output_file_prefix = tseries_output_prefix
            spec.output_file_suffix = tseries_output_suffix
            spec.time_variant_metadata = variable_list

            # print the specifier contents for diagnostics
            pp = pprint.PrettyPrinter(indent=5)
            dbg = [comp_name, spec.input_file_list, spec.netcdf_format,
                   spec.output_file_prefix, spec.output_file_suffix,
                   spec.time_variant_metadata]
            pp.pprint(dbg)

            # append this spec to the list of specifiers
            specifiers.append(spec)

    return specifiers
def readArchiveXML(cesmEnv):
    ''' returns a fully defined list of reshaper specifications

    Arguments:
    cesmEnv (dict) - CESM case environment; keys "DOUT_S_ROOT" and "CASE" are read

    Returns:
    specifiers (list) - one pyreshaper specification per tseries-enabled history
        stream found in ../env_archive.xml that has at least one input file

    Raises:
    OSError - if ../env_archive.xml does not exist
    TypeError - if a required tseries XML element is missing or invalid
    '''
    from pyreshaper import specification

    specifiers = list()
    xml_tree = ET.ElementTree()

    # check if the env_archive.xml file exists
    if (not os.path.isfile('../env_archive.xml')):
        # BUGFIX: corrected grammar in user-facing message ("does not exists")
        err_msg = "cesm_tseries_generator.py ERROR: env_archive.xml does not exist."
        raise OSError(err_msg)

    # parse the xml
    xml_tree.parse('../env_archive.xml')

    # loop through all the comp_archive_spec elements to find the tseries related elements
    for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
        comp = comp_archive_spec.get("name")
        rootdir = comp_archive_spec.find("rootdir").text
        multi_instance = comp_archive_spec.find("multi_instance").text

        # for now, set instance value to empty string implying only 1 instance
        instance = ""

        # loop through all the files/file_spec elements
        for file_spec in comp_archive_spec.findall("files/file_extension"):
            file_extension = file_spec.get("suffix")
            subdir = file_spec.find("subdir").text

            # check if tseries_create is an element for this file_spec
            if file_spec.find("tseries_create") is None:
                continue
            tseries_create = file_spec.find("tseries_create").text

            # check if the tseries_create element is set to TRUE
            if tseries_create.upper() not in ["T", "TRUE"]:
                continue

            # check if tseries_format is an element for this file_spec and if it is valid
            if file_spec.find("tseries_output_format") is not None:
                tseries_output_format = file_spec.find("tseries_output_format").text
                if tseries_output_format not in ["netcdf", "netcdf4", "netcdf4c"]:
                    err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp, file_extension)
                    raise TypeError(err_msg)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp, file_extension)
                raise TypeError(err_msg)

            # check if the tseries_output_subdir is specified and create the tseries_output_dir
            if file_spec.find("tseries_output_subdir") is not None:
                tseries_output_subdir = file_spec.find("tseries_output_subdir").text
                tseries_output_dir = '/'.join([cesmEnv["DOUT_S_ROOT"], rootdir, tseries_output_subdir])
                if not os.path.exists(tseries_output_dir):
                    os.makedirs(tseries_output_dir)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_output_subdir undefined for data stream {0}.*.{1}".format(comp, file_extension)
                raise TypeError(err_msg)

            # check if tseries_tper is specified and is valid
            if file_spec.find("tseries_tper") is not None:
                tseries_tper = file_spec.find("tseries_tper").text
                if tseries_tper not in ["yearly", "monthly", "weekly", "daily",
                                        "hourly6", "hourly3", "hourly1", "min30"]:
                    err_msg = "cesm_tseries_generator.py error: tseries_tper invalid for data stream {0}.*.{1}".format(comp, file_extension)
                    raise TypeError(err_msg)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_tper undefined for data stream {0}.*.{1}".format(comp, file_extension)
                raise TypeError(err_msg)

            # load the tseries_time_variant_variables into a list
            # BUGFIX: initialize unconditionally so an absent XML element yields
            # an empty list instead of an unbound name (or a stale list carried
            # over from a previous comp_archive_spec)
            variable_list = list()
            if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                    variable_list.append(variable.text)

            # get a list of all the input files for this stream from the archive location
            history_files = list()
            in_file_path = '/'.join([cesmEnv["DOUT_S_ROOT"], rootdir, subdir])
            all_in_files = os.listdir(in_file_path)

            # check that there are actually a list of history files to work with
            for in_file in all_in_files:
                if re.search(file_extension, in_file):
                    history_files.append(in_file_path + "/" + in_file)

            # BUGFIX: skip this stream when no history files exist - otherwise
            # the output prefix/suffix below are never bound and the specifier
            # would be built from undefined names
            if len(history_files) == 0:
                continue

            # sort the list of input history files in order to get the output
            # suffix from the first and last file
            history_files.sort()
            start_file = history_files[0]
            start_file_parts = start_file.split(".")
            start_file_time = start_file_parts[-2]
            last_file = history_files[-1]
            last_file_parts = last_file.split(".")
            last_file_time = last_file_parts[-2]

            # get the actual component name from the history file - will also need
            # to deal with the instance numbers based on the comp_name
            comp_name = last_file_parts[-4]
            stream = last_file_parts[-3]

            # check for pop.h nday1 and nyear1 history streams
            if last_file_parts[-3] in ["nday1", "nyear1"]:
                comp_name = last_file_parts[-5]
                stream = last_file_parts[-4] + "." + last_file_parts[-3]

            # create the tseries output prefix - needs to end with a "."
            tseries_output_prefix = tseries_output_dir + "/" + cesmEnv["CASE"] + "." + comp_name + "." + stream + "."

            # format the time series variable output suffix based on the
            # tseries_tper setting - needs to start with a "."
            if tseries_tper == "yearly":
                tseries_output_suffix = "." + start_file_time + "-" + last_file_time + ".nc"
            elif tseries_tper == "monthly":
                start_time_parts = start_file_time.split("-")
                last_time_parts = last_file_time.split("-")
                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + "-" + last_time_parts[0] + last_time_parts[1] + ".nc"
            elif tseries_tper in ["weekly", "daily", "hourly6", "hourly3", "hourly1", "min30"]:
                start_time_parts = start_file_time.split("-")
                last_time_parts = last_file_time.split("-")
                tseries_output_suffix = "." + start_time_parts[0] + start_time_parts[1] + start_time_parts[2] + "-" + last_time_parts[0] + last_time_parts[1] + last_time_parts[2] + ".nc"

            # get a reshaper specification object
            spec = specification.create_specifier()

            # populate the spec object with data for this history stream
            spec.input_file_list = history_files
            spec.netcdf_format = tseries_output_format
            spec.output_file_prefix = tseries_output_prefix
            spec.output_file_suffix = tseries_output_suffix
            spec.time_variant_metadata = variable_list

            # print the specifier contents for diagnostics
            pp = pprint.PrettyPrinter(indent=5)
            dbg = [comp_name, spec.input_file_list, spec.netcdf_format,
                   spec.output_file_prefix, spec.output_file_suffix,
                   spec.time_variant_metadata]
            pp.pprint(dbg)

            # append this spec to the list of specifiers
            specifiers.append(spec)

    return specifiers
def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename,
                   standalone, completechunk, generate_all, debug, debugMsg,
                   comm, rank, size):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list
    of reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True. Otherwise, do not
        create incomplete chunks if False
    generate_all (boolean) - generate timeseries for all streams if True.
        Otherwise, use the tseries_create setting.
        NOTE(review): the code below calls generate_all.upper(), so at runtime
        this is actually a "T"/"TRUE"-style string, not a bool - confirm against
        the caller.
    debug (boolean) - debug flag (unused in this body; diagnostics go via debugMsg)
    debugMsg (callable) - diagnostic printer taking (msg, header=..., verbosity=...)
    comm - communicator object passed to chunking and used for comm.sync();
        presumably an MPI-style communicator - confirm against caller
    rank (int) - this task's rank; only rank 0 prints diagnostics / makes dirs
    size (int) - number of tasks, forwarded to chunking.get_input_dates

    Returns:
    (specifiers, log) - list of pyreshaper specifications, one per output chunk,
        and the updated chunking status log dictionary
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files,
    # if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if ( not os.path.isfile(env_timeseries) ):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries
        # related elements
        for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text
            default_calendar = comp_archive_spec.find("default_calendar").text
            if rank == 0:
                debugMsg("default_calendar = {0}".format(default_calendar),
                         header=True, verbosity=1)

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    # (or the caller asked for all streams via generate_all)
                    if tseries_create.upper() in ["T","TRUE"] or generate_all.upper() in ["T","TRUE"]:

                        # check if tseries_format is an element for this
                        # file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find("tseries_output_format").text
                            if tseries_output_format not in ["netcdf","netcdf4","netcdf4c","netcdfLarge"]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp,file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp,file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        variable_list = list()
                        if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # load the tseries_exclude_variables into a list
                        exclude_list = list()
                        if comp_archive_spec.find("tseries_exclude_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_exclude_variables/variable"):
                                exclude_list.append(variable.text)

                        # get a list of all the input files for this stream
                        # from the archive location
                        history_files = list()
                        in_file_path = '/'.join( [input_rootdir,rootdir,subdir] )

                        # get XML tseries elements for chunking
                        # NOTE(review): tper and size_n are only bound when
                        # their XML elements exist but are used unconditionally
                        # in get_chunks() below - confirm the schema guarantees
                        # these elements.  size_n (file-cat count, a string) is
                        # distinct from the MPI "size" parameter.
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                        if file_spec.find("tseries_filecat_tper") is not None:
                            tper = file_spec.find("tseries_filecat_tper").text
                        if file_spec.find("tseries_filecat_n") is not None:
                            size_n = file_spec.find("tseries_filecat_n").text
                        comp_name = comp
                        stream = file_extension.split('.[')[0]

                        # scan the input files for time slices and metadata;
                        # comm/rank/size are forwarded so the scan can be
                        # distributed across tasks
                        stream_dates,file_slices,cal,units,time_period_freq = chunking.get_input_dates(in_file_path+'/*'+file_extension+'*.nc', comm, rank, size)

                        # check if the calendar attribute was read or not
                        if cal is None or cal == "none":
                            cal = default_calendar
                        if rank == 0:
                            debugMsg("calendar = {0}".format(cal), header=True, verbosity=1)

                        # the tseries_tper should be set in using the
                        # time_period_freq global file attribute if it exists
                        if time_period_freq is not None:
                            tseries_tper = time_period_freq

                        tseries_output_dir = '/'.join( [output_rootdir, rootdir, 'proc/tseries', tseries_tper] )
                        if rank == 0:
                            debugMsg("tseries_output_dir = {0}".format(tseries_output_dir), header=True, verbosity=1)

                        # seed the status log entry for this stream if this is
                        # the first time we have seen it
                        if comp+stream not in log.keys():
                            log[comp+stream] = {'slices':[],'index':0}
                        ts_log_dates = log[comp+stream]['slices']
                        index = log[comp+stream]['index']

                        # partition the not-yet-converted time slices into
                        # output chunks; index tracks where we left off
                        files,dates,index = chunking.get_chunks(tper, index, size_n, stream_dates, ts_log_dates, cal, units, completechunk, tseries_tper)

                        # record the newly claimed slices in the status log
                        for d in dates:
                            log[comp+stream]['slices'].append(float(d))
                        log[comp+stream]['index']=index

                        # one specifier per chunk
                        # NOTE: dict.iteritems() is Python 2 only
                        for cn,cf in files.iteritems():
                            # only rank 0 creates the output directory; all
                            # ranks then synchronize before using it
                            if rank == 0:
                                if not os.path.exists(tseries_output_dir):
                                    os.makedirs(tseries_output_dir)
                            comm.sync()

                            history_files = cf['fn']
                            start_time_parts = cf['start']
                            last_time_parts = cf['end']

                            # create the tseries output prefix needs to end
                            # with a "."
                            tseries_output_prefix = "{0}/{1}.{2}{3}.".format(tseries_output_dir,casename,comp_name,stream)
                            if rank == 0:
                                debugMsg("tseries_output_prefix = {0}".format(tseries_output_prefix), header=True, verbosity=1)

                            # format the time series variable output suffix based on the
                            # tseries_tper setting suffix needs to start with a "."
                            freq_array = ["week","day","hour","min"]
                            if "year" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+"-"+last_time_parts[0]+".nc"
                            elif "month" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+"-"+last_time_parts[0]+last_time_parts[1]+".nc"
                            elif "day" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+".nc"
                            elif any(freq_string in tseries_tper for freq_string in freq_array):
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+start_time_parts[3]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+last_time_parts[3]+".nc"
                            else:
                                err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(tseries_tper)
                                raise TypeError(err_msg)
                            if rank == 0:
                                debugMsg("tseries_output_suffix = {0}".format(tseries_output_suffix), header=True, verbosity=1)

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this
                            # history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list
                            spec.exclude_list = exclude_list

                            # setting the default backend; netCDF4 or pynio
                            spec.backend = 'netCDF4'

                            if rank == 0:
                                debugMsg("specifier: comp_name = {0}".format(comp_name), header=True, verbosity=1)
                                debugMsg("   input_file_list = {0}".format(spec.input_file_list), header=True, verbosity=1)
                                debugMsg("   netcdf_format = {0}".format(spec.netcdf_format), header=True, verbosity=1)
                                debugMsg("   output_file_prefix = {0}".format(spec.output_file_prefix), header=True, verbosity=1)
                                debugMsg("   output_file_suffix = {0}".format(spec.output_file_suffix), header=True, verbosity=1)
                                debugMsg("   time_variant_metadata = {0}".format(spec.time_variant_metadata), header=True, verbosity=1)
                                debugMsg("   exclude_list = {0}".format(spec.exclude_list), header=True, verbosity=1)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)
    return specifiers,log
# input_fnames = ['FAMIPC5.cam.h0.2010-09.nc', 'FAMIPC5.cam.h0.2010-10.nc'] input_fnames = conversion_utils.fetch_fnames(dirs.INPUT, 'FAMIPC5', 'cam') input_files = [os.path.join(dirs.INPUT, f) for f in input_fnames] # Create output directory if it doesn't already exist if not os.path.isdir(dirs.OUTPUT): print('Creating directory {}'.format(dirs.OUTPUT)) os.makedirs(dirs.OUTPUT) # Converted time-series file prefix & suffix prefix = 'FAMIPC5.cam.' output_prefix = os.path.join(dirs.OUTPUT, prefix) output_suffix = conversion_utils.parse_output_suffix(input_fnames) # --- Create PyReshaper specifier object --------------------------------------- specifier = specification.create_specifier() # Define specifier input needed perform the conversion specifier.input_file_list = input_files specifier.netcdf_format = "netcdf4" specifier.compression_level = 1 specifier.output_file_prefix = output_prefix specifier.output_file_suffix = output_suffix specifier.time_variant_metadata = ["time", "time_bounds"] # specifier.exclude_list = ['HKSAT','ZLAKE'] # Create the PyReshaper object rshpr = reshaper.create_reshaper(specifier, serial=False, verbosity=1, wmode='s')
def _tseries_output_suffix(tseries_tper, start_file_time, last_file_time):
    """Build the '.<start>-<end>.nc' time-series output-file suffix.

    Arguments:
    tseries_tper (string) - time period of the stream (e.g. "yearly", "daily")
    start_file_time (string) - date token from the first history file name
    last_file_time (string) - date token from the last history file name

    Returns the suffix string (starts with "." and ends with ".nc").
    Raises TypeError if no suffix rule exists for tseries_tper.
    """
    # BUGFIX: "annual" passed the tseries_tper validation but previously had
    # no suffix branch, leaving tseries_output_suffix unbound (or stale from
    # an earlier stream); treat it the same as "yearly".
    if tseries_tper in ("annual", "yearly"):
        return "." + start_file_time + "-" + last_file_time + ".nc"

    # Sub-yearly periods build the suffix from the YYYY-MM[-DD] date parts
    start_parts = start_file_time.split("-")
    last_parts = last_file_time.split("-")
    if tseries_tper == "monthly":
        return ("." + start_parts[0] + start_parts[1] + "-"
                + last_parts[0] + last_parts[1] + ".nc")
    if tseries_tper in ("weekly", "daily", "hourly6", "hourly3",
                        "hourly1", "min30"):
        return ("." + start_parts[0] + start_parts[1] + start_parts[2] + "-"
                + last_parts[0] + last_parts[1] + last_parts[2] + ".nc")

    # Defensive: unreachable after validation, but fail loudly instead of
    # silently reusing a previous stream's suffix.
    raise TypeError(
        "cesm_tseries_generator.py error: no output suffix rule for "
        "tseries_tper {0}".format(tseries_tper))


def readArchiveXML(caseroot, dout_s_root, casename, standalone, debug):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list
    of reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    dout_s_root (string) - short term archive root path
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    debug (boolean) - if True, pretty-print each specifier as it is built

    Returns a list of populated specification objects (one per history
    stream that has tseries_create set to TRUE and at least one input file).

    Raises:
    OSError - if env_timeseries.xml does not exist
    TypeError - if a required tseries_* element is missing or invalid
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # standalone post-processing cases keep env_timeseries.xml in the case
    # root; otherwise it lives under the postprocess subdirectory
    env_timeseries = '{0}/postprocess/env_timeseries.xml'.format(caseroot)
    if standalone:
        env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # check if the env_timeseries.xml file exists
    if not os.path.isfile(env_timeseries):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(
            env_timeseries)
        raise OSError(err_msg)

    # parse the xml
    xml_tree.parse(env_timeseries)

    # loop through all the comp_archive_spec elements to find the tseries
    # related elements
    for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
        comp = comp_archive_spec.get("name")
        rootdir = comp_archive_spec.find("rootdir").text
        multi_instance = comp_archive_spec.find("multi_instance").text

        # for now, set instance value to empty string implying only 1 instance
        instance = ""

        # loop through all the files/file_spec elements
        for file_spec in comp_archive_spec.findall("files/file_extension"):
            file_extension = file_spec.get("suffix")
            subdir = file_spec.find("subdir").text

            # skip streams that do not request time-series generation
            if file_spec.find("tseries_create") is None:
                continue
            tseries_create = file_spec.find("tseries_create").text
            if tseries_create.upper() not in ["T", "TRUE"]:
                continue

            # required: tseries_output_format must exist and be valid
            if file_spec.find("tseries_output_format") is not None:
                tseries_output_format = file_spec.find(
                    "tseries_output_format").text
                if tseries_output_format not in ["netcdf", "netcdf4",
                                                 "netcdf4c"]:
                    err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                        comp, file_extension)
                    raise TypeError(err_msg)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                    comp, file_extension)
                raise TypeError(err_msg)

            # required: tseries_output_subdir; create the output dir if needed
            if file_spec.find("tseries_output_subdir") is not None:
                tseries_output_subdir = file_spec.find(
                    "tseries_output_subdir").text
                tseries_output_dir = '/'.join(
                    [dout_s_root, rootdir, tseries_output_subdir])
                if not os.path.exists(tseries_output_dir):
                    os.makedirs(tseries_output_dir)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_output_subdir undefined for data stream {0}.*.{1}".format(
                    comp, file_extension)
                raise TypeError(err_msg)

            # required: tseries_tper must exist and be valid
            if file_spec.find("tseries_tper") is not None:
                tseries_tper = file_spec.find("tseries_tper").text
                if tseries_tper not in ["annual", "yearly", "monthly",
                                        "weekly", "daily", "hourly6",
                                        "hourly3", "hourly1", "min30"]:
                    err_msg = "cesm_tseries_generator.py error: tseries_tper invalid for data stream {0}.*.{1}".format(
                        comp, file_extension)
                    raise TypeError(err_msg)
            else:
                err_msg = "cesm_tseries_generator.py error: tseries_tper undefined for data stream {0}.*.{1}".format(
                    comp, file_extension)
                raise TypeError(err_msg)

            # load the tseries_time_variant_variables into a list
            # BUGFIX: always initialize so a component without the
            # tseries_time_variant_variables element no longer raises
            # NameError when the specifier is populated below
            variable_list = list()
            if comp_archive_spec.find(
                    "tseries_time_variant_variables") is not None:
                for variable in comp_archive_spec.findall(
                        "tseries_time_variant_variables/variable"):
                    variable_list.append(variable.text)

            # get a list of all the input files for this stream from the
            # archive location; only *.nc files are usable
            history_files = list()
            in_file_path = '/'.join([dout_s_root, rootdir, subdir])
            all_in_files = os.listdir(in_file_path)
            for in_file in all_in_files:
                if re.search(file_extension, in_file):
                    if in_file.endswith('.nc'):
                        history_files.append(in_file_path + "/" + in_file)
                    else:
                        print(
                            'cesm_tseries_generator.py WARNING - unable to operate on file {0}/{1}'
                            .format(in_file_path, in_file))

            # nothing to do for this stream if no history files were found
            if len(history_files) == 0:
                continue

            # sort so the output suffix can be taken from the first and
            # last file's date token (second-to-last dot-separated field)
            history_files.sort()
            start_file_time = history_files[0].split(".")[-2]
            last_file_parts = history_files[-1].split(".")
            last_file_time = last_file_parts[-2]

            # get the actual component name and stream from the history file
            # name; pop.h nday1/nyear1 streams carry an extra name token
            comp_name = last_file_parts[-4]
            stream = last_file_parts[-3]
            if last_file_parts[-3] in ["nday1", "nyear1"]:
                comp_name = last_file_parts[-5]
                stream = last_file_parts[-4] + "." + last_file_parts[-3]

            # the tseries output prefix needs to end with a "."
            tseries_output_prefix = (tseries_output_dir + "/" + casename
                                     + "." + comp_name + "." + stream + ".")

            # the suffix needs to start with a "." (see helper for rules)
            tseries_output_suffix = _tseries_output_suffix(
                tseries_tper, start_file_time, last_file_time)

            # get a reshaper specification object and populate it with the
            # data for this history stream
            spec = specification.create_specifier()
            spec.input_file_list = history_files
            spec.netcdf_format = tseries_output_format
            spec.output_file_prefix = tseries_output_prefix
            spec.output_file_suffix = tseries_output_suffix
            spec.time_variant_metadata = variable_list

            # print the specifier
            if debug:
                pp = pprint.PrettyPrinter(indent=5)
                dbg = [comp_name, spec.input_file_list, spec.netcdf_format,
                       spec.output_file_prefix, spec.output_file_suffix,
                       spec.time_variant_metadata]
                pp.pprint(dbg)

            # append this spec to the list of specifiers
            specifiers.append(spec)

    return specifiers