def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename,
                   standalone, completechunk, debug, debugMsg):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list
    of reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True. Otherwise, do not create incomplete chunks if False
    debug (boolean) - if True, pretty-print each specifier as it is built
    debugMsg (callable) - diagnostic message function; called as debugMsg(msg, header=True)

    Returns:
    (specifiers, log) - the list of populated pyReshaper specifier objects and
    the updated chunking status log dict (keyed by comp+stream).
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files,
    # if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if not os.path.isfile(env_timeseries):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(
            env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries
        # related elements
        for comp_archive_spec in xml_tree.findall(
                "components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text
            # fallback calendar used when the history files carry no calendar attribute
            default_calendar = comp_archive_spec.find("default_calendar").text
            debugMsg("default_calendar = {0}".format(default_calendar),
                     header=True)

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    if tseries_create.upper() in ["T", "TRUE"]:

                        # check if tseries_format is an element for this
                        # file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find(
                                "tseries_output_format").text
                            if tseries_output_format not in [
                                    "netcdf", "netcdf4", "netcdf4c",
                                    "netcdfLarge"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        variable_list = list()
                        if comp_archive_spec.find(
                                "tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall(
                                    "tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # get a list of all the input files for this stream
                        # from the archive location
                        history_files = list()
                        in_file_path = '/'.join(
                            [input_rootdir, rootdir, subdir])

                        # get XML tseries elements for chunking
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                        if file_spec.find("tseries_filecat_tper") is not None:
                            tper = file_spec.find("tseries_filecat_tper").text
                        if file_spec.find("tseries_filecat_n") is not None:
                            size = file_spec.find("tseries_filecat_n").text
                        comp_name = comp
                        stream = file_extension.split('.[')[0]

                        # scan the archive for matching history files and read
                        # their time metadata
                        stream_dates, file_slices, cal, units, time_period_freq = chunking.get_input_dates(
                            in_file_path + '/*' + file_extension + '*.nc')

                        # check if the calendar attribute was read or not
                        if cal is None or cal == "none":
                            cal = default_calendar
                        debugMsg("calendar = {0}".format(cal), header=True)

                        # the tseries_tper should be set using the
                        # time_period_freq global file attribute if it exists
                        if time_period_freq is not None:
                            tseries_tper = time_period_freq
                        tseries_output_dir = '/'.join([
                            output_rootdir, rootdir, 'proc/tseries',
                            tseries_tper
                        ])
                        debugMsg("tseries_output_dir = {0}".format(
                            tseries_output_dir), header=True)

                        if not os.path.exists(tseries_output_dir):
                            os.makedirs(tseries_output_dir)

                        # seed the status log for a stream we have not seen before
                        if comp + stream not in log:
                            log[comp + stream] = {'slices': [], 'index': 0}
                        ts_log_dates = log[comp + stream]['slices']
                        index = log[comp + stream]['index']
                        files, dates, index = chunking.get_chunks(
                            tper, index, size, stream_dates, ts_log_dates,
                            cal, units, completechunk)
                        # record which time slices have now been chunked
                        for d in dates:
                            log[comp + stream]['slices'].append(float(d))
                        log[comp + stream]['index'] = index

                        # FIX: items() instead of Python-2-only iteritems()
                        # (items() behaves correctly on both Python 2 and 3)
                        for cn, cf in files.items():
                            history_files = cf['fn']
                            start_time_parts = cf['start']
                            last_time_parts = cf['end']

                            # create the tseries output prefix needs to end
                            # with a "."
                            tseries_output_prefix = "{0}/{1}.{2}{3}.".format(
                                tseries_output_dir, casename, comp_name,
                                stream)
                            debugMsg("tseries_output_prefix = {0}".format(
                                tseries_output_prefix), header=True)

                            # format the time series variable output suffix
                            # based on the tseries_tper setting; suffix needs
                            # to start with a "."
                            freq_array = ["week", "day", "hour", "min"]
                            if "year" in tseries_tper:
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + "-" + last_time_parts[0] + ".nc"
                            elif "month" in tseries_tper:
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[
                                        1] + "-" + last_time_parts[
                                            0] + last_time_parts[1] + ".nc"
                            elif "day" in tseries_tper:
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[1] + start_time_parts[
                                        2] + "-" + last_time_parts[
                                            0] + last_time_parts[
                                                1] + last_time_parts[2] + ".nc"
                            elif any(freq_string in tseries_tper
                                     for freq_string in freq_array):
                                # sub-daily frequencies include the time-of-day part
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[1] + start_time_parts[
                                        2] + start_time_parts[
                                            3] + "-" + last_time_parts[
                                                0] + last_time_parts[
                                                    1] + last_time_parts[
                                                        2] + last_time_parts[
                                                            3] + ".nc"
                            else:
                                err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(
                                    tseries_tper)
                                raise TypeError(err_msg)
                            debugMsg("tseries_output_suffix = {0}".format(
                                tseries_output_suffix), header=True)

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this
                            # history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list

                            # print the specifier
                            if debug:
                                dbg = list()
                                pp = pprint.PrettyPrinter(indent=5)
                                dbg = [
                                    comp_name, spec.input_file_list,
                                    spec.netcdf_format,
                                    spec.output_file_prefix,
                                    spec.output_file_suffix,
                                    spec.time_variant_metadata
                                ]
                                pp.pprint(dbg)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)
    return specifiers, log
def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename,
                   standalone, completechunk, generate_all, debug, debugMsg,
                   comm, rank, size):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list
    of reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True. Otherwise, do not create incomplete chunks if False
    generate_all (boolean) - generate timeseries for all streams if True. Otherwise, use the tseries_create setting.
    debug (boolean) - debug flag (specifier contents are reported via debugMsg)
    debugMsg (callable) - diagnostic message function; called as debugMsg(msg, header=True, verbosity=1)
    comm - communicator object providing sync(); presumably an MPI-style communicator - TODO confirm
    rank (int) - this task's rank; rank 0 does the logging and directory creation
    size (int) - number of tasks in the communicator

    Returns:
    (specifiers, log) - the list of populated pyReshaper specifier objects and
    the updated chunking status log dict (keyed by comp+stream).
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files,
    # if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if not os.path.isfile(env_timeseries):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries
        # related elements
        for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text
            # fallback calendar used when the history files carry no calendar attribute
            default_calendar = comp_archive_spec.find("default_calendar").text
            if rank == 0:
                debugMsg("default_calendar = {0}".format(default_calendar),
                         header=True, verbosity=1)

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE.
                    # FIX: generate_all is documented as a boolean, and
                    # bool has no upper(); str() handles both a real bool
                    # (str(True).upper() == "TRUE") and a "T"/"TRUE" string.
                    if (tseries_create.upper() in ["T", "TRUE"]
                            or str(generate_all).upper() in ["T", "TRUE"]):

                        # check if tseries_format is an element for this
                        # file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find("tseries_output_format").text
                            if tseries_output_format not in ["netcdf", "netcdf4", "netcdf4c", "netcdfLarge"]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp,file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp,file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        variable_list = list()
                        if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # load the tseries_exclude_variables into a list
                        exclude_list = list()
                        if comp_archive_spec.find("tseries_exclude_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_exclude_variables/variable"):
                                exclude_list.append(variable.text)

                        # get a list of all the input files for this stream
                        # from the archive location
                        history_files = list()
                        in_file_path = '/'.join([input_rootdir, rootdir, subdir])

                        # get XML tseries elements for chunking
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                        if file_spec.find("tseries_filecat_tper") is not None:
                            tper = file_spec.find("tseries_filecat_tper").text
                        if file_spec.find("tseries_filecat_n") is not None:
                            # size_n (not "size") to avoid shadowing the
                            # communicator-size parameter
                            size_n = file_spec.find("tseries_filecat_n").text
                        comp_name = comp
                        stream = file_extension.split('.[')[0]

                        # scan the archive for matching history files and read
                        # their time metadata (collective across the communicator)
                        stream_dates, file_slices, cal, units, time_period_freq = chunking.get_input_dates(
                            in_file_path + '/*' + file_extension + '*.nc', comm, rank, size)

                        # check if the calendar attribute was read or not
                        if cal is None or cal == "none":
                            cal = default_calendar
                        if rank == 0:
                            debugMsg("calendar = {0}".format(cal), header=True, verbosity=1)

                        # the tseries_tper should be set using the
                        # time_period_freq global file attribute if it exists
                        if time_period_freq is not None:
                            tseries_tper = time_period_freq
                        tseries_output_dir = '/'.join([output_rootdir, rootdir, 'proc/tseries', tseries_tper])
                        if rank == 0:
                            debugMsg("tseries_output_dir = {0}".format(tseries_output_dir),
                                     header=True, verbosity=1)

                        # seed the status log for a stream we have not seen before
                        if comp + stream not in log:
                            log[comp + stream] = {'slices': [], 'index': 0}
                        ts_log_dates = log[comp + stream]['slices']
                        index = log[comp + stream]['index']
                        files, dates, index = chunking.get_chunks(
                            tper, index, size_n, stream_dates, ts_log_dates,
                            cal, units, completechunk, tseries_tper)
                        # record which time slices have now been chunked
                        for d in dates:
                            log[comp + stream]['slices'].append(float(d))
                        log[comp + stream]['index'] = index

                        # FIX: items() instead of Python-2-only iteritems()
                        # (items() behaves correctly on both Python 2 and 3)
                        for cn, cf in files.items():
                            # only rank 0 creates the output directory; the
                            # sync keeps other ranks from racing ahead
                            if rank == 0:
                                if not os.path.exists(tseries_output_dir):
                                    os.makedirs(tseries_output_dir)
                            comm.sync()

                            history_files = cf['fn']
                            start_time_parts = cf['start']
                            last_time_parts = cf['end']

                            # create the tseries output prefix needs to end
                            # with a "."
                            tseries_output_prefix = "{0}/{1}.{2}{3}.".format(
                                tseries_output_dir, casename, comp_name, stream)
                            if rank == 0:
                                debugMsg("tseries_output_prefix = {0}".format(tseries_output_prefix),
                                         header=True, verbosity=1)

                            # format the time series variable output suffix
                            # based on the tseries_tper setting; suffix needs
                            # to start with a "."
                            freq_array = ["week", "day", "hour", "min"]
                            if "year" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+"-"+last_time_parts[0]+".nc"
                            elif "month" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+"-"+last_time_parts[0]+last_time_parts[1]+".nc"
                            elif "day" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+".nc"
                            elif any(freq_string in tseries_tper for freq_string in freq_array):
                                # sub-daily frequencies include the time-of-day part
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+start_time_parts[3]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+last_time_parts[3]+".nc"
                            else:
                                err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(tseries_tper)
                                raise TypeError(err_msg)
                            if rank == 0:
                                debugMsg("tseries_output_suffix = {0}".format(tseries_output_suffix),
                                         header=True, verbosity=1)

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this
                            # history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list
                            spec.exclude_list = exclude_list

                            # setting the default backend; netCDF4 or pynio
                            spec.backend = 'netCDF4'

                            if rank == 0:
                                debugMsg("specifier: comp_name = {0}".format(comp_name), header=True, verbosity=1)
                                debugMsg("   input_file_list = {0}".format(spec.input_file_list), header=True, verbosity=1)
                                debugMsg("   netcdf_format = {0}".format(spec.netcdf_format), header=True, verbosity=1)
                                debugMsg("   output_file_prefix = {0}".format(spec.output_file_prefix), header=True, verbosity=1)
                                debugMsg("   output_file_suffix = {0}".format(spec.output_file_suffix), header=True, verbosity=1)
                                debugMsg("   time_variant_metadata = {0}".format(spec.time_variant_metadata), header=True, verbosity=1)
                                debugMsg("   exclude_list = {0}".format(spec.exclude_list), header=True, verbosity=1)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)
    return specifiers, log