import glob
import os
import shutil

import pandas as pd
import xarray as xr

# Repository-internal modules, assumed to be importable as in the original
# sources; the logger `log`, `Style` and the helper functions used below
# (add_line_descr_f, get_config_of_current_test, print_statistics_of_raw_files,
# variables_to_extract) are provided by the repository's own modules.
import paths
import utils


def rmse_proc_nc_to_df(exp, filename, reference, p_stages):
    '''
    Arguments:
        exp       = experiment name
        filename  = filename of the netCDF returned by function
                    standard_postproc
        reference = filename of the reference file
        p_stages  = directory where processing steps are stored

    returns: dataframe with processed data for the RMSE test
    '''

    test = 'rmse'

    rmse_interim = 'test_postproc_intermediate_{}_{}.nc'.format(test, exp)
    rmse_filename = 'test_proc_{}_{}.nc'.format(test, exp)

    # multi-year annual mean of the vertically integrated fields
    cdo_cmd = 'cdo -L timmean -yearmean -vertsum {} {}'.format(
        filename, rmse_interim)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    reference_normalized = normalize_data(reference)
    rmse_interim_normalized = normalize_data(rmse_interim)

    # list of variables in the timeseries netCDF file to drop
    # (not to put into the dataframe)
    vars_to_drop = []

    log.info('Compute root mean square error '
             'between {} and {} (reference)'.format(rmse_interim_normalized,
                                                    reference_normalized))
    cdo_cmd = 'cdo -L sqrt -fldmean -sqr -sub {} {} {}'.format(
        rmse_interim_normalized, reference_normalized, rmse_filename)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # open dataset
    data = xr.open_dataset(rmse_filename)

    # delete the unused variable time_bnds
    if 'time_bnds' in data.keys():
        data = data.drop('time_bnds')

    # drop 3D vars
    if len(vars_to_drop) > 0:
        data = data.drop(labels=vars_to_drop)

    # transform into dataframe
    df_data = data.to_dataframe()

    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')

    log.info('CSV file can be found here: {}'.format(csv_filename))
    log.info('Finished {} for file {}'.format(__name__, rmse_filename))

    return df_data
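# A minimal usage sketch for rmse_proc_nc_to_df; the experiment and file
# names below are placeholders, not taken from this repository. The CDO
# chain above evaluates, per variable, the field-mean RMSE between the
# normalized experiment and the normalized reference:
# RMSE = sqrt(fldmean((norm(exp) - norm(ref))**2)).
#
#     df_rmse = rmse_proc_nc_to_df(exp='my_exp',
#                                  filename='standard_postproc_rmse_my_exp.nc',
#                                  reference='ftp_ref_pattern.nc',
#                                  p_stages=paths.p_stages)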
def download_ref_to_stages_if_required(f_pattern_ref, p_stages,
                                       f_vars_to_extract, test):
    '''
    Download the reference file from the ftp-server if no reference file
    was passed to process_data.

    Arguments:
        f_pattern_ref     = filename of the reference file (equal to
                            paths.rootdir if none was given)
        p_stages          = directory where processing steps are stored
        f_vars_to_extract = csv file containing the variables to process
        test              = name of current test

    returns: filename of the reference file to use
    '''

    # no ref-file passed as argument of process_data
    if f_pattern_ref == paths.rootdir:
        log.info('Download reference file from ftp-server')

        filename_ftp_link = f_vars_to_extract.replace('.csv', '.txt').replace(
            'vars_', 'ftp_')
        path_to_ftp_link = os.path.join(paths.p_f_vars_proc, test)
        file_with_ftp_link = utils.clean_path(path_to_ftp_link,
                                              filename_ftp_link)
        output_file = os.path.join(p_stages, 'ftp_ref_pattern.nc')

        cmd = ('wget --input-file={} '
               '--output-document={}'.format(file_with_ftp_link, output_file))
        log.debug('ftp-command: {}'.format(cmd))
        utils.shell_cmd(cmd, py_routine=__name__)

        f_pattern_ref = output_file
    else:
        log.info('Using user-defined reference file for test '
                 '{}'.format(test))

    return f_pattern_ref
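# Naming convention implied by the replacements above (the variable-table
# name is a hypothetical example): for f_vars_to_extract = 'vars_echam.csv',
# the ftp-link file looked up under paths.p_f_vars_proc/<test> is
# 'ftp_echam.txt'; the URLs it contains are passed to wget via --input-file
# and the download is stored as 'ftp_ref_pattern.nc' in p_stages.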
def timeser_proc_nc_to_df(exp, filename, p_stages,
                          already_a_timeseries=False):
    '''
    Arguments:
        exp      = experiment name
        filename = filename of the netCDF returned by function
                   standard_postproc
        p_stages = directory where processing steps are stored

    returns: dataframe with processed data for the Welch's t-test
    '''

    test = 'welch'

    if not already_a_timeseries:
        timeser_filename = 'test_postproc_{}_{}.nc'.format(test, exp)
        cdo_cmd = 'cdo -L yearmean -fldmean -vertsum {} {}'.format(
            filename, timeser_filename)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)
    else:
        log.debug('Skipping CDO-processing step')
        timeser_filename = filename

    # list of variables in the timeseries netCDF file to drop
    # (not to put into the dataframe)
    vars_to_drop = []

    log.info('Processing netCDF: {}'.format(timeser_filename))

    # open dataset
    data = xr.open_dataset(timeser_filename)

    # delete the unused variable time_bnds
    if 'time_bnds' in data.keys():
        data = data.drop('time_bnds')

    # drop 3D vars
    if len(vars_to_drop) > 0:
        data = data.drop(labels=vars_to_drop)

    # remove degenerate dimensions
    data = data.squeeze(drop=True)

    # transform into dataframe
    df_data = data.to_dataframe()

    # export to a file
    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')

    log.info('CSV file can be found here: {}'.format(csv_filename))
    log.info('Finished {} for file {}'.format(__name__, timeser_filename))

    return df_data
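# A minimal usage sketch for timeser_proc_nc_to_df (file and experiment
# names are placeholders). With already_a_timeseries=True the CDO step
# 'yearmean -fldmean -vertsum' is skipped and the file is only converted
# into a dataframe/CSV:
#
#     df_ts = timeser_proc_nc_to_df(exp='my_exp',
#                                   filename='timeser_my_exp.nc',
#                                   p_stages=paths.p_stages,
#                                   already_a_timeseries=True)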
def main(exp,
         tests,
         p_stages=paths.p_stages,
         p_ref_csv_files=paths.p_ref_csv_files,
         ltestsuite=False,
         lverbose=False):

    # initialisation
    new_branch_name = 'test_add_{}'.format(exp)
    files_to_commit = []

    # fill up file 'Exps_description.csv' with additional
    # information via user input
    f_exp_descr = os.path.join(p_ref_csv_files, 'Exps_description.csv')
    if not ltestsuite:
        add_line_descr_f(exp=exp, f_exp_descr=f_exp_descr)
    files_to_commit.append(f_exp_descr)

    for test in tests:
        test_cfg = get_config_of_current_test(test)

        csv_file = utils.clean_path(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        # filename in the reference pool
        filename_in_ref_dir = '{}_{}.csv'.format(test_cfg.ref_name, exp)
        # location to store that file
        place_for_reference = os.path.join(p_ref_csv_files, test,
                                           filename_in_ref_dir)

        log.debug('Copy {} to {}'.format(csv_file, place_for_reference))
        if not ltestsuite:
            shutil.copy(csv_file, place_for_reference)
        files_to_commit.append(place_for_reference)

        # copy pdf with bar plots from Welch's t-test
        if test == 'welch':
            pdf_file = utils.clean_path(
                p_stages, '{}_{}.pdf'.format(test_cfg.ref_name, exp))

            # name of the pdf in the reference pool
            filename_in_ref_dir = '{}_plots.pdf'.format(test_cfg.ref_name)
            # location to store that file
            place_for_reference = os.path.join(p_ref_csv_files, test,
                                               filename_in_ref_dir)

            log.debug('Copy {} to {}'.format(pdf_file, place_for_reference))
            files_to_commit.append(place_for_reference)
            if not ltestsuite:
                shutil.copy(pdf_file, place_for_reference)

    # change to the repository root so the git commands do not fail
    os.chdir(paths.rootdir)

    # checkout new branch
    if not ltestsuite:
        log.info('Create and checkout new branch {}'.format(new_branch_name))
        git_cmd = 'git checkout -B {}'.format(new_branch_name)
        utils.shell_cmd(git_cmd, py_routine='add_exp_to_ref.py')

        # commit all files collected earlier in this function to git
        for file in files_to_commit:
            git_cmd = 'git add {}'.format(file)
            log.debug(git_cmd)
            utils.shell_cmd(git_cmd, py_routine=__name__)

        log.debug('Commit files {}'.format(files_to_commit))
        commit_message = input('Please type your commit message: ')
        git_cmd = 'git commit -m "{}"'.format(commit_message)
        utils.shell_cmd(git_cmd, py_routine=__name__)

    # Finish
    log.info(
        Style.GREEN('Files are added in the new branch: '
                    '{} in your local git repository.'.format(
                        new_branch_name)))
    log.info('To add the file to the official repository, '
             'please perform the following steps:')
    log.info('1. Push the new branch into the official repo:')
    log.info('   git push --set-upstream origin {}'.format(new_branch_name))
    log.info('2. On the GitHub web interface, open a pull request.')

    log.banner('End add_exp_to_ref for experiment {}'.format(exp))

    return ()
def normalize_data(dataset):
    '''
    Normalize each field of a netCDF file with its field mean and field
    standard deviation using CDO.

    Arguments:
        dataset = filename of the netCDF file to normalize

    returns: filename of the normalized netCDF file
    '''

    log.info('Normalize fields in {} with mean and '
             'standard deviation'.format(dataset))

    data = dataset.replace('.nc', '')

    std_data = '{}_std.nc'.format(data)
    std_data_enlarged = '{}_std_enlarged.nc'.format(data)
    mean_data = '{}_mean.nc'.format(data)
    mean_data_enlarged = '{}_mean_enlarged.nc'.format(data)
    sub_data = '{}_sub.nc'.format(data)
    normalized_data = '{}_normalized.nc'.format(data)

    log.debug('Clean intermediate files for normalization')
    shell_cmd = 'rm {} {} {} {} {} {}'.format(std_data, mean_data,
                                              std_data_enlarged,
                                              mean_data_enlarged, sub_data,
                                              normalized_data)
    utils.shell_cmd(shell_cmd, py_routine=__name__, lowarn=True)

    # field standard deviation and field mean (one value per field)
    cdo_cmd = 'cdo -L fldstd {} {}'.format(dataset, std_data)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)
    cdo_cmd = 'cdo -L fldmean {} {}'.format(dataset, mean_data)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # enlarge mean and standard deviation onto the grid of the dataset
    cdo_cmd = 'cdo -L -enlarge,{} {} {}'.format(dataset, mean_data,
                                                mean_data_enlarged)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)
    cdo_cmd = 'cdo -L -enlarge,{} {} {}'.format(dataset, std_data,
                                                std_data_enlarged)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # subtract the mean and divide by the standard deviation
    cdo_cmd = 'cdo -L sub {} {} {}'.format(dataset, mean_data_enlarged,
                                           sub_data)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)
    cdo_cmd = 'cdo -L div {} {} {}'.format(sub_data, std_data_enlarged,
                                           normalized_data)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    return normalized_data
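# What normalize_data computes, per field (a sketch of the CDO chain above):
# the enlarge steps broadcast the single fldmean/fldstd value back onto the
# grid of `dataset`, so the result is
#     normalized(x) = (x - fldmean(x)) / fldstd(x).
# Example call; the output name is the one the function derives itself:
#
#     ref_norm = normalize_data('ftp_ref_pattern.nc')
#     # -> 'ftp_ref_pattern_normalized.nc'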
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
    Perform standard post-processing using CDO

    Arguments:
        exp               = experiment name
        test              = name of current test to process data
        spinup            = number of files (from beginning of simulation)
                            to ignore due to model spinup
        p_raw_files       = path to raw model output
        raw_f_subfold     = subfolder in p_raw_files with model output
                            [p_raw_files]/[raw_f_subfold]
        p_stages          = directory where processing steps are stored
        f_vars_to_extract = csv file containing the variables to process

    returns: netCDF filename containing the fields as defined
             in f_vars_to_extract
    '''

    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []    # list of files giving an error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, ECHAM specific:
    # if the folder containing the raw files has been deleted,
    # but folder 'Data' already contains global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))

        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))
        if test == 'fldcor' or test == 'rmse':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'multi_annual_means_*.nc'))
        if test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))

        if len(time_series_altern_fold) < 1:
            log.error('Could not find files in alternative directory '
                      '{}'.format(p_altern_timeser_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))
            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')

            if len(time_series_altern_fold) == 1:
                index_ts = 0
            if len(time_series_altern_fold) > 1:
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # if the index is positive, copy the time series and exit
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))

                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leave ECHAM-only code-section! '
                            'You are safe again...')
                return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))
        log.banner('Time for a coffee...')

    # loop over output streams
    for stream in df_vars['file'].unique():

        # extract all lines with file stream
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream f;
        # restart files and '{}m'.format(stream) files (e.g. echamm.nc)
        # are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files) if sum([
                s in os.path.basename(fn)
                for s in ['restart', '{}m'.format(stream)]
            ]) == 0
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronological order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()
        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract from the netCDF files
        # (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # extract the needed variables from the big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                files_error.append(ifile_bsn)

        # merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute the needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        expr_str = ';'.join(df_file.expr.values)
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep track of the output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        [os.remove(f) for f in tmp_selvar_files]
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    [os.remove(f) for f in files_proceed]

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)
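# A minimal usage sketch for standard_postproc; all argument values below are
# placeholders (spinup is the number of leading monthly files to discard):
#
#     ofile, skip = standard_postproc(exp='my_exp',
#                                     test='welch',
#                                     spinup=3,
#                                     p_raw_files='/path/to/raw',
#                                     raw_f_subfold='',
#                                     p_stages=paths.p_stages,
#                                     f_vars_to_extract='vars_my_model.csv')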