def create_big_df(ref_names, list_csv_files, filename_csv=''):
    '''
    Concatenate several ';'-separated csv files into one big dataframe.

    Every existing file is read and tagged with an ``exp`` column holding
    the experiment name derived from the file name (basename without the
    ``.csv`` suffix and without any ``glob_means_`` substring). Entries of
    list_csv_files that are not files are skipped with a warning.

    :param ref_names: not used by this function
    :param list_csv_files: list of csv files for the big dataframe
    :param filename_csv: if not empty, the big dataframe is additionally
                         written to this path (separator ';')
    :return: big dataframe containing the whole data (an empty dataframe
             if no file could be read)
    '''
    suffix = '.csv'

    # collect one dataframe per readable file
    frames = []
    for fexp in list_csv_files:
        if not os.path.isfile(fexp):
            log.warning('csv file is not a file : {}'.format(fexp))
            continue

        # remove the suffix explicitly: str.rstrip('.csv') strips a set of
        # *characters* and would also eat a trailing 'c', 's', 'v' or '.'
        # belonging to the experiment name
        exp = os.path.basename(fexp)
        if exp.endswith(suffix):
            exp = exp[:-len(suffix)]
        exp = exp.replace('glob_means_', '')

        # read the csv file
        df_exp = pd.read_csv(fexp, sep=';')
        df_exp['exp'] = exp
        frames.append(df_exp)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # pd.concat refuses an empty list, hence the explicit fallback.
    df_tot = pd.concat(frames, sort=False) if frames else pd.DataFrame()

    if len(filename_csv) > 0:
        df_tot.to_csv(filename_csv, sep=';')

    return df_tot
def determine_actions_for_data_processing(exp, tests, p_stages, lforce): actions = {'standard_postproc': {}, 'test_postproc': {}} if lforce: log.warning('Redo all processing steps') # see if standard-postprocessing is needed for test in tests: standard_proc_nc = os.path.join( p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp)) if (not os.path.isfile(standard_proc_nc) or lforce): action_needed = True else: action_needed = False actions['standard_postproc'][test] = action_needed test_specific_csv = os.path.join( p_stages, 'test_postproc_{}_{}.csv'.format(test, exp)) if (not os.path.isfile(test_specific_csv) or lforce or actions['standard_postproc'][test]): action_needed = True else: action_needed = False actions['test_postproc'][test] = action_needed log.debug('actions: {}'.format(actions)) return (actions)
def shell_cmd(cmd, py_routine, lowarn=False):
    """
    Send shell command through subprocess.Popen and return its output.

    The command is executed through the shell (shell=True), stdout and
    stderr are captured as text.

    :param cmd: command line to execute
    :param py_routine: name of the calling routine, only used in the
                       debug message written when the command fails
    :param lowarn: True -> a failing command is only reported with
                   log.warning (To use with caution!);
                   False -> it is reported with log.error
    :return: tuple (out_status, out) where out_status is 0 if the command
             returned exit code 0 and 1 otherwise, and out is the
             captured stdout as string
    """
    # send cmd to be executed
    p = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)

    # gets the output of the cmd
    out, err = p.communicate()

    # cmd was executed properly
    if p.returncode == 0:
        return (0, str(out))

    log.debug("{} (shell_cmd): ERROR in the command: \n {}".format(
        py_routine, cmd))

    if lowarn:
        log.warning("Shell command failed, but explicitly "
                    "keep program alive: \n {}".format(err))
    else:
        # NOTE(review): the lowarn description suggests that log.error
        # ends the program - this is not visible from here. In case it
        # returns, the failure is still reported through the status.
        log.error("Error returned: {}".format(err))

    return (1, str(out))
def __init__(self, lev, metric_threshold, color_var):
    '''
    Store the properties of one confidence level.

    :param lev: name of the level, stored as self.level
    :param metric_threshold: threshold of the metric, stored as
                             self.p_thresh
    :param color_var: name of the color, stored as self.col_graph and
                      used to select the text style self.col_txt
                      ('Red', 'DarkRed', 'Orange' or 'Green'; any other
                      value falls back to Style.BLACK with a warning)
    '''
    # text style associated with each known color name
    text_styles = {
        'Red': Style.RED,
        'DarkRed': Style.RED_HIGHL,
        'Orange': Style.ORANGE,
        'Green': Style.GREEN,
    }

    if color_var in text_styles:
        self.col_txt = text_styles[color_var]
    else:
        log.warning('No text color associated with {} - '
                    'setting to BLACK'.format(color_var))
        self.col_txt = Style.BLACK

    # other properties
    self.level = lev
    self.p_thresh = metric_threshold
    self.col_graph = color_var
def print_statistics_of_raw_files(ifiles, stream, exp):
    '''
    Log the years for which raw model output files are available.

    A date string is derived from the basename of every file and parsed
    with the patterns '%Y_%m' and '%Y%m' (first match wins). If the date
    of at least one file cannot be parsed, only a warning is written.

    :param ifiles: list of paths to the raw files
    :param stream: name of the output stream contained in the file names
    :param exp: experiment name contained in the file names
    :return: None
    '''
    date_patterns = ('%Y_%m', '%Y%m')
    stream_chars = '_{}_.nc'.format(stream)
    exp_chars = '{}_'.format(exp)

    years_found = []
    all_dates_parsed = True

    for path in ifiles:
        # NOTE(review): str.strip removes any of the given *characters*
        # from both ends, it does not remove a prefix/suffix. Digits that
        # are part of stream or exp are therefore stripped from the date
        # as well - confirm this is intended.
        stamp = os.path.basename(path)
        stamp = stamp.strip(stream_chars).strip(exp_chars).strip('.')

        # try the patterns in order, stop at the first one that fits
        parsed = None
        for pattern in date_patterns:
            try:
                parsed = datetime.datetime.strptime(stamp, pattern)
            except ValueError:
                continue
            break

        if parsed is None:
            all_dates_parsed = False
        elif parsed.year not in years_found:
            years_found.append(parsed.year)

    if not all_dates_parsed:
        log.warning('Could not determine years '
                    'due to an unkown pattern in the filenames')
        return

    log.info('{} files with model output '
             'found for years:'.format(len(ifiles)))
    for year in years_found:
        log.info(year)
def print_warning_color(df_result, metric_thresholds, metric):
    '''
    Log the variables whose level differs from 'high'.

    The output is framed by two separator lines. If there are variables
    with a level other than 'high', they are written level by level using
    the col_txt of the corresponding entry of metric_thresholds;
    otherwise a message that the experiment is fine is written.

    :param df_result: dataframe with a column 'level'
    :param metric_thresholds: sequence of objects providing the
                              attributes level, col_txt and p_thresh
                              (the p_thresh of the entry with index 1 is
                              quoted in the "fine" message)
    :param metric: name of the metric, only used in the messages
    :return: None
    '''
    separator = ('-------------------------------------------'
                 '-------------------------------------------'
                 '--------------------')

    # dataframe containing only variables a warning has to be printed
    df_warning = df_result[df_result['level'] != 'high']

    log.info(separator)

    if df_warning.size == 0:
        fine_msg = Style.GREEN('The experiment is fine. '
                               'No {} under {} \n')
        log.info(fine_msg.format(metric, metric_thresholds[1].p_thresh))
    else:
        log.warning('The following variables give problematic '
                    '{} : \n'.format(metric))

        # for each level of warning, print the dataframe
        for metric_lev in metric_thresholds:
            # NOTE(review): the threshold object itself (not its .level)
            # is compared with 'high' here - kept as is; entries of level
            # 'high' select no rows of df_warning anyway
            if metric_lev != 'high':
                # dataframe containing only this level of warning
                same_level = df_warning['level'] == metric_lev.level
                df_print_warn = df_warning[same_level]

                if df_print_warn.size > 0:
                    log.info('Confidence is {} for {} '.format(
                        metric_lev.level.upper(), metric))
                    log.info(metric_lev.col_txt(df_print_warn))

    log.info(separator)

    return
def add_line_descr_f(exp, f_exp_descr):
    '''
    Add line for exp exp in file f_exp_descr

    The values of all columns except 'Experiment name' are asked
    interactively. The resulting table is shown to the user, who can
    accept it (y), change the value of a column (n) or abort.

    :param exp: new expirement name
    :param f_exp_descr: file in which the new line has to be added
                        (csv with separator ';'); if it does not exist,
                        the table starts empty with the default columns
    :return: False once the new file has been written
    :raises SystemExit: if the user answers 'abort'
    '''
    log.info('Adding line {} in the file {}:'.format(exp, f_exp_descr))

    # open file in dataframe
    if not os.path.isfile(f_exp_descr):
        # create dataframe
        cols_exp_descr_f = [
            'Experiment name',
            'Platform',
            'OS',
            'Compiler (with version)',
            'Optimisation level (-OX)',
            '-fast-transcendentals (y/n)',
            '-no-prec-sqrt (y/n)',
            '-no-prec-div (y/n)',
            'welch (y/n)',
            'fldcor (y/n)',
            'rmse (y/n)',
            'emi (y/n)',
            'Date of experiment (month yyyy)',
        ]
        # the new dataframe has to be kept: without the assignment every
        # later use of df_exp_descr fails with a NameError
        df_exp_descr = pd.DataFrame(columns=cols_exp_descr_f)
    else:
        df_exp_descr = pd.read_csv(f_exp_descr, sep=';')

    # collect information from user
    log.banner('Please give the following informations '
               'about your experiment')
    dict_line = {'Experiment name': exp}
    for col_name in df_exp_descr.columns:
        if col_name != 'Experiment name':
            # ask the user for info
            dict_line[col_name] = input('{} : '.format(col_name))

    # amend the information if needed
    while True:
        # new dataframe containing new line for exp
        # (DataFrame.append was removed in pandas 2.0)
        df_exp_descr_new = pd.concat(
            [df_exp_descr, pd.DataFrame([dict_line])],
            ignore_index=True,
            sort=False)

        log.banner('Here is the content of the description '
                   'file including your new experiment.')
        log.info(df_exp_descr_new)

        answ_chg = input('Is the new file right ? (y/n/abort).\n'
                         'If you type n, you will be able to change '
                         'column values\n'
                         'If you type abort, the process of adding '
                         'the experiment {} to the reference is stoped.\n'
                         '(y/n/abort) : '.format(exp)).upper()

        if answ_chg == 'Y':
            # save new file
            df_exp_descr_new.to_csv(f_exp_descr, sep=';', index=False)
            # get out of the loop
            return False

        if answ_chg == 'N':
            answ_col = input('Which column field you want to change ?')
            if answ_col in df_exp_descr.columns:
                dict_line[answ_col] = input('{} : '.format(answ_col))
            else:
                log.warning('{} not in columns!'.format(answ_col))
                log.info('Columns are {}\n'.format(
                    list(df_exp_descr.columns)))
        elif answ_chg == 'ABORT':
            # the builtin exit() only exists when the site module is
            # loaded; raising SystemExit is equivalent and always works
            raise SystemExit

        # any other answer: show the table and ask again
def main(exp, actions, tests, spinup, p_raw_files, p_stages, raw_f_subfold, f_vars_to_extract, f_pattern_ref): log.banner('Start standard-postprocessing') results_data_processing = {} processed_netcdf_filename = {} skip_next_step = {} # init in case standard_postproc is skipped for test in tests: skip_next_step[test] = False for test in tests: if (actions['standard_postproc'][test]): processed_netcdf_filename[test], skip_next_step[test] = \ standard_postproc(exp, test=test, spinup=spinup, p_raw_files=p_raw_files, raw_f_subfold=raw_f_subfold, p_stages=p_stages, f_vars_to_extract=f_vars_to_extract) else: log.info('Data already processed for test {}'.format(test)) processed_netcdf_filename[test] = utils.clean_path( p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp)) log.banner('End standard-postprocessing') log.banner('Start conversion from NetCDF to dataframe') if 'welch' in tests: test = 'welch' if (actions['test_postproc'][test] and not skip_next_step[test]): # transforming netcdf timeseries into csv file results_data_processing[test] = timeser_proc_nc_to_df( exp, filename=processed_netcdf_filename[test], p_stages=p_stages) else: log.info('Processing for test {} already done'.format(test)) f_csv = os.path.join(p_stages, 'test_postproc_{}_{}.csv'.format(test, exp)) results_data_processing[test] = pd.read_csv(f_csv, sep=';') else: log.warning("Skip Welch's-Test") if 'emi' in tests: test = 'emi' if (actions['test_postproc'][test] and not skip_next_step[test]): results_data_processing[test] = emis_proc_nc_to_df( exp, filename=processed_netcdf_filename[test], p_stages=p_stages) else: log.info('Processing for test {} already done'.format(test)) f_csv = os.path.join(p_stages, 'test_postproc_{}_{}.csv'.format(test, exp)) results_data_processing[test] = pd.read_csv(f_csv, sep=';') else: log.warning('Skip emission test') if 'fldcor' in tests: test = 'fldcor' if (actions['test_postproc'][test] and not skip_next_step[test]): f_pattern_ref = 
download_ref_to_stages_if_required( f_pattern_ref, p_stages, f_vars_to_extract, test) results_data_processing[test] = pattern_proc_nc_to_df( exp, filename=processed_netcdf_filename[test], p_stages=p_stages, reference=f_pattern_ref) else: log.info('Processing for test {} already done'.format(test)) f_csv = os.path.join(p_stages, 'test_postproc_{}_{}.csv'.format(test, exp)) results_data_processing[test] = pd.read_csv(f_csv, sep=';') else: log.warning('Skip pattern correlation test') if 'rmse' in tests: test = 'rmse' if (actions['test_postproc'][test] and not skip_next_step[test]): test = 'rmse' f_pattern_ref = download_ref_to_stages_if_required( f_pattern_ref, p_stages, f_vars_to_extract, test) results_data_processing[test] = rmse_proc_nc_to_df( exp, filename=processed_netcdf_filename[test], p_stages=p_stages, reference=f_pattern_ref) else: log.info('Processing for test {} already done'.format(test)) f_csv = os.path.join(p_stages, 'test_postproc_{}_{}.csv'.format(test, exp)) results_data_processing[test] = pd.read_csv(f_csv, sep=';') else: log.warning('Skip Rmse test') log.banner('End conversion from NetCDF to dataframe') return (results_data_processing)
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
    Perfom standard post-processing using cdo

    Arguments:
        exp = experiment name
        test = name of current test to process data
        spinup = number of files (from begining of simulation)
                 to ignore du to model spinup
        p_raw_files = path to raw model output
        raw_f_subfold = subfolder in p_raw_files with model output
                        [p_raw_files]/[raw_f_subfold]
        p_stages = directory where processing steps are stored
        f_vars_to_extract = csv file containg the variables to proceed

    returns:
        tuple (ofile_tot, skip_next_steps):
        ofile_tot = netCDF filename containing the fields as defined in
                    f_vars_to_extract
        skip_next_steps = True only if the alternative folder was used
                          for test 'welch' (the conversion to a dataframe
                          has then already been done here)

    Intermediate files (tmp_extract_*, tmp_<exp>_<stream>.nc and
    <exp>_<stream>.nc) are written with relative names, i.e. into the
    current working directory, and removed again.
    '''
    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process:
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []  # list files giving error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, echam specific :
    # if the folder containing the Raw files have been deleted,
    # but folder 'Data' contains already global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))
        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')

        # default for tests without a known file pattern, so that they
        # end in the error message below instead of a NameError
        time_series_altern_fold = []
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))
        elif test == 'fldcor' or test == 'rmse':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold,
                             'multi_annual_means_*.nc'))
        elif test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))

        if len(time_series_altern_fold) < 1:
            # name the directory that was searched (the list of matches
            # is empty here and carries no information)
            log.error('Could not find files in alternative directory '
                      '{}'.format(p_altern_timeser_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))

            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')

            if len(time_series_altern_fold) == 1:
                index_ts = 0
            else:
                # several candidates: let the user choose
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # If index positive, copy the time serie and exit
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))
                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leave ECHAM-only code-section! '
                            'You are save again...')
                return (ofile_tot, skip_next_steps)

        # NOTE(review): if no alternative file was found or chosen, the
        # processing below is executed although p_raw_folder does not
        # exist - confirm that this fall-through is intended.

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))

    log.banner('Time for a coffee...')

    # loop over output stream
    for stream in df_vars['file'].unique():

        # extract all lines with file f
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream f
        # restart files and {}m.format(stream) e.g. echamm.nc
        # files are not considered
        # NOTE(review): the code excludes names containing the literal
        # 'stream', not 'restart' - verify against the file naming.
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        excluded = ['stream', '{}m'.format(stream)]
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files)
            if not any(s in os.path.basename(fn) for s in excluded)
        ]
        if not ifiles:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronoligcal order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()

        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream f
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract form netcdf
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # Extract variables needed from big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            # a file that cannot be processed is only recorded
            out_status, _ = utils.shell_cmd(cdo_cmd,
                                            py_routine=__name__,
                                            lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                files_error.append(ifile_bsn)

        # Merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        expr_str = ';'.join(df_file.expr.values)
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep trace of output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        for tmp_file in tmp_selvar_files:
            os.remove(tmp_file)
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    for stream_file in files_proceed:
        os.remove(stream_file)

    # Finish
    if files_error:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)
def main(new_exp, results_data_processing, tests, p_stages, p_ref_csv_files, ltestsuite, f_vars_to_extract): df_exp = {} df_ref = {} p_csv_files = {} testresult_csv = {} df_result = {} for test in tests: log.info('Prepare references for test {}'.format(test)) test_cfg = get_config_of_current_test(test) results_data_processing[test]['exp'] = new_exp # list of paths to all csv files p_csv_files[test] = glob.glob( os.path.join(p_ref_csv_files, test, '{}_*csv'.format(test_cfg.ref_name))) if len(p_csv_files[test]) == 0: log.error('No reference files found in {}'.format(p_ref_csv_files)) log.debug('{} reference(s) found for test \ {}'.format(len(p_csv_files[test]), test)) # create big dataframe containing all reference exps df_ref[test] = create_big_df(test_cfg.ref_name, list_csv_files=p_csv_files[test]) # Exclude all the non-desired variables (1) var from file, 2) exp) full_p_f_vars = os.path.join(paths.p_f_vars_proc, test, f_vars_to_extract) vars_to_analyse = list( pd.read_csv(full_p_f_vars, sep=',')['var'].values) vars_to_analyse.append('exp') try: df_ref[test] = df_ref[test][vars_to_analyse] except KeyError as e: log.warning(e) log.error('Variables defined in {} are not contained in reference \ {}'.format(utils.rel_path(f_vars_to_extract), utils.rel_path(p_ref_csv_files))) df_exp[test] = results_data_processing[test][vars_to_analyse] log.info('References for test {} prepared'.format(test)) testresult_csv[test] = os.path.join( p_stages, 'result_{}_{}.csv'.format(test, new_exp)) if test == 'welch': log.banner('') log.banner("Perform Welch's t-test for each variable") log.banner('') df_result[test] = welch_test( df_a=df_ref[test], df_b=df_exp[test], filename_student_test=testresult_csv[test]) df_result[test]['p-value [%]'] = df_result[test]['p-value'] * 100. 
if test == 'fldcor': log.banner('') log.banner("Perform fldcor test for each variable") log.banner('') df_result[test] = pattern_correlation(df_exp[test], test_cfg) if test == 'emi': log.banner('') log.banner("Perform emission test for each variable") log.banner('') df_result[test] = emissions(df_exp[test], df_ref[test], test_cfg) if test == 'rmse': log.banner('') log.banner("Perform rmse test for each variable") log.banner('') df_result[test] = rmse(df_exp[test], test_cfg) df_result[test] = sort_level_metric(df_result[test], test_cfg.metric_threshold, test_cfg.metric) df_result[test] = add_color_df_result(df_result[test], test_cfg.metric_threshold) print_warning_color(df_result[test], test_cfg.metric_threshold, test_cfg.metric) if ltestsuite: for test in tests: test_cfg = get_config_of_current_test(test) utils.exit_if_testresult_is_bad(test, df_result[test], test_cfg.metric_threshold, test_cfg.metric) return df_result, df_ref