def download_ref_to_stages_if_required(f_pattern_ref, p_stages,
                                       f_vars_to_extract, test):
    '''
    Return the path of the reference pattern file for *test*.

    If no user-defined reference file was passed to process_data
    (i.e. f_pattern_ref still equals paths.rootdir), the reference is
    downloaded from the ftp-server into p_stages and the path of the
    downloaded file is returned instead.
    '''
    if f_pattern_ref != paths.rootdir:
        # a user-defined reference file was given; keep it untouched
        log.info('Using user-defined reference file for test '
                 '{}'.format(test))
        return f_pattern_ref

    log.info('Download reference file from ftp-server')

    # the file holding the ftp-link is derived from the variables file:
    # vars_XXX.csv -> ftp_XXX.txt
    link_filename = f_vars_to_extract.replace('.csv', '.txt').replace(
        'vars_', 'ftp_')
    link_folder = os.path.join(paths.p_f_vars_proc, test)
    link_file = utils.clean_path(link_folder, link_filename)

    downloaded_ref = os.path.join(p_stages, 'ftp_ref_pattern.nc')
    cmd = ('wget --input-file={} '
           '--output-document={}'.format(link_file, downloaded_ref))
    log.debug('ftp-command: {}'.format(cmd))
    utils.shell_cmd(cmd, py_routine=__name__)

    return downloaded_ref
def normalize_data(dataset):
    '''
    Normalize every field of the netCDF file *dataset* to zero mean and
    unit standard deviation using a chain of cdo commands.

    Intermediate files are named after the dataset; possibly existing
    leftovers are removed (best-effort) before being recreated.

    :param dataset: filename of the netCDF file to normalize
    :return: filename of the normalized netCDF file
    '''
    log.info('Normalize fields in {} with mean and '
             'standard deviation'.format(dataset))

    base = dataset.replace('.nc', '')
    std_data = '{}_std.nc'.format(base)
    std_data_enlarged = '{}_std_enlarged.nc'.format(base)
    mean_data = '{}_mean.nc'.format(base)
    mean_data_enlarged = '{}_enlarged.nc'.format(base)
    sub_data = '{}_sub.nc'.format(base)
    normalized_data = '{}_normalized.nc'.format(base)

    log.debug('Clean intermediate files for normalization')
    shell_cmd = 'rm {} {} {} {} {} {}'.format(std_data, mean_data,
                                              std_data_enlarged,
                                              mean_data_enlarged, sub_data,
                                              normalized_data)
    # lowarn: the intermediate files may legitimately not exist yet
    utils.shell_cmd(shell_cmd, py_routine=__name__, lowarn=True)

    # field std and mean, blown up to the grid of the original dataset,
    # then (dataset - mean) / std
    cdo_cmds = [
        'cdo -L fldstd {} {}'.format(dataset, std_data),
        'cdo -L fldmean {} {}'.format(dataset, mean_data),
        'cdo -L -enlarge,{} {} {}'.format(dataset, mean_data,
                                          mean_data_enlarged),
        'cdo -L -enlarge,{} {} {}'.format(dataset, std_data,
                                          std_data_enlarged),
        'cdo -L sub {} {} {}'.format(dataset, mean_data_enlarged, sub_data),
        'cdo -L div {} {} {}'.format(sub_data, std_data_enlarged,
                                     normalized_data),
    ]
    for cdo_cmd in cdo_cmds:
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

    return normalized_data
def rmse_proc_nc_to_df(exp, filename, reference, p_stages):
    '''
    Process data for the rmse test and return them as a dataframe.

    Arguments:
        exp = experiment name
        filename = filename of the netCDF returned by function
                   standard_postproc
        reference = filename to the reference
        p_stages = directory where processing steps are stored

    returns: dataframe with processed data for pattern correlation test
    '''
    test = 'rmse'

    rmse_interim = 'test_postproc_intermediate_{}_{}.nc'.format(test, exp)
    rmse_filename = 'test_proc_{}_{}.nc'.format(test, exp)

    # collapse time and the vertical dimension
    cdo_cmd = 'cdo -L timmean -yearmean -vertsum {} {}'.format(
        filename, rmse_interim)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # normalize both fields so the RMSE is scale-independent
    reference_normalized = normalize_data(reference)
    rmse_interim_normalized = normalize_data(rmse_interim)

    # list of variables in the timeserie netcdf file to drop
    # (not to put into the dataframe)
    vars_to_drop = []

    log.info('Compute root mean square error '
             'between {} and {} (reference)'.format(rmse_interim_normalized,
                                                    reference_normalized))
    cdo_cmd = 'cdo -L sqrt -fldmean -sqr -sub {} {} {}'.format(
        rmse_interim_normalized, reference_normalized, rmse_filename)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # open dataset
    data = xr.open_dataset(rmse_filename)

    # Delete variables
    # useless variable time_bnds
    if ('time_bnds' in data.keys()):
        data = data.drop('time_bnds')
    # 3D vars
    if len(vars_to_drop) > 0:
        # BUG FIX: xarray's drop returns a *new* dataset; the original
        # discarded the result, so the variables were never dropped
        data = data.drop(labels=vars_to_drop)

    # transforms into dataframe
    df_data = data.to_dataframe()

    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')
    log.info('CSV file can be found here: {}'.format(csv_filename))
    log.info('Finished {} for file {}'.format(__name__, rmse_filename))

    return (df_data)
def welch_test(df_a, df_b, filename_student_test=''):
    '''
    Perform Welch t-test for each variable fo dataframe df_b

    :param df_a: reference datframe, containing big sample
    :param df_b: datframe containing data to test
    :param filename_student_test: filename for writing result of t-test
                                  result into a csv file
    :return: result of the student test in a dataframe
    '''
    rows = []
    for var in df_b.keys():
        # the 'exp' column labels the experiment; it is not a variable
        if 'exp' in var:
            continue
        log.debug("Welch's t-test for {}".format(var))
        # Welch's t-test (unequal variances, NaNs ignored)
        t, p = stats.ttest_ind(df_a[var],
                               df_b[var],
                               equal_var=False,
                               nan_policy='omit')
        rows.append({'variable': var, 't-value': t, 'p-value': p})

    # assemble the result dataframe and sort it by p-value
    df_result = pd.DataFrame(rows, columns=['variable', 't-value', 'p-value'])
    df_result.sort_values(by=['p-value'], inplace=True)

    # if a filename is given, write the test result into that csv file
    if len(filename_student_test) > 0:
        log.info('Write result to {}'.format(filename_student_test))
        df_result.to_csv(filename_student_test, sep=',')

    return (df_result)
def print_statistics_of_raw_files(ifiles, stream, exp):
    '''
    Log how many raw files were found and which years they cover.

    The year is derived from the date part of each filename, which is
    assumed to look like [exp]_[stream]_[date].nc with the date given as
    %Y_%m or %Y%m (assumption inferred from the original parsing -- TODO
    confirm against real filenames).  If any filename cannot be parsed,
    only a generic warning is printed.

    :param ifiles: list of paths to the raw model-output files
    :param stream: name of the output stream
    :param exp: experiment name
    '''
    datepatterns = ['%Y_%m', '%Y%m']
    years_found = []
    no_summary = False

    for file in ifiles:
        basename = os.path.basename(file)

        # BUG FIX: the original used str.strip with multi-character
        # arguments; strip treats its argument as a *set of characters*
        # and can eat arbitrary leading/trailing characters (including
        # digits of the date).  Remove known prefixes/suffix instead.
        datestring = basename
        if datestring.endswith('.nc'):
            datestring = datestring[:-len('.nc')]
        for prefix in ('{}_{}_'.format(exp, stream),
                       '{}_'.format(exp),
                       '{}_'.format(stream)):
            if datestring.startswith(prefix):
                datestring = datestring[len(prefix):]
        # drop any leftover separators at the edges
        datestring = datestring.strip('_.')

        date = None
        for pattern in datepatterns:
            try:
                date = datetime.datetime.strptime(datestring, pattern)
                break
            except ValueError:
                continue

        if date is None:
            no_summary = True
        elif date.year not in years_found:
            years_found.append(date.year)

    if no_summary:
        log.warning('Could not determine years '
                    'due to an unkown pattern in the filenames')
    else:
        log.info('{} files with model output '
                 'found for years:'.format(len(ifiles)))
        for year in years_found:
            log.info(year)
def print_warning_color(df_result, metric_thresholds, metric):
    '''
    Print the variables for which the test metric is problematic,
    grouped by confidence level.

    :param df_result: dataframe with test results incl. a 'level' column
    :param metric_thresholds: list of threshold objects; each is assumed
                              to provide .level, .p_thresh and col_txt()
                              (inferred from usage -- TODO confirm)
    :param metric: name of the metric (used in log messages)
    '''
    # dataframe containing only variables a warning has to be printed
    df_warning = df_result[df_result['level'] != 'high']

    log.info('-------------------------------------------'
             '-------------------------------------------'
             '--------------------')
    if df_warning.size > 0:
        log.warning('The following variables give problematic '
                    '{} : \n'.format(metric))

        # for each level of warning, print the dataframe
        for metric_lev in metric_thresholds:
            # BUG FIX: the original compared the threshold *object*
            # against the string 'high' (always unequal); compare its
            # level attribute instead.  Behavior is unchanged because
            # 'high' rows were already filtered out of df_warning.
            if metric_lev.level != 'high':
                # dataframe containing only this level of warning
                df_print_warn = df_warning[df_warning.level ==
                                           metric_lev.level]
                # print
                if df_print_warn.size > 0:
                    log.info('Confidence is {} for {} '.format(
                        metric_lev.level.upper(), metric))
                    log.info(metric_lev.col_txt(df_print_warn))
    else:
        log.info(
            Style.GREEN('The experiment is fine. '
                        'No {} under {} \n').format(
                            metric, metric_thresholds[1].p_thresh))
    log.info('-------------------------------------------'
             '-------------------------------------------'
             '--------------------')
    return
def print_warning_if_testresult_is_bad(test, df_result, metric_thresholds,
                                       metric):
    '''
    Print a colored verdict for *test*: a red warning when any variable
    ended up at confidence level 'very low', a green OK otherwise.

    metric_thresholds and metric are accepted for interface consistency
    with the other reporting helpers but are not used here.
    '''
    separator = ('-----------------------------------------'
                 '-----------------------------------------')
    bad_results = df_result[df_result['level'] == 'very low']

    log.info(separator)
    log.info(test)
    if bad_results.size > 0:
        log.info(
            Style.RED('Results are bad! \n'
                      'It is not recommended to add this '
                      'test to the reference pool'))
    else:
        log.info(Style.GREEN('Results OK'))
    log.info(separator)
    return
def timeser_proc_nc_to_df(exp, filename, p_stages, already_a_timeseries=False):
    '''
    Process data for the Welch's test and return them as a dataframe.

    Arguments:
        exp = experiment name
        filename = filename of the netCDF returned by function
                   standard_postproc
        p_stages = directory where processing steps are stored
        already_a_timeseries = skip the cdo step (filename is already a
                               global annual mean timeseries)

    returns: dataframe with processed data for welchstest
    '''
    test = 'welch'

    if not already_a_timeseries:
        timeser_filename = 'test_postproc_{}_{}.nc'.format(test, exp)
        # global annual mean timeseries
        cdo_cmd = 'cdo -L yearmean -fldmean -vertsum {} {}'.format(
            filename, timeser_filename)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)
    else:
        log.debug('Skipping CDO-processing step')
        timeser_filename = filename

    # list of variables in the timeserie netcdf
    # file to drop (not to put into the dataframe)
    vars_to_drop = []

    log.info('Processing netCDF: {}'.format(timeser_filename))

    # open dataset
    data = xr.open_dataset(timeser_filename)

    # Delete variables
    # useless variable time_bnds
    if ('time_bnds' in data.keys()):
        data = data.drop('time_bnds')
    # 3D vars
    if len(vars_to_drop) > 0:
        # BUG FIX: xarray's drop returns a *new* dataset; the original
        # discarded the result, so the variables were never dropped
        data = data.drop(labels=vars_to_drop)

    # removed degenerated dimensions
    data = data.squeeze(drop=True)

    # transforms into dataframe
    df_data = data.to_dataframe()

    # export in a file
    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')
    log.info('CSV file can be found here: {}'.format(csv_filename))
    log.info('Finished {} for file {}'.format(__name__, timeser_filename))

    return (df_data)
def exit_if_testresult_is_bad(test, df_result, metric_thresholds, metric):
    '''
    Log a verdict for *test*: variables whose confidence level is
    neither 'high' nor 'middle' make the result bad and are reported
    via log.error.

    metric_thresholds and metric are accepted for interface consistency
    with the other reporting helpers but are not used here.
    '''
    separator = ('-----------------------------------------'
                 '-----------------------------------------')
    # keep only rows that are neither 'high' nor 'middle' confidence
    bad = df_result[(df_result['level'] != 'high')
                    & (df_result['level'] != 'middle')]

    log.info(separator)
    log.info(test)
    if bad.size > 0:
        log.error(Style.RED('Results are bad!'))
    else:
        log.info(Style.GREEN('Results OK'))
    log.info(separator)
    return
def main(new_exp, p_raw_files, raw_f_subfold, p_stages, p_ref_csv_files,
         wrk_dir, f_vars_to_extract, f_pattern_ref, tests, spinup, lclean,
         ltestsuite, lverbose):
    '''
    Top-level driver of the sanity checker: process the raw model output
    of new_exp, run all requested tests against the reference pool and
    optionally add the experiment to the reference pool.
    '''
    # init logger
    logger_config.init_logger(lverbose, __file__)

    log.banner('Start sanity checker')

    # make all paths from user to absolute paths
    wrk_dir = utils.abs_path(wrk_dir)
    p_stages = utils.abs_path(p_stages)
    p_ref_csv_files = utils.abs_path(p_ref_csv_files)
    f_pattern_ref = utils.abs_path(f_pattern_ref)

    # create directories
    os.makedirs(p_stages, exist_ok=True)
    os.makedirs(wrk_dir, exist_ok=True)

    # go to working directory
    os.chdir((wrk_dir))
    log.info('Working directory is {}'.format(wrk_dir))

    # data processing takes a while, check that no step is done twice
    actions = utils.determine_actions_for_data_processing(
        new_exp, tests, p_stages, lclean)

    # create dataframe out of raw data
    results_data_processing = process_data.main(
        new_exp,
        actions,
        tests,
        spinup,
        p_raw_files=p_raw_files,
        p_stages=p_stages,
        raw_f_subfold=raw_f_subfold,
        f_vars_to_extract=f_vars_to_extract,
        f_pattern_ref=f_pattern_ref)

    # run the actual tests against the reference pool
    results_test, references = perform_test.main(
        new_exp,
        results_data_processing=results_data_processing,
        p_stages=p_stages,
        tests=tests,
        p_ref_csv_files=p_ref_csv_files,
        ltestsuite=ltestsuite,
        f_vars_to_extract=f_vars_to_extract)

    # plot mean/std per variable for the Welch's test (reference +
    # new experiment appended as the last row)
    if 'welch' in tests:
        test = 'welch'
        plt.plt_welchstest(
            references[test].append(results_data_processing[test],
                                    sort=False),
            new_exp,
            results_test[test],
            p_stages=p_stages)

    # Add experiment to the reference pool
    #--------------------------------------------------------------------
    log.banner('')
    log.banner('Check results again before adding to reference pool')
    log.banner('')
    for test in tests:
        test_cfg = test_config.get_config_of_current_test(test)
        # NOTE(review): called via utils here although a function of the
        # same name exists in this file -- presumably the same helper
        # re-exported; confirm
        utils.print_warning_if_testresult_is_bad(test,
                                                 results_test[test],
                                                 test_cfg.metric_threshold,
                                                 test_cfg.metric)

    # in testsuite mode the experiment is always added; otherwise ask
    if ltestsuite:
        asw = 'YES'
    else:
        asw = input('If you are happy with this experiment, '
                    'do you want to add it to the reference pool ?'
                    '(yes/[No])\n')

    if (asw.strip().upper() == 'YES') or (asw.strip().upper() == 'Y'):
        add_exp_to_ref.main(new_exp,
                            tests,
                            p_stages=p_stages,
                            ltestsuite=ltestsuite,
                            p_ref_csv_files=p_ref_csv_files)
    else:
        args_for_manual_execution = \
            utils.derive_arguments_for_add_exp_to_ref(new_exp,
                                                      tests,
                                                      p_stages,
                                                      p_ref_csv_files)
        log.info('The experiment {} is NOT added to '
                 'the reference pool \n'.format(new_exp))
        # NOTE(review): the format string below has one placeholder but
        # receives two arguments; the second is ignored at runtime
        log.info('If you want to add the experiment {} '
                 'to the reference pool later on, type '
                 'the following line when you are ready:'.format(
                     new_exp, new_exp))
        log.info('')
        log.info(
            'python add_exp_to_ref.py {}'.format(args_for_manual_execution))

    log.banner('')
    log.banner('Sanity test finished')
    log.banner('')
def plt_welchstest(df_tot, new_exp, df_result, p_stages=paths.p_stages):
    '''
    Plot mean and standard deviation of every tested variable into a
    multi-page pdf (nlin x ncol subplots per page).

    :param df_tot: Dataframe containing containing all global annual mean
                   (reference & new_exp)
    :param new_exp: Name of the new exp which is analysed
    :param df_result: Dataframe containing the results of the Welch's test
    :param p_stages : path to save the figures
    :return: None, but the figure is saved in p_stages
    '''
    # simple statistics, sort by exp
    # to be sure the order is the same in both dataframe
    df_tot_mean = df_tot.groupby(['exp']).mean()\
        .sort_values(['exp']).reset_index()
    # for std, the panda std has a bug
    # cf https://github.com/pandas-dev/pandas/issues/16799
    df_tot_std = df_tot.groupby(['exp']).std()\
        .sort_values(['exp']).reset_index()

    # ensure new exp to be the last line
    iexp = df_tot_mean.index[df_tot_mean['exp'] == new_exp]
    new_order = df_tot_mean.index.drop(iexp).append(iexp)
    df_tot_mean = df_tot_mean.reindex(new_order)
    df_tot_std = df_tot_std.reindex(new_order)

    # number col/rows per page
    nlin = 3
    ncol = 3
    nplot = nlin * ncol

    # needed for multipage pdf file
    filename_mean_std_figures = 'glob_means_{}.pdf'.format((new_exp))
    p_pdf_file_var = os.path.join(p_stages, filename_mean_std_figures)
    pp = PdfPages(p_pdf_file_var)

    # loop over all variables
    for ivar, var in enumerate(df_result.variable):
        log.debug('Create plot for {}'.format(var))

        # position of this plot on the current page
        iplot = np.mod(ivar, nplot)
        # set the plotting frame (new page)
        if (iplot == 0):
            fig, plt_nbr = plt.subplots(nlin,
                                        ncol,
                                        sharex='col',
                                        figsize=(12, 12))
        # subplot coordinate
        icol = np.mod(iplot, ncol)
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in
        # 1.24 (AttributeError on recent NumPy); use the builtin int
        ilin = int(np.floor(iplot / ncol))
        # actual plot
        act_plt = plt_nbr[ilin, icol]
        # x-axis: one tick per experiment
        xaxis = np.arange(df_tot_mean.shape[0])

        # index of the new experiment (last non-missing entry)
        nmisval = df_tot_mean[var].isna().sum()
        i_newexp = len(xaxis) - nmisval - 1

        # define colors
        colors = len(xaxis) * ['k']
        # NOTE(review): this slice-assignment replaces the tail of the
        # list by the single element 'k'; all entries are black anyway,
        # so the rendered colors are unchanged -- confirm intent
        colors[i_newexp:len(xaxis)] = 'k'
        # define thickness (new experiment drawn thicker)
        thickness = len(xaxis) * [1.5]
        thickness[i_newexp] = 3

        # plot mean and std for each variable
        act_plt.errorbar(xaxis,
                         df_tot_mean[var],
                         yerr=df_tot_std[var],
                         fmt='+k',
                         ecolor=colors,
                         elinewidth=thickness)

        # plot average reference experiments (grey band)
        m_ref = df_tot[df_tot.exp != new_exp][var].mean()
        s_ref = df_tot[df_tot.exp != new_exp][var].std()
        act_plt.axhline(m_ref, c='k')
        act_plt.fill_between([-1, max(xaxis) - 0.5],
                             m_ref - s_ref,
                             m_ref + s_ref,
                             facecolor='grey',
                             alpha=0.6)

        # plot color background
        color_graph = df_result.loc[df_result.variable ==
                                    var]['col-graph'].values[0]
        act_plt.set_facecolor('{}'.format(color_graph))

        # label settings
        act_plt.xaxis.set_ticks(xaxis)
        act_plt.set_xticklabels(df_tot_mean['exp'], rotation=90)

        # title settings
        pvalue = float(df_result[df_result.variable == var]['p-value'])
        act_plt.set_title('{}, p-value = {:.2%}'.format(var, pvalue))

        # save full page
        if (iplot == (nplot - 1)):
            pp.savefig()

    # save and close odf file file
    # NOTE(review): when the variable count is an exact multiple of
    # nplot the last page was already saved above and is written a
    # second time here -- confirm whether a duplicate page is intended
    fig.savefig(pp, format='pdf')
    pp.close()
    log.info('Detailed plots of mean and standard deviation per variable '
             'can be found in the file {}'.format(p_pdf_file_var))
def add_line_descr_f(exp, f_exp_descr):
    '''
    Add line for exp exp in file f_exp_descr

    :param exp: new expirement name
    :param f_exp_descr: file in which the new line has to be added

    return: None
    '''
    log.info('Adding line {} in the file {}:'.format(exp, f_exp_descr))

    # open file in dataframe
    if not os.path.isfile(f_exp_descr):
        # create dataframe
        cols_exp_descr_f = [
            'Experiment name', 'Platform', 'OS', 'Compiler (with version)',
            'Optimisation level (-OX)', '-fast-transcendentals (y/n)',
            '-no-prec-sqrt (y/n)', '-no-prec-div (y/n)', 'welch (y/n)',
            'fldcor (y/n)', 'rmse (y/n)', 'emi (y/n)',
            'Date of experiment (month yyyy)'
        ]
        # BUG FIX: the freshly created dataframe was never assigned, so
        # df_exp_descr was undefined (NameError) whenever the
        # description file did not exist yet
        df_exp_descr = pd.DataFrame(columns=cols_exp_descr_f)
    else:
        df_exp_descr = pd.read_csv(f_exp_descr, sep=';')

    # collect information from user
    log.banner('Please give the following informations '
               'about your experiment')
    dict_line = {'Experiment name': exp}
    for col_name in df_exp_descr.keys():
        if col_name != 'Experiment name':
            # ask the user for info
            dict_line[col_name] = input('{} : '.format(col_name))

    # amend the information if needed
    while True:
        # new dataframe containing new line for exp
        df_exp_descr_new = df_exp_descr.append(dict_line, ignore_index=True)
        log.banner('Here is the content of the description '
                   'file including your new experiment.')
        log.info(df_exp_descr_new)
        answ_chg = input('Is the new file right ? (y/n/abort).\n'
                         'If you type n, you will be able to change '
                         'column values\n'
                         'If you type abort, the process of adding '
                         'the experiment {} to the reference is stoped.\n'
                         '(y/n/abort) : '
                         ''.format(exp))
        if answ_chg.upper() == 'Y':
            # save new file
            df_exp_descr_new.to_csv(f_exp_descr, sep=';', index=False)
            # get out of the loop
            return False
        elif answ_chg.upper() == 'N':
            answ_col = input('Which column field you want to change ?')
            if answ_col in df_exp_descr.keys():
                dict_line[answ_col] = input('{} : '.format(answ_col))
            else:
                log.warning('{} not in columns!'.format(answ_col))
                log.info('Columns are {}\n'.format(list(
                    df_exp_descr.columns)))
        elif answ_chg.upper() == 'ABORT':
            exit()

    # unreachable (kept from original): the while-loop only exits via
    # return or exit()
    return ()
def main(exp,
         tests,
         p_stages=paths.p_stages,
         p_ref_csv_files=paths.p_ref_csv_files,
         ltestsuite=False,
         lverbose=False):
    '''
    Add the csv files (and the Welch's-test pdf) of experiment *exp* to
    the reference pool and commit them on a new git branch.

    :param exp: experiment name
    :param tests: list of test names whose results are added
    :param p_stages: directory holding the processed csv/pdf files
    :param p_ref_csv_files: directory of the reference pool
    :param ltestsuite: dry-run mode; no copying, no git, no user input
    :param lverbose: verbosity flag (unused here; kept for interface)
    '''
    # initialisation
    new_branch_name = 'test_add_{}'.format(exp)
    files_to_commit = []

    # fill up file 'Exps_description.csv' with additional
    # information via user input
    f_exp_descr = os.path.join(p_ref_csv_files, 'Exps_description.csv')
    if not ltestsuite:
        add_line_descr_f(exp=exp, f_exp_descr=f_exp_descr)
        files_to_commit.append(f_exp_descr)

    for test in tests:
        test_cfg = get_config_of_current_test(test)
        csv_file = utils.clean_path(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        # what is the filename in the reference pool
        filename_in_ref_dir = '{}_{}.csv'.format(test_cfg.ref_name, exp)
        # what is the location to store that file
        place_for_reference = os.path.join(p_ref_csv_files, test,
                                           filename_in_ref_dir)
        log.debug('Copy {} to {}'.format(csv_file, place_for_reference))
        if not ltestsuite:
            shutil.copy(csv_file, place_for_reference)
        files_to_commit.append(place_for_reference)

        # copy pdf with bar-plots from Welch's-test
        if test == 'welch':
            pdf_file = utils.clean_path(
                p_stages, '{}_{}.pdf'.format(test_cfg.ref_name, exp))
            # what is the name of the pdf in the reference pool
            filename_in_ref_dir = '{}_plots.pdf'.format(test_cfg.ref_name)
            # what is the location to store that file
            place_for_reference = os.path.join(p_ref_csv_files, test,
                                               filename_in_ref_dir)
            # BUG FIX: the original logged csv_file here although the
            # pdf is what gets copied
            log.debug('Copy {} to {}'.format(pdf_file, place_for_reference))
            files_to_commit.append(place_for_reference)
            if not ltestsuite:
                shutil.copy(pdf_file, place_for_reference)

    # root is important to not fail during git commands
    os.chdir(paths.rootdir)

    # checkout new branch
    # NOTE(review): nesting reconstructed from collapsed source -- the
    # whole interactive git section is assumed to be guarded by
    # `not ltestsuite`; confirm against the original file
    if not ltestsuite:
        log.info('Create and checkout new branch {}'.format(new_branch_name))
        git_cmd = 'git checkout -B {}'.format(new_branch_name)
        utils.shell_cmd(git_cmd, py_routine='add_exp_to_ref.py')

        # commit all modified files prior in the function to git
        for file in files_to_commit:
            git_cmd = 'git add {}'.format(file)
            log.debug(git_cmd)
            utils.shell_cmd(git_cmd, py_routine=__name__)

        log.debug('Commit files {}'.format(files_to_commit))
        commit_message = input('Please type your commit message :')
        git_cmd = 'git commit -m "{}"'.format(commit_message)
        utils.shell_cmd(git_cmd, py_routine=__name__)

    # Finish
    log.info(
        Style.GREEN(
            'Files are added in the new branch: '
            '{} in your local git repository.'.format(new_branch_name)))
    log.info('To add the file to the official repository, '
             'please perform the following steps:')
    log.info('1. Push the new branch into the official repo:')
    log.info(' git push --set-upstream origin {}'.format(new_branch_name))
    log.info('2. On the Open Web interface (GitHub) , open a Pull Request.')

    log.banner('End add_exp_to_ref for experiment {}'.format(exp))
    return ()
dest='ltestsuite', action='store_true', help='Run of testsuite') args = parser.parse_args() logger_config.init_logger(args.lverbose, __file__) log.banner('Start execute {} as main()'.format(__file__)) args.wrk_dir = utils.abs_path(args.wrk_dir) args.p_stages = utils.abs_path(args.p_stages) args.p_ref_csv_files = utils.abs_path(args.p_ref_csv_files) os.chdir((args.wrk_dir)) log.info('Current directory is {}'.format(args.wrk_dir)) log.info('Read processed data from csv for...') results_data_processing = {} for test in args.tests: log.info('{}'.format(test)) f_csv = utils.clean_path( args.p_stages, 'test_postproc_{}_{}.csv'.format(test, args.exp)) results_data_processing[test] = pd.read_csv(f_csv, sep=';') log.info('...done') main(new_exp=args.exp, results_data_processing=results_data_processing, p_stages=args.p_stages, p_ref_csv_files=args.p_ref_csv_files, f_vars_to_extract=args.f_vars_to_extract,
def main(exp, actions, tests, spinup, p_raw_files, p_stages, raw_f_subfold,
         f_vars_to_extract, f_pattern_ref):
    '''
    Run the standard post-processing and the per-test conversion from
    netCDF to dataframe for every requested test.

    Arguments:
        exp = experiment name
        actions = dict of processing steps that still have to be done
        tests = list of test names to process data for
        spinup = number of files to ignore due to model spinup
        p_raw_files = path to raw model output
        p_stages = directory where processing steps are stored
        raw_f_subfold = subfolder in p_raw_files with model output
        f_vars_to_extract = csv file containing the variables to process
        f_pattern_ref = reference file for pattern-based tests

    returns: dict mapping test name -> dataframe with processed data
    '''
    log.banner('Start standard-postprocessing')

    results_data_processing = {}
    processed_netcdf_filename = {}
    skip_next_step = {}

    # init in case standard_postproc is skipped
    for test in tests:
        skip_next_step[test] = False

    for test in tests:
        if (actions['standard_postproc'][test]):
            processed_netcdf_filename[test], skip_next_step[test] = \
                standard_postproc(exp,
                                  test=test,
                                  spinup=spinup,
                                  p_raw_files=p_raw_files,
                                  raw_f_subfold=raw_f_subfold,
                                  p_stages=p_stages,
                                  f_vars_to_extract=f_vars_to_extract)
        else:
            log.info('Data already processed for test {}'.format(test))
            processed_netcdf_filename[test] = utils.clean_path(
                p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp))

    log.banner('End standard-postprocessing')

    log.banner('Start conversion from NetCDF to dataframe')

    if 'welch' in tests:
        test = 'welch'
        if (actions['test_postproc'][test] and not skip_next_step[test]):
            # transforming netcdf timeseries into csv file
            results_data_processing[test] = timeser_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning("Skip Welch's-Test")

    if 'emi' in tests:
        test = 'emi'
        if (actions['test_postproc'][test] and not skip_next_step[test]):
            results_data_processing[test] = emis_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip emission test')

    if 'fldcor' in tests:
        test = 'fldcor'
        if (actions['test_postproc'][test] and not skip_next_step[test]):
            # fetch the reference from the ftp-server if the user did
            # not provide one
            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)
            results_data_processing[test] = pattern_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip pattern correlation test')

    if 'rmse' in tests:
        test = 'rmse'
        if (actions['test_postproc'][test] and not skip_next_step[test]):
            # FIX: removed a redundant duplicate `test = 'rmse'`
            # assignment that was here in the original
            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)
            results_data_processing[test] = rmse_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip Rmse test')

    log.banner('End conversion from NetCDF to dataframe')

    return (results_data_processing)
def main(new_exp, results_data_processing, tests, p_stages, p_ref_csv_files,
         ltestsuite, f_vars_to_extract):
    '''
    Compare the processed data of new_exp against the reference pool and
    compute the result of every requested test.

    :param new_exp: name of the new experiment
    :param results_data_processing: dict test -> dataframe of processed data
    :param tests: list of test names to perform
    :param p_stages: directory where result csv files are written
    :param p_ref_csv_files: directory holding the reference csv files
    :param ltestsuite: testsuite mode; additionally check each result
    :param f_vars_to_extract: csv file with the variables to analyse
    :return: tuple (df_result, df_ref), both dicts keyed by test name
    '''
    df_exp = {}
    df_ref = {}
    p_csv_files = {}
    testresult_csv = {}
    df_result = {}
    for test in tests:
        log.info('Prepare references for test {}'.format(test))
        test_cfg = get_config_of_current_test(test)
        # tag the new experiment's rows with its name
        results_data_processing[test]['exp'] = new_exp

        # list of paths to all csv files
        p_csv_files[test] = glob.glob(
            os.path.join(p_ref_csv_files, test,
                         '{}_*csv'.format(test_cfg.ref_name)))
        if len(p_csv_files[test]) == 0:
            # NOTE(review): execution would continue after this call
            # unless log.error aborts internally -- confirm
            log.error('No reference files found in {}'.format(p_ref_csv_files))
        log.debug('{} reference(s) found for test \
                  {}'.format(len(p_csv_files[test]), test))

        # create big dataframe containing all reference exps
        df_ref[test] = create_big_df(test_cfg.ref_name,
                                     list_csv_files=p_csv_files[test])

        # Exclude all the non-desired variables (1) var from file, 2) exp)
        full_p_f_vars = os.path.join(paths.p_f_vars_proc, test,
                                     f_vars_to_extract)
        vars_to_analyse = list(
            pd.read_csv(full_p_f_vars, sep=',')['var'].values)
        vars_to_analyse.append('exp')
        try:
            df_ref[test] = df_ref[test][vars_to_analyse]
        except KeyError as e:
            log.warning(e)
            log.error('Variables defined in {} are not contained in reference \
                      {}'.format(utils.rel_path(f_vars_to_extract),
                                 utils.rel_path(p_ref_csv_files)))
        df_exp[test] = results_data_processing[test][vars_to_analyse]
        log.info('References for test {} prepared'.format(test))

        # file the per-test result is written to
        testresult_csv[test] = os.path.join(
            p_stages, 'result_{}_{}.csv'.format(test, new_exp))
        if test == 'welch':
            log.banner('')
            log.banner("Perform Welch's t-test for each variable")
            log.banner('')
            df_result[test] = welch_test(
                df_a=df_ref[test],
                df_b=df_exp[test],
                filename_student_test=testresult_csv[test])
            # express the p-value additionally in percent
            df_result[test]['p-value [%]'] = df_result[test]['p-value'] * 100.
        if test == 'fldcor':
            log.banner('')
            log.banner("Perform fldcor test for each variable")
            log.banner('')
            df_result[test] = pattern_correlation(df_exp[test], test_cfg)
        if test == 'emi':
            log.banner('')
            log.banner("Perform emission test for each variable")
            log.banner('')
            df_result[test] = emissions(df_exp[test], df_ref[test], test_cfg)
        if test == 'rmse':
            log.banner('')
            log.banner("Perform rmse test for each variable")
            log.banner('')
            df_result[test] = rmse(df_exp[test], test_cfg)

        # attach confidence level and colors to the result table
        df_result[test] = sort_level_metric(df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)
        df_result[test] = add_color_df_result(df_result[test],
                                              test_cfg.metric_threshold)

        print_warning_color(df_result[test], test_cfg.metric_threshold,
                            test_cfg.metric)

    # in testsuite mode, bad results end the run
    if ltestsuite:
        for test in tests:
            test_cfg = get_config_of_current_test(test)
            utils.exit_if_testresult_is_bad(test, df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)

    return df_result, df_ref
log.banner('Start execute {} as main()'.format(__file__)) # make all paths from user to absolute paths args.wrk_dir = utils.abs_path(args.wrk_dir) args.p_stages = utils.abs_path(args.p_stages) args.f_pattern_ref = utils.abs_path(args.f_pattern_ref) # data processing takes a while, check that no step is done twice actions = utils.determine_actions_for_data_processing( args.exp, args.tests, args.p_stages, args.lclean) # create directories os.makedirs(args.p_stages, exist_ok=True) os.makedirs(args.wrk_dir, exist_ok=True) # go to working directory os.chdir((args.wrk_dir)) log.info('Current directory is {}'.format(args.wrk_dir)) main(exp=args.exp, actions=actions, tests=args.tests, spinup=args.spinup, p_raw_files=args.p_raw_files, raw_f_subfold=args.raw_f_subfold, p_stages=args.p_stages, f_vars_to_extract=args.f_vars_to_extract, f_pattern_ref=args.f_pattern_ref) log.banner('End execute {} as main()'.format(__file__))
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
    Perfom standard post-processing using cdo

    Arguments:
        exp = experiment name
        test = name of current test to process data
        spinup = number of files (from begining of simulation) to ignore
                 du to model spinup
        p_raw_files = path to raw model output
        raw_f_subfold = subfolder in p_raw_files with model output
                        [p_raw_files]/[raw_f_subfold]
        p_stages = directory where processing steps are stored
        f_vars_to_extract = csv file containg the variables to proceed

    returns: netCDF filename containing the fields as defined in
             f_vars_to_extract, and a flag telling whether the next
             processing step can be skipped
    '''
    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process:
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions (cdo expr syntax: var=formula)
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []  # list files giving error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, echam specific :
    # if the folder containing the Raw files have been deleted,
    # but folder 'Data' contains already global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))
        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))
        if test == 'fldcor' or test == 'rmse':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'multi_annual_means_*.nc'))
        if test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))
        # NOTE(review): time_series_altern_fold is only bound for the
        # four known tests; an unknown test name would raise NameError
        # on the next line -- confirm all callers pass known tests
        if len(time_series_altern_fold) < 1:
            log.error('Could not find files in alternative directory '
                      '{}'.format(time_series_altern_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))
            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')
            if len(time_series_altern_fold) == 1:
                index_ts = 0
            if len(time_series_altern_fold) > 1:
                # several candidates: let the user pick one
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # If index positive, copy the time serie and exit
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))
                # rename some ECHAM variables on the fly
                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

            log.warning('Leave ECHAM-only code-section! '
                        'You are save again...')
            # NOTE(review): if the user typed a negative index,
            # skip_next_steps is unbound here (NameError) -- confirm
            # whether that path can occur in practice
            return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))
        log.banner('Time for a coffee...')

    # loop over output stream
    for stream in df_vars['file'].unique():
        # extract all lines with file f
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream f
        # restart files and {}m.format(stream) e.g. echamm.nc
        # files are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files) if sum([
                s in os.path.basename(fn)
                for s in ['stream', '{}m'.format(stream)]
            ]) == 0
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronoligcal order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()

        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream f
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract form netcdf
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # Extract variables needed from big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            # lowarn: single bad files are collected, not fatal
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                files_error.append(ifile_bsn)

        # Merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        expr_str = ';'.join((df_file.expr.values))
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 ' \
            '-expr,"{}" {} {}'.format(expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep trace of output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        [os.remove(f) for f in tmp_selvar_files]
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    [os.remove(f) for f in files_proceed]

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)