def add_line_descr_f(exp, f_exp_descr):
    '''
    Add a line for experiment exp in the file f_exp_descr

    :param exp: new experiment name
    :param f_exp_descr: file in which the new line has to be added

    :return: False once the new line has been saved
    '''

    log.info('Adding line {} in the file {}:'.format(exp, f_exp_descr))

    # open file in dataframe
    if not os.path.isfile(f_exp_descr):
        # create dataframe
        cols_exp_descr_f = [
            'Experiment name',
            'Platform',
            'OS',
            'Compiler (with version)',
            'Optimisation level (-OX)',
            '-fast-transcendentals (y/n)',
            '-no-prec-sqrt (y/n)',
            '-no-prec-div (y/n)',
            'welch (y/n)',
            'fldcor (y/n)',
            'rmse (y/n)',
            'emi (y/n)',
            'Date of experiment (month yyyy)'
        ]
        # bug fix: the dataframe must be assigned, otherwise df_exp_descr
        # is undefined below whenever the file does not exist yet
        df_exp_descr = pd.DataFrame(columns=cols_exp_descr_f)
    else:
        df_exp_descr = pd.read_csv(f_exp_descr, sep=';')

    # collect information from user
    log.banner('Please give the following information '
               'about your experiment')
    dict_line = {'Experiment name': exp}
    for col_name in df_exp_descr.keys():
        if col_name != 'Experiment name':
            # ask the user for info
            dict_line[col_name] = input('{} : '.format(col_name))

    # amend the information if needed
    while True:
        # new dataframe containing the new line for exp
        # (DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent)
        df_exp_descr_new = pd.concat(
            [df_exp_descr, pd.DataFrame([dict_line])], ignore_index=True)
        log.banner('Here is the content of the description '
                   'file including your new experiment.')
        log.info(df_exp_descr_new)
        answ_chg = input('Is the new file right? (y/n/abort)\n'
                         'If you type n, you will be able to change '
                         'column values.\n'
                         'If you type abort, the process of adding '
                         'the experiment {} to the reference is stopped.\n'
                         '(y/n/abort) : '.format(exp))
        if answ_chg.upper() == 'Y':
            # save new file
            df_exp_descr_new.to_csv(f_exp_descr, sep=';', index=False)
            # get out of the loop
            return False
        elif answ_chg.upper() == 'N':
            answ_col = input('Which column field do you want to change? ')
            if answ_col in df_exp_descr.keys():
                dict_line[answ_col] = input('{} : '.format(answ_col))
            else:
                log.warning('{} not in columns!'.format(answ_col))
                log.info('Columns are {}\n'
                         .format(list(df_exp_descr.columns)))
        elif answ_chg.upper() == 'ABORT':
            exit()
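# A minimal usage sketch for the function above; the experiment name and the
# CSV path are illustrative assumptions, not repository defaults.
#
#     add_line_descr_f(exp='my_new_exp',
#                      f_exp_descr='ref_csv_files/Exps_description.csv')
#
# The call prompts interactively for every column of Exps_description.csv and
# loops until the amended table is confirmed with 'y' (or aborted).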
def main(new_exp,
         p_raw_files,
         raw_f_subfold,
         p_stages,
         p_ref_csv_files,
         wrk_dir,
         f_vars_to_extract,
         f_pattern_ref,
         tests,
         spinup,
         lclean,
         ltestsuite,
         lverbose):

    # init logger
    logger_config.init_logger(lverbose, __file__)

    log.banner('Start sanity checker')

    # make all paths from user to absolute paths
    wrk_dir = utils.abs_path(wrk_dir)
    p_stages = utils.abs_path(p_stages)
    p_ref_csv_files = utils.abs_path(p_ref_csv_files)
    f_pattern_ref = utils.abs_path(f_pattern_ref)

    # create directories
    os.makedirs(p_stages, exist_ok=True)
    os.makedirs(wrk_dir, exist_ok=True)

    # go to working directory
    os.chdir(wrk_dir)
    log.info('Working directory is {}'.format(wrk_dir))

    # data processing takes a while, check that no step is done twice
    actions = utils.determine_actions_for_data_processing(
        new_exp, tests, p_stages, lclean)

    # create dataframe out of raw data
    results_data_processing = process_data.main(
        new_exp,
        actions,
        tests,
        spinup,
        p_raw_files=p_raw_files,
        p_stages=p_stages,
        raw_f_subfold=raw_f_subfold,
        f_vars_to_extract=f_vars_to_extract,
        f_pattern_ref=f_pattern_ref)

    results_test, references = perform_test.main(
        new_exp,
        results_data_processing=results_data_processing,
        p_stages=p_stages,
        tests=tests,
        p_ref_csv_files=p_ref_csv_files,
        ltestsuite=ltestsuite,
        f_vars_to_extract=f_vars_to_extract)

    if 'welch' in tests:
        test = 'welch'
        # NOTE: DataFrame.append was removed in pandas 2.0; on newer pandas
        # use pd.concat([references[test], results_data_processing[test]],
        # sort=False) instead
        plt.plt_welchstest(references[test].append(
            results_data_processing[test], sort=False),
                           new_exp,
                           results_test[test],
                           p_stages=p_stages)

    # Add experiment to the reference pool
    # --------------------------------------------------------------------
    log.banner('')
    log.banner('Check results again before adding to reference pool')
    log.banner('')
    for test in tests:
        test_cfg = test_config.get_config_of_current_test(test)
        utils.print_warning_if_testresult_is_bad(test,
                                                 results_test[test],
                                                 test_cfg.metric_threshold,
                                                 test_cfg.metric)

    if ltestsuite:
        asw = 'YES'
    else:
        asw = input('If you are happy with this experiment, '
                    'do you want to add it to the reference pool? '
                    '(yes/[No])\n')

    if asw.strip().upper() in ('YES', 'Y'):
        add_exp_to_ref.main(new_exp,
                            tests,
                            p_stages=p_stages,
                            ltestsuite=ltestsuite,
                            p_ref_csv_files=p_ref_csv_files)
    else:
        args_for_manual_execution = \
            utils.derive_arguments_for_add_exp_to_ref(new_exp,
                                                      tests,
                                                      p_stages,
                                                      p_ref_csv_files)
        log.info('The experiment {} is NOT added to '
                 'the reference pool \n'.format(new_exp))
        log.info('If you want to add the experiment {} '
                 'to the reference pool later on, type '
                 'the following line when you are ready:'.format(new_exp))
        log.info('')
        log.info('python add_exp_to_ref.py {}'
                 .format(args_for_manual_execution))

    log.banner('')
    log.banner('Sanity test finished')
    log.banner('')
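# A hypothetical driver call for the sanity checker above; every argument
# value here is an assumption chosen only to illustrate the expected types.
#
#     main(new_exp='my_new_exp',
#          p_raw_files='/project/raw_output',
#          raw_f_subfold='Raw',
#          p_stages='stages',
#          p_ref_csv_files='ref_csv_files',
#          wrk_dir='work',
#          f_vars_to_extract='vars_to_extract.csv',
#          f_pattern_ref='patterns_ref.nc',
#          tests=['welch', 'fldcor', 'rmse', 'emi'],
#          spinup=3,
#          lclean=False,
#          ltestsuite=False,
#          lverbose=False)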
def main(exp,
         tests,
         p_stages=paths.p_stages,
         p_ref_csv_files=paths.p_ref_csv_files,
         ltestsuite=False,
         lverbose=False):

    # initialisation
    new_branch_name = 'test_add_{}'.format(exp)
    files_to_commit = []

    # fill up file 'Exps_description.csv' with additional
    # information via user input
    f_exp_descr = os.path.join(p_ref_csv_files, 'Exps_description.csv')
    if not ltestsuite:
        add_line_descr_f(exp=exp, f_exp_descr=f_exp_descr)
    files_to_commit.append(f_exp_descr)

    for test in tests:
        test_cfg = get_config_of_current_test(test)

        csv_file = utils.clean_path(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        # what is the filename in the reference pool
        filename_in_ref_dir = '{}_{}.csv'.format(test_cfg.ref_name, exp)
        # what is the location to store that file
        place_for_reference = os.path.join(p_ref_csv_files,
                                           test,
                                           filename_in_ref_dir)

        log.debug('Copy {} to {}'.format(csv_file, place_for_reference))
        if not ltestsuite:
            shutil.copy(csv_file, place_for_reference)

        files_to_commit.append(place_for_reference)

        # copy pdf with bar-plots from Welch's test
        if test == 'welch':
            pdf_file = utils.clean_path(
                p_stages, '{}_{}.pdf'.format(test_cfg.ref_name, exp))

            # what is the name of the pdf in the reference pool
            filename_in_ref_dir = '{}_plots.pdf'.format(test_cfg.ref_name)
            # what is the location to store that file
            place_for_reference = os.path.join(p_ref_csv_files,
                                               test,
                                               filename_in_ref_dir)
            # bug fix: log the pdf that is actually copied, not the csv
            log.debug('Copy {} to {}'.format(pdf_file, place_for_reference))
            files_to_commit.append(place_for_reference)
            if not ltestsuite:
                shutil.copy(pdf_file, place_for_reference)

    # root is important to not fail during git commands
    os.chdir(paths.rootdir)

    # checkout new branch and commit (skipped in testsuite mode, which
    # copies no files and must not block on interactive input)
    if not ltestsuite:
        log.info('Create and checkout new branch {}'.format(new_branch_name))
        git_cmd = 'git checkout -B {}'.format(new_branch_name)
        utils.shell_cmd(git_cmd, py_routine='add_exp_to_ref.py')

        # commit all files modified earlier in this function to git
        for file in files_to_commit:
            git_cmd = 'git add {}'.format(file)
            log.debug(git_cmd)
            utils.shell_cmd(git_cmd, py_routine=__name__)

        log.debug('Commit files {}'.format(files_to_commit))
        commit_message = input('Please type your commit message: ')
        git_cmd = 'git commit -m "{}"'.format(commit_message)
        utils.shell_cmd(git_cmd, py_routine=__name__)

    # Finish
    log.info(
        Style.GREEN('Files are added in the new branch: '
                    '{} in your local git repository.'.format(
                        new_branch_name)))
    log.info('To add the file to the official repository, '
             'please perform the following steps:')
    log.info('1. Push the new branch to the official repo:')
    log.info('   git push --set-upstream origin {}'.format(new_branch_name))
    log.info('2. On the GitHub web interface, open a Pull Request.')

    log.banner('End add_exp_to_ref for experiment {}'.format(exp))

    return ()
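# A minimal sketch of calling add_exp_to_ref.main directly from Python; the
# experiment name, test list and paths are assumptions for illustration.
#
#     main(exp='my_new_exp',
#          tests=['welch', 'emi'],
#          p_stages='stages',
#          p_ref_csv_files='ref_csv_files')
#
# On success the reference csv files (and the Welch pdf) end up on a fresh
# local branch test_add_my_new_exp, ready to be pushed for a Pull Request.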
    '-v',
    dest='lverbose',
    action='store_true',
    help='Debug output')

parser.add_argument('--testsuite',
                    '-ts',
                    dest='ltestsuite',
                    action='store_true',
                    help='Run as part of the testsuite')

args = parser.parse_args()

# init logger
logger_config.init_logger(args.lverbose, __file__)

log.banner('Start executing {} as main()'.format(__file__))

# make all paths from user to absolute paths
args.p_stages = utils.abs_path(args.p_stages)
args.p_ref_csv_files = utils.abs_path(args.p_ref_csv_files)

main(exp=args.exp,
     tests=args.tests,
     p_stages=args.p_stages,
     p_ref_csv_files=args.p_ref_csv_files,
     ltestsuite=args.ltestsuite,
     lverbose=args.lverbose)

log.banner('End executing {} as main()'.format(__file__))
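# Hypothetical shell invocation matching the parsing above; only the -v and
# --testsuite flags are visible in this excerpt, so the way exp and tests are
# passed (shown here as a positional and a --tests option) is an assumption.
#
#     python add_exp_to_ref.py my_new_exp --tests welch emi -v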
def main(exp, actions, tests, spinup, p_raw_files, p_stages, raw_f_subfold,
         f_vars_to_extract, f_pattern_ref):

    log.banner('Start standard-postprocessing')

    results_data_processing = {}
    processed_netcdf_filename = {}
    skip_next_step = {}

    # init in case standard_postproc is skipped
    for test in tests:
        skip_next_step[test] = False

    for test in tests:
        if actions['standard_postproc'][test]:
            processed_netcdf_filename[test], skip_next_step[test] = \
                standard_postproc(exp,
                                  test=test,
                                  spinup=spinup,
                                  p_raw_files=p_raw_files,
                                  raw_f_subfold=raw_f_subfold,
                                  p_stages=p_stages,
                                  f_vars_to_extract=f_vars_to_extract)
        else:
            log.info('Data already processed for test {}'.format(test))
            processed_netcdf_filename[test] = utils.clean_path(
                p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp))

    log.banner('End standard-postprocessing')

    log.banner('Start conversion from NetCDF to dataframe')

    if 'welch' in tests:
        test = 'welch'
        if actions['test_postproc'][test] and not skip_next_step[test]:
            # transform the netCDF time series into a csv file
            results_data_processing[test] = timeser_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning("Skip Welch's test")

    if 'emi' in tests:
        test = 'emi'
        if actions['test_postproc'][test] and not skip_next_step[test]:
            results_data_processing[test] = emis_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip emission test')

    if 'fldcor' in tests:
        test = 'fldcor'
        if actions['test_postproc'][test] and not skip_next_step[test]:
            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)
            results_data_processing[test] = pattern_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip pattern correlation test')

    if 'rmse' in tests:
        test = 'rmse'
        if actions['test_postproc'][test] and not skip_next_step[test]:
            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)
            results_data_processing[test] = rmse_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip RMSE test')

    log.banner('End conversion from NetCDF to dataframe')

    return results_data_processing
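# For orientation, the 'actions' dict consumed above is indexed as
# actions[step][test]; a plausible shape (an assumption inferred from the
# indexing, not from utils.determine_actions_for_data_processing itself) is:
#
#     actions = {
#         'standard_postproc': {'welch': True, 'emi': False},
#         'test_postproc': {'welch': True, 'emi': True},
#     }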
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold,
                      p_stages, f_vars_to_extract):
    '''
    Perform standard post-processing using cdo

    Arguments:
        exp               = experiment name
        test              = name of current test to process data
        spinup            = number of files (from beginning of simulation)
                            to ignore due to model spinup
        p_raw_files       = path to raw model output
        raw_f_subfold     = subfolder in p_raw_files with model output
                            [p_raw_files]/[raw_f_subfold]
        p_stages          = directory where processing steps are stored
        f_vars_to_extract = csv file containing the variables to process

    returns:
        tuple of the netCDF filename containing the fields defined in
        f_vars_to_extract, and a flag whether the next processing step
        can be skipped
    '''

    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []    # list of files giving errors
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, ECHAM specific:
    # the folder containing the raw files has been deleted, but the
    # folder 'Data' already contains global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted: {}'.format(p_raw_folder))

        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))
        if test == 'fldcor' or test == 'rmse':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold,
                             'multi_annual_means_*.nc'))
        if test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))

        if len(time_series_altern_fold) < 1:
            # bug fix: report the directory searched, not the empty list
            log.error('Could not find files in alternative directory '
                      '{}'.format(p_altern_timeser_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))
            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')
            if len(time_series_altern_fold) == 1:
                index_ts = 0
            if len(time_series_altern_fold) > 1:
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file '
                          'to use (negative means '
                          'none of them) : '))

            # if the index is positive, copy the time series and exit
            if index_ts >= 0:
                log.info('File used: {}'.format(
                    time_series_altern_fold[index_ts]))
                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leaving ECHAM-only code section! '
                            'You are safe again...')
                return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in: {}'.format(p_raw_folder))

    log.banner('Time for a coffee...')

    # loop over output streams
    for stream in df_vars['file'].unique():

        # extract all lines with file stream
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream stream;
        # restart files and '{}m'.format(stream) files (e.g. echamm.nc)
        # are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files) if sum([
                s in os.path.basename(fn)
                for s in ['restart', '{}m'.format(stream)]
            ]) == 0
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address: \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronological order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()
        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract from the netCDF
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # extract the needed variables from the big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)
            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables),
                                                   ifile,
                                                   tmp_selvar_file)
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)
            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                files_error.append(ifile_bsn)

        # merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)
        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file: {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)
        expr_str = ';'.join(df_file.expr.values)
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep track of the output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        for f in tmp_selvar_files:
            os.remove(f)
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    for f in files_proceed:
        os.remove(f)

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n'
             'Output here: {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)
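# For reference, the per-file selvar step above emits shell commands of this
# shape (variable and file names are purely illustrative):
#
#     cdo selvar,tas,pr my_exp_echam_200301.nc tmp_extract_my_exp_echam_200301.nc
#
# followed per stream by one 'cdo -copy' merge and one
# 'cdo -L -setctomiss,-9e+33 -expr' evaluation of the formulas from
# f_vars_to_extract.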
def main(new_exp, results_data_processing, tests, p_stages,
         p_ref_csv_files, ltestsuite, f_vars_to_extract):

    df_exp = {}
    df_ref = {}
    p_csv_files = {}
    testresult_csv = {}
    df_result = {}

    for test in tests:

        log.info('Prepare references for test {}'.format(test))

        test_cfg = get_config_of_current_test(test)
        results_data_processing[test]['exp'] = new_exp

        # list of paths to all csv files
        p_csv_files[test] = glob.glob(
            os.path.join(p_ref_csv_files, test,
                         '{}_*csv'.format(test_cfg.ref_name)))
        if len(p_csv_files[test]) == 0:
            log.error('No reference files found in {}'
                      .format(p_ref_csv_files))
        log.debug('{} reference(s) found for test {}'
                  .format(len(p_csv_files[test]), test))

        # create big dataframe containing all reference exps
        df_ref[test] = create_big_df(test_cfg.ref_name,
                                     list_csv_files=p_csv_files[test])

        # exclude all non-desired variables (1) vars from file, 2) exp)
        full_p_f_vars = os.path.join(paths.p_f_vars_proc, test,
                                     f_vars_to_extract)
        vars_to_analyse = list(
            pd.read_csv(full_p_f_vars, sep=',')['var'].values)
        vars_to_analyse.append('exp')
        try:
            df_ref[test] = df_ref[test][vars_to_analyse]
        except KeyError as e:
            log.warning(e)
            log.error('Variables defined in {} are not contained in '
                      'reference {}'.format(
                          utils.rel_path(f_vars_to_extract),
                          utils.rel_path(p_ref_csv_files)))
        df_exp[test] = results_data_processing[test][vars_to_analyse]

        log.info('References for test {} prepared'.format(test))

        testresult_csv[test] = os.path.join(
            p_stages, 'result_{}_{}.csv'.format(test, new_exp))

        if test == 'welch':
            log.banner('')
            log.banner("Perform Welch's t-test for each variable")
            log.banner('')
            df_result[test] = welch_test(
                df_a=df_ref[test],
                df_b=df_exp[test],
                filename_student_test=testresult_csv[test])
            df_result[test]['p-value [%]'] = \
                df_result[test]['p-value'] * 100.

        if test == 'fldcor':
            log.banner('')
            log.banner('Perform fldcor test for each variable')
            log.banner('')
            df_result[test] = pattern_correlation(df_exp[test], test_cfg)

        if test == 'emi':
            log.banner('')
            log.banner('Perform emission test for each variable')
            log.banner('')
            df_result[test] = emissions(df_exp[test], df_ref[test], test_cfg)

        if test == 'rmse':
            log.banner('')
            log.banner('Perform rmse test for each variable')
            log.banner('')
            df_result[test] = rmse(df_exp[test], test_cfg)

        df_result[test] = sort_level_metric(df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)
        df_result[test] = add_color_df_result(df_result[test],
                                              test_cfg.metric_threshold)

        print_warning_color(df_result[test],
                            test_cfg.metric_threshold,
                            test_cfg.metric)

    if ltestsuite:
        for test in tests:
            test_cfg = get_config_of_current_test(test)
            utils.exit_if_testresult_is_bad(test,
                                            df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)

    return df_result, df_ref
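# A minimal sketch of invoking perform_test.main; argument values are
# assumptions, and results_data_processing is the dict of dataframes
# returned by process_data.main.
#
#     df_result, df_ref = main('my_new_exp',
#                              results_data_processing,
#                              tests=['welch'],
#                              p_stages='stages',
#                              p_ref_csv_files='ref_csv_files',
#                              ltestsuite=False,
#                              f_vars_to_extract='vars_to_extract.csv')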