Example #1
0
def add_line_descr_f(exp, f_exp_descr):
    '''
    Add a description line for experiment exp in file f_exp_descr.

    The description fields are collected interactively from the user and
    can be amended until the user confirms or aborts.

    :param exp: new experiment name
    :param f_exp_descr: ';'-separated csv file in which the new line has
                        to be added; created if it does not exist yet

    :return: False once the new description file has been saved
             (exits the interpreter on abort)
    '''

    log.info('Adding line {} in the file {}:'.format(exp, f_exp_descr))

    # open file in dataframe
    if not os.path.isfile(f_exp_descr):
        # description file does not exist yet:
        # create an empty dataframe with the expected columns
        cols_exp_descr_f = [
            'Experiment name', 'Platform', 'OS', 'Compiler (with version)',
            'Optimisation level (-OX)', '-fast-transcendentals (y/n)',
            '-no-prec-sqrt (y/n)', '-no-prec-div (y/n)', 'welch (y/n)',
            'fldcor (y/n)', 'rmse (y/n)', 'emi (y/n)',
            'Date of experiment (month yyyy)'
        ]
        # BUG FIX: the dataframe was created but never assigned, which
        # raised a NameError below whenever the file was missing
        df_exp_descr = pd.DataFrame(columns=cols_exp_descr_f)
    else:
        df_exp_descr = pd.read_csv(f_exp_descr, sep=';')

    # collect information from user
    log.banner('Please give the following informations '
               'about your experiment')
    dict_line = {'Experiment name': exp}
    for col_name in df_exp_descr.keys():

        if col_name != 'Experiment name':

            # ask the user for info
            dict_line[col_name] = input('{} : '.format(col_name))

    # amend the information if needed
    while True:

        # new dataframe containing new line for exp
        # NOTE: DataFrame.append is deprecated in pandas >= 1.4 (removed
        # in 2.0); kept here for consistency with the rest of the project
        df_exp_descr_new = df_exp_descr.append(dict_line, ignore_index=True)

        log.banner('Here is the content of the description '
                   'file including your new experiment.')
        log.info(df_exp_descr_new)

        answ_chg = input('Is the new file right ? (y/n/abort).\n'
                         'If you type n, you will be able to change '
                         'column values\n'
                         'If you type abort, the process of adding '
                         'the experiment {} to the reference is stopped.\n'
                         '(y/n/abort) : '
                         ''.format(exp))
        if answ_chg.upper() == 'Y':
            # save new file
            df_exp_descr_new.to_csv(f_exp_descr, sep=';', index=False)

            # get out of the loop
            return False

        elif answ_chg.upper() == 'N':
            answ_col = input('Which column field you want to change ?')

            if answ_col in df_exp_descr.keys():
                dict_line[answ_col] = input('{} : '.format(answ_col))
            else:
                log.warning('{} not in columns!'.format(answ_col))
                log.info('Columns are {}\n'.format(list(df_exp_descr.columns)))

        elif answ_chg.upper() == 'ABORT':
            exit()
Example #2
0
def main(new_exp, p_raw_files, raw_f_subfold, p_stages, p_ref_csv_files,
         wrk_dir, f_vars_to_extract, f_pattern_ref, tests, spinup, lclean,
         ltestsuite, lverbose):
    '''
    Drive the full sanity check for one experiment: post-process the raw
    model output, run the requested statistical tests against the
    reference pool, and optionally add the experiment to that pool.

    :param new_exp: name of the experiment to check
    :param p_raw_files: path to the raw model output
    :param raw_f_subfold: subfolder of p_raw_files with the model output
    :param p_stages: directory where intermediate processing results live
    :param p_ref_csv_files: directory of the reference csv pool
    :param wrk_dir: working directory (created if needed, then chdir'd into)
    :param f_vars_to_extract: csv file naming the variables to process
    :param f_pattern_ref: reference file for pattern-based tests
    :param tests: list of test names to perform (e.g. 'welch', 'rmse')
    :param spinup: number of initial files to skip due to model spinup
    :param lclean: if True, redo processing steps even if cached
    :param ltestsuite: if True, run non-interactively (testsuite mode)
    :param lverbose: if True, enable debug logging
    '''

    # init logger
    logger_config.init_logger(lverbose, __file__)

    log.banner('Start sanity checker')

    # make all paths from user to absolute paths
    wrk_dir = utils.abs_path(wrk_dir)
    p_stages = utils.abs_path(p_stages)
    p_ref_csv_files = utils.abs_path(p_ref_csv_files)
    f_pattern_ref = utils.abs_path(f_pattern_ref)

    # create directories
    os.makedirs(p_stages, exist_ok=True)
    os.makedirs(wrk_dir, exist_ok=True)

    # go to working directory
    os.chdir((wrk_dir))
    log.info('Working directory is {}'.format(wrk_dir))

    # data processing takes a while, check that no step is done twice
    actions = utils.determine_actions_for_data_processing(
        new_exp, tests, p_stages, lclean)

    # create dataframe out of raw data
    results_data_processing = process_data.main(
        new_exp,
        actions,
        tests,
        spinup,
        p_raw_files=p_raw_files,
        p_stages=p_stages,
        raw_f_subfold=raw_f_subfold,
        f_vars_to_extract=f_vars_to_extract,
        f_pattern_ref=f_pattern_ref)

    results_test, references = perform_test.main(
        new_exp,
        results_data_processing=results_data_processing,
        p_stages=p_stages,
        tests=tests,
        p_ref_csv_files=p_ref_csv_files,
        ltestsuite=ltestsuite,
        f_vars_to_extract=f_vars_to_extract)

    # plot the Welch's-test bar plots (references + new experiment)
    if 'welch' in tests:
        test = 'welch'
        plt.plt_welchstest(references[test].append(
            results_data_processing[test], sort=False),
                           new_exp,
                           results_test[test],
                           p_stages=p_stages)

    # Add experiment to the reference pool
    #--------------------------------------------------------------------
    log.banner('')
    log.banner('Check results again before adding to reference pool')
    log.banner('')

    for test in tests:
        test_cfg = test_config.get_config_of_current_test(test)
        utils.print_warning_if_testresult_is_bad(test, results_test[test],
                                                 test_cfg.metric_threshold,
                                                 test_cfg.metric)

    # in testsuite mode, always add to the pool without asking
    if ltestsuite:
        asw = 'YES'
    else:
        asw = input('If you are happy with this experiment, '
                    'do you want to add it to the reference pool ?'
                    '(yes/[No])\n')

    if (asw.strip().upper() == 'YES') or (asw.strip().upper() == 'Y'):
        add_exp_to_ref.main(new_exp,
                            tests,
                            p_stages=p_stages,
                            ltestsuite=ltestsuite,
                            p_ref_csv_files=p_ref_csv_files)
    else:
        args_for_manual_execution = \
            utils.derive_arguments_for_add_exp_to_ref(new_exp,
                                                      tests,
                                                      p_stages,
                                                      p_ref_csv_files)

        log.info('The experiment {} is NOT added to '
                 'the reference pool \n'.format(new_exp))
        # BUG FIX: new_exp was passed twice for a single placeholder
        log.info('If you want to add the experiment {} '
                 'to the reference pool later on, type '
                 'the following line when you are ready:'.format(new_exp))

        log.info('')
        log.info(
            'python add_exp_to_ref.py {}'.format(args_for_manual_execution))

    log.banner('')
    log.banner('Sanity test finished')
    log.banner('')
Example #3
0
def main(exp,
         tests,
         p_stages=paths.p_stages,
         p_ref_csv_files=paths.p_ref_csv_files,
         ltestsuite=False,
         lverbose=False):
    '''
    Add experiment exp to the reference pool: copy the test result files
    (and the Welch's-test pdf) into the reference-csv directory, then
    commit everything to a new git branch.

    :param exp: experiment name
    :param tests: list of test names whose results are to be archived
    :param p_stages: directory where processing steps are stored
    :param p_ref_csv_files: directory of the reference csv pool
    :param ltestsuite: if True, skip interactive steps and file copies
    :param lverbose: verbosity flag (currently unused here)

    :return: empty tuple
    '''

    # initialisation
    new_branch_name = 'test_add_{}'.format(exp)
    files_to_commit = []

    # fill up file 'Exps_description.csv' with additional
    # information via user input
    f_exp_descr = os.path.join(p_ref_csv_files, 'Exps_description.csv')
    if not ltestsuite:
        add_line_descr_f(exp=exp, f_exp_descr=f_exp_descr)
    files_to_commit.append(f_exp_descr)

    for test in tests:
        test_cfg = get_config_of_current_test(test)

        csv_file = utils.clean_path(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        # what is the filename in the reference pool
        filename_in_ref_dir = '{}_{}.csv'.format(test_cfg.ref_name, exp)
        # what is the location to store that file
        place_for_reference = os.path.join(p_ref_csv_files, test,
                                           filename_in_ref_dir)

        log.debug('Copy {} to {}'.format(csv_file, place_for_reference))

        if not ltestsuite:
            shutil.copy(csv_file, place_for_reference)

        files_to_commit.append(place_for_reference)

        # copy pdf with bar-plots from Welch's-test
        if test == 'welch':

            pdf_file = utils.clean_path(
                p_stages, '{}_{}.pdf'.format(test_cfg.ref_name, exp))

            # what is the name of the pdf in the reference pool
            filename_in_ref_dir = '{}_plots.pdf'.format(test_cfg.ref_name)
            # what is the location to store that file
            place_for_reference = os.path.join(p_ref_csv_files, test,
                                               filename_in_ref_dir)

            # BUG FIX: the debug message logged csv_file although the
            # file actually copied below is pdf_file
            log.debug('Copy {} to {}'.format(pdf_file, place_for_reference))
            files_to_commit.append(place_for_reference)

            if not ltestsuite:
                shutil.copy(pdf_file, place_for_reference)

    # root is important to not fail during git commands
    os.chdir(paths.rootdir)

    # checkout new branch
    if not ltestsuite:
        log.info('Create and checkout new branch {}'.format(new_branch_name))
        git_cmd = 'git checkout -B {}'.format(new_branch_name)
        utils.shell_cmd(git_cmd, py_routine='add_exp_to_ref.py')

        # commit all modified files prior in the function to git
        for file in files_to_commit:
            git_cmd = 'git add {}'.format(file)
            log.debug(git_cmd)
            utils.shell_cmd(git_cmd, py_routine=__name__)

        log.debug('Commit files {}'.format(files_to_commit))
        commit_message = input('Please type your commit message :')
        git_cmd = 'git commit -m "{}"'.format(commit_message)
        utils.shell_cmd(git_cmd, py_routine=__name__)

    # Finish
    log.info(
        Style.GREEN(
            'Files are added in the new branch: '
            '{} in your local git repository.'.format(new_branch_name)))
    log.info('To add the file to the official repository, '
             'please perform the following steps:')
    log.info('1. Push the new branch into the official repo:')
    log.info('   git push --set-upstream origin {}'.format(new_branch_name))
    log.info('2. On the Open Web interface (GitHub) , open a Pull Request.')

    log.banner('End add_exp_to_ref for experiment {}'.format(exp))
    return ()
Example #4
0
                        '-v',
                        dest='lverbose',
                        action='store_true',
                        help='Debug output')

    parser.add_argument('--testsuite',
                        '-ts',
                        dest='ltestsuite',
                        action='store_true',
                        help='Run of testsuite')

    args = parser.parse_args()

    # init logger
    logger_config.init_logger(args.lverbose, __file__)

    log.banner('Start execute {} as main()'.format(__file__))

    # make all paths from user to absolute paths
    args.p_stages = utils.abs_path(args.p_stages)
    args.p_ref_csv_files = utils.abs_path(args.p_ref_csv_files)

    main(exp=args.exp,
         tests=args.tests,
         p_stages=args.p_stages,
         p_ref_csv_files=args.p_ref_csv_files,
         ltestsuite=args.ltestsuite,
         lverbose=args.lverbose)

    log.banner('End execute {} as main()'.format(__file__))
Example #5
0
def _read_cached_postproc_csv(test, exp, p_stages):
    # Load the already-processed ';'-separated csv for this test from the
    # staging directory (used when the processing step can be skipped).
    log.info('Processing for test {} already done'.format(test))
    f_csv = os.path.join(p_stages,
                         'test_postproc_{}_{}.csv'.format(test, exp))
    return pd.read_csv(f_csv, sep=';')


def main(exp, actions, tests, spinup, p_raw_files, p_stages, raw_f_subfold,
         f_vars_to_extract, f_pattern_ref):
    '''
    Post-process raw model output for every requested test and convert
    the resulting NetCDF files into dataframes.

    :param exp: experiment name
    :param actions: dict of flags saying which processing steps to (re)do
    :param tests: list of test names ('welch', 'emi', 'fldcor', 'rmse')
    :param spinup: number of initial files to skip due to model spinup
    :param p_raw_files: path to raw model output
    :param p_stages: directory where processing steps are stored
    :param raw_f_subfold: subfolder in p_raw_files with model output
    :param f_vars_to_extract: csv file naming the variables to process
    :param f_pattern_ref: reference file for pattern-based tests

    :return: dict mapping test name -> dataframe of processed results
    '''

    log.banner('Start standard-postprocessing')

    results_data_processing = {}
    processed_netcdf_filename = {}
    skip_next_step = {}

    # init in case standard_postproc is skipped
    for test in tests:
        skip_next_step[test] = False

    for test in tests:
        if (actions['standard_postproc'][test]):
            processed_netcdf_filename[test], skip_next_step[test] = \
                standard_postproc(exp,
                                  test=test,
                                  spinup=spinup,
                                  p_raw_files=p_raw_files,
                                  raw_f_subfold=raw_f_subfold,
                                  p_stages=p_stages,
                                  f_vars_to_extract=f_vars_to_extract)
        else:
            log.info('Data already processed for test {}'.format(test))
            processed_netcdf_filename[test] = utils.clean_path(
                p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp))

    log.banner('End standard-postprocessing')

    log.banner('Start conversion from NetCDF to dataframe')

    if 'welch' in tests:

        test = 'welch'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            # transforming netcdf timeseries into csv file
            results_data_processing[test] = timeser_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            results_data_processing[test] = _read_cached_postproc_csv(
                test, exp, p_stages)
    else:
        log.warning("Skip Welch's-Test")

    if 'emi' in tests:

        test = 'emi'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            results_data_processing[test] = emis_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            results_data_processing[test] = _read_cached_postproc_csv(
                test, exp, p_stages)
    else:
        log.warning('Skip emission test')

    if 'fldcor' in tests:

        test = 'fldcor'

        if (actions['test_postproc'][test] and not skip_next_step[test]):

            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)

            results_data_processing[test] = pattern_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            results_data_processing[test] = _read_cached_postproc_csv(
                test, exp, p_stages)
    else:
        log.warning('Skip pattern correlation test')

    if 'rmse' in tests:

        test = 'rmse'

        # NOTE: a redundant duplicate assignment "test = 'rmse'" that used
        # to sit inside this branch has been removed
        if (actions['test_postproc'][test] and not skip_next_step[test]):

            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)

            results_data_processing[test] = rmse_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            results_data_processing[test] = _read_cached_postproc_csv(
                test, exp, p_stages)
    else:
        log.warning('Skip Rmse test')

    log.banner('End conversion from NetCDF to dataframe')

    return (results_data_processing)
Example #6
0
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
Perform standard post-processing using cdo

Arguments:
    exp            = experiment name
    test           = name of current test to process data
    spinup         = number of files (from beginning of simulation)
                     to ignore due to model spinup
    p_raw_files    = path to raw model output
    raw_f_subfold  = subfolder in p_raw_files with model output
                     [p_raw_files]/[raw_f_subfold]
    p_stages       = directory where processing steps are stored
    f_vars_to_extract = csv file containing the variables to process

returns:
   tuple (netCDF filename containing the fields as defined in
   f_vars_to_extract, flag telling whether the next processing
   step can be skipped)
    '''

    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process:
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []  # list files giving error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, echam specific :
    # if the folder containing the Raw files have been deleted,
    # but folder 'Data' contains already global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))

        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')

        # BUG FIX: initialise so an unknown test name cannot leave this
        # variable unbound at the len() check below
        time_series_altern_fold = []
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))

        if test == 'fldcor' or test == 'rmse':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'multi_annual_means_*.nc'))
        if test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))

        if len(time_series_altern_fold) < 1:
            log.error('Could not find files in alternative directory '
                      '{}'.format(time_series_altern_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))

            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')

            if len(time_series_altern_fold) == 1:
                index_ts = 0
            if len(time_series_altern_fold) > 1:

                # several candidate files: let the user pick one
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # If index positive, copy the time serie and exit
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))

                # rename ECHAM-specific variables on the fly while copying
                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leave ECHAM-only code-section! '
                            'You are safe again...')
                return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))

    log.banner('Time for a coffee...')

    # loop over output stream
    for stream in df_vars['file'].unique():

        # extract all lines with file f
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream f
        # restart files and {}m.format(stream) e.g. echamm.nc
        # files are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files) if sum([
                s in os.path.basename(fn)
                for s in ['stream', '{}m'.format(stream)]
            ]) == 0
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronological order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()

        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream f
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract form netcdf
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # Extract variables needed from big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                # keep track of files cdo could not handle
                files_error.append(ifile_bsn)

        # Merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        expr_str = ';'.join((df_file.expr.values))
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep trace of output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        [os.remove(f) for f in tmp_selvar_files]
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    [os.remove(f) for f in files_proceed]

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)
Example #7
0
def main(new_exp, results_data_processing, tests, p_stages, p_ref_csv_files,
         ltestsuite, f_vars_to_extract):
    '''
    Compare the processed results of new_exp against the reference pool
    for every requested test and report/return the test results.

    :param new_exp: name of the experiment under test
    :param results_data_processing: dict test -> dataframe of processed data
    :param tests: list of test names to perform
    :param p_stages: directory where result csv files are written
    :param p_ref_csv_files: directory containing the reference csv pool
    :param ltestsuite: if True, exit with an error on bad test results
    :param f_vars_to_extract: csv file naming the variables to analyse

    :return: (dict test -> result dataframe, dict test -> reference dataframe)
    '''

    df_exp = {}
    df_ref = {}
    p_csv_files = {}
    testresult_csv = {}
    df_result = {}

    for test in tests:
        log.info('Prepare references for test {}'.format(test))

        test_cfg = get_config_of_current_test(test)

        results_data_processing[test]['exp'] = new_exp

        # list of paths to all csv files
        p_csv_files[test] = glob.glob(
            os.path.join(p_ref_csv_files, test,
                         '{}_*csv'.format(test_cfg.ref_name)))
        if len(p_csv_files[test]) == 0:
            log.error('No reference files found in {}'.format(p_ref_csv_files))

        log.debug('{} reference(s) found for test \
                  {}'.format(len(p_csv_files[test]), test))

        # create big dataframe containing all reference exps
        df_ref[test] = create_big_df(test_cfg.ref_name,
                                     list_csv_files=p_csv_files[test])

        # Exclude all the non-desired variables (1) var from file, 2) exp)
        full_p_f_vars = os.path.join(paths.p_f_vars_proc, test,
                                     f_vars_to_extract)
        vars_to_analyse = list(
            pd.read_csv(full_p_f_vars, sep=',')['var'].values)
        vars_to_analyse.append('exp')
        try:
            df_ref[test] = df_ref[test][vars_to_analyse]
        except KeyError as e:
            log.warning(e)
            log.error('Variables defined in {} are not contained in reference \
                {}'.format(utils.rel_path(f_vars_to_extract),
                           utils.rel_path(p_ref_csv_files)))

        df_exp[test] = results_data_processing[test][vars_to_analyse]

        log.info('References for test {} prepared'.format(test))

        testresult_csv[test] = os.path.join(
            p_stages, 'result_{}_{}.csv'.format(test, new_exp))

        if test == 'welch':
            log.banner('')
            log.banner("Perform Welch's t-test for each variable")
            log.banner('')
            df_result[test] = welch_test(
                df_a=df_ref[test],
                df_b=df_exp[test],
                filename_student_test=testresult_csv[test])
            df_result[test]['p-value [%]'] = df_result[test]['p-value'] * 100.

        if test == 'fldcor':
            log.banner('')
            log.banner("Perform fldcor test for each variable")
            log.banner('')
            df_result[test] = pattern_correlation(df_exp[test], test_cfg)

        if test == 'emi':
            log.banner('')
            log.banner("Perform emission test for each variable")
            log.banner('')
            df_result[test] = emissions(df_exp[test], df_ref[test], test_cfg)

        if test == 'rmse':
            log.banner('')
            log.banner("Perform rmse test for each variable")
            log.banner('')
            df_result[test] = rmse(df_exp[test], test_cfg)

        df_result[test] = sort_level_metric(df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)
        df_result[test] = add_color_df_result(df_result[test],
                                              test_cfg.metric_threshold)

        print_warning_color(df_result[test], test_cfg.metric_threshold,
                            test_cfg.metric)

    # BUG FIX: this check used to sit INSIDE the loop above, iterating
    # over all tests while df_result was only partially filled, which
    # raised a KeyError on the first iteration (and shadowed the loop
    # variable). It must run once, after every result is available.
    if ltestsuite:
        for test in tests:
            test_cfg = get_config_of_current_test(test)
            utils.exit_if_testresult_is_bad(test, df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)

    return df_result, df_ref