Example #1
0
def download_ref_to_stages_if_required(f_pattern_ref, p_stages,
                                       f_vars_to_extract, test):
    '''
    Ensure a reference pattern file is available for the given test.

    If no user-defined reference file was passed to process_data
    (f_pattern_ref equals paths.rootdir), the reference is downloaded
    from the ftp-server whose address is stored in the test-specific
    ftp_*.txt file; otherwise the user-supplied file is kept as-is.

    :param f_pattern_ref: path to the reference file
        (paths.rootdir means: none given)
    :param p_stages: directory where processing steps are stored
    :param f_vars_to_extract: csv filename defining variables to process
    :param test: name of the current test

    :return: path to the reference file to use
    '''

    # user supplied an explicit reference file: nothing to download
    if f_pattern_ref != paths.rootdir:
        log.info('Using user-defined reference file for test '
                 '{}'.format(test))
        return f_pattern_ref

    log.info('Download reference file from ftp-server')

    # vars_<...>.csv -> ftp_<...>.txt holds the ftp link for this test
    filename_ftp_link = f_vars_to_extract.replace('.csv', '.txt').replace(
        'vars_', 'ftp_')
    file_with_ftp_link = utils.clean_path(
        os.path.join(paths.p_f_vars_proc, test), filename_ftp_link)

    output_file = os.path.join(p_stages, 'ftp_ref_pattern.nc')

    cmd = ('wget --input-file={} '
           '--output-document={}'.format(file_with_ftp_link, output_file))
    log.debug('ftp-command: {}'.format(cmd))
    utils.shell_cmd(cmd, py_routine=__name__)

    return output_file
Example #2
0
def normalize_data(dataset):
    '''
    Normalize every field of a netCDF file with its spatial mean and
    standard deviation: (data - fldmean) / fldstd.

    :param dataset: filename of the netCDF file to normalize

    :return: filename of the normalized netCDF file
    '''

    log.info('Normalize fields in {} with mean and '
             'standard deviation'.format(dataset))

    # names of all intermediate files, derived from the input name
    base = dataset.replace('.nc', '')
    std_data = '{}_std.nc'.format(base)
    std_data_enlarged = '{}_std_enlarged.nc'.format(base)
    mean_data = '{}_mean.nc'.format(base)
    mean_data_enlarged = '{}_enlarged.nc'.format(base)
    sub_data = '{}_sub.nc'.format(base)
    normalized_data = '{}_normalized.nc'.format(base)

    log.debug('Clean intermediate files for normalization')
    shell_cmd = 'rm {} {} {} {} {} {}'.format(std_data, mean_data,
                                              std_data_enlarged,
                                              mean_data_enlarged, sub_data,
                                              normalized_data)
    utils.shell_cmd(shell_cmd, py_routine=__name__, lowarn=True)

    # the normalization as a sequence of cdo steps, executed in order:
    # field std / field mean, blow both up to the grid of the dataset,
    # subtract the mean, divide by the std
    cdo_steps = [
        'cdo -L fldstd {} {}'.format(dataset, std_data),
        'cdo -L fldmean {} {}'.format(dataset, mean_data),
        'cdo -L -enlarge,{} {} {}'.format(dataset, mean_data,
                                          mean_data_enlarged),
        'cdo -L -enlarge,{} {} {}'.format(dataset, std_data,
                                          std_data_enlarged),
        'cdo -L sub {} {} {}'.format(dataset, mean_data_enlarged, sub_data),
        'cdo -L div {} {} {}'.format(sub_data, std_data_enlarged,
                                     normalized_data),
    ]
    for cdo_cmd in cdo_steps:
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

    return normalized_data
Example #3
0
def rmse_proc_nc_to_df(exp, filename, reference, p_stages):
    '''
    Process the netCDF of an experiment into the dataframe needed for
    the rmse test (root mean square error against a reference).

Arguments: 
    exp       = experiment name
    filename  = filename of the netCDF returned by 
                function standard_postproc
    reference = filename to the reference
    p_stages  = directory where processing steps are stored

returns:
    dataframe with processed data for the rmse test
    '''

    test = 'rmse'
    rmse_interim = 'test_postproc_intermediate_{}_{}.nc'.format(test, exp)
    rmse_filename = 'test_proc_{}_{}.nc'.format(test, exp)

    # collapse experiment data: vertical sum, yearly mean, time mean
    cdo_cmd = 'cdo -L timmean -yearmean -vertsum {} {}'.format(
        filename, rmse_interim)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # normalize both sides so the rmse is comparable across variables
    reference_normalized = normalize_data(reference)
    rmse_interim_normalized = normalize_data(rmse_interim)

    # list of variables in the timeserie netcdf file to drop
    # (not to put into the dataframe)
    vars_to_drop = []

    log.info('Compute root mean square error '
             'between {} and {} (reference)'.format(rmse_interim_normalized,
                                                    reference_normalized))

    cdo_cmd = 'cdo -L sqrt -fldmean -sqr -sub {} {} {}'.format(
        rmse_interim_normalized, reference_normalized, rmse_filename)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # open dataset
    data = xr.open_dataset(rmse_filename)

    # Delete variables
    # useless variable time_bnds
    if ('time_bnds' in data.keys()):
        data = data.drop('time_bnds')
    # 3D vars. BUG FIX: xr.Dataset.drop returns a NEW dataset, so the
    # result must be assigned back (the original discarded it)
    if len(vars_to_drop) > 0:
        data = data.drop(labels=vars_to_drop)

    # transforms into dataframe
    df_data = data.to_dataframe()

    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')
    log.info('CSV file can be found here: {}'.format(csv_filename))

    log.info('Finished {} for file {}'.format(__name__, rmse_filename))

    return (df_data)
Example #4
0
def welch_test(df_a, df_b, filename_student_test=''):
    '''
    Perform Welch's t-test for every variable of dataframe df_b.

    :param df_a: reference dataframe, containing the big sample
    :param df_b: dataframe containing the data to test
    :param filename_student_test: filename for writing the result 
            of the t-test into a csv file (skipped when empty)
    :return: result of the student test in a dataframe
    '''

    rows = []

    # every column except the experiment-name column(s) is tested
    variables = [v for v in df_b.keys() if 'exp' not in v]
    for var in variables:
        log.debug("Welch's t-test for {}".format(var))
        # Welch's t-test: unequal variances, missing values ignored
        t, p = stats.ttest_ind(df_a[var],
                               df_b[var],
                               equal_var=False,
                               nan_policy='omit')
        rows.append({'variable': var, 't-value': t, 'p-value': p})

    # construction dataframe, sorted by increasing p-value
    df_result = pd.DataFrame(rows,
                             columns=['variable', 't-value', 'p-value'])
    df_result.sort_values(by=['p-value'], inplace=True)

    # if a filename is given, write the student-stest
    # result into the file named filename_student_test
    if len(filename_student_test) > 0:
        log.info('Write result to {}'.format(filename_student_test))
        df_result.to_csv(filename_student_test, sep=',')

    return (df_result)
Example #5
0
def print_statistics_of_raw_files(ifiles, stream, exp):
    '''
    Log for how many distinct years raw model-output files are present.

    The date is extracted from each filename by removing the experiment
    name, the stream name and the '.nc' extension, then parsing the
    remainder with the known date patterns.

    :param ifiles: list of paths to the raw model-output files
    :param stream: name of the output stream contained in the filenames
    :param exp: experiment name contained in the filenames

    :return: None (results are logged)
    '''

    datepatterns = ['%Y_%m', '%Y%m']
    years_found = []
    no_summary = False

    for file in ifiles:
        # Isolate the date part of the filename.
        # BUG FIX: the original used str.strip() with multi-character
        # arguments, which removes any leading/trailing CHARACTERS from
        # the given set (not the substring) and could therefore eat
        # date digits; replace() removes the actual substrings.
        datestring = os.path.basename(file)
        for token in ('.nc', exp, stream):
            datestring = datestring.replace(token, '')
        datestring = datestring.strip('_.')

        # try each known pattern until one parses
        date = None
        for pattern in datepatterns:
            try:
                date = datetime.datetime.strptime(datestring, pattern)
                break
            except ValueError:
                continue

        if date is None:
            # at least one filename did not match: no summary possible
            no_summary = True
        elif date.year not in years_found:
            years_found.append(date.year)

    if no_summary:
        log.warning('Could not determine years '
                    'due to an unkown pattern in the filenames')
    else:
        log.info('{} files with model output '
                 'found for years:'.format(len(ifiles)))
        for year in years_found:
            log.info(year)
Example #6
0
def print_warning_color(df_result, metric_thresholds, metric):
    '''
    Log a summary of all variables whose confidence level is below
    'high', grouped by warning level.

    :param df_result: dataframe with the test results; must contain a
        'level' column holding the confidence level per variable
    :param metric_thresholds: iterable of threshold objects; each one
        presumably exposes .level (level name), .p_thresh and
        .col_txt() — confirm against the threshold class definition
    :param metric: name of the metric (used in the log messages)

    :return: None (everything is logged)
    '''

    # dataframe containing only variables a warning has to be printed
    df_warning = df_result[df_result['level'] != 'high']

    log.info('-------------------------------------------'
             '-------------------------------------------'
             '--------------------')

    if df_warning.size > 0:

        log.warning('The following variables give problematic '
                    '{} : \n'.format(metric))

        # for each level of warning, print the dataframe
        for metric_lev in metric_thresholds:
            # NOTE(review): this compares the threshold OBJECT to the
            # string 'high'; it only filters anything out if the class
            # defines __eq__ accordingly — possibly meant to be
            # metric_lev.level != 'high'. Confirm.
            if metric_lev != 'high':

                # dataframe containing only this level of warning
                df_print_warn = df_warning[df_warning.level ==
                                           metric_lev.level]

                # print
                if df_print_warn.size > 0:
                    log.info('Confidence is {} for {} '.format(
                        metric_lev.level.upper(), metric))
                    log.info(metric_lev.col_txt(df_print_warn))
    else:
        # all variables passed at the highest level;
        # metric_thresholds[1].p_thresh is the first non-'high' bound
        log.info(
            Style.GREEN('The experiment is fine. '
                        'No {} under {} \n').format(
                            metric, metric_thresholds[1].p_thresh))

    log.info('-------------------------------------------'
             '-------------------------------------------'
             '--------------------')

    return
Example #7
0
def print_warning_if_testresult_is_bad(test, df_result, metric_thresholds,
                                       metric):
    '''
    Print a colored verdict stating whether the results of a test are
    good enough to be added to the reference pool.

    :param test: name of the test
    :param df_result: dataframe with the test results (column 'level')
    :param metric_thresholds: thresholds of the metric (unused here)
    :param metric: name of the metric (unused here)

    :return: None
    '''

    separator = ('-----------------------------------------'
                 '-----------------------------------------')

    # variables at the worst confidence level trigger the warning
    df_warning = df_result[df_result['level'] == 'very low']

    log.info(separator)
    log.info(test)

    if df_warning.size > 0:
        log.info(
            Style.RED('Results are bad! \n'
                      'It is not recommended to add this '
                      'test to the reference pool'))
    else:
        log.info(Style.GREEN('Results OK'))

    log.info(separator)

    return
Example #8
0
def timeser_proc_nc_to_df(exp, filename, p_stages, already_a_timeseries=False):
    '''
    Process the netCDF of an experiment into the timeseries dataframe
    needed for the Welch's test.

Arguments: 
    exp      = experiment name
    filename = filename of the netCDF returned by function standard_postproc
    p_stages = directory where processing steps are stored
    already_a_timeseries = skip the CDO-processing step if True

returns:
    dataframe with processed data for welchstest
    '''

    test = 'welch'

    if not already_a_timeseries:
        # collapse to a global-mean yearly timeseries
        timeser_filename = 'test_postproc_{}_{}.nc'.format(test, exp)
        cdo_cmd = 'cdo -L yearmean -fldmean -vertsum {} {}'.format(
            filename, timeser_filename)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)
    else:
        log.debug('Skipping CDO-processing step')
        timeser_filename = filename

    # list of variables in the timeserie netcdf
    # file to drop (not to put into the dataframe)
    vars_to_drop = []

    log.info('Processing netCDF: {}'.format(timeser_filename))

    # open dataset
    data = xr.open_dataset(timeser_filename)

    # Delete variables
    # useless variable time_bnds
    if ('time_bnds' in data.keys()):
        data = data.drop('time_bnds')
    # 3D vars. BUG FIX: xr.Dataset.drop returns a NEW dataset, so the
    # result must be assigned back (the original discarded it)
    if len(vars_to_drop) > 0:
        data = data.drop(labels=vars_to_drop)

    # removed degenerated dimensions
    data = data.squeeze(drop=True)

    # transforms into dataframe
    df_data = data.to_dataframe()

    # export in a file
    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')
    log.info('CSV file can be found here: {}'.format(csv_filename))

    log.info('Finished {} for file {}'.format(__name__, timeser_filename))

    return (df_data)
Example #9
0
def exit_if_testresult_is_bad(test, df_result, metric_thresholds, metric):
    '''
    Log an error if any variable of a test falls below the 'middle'
    confidence level.

    NOTE(review): despite its name, this function only logs an error
    and does not terminate the program itself — confirm whether
    log.error is expected to exit.

    :param test: name of the test
    :param df_result: dataframe with the test results (column 'level')
    :param metric_thresholds: thresholds of the metric (unused here)
    :param metric: name of the metric (unused here)

    :return: None
    '''

    separator = ('-----------------------------------------'
                 '-----------------------------------------')

    # keep only the problematic levels (everything below 'middle')
    df_warning = df_result[~df_result['level'].isin(['high', 'middle'])]

    log.info(separator)
    log.info(test)

    if df_warning.size > 0:
        log.error(Style.RED('Results are bad!'))
    else:
        log.info(Style.GREEN('Results OK'))

    log.info(separator)

    return
Example #10
0
def main(new_exp, p_raw_files, raw_f_subfold, p_stages, p_ref_csv_files,
         wrk_dir, f_vars_to_extract, f_pattern_ref, tests, spinup, lclean,
         ltestsuite, lverbose):
    '''
    Run the whole sanity check: process the raw data of experiment
    new_exp, perform the requested tests against the reference pool and
    optionally add the experiment to the reference pool.

    :param new_exp: name of the experiment to check
    :param p_raw_files: path to the raw model output
    :param raw_f_subfold: subfolder of the raw model output
    :param p_stages: directory where processing steps are stored
    :param p_ref_csv_files: directory of the reference csv files
    :param wrk_dir: working directory
    :param f_vars_to_extract: csv file defining variables to process
    :param f_pattern_ref: path to reference file for pattern tests
    :param tests: list of tests to perform
    :param spinup: number of years to skip at the beginning
    :param lclean: redo all processing steps if True
    :param ltestsuite: run in non-interactive testsuite mode
    :param lverbose: verbose logging

    :return: None
    '''

    # init logger
    logger_config.init_logger(lverbose, __file__)

    log.banner('Start sanity checker')

    # make all paths from user to absolute paths
    wrk_dir = utils.abs_path(wrk_dir)
    p_stages = utils.abs_path(p_stages)
    p_ref_csv_files = utils.abs_path(p_ref_csv_files)
    f_pattern_ref = utils.abs_path(f_pattern_ref)

    # create directories
    os.makedirs(p_stages, exist_ok=True)
    os.makedirs(wrk_dir, exist_ok=True)

    # go to working directory
    os.chdir(wrk_dir)
    log.info('Working directory is {}'.format(wrk_dir))

    # data processing takes a while, check that no step is done twice
    actions = utils.determine_actions_for_data_processing(
        new_exp, tests, p_stages, lclean)

    # create dataframe out of raw data
    results_data_processing = process_data.main(
        new_exp,
        actions,
        tests,
        spinup,
        p_raw_files=p_raw_files,
        p_stages=p_stages,
        raw_f_subfold=raw_f_subfold,
        f_vars_to_extract=f_vars_to_extract,
        f_pattern_ref=f_pattern_ref)

    results_test, references = perform_test.main(
        new_exp,
        results_data_processing=results_data_processing,
        p_stages=p_stages,
        tests=tests,
        p_ref_csv_files=p_ref_csv_files,
        ltestsuite=ltestsuite,
        f_vars_to_extract=f_vars_to_extract)

    # plot the global means of the new experiment against the references
    if 'welch' in tests:
        test = 'welch'
        plt.plt_welchstest(references[test].append(
            results_data_processing[test], sort=False),
                           new_exp,
                           results_test[test],
                           p_stages=p_stages)

    # Add experiment to the reference pool
    #--------------------------------------------------------------------
    log.banner('')
    log.banner('Check results again before adding to reference pool')
    log.banner('')

    for test in tests:
        test_cfg = test_config.get_config_of_current_test(test)
        utils.print_warning_if_testresult_is_bad(test, results_test[test],
                                                 test_cfg.metric_threshold,
                                                 test_cfg.metric)

    # in testsuite mode never ask the user anything
    if ltestsuite:
        asw = 'YES'
    else:
        asw = input('If you are happy with this experiment, '
                    'do you want to add it to the reference pool ?'
                    '(yes/[No])\n')

    if asw.strip().upper() in ('YES', 'Y'):
        add_exp_to_ref.main(new_exp,
                            tests,
                            p_stages=p_stages,
                            ltestsuite=ltestsuite,
                            p_ref_csv_files=p_ref_csv_files)
    else:
        args_for_manual_execution = \
            utils.derive_arguments_for_add_exp_to_ref(new_exp,
                                                      tests,
                                                      p_stages,
                                                      p_ref_csv_files)

        log.info('The experiment {} is NOT added to '
                 'the reference pool \n'.format(new_exp))
        # BUG FIX: format() was called with new_exp twice for a single
        # placeholder; the spurious extra argument was silently ignored
        log.info('If you want to add the experiment {} '
                 'to the reference pool later on, type '
                 'the following line when you are ready:'.format(new_exp))

        log.info('')
        log.info(
            'python add_exp_to_ref.py {}'.format(args_for_manual_execution))

    log.banner('')
    log.banner('Sanity test finished')
    log.banner('')
Example #11
0
def plt_welchstest(df_tot, new_exp, df_result, p_stages=paths.p_stages):
    '''
    Plot mean and standard deviation of every tested variable for all
    experiments (references & new exp) into a multipage pdf.

    :param df_tot:  Dataframe containing all 
        global annual mean (reference & new_exp)
    :param new_exp:   Name of the new exp which is analysed
    :param df_result: Dataframe containing the results of the Welch's test
    :param p_stages : path to save the figures

    :return: None, but the figure is saved in p_stages
    '''

    # simple statistics, sort by exp 
    #  to be sure the order is the same in both dataframe
    df_tot_mean = df_tot.groupby(['exp']).mean()\
        .sort_values(['exp']).reset_index()

    # for std, the panda std has a bug 
    # cf https://github.com/pandas-dev/pandas/issues/16799
    df_tot_std = df_tot.groupby(['exp']).std()\
        .sort_values(['exp']).reset_index()

    # ensure new exp to be the last line
    iexp = df_tot_mean.index[df_tot_mean['exp'] == new_exp]
    new_order = df_tot_mean.index.drop(iexp).append(iexp)
    df_tot_mean = df_tot_mean.reindex(new_order)
    df_tot_std = df_tot_std.reindex(new_order)

    # number col/rows per page
    nlin = 3
    ncol = 3
    nplot = nlin * ncol

    # needed for multipage pdf file
    filename_mean_std_figures = 'glob_means_{}.pdf'.format(new_exp)
    p_pdf_file_var = os.path.join(p_stages, filename_mean_std_figures)
    pp = PdfPages(p_pdf_file_var)

    # loop over all variables
    for ivar, var in enumerate(df_result.variable):
        log.debug('Create plot for {}'.format(var))

        # number of the plot on the current page
        iplot = np.mod(ivar, nplot)

        # open a new page when the previous one is full
        if iplot == 0:
            fig, plt_nbr = plt.subplots(nlin,
                                        ncol,
                                        sharex='col',
                                        figsize=(12, 12))

        # subplot coordinate.
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in
        # 1.24; the builtin int does the same here
        icol = np.mod(iplot, ncol)
        ilin = int(np.floor(iplot / ncol))

        # actual plot
        act_plt = plt_nbr[ilin, icol]

        # x-axis: one slot per experiment
        xaxis = np.arange(df_tot_mean.shape[0])

        # index of the new experiment (last non-missing entry)
        nmisval = df_tot_mean[var].isna().sum()
        i_newexp = len(xaxis) - nmisval - 1
        # define colors.
        # NOTE(review): the slice assignment replaces the tail of the
        # list with the single character 'k' and can shorten the list
        # when values are missing; since every entry is 'k' anyway the
        # intent (highlight the new exp?) is unclear — confirm.
        colors = len(xaxis) * ['k']
        colors[i_newexp:len(xaxis)] = 'k'
        # define thickness: the new experiment gets a thicker bar
        thickness = len(xaxis) * [1.5]
        thickness[i_newexp] = 3

        # plot mean and std for each variable
        act_plt.errorbar(xaxis, df_tot_mean[var], yerr=df_tot_std[var],
                         fmt='+k', ecolor=colors, elinewidth=thickness)

        # plot average reference experiments (grey band)
        m_ref = df_tot[df_tot.exp != new_exp][var].mean()
        s_ref = df_tot[df_tot.exp != new_exp][var].std()
        act_plt.axhline(m_ref, c='k')
        act_plt.fill_between([-1, max(xaxis) - 0.5],
                             m_ref - s_ref, m_ref + s_ref,
                             facecolor='grey', alpha=0.6)

        # plot color background according to the test result
        color_graph = df_result.loc[df_result.variable == var]['col-graph']\
            .values[0]
        act_plt.set_facecolor('{}'.format(color_graph))

        # label settings
        act_plt.xaxis.set_ticks(xaxis)
        act_plt.set_xticklabels(df_tot_mean['exp'], rotation=90)

        # title settings
        pvalue = float(df_result[df_result.variable == var]['p-value'])
        act_plt.set_title('{}, p-value = {:.2%}'.format(var, pvalue))

        # save full page
        if iplot == (nplot - 1):
            pp.savefig()

    # save and close pdf file.
    # NOTE(review): this re-saves the last figure even if the page was
    # already stored inside the loop; kept as in the original — confirm
    # whether a guarded pp.savefig() for a partial last page was meant.
    fig.savefig(pp, format='pdf')
    pp.close()

    log.info('Detailed plots of mean and standard deviation per variable ' 
             'can be found in the file {}'.format(p_pdf_file_var))
Example #12
0
def add_line_descr_f(exp, f_exp_descr):
    '''
    Add line for exp exp in file f_exp_descr

    :param exp: new expirement name
    :param f_exp_descr: file in which the new line has to be added

    return: False once the user accepted the new file; the program is
        terminated when the user types 'abort'
    '''

    log.info('Adding line {} in the file {}:'.format(exp, f_exp_descr))

    # open file in dataframe
    if not os.path.isfile(f_exp_descr):
        # create empty dataframe with the expected columns.
        # BUG FIX: the dataframe was created but never assigned, which
        # raised a NameError further down whenever the description file
        # did not exist yet.
        cols_exp_descr_f = [
            'Experiment name', 'Platform', 'OS', 'Compiler (with version)',
            'Optimisation level (-OX)', '-fast-transcendentals (y/n)',
            '-no-prec-sqrt (y/n)', '-no-prec-div (y/n)', 'welch (y/n)',
            'fldcor (y/n)', 'rmse (y/n)', 'emi (y/n)',
            'Date of experiment (month yyyy)'
        ]
        df_exp_descr = pd.DataFrame(columns=cols_exp_descr_f)
    else:
        df_exp_descr = pd.read_csv(f_exp_descr, sep=';')

    # collect information from user
    log.banner('Please give the following informations '
               'about your experiment')
    dict_line = {'Experiment name': exp}
    for col_name in df_exp_descr.keys():

        if col_name != 'Experiment name':

            # ask the user for info
            dict_line[col_name] = input('{} : '.format(col_name))

    # amend the information if needed
    while True:

        # new dataframe containing new line for exp.
        # NOTE(review): DataFrame.append is deprecated (removed in
        # pandas 2.0); switch to pd.concat when pandas is bumped
        df_exp_descr_new = df_exp_descr.append(dict_line, ignore_index=True)

        log.banner('Here is the content of the description '
                   'file including your new experiment.')
        log.info(df_exp_descr_new)

        answ_chg = input('Is the new file right ? (y/n/abort).\n'
                         'If you type n, you will be able to change '
                         'column values\n'
                         'If you type abort, the process of adding '
                         'the experiment {} to the reference is stoped.\n'
                         '(y/n/abort) : '
                         ''.format(exp))
        if answ_chg.upper() == 'Y':
            # save new file
            df_exp_descr_new.to_csv(f_exp_descr, sep=';', index=False)

            # get out of the loop
            return False

        elif answ_chg.upper() == 'N':
            answ_col = input('Which column field you want to change ?')

            if answ_col in df_exp_descr.keys():
                dict_line[answ_col] = input('{} : '.format(answ_col))
            else:
                log.warning('{} not in columns!'.format(answ_col))
                log.info('Columns are {}\n'.format(list(df_exp_descr.columns)))

        elif answ_chg.upper() == 'ABORT':
            exit()
Example #13
0
def main(exp,
         tests,
         p_stages=paths.p_stages,
         p_ref_csv_files=paths.p_ref_csv_files,
         ltestsuite=False,
         lverbose=False):
    '''
    Add the results of experiment exp to the reference pool: copy the
    csv (and pdf) files into the reference directory and commit them on
    a new git branch.

    :param exp: experiment name
    :param tests: list of tests whose results are added
    :param p_stages: directory where processing steps are stored
    :param p_ref_csv_files: directory of the reference pool
    :param ltestsuite: if True, skip user input, file copies and git
    :param lverbose: verbose logging (currently unused here)

    :return: empty tuple
    '''

    # initialisation
    new_branch_name = 'test_add_{}'.format(exp)
    files_to_commit = []

    # fill up file 'Exps_description.csv' with additional
    # information via user input
    f_exp_descr = os.path.join(p_ref_csv_files, 'Exps_description.csv')
    if not ltestsuite:
        add_line_descr_f(exp=exp, f_exp_descr=f_exp_descr)
    files_to_commit.append(f_exp_descr)

    for test in tests:
        test_cfg = get_config_of_current_test(test)

        csv_file = utils.clean_path(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        # what is the filename in the reference pool
        filename_in_ref_dir = '{}_{}.csv'.format(test_cfg.ref_name, exp)
        # what is the location to store that file
        place_for_reference = os.path.join(p_ref_csv_files, test,
                                           filename_in_ref_dir)

        log.debug('Copy {} to {}'.format(csv_file, place_for_reference))

        if not ltestsuite:
            shutil.copy(csv_file, place_for_reference)

        files_to_commit.append(place_for_reference)

        # copy pdf with bar-plots from Welch's-test
        if test == 'welch':

            pdf_file = utils.clean_path(
                p_stages, '{}_{}.pdf'.format(test_cfg.ref_name, exp))

            # what is the name of the pdf in the reference pool
            filename_in_ref_dir = '{}_plots.pdf'.format(test_cfg.ref_name)
            # what is the location to store that file
            place_for_reference = os.path.join(p_ref_csv_files, test,
                                               filename_in_ref_dir)

            # BUG FIX: the log line reported csv_file although the pdf
            # is what is copied here
            log.debug('Copy {} to {}'.format(pdf_file, place_for_reference))
            files_to_commit.append(place_for_reference)

            if not ltestsuite:
                shutil.copy(pdf_file, place_for_reference)

    # root is important to not fail during git commands
    os.chdir(paths.rootdir)

    # checkout new branch
    if not ltestsuite:
        log.info('Create and checkout new branch {}'.format(new_branch_name))
        git_cmd = 'git checkout -B {}'.format(new_branch_name)
        utils.shell_cmd(git_cmd, py_routine='add_exp_to_ref.py')

        # commit all modified files prior in the function to git
        for file in files_to_commit:
            git_cmd = 'git add {}'.format(file)
            log.debug(git_cmd)
            utils.shell_cmd(git_cmd, py_routine=__name__)

        log.debug('Commit files {}'.format(files_to_commit))
        commit_message = input('Please type your commit message :')
        git_cmd = 'git commit -m "{}"'.format(commit_message)
        utils.shell_cmd(git_cmd, py_routine=__name__)

    # Finish
    log.info(
        Style.GREEN(
            'Files are added in the new branch: '
            '{} in your local git repository.'.format(new_branch_name)))
    log.info('To add the file to the official repository, '
             'please perform the following steps:')
    log.info('1. Push the new branch into the official repo:')
    log.info('   git push --set-upstream origin {}'.format(new_branch_name))
    log.info('2. On the Open Web interface (GitHub) , open a Pull Request.')

    log.banner('End add_exp_to_ref for experiment {}'.format(exp))
    return ()
Example #14
0
                        dest='ltestsuite',
                        action='store_true',
                        help='Run of testsuite')

    args = parser.parse_args()

    logger_config.init_logger(args.lverbose, __file__)

    log.banner('Start execute {} as main()'.format(__file__))

    args.wrk_dir = utils.abs_path(args.wrk_dir)
    args.p_stages = utils.abs_path(args.p_stages)
    args.p_ref_csv_files = utils.abs_path(args.p_ref_csv_files)

    os.chdir((args.wrk_dir))
    log.info('Current directory is {}'.format(args.wrk_dir))

    log.info('Read processed data from csv for...')
    results_data_processing = {}
    for test in args.tests:
        log.info('{}'.format(test))
        f_csv = utils.clean_path(
            args.p_stages, 'test_postproc_{}_{}.csv'.format(test, args.exp))
        results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    log.info('...done')

    main(new_exp=args.exp,
         results_data_processing=results_data_processing,
         p_stages=args.p_stages,
         p_ref_csv_files=args.p_ref_csv_files,
         f_vars_to_extract=args.f_vars_to_extract,
Example #15
0
def main(exp, actions, tests, spinup, p_raw_files, p_stages, raw_f_subfold,
         f_vars_to_extract, f_pattern_ref):
    '''
    Process raw model output into one dataframe per requested test.

    :param exp: experiment name
    :param actions: dict of dicts telling which processing steps still
        have to be executed for each test
    :param tests: list of tests to process data for
    :param spinup: number of years to skip at the beginning
    :param p_raw_files: path to the raw model output
    :param p_stages: directory where processing steps are stored
    :param raw_f_subfold: subfolder of the raw model output
    :param f_vars_to_extract: csv file defining variables to process
    :param f_pattern_ref: path to the reference file for pattern tests

    :return: dict test -> dataframe with the processed data
    '''

    log.banner('Start standard-postprocessing')

    results_data_processing = {}
    processed_netcdf_filename = {}

    # init in case standard_postproc is skipped
    skip_next_step = {test: False for test in tests}

    def _read_cached_csv(test):
        # read the csv produced by an earlier run of the
        # test-postprocessing step for this test
        f_csv = os.path.join(p_stages,
                             'test_postproc_{}_{}.csv'.format(test, exp))
        return pd.read_csv(f_csv, sep=';')

    for test in tests:
        if (actions['standard_postproc'][test]):
            processed_netcdf_filename[test], skip_next_step[test] = \
                standard_postproc(exp,
                                  test=test,
                                  spinup=spinup,
                                  p_raw_files=p_raw_files,
                                  raw_f_subfold=raw_f_subfold,
                                  p_stages=p_stages,
                                  f_vars_to_extract=f_vars_to_extract)
        else:
            log.info('Data already processed for test {}'.format(test))
            processed_netcdf_filename[test] = utils.clean_path(
                p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp))

    log.banner('End standard-postprocessing')

    log.banner('Start conversion from NetCDF to dataframe')

    if 'welch' in tests:

        test = 'welch'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            # transforming netcdf timeseries into csv file
            results_data_processing[test] = timeser_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            results_data_processing[test] = _read_cached_csv(test)
    else:
        log.warning("Skip Welch's-Test")

    if 'emi' in tests:

        test = 'emi'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            results_data_processing[test] = emis_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            results_data_processing[test] = _read_cached_csv(test)
    else:
        log.warning('Skip emission test')

    if 'fldcor' in tests:

        test = 'fldcor'

        if (actions['test_postproc'][test] and not skip_next_step[test]):

            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)

            results_data_processing[test] = pattern_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            results_data_processing[test] = _read_cached_csv(test)
    else:
        log.warning('Skip pattern correlation test')

    if 'rmse' in tests:

        test = 'rmse'

        # (a redundant re-assignment of 'test' inside the branch below
        # was removed)
        if (actions['test_postproc'][test] and not skip_next_step[test]):

            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)

            results_data_processing[test] = rmse_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            results_data_processing[test] = _read_cached_csv(test)
    else:
        log.warning('Skip Rmse test')

    log.banner('End conversion from NetCDF to dataframe')

    return (results_data_processing)
Example #16
0
def main(new_exp, results_data_processing, tests, p_stages, p_ref_csv_files,
         ltestsuite, f_vars_to_extract):
    '''
    Compare the processed data of experiment new_exp against the reference
    experiments for every requested test.

    Arguments:
        new_exp                 = name of the new experiment
        results_data_processing = dict test -> dataframe of processed data
        tests                   = list of test names to perform
        p_stages                = directory where processing stages are stored
        p_ref_csv_files         = root directory of the reference csv files
        ltestsuite              = bool; if True, exit on a bad test result
        f_vars_to_extract       = csv file defining the variables to analyse

    Returns:
        (df_result, df_ref) = dicts test -> result dataframe and
                              test -> reference dataframe
    '''

    df_exp = {}
    df_ref = {}
    p_csv_files = {}
    testresult_csv = {}
    df_result = {}

    for test in tests:
        log.info('Prepare references for test {}'.format(test))

        test_cfg = get_config_of_current_test(test)

        # tag the new experiment's rows so they can be told apart
        # from the reference experiments
        results_data_processing[test]['exp'] = new_exp

        # list of paths to all csv files
        p_csv_files[test] = glob.glob(
            os.path.join(p_ref_csv_files, test,
                         '{}_*csv'.format(test_cfg.ref_name)))
        if len(p_csv_files[test]) == 0:
            log.error('No reference files found in {}'.format(p_ref_csv_files))

        # implicit string concatenation instead of a backslash continuation,
        # which would embed a newline and indentation in the log message
        log.debug('{} reference(s) found for test '
                  '{}'.format(len(p_csv_files[test]), test))

        # create big dataframe containing all reference exps
        df_ref[test] = create_big_df(test_cfg.ref_name,
                                     list_csv_files=p_csv_files[test])

        # Exclude all the non-desired variables (1) var from file, 2) exp)
        full_p_f_vars = os.path.join(paths.p_f_vars_proc, test,
                                     f_vars_to_extract)
        vars_to_analyse = list(
            pd.read_csv(full_p_f_vars, sep=',')['var'].values)
        vars_to_analyse.append('exp')
        try:
            df_ref[test] = df_ref[test][vars_to_analyse]
        except KeyError as e:
            log.warning(e)
            log.error('Variables defined in {} are not contained in reference '
                      '{}'.format(utils.rel_path(f_vars_to_extract),
                                  utils.rel_path(p_ref_csv_files)))

        df_exp[test] = results_data_processing[test][vars_to_analyse]

        log.info('References for test {} prepared'.format(test))

        testresult_csv[test] = os.path.join(
            p_stages, 'result_{}_{}.csv'.format(test, new_exp))

        if test == 'welch':
            log.banner('')
            log.banner("Perform Welch's t-test for each variable")
            log.banner('')
            df_result[test] = welch_test(
                df_a=df_ref[test],
                df_b=df_exp[test],
                filename_student_test=testresult_csv[test])
            df_result[test]['p-value [%]'] = df_result[test]['p-value'] * 100.

        if test == 'fldcor':
            log.banner('')
            log.banner("Perform fldcor test for each variable")
            log.banner('')
            df_result[test] = pattern_correlation(df_exp[test], test_cfg)

        if test == 'emi':
            log.banner('')
            log.banner("Perform emission test for each variable")
            log.banner('')
            df_result[test] = emissions(df_exp[test], df_ref[test], test_cfg)

        if test == 'rmse':
            log.banner('')
            log.banner("Perform rmse test for each variable")
            log.banner('')
            df_result[test] = rmse(df_exp[test], test_cfg)

        df_result[test] = sort_level_metric(df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)
        df_result[test] = add_color_df_result(df_result[test],
                                              test_cfg.metric_threshold)

        print_warning_color(df_result[test], test_cfg.metric_threshold,
                            test_cfg.metric)

    # Evaluate all results only after every test has been processed.
    # BUGFIX: this block used to sit inside the loop above, shadowing the
    # loop variable and indexing df_result for tests not yet computed
    # (KeyError on the first iteration whenever len(tests) > 1).
    if ltestsuite:
        for test in tests:
            test_cfg = get_config_of_current_test(test)
            utils.exit_if_testresult_is_bad(test, df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)

    return df_result, df_ref
Example #17
0
    # NOTE(review): this is the body of the script entry point; its
    # enclosing line (presumably `if __name__ == '__main__':`) is outside
    # this view — confirm against the full file.
    log.banner('Start execute {} as main()'.format(__file__))

    # make all paths from user to absolute paths
    args.wrk_dir = utils.abs_path(args.wrk_dir)
    args.p_stages = utils.abs_path(args.p_stages)
    args.f_pattern_ref = utils.abs_path(args.f_pattern_ref)

    # data processing takes a while, check that no step is done twice
    actions = utils.determine_actions_for_data_processing(
        args.exp, args.tests, args.p_stages, args.lclean)

    # create directories (no error if they already exist)
    os.makedirs(args.p_stages, exist_ok=True)
    os.makedirs(args.wrk_dir, exist_ok=True)

    # go to working directory; subsequent relative paths resolve there
    os.chdir((args.wrk_dir))
    log.info('Current directory is {}'.format(args.wrk_dir))

    # launch the actual data processing
    main(exp=args.exp,
         actions=actions,
         tests=args.tests,
         spinup=args.spinup,
         p_raw_files=args.p_raw_files,
         raw_f_subfold=args.raw_f_subfold,
         p_stages=args.p_stages,
         f_vars_to_extract=args.f_vars_to_extract,
         f_pattern_ref=args.f_pattern_ref)

    log.banner('End execute {} as main()'.format(__file__))
Example #18
0
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
    Perform standard post-processing using cdo.

    Arguments:
        exp            = experiment name
        test           = name of current test to process data
        spinup         = number of files (from beginning of simulation)
                         to ignore due to model spinup
        p_raw_files    = path to raw model output
        raw_f_subfold  = subfolder in p_raw_files with model output
                         [p_raw_files]/[raw_f_subfold]
        p_stages       = directory where processing steps are stored
        f_vars_to_extract = csv file containing the variables to process

    Returns:
        (ofile_tot, skip_next_steps):
            ofile_tot       = netCDF filename containing the fields as
                              defined in f_vars_to_extract
            skip_next_steps = True when the data was already converted to a
                              dataframe (ECHAM special case) and the next
                              processing step can be skipped
    '''

    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process:
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define cdo expressions of the form 'var=formula'
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []  # list files giving error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, echam specific :
    # if the folder containing the Raw files have been deleted,
    # but folder 'Data' contains already global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))

        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')

        # BUGFIX: initialise so an unknown test name cannot raise a
        # NameError in the len() check below
        time_series_altern_fold = []
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))
        if test in ('fldcor', 'rmse'):
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'multi_annual_means_*.nc'))
        if test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))

        if len(time_series_altern_fold) < 1:
            # BUGFIX: log the directory that was searched instead of the
            # (empty) result list
            log.error('Could not find files in alternative directory '
                      '{}'.format(p_altern_timeser_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))

            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')

            if len(time_series_altern_fold) == 1:
                index_ts = 0
            if len(time_series_altern_fold) > 1:
                # several candidate files: let the user pick one interactively
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # If index positive, copy the time serie and exit
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))

                # rename ECHAM-specific variables to the names expected
                # by the rest of the processing chain
                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leave ECHAM-only code-section! '
                            'You are save again...')
                return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))

    log.banner('Time for a coffee...')

    # loop over output stream
    for stream in df_vars['file'].unique():

        # extract all lines with file f
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream f
        # restart files and {}m.format(stream) e.g. echamm.nc
        # files are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files) if sum([
                s in os.path.basename(fn)
                for s in ['stream', '{}m'.format(stream)]
            ]) == 0
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronological order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()

        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream f
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract form netcdf
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # Extract variables needed from big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                # keep going; collect the failing file for the final report
                files_error.append(ifile_bsn)

        # Merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        expr_str = ';'.join((df_file.expr.values))
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep trace of output file per stream
        files_proceed.append(ofile_str)

        # cleaning (plain loop instead of a side-effect list comprehension)
        for tmp_file in tmp_selvar_files:
            os.remove(tmp_file)
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    for proc_file in files_proceed:
        os.remove(proc_file)

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)