コード例 #1
0
def create_big_df(ref_names, list_csv_files, filename_csv=''):
    '''
    Concatenate the content of several csv files into one big dataframe.

    Every file is read with ';' as separator and gets an additional
    column 'exp'. Its value is the file's basename without the '.csv'
    extension and without any 'glob_means_' occurrence. Entries of
    list_csv_files that are not existing files are reported with a
    warning and skipped.

    :param ref_names: not used inside this function (kept so that
                      existing callers keep working)
    :param list_csv_files: list of csv files for the big dataframe
    :param filename_csv: if not empty, the big dataframe is additionally
                         written to this file (';' separated)
    :return: big dataframe containing the whole data; an empty dataframe
             if no file could be read
    '''

    # dataframes of the individual experiments, in the order of the input
    frames = []

    for fexp in list_csv_files:

        if not os.path.isfile(fexp):
            log.warning('csv file is not a file : {}'.format(fexp))
            continue

        exp = os.path.basename(fexp)
        # drop the extension as a real suffix: str.rstrip('.csv') removes
        # any trailing '.', 'c', 's' or 'v' characters and therefore
        # mangles experiment names ending in one of those letters
        if exp.endswith('.csv'):
            exp = exp[:-len('.csv')]
        exp = exp.replace('glob_means_', '')

        # read the csv file
        df_exp = pd.read_csv(fexp, sep=';')
        df_exp['exp'] = exp
        frames.append(df_exp)

    # DataFrame.append no longer exists in pandas >= 2.0; a single concat
    # also avoids copying the growing dataframe at every iteration.
    # pd.concat refuses an empty list, hence the explicit fallback.
    if frames:
        df_tot = pd.concat(frames, sort=False)
    else:
        df_tot = pd.DataFrame()

    if len(filename_csv) > 0:
        df_tot.to_csv(filename_csv, sep=';')

    return df_tot
コード例 #2
0
ファイル: utils.py プロジェクト: C2SM/clim-sanity-checker
def determine_actions_for_data_processing(exp, tests, p_stages, lforce):

    actions = {'standard_postproc': {}, 'test_postproc': {}}

    if lforce:
        log.warning('Redo all processing steps')

    # see if standard-postprocessing is needed
    for test in tests:

        standard_proc_nc = os.path.join(
            p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp))
        if (not os.path.isfile(standard_proc_nc) or lforce):
            action_needed = True
        else:
            action_needed = False

        actions['standard_postproc'][test] = action_needed

        test_specific_csv = os.path.join(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        if (not os.path.isfile(test_specific_csv) or lforce
                or actions['standard_postproc'][test]):

            action_needed = True
        else:
            action_needed = False

        actions['test_postproc'][test] = action_needed

    log.debug('actions: {}'.format(actions))

    return (actions)
コード例 #3
0
ファイル: utils.py プロジェクト: C2SM/clim-sanity-checker
def shell_cmd(cmd, py_routine, lowarn=False):
    """
    Run cmd through a shell (subprocess.Popen) and hand back its output.

    cmd        = command line, executed with shell=True
    py_routine = name of the calling routine, only used in the debug message
    lowarn     = True -> a failing command only produces a warning and the
                 returned status is 1 (To use with caution!)
                 False -> a failing command is reported with log.error
                 (presumably this ends the program - confirm in the logger)

    returns: tuple (status, stdout of cmd as string); status is 1 only
             for a failing command with lowarn = True, otherwise 0
    """

    # launch the command and wait for it, capturing both streams as text
    process = subprocess.Popen(cmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
    std_out, std_err = process.communicate()

    # command went fine: nothing else to report
    if process.returncode == 0:
        return (0, str(std_out))

    log.debug("{} (shell_cmd): ERROR in the command: \n {}".format(
        py_routine, cmd))

    if not lowarn:
        log.error("Error returned: {}".format(std_err))
        return (0, str(std_out))

    log.warning("Shell command failed, but explicitly "
                "keep program alive: \n {}".format(std_err))
    return (1, str(std_out))
コード例 #4
0
    def __init__(self, lev, metric_threshold, color_var):
        '''
        Store level, threshold and colors of one metric threshold.

        :param lev: stored as self.level
        :param metric_threshold: stored as self.p_thresh
        :param color_var: color name, stored as self.col_graph; the
                          names 'Red', 'DarkRed', 'Orange' and 'Green'
                          also select the text style self.col_txt, any
                          other name falls back to Style.BLACK
        '''

        # text style belonging to each known color name
        text_styles = {
            'Red': Style.RED,
            'DarkRed': Style.RED_HIGHL,
            'Orange': Style.ORANGE,
            'Green': Style.GREEN
        }

        if color_var in text_styles:
            self.col_txt = text_styles[color_var]
        else:
            log.warning('No text color associated with {} - '
                        'setting to BLACK'.format(color_var))
            self.col_txt = Style.BLACK

        # other properties
        self.level = lev
        self.p_thresh = metric_threshold
        self.col_graph = color_var
コード例 #5
0
def print_statistics_of_raw_files(ifiles, stream, exp):
    '''
    Log the years for which raw model output files were found.

    A date string is derived from the basename of each file and parsed
    with the formats '%Y_%m' and '%Y%m' (first match wins). If the date
    of at least one file cannot be parsed, only a warning is logged.

    :param ifiles: list of raw model output files
    :param stream: name of the output stream, part of the file names
    :param exp: experiment name, part of the file names

    :return: None
    '''

    date_formats = ('%Y_%m', '%Y%m')

    # NOTE(review): str.strip() removes a *set of characters* from both
    # ends, not a prefix/suffix - experiment or stream names sharing
    # characters with the date may be over-stripped. Confirm against the
    # real file name layout.
    stream_chars = '_{}_.nc'.format(stream)
    exp_chars = '{}_'.format(exp)

    years_found = []
    no_summary = False

    for path in ifiles:
        basename = os.path.basename(path)
        datestring = basename.strip(stream_chars).strip(exp_chars).strip('.')

        # try the known formats until one of them fits
        parsed = None
        for fmt in date_formats:
            try:
                parsed = datetime.datetime.strptime(datestring, fmt)
            except ValueError:
                continue
            break

        if parsed is None:
            no_summary = True
        elif parsed.year not in years_found:
            years_found.append(parsed.year)

    if no_summary:
        log.warning('Could not determine years '
                    'due to an unkown pattern in the filenames')
        return

    log.info('{} files with model output '
             'found for years:'.format(len(ifiles)))
    for year in years_found:
        log.info(year)
コード例 #6
0
def print_warning_color(df_result, metric_thresholds, metric):
    '''
    Log the variables of df_result whose level is not 'high'.

    The rows are grouped by level and each group is passed through the
    col_txt callable of the corresponding threshold before logging. If
    every row has level 'high', a single message stating that the
    experiment is fine is logged instead.

    :param df_result: dataframe with a column 'level'
    :param metric_thresholds: indexable sequence of threshold objects
                              providing the attributes level, col_txt
                              and p_thresh; the p_thresh of the element
                              at index 1 is quoted in the "fine" message
    :param metric: name of the metric, only used in the messages

    :return: None
    '''

    separator = ('-------------------------------------------'
                 '-------------------------------------------'
                 '--------------------')

    # dataframe containing only variables a warning has to be printed
    df_warning = df_result[df_result['level'] != 'high']

    log.info(separator)

    if df_warning.size > 0:

        log.warning('The following variables give problematic '
                    '{} : \n'.format(metric))

        # for each level of warning, print the dataframe
        for metric_lev in metric_thresholds:

            # compare the level *name*: the threshold object itself is
            # never equal to the string 'high'
            if metric_lev.level == 'high':
                continue

            # dataframe containing only this level of warning
            df_print_warn = df_warning[df_warning.level == metric_lev.level]

            if df_print_warn.size > 0:
                log.info('Confidence is {} for {} '.format(
                    metric_lev.level.upper(), metric))
                log.info(metric_lev.col_txt(df_print_warn))
    else:
        log.info(
            Style.GREEN('The experiment is fine. '
                        'No {} under {} \n').format(
                            metric, metric_thresholds[1].p_thresh))

    log.info(separator)
コード例 #7
0
def add_line_descr_f(exp, f_exp_descr):
    '''
    Interactively add a line for experiment exp in the file f_exp_descr

    The user is asked for one value per column (except 'Experiment
    name', which is set to exp), gets the resulting table displayed and
    can then accept it (y), change a single column (n) or give up
    (abort). Any other answer displays the table again.

    :param exp: new experiment name
    :param f_exp_descr: ';' separated csv file in which the new line has
                        to be added; if it does not exist yet, it is
                        created with the default set of columns

    :return: False once the file has been written. If the user types
             abort, the interpreter is left (SystemExit).
    '''
    import sys

    log.info('Adding line {} in the file {}:'.format(exp, f_exp_descr))

    # open file in dataframe
    if os.path.isfile(f_exp_descr):
        df_exp_descr = pd.read_csv(f_exp_descr, sep=';')
    else:
        # no description file yet: start from an empty table. The result
        # has to be kept, otherwise df_exp_descr is undefined below.
        cols_exp_descr_f = [
            'Experiment name', 'Platform', 'OS', 'Compiler (with version)',
            'Optimisation level (-OX)', '-fast-transcendentals (y/n)',
            '-no-prec-sqrt (y/n)', '-no-prec-div (y/n)', 'welch (y/n)',
            'fldcor (y/n)', 'rmse (y/n)', 'emi (y/n)',
            'Date of experiment (month yyyy)'
        ]
        df_exp_descr = pd.DataFrame(columns=cols_exp_descr_f)

    # collect information from user
    log.banner('Please give the following informations '
               'about your experiment')
    dict_line = {'Experiment name': exp}
    for col_name in df_exp_descr.keys():

        if col_name != 'Experiment name':

            # ask the user for info
            dict_line[col_name] = input('{} : '.format(col_name))

    # amend the information if needed
    while True:

        # new dataframe containing new line for exp
        # (DataFrame.append no longer exists in pandas >= 2.0)
        df_exp_descr_new = pd.concat(
            [df_exp_descr, pd.DataFrame([dict_line])],
            ignore_index=True,
            sort=False)

        log.banner('Here is the content of the description '
                   'file including your new experiment.')
        log.info(df_exp_descr_new)

        answ_chg = input('Is the new file right ? (y/n/abort).\n'
                         'If you type n, you will be able to change '
                         'column values\n'
                         'If you type abort, the process of adding '
                         'the experiment {} to the reference is stoped.\n'
                         '(y/n/abort) : '
                         ''.format(exp))
        answer = answ_chg.upper()

        if answer == 'Y':
            # save new file
            df_exp_descr_new.to_csv(f_exp_descr, sep=';', index=False)

            # get out of the loop
            return False

        elif answer == 'N':
            answ_col = input('Which column field you want to change ?')

            if answ_col in df_exp_descr.keys():
                dict_line[answ_col] = input('{} : '.format(answ_col))
            else:
                log.warning('{} not in columns!'.format(answ_col))
                log.info('Columns are {}\n'.format(list(df_exp_descr.columns)))

        elif answer == 'ABORT':
            sys.exit()
コード例 #8
0
def main(exp, actions, tests, spinup, p_raw_files, p_stages, raw_f_subfold,
         f_vars_to_extract, f_pattern_ref):

    log.banner('Start standard-postprocessing')

    results_data_processing = {}
    processed_netcdf_filename = {}
    skip_next_step = {}

    # init in case standard_postproc is skipped
    for test in tests:
        skip_next_step[test] = False

    for test in tests:
        if (actions['standard_postproc'][test]):
            processed_netcdf_filename[test], skip_next_step[test] = \
                standard_postproc(exp,
                                  test=test,
                                  spinup=spinup,
                                  p_raw_files=p_raw_files,
                                  raw_f_subfold=raw_f_subfold,
                                  p_stages=p_stages,
                                  f_vars_to_extract=f_vars_to_extract)
        else:
            log.info('Data already processed for test {}'.format(test))
            processed_netcdf_filename[test] = utils.clean_path(
                p_stages, 'standard_postproc_{}_{}.nc'.format(test, exp))

    log.banner('End standard-postprocessing')

    log.banner('Start conversion from NetCDF to dataframe')

    if 'welch' in tests:

        test = 'welch'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            # transforming netcdf timeseries into csv file
            results_data_processing[test] = timeser_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning("Skip Welch's-Test")

    if 'emi' in tests:

        test = 'emi'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            results_data_processing[test] = emis_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip emission test')

    if 'fldcor' in tests:

        test = 'fldcor'

        if (actions['test_postproc'][test] and not skip_next_step[test]):

            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)

            results_data_processing[test] = pattern_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip pattern correlation test')

    if 'rmse' in tests:

        test = 'rmse'

        if (actions['test_postproc'][test] and not skip_next_step[test]):
            test = 'rmse'

            f_pattern_ref = download_ref_to_stages_if_required(
                f_pattern_ref, p_stages, f_vars_to_extract, test)

            results_data_processing[test] = rmse_proc_nc_to_df(
                exp,
                filename=processed_netcdf_filename[test],
                p_stages=p_stages,
                reference=f_pattern_ref)
        else:
            log.info('Processing for test {} already done'.format(test))
            f_csv = os.path.join(p_stages,
                                 'test_postproc_{}_{}.csv'.format(test, exp))
            results_data_processing[test] = pd.read_csv(f_csv, sep=';')
    else:
        log.warning('Skip Rmse test')

    log.banner('End conversion from NetCDF to dataframe')

    return (results_data_processing)
コード例 #9
0
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
    Perform standard post-processing using cdo

    For every output stream listed in the variable file, the needed
    variables are extracted from each raw file, the extracts are copied
    into one file and the expressions of the variable file are computed.
    The per-stream results are finally merged into one netCDF file.
    Temporary files are written to (and removed from) the current
    working directory.

    If the raw folder does not exist, already processed files are
    searched in [p_raw_files]/[exp]/Data instead (ECHAM specific).

    Arguments:
        exp            = experiment name
        test           = name of current test to process data
        spinup         = number of files (from beginning of simulation)
                         to ignore due to model spinup
        p_raw_files    = path to raw model output
        raw_f_subfold  = subfolder in p_raw_files with model output
                         [p_raw_files]/[exp]/[raw_f_subfold]
        p_stages       = directory where processing steps are stored
        f_vars_to_extract = csv file containing the variables to process
                         (columns var, formula and file are used)

    returns:
        tuple (netCDF filename, skip_next_steps)
        netCDF filename = [p_stages]/standard_postproc_[test]_[exp].nc
        skip_next_steps = True only if a file of the alternative folder
                          was used for test 'welch' (its conversion to a
                          dataframe is then already triggered here)
    '''

    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process:
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []  # list files giving error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, echam specific :
    # if the folder containing the Raw files have been deleted,
    # but folder 'Data' contains already global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))

        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')

        # file name pattern of the already processed data for each test
        if test == 'welch':
            altern_pattern = 'timeser_daint_*.nc'
        elif test == 'fldcor' or test == 'rmse':
            altern_pattern = 'multi_annual_means_*.nc'
        elif test == 'emi':
            altern_pattern = 'emi_*.nc'
        else:
            altern_pattern = None

        # an unknown test has no alternative files; the empty list makes
        # it end up in the error branch below instead of a NameError
        if altern_pattern is None:
            time_series_altern_fold = []
        else:
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, altern_pattern))

        if len(time_series_altern_fold) < 1:
            # report the directory that was searched (the list of
            # matches is empty here and carries no information)
            log.error('Could not find files in alternative directory '
                      '{}'.format(p_altern_timeser_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))

            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')

            if len(time_series_altern_fold) == 1:
                index_ts = 0
            else:
                # several candidates: let the user choose
                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # If index positive, copy the time serie and exit
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))

                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leave ECHAM-only code-section! '
                            'You are save again...')
                return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))

    log.banner('Time for a coffee...')

    # loop over output stream
    for stream in df_vars['file'].unique():

        # extract all lines with file f
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        # which have stream f
        # restart files and {}m.format(stream) e.g. echamm.nc
        # files are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        excluded_tags = ['stream', '{}m'.format(stream)]
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files)
            if not any(tag in os.path.basename(fn) for tag in excluded_tags)
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronological order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()

        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream f
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract from netcdf
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # Extract variables needed from big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            # a failing extraction must not stop the whole processing,
            # the file is only remembered and reported at the end
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                files_error.append(ifile_bsn)

        # Merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        expr_str = ';'.join((df_file.expr.values))
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep trace of output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        for tmp_file in tmp_selvar_files:
            os.remove(tmp_file)
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    for stream_file in files_proceed:
        os.remove(stream_file)

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)
コード例 #10
0
def main(new_exp, results_data_processing, tests, p_stages, p_ref_csv_files,
         ltestsuite, f_vars_to_extract):
    '''
    Compare the processed data of new_exp with the references.

    For every test the reference csv files found below
    [p_ref_csv_files]/[test] are merged into one dataframe, reference
    and experiment are restricted to the variables listed in
    f_vars_to_extract (plus the column 'exp'), the test specific
    statistics are computed and the result is sorted, colored and
    printed.

    :param new_exp: name of the experiment to test; written into the
                    column 'exp' of results_data_processing[test]
                    (the passed dataframes are modified)
    :param results_data_processing: dict test name -> dataframe with
                                    the processed data of new_exp
    :param tests: names of the tests to perform
    :param p_stages: directory used for the name of the result csv file
    :param p_ref_csv_files: directory containing one subfolder with
                            reference csv files per test
    :param ltestsuite: if true, utils.exit_if_testresult_is_bad is
                       called for every test once all tests are done
    :param f_vars_to_extract: csv file (column 'var') naming the
                              variables to analyse

    :return: tuple (df_result, df_ref), two dicts test name -> dataframe
    '''

    df_exp = {}
    df_ref = {}
    p_csv_files = {}
    testresult_csv = {}
    df_result = {}

    for test in tests:
        log.info('Prepare references for test {}'.format(test))

        test_cfg = get_config_of_current_test(test)

        results_data_processing[test]['exp'] = new_exp

        # list of paths to all csv files
        p_csv_files[test] = glob.glob(
            os.path.join(p_ref_csv_files, test,
                         '{}_*csv'.format(test_cfg.ref_name)))
        if len(p_csv_files[test]) == 0:
            log.error('No reference files found in {}'.format(p_ref_csv_files))

        # adjacent literals instead of a backslash inside the string:
        # the latter puts the indentation of the source into the message
        log.debug('{} reference(s) found for test '
                  '{}'.format(len(p_csv_files[test]), test))

        # create big dataframe containing all reference exps
        df_ref[test] = create_big_df(test_cfg.ref_name,
                                     list_csv_files=p_csv_files[test])

        # Exclude all the non-desired variables (1) var from file, 2) exp)
        full_p_f_vars = os.path.join(paths.p_f_vars_proc, test,
                                     f_vars_to_extract)
        vars_to_analyse = list(
            pd.read_csv(full_p_f_vars, sep=',')['var'].values)
        vars_to_analyse.append('exp')
        try:
            df_ref[test] = df_ref[test][vars_to_analyse]
        except KeyError as e:
            log.warning(e)
            log.error('Variables defined in {} are not contained in '
                      'reference {}'.format(
                          utils.rel_path(f_vars_to_extract),
                          utils.rel_path(p_ref_csv_files)))

        df_exp[test] = results_data_processing[test][vars_to_analyse]

        log.info('References for test {} prepared'.format(test))

        testresult_csv[test] = os.path.join(
            p_stages, 'result_{}_{}.csv'.format(test, new_exp))

        if test == 'welch':
            log.banner('')
            log.banner("Perform Welch's t-test for each variable")
            log.banner('')
            df_result[test] = welch_test(
                df_a=df_ref[test],
                df_b=df_exp[test],
                filename_student_test=testresult_csv[test])
            df_result[test]['p-value [%]'] = df_result[test]['p-value'] * 100.

        elif test == 'fldcor':
            log.banner('')
            log.banner("Perform fldcor test for each variable")
            log.banner('')
            df_result[test] = pattern_correlation(df_exp[test], test_cfg)

        elif test == 'emi':
            log.banner('')
            log.banner("Perform emission test for each variable")
            log.banner('')
            df_result[test] = emissions(df_exp[test], df_ref[test], test_cfg)

        elif test == 'rmse':
            log.banner('')
            log.banner("Perform rmse test for each variable")
            log.banner('')
            df_result[test] = rmse(df_exp[test], test_cfg)

        df_result[test] = sort_level_metric(df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)
        df_result[test] = add_color_df_result(df_result[test],
                                              test_cfg.metric_threshold)

        print_warning_color(df_result[test], test_cfg.metric_threshold,
                            test_cfg.metric)

    # The evaluation for the testsuite needs the results of *all* tests,
    # so it has to run after the loop above: inside the loop it would
    # read df_result of tests that have not been computed yet.
    if ltestsuite:
        for test in tests:
            test_cfg = get_config_of_current_test(test)
            utils.exit_if_testresult_is_bad(test, df_result[test],
                                            test_cfg.metric_threshold,
                                            test_cfg.metric)

    return df_result, df_ref