Beispiel #1
0
def rmse_proc_nc_to_df(exp, filename, reference, p_stages):
    '''
Process a netCDF file into a dataframe for the RMSE / pattern
correlation test: compute the multi-year mean field, normalize both
the experiment and the reference, take the field-mean RMSE between
them, and export the result as CSV.

Arguments: 
    exp       = experiment name
    filename  = filename of the netCDF returned by 
                function standard_postproc
    reference = filename to the reference
    p_stages  = directory where processing steps are stored

returns:
    dataframe with processed data for pattern correlation test
    '''

    test = 'rmse'
    rmse_interim = 'test_postproc_intermediate_{}_{}.nc'.format(test, exp)

    rmse_filename = 'test_proc_{}_{}.nc'.format(test, exp)

    # collapse to a single multi-year mean field (vertical sum first)
    cdo_cmd = 'cdo -L timmean -yearmean -vertsum {} {}'.format(
        filename, rmse_interim)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # normalize both files so RMSE is comparable across variables
    reference_normalized = normalize_data(reference)
    rmse_interim_normalized = normalize_data(rmse_interim)

    # list of variables in the timeserie netcdf file to drop
    # (not to put into the dataframe)
    vars_to_drop = []

    log.info('Compute root mean square error '
             'between {} and {} (reference)'.format(rmse_interim_normalized,
                                                    reference_normalized))

    # RMSE = sqrt(fldmean((exp - ref)^2))
    cdo_cmd = 'cdo -L sqrt -fldmean -sqr -sub {} {} {}'.format(
        rmse_interim_normalized, reference_normalized, rmse_filename)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    # open dataset
    data = xr.open_dataset(rmse_filename)

    # Delete variables
    # useless variable time_bnds
    if ('time_bnds' in data.keys()):
        data = data.drop('time_bnds')
    # 3D vars
    # NOTE: xarray's drop returns a new dataset; the original code
    # discarded the result, making this a silent no-op.
    if len(vars_to_drop) > 0:
        data = data.drop(labels=vars_to_drop)

    # transforms into dataframe
    df_data = data.to_dataframe()

    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')
    log.info('CSV file can be found here: {}'.format(csv_filename))

    log.info('Finished {} for file {}'.format(__name__, rmse_filename))

    return (df_data)
Beispiel #2
0
def download_ref_to_stages_if_required(f_pattern_ref, p_stages,
                                       f_vars_to_extract, test):
    '''
Return the reference file to use for *test*.

If the caller did not supply one (i.e. f_pattern_ref is still the
default paths.rootdir), fetch it from the ftp-server listed in the
ftp-link file derived from f_vars_to_extract and store it in p_stages;
otherwise keep the user-supplied file unchanged.
    '''

    # user supplied an explicit reference file -> nothing to download
    if f_pattern_ref != paths.rootdir:
        log.info('Using user-defined reference file for test '
                 '{}'.format(test))
        return f_pattern_ref

    log.info('Download reference file from ftp-server')

    # the ftp-link file mirrors the vars file name: vars_*.csv -> ftp_*.txt
    link_fname = f_vars_to_extract.replace('.csv', '.txt').replace(
        'vars_', 'ftp_')
    link_dir = os.path.join(paths.p_f_vars_proc, test)
    link_path = utils.clean_path(link_dir, link_fname)

    downloaded_ref = os.path.join(p_stages, 'ftp_ref_pattern.nc')

    wget_cmd = ('wget --input-file={} '
                '--output-document={}'.format(link_path, downloaded_ref))
    log.debug('ftp-command: {}'.format(wget_cmd))
    utils.shell_cmd(wget_cmd, py_routine=__name__)

    return downloaded_ref
Beispiel #3
0
def timeser_proc_nc_to_df(exp, filename, p_stages, already_a_timeseries=False):
    '''
Process a netCDF file into a yearly global-mean time series and
export it as a dataframe/CSV for the Welch's t-test.

Arguments: 
    exp      = experiment name
    filename = filename of the netCDF returned by function standard_postproc
    p_stages = directory where processing steps are stored
    already_a_timeseries = skip the CDO aggregation step when the
                           input file is already a time series

returns:
    dataframe with processed data for welchstest
    '''

    test = 'welch'

    if not already_a_timeseries:
        timeser_filename = 'test_postproc_{}_{}.nc'.format(test, exp)
        # yearly, global (field-mean), vertically-summed time series
        cdo_cmd = 'cdo -L yearmean -fldmean -vertsum {} {}'.format(
            filename, timeser_filename)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)
    else:
        log.debug('Skipping CDO-processing step')
        timeser_filename = filename

    # list of variables in the timeserie netcdf
    # file to drop (not to put into the dataframe)
    vars_to_drop = []

    log.info('Processing netCDF: {}'.format(timeser_filename))

    # open dataset
    data = xr.open_dataset(timeser_filename)

    # Delete variables
    # useless variable time_bnds
    if ('time_bnds' in data.keys()):
        data = data.drop('time_bnds')
    # 3D vars
    # NOTE: xarray's drop returns a new dataset; the original code
    # discarded the result, making this a silent no-op.
    if len(vars_to_drop) > 0:
        data = data.drop(labels=vars_to_drop)

    # removed degenerated dimensions
    data = data.squeeze(drop=True)

    # transforms into dataframe
    df_data = data.to_dataframe()

    # export in a file
    os.makedirs(p_stages, exist_ok=True)
    csv_filename = os.path.join(p_stages,
                                'test_postproc_{}_{}.csv'.format(test, exp))
    df_data.to_csv(csv_filename, index=None, header=True, sep=';')
    log.info('CSV file can be found here: {}'.format(csv_filename))

    log.info('Finished {} for file {}'.format(__name__, timeser_filename))

    return (df_data)
Beispiel #4
0
def main(exp,
         tests,
         p_stages=paths.p_stages,
         p_ref_csv_files=paths.p_ref_csv_files,
         ltestsuite=False,
         lverbose=False):
    '''
Add the results of *exp* to the reference pool: collect the experiment
description and the per-test CSV (and Welch's-test PDF) files, copy
them into the reference directory and commit them on a new git branch.

Arguments:
    exp             = experiment name
    tests           = list of test names to add references for
    p_stages        = directory where processing steps are stored
    p_ref_csv_files = root directory of the reference pool
    ltestsuite      = dry-run mode: skip user input, file copies
                      and git commands
    lverbose        = verbosity flag (currently unused here)
    '''

    # initialisation
    new_branch_name = 'test_add_{}'.format(exp)
    files_to_commit = []

    # fill up file 'Exps_description.csv' with additional
    # information via user input
    f_exp_descr = os.path.join(p_ref_csv_files, 'Exps_description.csv')
    if not ltestsuite:
        add_line_descr_f(exp=exp, f_exp_descr=f_exp_descr)
    files_to_commit.append(f_exp_descr)

    for test in tests:
        test_cfg = get_config_of_current_test(test)

        csv_file = utils.clean_path(
            p_stages, 'test_postproc_{}_{}.csv'.format(test, exp))

        # what is the filename in the reference pool
        filename_in_ref_dir = '{}_{}.csv'.format(test_cfg.ref_name, exp)
        # what is the location to store that file
        place_for_reference = os.path.join(p_ref_csv_files, test,
                                           filename_in_ref_dir)

        log.debug('Copy {} to {}'.format(csv_file, place_for_reference))

        if not ltestsuite:
            shutil.copy(csv_file, place_for_reference)

        files_to_commit.append(place_for_reference)

        # copy pdf with bar-plots from Welch's-test
        if test == 'welch':

            pdf_file = utils.clean_path(
                p_stages, '{}_{}.pdf'.format(test_cfg.ref_name, exp))

            # what is the name of the pdf in the reference pool
            filename_in_ref_dir = '{}_plots.pdf'.format(test_cfg.ref_name)
            # what is the location to store that file
            place_for_reference = os.path.join(p_ref_csv_files, test,
                                               filename_in_ref_dir)

            # BUGFIX: the original logged csv_file here although the
            # file actually copied below is pdf_file
            log.debug('Copy {} to {}'.format(pdf_file, place_for_reference))
            files_to_commit.append(place_for_reference)

            if not ltestsuite:
                shutil.copy(pdf_file, place_for_reference)

    # root is important to not fail during git commands
    os.chdir(paths.rootdir)

    # checkout new branch
    if not ltestsuite:
        log.info('Create and checkout new branch {}'.format(new_branch_name))
        git_cmd = 'git checkout -B {}'.format(new_branch_name)
        utils.shell_cmd(git_cmd, py_routine='add_exp_to_ref.py')

        # commit all modified files prior in the function to git
        for file in files_to_commit:
            git_cmd = 'git add {}'.format(file)
            log.debug(git_cmd)
            utils.shell_cmd(git_cmd, py_routine=__name__)

        log.debug('Commit files {}'.format(files_to_commit))
        commit_message = input('Please type your commit message :')
        git_cmd = 'git commit -m "{}"'.format(commit_message)
        utils.shell_cmd(git_cmd, py_routine=__name__)

    # Finish
    log.info(
        Style.GREEN(
            'Files are added in the new branch: '
            '{} in your local git repository.'.format(new_branch_name)))
    log.info('To add the file to the official repository, '
             'please perform the following steps:')
    log.info('1. Push the new branch into the official repo:')
    log.info('   git push --set-upstream origin {}'.format(new_branch_name))
    log.info('2. On the Open Web interface (GitHub) , open a Pull Request.')

    log.banner('End add_exp_to_ref for experiment {}'.format(exp))
    return ()
Beispiel #5
0
def normalize_data(dataset):
    '''
Normalize every field in *dataset* to zero mean and unit standard
deviation over the horizontal domain:

    normalized = (field - fldmean(field)) / fldstd(field)

Intermediate netCDF files are written next to the input (old ones are
removed first); the filename of the normalized dataset is returned.
    '''

    log.info('Normalize fields in {} with mean and '
             'standard deviation'.format(dataset))

    base = dataset.replace('.nc', '')
    std_data = f'{base}_std.nc'
    std_data_enlarged = f'{base}_std_enlarged.nc'
    mean_data = f'{base}_mean.nc'
    mean_data_enlarged = f'{base}_enlarged.nc'
    sub_data = f'{base}_sub.nc'
    normalized_data = f'{base}_normalized.nc'

    log.debug('Clean intermediate files for normalization')
    rm_cmd = 'rm {} {} {} {} {} {}'.format(std_data, mean_data,
                                           std_data_enlarged,
                                           mean_data_enlarged, sub_data,
                                           normalized_data)
    utils.shell_cmd(rm_cmd, py_routine=__name__, lowarn=True)

    # run the normalization pipeline step by step:
    # field std/mean, broadcast both back onto the grid,
    # subtract the mean, divide by the std
    cdo_steps = [
        f'cdo -L fldstd {dataset} {std_data}',
        f'cdo -L fldmean {dataset} {mean_data}',
        f'cdo -L -enlarge,{dataset} {mean_data} {mean_data_enlarged}',
        f'cdo -L -enlarge,{dataset} {std_data} {std_data_enlarged}',
        f'cdo -L sub {dataset} {mean_data_enlarged} {sub_data}',
        f'cdo -L div {sub_data} {std_data_enlarged} {normalized_data}',
    ]
    for cdo_cmd in cdo_steps:
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

    return normalized_data
Beispiel #6
0
def standard_postproc(exp, test, spinup, p_raw_files, raw_f_subfold, p_stages,
                      f_vars_to_extract):
    '''
Perform standard post-processing using cdo.

Arguments: 
    exp            = experiment name
    test           = name of current test to process data
    spinup         = number of files (from beginning of simulation) 
                     to ignore due to model spinup
    p_raw_files    = path to raw model output
    raw_f_subfold  = subfolder in p_raw_files with model output 
                     [p_raw_files]/[raw_f_subfold]
    p_stages       = directory where processing steps are stored
    f_vars_to_extract = csv file containing the variables to proceed

returns: 
   tuple (netCDF filename containing the fields as defined in
   f_vars_to_extract, flag telling the caller to skip the next
   processing step)
    '''

    log.info('Postprocess data using CDO for test {}'.format(test))

    # check that exp is defined
    if exp is None:
        log.error('Experiment is not defined.\n exp = {}'.format(exp))

    # get variables to process:
    p_test_vars_proc = os.path.join(paths.p_f_vars_proc, test)
    full_p_f_vars = utils.clean_path(p_test_vars_proc, f_vars_to_extract)
    df_vars = pd.read_csv(full_p_f_vars, sep=',')

    # define expressions
    # one cdo expr per row: "<output var>=<formula>"
    df_vars['expr'] = df_vars['var'] + '=' + df_vars['formula']

    # name of output file
    ofile_tot = os.path.join(p_stages,
                             'standard_postproc_{}_{}.nc'.format(test, exp))

    # initialisation
    files_error = []  # list files giving error
    files_proceed = []  # list of files where data are collected

    # sometimes data is stored in a folder called Raw
    p_raw_folder = os.path.join(p_raw_files, exp, raw_f_subfold)

    # SPECIAL CASE, echam specific :
    # if the folder containing the Raw files have been deleted,
    # but folder 'Data' contains already global annual means
    if not os.path.isdir(p_raw_folder):
        log.warning('The folder containing the raw data '
                    'has been deleted : {}'.format(p_raw_folder))

        p_altern_timeser_fold = os.path.join(p_raw_files, exp, 'Data')
        # NOTE(review): time_series_altern_fold is only bound for the
        # tests handled below ('welch', 'fldcor', 'rmse', 'emi'); any
        # other test name would hit a NameError at the len() check —
        # presumably those are the only tests reaching this path, but
        # confirm against the callers.
        if test == 'welch':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'timeser_daint_*.nc'))

        if test == 'fldcor' or test == 'rmse':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'multi_annual_means_*.nc'))
        if test == 'emi':
            time_series_altern_fold = glob.glob(
                os.path.join(p_altern_timeser_fold, 'emi_*.nc'))

        if len(time_series_altern_fold) < 1:
            log.error('Could not find files in alternative directory '
                      '{}'.format(time_series_altern_fold))
        else:
            log.info('The alternative folder has been found instead: '
                     '{}'.format(p_altern_timeser_fold))

            log.warning('This section of code is only tested for ECHAM! '
                        'It is not recommended to use it for other cases')

            # exactly one candidate: take it; several: ask the user
            if len(time_series_altern_fold) == 1:
                index_ts = 0
            if len(time_series_altern_fold) > 1:

                for (i, item) in enumerate(time_series_altern_fold):
                    print(i, item)
                index_ts = int(
                    input('Please type the index of the file'
                          ' to use (negative means '
                          'none of them) : '))

            # If index positive, copy the time serie and exit
            # NOTE(review): with a negative index execution falls
            # through to the stream loop below even though the raw
            # folder is absent — verify that is intended.
            if index_ts >= 0:
                log.info('File used : {}'.format(
                    time_series_altern_fold[index_ts]))

                # rename ECHAM-specific variables to the names expected
                # by the downstream tests
                cdo_cmd = ('cdo -L -chname,CDNC,burden_CDNC '
                           '-chname,ICNC,burden_ICNC '
                           '-chname,SCF,SCRE -chname,LCF,LCRE '
                           '{} {}'.format(time_series_altern_fold[index_ts],
                                          ofile_tot))
                utils.shell_cmd(cdo_cmd, py_routine=__name__)

                # convert netCDF to dataframe,
                # therefore skip next processing step
                if test == 'welch':
                    timeser_proc_nc_to_df(exp,
                                          ofile_tot,
                                          p_stages,
                                          already_a_timeseries=True)
                    skip_next_steps = True
                else:
                    skip_next_steps = False

                log.warning('Leave ECHAM-only code-section! '
                            'You are save again...')
                return (ofile_tot, skip_next_steps)

    # NORMAL CASE
    else:
        log.info('Analyse files in : {}'.format(p_raw_folder))

    log.banner('Time for a coffee...')

    # loop over output stream
    for stream in df_vars['file'].unique():

        # extract all lines with file f
        df_file = df_vars[df_vars.file == stream]

        # list all available files in p_raw_files/exp/raw_f_subfold
        #which have stream f
        # restart files and {}m.format(stream) e.g. echamm.nc
        # files are not considered
        final_p_raw_files = os.path.join(p_raw_folder,
                                         '*_*{}*.nc'.format(stream))
        # keep only files whose basename contains neither the literal
        # 'stream' nor '<stream>m' (restart/monthly-mean artifacts)
        ifiles = [
            fn for fn in glob.glob(final_p_raw_files) if sum([
                s in os.path.basename(fn)
                for s in ['stream', '{}m'.format(stream)]
            ]) == 0
        ]
        if len(ifiles) == 0:
            log.warning('No raw files found for stream {} at address : \n'
                        '{}'.format(stream, final_p_raw_files))

        # sort files in chronoligcal order
        # (this will be needed for doing yearmean properly)
        ifiles.sort()

        print_statistics_of_raw_files(ifiles, stream, exp)

        # remove spin-up files
        # NOTE(review): spinup counts *files*, assumed monthly —
        # confirm the raw output really is one file per month
        log.info('Remove first {} months of data '
                 'due to model spinup'.format(spinup))
        ifiles = ifiles[int(spinup):]

        # output file for stream f
        ofile_str = '{}_{}.nc'.format(exp, stream)

        # variables to extract form netcdf
        # files (this is needed for optimization)
        variables = variables_to_extract(vars_in_expr=df_file.formula.values)

        # Extract variables needed from big files
        log.info('Extract variables from file: {}'.format(stream))

        # initialization
        tmp_selvar_files = []  # list to store the ifiles

        for ifile in ifiles:
            # basename of ifile
            ifile_bsn = os.path.basename(ifile)
            log.debug('File {}'.format(ifile_bsn))
            tmp_selvar_file = 'tmp_extract_{}'.format(ifile_bsn)

            cdo_cmd = 'cdo selvar,{} {} {}'.format(','.join(variables), ifile,
                                                   tmp_selvar_file)
            # lowarn: a failing selvar only warns; the file is recorded
            # in files_error and skipped instead of aborting the run
            out_status, out_mess = utils.shell_cmd(cdo_cmd,
                                                   py_routine=__name__,
                                                   lowarn=True)

            if out_status == 0:
                tmp_selvar_files.append(tmp_selvar_file)
            else:
                files_error.append(ifile_bsn)

        # Merge all the monthly files together
        log.info('Copy {} files'.format(stream))
        tmp_merged = 'tmp_{}_{}.nc'.format(exp, stream)
        if os.path.isfile(tmp_merged):
            os.remove(tmp_merged)

        cdo_cmd = 'cdo -copy {} {}'.format(' '.join(tmp_selvar_files),
                                           tmp_merged)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # compute needed variables
        log.info('Compute variables for file : {}'.format(stream))
        if os.path.isfile(ofile_str):
            os.remove(ofile_str)

        # apply all expressions for this stream in one cdo call and
        # map the model's fill value -9e+33 to missing
        expr_str = ';'.join((df_file.expr.values))
        cdo_cmd = 'cdo -L -setctomiss,-9e+33 -expr,"{}" {} {}'.format(
            expr_str, tmp_merged, ofile_str)
        utils.shell_cmd(cdo_cmd, py_routine=__name__)

        # keep trace of output file per stream
        files_proceed.append(ofile_str)

        # cleaning
        [os.remove(f) for f in tmp_selvar_files]
        os.remove(tmp_merged)

    # merge all stream files
    if os.path.isfile(ofile_tot):
        os.remove(ofile_tot)
    cdo_cmd = 'cdo merge {} {}'.format(' '.join(files_proceed), ofile_tot)
    utils.shell_cmd(cdo_cmd, py_routine=__name__)

    [os.remove(f) for f in files_proceed]

    # Finish
    if len(files_error) != 0:
        log.warning('Files with a problem: {}'.format(','.join(files_error)))

    log.info('Postprocess data using CDO for test {} finished. \n '
             'Output here : {}'.format(test, ofile_tot))

    # return name of output file
    return (ofile_tot, False)