Code example #1
    def test_esk608(self):
        """Test Esk-608: Execute Spark histogram filling macro"""

        # check if required Python and Java libraries are made available to worker nodes
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )
        self.assertRegexpMatches(
            sc.getConf().get('spark.jars.packages', ''),
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail'
        )
        if re.search('spark://', sc.getConf().get('spark.master', '')):
            py_mods = utils.get_file_path('py_mods')
            self.assertRegexpMatches(
                sc.getConf().get('spark.submit.pyFiles', ''), py_mods,
                'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode'
            )
            self.assertRegexpMatches(
                sc.getConf().get('spark.files', ''), py_mods,
                'Eskapade modules missing from spark.files, needed in Spark cluster mode'
            )

        # run Eskapade
        self.run_eskapade('esk608_spark_histogrammar.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        settings = proc_mgr.service(ConfigObject)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(sorted(ds['spark_df'].columns),
                             sorted(['date', 'loc', 'x', 'y']),
                             'unexpected columns in data frame')

        # data-generation checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        col_names = ['date', 'x', 'y', 'loc', 'x:y']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

        # data-summary checks
        f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
        file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in f_bases]
        for fname in file_names:
            path = persistence.io_path('results_data', settings.io_conf(),
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertTrue(statinfo.st_size > 0)
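
The configuration checks at the top of this test only pass when the Spark session runs with a local master and has the histogrammar package on its classpath. A minimal sketch of such a session in plain pyspark (illustration only; in Eskapade the session is normally configured through SparkManager):

# minimal sketch (plain pyspark, illustration only): a session that would
# satisfy the spark.master and spark.jars.packages checks above
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master('local[*]')
         .config('spark.jars.packages',
                 'org.diana-hep:histogrammar-sparksql_2.11:1.0.4')
         .getOrCreate())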
Code example #2
    def initialize(self):
        """Initialize SparkDataToCsv"""

        # check input arguments
        self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
        self.check_arg_types(mode=str, sep=str, num_files=int)
        self.check_arg_types(recurse=True, allow_none=True)
        self.check_arg_vals('read_key', 'sep')
        self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
        self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
        if self.num_files < 1:
            raise RuntimeError('requested number of files is less than 1 ({:d})'.format(self.num_files))

        # set other attributes
        self.do_execution = True

        # set default output path
        if not self.output_path:
            settings = ProcessManager().service(ConfigObject)
            self.output_path = 'file:' + persistence.io_path('results_data', settings.io_conf(), '{}_output'.format(self.name))

        # parse header argument: either a sequence of column names or a boolean flag
        try:
            self.header = tuple(self.header)
        except TypeError:
            self.header = bool(self.header)
        if isinstance(self.header, tuple) and not self.header:
            raise RuntimeError('empty header sequence specified')
    
        # check output directory, if local
        if self.output_path.startswith('file:'):
            local_output_path = os.path.abspath(self.output_path.replace('file:', '', 1))
            if os.path.exists(local_output_path):
                # output data already exist
                if self.mode == 'ignore':
                    # do not execute link
                    self.log().debug('Output data already exist; not executing link')
                    self.do_execution = False
                    return StatusCode.Success
                elif self.mode == 'error':
                    # raise exception
                    raise RuntimeError('output data already exist')

                # mode is 'overwrite': remove the existing output directory
                if not os.path.isdir(local_output_path):
                    raise RuntimeError('output path "{}" is not a directory'.format(local_output_path))
                shutil.rmtree(local_output_path)
            elif not os.path.exists(os.path.dirname(local_output_path)):
                # create the parent directory of the output path
                self.log().debug('Creating output directory "%s"', os.path.dirname(local_output_path))
                os.makedirs(os.path.dirname(local_output_path))

        return StatusCode.Success
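
The argument checks above define the link's configuration surface. A hypothetical usage sketch follows; the keyword names mirror the checks in initialize(), while the import path is an assumption:

# hypothetical usage sketch: keyword names follow the checks in initialize();
# the import path is an assumption
from eskapade.spark_analysis import SparkDataToCsv

writer = SparkDataToCsv(name='write_csv',
                        read_key='spark_df',  # DataStore key of the Spark data frame
                        output_path=None,     # None: fall back to the default results path
                        mode='overwrite',     # one of 'overwrite', 'ignore', 'error'
                        sep=',',
                        header=True,          # boolean flag or a sequence of column names
                        num_files=1)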
Code example #3
    def test_esk305(self):
        """Test Esk-305: Execute correlation summary macro"""

        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk305_correlation_summary.py'
        settings['batchMode'] = True

        status = execution.run_eskapade(settings)
        self.assertTrue(status.isSuccess())

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # input data checks
        all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']

        self.assertIn('input_data', ds)
        self.assertIsInstance(ds['input_data'], pd.DataFrame)
        self.assertListEqual(list(ds['input_data'].columns), all_col_names)

        self.assertIn('correlations', ds)
        self.assertIsInstance(ds['correlations'], list)
        corr_list = ds['correlations']
        self.assertEqual(4, len(corr_list))

        # correlation matrix checks
        col_names = ['x1', 'x2', 'x3', 'x4', 'x5']

        for corr in corr_list:
            self.assertIsInstance(corr, pd.DataFrame)
            # self.assertListEqual(list(corr.columns), col_names)
            self.assertListEqual(list(corr.index), col_names)

        # heatmap pdf checks
        io_conf = settings.io_conf()
        results_path = persistence.io_path('results_data', io_conf, 'report')

        correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio']
        for corr in correlations:
            path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(
                results_path, corr)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertTrue(statinfo.st_size > 0)
Code example #4
# assumed imports for this macro snippet (Eskapade 0.x package layout)
import logging

from eskapade import ConfigObject, ProcessManager
from eskapade.core import persistence

log = logging.getLogger('macro.esk209_read_big_data_itr')

log.debug('Now parsing configuration file esk209_read_big_data_itr')

#########################################################################################
# --- minimal analysis information
settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk209_read_big_data_itr'
settings['version'] = 0

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunksize = 5

#########################################################################################
# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking.

if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain1:
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview)
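
The loop in MyChain1 consumes the input in fixed-size chunks of `chunksize` lines. For intuition, the same pattern in plain pandas looks as follows (illustration only; this is not the Eskapade read link):

# illustration only (plain pandas, not the Eskapade read link):
# iterate over an input file in chunks of at most `chunksize` rows
import pandas as pd

for chunk in pd.read_csv('dummy.csv', chunksize=chunksize):
    print(len(chunk))  # each chunk is a DataFrame with up to `chunksize` rows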
Code example #5
# assumed imports for this macro snippet (Eskapade 0.x package layout)
import logging

from eskapade import ConfigObject, ProcessManager
from eskapade.core import persistence

log = logging.getLogger('macro.esk304_df_boxplot')
#########################################################################################
# --- minimal analysis information

settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk304_df_boxplot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""

The plots and LaTeX files produced by the link df_summary can be found in the directory:
%s
""" % persistence.io_path('results_data', settings.io_conf(), 'report')
log.info(msg)

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str), var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create process manager
proc_mgr = ProcessManager()
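
GEN_CONF drives the data-generation step: var_a and var_c are drawn from categorical choices, and var_b from a normal distribution with the given mean and standard deviation. A sketch of equivalent data in plain numpy/pandas (illustration only; not the Eskapade generator link):

# illustration only: data shaped as GEN_CONF describes, in plain numpy/pandas
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'var_a': np.random.choice(GEN_CONF['var_a']['choice'], size=SIZE),
    'var_b': np.random.normal(GEN_CONF['var_b']['mean'], GEN_CONF['var_b']['std'], size=SIZE),
    'var_c': np.random.choice(GEN_CONF['var_c']['choice'], size=SIZE),
})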