def initialize(self):
    """Initialize CorrelationSummary"""

    # check input arguments
    self.check_arg_types(read_key=str, store_key=str, results_path=str, methods=list, pages_key=str)
    self.check_arg_vals('read_key')

    # get I/O configuration
    io_conf = ProcessManager().service(ConfigObject).io_conf()

    # read report templates
    with open(persistence.io_path('templates', io_conf, 'df_summary_report.tex')) as templ_file:
        self.report_template = templ_file.read()
    with open(persistence.io_path('templates', io_conf, 'df_summary_report_page.tex')) as templ_file:
        self.page_template = templ_file.read()

    # get path to results directory
    if not self.results_path:
        self.results_path = persistence.io_path('results_data', io_conf, 'report')

    # check if output directory exists
    if os.path.exists(self.results_path):
        # check if path is a directory
        if not os.path.isdir(self.results_path):
            self.log().critical('output path "%s" is not a directory', self.results_path)
            raise AssertionError('output path is not a directory')
    else:
        # create directory
        self.log().debug('Making output directory "%s"', self.results_path)
        os.makedirs(self.results_path)

    # check methods
    for method in self.methods:
        if method not in ALL_CORRS:
            logstring = '"{}" is not a valid correlation method, please use one of {}'
            logstring = logstring.format(method, ', '.join('"' + m + '"' for m in ALL_CORRS))
            raise AssertionError(logstring)

    # initialize attributes
    self.pages = []

    return StatusCode.Success
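# Usage sketch for the link initialized above (keyword names taken from the
# check_arg_types call; the surrounding chain setup is illustrative only):
#
#   link = CorrelationSummary(name='corr_summary', read_key='input_data',
#                             methods=['pearson', 'kendall', 'spearman'])
#
# Any entry in `methods` that is not in ALL_CORRS makes initialize() raise an
# AssertionError listing the valid options.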
def test_esk409(self):
    """Test Esk-409: Unredeemed vouchers."""
    # run Eskapade
    macro = resources.tutorial('esk409_unredeemed_vouchers.py')
    self.eskapade_run(macro)
    ds = process_manager.service(DataStore)

    # check generated data
    self.assertIn('voucher_redeems', ds)
    self.assertIn('voucher_ages', ds)
    self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
    self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
    self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
    self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
    self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

    # check fit result
    fit_link = process_manager.get('Fitting').get('Fit')
    self.assertEqual(fit_link.fit_result.status(), 0)
    n_ev_pull = (fit_link.results['n_ev'][0] - 6000.) / fit_link.results['n_ev'][1]
    self.assertGreater(n_ev_pull, -3.)
    self.assertLess(n_ev_pull, 3.)

    # check plot output
    plot_path = persistence.io_path('results_data', 'voucher_redeem.pdf')
    self.assertTrue(os.path.exists(plot_path))
    statinfo = os.stat(plot_path)
    self.assertGreater(statinfo.st_size, 0)
def initialize(self):
    """Initialize the link."""
    # perform basic checks of configured attributes:
    # a key and path OR a dictionary need to have been set
    if self.path and self.key:
        self.dictionary = {self.key: self.path}
    elif not self.dictionary:
        raise Exception('Path and key OR dictionary not properly set.')

    # correct the output paths, if need be
    paths = list(self.dictionary.values())
    assert '' not in paths, 'One or more of the paths in dict is empty.'
    assert all(isinstance(p, str) for p in paths), 'One or more of the paths in dict is not a string.'

    # update paths if needed
    for k, p in self.dictionary.items():
        if '/' not in p:
            self.dictionary[k] = persistence.io_path('results_data', p)
            self.logger.debug('Output filename for key <{key}> has been reset to {new_key}.',
                              key=k, new_key=self.dictionary[k])

    self.logger.info('kwargs passed on to pandas writer are: {kwargs}.', kwargs=self.kwargs)

    return StatusCode.Success
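# Configuration sketch for this writer link (the class name WriteFromDf is an
# assumption based on the pandas-writer log message): either a single key/path
# pair, or a dictionary mapping datastore keys to output files.
#
#   writer = WriteFromDf(name='writer', key='my_df', path='my_df.csv')
#   # ...or several data frames in one go:
#   writer = WriteFromDf(name='writer', dictionary={'df_a': 'a.csv', 'df_b': 'b.csv'})
#
# Bare file names (without a '/') are redirected to the results_data area by
# the loop above.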
def test_esk608(self):
    """Test Esk-608: Execute Spark histogram filling macro"""
    # check if required Python and Java libraries are made available to worker nodes
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')
    self.assertRegexpMatches(
        sc.getConf().get('spark.jars.packages', ''),
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail')
    if re.search('spark://', sc.getConf().get('spark.master', '')):
        py_mods = utils.get_file_path('py_mods')
        self.assertRegexpMatches(
            sc.getConf().get('spark.submit.pyFiles', ''), py_mods,
            'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode')
        self.assertRegexpMatches(
            sc.getConf().get('spark.files', ''), py_mods,
            'Eskapade modules missing from spark.files, needed in Spark cluster mode')

    # run Eskapade
    self.run_eskapade('esk608_spark_histogrammar.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    settings = proc_mgr.service(ConfigObject)

    # check data frame
    self.assertIn('spark_df', ds, 'no object with key "spark_df" in data store')
    self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame, '"spark_df" is not a Spark data frame')
    self.assertEqual(ds['spark_df'].count(), 12, 'unexpected number of rows in data frame')
    self.assertListEqual(sorted(ds['spark_df'].columns), sorted(['date', 'loc', 'x', 'y']),
                         'unexpected columns in data frame')

    # data-generation checks
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    col_names = ['date', 'x', 'y', 'loc', 'x:y']
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

    # data-summary checks
    f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in f_bases]
    for fname in file_names:
        path = persistence.io_path('results_data', settings.io_conf(), 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def run_eskapade(self, macro, return_status=definitions.StatusCode.Success):
    """Run Eskapade"""
    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['macro'] = persistence.io_path('macros', settings.io_conf(), macro)
    status = execution.run_eskapade(settings)
    self.assertEqual(status, return_status)
def initialize(self):
    """Initialize SparkDataToCsv"""

    # check input arguments
    self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
    self.check_arg_types(mode=str, sep=str, num_files=int)
    self.check_arg_types(recurse=True, allow_none=True)
    self.check_arg_vals('read_key', 'sep')
    self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
    self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
    if self.num_files < 1:
        raise RuntimeError('requested number of files is less than 1 ({:d})'.format(self.num_files))

    # set other attributes
    self.do_execution = True

    # set default output path
    if not self.output_path:
        settings = ProcessManager().service(ConfigObject)
        self.output_path = 'file:' + persistence.io_path('results_data', settings.io_conf(),
                                                         '{}_output'.format(self.name))

    # parse header argument
    try:
        self.header = tuple(self.header)
    except TypeError:
        self.header = bool(self.header)
    if isinstance(self.header, tuple) and not self.header:
        raise RuntimeError('empty header sequence specified')

    # check output directory, if local
    if self.output_path.startswith('file:'):
        local_output_path = os.path.abspath(self.output_path.replace('file:', ''))
        if os.path.exists(local_output_path):
            # output data already exist
            if self.mode == 'ignore':
                # do not execute link
                self.log().debug('Output data already exist; not executing link')
                self.do_execution = False
                return StatusCode.Success
            elif self.mode == 'error':
                # raise exception
                raise RuntimeError('output data already exist')

            # remove output directory
            if not os.path.isdir(local_output_path):
                raise RuntimeError('output path "{}" is not a directory'.format(local_output_path))
            shutil.rmtree(local_output_path)
        elif not os.path.exists(os.path.dirname(local_output_path)):
            # create path up to the last component
            self.log().debug('Creating output path "%s"', local_output_path)
            os.makedirs(os.path.dirname(local_output_path))

    return StatusCode.Success
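# How initialize() above interprets the header argument (values illustrative):
#
#   header=['a', 'b']  ->  tuple ('a', 'b'): write these names as the CSV header
#   header=True        ->  bool True: write a header
#   header=False       ->  bool False: write no header
#   header=()          ->  empty sequence: RuntimeError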
def _create_spark_conf(self, eskapade_settings=None, config_path=None, spark_settings=None):
    """Create and set Spark configuration

    Read the Spark configuration file and store the settings as a SparkConf
    object.  The path of the configuration file is given by the config_path
    argument or, if this argument is not specified, it is obtained from the
    Eskapade settings object (key "sparkCfgFile").  If neither of these
    inputs is provided, an empty configuration object is created.

    With the spark_settings argument, settings from the configuration file
    can be overwritten.  Additional settings can also be specified with this
    argument.

    :param eskapade.ConfigObject eskapade_settings: Eskapade configuration (key "sparkCfgFile" for config path)
    :param str config_path: path of configuration file
    :param iterable spark_settings: iterable of custom settings key-value pairs to be set
    """
    # set path of config file
    cfg_path = str(config_path) if config_path \
        else str(eskapade_settings.get('sparkCfgFile')) if eskapade_settings else None
    if cfg_path and eskapade_settings and not os.path.isabs(cfg_path):
        cfg_path = persistence.io_path('config_spark', eskapade_settings.io_conf(), cfg_path)
    if cfg_path and cfg_path != self.config_path:
        self.log().debug('Setting configuration file path to "{}"'.format(cfg_path))
        self.config_path = cfg_path
        self.reset_config()

    # create Spark config
    spark_conf = pyspark.conf.SparkConf()

    # set settings from config file
    if self.config_path:
        cfg = self.get_config()
        if CONF_PREFIX not in cfg:
            raise RuntimeError('No section "{}" found in config file'.format(CONF_PREFIX))
        spark_conf.setAll(cfg.items(CONF_PREFIX))

    # set custom settings
    if spark_settings:
        spark_conf.setAll(spark_settings)

    return spark_conf
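# Illustrative call (the manager instance `sm` and the file path are
# assumptions): settings from the file are applied first, then the
# spark_settings pairs override or extend them, so custom pairs win on
# conflicts.
#
#   conf = sm._create_spark_conf(config_path='/path/to/spark.cfg',
#                                spark_settings=[('spark.master', 'local[2]')])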
def test_esk411(self):
    """Test Esk-411: Predictive maintenance Weibull fit"""
    # run Eskapade
    self.run_eskapade('esk411_weibull_predictive_maintenance.py')
    ds = ProcessManager().service(DataStore)
    ws = ProcessManager().service(RooFitManager).ws

    # roofit objects check in datastore
    self.assertIn('fit_result', ds)
    self.assertIsInstance(ds['fit_result'], ROOT.RooFitResult)

    # roofit objects check in workspace
    self.assertIn('binnedData', ds)
    self.assertIsInstance(ds['binnedData'], ROOT.RooDataHist)
    mdata = ds['binnedData']
    self.assertTrue(mdata)
    self.assertEqual(300, mdata.numEntries())
    mpdf = ws.pdf('sum3pdf')
    self.assertTrue(mpdf)

    # successful fit result
    fit_result = ds['fit_result']
    self.assertEqual(0, fit_result.status())
    self.assertEqual(3, fit_result.covQual())
    n1 = ws.var('N1')
    self.assertTrue(n1)
    self.assertGreater(n1.getVal(), 2.e5)
    n2 = ws.var('N2')
    self.assertTrue(n2)
    self.assertGreater(n2.getVal(), 4.e5)
    n3 = ws.var('N3')
    self.assertTrue(n3)
    self.assertGreater(n3.getVal(), 5.e4)

    # data-summary checks
    io_conf = ProcessManager().service(ConfigObject).io_conf()
    file_names = ['weibull_fit_report.tex', 'correlation_matrix_fit_result.pdf',
                  'floating_pars_fit_result.tex', 'fit_of_time_difference_medium_range.pdf']
    for fname in file_names:
        path = persistence.io_path('results_data', io_conf, 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertGreater(statinfo.st_size, 0)
def initialize(self):
    """Initialize WsUtils"""

    # check input arguments
    self.check_arg_types(pages_key=str)
    if isinstance(self.copy_into_ws, str):
        self.copy_into_ws = [self.copy_into_ws]
    assert isinstance(self.copy_into_ws, list), 'copy_into_ws needs to be a string or list of strings.'
    if isinstance(self.copy_into_ds, str):
        self.copy_into_ds = [self.copy_into_ds]
    assert isinstance(self.copy_into_ds, list), 'copy_into_ds needs to be a string or list of strings.'

    # get I/O configuration
    io_conf = ProcessManager().service(ConfigObject).io_conf()

    # read report templates
    with open(persistence.io_path('templates', io_conf, 'df_summary_report.tex')) as templ_file:
        self.report_template = templ_file.read()
    with open(persistence.io_path('templates', io_conf, 'df_summary_report_page.tex')) as templ_file:
        self.page_template = templ_file.read()
    with open(persistence.io_path('templates', io_conf, 'df_summary_table_page.tex')) as templ_file:
        self.table_template = templ_file.read()

    # get path to results directory
    if not self.results_path:
        self.results_path = persistence.io_path('results_data', io_conf, 'report')

    # check if output directory exists
    if os.path.exists(self.results_path):
        # check if path is a directory
        if not os.path.isdir(self.results_path):
            self.log().critical('output path "%s" is not a directory', self.results_path)
            raise AssertionError('output path is not a directory')
    else:
        # create directory
        self.log().debug('Making output directory "%s"', self.results_path)
        os.makedirs(self.results_path)

    # make sure Eskapade RooFit library is loaded for fitting (for plotting correlation matrix)
    if self._fit:
        roofit_utils.load_libesroofit()

    return StatusCode.Success
def test_esk305(self):
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings['esRoot'] + '/tutorials/esk305_correlation_summary.py'
    settings['batchMode'] = True

    status = execution.run_eskapade(settings)
    self.assertTrue(status.isSuccess())

    ds = ProcessManager().service(DataStore)

    # input data checks
    all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']
    self.assertIn('input_data', ds)
    self.assertIsInstance(ds['input_data'], pd.DataFrame)
    self.assertListEqual(list(ds['input_data'].columns), all_col_names)

    self.assertIn('correlations', ds)
    self.assertIsInstance(ds['correlations'], list)
    corr_list = ds['correlations']
    self.assertEqual(4, len(corr_list))

    # correlation matrix checks
    col_names = ['x1', 'x2', 'x3', 'x4', 'x5']
    for corr in corr_list:
        self.assertIsInstance(corr, pd.DataFrame)
        # self.assertListEqual(list(corr.columns), col_names)
        self.assertListEqual(list(corr.index), col_names)

    # heatmap pdf checks
    io_conf = settings.io_conf()
    results_path = persistence.io_path('results_data', io_conf, 'report')
    correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio']
    for corr in correlations:
        path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(results_path, corr)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def test_esk401(self):
    """Test Esk-401: ROOT hist fill, plot, convert"""
    # run Eskapade
    self.run_eskapade('esk401_roothist_fill_plot_convert.py')
    ds = ProcessManager().service(DataStore)

    # histogram checks
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x1:x2', 'x2:x3', 'x4:x5']
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(columns))
    for col in columns:
        self.assertIsInstance(ds['hist'][col], ROOT.TH1)

    # data-generation checks
    self.assertIn('n_correlated_data', ds)
    self.assertEqual(500, ds['n_correlated_data'])
    self.assertIn('n_rdh_x1', ds)
    self.assertEqual(40, ds['n_rdh_x1'])
    self.assertIn('n_rds_x2_vs_x3', ds)
    self.assertEqual(23, ds['n_rds_x2_vs_x3'])

    # roofit objects check
    self.assertIn('hpdf', ds)
    self.assertIsInstance(ds['hpdf'], ROOT.RooHistPdf)
    self.assertIn('rdh_x1', ds)
    self.assertIsInstance(ds['rdh_x1'], ROOT.RooDataHist)
    self.assertIn('rds_x2_vs_x3', ds)
    self.assertIsInstance(ds['rds_x2_vs_x3'], ROOT.RooDataSet)
    self.assertIn('vars_x2_vs_x3', ds)
    self.assertIsInstance(ds['vars_x2_vs_x3'], ROOT.RooArgSet)

    # data-summary checks
    io_conf = ProcessManager().service(ConfigObject).io_conf()
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col.replace(':', '_vs_')) for col in columns]
    for fname in file_names:
        path = persistence.io_path('results_data', io_conf, 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def initialize(self):
    """Initialize WriteFromDf"""

    # perform basic checks of configured attributes:
    # a key and path OR a dictionary need to have been set
    # (note: testing len() on an unset dictionary would raise a TypeError,
    # so test the attributes directly)
    if self.path and self.key:
        self.dictionary = {self.key: self.path}
    elif not self.dictionary:
        raise Exception('Path and key OR dictionary not properly set.')

    # correct the output paths, if need be
    paths = list(self.dictionary.values())
    assert '' not in paths, 'One or more of the paths in dict is empty.'
    assert all(isinstance(p, str) for p in paths), 'One or more of the paths in dict is not a string.'

    # update paths if needed
    for k, p in self.dictionary.items():
        if '/' not in p:
            io_conf = ProcessManager().service(ConfigObject).io_conf()
            self.dictionary[k] = persistence.io_path('results_data', io_conf, p)
            self.log().debug('Output filename for key <%s> has been reset to: %s', k, self.dictionary[k])

    self.log().info('kwargs passed on to pandas writer are: %s', self.kwargs)

    return StatusCode.Success
def test_esk305(self):
    settings = process_manager.service(ConfigObject)
    settings['batchMode'] = True

    self.eskapade_run(resources.tutorial('esk305_correlation_summary.py'))

    ds = process_manager.service(DataStore)

    # input data checks
    all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']
    self.assertIn('input_data', ds)
    self.assertIsInstance(ds['input_data'], pd.DataFrame)
    self.assertListEqual(list(ds['input_data'].columns), all_col_names)

    self.assertIn('correlations', ds)
    self.assertIsInstance(ds['correlations'], list)
    corr_list = ds['correlations']
    self.assertEqual(4, len(corr_list))

    # correlation matrix checks
    col_names = ['x1', 'x2', 'x3', 'x4', 'x5']
    for corr in corr_list:
        self.assertIsInstance(corr, pd.DataFrame)
        # self.assertListEqual(list(corr.columns), col_names)
        self.assertListEqual(list(corr.index), col_names)

    # heatmap pdf checks
    results_path = persistence.io_path('results_data', 'report')
    correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio']
    for corr in correlations:
        path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(results_path, corr)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def initialize(self):
    """Initialize CorrelationSummary"""

    # check input arguments before using them
    self.check_arg_types(read_key=str, method=str)
    self.check_arg_vals('read_key')

    # get I/O configuration
    io_conf = ProcessManager().service(ConfigObject).io_conf()

    # get path to results directory
    if not self.results_path:
        self.results_path = persistence.io_path('results_data', io_conf, 'report')

    # check if output directory exists
    if os.path.exists(self.results_path):
        # check if path is a directory
        if not os.path.isdir(self.results_path):
            self.log().critical('output path "%s" is not a directory', self.results_path)
            raise AssertionError('output path is not a directory')
    else:
        # create directory
        self.log().debug('Making output directory "%s"', self.results_path)
        os.makedirs(self.results_path)

    # check method; fall back to Pearson for unknown methods
    if self.method not in ALL_CORRS:
        logstring = '"{}" is not a valid correlation method, please use one of {}; using "pearson"'
        logstring = logstring.format(self.method, ', '.join('"' + m + '"' for m in ALL_CORRS))
        self.log().error(logstring)
        self.method = 'pearson'

    return StatusCode.Success
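# Behaviour sketch: unlike the list-based variant of this link, an unknown
# method here is not fatal; it logs an error and falls back to Pearson
# (the method name below is deliberately invalid):
#
#   link = CorrelationSummary(name='corr', read_key='input_data', method='cosine')
#   link.initialize()   # logs an error; link.method is now 'pearson'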
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk411_weibull_predictive_maintenance'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""
The plots and latex report produced by link WsUtils can be found in dir:
{path}
"""
logger.info(msg, path=persistence.io_path('results_data', 'report'))

settings['generate'] = True
# settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

fitpdf = 'sum3pdf'
n_percentile_bins = 300

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['model']:
def test_esk106_script(self, mock_argv):
    """Test Eskapade run with esk106 macro from script"""
    proc_mgr = ProcessManager()

    # get file paths
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = 'esk106_cmdline_options'
    settings_ = settings.copy()
    script_path = eskapade.utils.get_file_path('run_eskapade')
    macro_path = persistence.io_path('macros', settings.io_conf(), 'esk106_cmdline_options.py')

    # import run-script module
    orig_mod_path = sys.path.copy()
    sys.path.append(os.path.dirname(script_path))
    script_mod = os.path.splitext(os.path.basename(script_path))[0]
    run_eskapade = importlib.import_module(script_mod)

    # mock command-line arguments
    args = []
    mock_argv.__getitem__ = lambda s, k: args.__getitem__(k)

    # base settings
    args_ = [script_path, macro_path, '-LDEBUG', '--batch-mode']
    settings_['macro'] = macro_path
    settings_['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings_['batchMode'] = True

    def do_run(name, args, args_, settings_, add_args, add_settings, chains):
        # set arguments
        args.clear()
        args += args_ + add_args
        settings = settings_.copy()
        settings.update(add_settings)

        # run Eskapade
        proc_mgr.reset()
        run_eskapade.main()
        settings_run = proc_mgr.service(ConfigObject)

        # check results
        self.assertListEqual([c.name for c in proc_mgr.chains], chains,
                             'unexpected chain names in "{}" test'.format(name))
        self.assertDictEqual(settings_run, settings, 'unexpected settings in "{}" test'.format(name))

    # run both chains
    do_run('both chains', args, args_, settings_,
           ['--store-all', '-cdo_chain0=True', '-cdo_chain1=True'],
           dict(storeResultsEachChain=True, do_chain0=True, do_chain1=True),
           ['Chain0', 'Chain1'])

    # run only last chain by skipping the first
    do_run('skip first', args, args_, settings_,
           ['-bChain1', '-cdo_chain0=True', '-cdo_chain1=True'],
           dict(beginWithChain='Chain1', do_chain0=True, do_chain1=True),
           ['Chain0', 'Chain1'])

    # run only last chain by not defining the first
    do_run('no first', args, args_, settings_,
           ['-cdo_chain0=False', '-cdo_chain1=True'],
           dict(do_chain0=False, do_chain1=True),
           ['Chain1'])

    # restore module search path
    sys.path.clear()
    sys.path += orig_mod_path
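# For reference, the command-line options exercised above map onto settings as
# follows (derived from the do_run calls in this test, not from new API):
#
#   -LDEBUG            ->  settings['logLevel'] = DEBUG
#   --batch-mode       ->  settings['batchMode'] = True
#   --store-all        ->  settings['storeResultsEachChain'] = True
#   -bChain1           ->  settings['beginWithChain'] = 'Chain1'
#   -cdo_chain0=True   ->  settings['do_chain0'] = True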
log.debug('Now parsing configuration file esk305_correlation_summary')

#########################################################################################
# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk305_correlation_summary'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['input_path'] = persistence.io_path('data', settings.io_conf(), 'correlated_data.sv.gz')
settings['reader'] = 'csv'
settings['separator'] = ' '
settings['correlations'] = ['pearson', 'kendall', 'spearman', 'correlation_ratio']

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# load data
reader = analysis.ReadToDf(name='reader',
#########################################################################################
# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'Tutorial_5'

#########################################################################################
# --- setup Spark

proc_mgr.service(SparkManager).get_or_create_session()

#########################################################################################
# --- analysis values, settings, helper functions, configuration flags.

DATA_FILE_PATH = persistence.io_path('data', settings.io_conf(), 'LAozone.data')
VAR_LABELS = dict(doy='Day of year', date='Date', vis='Visibility', vis_km='Visibility')
VAR_UNITS = dict(vis='mi', vis_km='km')


def comp_date(day):
    """Get date/time from day of year"""
    import pandas as pd
    return pd.Timestamp('1976-01-01') + pd.Timedelta('{:d}D'.format(day - 1))


def mi_to_km(dist):
else:
    logger.error('unsupported stream_type specified: {type}.', type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = spark_analysis.SparkStreamingWordCount(
    name='SparkStreamingWordCount', read_key='dstream', store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = spark_analysis.SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)
spark_streaming.add(writer_link)

# start/stop of Spark Streaming
control_link = spark_analysis.SparkStreamingController(name='SparkStreamingController', timeout=10)
spark_streaming.add(control_link)

##########################################################################

logger.debug('Done parsing configuration file esk610_spark_streaming.')
def initialize(self):
    """Initialize UncorrelationHypothesisTester"""

    # check input arguments
    self.check_arg_types(read_key=str, significance_key=str, sk_significance_map=str, sk_residuals_map=str,
                         sk_residuals_overview=str, default_number_of_bins=int, nsims_per_significance=int,
                         prefix=str, z_threshold=float, pages_key=str, clientpages_key=str, hist_dict_key=str)
    self.check_arg_types(recurse=True, allow_none=True, columns=str)
    self.check_arg_types(recurse=True, allow_none=True, x_columns=str)
    self.check_arg_types(recurse=True, allow_none=True, y_columns=str)
    self.check_arg_types(recurse=True, allow_none=True, ignore_categories=str)
    self.check_arg_types(recurse=True, allow_none=True, var_ignore_categories=str)
    self.check_arg_vals('read_key')
    self.check_arg_vals('significance_key')

    if self.map_to_original and not isinstance(self.map_to_original, str) \
            and not isinstance(self.map_to_original, dict):
        raise TypeError('map_to_original needs to be a dict or string (to fetch a dict from the datastore)')

    # get I/O configuration
    io_conf = ProcessManager().service(ConfigObject).io_conf()

    # read report templates
    with open(persistence.io_path('templates', io_conf, 'df_summary_report.tex')) as templ_file:
        self.report_template = templ_file.read()
    with open(persistence.io_path('templates', io_conf, 'df_summary_report_page.tex')) as templ_file:
        self.page_template = templ_file.read()
    with open(persistence.io_path('templates', io_conf, 'df_summary_table_page.tex')) as templ_file:
        self.table_template = templ_file.read()

    # get path to results directory
    if not self.results_path:
        self.results_path = persistence.io_path('results_data', io_conf, 'report')
    if self.results_path and not self.results_path.endswith('/'):
        self.results_path = self.results_path + '/'

    # check if output directory exists
    if os.path.exists(self.results_path):
        # check if path is a directory
        if not os.path.isdir(self.results_path):
            self.log().critical('output path "%s" is not a directory', self.results_path)
            raise AssertionError('output path is not a directory')
    else:
        # create directory
        self.log().debug('Making output directory "%s"', self.results_path)
        os.makedirs(self.results_path)

    # prefix for file storage
    if self.prefix and not self.prefix.endswith('_'):
        self.prefix = self.prefix + '_'

    # check provided columns
    if len(self.columns):
        assert len(self.x_columns) == 0 and len(self.y_columns) == 0, \
            'Set either columns OR x_columns and y_columns.'
    if len(self.x_columns):
        assert len(self.columns) == 0 and len(self.y_columns) > 0, \
            'Set either columns OR x_columns and y_columns.'
    self._all_columns = []

    # check that var_ignore_categories are set correctly
    for col, ic in self.var_ignore_categories.items():
        if isinstance(ic, str):
            self.var_ignore_categories[col] = [ic]
        elif not isinstance(ic, list):
            raise TypeError('var_ignore_categories key "%s" needs to be a string or list of strings' % col)

    # load roofit classes
    roofit_utils.load_libesroofit()

    return StatusCode.Success
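# Column-selection sketch following the checks above (column names are
# illustrative): configure either `columns` (test all pairs among them) OR
# `x_columns` together with `y_columns` (test x-vs-y combinations), never
# both styles at once.
#
#   tester = UncorrelationHypothesisTester(read_key='df', significance_key='sig',
#                                          x_columns=['a', 'b'], y_columns=['c'])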
log.debug('Now parsing configuration file esk209_read_big_data_itr')

#########################################################################################
# --- minimal analysis information

settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk209_read_big_data_itr'
settings['version'] = 0

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration
chunksize = 5

#########################################################################################
# --- Set path of data

data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking
if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain:
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview)
def _process_results_path(self):
    """Process results_path argument."""
    if not self.results_path:
        self.results_path = persistence.io_path('results_data', 'report')
    persistence.create_dir(self.results_path)
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk604_spark_execute_query'
settings['version'] = 0

##########################################################################
# Start Spark session

spark = proc_mgr.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = ['file:' + persistence.io_path('data', settings.io_conf(), 'dummy1.csv'),
              'file:' + persistence.io_path('data', settings.io_conf(), 'dummy2.csv')]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

proc_mgr.add_chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = spark_analysis.SparkDfReader(name='Reader' + str(index + 1),
                                             store_key=key,
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk411_weibull_predictive_maintenance'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""
The plots and latex report produced by link WsUtils can be found in dir:
%s
""" % (persistence.io_path('results_data', settings.io_conf(), 'report'))
log.info(msg)

settings['generate'] = True
# settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

fitpdf = 'sum3pdf'
n_percentile_bins = 300

#########################################################################################
# --- now set up the chains and links based on configuration flags