def test_esk409(self):
    """Test Esk-409: Unredeemed vouchers"""

    # run Eskapade
    self.run_eskapade('esk409_unredeemed_vouchers.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check generated data
    self.assertIn('voucher_redeems', ds)
    self.assertIn('voucher_ages', ds)
    self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
    self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
    self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
    self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
    self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

    # check fit result
    fit_link = proc_mgr.get_chain('Fitting').get_link('Fit')
    self.assertEqual(fit_link.fit_result.status(), 0)
    n_ev_pull = (fit_link.results['n_ev'][0] - 6000.) / fit_link.results['n_ev'][1]
    self.assertGreater(n_ev_pull, -3.)
    self.assertLess(n_ev_pull, 3.)

    # check plot output
    plot_path = persistence.io_path(
        'results_data', proc_mgr.service(ConfigObject).io_conf(), 'voucher_redeem.pdf')
    self.assertTrue(os.path.exists(plot_path))
    statinfo = os.stat(plot_path)
    self.assertGreater(statinfo.st_size, 0)
def execute(self):
    """Execute TruncExpGen"""

    # get process manager and services
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    rfm = proc_mgr.service(RooFitManager)

    # get PDF from RooFitManager
    model = rfm.model(self.model_name)

    # check if dataset with upper bounds exists in data store
    if self.max_var_data_key not in ds:
        self.log().warning(
            'No range upper-bound data in data store; generating %d dummy bounds',
            NUM_DUMMY_EVENTS)
        ds[self.max_var_data_key] = gen_max_var_data(model)

    # get max-var data
    max_var_data = ds.get(self.max_var_data_key)
    if not isinstance(max_var_data, ROOT.RooAbsData):
        raise TypeError('data with key "{}" are not RooFit data'.format(
            self.max_var_data_key))

    # select max-var data
    mv_sel_data = sel_max_var_data(model, max_var_data, self.event_frac)

    # generate data
    proto_arg = RooFit.ProtoData(mv_sel_data, False, False)
    data = model.pdf.generate(model.var_set, proto_arg, *self._gen_cmd_args.values())
    ds[self.store_key] = data

    return StatusCode.Success
def test_execute_all_status_return(self, mock_execute, mock_import, mock_persist):
    from eskapade import StatusCode, ProcessManager

    pm = ProcessManager()
    pm.service(ConfigObject)['analysisName'] = 'test_execute_all_status_return'
    c1 = Chain('1')
    c2 = Chain('2')
    c3 = Chain('fail')
    c4 = Chain('4')
    pm.chains = [c1, c2, c3, c4]

    # a failing chain stops execution: the chain after it must not be executed
    status = pm.execute_all()
    self.assertEqual(status, StatusCode.Failure)
    executed_chains = [arg[0][0] for arg in mock_execute.call_args_list]
    self.assertNotIn(c4, executed_chains)

    pm.reset()
    pm.service(ConfigObject)['analysisName'] = 'test_execute_all_status_return'
    mock_execute.reset_mock()
    c1 = Chain('1')
    c2 = Chain('2')
    c3 = Chain('skip')
    c4 = Chain('4')
    pm.chains = [c1, c2, c3, c4]

    # a skipped chain does not stop execution: the chain after it must be executed
    status = pm.execute_all()
    self.assertEqual(status, StatusCode.Success)
    executed_chains = [arg[0][0] for arg in mock_execute.call_args_list]
    self.assertIn(c4, executed_chains)
def execute(self):
    """Execute ConvertRooDataSet2RooDataHist"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)
    ws = proc_mgr.service(RooFitManager).ws

    # basic checks on contents of the data set
    if self.from_ws:
        rds = ws.data(self.read_key)
        if rds is None:
            raise RuntimeError('no data with key "{}" in workspace'.format(self.read_key))
    else:
        if self.read_key not in ds:
            raise KeyError('key "{}" not found in datastore'.format(self.read_key))
        rds = ds[self.read_key]
    if not isinstance(rds, ROOT.RooDataSet):
        raise TypeError('retrieved object "{0:s}" not of type RooDataSet (got "{1:s}")'.format(
            self.read_key, str(type(rds))))
    if rds.numEntries() == 0:
        raise AssertionError('RooDataSet "{}" is empty'.format(self.read_key))

    # check presence of all columns
    for col in self.columns:
        if not ws.var(col):
            raise RuntimeError('variable "{}" not found in workspace'.format(col))

    # create a temporary observables set of the columns
    temp_obs = uuid.uuid4().hex
    obs = ','.join(self.columns)
    failure = ws.defineSet(temp_obs, obs)
    if not failure:
        theobs = ws.set(temp_obs)
    else:
        raise RuntimeError('unable to retrieve (or create) observables with name "{}"'.format(obs))

    # do conversion from RooDataSet to RooDataHist
    self.log().debug('Converting roodataset "%s" into roodatahist "%s"',
                     self.read_key, self.store_key)
    rdh = data_conversion.rds_to_rdh(rds, rf_varset=theobs, binning_name=self.binning_name)

    # remove original rds?
    if self.rm_original:
        if self.from_ws:
            # FIXME can datasets be deleted from an rws? dont know how
            pass
        else:
            del ds[self.read_key]

    # put object into the datastore
    ds[self.store_key] = rdh
    n_rdh = rdh.numEntries()
    ds['n_' + self.store_key] = n_rdh
    self.log().debug('Stored roodatahist "%s" with number of bins: %d', self.store_key, n_rdh)

    # cleanup of temporary observables set
    ws.removeSet(temp_obs)

    return StatusCode.Success
def test_esk603(self):
    """Test Esk-603: Write Spark data to CSV"""

    # check if running in local mode
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')

    # run Eskapade
    self.run_eskapade('esk603_write_spark_data_to_csv.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # read output data
    results_data_path = persistence.io_dir(
        'results_data', proc_mgr.service(ConfigObject).io_conf())
    names = []
    headers = []
    contents = []
    csv_dirs = glob.glob('{}/*'.format(results_data_path))
    self.assertEqual(len(csv_dirs), 3, 'expected to find three CSV output directories')
    for csv_dir in csv_dirs:
        names.append(os.path.basename(csv_dir))
        csv_files = glob.glob('{}/part*'.format(csv_dir))
        self.assertEqual(
            len(csv_files), 1,
            'expected to find only one CSV file in "{}"'.format(names[-1]))
        with open(csv_files[0]) as csv:
            contents.append([l.strip().split(',') for l in csv])
        self.assertEqual(
            len(contents[-1]), 101,
            'unexpected number of lines in "{}" CSV'.format(names[-1]))
        headers.append(contents[-1][0])
        contents[-1] = sorted(contents[-1][1:])

    # check output data
    self.assertListEqual(headers[0], ['index', 'foo', 'bar'],
                         'unexpected CSV header for "{}"'.format(names[0]))
    self.assertListEqual(
        contents[0],
        sorted([str(it), 'foo{:d}'.format(it), str((it + 1) / 2.)] for it in range(100)),
        'unexpected CSV content for "{}"'.format(names[0]))
    for name, head, cont in zip(names[1:], headers[1:], contents[1:]):
        self.assertListEqual(
            head, headers[0],
            'CSV header of "{0:s}" differs from header of "{1:s}"'.format(name, names[0]))
        self.assertListEqual(
            cont, contents[0],
            'CSV content of "{0:s}" differs from content of "{1:s}"'.format(name, names[0]))
def execute(self):
    """Execute LINKTEMPLATE"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)

    # --- your algorithm code goes here
    self.log().debug('Now executing link: %s', self.name)

    return StatusCode.Success
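# Illustrative sketch (not part of the template): a hedged example of what the
# "your algorithm code goes here" section of a concrete link might look like,
# following the read_key/store_key pattern used by the other links in this repo.
# The link name ExampleTransformLink and the attributes self.read_key and
# self.store_key are assumed to be defined in the link's __init__.
def execute(self):
    """Execute ExampleTransformLink (hypothetical link derived from LINKTEMPLATE)"""

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # fetch input data from the data store and check its presence
    if self.read_key not in ds:
        raise KeyError('key "{}" not found in datastore'.format(self.read_key))
    data = ds[self.read_key]

    # --- algorithm code: here simply count the records and store the result
    ds[self.store_key] = len(data)
    self.log().debug('Stored record count under key "%s"', self.store_key)

    return StatusCode.Success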
def setUp(self):
    """Set up test environment"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = 'DataConversionTest'

    # ensure local testing
    spark_settings = [('spark.app.name', settings['analysisName']),
                      ('spark.master', 'local[*]'),
                      ('spark.driver.host', 'localhost')]
    proc_mgr.service(SparkManager).create_session(
        eskapade_settings=settings, spark_settings=spark_settings)
def execute(self):
    """Execute SparkStreamingWriter"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)

    data = ds[self.read_key]
    if self.repartition:
        data = data.repartition(self.repartition)
    data.saveAsTextFiles(self.output_path, suffix=self.suffix)

    return StatusCode.Success
def execute(self):
    """Execute IPythonEmbed"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)

    self.log().info("Starting interactive session ... press Ctrl+d to exit.\n")

    # this function calls the interactive ipython session
    # in this session ds, settings, and proc_mgr are available
    embed()

    return StatusCode.Success
def execute(self):
    """Execute SparkDfReader"""

    # create data-frame reader
    proc_mgr = ProcessManager()
    spark = proc_mgr.service(SparkManager).get_session()
    data = spark.read

    # call data-frame reader methods
    data = apply_transform_funcs(data, self._read_methods)

    # store data in data store
    proc_mgr.service(DataStore)[self.store_key] = data

    return StatusCode.Success
def test_esk602(self):
    """Test Esk-602: Read CSV files into a Spark data frame"""

    # check if running in local mode
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')

    # run Eskapade
    self.run_eskapade('esk602_read_csv_to_spark_df.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check data frame
    self.assertIn('spark_df', ds, 'no object with key "spark_df" in data store')
    self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                          '"spark_df" is not a Spark data frame')
    self.assertEqual(ds['spark_df'].rdd.getNumPartitions(), 5,
                     'unexpected number of partitions in data frame')
    self.assertEqual(ds['spark_df'].count(), 12,
                     'unexpected number of rows in data frame')
    self.assertListEqual(ds['spark_df'].columns, ['date', 'loc', 'x', 'y'],
                         'unexpected columns in data frame')
    self.assertSetEqual(
        set((r['date'], r['loc']) for r in ds['spark_df'].collect()),
        set([(20090101, 'a'), (20090102, 'b'), (20090103, 'c'),
             (20090104, 'd'), (20090104, 'e'), (20090106, 'a'),
             (20090107, 'b'), (20090107, 'c'), (20090107, 'd'),
             (20090108, 'e'), (20090109, 'e'), (20090109, 'f')]),
        'unexpected values in date/loc columns')
def execute(self):
    """Execute SparkDfWriter"""

    # get process manager and data store
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check if data frame exists in data store
    if self.read_key not in ds:
        err_msg = 'no input data found in data store with key "{}"'.format(self.read_key)
        if not self.fail_missing_data:
            self.log().error(err_msg.capitalize())
            return StatusCode.Success
        raise KeyError(err_msg)

    # fetch data from data store
    data = ds[self.read_key]
    if not isinstance(data, pyspark.sql.DataFrame):
        spark = proc_mgr.service(SparkManager).get_session()
        self.log().debug('Converting data of type "%s" to a Spark data frame', type(data))
        data = data_conversion.create_spark_df(spark, data, schema=self.schema)

    # create data-frame writer with requested number of partitions/output files
    df_writer = data.repartition(self.num_files).write

    # call data-frame writer methods
    df_writer = apply_transform_funcs(df_writer, self._write_methods)

    return StatusCode.Success
def test_esk605(self):
    """Test Esk-605: Create Spark data frame"""

    # run Eskapade
    self.run_eskapade('esk605_create_spark_df.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check created data frames
    cols = (StructField('index', LongType()), StructField('foo', StringType()),
            StructField('bar', DoubleType()))
    rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(20, 100)]
    for key in ('rows_df', 'rdd_df', 'df_df', 'pd_df'):
        self.assertIn(key, ds, 'no object with key {} in data store'.format(key))
        df = ds[key]
        self.assertIsInstance(
            df, pyspark.sql.DataFrame,
            'object with key {0:s} is not a data frame (type {1:s})'.format(key, str(type(df))))
        self.assertTupleEqual(tuple(df.schema), cols,
                              'unexpected data-frame schema for {}'.format(key))
        self.assertListEqual(sorted(tuple(r) for r in df.collect()), rows,
                             'unexpected data-frame content for {}'.format(key))
        self.assertTrue(df.is_cached, 'data frame {} is not cached'.format(key))
        self.assertLessEqual(df.rdd.getNumPartitions(), 2,
                             'unexpected number of data-frame partitions for {}'.format(key))
def test_esk604(self):
    """Test Esk-604: Execute Spark-SQL query"""

    # check if running in local mode
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')

    # run Eskapade
    self.run_eskapade('esk604_spark_execute_query.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check data frame
    self.assertIn('spark_df_sql', ds, 'no object with key "spark_df_sql" in data store')
    self.assertIsInstance(ds['spark_df_sql'], pyspark.sql.DataFrame,
                          '"spark_df_sql" is not a Spark data frame')
    self.assertEqual(ds['spark_df_sql'].count(), 4,
                     'unexpected number of rows in filtered data frame')
    self.assertListEqual(ds['spark_df_sql'].columns, ['loc', 'sumx', 'sumy'],
                         'unexpected columns in data frame')
    self.assertEqual(
        ds['spark_df_sql'].schema,
        proc_mgr.get_chain('ApplySQL').get_link('SparkSQL').schema,
        'schema of data frame does not correspond to schema stored in link')
    self.assertSetEqual(
        set(tuple(r) for r in ds['spark_df_sql'].collect()),
        set([('e', 10, 15), ('d', 2, 11), ('b', 6, 16), ('a', 2, 18)]),
        'unexpected values in loc/sumx/sumy columns')
def setUp(self):
    """Set up test"""

    TutorialMacrosTest.setUp(self)
    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['macrosDir'] = '{0:s}/{1:s}'.format(utils.get_env_var('es_root'), 'tutorials')
    settings['analysisName'] = 'SparkAnalysisTutorialMacrosTest'

    # ensure local testing
    spark_settings = [('spark.app.name', settings['analysisName']),
                      ('spark.master', 'local[*]'),
                      ('spark.driver.host', 'localhost')]
    proc_mgr.service(SparkManager).create_session(
        eskapade_settings=settings, spark_settings=spark_settings)
def test_spark_setup(self):
    """Test if Spark setup is working properly"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = 'spark_setup'

    sm = proc_mgr.service(SparkManager)
    spark = sm.create_session(eskapade_settings=settings)
    df = spark.createDataFrame([(0, 'foo'), (1, 'bar')], ['id', 'value'])
    self.assertSetEqual(set(tuple(r) for r in df.collect()),
                        set([(0, 'foo'), (1, 'bar')]),
                        'unexpected values in columns')
    sm.finish()
def test_esk607(self):
    """Test Esk-607: Add column to Spark dataframe"""

    # check if running in local mode
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')

    # run Eskapade
    self.run_eskapade('esk607_spark_with_column.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check data frame
    self.assertIn('new_spark_df', ds, 'no object with key "new_spark_df" in data store')
    self.assertIsInstance(ds['new_spark_df'], pyspark.sql.DataFrame,
                          '"new_spark_df" is not a Spark data frame')
    self.assertEqual(ds['new_spark_df'].count(), 5,
                     'unexpected number of rows in filtered data frame')
    self.assertListEqual(
        ds['new_spark_df'].columns,
        ['dummy', 'date', 'loc', 'x', 'y', 'pow_xy1', 'pow_xy2'],
        'unexpected columns in data frame')
    self.assertSetEqual(
        set(tuple(r) for r in ds['new_spark_df'].collect()),
        set([('bla', 20090103, 'c', 5, 7, 78125.0, 78125.0),
             ('bal', 20090102, 'b', 3, 8, 6561.0, 6561.0),
             ('flo', 20090104, 'e', 3, 5, 243.0, 243.0),
             ('bar', 20090101, 'a', 1, 9, 1.0, 1.0),
             ('foo', 20090104, 'd', 1, 6, 1.0, 1.0)]),
        'unexpected values in columns')
def test_esk609(self):
    """Test Esk-609: Map data-frame groups"""

    # run Eskapade
    self.run_eskapade('esk609_map_df_groups.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check input data
    for key in ('map_rdd', 'flat_map_rdd'):
        self.assertIn(key, ds, 'no data found with key "{}"'.format(key))
        self.assertIsInstance(
            ds[key], pyspark.RDD,
            'object "{0:s}" is not an RDD (type "{1:s}")'.format(key, str(type(ds[key]))))

    # sums of "bar" variable
    bar_sums = [(0, 27.5), (1, 77.5), (2, 127.5), (3, 177.5), (4, 227.5),
                (5, 277.5), (6, 327.5), (7, 377.5), (8, 427.5), (9, 477.5)]
    flmap_rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2., bar_sums[it // 10][1])
                  for it in range(100)]

    # check mapped data frames
    self.assertListEqual(sorted(ds['map_rdd'].collect()), bar_sums,
                         'unexpected values in "map_rdd"')
    self.assertListEqual(sorted(ds['flat_map_rdd'].collect()), flmap_rows,
                         'unexpected values in "flat_map_rdd"')
def checkCollectionSet(self):
    """Check existence of collections in either Mongo or the datastore

    Collections need to be both present and not empty.

    - For Mongo collections a dedicated filter can be applied before doing the count.
    - For pandas dataframes the additional option 'skip_chain_when_key_not_in_ds' exists,
      meaning: also skip the chain if the dataframe is not present in the datastore.
    """

    proc_mgr = ProcessManager()

    # check if collection names are present in datastore
    ds = proc_mgr.service(DataStore)
    for k in self.collectionSet:
        if k not in list(ds.keys()):
            if self.skip_chain_when_key_not_in_ds:
                self.log().warning('Key <%s> not in DataStore. Sending skip chain signal.', k)
                return StatusCode.SkipChain
            else:
                raise Exception('Key <%s> not in DataStore.' % k)
        df = ds[k]
        if len(df.index) == 0:
            self.log().warning(
                'pandas.DataFrame with datastore key <%s> is empty. Sending skip chain signal.', k)
            return StatusCode.SkipChain

    return StatusCode.Success
def main():
    """Run Eskapade

    Top-level control function for an Eskapade run started from the command line.
    Arguments specified by the user are parsed and converted to settings in the
    configuration object.  Optionally, an interactive IPython session is started
    when the run is finished.
    """

    # create parser for command-line arguments
    parser = create_arg_parser()
    user_args = parser.parse_args()

    # create config object for settings
    if not user_args.unpickle_config:
        # create new config
        settings = ConfigObject()
    else:
        # read previously persisted settings if pickled file is specified
        conf_path = user_args.config_files.pop(0)
        settings = ConfigObject.import_from_file(conf_path)
    del user_args.unpickle_config

    # set configuration macros
    settings.add_macros(user_args.config_files)

    # set user options
    settings.set_user_opts(user_args)

    # run Eskapade
    core.execution.run_eskapade(settings)

    # start interpreter if requested (--interactive on command line)
    if settings.get('interactive'):
        # create process manager, config object, and data store
        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        # set Pandas display options
        pd.set_option('display.width', 120)
        pd.set_option('display.max_columns', 50)

        # start interactive session
        log = logging.getLogger(__name__)
        log.info("Continuing interactive session ... press Ctrl+d to exit.\n")
        IPython.embed()
def execute(self):
    """Execute SparkStreamingWordCount"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)

    lines = ds[self.read_key]
    counts = lines.flatMap(lambda line: line.split(" "))\
                  .map(lambda word: (word, 1))\
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    if self.store_key is not None:
        ds[self.store_key] = counts

    return StatusCode.Success
def process_and_store(self):
    """Store (and possibly process) histogram objects"""

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    if self.store_key is not None:
        ds[self.store_key] = self._hists
def test_esk608(self):
    """Test Esk-608: Execute Spark histogram filling macro"""

    # check if required Python and Java libraries are made available to worker nodes
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')
    self.assertRegexpMatches(
        sc.getConf().get('spark.jars.packages', ''),
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, '
        'test_esk608 will fail')
    if re.search('spark://', sc.getConf().get('spark.master', '')):
        py_mods = utils.get_file_path('py_mods')
        self.assertRegexpMatches(
            sc.getConf().get('spark.submit.pyFiles', ''), py_mods,
            'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode')
        self.assertRegexpMatches(
            sc.getConf().get('spark.files', ''), py_mods,
            'Eskapade modules missing from spark.files, needed in Spark cluster mode')

    # run Eskapade
    self.run_eskapade('esk608_spark_histogrammar.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    settings = proc_mgr.service(ConfigObject)

    # check data frame
    self.assertIn('spark_df', ds, 'no object with key "spark_df" in data store')
    self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                          '"spark_df" is not a Spark data frame')
    self.assertEqual(ds['spark_df'].count(), 12, 'unexpected number of rows in data frame')
    self.assertListEqual(sorted(ds['spark_df'].columns), sorted(['date', 'loc', 'x', 'y']),
                         'unexpected columns in data frame')

    # data-generation checks
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    col_names = ['date', 'x', 'y', 'loc', 'x:y']
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

    # data-summary checks
    f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in f_bases]
    for fname in file_names:
        path = persistence.io_path('results_data', settings.io_conf(), 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def process_and_store(self):
    """Make, clean, and store ValueCount objects"""

    # nothing to do?
    if self.store_key_hists is None and self.store_key_counts is None:
        return

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # 1. construct value counts
    for col in self.columns:
        name = ':'.join(col)
        vc = ValueCounts(col, col, self._counts[name])
        # remove all items from Counters where the key is not of correct datatype.
        # e.g. in Counter dict of ints, remove any non-ints that may arise
        # from dq issues.
        if self.drop_inconsistent_key_types:
            vc = self.drop_inconsistent_keys(col, vc)
        self._valcnts[name] = vc

    if self.store_key_counts is not None:
        ds[self.store_key_counts] = self._valcnts

    # 2. construct hists from value counts
    if self.store_key_hists is None:
        return
    for col in self.columns:
        if len(col) != 1:
            continue
        name = ':'.join(col)
        dt = np.dtype(self.var_dtype[name]).type()
        is_number = isinstance(dt, np.number)
        is_timestamp = isinstance(dt, np.datetime64)

        # bin_specs is used for converting index back to original var in
        # histogram class.
        bin_specs = {}
        if is_number:
            bin_specs = self.bin_specs.get(name, self._unit_bin_specs)
        elif is_timestamp:
            bin_specs = self.bin_specs.get(name, self._unit_timestamp_specs)
        h = Histogram(self._valcnts[name], variable=name, datatype=self.var_dtype[name],
                      bin_specs=bin_specs)
        self._hists[name] = h

    # and store
    ds[self.store_key_hists] = self._hists

    # cleanup
    if self.store_key_counts is None:
        del self._valcnts
    if self.store_key_hists is None:
        del self._hists
def setUp(self):
    """Set up test"""

    execution.reset_eskapade()
    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = self.__class__.__name__
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['batchMode'] = True
def execute(self):
    """Execute PrintWs"""

    proc_mgr = ProcessManager()
    ws = proc_mgr.service(RooFitManager).ws
    ws.Print('v')

    return StatusCode.Success
def execute(self):
    """Execute SparkConfigurator"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    sm = proc_mgr.service(SparkManager)

    # stop running spark session, if any
    sm.finish()

    # start a new session
    spark = sm.create_session(eskapade_settings=settings, spark_settings=self.spark_settings)
    spark.sparkContext.setLogLevel(self.log_level)

    # check config
    self.log().info('New Spark session started with config: %s',
                    str(spark.sparkContext.getConf().getAll()))

    return StatusCode.Success
def do_storage(self):
    """Storage of the created RooDataHist object"""

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # 1. create pdf of dataset as well?
    if self.create_hist_pdf:
        hpdf_name = self.create_hist_pdf
        hist_pdf = ROOT.RooHistPdf(hpdf_name, hpdf_name, self._varset, self._rdh)

    # 2. remove original df?
    if self.rm_original:
        del ds[self.read_key]

    # 3a. put objects from the datastore into the workspace
    if self.into_ws:
        ws = proc_mgr.service(RooFitManager).ws
        try:
            ws.put(self._rdh, ROOT.RooFit.Rename(self.store_key))
            ws.defineSet(self.store_key_vars, self._varset)
            ws.defineSet(self.store_key_cats, self._catset)
            if self.create_hist_pdf:
                ws.put(hist_pdf, RooFit.RecycleConflictNodes())
        except Exception:
            raise RuntimeError('could not import object "%s" into rooworkspace' % self.read_key)
    # 3b. put objects into datastore
    else:
        ds[self.store_key] = self._rdh
        ds[self.store_key_vars] = self._varset
        ds[self.store_key_cats] = self._catset
        if self.create_hist_pdf:
            ds[hpdf_name] = hist_pdf

    n_rdh = int(self._rdh.sumEntries())
    ds['n_' + self.store_key] = n_rdh
    self.log().debug('Stored roodatahist "%s" with sum of weights: %d', self.store_key, n_rdh)
    ds[self.sk_map_to_original] = self._mto
def test_udf_functionality(self):
    """Test if Spark setup is working properly for user-defined functions"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = 'spark_setup'

    sm = proc_mgr.service(SparkManager)
    spark = sm.create_session(includeEskapadeModules=True, eskapade_settings=settings)
    df = spark.createDataFrame([(0, 'foo'), (1, 'bar')], ['id', 'value'])
    udf_to_str = udf(dq_helper.to_str, StringType())
    df = df.withColumn('output', udf_to_str(df['value']))
    self.assertSetEqual(set(tuple(r) for r in df.collect()),
                        set([(0, 'foo', 'foo'), (1, 'bar', 'bar')]),
                        'unexpected values in columns')
    sm.finish()
def run_eskapade(self, macro, return_status=definitions.StatusCode.Success):
    """Run Eskapade"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['macro'] = persistence.io_path('macros', settings.io_conf(), macro)
    status = execution.run_eskapade(settings)
    self.assertEqual(status, return_status)
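# Illustrative sketch (hypothetical test, not part of the suite): the run_eskapade
# helper above is called from test methods such as test_esk602; its optional
# return_status argument allows testing macros that are expected to fail.
# The macro name 'esk000_failing_macro.py' is made up for illustration only.
def test_failing_macro(self):
    """Test that a deliberately failing macro returns a Failure status"""

    # run Eskapade and expect Failure instead of the default Success status
    self.run_eskapade('esk000_failing_macro.py',
                      return_status=definitions.StatusCode.Failure)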