def test_esk404(self):
    """Test Esk-404: Workspace create PDF, simulate, fit, plot"""

    # run Eskapade
    self.run_eskapade('esk404_workspace_createpdf_simulate_fit_plot.py')
    ds = ProcessManager().service(DataStore)
    ws = ProcessManager().service(RooFitManager).ws

    # data-generation checks
    self.assertIn('n_df_simdata', ds)
    self.assertEqual(1000, ds['n_df_simdata'])

    # roofit objects check in datastore
    self.assertIn('fit_result', ds)
    self.assertIsInstance(ds['fit_result'], ROOT.RooFitResult)

    # successful fit result
    fit_result = ds['fit_result']
    self.assertEqual(0, fit_result.status())
    self.assertEqual(3, fit_result.covQual())

    self.assertIn('simdata', ds)
    self.assertIsInstance(ds['simdata'], ROOT.RooDataSet)
    self.assertIn('simdata_plot', ds)
    self.assertIsInstance(ds['simdata_plot'], ROOT.RooPlot)

    # roofit objects check in workspace
    self.assertIn('model', ws)
    self.assertIn('bkg', ws)
    self.assertIn('sig', ws)
def test_esk609(self):
    """Test Esk-609: Map data-frame groups"""

    # run Eskapade
    self.run_eskapade('esk609_map_df_groups.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check input data
    for key in ('map_rdd', 'flat_map_rdd'):
        self.assertIn(key, ds, 'no data found with key "{}"'.format(key))
        self.assertIsInstance(
            ds[key], pyspark.RDD,
            'object "{0:s}" is not an RDD (type "{1:s}")'.format(key, str(type(ds[key]))))

    # sums of "bar" variable
    bar_sums = [(0, 27.5), (1, 77.5), (2, 127.5), (3, 177.5), (4, 227.5),
                (5, 277.5), (6, 327.5), (7, 377.5), (8, 427.5), (9, 477.5)]
    flmap_rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2., bar_sums[it // 10][1])
                  for it in range(100)]

    # check mapped data frames
    self.assertListEqual(sorted(ds['map_rdd'].collect()), bar_sums,
                         'unexpected values in "map_rdd"')
    self.assertListEqual(sorted(ds['flat_map_rdd'].collect()), flmap_rows,
                         'unexpected values in "flat_map_rdd"')
def test_esk409(self):
    """Test Esk-409: Unredeemed vouchers"""

    # run Eskapade
    self.run_eskapade('esk409_unredeemed_vouchers.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check generated data
    self.assertIn('voucher_redeems', ds)
    self.assertIn('voucher_ages', ds)
    self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
    self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
    self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
    self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
    self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

    # check fit result
    fit_link = proc_mgr.get_chain('Fitting').get_link('Fit')
    self.assertEqual(fit_link.fit_result.status(), 0)
    n_ev_pull = (fit_link.results['n_ev'][0] - 6000.) / fit_link.results['n_ev'][1]
    self.assertGreater(n_ev_pull, -3.)
    self.assertLess(n_ev_pull, 3.)

    # check plot output
    plot_path = persistence.io_path('results_data',
                                    proc_mgr.service(ConfigObject).io_conf(),
                                    'voucher_redeem.pdf')
    self.assertTrue(os.path.exists(plot_path))
    statinfo = os.stat(plot_path)
    self.assertGreater(statinfo.st_size, 0)
def execute(self):
    """Execute SparkDfWriter"""

    # get process manager and data store
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check if data frame exists in data store
    if self.read_key not in ds:
        err_msg = 'no input data found in data store with key "{}"'.format(self.read_key)
        if not self.fail_missing_data:
            self.log().error(err_msg.capitalize())
            return StatusCode.Success
        raise KeyError(err_msg)

    # fetch data from data store
    data = ds[self.read_key]
    if not isinstance(data, pyspark.sql.DataFrame):
        spark = proc_mgr.service(SparkManager).get_session()
        self.log().debug('Converting data of type "%s" to a Spark data frame', type(data))
        data = data_conversion.create_spark_df(spark, data, schema=self.schema)

    # create data-frame writer with requested number of partitions/output files
    df_writer = data.repartition(self.num_files).write

    # call data-frame writer methods
    df_writer = apply_transform_funcs(df_writer, self._write_methods)

    return StatusCode.Success
def checkCollectionSet(self):
    """Check existence of collections in MongoDB or the datastore

    Check that the collections in the collection set are present and not
    empty:

    - For MongoDB collections, a dedicated filter can be applied before
      doing the count.
    - For pandas dataframes, the additional option
      'skip_chain_when_key_not_in_ds' exists, meaning the chain is also
      skipped if the dataframe is not present in the datastore.
    """
    proc_mgr = ProcessManager()

    # check if collection names are present in datastore
    ds = proc_mgr.service(DataStore)
    for k in self.collectionSet:
        if k not in ds:
            if self.skip_chain_when_key_not_in_ds:
                self.log().warning('Key <%s> not in DataStore. Sending skip-chain signal.', k)
                return StatusCode.SkipChain
            raise KeyError('Key <%s> not in DataStore.' % k)
        df = ds[k]
        if len(df.index) == 0:
            self.log().warning('pandas.DataFrame with datastore key <%s> is empty. '
                               'Sending skip-chain signal.', k)
            return StatusCode.SkipChain

    return StatusCode.Success
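
# A minimal usage sketch for the collection check above (hedged: the link
# class name `SkipChainIfEmpty` and its import path are assumptions based on
# this method; they are not verified against the original source). It shows
# how the skip-chain signal is produced when a requested key is missing from
# the datastore.
def _example_check_collection_set():
    import pandas as pd
    from eskapade import ProcessManager, DataStore, StatusCode
    from eskapade.core_ops import SkipChainIfEmpty  # assumed import path

    ds = ProcessManager().service(DataStore)
    ds['customers'] = pd.DataFrame({'id': [1, 2, 3]})

    link = SkipChainIfEmpty()
    link.collectionSet = ['customers', 'orders']  # 'orders' is absent
    link.skip_chain_when_key_not_in_ds = True

    # the missing 'orders' key triggers the skip-chain signal instead of an error
    assert link.checkCollectionSet() == StatusCode.SkipChain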
def test_esk208(self):
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings['esRoot'] + '/tutorials/esk208_record_factorizer.py'

    status = execution.run_eskapade(settings)

    ds = ProcessManager().service(DataStore)

    self.assertTrue(status.isSuccess())
    self.assertTrue('test1' in ds)
    self.assertTrue('test1_fact' in ds)
    self.assertTrue('test1_refact' in ds)
    self.assertTrue('to_original' in ds)

    df1 = ds['test1']
    df2 = ds['test1_refact']
    self.assertEqual(len(df1.index), 12)
    self.assertEqual(len(df2.index), 12)
    self.assertTrue('dummy' in df1.columns)
    self.assertTrue('loc' in df1.columns)
    self.assertTrue('dummy' in df2.columns)
    self.assertTrue('loc' in df2.columns)
    self.assertListEqual(df1['dummy'].values.tolist(), df2['dummy'].values.tolist())
    self.assertListEqual(df1['loc'].values.tolist(), df2['loc'].values.tolist())
def test_esk302(self):
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings['esRoot'] + '/tutorials/esk302_histogram_filler_plotter.py'
    settings['batchMode'] = True

    status = execution.run_eskapade(settings)

    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    columns = ['date', 'isActive', 'age', 'eyeColor', 'gender',
               'company', 'latitude', 'longitude']

    # data-generation checks
    self.assertTrue(status.isSuccess())
    self.assertIn('n_sum_rc', ds)
    self.assertEqual(1300, ds['n_sum_rc'])
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(columns))

    # data-summary checks
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in columns]
    for fname in file_names:
        path = '{0:s}/{1:s}/data/v0/report/{2:s}'.format(
            settings['resultsDir'], settings['analysisName'], fname)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def test_esk306(self):
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings['esRoot'] + '/tutorials/esk306_concatenate_reports.py'
    settings['batchMode'] = True

    status = execution.run_eskapade(settings)

    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    # report checks
    self.assertTrue(status.isSuccess())
    self.assertIn('report_pages', ds)
    self.assertIsInstance(ds['report_pages'], list)
    self.assertEqual(19, len(ds['report_pages']))

    # data-summary checks
    file_names = ['report.tex']
    for fname in file_names:
        path = '{0:s}/{1:s}/data/v0/report/{2:s}'.format(
            settings['resultsDir'], settings['analysisName'], fname)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def test_esk408(self):
    """Test Esk-408: Classification error propagation after fit"""

    # run Eskapade
    self.run_eskapade('esk408_classification_error_propagation_after_fit.py')
    ds = ProcessManager().service(DataStore)
    ws = ProcessManager().service(RooFitManager).ws

    # data-generation checks
    self.assertIn('n_df_pvalues', ds)
    self.assertEqual(500, ds['n_df_pvalues'])
    self.assertIn('df_pvalues', ds)
    self.assertIsInstance(ds['df_pvalues'], pd.DataFrame)
    df = ds['df_pvalues']
    self.assertIn('high_risk_pvalue', df.columns)
    self.assertIn('high_risk_perror', df.columns)

    # roofit objects check in workspace
    fit_result = ws.obj('fit_result')
    self.assertTrue(fit_result)
    self.assertIsInstance(fit_result, ROOT.RooFitResult)

    # test for successful fit result
    self.assertEqual(0, fit_result.status())
    self.assertEqual(3, fit_result.covQual())

    frac = ws.var('frac')
    self.assertTrue(frac)
    self.assertGreater(frac.getVal(), 0)
    self.assertGreater(frac.getError(), 0)
def initialize(self):
    """Initialize the TruncExpGen execution"""

    # check input arguments
    self.check_arg_types(store_key=str, max_var_data_key=str, model_name=str, event_frac=float)
    self.check_arg_vals('store_key', 'max_var_data_key', 'model_name', 'event_frac')

    # check if model exists
    rfm = ProcessManager().service(RooFitManager)
    model = rfm.model(self.model_name)
    if not model:
        self.log().warning('Model "{}" does not exist; creating with default values'
                           .format(self.model_name))
        model = rfm.model(self.model_name, model_cls=TruncExponential)

    # check if model PDF has been built
    if not model.is_built:
        model.build_model()

    # process command arguments for generate function
    self._gen_cmd_args = create_roofit_opts(create_linked_list=False, **self.kwargs)

    return StatusCode.Success
def execute(self):
    """Execute TruncExpGen"""

    # get process manager and services
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    rfm = proc_mgr.service(RooFitManager)

    # get PDF from RooFitManager
    model = rfm.model(self.model_name)

    # check if dataset with upper bounds exists in data store
    if self.max_var_data_key not in ds:
        self.log().warning('No range upper-bound data in data store; generating %d dummy bounds',
                           NUM_DUMMY_EVENTS)
        ds[self.max_var_data_key] = gen_max_var_data(model)

    # get max-var data
    max_var_data = ds.get(self.max_var_data_key)
    if not isinstance(max_var_data, ROOT.RooAbsData):
        raise TypeError('data with key "{}" are not RooFit data'.format(self.max_var_data_key))

    # select max-var data
    mv_sel_data = sel_max_var_data(model, max_var_data, self.event_frac)

    # generate data
    proto_arg = RooFit.ProtoData(mv_sel_data, False, False)
    data = model.pdf.generate(model.var_set, proto_arg, *self._gen_cmd_args.values())
    ds[self.store_key] = data

    return StatusCode.Success
def execute(self):
    """Execute DfSummary

    Creates a report page for each variable in the data frame:

    * create statistics object for column
    * create overview table of column variable
    * plot histogram of column variable
    * store plot

    :returns: execution status code
    :rtype: StatusCode
    """
    ds = ProcessManager().service(DataStore)

    # fetch and check input data frame
    data = ds.get(self.read_key, None)
    if data is None:
        self.log().critical('No input data "%s" found in data store for %s',
                            self.read_key, str(self))
        raise RuntimeError('no input data found for {}'.format(str(self)))
    self.assert_data_type(data)

    # create report page for histogram
    if self.pages_key:
        self.pages = ds.get(self.pages_key, [])
        if not isinstance(self.pages, list):
            raise TypeError('pages key "{}" does not refer to a list'.format(self.pages_key))

    # determine all possible columns, used for comparison below
    all_columns = self.get_all_columns(data)
    if not self.columns:
        self.columns = all_columns

    for name in self.columns[:]:
        # check if column is in data frame
        if name not in all_columns:
            self.log().warning('Key "%s" not in input data; skipping', name)
            self.columns.remove(name)
            continue
        self.log().debug('Processing "%s"', name)
        sample = self.get_sample(data, name)
        self.process_sample(name, sample)

    # add nan histogram to summary if present
    if self.nan_counts:
        nan_hist = self.nan_counts, self.columns
        self.process_nan_histogram(nan_hist, self.get_length(data))

    # storage
    if self.pages_key:
        ds[self.pages_key] = self.pages

    return StatusCode.Success
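
# A hedged configuration sketch for the summary link above (the `DfSummary`
# class name and import path are assumptions; the attribute names mirror
# those referenced in execute()). Unknown columns are skipped with a warning,
# and report pages are accumulated under `pages_key` for later concatenation.
def _example_df_summary():
    import pandas as pd
    from eskapade import ProcessManager, DataStore
    from eskapade.visualization import DfSummary  # assumed import path

    ds = ProcessManager().service(DataStore)
    ds['input_data'] = pd.DataFrame({'age': [21, 34, 55], 'gender': ['m', 'f', 'f']})

    link = DfSummary()
    link.read_key = 'input_data'
    link.pages_key = 'report_pages'
    link.columns = ['age', 'gender', 'no_such_column']  # last one is skipped
    link.execute()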
def test_execute(self):
    from eskapade import ProcessManager, DataStore
    from eskapade.analysis import ApplyFuncToDf

    # --- setup a dummy data frame
    df = pd.DataFrame({
        'a': ['aap', 'noot', 'mies'],
        'b': [0, 1, 2],
        'c': [0, 1, 1],
        'd': [1, 'a', None]
    })

    # --- setup datastore
    ds = ProcessManager().service(DataStore)
    ds['test_input'] = df

    # --- setup the link
    link = ApplyFuncToDf()
    link.add_columns = {'foo': 'bar'}
    link.read_key = 'test_input'
    link.store_key = 'test_output'
    link.execute()

    # --- the actual tests
    # stored at all?
    self.assertIn('test_output', ds, 'DataFrame not stored')
    # added a column?
    self.assertIn('foo', ds['test_output'].columns, 'Column not added to DataFrame')
def test_esk605(self):
    """Test Esk-605: Create Spark data frame"""

    # run Eskapade
    self.run_eskapade('esk605_create_spark_df.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # check created data frames
    cols = (StructField('index', LongType()), StructField('foo', StringType()),
            StructField('bar', DoubleType()))
    rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(20, 100)]
    for key in ('rows_df', 'rdd_df', 'df_df', 'pd_df'):
        self.assertIn(key, ds, 'no object with key {} in data store'.format(key))
        df = ds[key]
        self.assertIsInstance(df, pyspark.sql.DataFrame,
                              'object with key {0:s} is not a data frame (type {1:s})'
                              .format(key, str(type(df))))
        self.assertTupleEqual(tuple(df.schema), cols,
                              'unexpected data-frame schema for {}'.format(key))
        self.assertListEqual(sorted(tuple(r) for r in df.collect()), rows,
                             'unexpected data-frame content for {}'.format(key))
        self.assertTrue(df.is_cached, 'data frame {} is not cached'.format(key))
        self.assertLessEqual(df.rdd.getNumPartitions(), 2,
                             'unexpected number of data-frame partitions for {}'.format(key))
def execute(self):
    """Execute WriteFromDf

    Pick up the dataframe and write to disk.
    """
    ds = ProcessManager().service(DataStore)

    # check that all dataframes are present
    assert all(k in ds for k in self.dictionary), 'key(s) not in DataStore.'

    # check that all ds items are dataframes
    assert all(isinstance(ds[k], pd.DataFrame) for k in self.dictionary), \
        'key(s) is not a pandas DataFrame.'

    # collect writer and store the dataframes
    for k in self.dictionary:
        df = ds[k]
        path = self.dictionary[k]
        if self.add_counter_to_name:
            ps = os.path.splitext(path)
            path = ps[0] + '_' + str(self._counter) + ps[1]
        writer = pandasWriter(path, self.writer)
        folder = os.path.dirname(path)
        self.log().debug('Checking for directory: %s', folder)
        if not os.path.exists(folder):
            self.log().fatal('Path given is invalid.')
            raise RuntimeError('output directory "{}" does not exist'.format(folder))
        self.log().debug('Writing file: %s', path)
        writer(df, path, **self.kwargs)
        self._counter += 1

    return StatusCode.Success
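
# Minimal usage sketch for WriteFromDf (hedged: the attribute names
# `dictionary`, `writer`, and `add_counter_to_name` are taken from execute()
# above; the import path and the 'csv' writer label are assumptions).
def _example_write_from_df():
    import pandas as pd
    from eskapade import ProcessManager, DataStore
    from eskapade.analysis import WriteFromDf  # assumed import path

    ds = ProcessManager().service(DataStore)
    ds['results'] = pd.DataFrame({'x': [1, 2, 3]})

    link = WriteFromDf()
    link.dictionary = {'results': 'output/results.csv'}  # datastore key -> file path
    link.writer = 'csv'
    link.add_counter_to_name = True  # writes output/results_0.csv, _1.csv, ...
    link.execute()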
def test_esk304(self):
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings['esRoot'] + '/tutorials/esk304_df_boxplot.py'
    settings['batchMode'] = True

    status = execution.run_eskapade(settings)

    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    # data-generation checks
    self.assertTrue(status.isSuccess())
    self.assertIn('data', ds)
    self.assertIsInstance(ds['data'], pd.DataFrame)
    self.assertEqual(10000, len(ds['data']))
    self.assertListEqual(sorted(ds['data'].columns), ['var_a', 'var_b', 'var_c'])

    # data-summary checks
    file_names = ['report_boxplots.tex', 'boxplot_var_a.pdf', 'boxplot_var_c.pdf']
    for fname in file_names:
        path = '{0:s}/{1:s}/data/v0/report/{2:s}'.format(
            settings['resultsDir'], settings['analysisName'], fname)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
def execute(self):
    """Execute ConvertRooDataSet2RooDataHist"""

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    ws = proc_mgr.service(RooFitManager).ws

    # basic checks on contents of the data set
    if self.from_ws:
        rds = ws.data(self.read_key)
        if rds is None:
            raise RuntimeError('no data with key "{}" in workspace'.format(self.read_key))
    else:
        if self.read_key not in ds:
            raise KeyError('key "{}" not found in datastore'.format(self.read_key))
        rds = ds[self.read_key]
    if not isinstance(rds, ROOT.RooDataSet):
        raise TypeError('retrieved object "{0:s}" not of type RooDataSet (got "{1:s}")'
                        .format(self.read_key, str(type(rds))))
    if rds.numEntries() == 0:
        raise AssertionError('RooDataSet "{}" is empty'.format(self.read_key))

    # check presence of all columns
    for col in self.columns:
        if not ws.var(col):
            raise RuntimeError('variable "{}" not found in workspace'.format(col))

    # create a temporary observables set of the columns
    temp_obs = uuid.uuid4().hex
    obs = ','.join(self.columns)
    failure = ws.defineSet(temp_obs, obs)
    if not failure:
        theobs = ws.set(temp_obs)
    else:
        raise RuntimeError('unable to retrieve (/create) observables with name "{}"'.format(obs))

    # do conversion from RooDataSet to RooDataHist
    self.log().debug('Converting roodataset "%s" into roodatahist "%s"',
                     self.read_key, self.store_key)
    rdh = data_conversion.rds_to_rdh(rds, rf_varset=theobs, binning_name=self.binning_name)

    # remove original rds?
    if self.rm_original:
        if self.from_ws:
            # FIXME: can datasets be deleted from a RooWorkspace? Don't know how.
            pass
        else:
            del ds[self.read_key]

    # put object into the datastore
    ds[self.store_key] = rdh
    n_rdh = rdh.numEntries()
    ds['n_' + self.store_key] = n_rdh
    self.log().debug('Stored roodatahist "%s" with number of bins: %d', self.store_key, n_rdh)

    # cleanup of temporary observables set
    ws.removeSet(temp_obs)

    return StatusCode.Success
def process_and_store(self):
    """Store (and possibly process) histogram objects"""

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    if self.store_key is not None:
        ds[self.store_key] = self._hists
def process_and_store(self):
    """Make, clean, and store ValueCounts objects"""

    # nothing to do?
    if self.store_key_hists is None and self.store_key_counts is None:
        return

    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # 1. construct value counts
    for col in self.columns:
        name = ':'.join(col)
        vc = ValueCounts(col, col, self._counts[name])
        # remove all items from counters where the key is not of the correct
        # data type, e.g. in a counter dict of ints, remove any non-ints that
        # may arise from data-quality issues
        if self.drop_inconsistent_key_types:
            vc = self.drop_inconsistent_keys(col, vc)
        self._valcnts[name] = vc

    if self.store_key_counts is not None:
        ds[self.store_key_counts] = self._valcnts

    # 2. construct hists from value counts
    if self.store_key_hists is None:
        return

    for col in self.columns:
        if len(col) != 1:
            continue
        name = ':'.join(col)
        dt = np.dtype(self.var_dtype[name]).type()
        is_number = isinstance(dt, np.number)
        is_timestamp = isinstance(dt, np.datetime64)
        # bin_specs is used for converting the index back to the original
        # variable in the histogram class
        bin_specs = {}
        if is_number:
            bin_specs = self.bin_specs.get(name, self._unit_bin_specs)
        elif is_timestamp:
            bin_specs = self.bin_specs.get(name, self._unit_timestamp_specs)
        h = Histogram(self._valcnts[name], variable=name,
                      datatype=self.var_dtype[name], bin_specs=bin_specs)
        self._hists[name] = h

    # and store
    ds[self.store_key_hists] = self._hists

    # cleanup
    if self.store_key_counts is None:
        del self._valcnts
    if self.store_key_hists is None:
        del self._hists
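
# Illustrative bin_specs mapping consumed above (hedged: the 'bin_width' and
# 'bin_offset' keys are assumptions modeled on typical Eskapade histogram
# settings; when a column has no entry, the unit specs
# self._unit_bin_specs / self._unit_timestamp_specs are used instead).
example_bin_specs = {
    'age': {'bin_width': 5.0, 'bin_offset': 0.0},      # numeric column
    'date': {'bin_width': 864e11, 'bin_offset': 0.0},  # one day in nanoseconds for np.datetime64
}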
def execute(self):
    """Execute PrintWs"""

    proc_mgr = ProcessManager()
    ws = proc_mgr.service(RooFitManager).ws
    ws.Print('v')

    return StatusCode.Success
def test_reset(self, mock_remove_services, mock_remove_chains):
    from eskapade import ProcessManager
    pm = ProcessManager()
    pm.custom_attribute = 'test'
    pm.reset()
    mock_remove_services.assert_called()
    mock_remove_chains.assert_called()
    self.assertFalse(hasattr(pm, 'custom_attribute'), 'custom_attribute was not removed')
def setUp(self):
    """Set up test"""

    execution.reset_eskapade()
    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = self.__class__.__name__
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['batchMode'] = True
def execute(self):
    """Execute HelloWorld"""

    for _ in range(self.repeat):
        self.log().info('Hello {0}'.format(self.hello))

    return StatusCode.Success
def run_eskapade(self, macro, return_status=definitions.StatusCode.Success):
    """Run Eskapade"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['macro'] = persistence.io_path('macros', settings.io_conf(), macro)
    status = execution.run_eskapade(settings)
    self.assertEqual(status, return_status)
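
# Typical use of the helper above in a tutorial test (hedged sketch: the macro
# name refers to the hello-world tutorial; the check is illustrative, not
# taken from the original test suite):
def test_esk101_example(self):
    """Test Esk-101: Hello world (sketch)"""
    self.run_eskapade('esk101_helloworld.py')
    settings = ProcessManager().service(ConfigObject)
    # the macro is assumed to set its own analysis name
    self.assertIn('analysisName', settings)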
def test_singleton(self):
    pm1 = ProcessManager()
    pm1.custom_attribute = 'test_attr'
    pm2 = ProcessManager()
    self.assertIs(pm1, pm2, 'process manager is not a singleton')
    self.assertTrue(hasattr(pm2, 'custom_attribute'),
                    'process-manager attributes are reset upon re-creation')
    self.assertEqual(pm2.custom_attribute, 'test_attr',
                     'process-manager attributes are changed upon re-creation')
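
# For context, a minimal sketch of the singleton pattern this test exercises
# (hedged: an illustrative implementation, not Eskapade's actual
# ProcessManager code):
class _SingletonSketch:
    _instance = None

    def __new__(cls):
        # reuse the single cached instance instead of allocating a new one
        if cls._instance is None:
            cls._instance = super(_SingletonSketch, cls).__new__(cls)
        return cls._instance

assert _SingletonSketch() is _SingletonSketch()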
def execute(self):
    """Execute AssignRandomClass"""

    ds = ProcessManager().service(DataStore)

    # basic checks on contents of the data frame
    assert self.readKey in ds, 'Key %s not in DataStore.' % self.readKey
    df = ds[self.readKey]
    if not isinstance(df, DataFrame):
        raise TypeError('Retrieved object not of type pandas DataFrame.')
    ndf = len(df.index)
    assert ndf > 0, 'dataframe %s is empty.' % self.readKey
    if self.column in df.columns:
        raise RuntimeError('Column name <%s> already used: <%s>. Will not overwrite.'
                           % (self.column, str(df.columns)))

    # fix final number of events assigned per random class;
    # each class gets at least one event
    if self.nevents is not None:
        if len(self.nevents) == self.nclasses - 1:
            self.nevents.append(ndf - sum(self.nevents))
    else:
        self.nevents = [int(ndf * f) for f in self.fractions]
    for i in range(self.nclasses):
        nsum = sum(self.nevents[:i + 1])
        ndiff = max(0, nsum - ndf)
        self.nevents[i] -= ndiff
        if self.nevents[i] < 0:
            self.nevents[i] = 0
    for i, n in enumerate(self.nevents):
        assert n >= 0, 'Random class <%d> assigned nevents <%d> needs to be greater than zero. %s' \
            % (i, n, str(self.nevents))
        self.log().info('Random class <%d> assigned n events <%d>.', i, n)

    # random reshuffling of dataframe indices
    settings = ProcessManager().service(ConfigObject)
    rng = RandomState(settings['seed'])
    permute = rng.permutation(df.index)

    # apply the random reshuffling, and assign records to the n classes
    df[self.column] = 0
    for i in range(self.nclasses):
        ib = sum(self.nevents[:i])
        ie = sum(self.nevents[:i + 1])
        df.loc[permute[ib:ie], self.column] = i

    return StatusCode.Success
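
# Standalone sketch of the event-count bookkeeping above (hedged: an
# illustrative helper, not part of the original link). Given per-class
# counts, any overshoot beyond the total number of records is clipped from
# the later classes, exactly as in the loop over self.nevents:
def clip_class_counts(nevents, ndf):
    """Clip cumulative class counts so they never exceed ndf records."""
    clipped = list(nevents)
    for i in range(len(clipped)):
        nsum = sum(clipped[:i + 1])
        clipped[i] = max(0, clipped[i] - max(0, nsum - ndf))
    return clipped

# with 10 records, the second class is clipped from 6 to 4 events
assert clip_class_counts([6, 6], ndf=10) == [6, 4]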
def execute(self):
    """Execute LINKTEMPLATE"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)

    # --- your algorithm code goes here
    self.log().debug('Now executing link: %s', self.name)

    return StatusCode.Success
def initialize(self):
    """Initialize SparkDataToCsv"""

    # check input arguments
    self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
    self.check_arg_types(mode=str, sep=str, num_files=int)
    self.check_arg_types(recurse=True, allow_none=True)
    self.check_arg_vals('read_key', 'sep')
    self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
    self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
    if self.num_files < 1:
        raise RuntimeError('requested number of files is less than 1 ({:d})'.format(self.num_files))

    # set other attributes
    self.do_execution = True

    # set default output path
    if not self.output_path:
        settings = ProcessManager().service(ConfigObject)
        self.output_path = 'file:' + persistence.io_path('results_data', settings.io_conf(),
                                                         '{}_output'.format(self.name))

    # parse header argument
    try:
        self.header = tuple(self.header)
    except TypeError:
        self.header = bool(self.header)
    if isinstance(self.header, tuple) and not self.header:
        raise RuntimeError('empty header sequence specified')

    # check output directory, if local
    if self.output_path.startswith('file:'):
        local_output_path = os.path.abspath(self.output_path.replace('file:', ''))
        if os.path.exists(local_output_path):
            # output data already exist
            if self.mode == 'ignore':
                # do not execute link
                self.log().debug('Output data already exist; not executing link')
                self.do_execution = False
                return StatusCode.Success
            elif self.mode == 'error':
                # raise exception
                raise RuntimeError('output data already exist')

            # remove output directory
            if not os.path.isdir(local_output_path):
                raise RuntimeError('output path "{}" is not a directory'.format(local_output_path))
            shutil.rmtree(local_output_path)
        elif not os.path.exists(os.path.dirname(local_output_path)):
            # create path up to the last component
            self.log().debug('Creating output path "%s"', local_output_path)
            os.makedirs(os.path.dirname(local_output_path))

    return StatusCode.Success
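
# The header argument above accepts either a sequence of column names or a
# boolean flag; a standalone illustration of that parsing (hedged sketch, not
# part of the original link):
def parse_header(header):
    """Return header as a tuple of names, or as a plain boolean flag."""
    try:
        header = tuple(header)  # sequence of column names
    except TypeError:
        header = bool(header)   # fall back to a write-header-or-not flag
    if isinstance(header, tuple) and not header:
        raise RuntimeError('empty header sequence specified')
    return header

assert parse_header(['index', 'foo']) == ('index', 'foo')
assert parse_header(True) is True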
def setUp(self):
    """Set up test environment"""

    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    settings['analysisName'] = 'DataConversionTest'

    # ensure local testing
    spark_settings = [('spark.app.name', settings['analysisName']),
                      ('spark.master', 'local[*]'),
                      ('spark.driver.host', 'localhost')]
    proc_mgr.service(SparkManager).create_session(eskapade_settings=settings,
                                                  spark_settings=spark_settings)
def test_configuring_spark(self):
    """Test configuration of Spark session

    Test setting configuration variables in the SparkManager before creating
    a SparkSession. Configuration with environment variables is not tested
    here, because the unit-test framework and the command line behave
    differently. Configuration with the SparkConfigurator link is tested in
    the SparkAnalysisTutorialMacrosTest (tutorial esk601).
    """
    sm = ProcessManager().service(SparkManager)

    # create SparkSession
    spark_settings = [('spark.app.name', 'my_spark_session'),
                      ('spark.master', 'local[42]'),
                      ('spark.driver.host', '127.0.0.1')]
    spark = sm.create_session(spark_settings=spark_settings)
    sc = spark.sparkContext
    self.assertEqual(sc.getConf().get('spark.app.name'), 'my_spark_session',
                     'app name not set correctly')
    self.assertEqual(sc.getConf().get('spark.master'), 'local[42]',
                     'master not set correctly')
    self.assertEqual(sc.getConf().get('spark.driver.host'), '127.0.0.1',
                     'driver host not set correctly')
    sm.finish()

    # create new session with different settings - new settings should be picked up
    spark_settings = [('spark.app.name', 'second_spark_session'),
                      ('spark.master', 'local[*]'),
                      ('spark.driver.host', 'localhost')]
    spark = sm.create_session(spark_settings=spark_settings)
    sc = spark.sparkContext
    self.assertEqual(sc.getConf().get('spark.app.name'), 'second_spark_session',
                     'app name not set correctly')
    self.assertEqual(sc.getConf().get('spark.master'), 'local[*]',
                     'master not set correctly')
    self.assertEqual(sc.getConf().get('spark.driver.host'), 'localhost',
                     'driver host not set correctly')

    # specify new settings for already running session - nothing should change
    spark_settings = [('spark.app.name', 'third_spark_session'),
                      ('spark.master', 'local[-1]'),
                      ('spark.driver.host', 'foobar')]
    spark = sm.create_session(spark_settings=spark_settings)
    sc = spark.sparkContext
    self.assertEqual(sc.getConf().get('spark.app.name'), 'second_spark_session',
                     'app name not set correctly')
    self.assertEqual(sc.getConf().get('spark.master'), 'local[*]',
                     'master not set correctly')
    self.assertEqual(sc.getConf().get('spark.driver.host'), 'localhost',
                     'driver host not set correctly')
    sm.finish()