    def test_esk404(self):
        """Test Esk-404: Workspace create PDF, simulate, fit, plot"""

        # run Eskapade
        self.run_eskapade('esk404_workspace_createpdf_simulate_fit_plot.py')
        ds = ProcessManager().service(DataStore)
        ws = ProcessManager().service(RooFitManager).ws

        # data-generation checks
        self.assertIn('n_df_simdata', ds)
        self.assertEqual(1000, ds['n_df_simdata'])

        # roofit objects check in datastore
        self.assertIn('fit_result', ds)
        self.assertIsInstance(ds['fit_result'], ROOT.RooFitResult)

        # successful fit result: MINUIT status 0 and full accurate covariance matrix (covQual 3)
        fit_result = ds['fit_result']
        self.assertEqual(0, fit_result.status())
        self.assertEqual(3, fit_result.covQual())

        self.assertIn('simdata', ds)
        self.assertIsInstance(ds['simdata'], ROOT.RooDataSet)
        self.assertIn('simdata_plot', ds)
        self.assertIsInstance(ds['simdata_plot'], ROOT.RooPlot)

        # roofit objects check in workspace
        self.assertIn('model', ws)
        self.assertIn('bkg', ws)
        self.assertIn('sig', ws)
Example #2
    def test_esk609(self):
        """Test Esk-609: Map data-frame groups"""

        # run Eskapade
        self.run_eskapade('esk609_map_df_groups.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check presence and type of mapped RDDs
        for key in ('map_rdd', 'flat_map_rdd'):
            self.assertIn(key, ds, 'no data found with key "{}"'.format(key))
            self.assertIsInstance(
                ds[key], pyspark.RDD,
                'object "{0:s}" is not an RDD (type "{1:s}")'.format(
                    key, str(type(ds[key]))))

        # sums of "bar" variable
        bar_sums = [(0, 27.5), (1, 77.5), (2, 127.5), (3, 177.5), (4, 227.5),
                    (5, 277.5), (6, 327.5), (7, 377.5), (8, 427.5), (9, 477.5)]
        flmap_rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.,
                       bar_sums[it // 10][1]) for it in range(100)]

        # check mapped data frames
        self.assertListEqual(sorted(ds['map_rdd'].collect()), bar_sums,
                             'unexpected values in "map_rdd"')
        self.assertListEqual(sorted(ds['flat_map_rdd'].collect()), flmap_rows,
                             'unexpected values in "flat_map_rdd"')
Example #3
    def test_esk409(self):
        """Test Esk-409: Unredeemed vouchers"""

        # run Eskapade
        self.run_eskapade('esk409_unredeemed_vouchers.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check generated data
        self.assertIn('voucher_redeems', ds)
        self.assertIn('voucher_ages', ds)
        self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
        self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
        self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
        self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
        self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

        # check fit result
        fit_link = proc_mgr.get_chain('Fitting').get_link('Fit')
        self.assertEqual(fit_link.fit_result.status(), 0)
        n_ev_pull = (fit_link.results['n_ev'][0] -
                     6000.) / fit_link.results['n_ev'][1]
        self.assertGreater(n_ev_pull, -3.)
        self.assertLess(n_ev_pull, 3.)

        # check plot output
        plot_path = persistence.io_path(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf(), 'voucher_redeem.pdf')
        self.assertTrue(os.path.exists(plot_path))
        statinfo = os.stat(plot_path)
        self.assertGreater(statinfo.st_size, 0)
Example #4
    def execute(self):
        """Execute SparkDfWriter"""

        # get process manager and data store
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check if data frame exists in data store
        if self.read_key not in ds:
            err_msg = 'no input data found in data store with key "{}"'.format(
                self.read_key)
            if not self.fail_missing_data:
                self.log().error(err_msg.capitalize())
                return StatusCode.Success
            raise KeyError(err_msg)

        # fetch data from data store
        data = ds[self.read_key]
        if not isinstance(data, pyspark.sql.DataFrame):
            spark = proc_mgr.service(SparkManager).get_session()
            self.log().debug(
                'Converting data of type "%s" to a Spark data frame',
                type(data))
            data = data_conversion.create_spark_df(spark,
                                                   data,
                                                   schema=self.schema)

        # create data-frame writer with requested number of partitions/output files
        df_writer = data.repartition(self.num_files).write

        # call data-frame writer methods
        df_writer = apply_transform_funcs(df_writer, self._write_methods)

        return StatusCode.Success
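
A minimal configuration sketch for this link, following the attribute-assignment pattern used elsewhere in this listing (see the ApplyFuncToDf test further down); the zero-argument constructor and the key name 'my_df' are assumptions, and the write methods applied through apply_transform_funcs are configured on the link elsewhere:

    # illustrative sketch, not the canonical API; attribute names are taken
    # from the execute() body above, the key name 'my_df' is hypothetical
    link = SparkDfWriter()
    link.read_key = 'my_df'          # DataStore key holding the data
    link.schema = None               # schema used when converting non-Spark data
    link.num_files = 1               # repartition to a single output file
    link.fail_missing_data = True    # raise KeyError if 'my_df' is absent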
Example #5
    def checkCollectionSet(self):
        """ 
        Check existence of collection in either mongo or datastore, and check that they are not empty.
    
        Collections need to be both present and not empty.

        - For mongo collections a dedicated filter can be applied before doing the count. 
        - For pandas dataframes the additional option 'skip_chain_when_key_not_in_ds' exists. Meaning, skip the chain as well if the dataframe is not present in the datastore.
        """

        proc_mgr = ProcessManager()

        # check if collection names are present in datastore
        ds = proc_mgr.service(DataStore)
        for k in self.collectionSet:
            if k not in ds:
                if self.skip_chain_when_key_not_in_ds:
                    self.log().warning(
                        'Key <%s> not in DataStore. Sending skip chain signal.'
                        % k)
                    return StatusCode.SkipChain
                else:
                    raise KeyError('Key <%s> not in DataStore.' % k)
            df = ds[k]
            if len(df.index) == 0:
                self.log().warning(
                    'pandas.DataFrame with datastore key <%s> is empty. Sending skip chain signal.'
                    % k)
                return StatusCode.SkipChain

        return StatusCode.Success
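
A usage sketch of the skip-chain behaviour above; the link class is not shown in this excerpt, so `link` stands for an instance of it, and the key 'customers' is illustrative:

    # illustrative sketch: an empty dataframe triggers the skip-chain signal
    ds = ProcessManager().service(DataStore)
    ds['customers'] = pd.DataFrame()      # present, but empty
    link.collectionSet = ['customers']    # attributes read by the method above
    link.skip_chain_when_key_not_in_ds = True
    status = link.checkCollectionSet()    # returns StatusCode.SkipChain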
Example #6
    def test_esk208(self):
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + \
            '/tutorials/esk208_record_factorizer.py'

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        self.assertTrue(status.isSuccess())
        self.assertIn('test1', ds)
        self.assertIn('test1_fact', ds)
        self.assertIn('test1_refact', ds)
        self.assertIn('to_original', ds)
        df1 = ds['test1']
        df2 = ds['test1_refact']
        self.assertEqual(len(df1.index), 12)
        self.assertEqual(len(df2.index), 12)
        self.assertIn('dummy', df1.columns)
        self.assertIn('loc', df1.columns)
        self.assertIn('dummy', df2.columns)
        self.assertIn('loc', df2.columns)
        self.assertListEqual(df1['dummy'].values.tolist(),
                             df2['dummy'].values.tolist())
        self.assertListEqual(df1['loc'].values.tolist(),
                             df2['loc'].values.tolist())
Example #7
    def test_esk302(self):
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + \
            '/tutorials/esk302_histogram_filler_plotter.py'
        settings['batchMode'] = True

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)
        columns = [
            'date', 'isActive', 'age', 'eyeColor', 'gender', 'company',
            'latitude', 'longitude'
        ]

        # data-generation checks
        self.assertTrue(status.isSuccess())
        self.assertIn('n_sum_rc', ds)
        self.assertEqual(1300, ds['n_sum_rc'])
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(columns))

        # data-summary checks
        file_names = ['report.tex'] + \
            ['hist_{}.pdf'.format(col) for col in columns]
        for fname in file_names:
            path = '{0:s}/{1:s}/data/v0/report/{2:s}'.format(
                settings['resultsDir'], settings['analysisName'], fname)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #8
    def test_esk306(self):
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + \
            '/tutorials/esk306_concatenate_reports.py'
        settings['batchMode'] = True

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # report checks
        self.assertTrue(status.isSuccess())
        self.assertIn('report_pages', ds)
        self.assertIsInstance(ds['report_pages'], list)
        self.assertEqual(19, len(ds['report_pages']))

        # data-summary checks
        file_names = ['report.tex']
        for fname in file_names:
            path = '{0:s}/{1:s}/data/v0/report/{2:s}'.format(
                settings['resultsDir'], settings['analysisName'], fname)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #9
    def test_esk408(self):
        """Test Esk-408: Classification error propagation after fit"""

        # run Eskapade
        self.run_eskapade(
            'esk408_classification_error_propagation_after_fit.py')
        ds = ProcessManager().service(DataStore)
        ws = ProcessManager().service(RooFitManager).ws

        # data-generation checks
        self.assertIn('n_df_pvalues', ds)
        self.assertEqual(500, ds['n_df_pvalues'])
        self.assertIn('df_pvalues', ds)
        self.assertIsInstance(ds['df_pvalues'], pd.DataFrame)
        df = ds['df_pvalues']
        self.assertTrue('high_risk_pvalue' in df.columns)
        self.assertTrue('high_risk_perror' in df.columns)

        # roofit objects check in workspace
        fit_result = ws.obj('fit_result')
        self.assertTrue(fit_result)
        self.assertIsInstance(fit_result, ROOT.RooFitResult)
        # test for successful fit result
        self.assertEqual(0, fit_result.status())
        self.assertEqual(3, fit_result.covQual())

        frac = ws.var('frac')
        self.assertTrue(frac)
        self.assertGreater(frac.getVal(), 0)
        self.assertGreater(frac.getError(), 0)
Example #10
    def initialize(self):
        """Inititialize the TruncExpGen execution"""

        # check input arguments
        self.check_arg_types(store_key=str,
                             max_var_data_key=str,
                             model_name=str,
                             event_frac=float)
        self.check_arg_vals('store_key', 'max_var_data_key', 'model_name',
                            'event_frac')

        # check if model exists
        rfm = ProcessManager().service(RooFitManager)
        model = rfm.model(self.model_name)
        if not model:
            self.log().warning(
                'Model "%s" does not exist; creating with default values',
                self.model_name)
            model = rfm.model(self.model_name, model_cls=TruncExponential)

        # check if model PDF has been built
        if not model.is_built:
            model.build_model()

        # process command arguments for generate function
        self._gen_cmd_args = create_roofit_opts(create_linked_list=False,
                                                **self.kwargs)

        return StatusCode.Success
Example #11
    def execute(self):
        """Execute TruncExpGen"""

        # get process manager and services
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        rfm = proc_mgr.service(RooFitManager)

        # get PDF from RooFitManager
        model = rfm.model(self.model_name)

        # check if dataset with upper bounds exists in data store
        if self.max_var_data_key not in ds:
            self.log().warning(
                'No range upper-bound data in data store; generating %d dummy bounds',
                NUM_DUMMY_EVENTS)
            ds[self.max_var_data_key] = gen_max_var_data(model)

        # get max-var data
        max_var_data = ds.get(self.max_var_data_key)
        if not isinstance(max_var_data, ROOT.RooAbsData):
            raise TypeError('data with key "{}" are not RooFit data'.format(
                self.max_var_data_key))

        # select max-var data
        mv_sel_data = sel_max_var_data(model, max_var_data, self.event_frac)

        # generate data
        proto_arg = RooFit.ProtoData(mv_sel_data, False, False)
        data = model.pdf.generate(model.var_set, proto_arg,
                                  *self._gen_cmd_args.values())
        ds[self.store_key] = data

        return StatusCode.Success
Example #12
    def execute(self):
        """Execute DfSummary

        Creates a report page for each variable in data frame.

        * create statistics object for column
        * create overview table of column variable
        * plot histogram of column variable
        * store plot

        :returns: execution status code
        :rtype: StatusCode
        """

        ds = ProcessManager().service(DataStore)

        # fetch and check input data frame
        data = ds.get(self.read_key, None)
        if data is None:
            self.log().critical(
                'No input data "%s" found in data store for %s', self.read_key,
                str(self))
            raise RuntimeError('no input data found for {}'.format(str(self)))
        self.assert_data_type(data)

        # create report page for histogram
        if self.pages_key:
            self.pages = ds.get(self.pages_key, [])
            if not isinstance(self.pages, list):
                raise TypeError(
                    'pages key "{}" does not refer to a list'.format(
                        self.pages_key))

        # determine all possible columns, used for comparison below
        all_columns = self.get_all_columns(data)
        if not self.columns:
            self.columns = all_columns

        for name in self.columns[:]:
            # check if column is in data frame
            if name not in all_columns:
                self.log().warning('Key "%s" not in input data; skipping',
                                   name)
                self.columns.remove(name)
                continue
            self.log().debug('Processing "%s"', name)
            sample = self.get_sample(data, name)
            self.process_sample(name, sample)

        # add nan histogram to summary if present
        if self.nan_counts:
            nan_hist = self.nan_counts, self.columns
            self.process_nan_histogram(nan_hist, self.get_length(data))

        # storage
        if self.pages_key:
            ds[self.pages_key] = self.pages

        return StatusCode.Success
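
The pages_key mechanism above lets several summary links accumulate pages into one shared list, which is what the esk306 test earlier in this listing checks through ds['report_pages']. A minimal sketch of that accumulation, with two hypothetical link instances:

    # illustrative sketch: two DfSummary-style links share one page list
    ds = ProcessManager().service(DataStore)
    for link in (summary_link_1, summary_link_2):  # hypothetical instances
        link.pages_key = 'report_pages'
        link.execute()                 # each run extends ds['report_pages']
    n_pages = len(ds['report_pages'])  # pages from both links combined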
Example #13
    def test_execute(self):
        from eskapade import ProcessManager, DataStore
        from eskapade.analysis import ApplyFuncToDf

        # --- setup a dummy data frame
        df = pd.DataFrame({
            'a': ['aap', 'noot', 'mies'],
            'b': [0, 1, 2],
            'c': [0, 1, 1],
            'd': [1, 'a', None]
        })

        # --- setup datastore
        ds = ProcessManager().service(DataStore)
        ds['test_input'] = df

        # --- setup the link
        link = ApplyFuncToDf()
        link.add_columns = {'foo': 'bar'}
        link.read_key = 'test_input'
        link.store_key = 'test_output'
        link.execute()

        # --- the actual tests

        # stored at all?
        self.assertIn('test_output', list(ds.keys()), 'DataFrame not stored')

        # added a column?
        self.assertIn('foo', ds['test_output'].columns,
                      'Column not added to DataFrame')
Example #14
    def test_esk605(self):
        """Test Esk-605: Create Spark data frame"""

        # run Eskapade
        self.run_eskapade('esk605_create_spark_df.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check created data frames
        cols = (StructField('index', LongType()),
                StructField('foo', StringType()),
                StructField('bar', DoubleType()))
        rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.)
                for it in range(20, 100)]
        for key in ('rows_df', 'rdd_df', 'df_df', 'pd_df'):
            self.assertIn(key, ds,
                          'no object with key {} in data store'.format(key))
            df = ds[key]
            self.assertIsInstance(
                df, pyspark.sql.DataFrame,
                'object with key {0:s} is not a data frame (type {1:s})'.format(
                    key, str(type(df))))
            self.assertTupleEqual(
                tuple(df.schema), cols,
                'unexpected data-frame schema for {}'.format(key))
            self.assertListEqual(
                sorted(tuple(r) for r in df.collect()), rows,
                'unexpected data-frame content for {}'.format(key))
            self.assertTrue(df.is_cached,
                            'data frame {} is not cached'.format(key))
            self.assertLessEqual(
                df.rdd.getNumPartitions(), 2,
                'unexpected number of data-frame partitions for {}'.format(
                    key))
Example #15
    def execute(self):
        """ Execute WriteFromDf

        Pick up the dataframe and write to disk.
        """

        ds = ProcessManager().service(DataStore)

        # check that all dataframes are present
        assert all(k in ds for k in self.dictionary), \
            'key(s) not in DataStore.'

        # check that all ds items are dataframes
        assert all(isinstance(ds[k], pd.DataFrame) for k in self.dictionary), \
            'key(s) is not a pandas DataFrame.'

        # collect writer and store the dataframes
        for k in self.dictionary:
            df = ds[k]
            path = self.dictionary[k]
            if self.add_counter_to_name:
                ps = os.path.splitext(path)
                path = ps[0] + '_' + str(self._counter) + ps[1]
            writer = pandasWriter(path, self.writer)
            folder = os.path.dirname(path)
            self.log().debug('Checking for directory: %s', folder)
            if not os.path.exists(folder):
                self.log().fatal('Path given is invalid.')
                raise RuntimeError('output directory "{}" does not exist'.format(folder))
            self.log().debug('Writing file: %s', path)
            writer(df, path, **self.kwargs)

        self._counter += 1
        return StatusCode.Success
Example #16
    def test_esk304(self):
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + \
            '/tutorials/esk304_df_boxplot.py'
        settings['batchMode'] = True

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # data-generation checks
        self.assertTrue(status.isSuccess())
        self.assertIn('data', ds)
        self.assertIsInstance(ds['data'], pd.DataFrame)
        self.assertEqual(10000, len(ds['data']))
        self.assertListEqual(sorted(ds['data'].columns),
                             ['var_a', 'var_b', 'var_c'])

        # data-summary checks
        file_names = [
            'report_boxplots.tex', 'boxplot_var_a.pdf', 'boxplot_var_c.pdf'
        ]
        for fname in file_names:
            path = '{0:s}/{1:s}/data/v0/report/{2:s}'.format(
                settings['resultsDir'], settings['analysisName'], fname)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #17
    def execute(self):
        """Execute ConvertRooDataSet2RooDataHist"""

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        ws = proc_mgr.service(RooFitManager).ws

        # basic checks on contents of the data frame
        if self.from_ws:
            rds = ws.data(self.read_key)
            if rds is None:
                raise RuntimeError('no data with key "{}" in workspace'.format(self.read_key))
        else:
            if self.read_key not in ds:
                raise KeyError('key "{}" not found in datastore'.format(self.read_key))
            rds = ds[self.read_key]
        if not isinstance(rds, ROOT.RooDataSet):
            raise TypeError('retrieved object "{0:s}" not of type RooDataSet (got "{1:s}")'.format(self.read_key,
                                                                                                   str(type(rds))))
        if rds.numEntries() == 0:
            raise AssertionError('RooDataSet "{}" is empty'.format(self.read_key))

        # check presence of all columns
        for col in self.columns:
            if not ws.var(col):
                raise RuntimeError('variable "{}" not found in workspace'.format(col))

        # create a temporary observables set of the columns
        # (note: RooWorkspace.defineSet returns True on error)
        temp_obs = uuid.uuid4().hex
        obs = ','.join(self.columns)
        failure = ws.defineSet(temp_obs, obs)
        if not failure:
            theobs = ws.set(temp_obs)
        else:
            raise RuntimeError('unable to create observables set "{}"'.format(obs))

        # do conversion from RooDataSet to RooDataHist
        self.log().debug('Converting roodataset "%s" into roodatahist "%s"', self.read_key, self.store_key)
        rdh = data_conversion.rds_to_rdh(rds, rf_varset=theobs, binning_name=self.binning_name)

        # remove original rds?
        if self.rm_original:
            if self.from_ws:
                # FIXME: can datasets be deleted from a RooWorkspace? Don't know how.
                pass
            else:
                del ds[self.read_key]

        # put object into the datastore
        ds[self.store_key] = rdh
        n_rdh = rdh.numEntries()
        ds['n_' + self.store_key] = n_rdh
        self.log().debug('Stored roodatahist "%s" with number of bins: %d', self.store_key, n_rdh)

        # cleanup of temporary observables set
        ws.removeSet(temp_obs)

        return StatusCode.Success
Example #18
    def process_and_store(self):
        """Store (and possibly process) histogram objects"""

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        if self.store_key is not None:
            ds[self.store_key] = self._hists
Example #19
    def process_and_store(self):
        """Make, clean, and store ValueCount objects"""

        # nothing to do?
        if self.store_key_hists is None and self.store_key_counts is None:
            return

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # 1. construct value counts
        for col in self.columns:
            name = ':'.join(col)
            vc = ValueCounts(col, col, self._counts[name])
            # remove all items from the Counter where the key is not of the
            # correct datatype, e.g. in a Counter dict of ints remove any
            # non-ints that may arise from data-quality issues
            if self.drop_inconsistent_key_types:
                vc = self.drop_inconsistent_keys(col, vc)
            self._valcnts[name] = vc

        if self.store_key_counts is not None:
            ds[self.store_key_counts] = self._valcnts

        # 2. construct hists from value counts
        if self.store_key_hists is None:
            return

        for col in self.columns:
            if len(col) != 1:
                continue
            name = ':'.join(col)
            dt = np.dtype(self.var_dtype[name]).type()
            is_number = isinstance(dt, np.number)
            is_timestamp = isinstance(dt, np.datetime64)

            # bin_specs is used for converting index back to original var in
            # histogram class.
            bin_specs = {}
            if is_number:
                bin_specs = self.bin_specs.get(name, self._unit_bin_specs)
            elif is_timestamp:
                bin_specs = self.bin_specs.get(name,
                                               self._unit_timestamp_specs)
            h = Histogram(self._valcnts[name],
                          variable=name,
                          datatype=self.var_dtype[name],
                          bin_specs=bin_specs)
            self._hists[name] = h
        # and store
        ds[self.store_key_hists] = self._hists

        # cleanup (if store_key_hists were None, we would have returned earlier)
        if self.store_key_counts is None:
            del self._valcnts
Example #20
    def execute(self):
        """Execute PrintWs"""

        proc_mgr = ProcessManager()
        ws = proc_mgr.service(RooFitManager).ws

        ws.Print('v')

        return StatusCode.Success
Example #21
    def test_reset(self, mock_remove_services, mock_remove_chains):
        from eskapade import ProcessManager
        pm = ProcessManager()
        pm.custom_attribute = 'test'
        pm.reset()
        mock_remove_services.assert_called()
        mock_remove_chains.assert_called()
        self.assertFalse(hasattr(pm, 'custom_attribute'),
                         'custom_attribute was not removed')
Example #22
    def setUp(self):
        """Set up test"""

        execution.reset_eskapade()
        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = self.__class__.__name__
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['batchMode'] = True
Example #23
    def execute(self):
        """Execute HelloWorld"""

        settings = ProcessManager().service(ConfigObject)
        ds = ProcessManager().service(DataStore)

        for _ in range(self.repeat):
            self.log().info('Hello %s', self.hello)

        return StatusCode.Success
Example #24
    def run_eskapade(self,
                     macro,
                     return_status=definitions.StatusCode.Success):
        """Run Eskapade"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['macro'] = persistence.io_path('macros', settings.io_conf(),
                                                macro)
        status = execution.run_eskapade(settings)
        self.assertEqual(status, return_status)
Example #25
    def test_singleton(self):
        pm1 = ProcessManager()
        pm1.custom_attribute = 'test_attr'
        pm2 = ProcessManager()
        self.assertIs(pm1, pm2, 'process manager is not a singleton')
        self.assertTrue(
            hasattr(pm2, 'custom_attribute'),
            'process-manager attributes are reset upon re-creation')
        self.assertEqual(
            pm2.custom_attribute, 'test_attr',
            'process-manager attributes are changed upon re-creation')
Example #26
    def execute(self):
        """ Execute AssignRandomClass """

        ds = ProcessManager().service(DataStore)

        # basic checks on contents of the data frame
        assert self.readKey in ds, 'Key %s not in DataStore.' % self.readKey
        df = ds[self.readKey]
        if not isinstance(df, DataFrame):
            raise TypeError('Retrieved object not of type pandas DataFrame.')
        ndf = len(df.index)
        assert ndf > 0, 'dataframe %s is empty.' % self.readKey
        if self.column in df.columns:
            raise RuntimeError(
                'Column name <%s> already used: <%s>. Will not overwrite.' %
                (self.column, str(df.columns)))

        # fix final number of events assigned per random class
        # ... each class gets at least one event
        if self.nevents is not None:
            if len(self.nevents) == self.nclasses - 1:
                self.nevents.append(ndf - sum(n for n in self.nevents))
        else:
            self.nevents = [int(ndf * f) for f in self.fractions]
        for i in range(self.nclasses):
            nsum = sum(n for n in self.nevents[:i + 1])
            ndiff = 0 if (nsum - ndf < 0) else (nsum - ndf)
            self.nevents[i] -= ndiff
            if self.nevents[i] < 0:
                self.nevents[i] = 0
        for i, n in enumerate(self.nevents):
            assert n >= 0, 'Random class <%d> assigned nevents <%d> needs to be non-negative. %s' % \
                (i, n, str(self.nevents))
            self.log().info('Random class <%d> assigned n events <%d>.', i, n)

        # random reshuffling of dataframe indices
        settings = ProcessManager().service(ConfigObject)
        rng = RandomState(settings['seed'])
        permute = rng.permutation(df.index)

        # apply the random reshuffling, and assign records to the n classes
        df[self.column] = 0
        for i in range(self.nclasses):
            ib = sum(n for n in self.nevents[:i])
            ie = sum(n for n in self.nevents[:i + 1])
            df.loc[permute[ib:ie], self.column] = i

        return StatusCode.Success
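
A worked example of the nevents bookkeeping in execute() above, with illustrative numbers:

    # illustrative arithmetic for the class-size bookkeeping above:
    #   ndf = 10, nclasses = 3, nevents = [4, 4]  (one class short)
    #     -> remainder appended: nevents = [4, 4, 10 - 8] = [4, 4, 2]
    #     -> truncation pass: cumulative sums 4, 8, 10 never exceed ndf
    #   ndf = 10, nclasses = 2, nevents = [6, 6]
    #     -> cumulative sums 6, 12; second class clipped by 12 - 10 = 2 -> [6, 4]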
Example #27
    def execute(self):
        """Execute LINKTEMPLATE"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        # --- your algorithm code goes here

        self.log().debug('Now executing link: %s', self.name)

        return StatusCode.Success
Example #28
    def initialize(self):
        """Initialize SparkDataToCsv"""

        # check input arguments
        self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
        self.check_arg_types(mode=str, sep=str, num_files=int)
        self.check_arg_types(recurse=True, allow_none=True)
        self.check_arg_vals('read_key', 'sep')
        self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
        self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
        if self.num_files < 1:
            raise RuntimeError('requested number of files is less than 1 ({:d})'.format(self.num_files))

        # set other attributes
        self.do_execution = True

        # set default output path
        if not self.output_path:
            settings = ProcessManager().service(ConfigObject)
            self.output_path = 'file:' + persistence.io_path('results_data', settings.io_conf(), '{}_output'.format(self.name))

        # parse header argument
        try:
            self.header = tuple(self.header)
        except TypeError:
            self.header = bool(self.header)
        if isinstance(self.header, tuple) and not self.header:
            raise RuntimeError('empty header sequence specified')
    
        # check output directory, if local
        if self.output_path.startswith('file:'):
            local_output_path = os.path.abspath(self.output_path.replace('file:', ''))
            if os.path.exists(local_output_path):
                # output data already exist
                if self.mode == 'ignore':
                    # do not execute link
                    self.log().debug('Output data already exist; not executing link')
                    self.do_execution = False
                    return StatusCode.Success
                elif self.mode == 'error':
                    # raise exception
                    raise RuntimeError('output data already exist')

                # remove output directory
                if not os.path.isdir(local_output_path):
                    raise RuntimeError('output path "{}" is not a directory'.format(local_output_path))
                shutil.rmtree(local_output_path)
            elif not os.path.exists(os.path.dirname(local_output_path)):
                # create path up to the last component
                self.log().debug('Creating output path "%s"', local_output_path)
                os.makedirs(os.path.dirname(local_output_path))

        return StatusCode.Success
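
The try/except header parsing in initialize() above reduces to a few cases, summarized with illustrative inputs:

    # illustrative behaviour of the header parsing above:
    #   header = ('index', 'foo') -> tuple() succeeds -> kept as tuple of column names
    #   header = True             -> tuple(True) raises TypeError -> stored as bool
    #   header = ()               -> empty tuple is falsy -> RuntimeError raised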
Example #29
    def setUp(self):
        """Setup test environment"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = 'DataConversionTest'
        # ensure local testing
        spark_settings = [('spark.app.name', settings['analysisName']),
                          ('spark.master', 'local[*]'),
                          ('spark.driver.host', 'localhost')]
        proc_mgr.service(SparkManager).create_session(
            eskapade_settings=settings, spark_settings=spark_settings)
Example #30
    def test_configuring_spark(self):
        """Test configuration of Spark session

        Test setting configuration variables in SparkManager before creating a
        SparkSession.  Configuration with environment variables is not tested
        here, because the unit-test framework and the command line behave
        differently.  Configuration with the SparkConfigurator link is tested in
        the SparkAnalysisTutorialMacrosTest (tutorial esk601).
        """

        sm = ProcessManager().service(SparkManager)

        # create SparkSession
        spark_settings = [('spark.app.name', 'my_spark_session'),
                          ('spark.master', 'local[42]'),
                          ('spark.driver.host', '127.0.0.1')]
        spark = sm.create_session(spark_settings=spark_settings)
        sc = spark.sparkContext

        self.assertEqual(sc.getConf().get('spark.app.name'),
                         'my_spark_session', 'app name not set correctly')
        self.assertEqual(sc.getConf().get('spark.master'), 'local[42]',
                         'master not set correctly')
        self.assertEqual(sc.getConf().get('spark.driver.host'), '127.0.0.1',
                         'driver host not set correctly')

        sm.finish()

        # create new session with different settings - new settings should be picked up
        spark_settings = [('spark.app.name', 'second_spark_session'),
                          ('spark.master', 'local[*]'),
                          ('spark.driver.host', 'localhost')]
        spark = sm.create_session(spark_settings=spark_settings)
        sc = spark.sparkContext

        self.assertEqual(sc.getConf().get('spark.app.name'),
                         'second_spark_session', 'app name not set correctly')
        self.assertEqual(sc.getConf().get('spark.master'), 'local[*]',
                         'master not set correctly')
        self.assertEqual(sc.getConf().get('spark.driver.host'), 'localhost',
                         'driver host not set correctly')

        # specify new settings for already running session - nothing should change
        spark_settings = [('spark.app.name', 'third_spark_session'),
                          ('spark.master', 'local[-1]'),
                          ('spark.driver.host', 'foobar')]
        spark = sm.create_session(spark_settings=spark_settings)
        sc = spark.sparkContext

        self.assertEqual(sc.getConf().get('spark.app.name'),
                         'second_spark_session',
                         'app name unexpectedly changed')
        self.assertEqual(sc.getConf().get('spark.master'), 'local[*]',
                         'master unexpectedly changed')
        self.assertEqual(sc.getConf().get('spark.driver.host'), 'localhost',
                         'driver host unexpectedly changed')

        sm.finish()