Code Example #1
    def test_esk409(self):
        """Test Esk-409: Unredeemed vouchers"""

        # run Eskapade
        self.run_eskapade('esk409_unredeemed_vouchers.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check generated data
        self.assertIn('voucher_redeems', ds)
        self.assertIn('voucher_ages', ds)
        self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
        self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
        self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
        self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
        self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

        # check fit result
        fit_link = proc_mgr.get_chain('Fitting').get_link('Fit')
        self.assertEqual(fit_link.fit_result.status(), 0)
        n_ev_pull = (fit_link.results['n_ev'][0] -
                     6000.) / fit_link.results['n_ev'][1]
        self.assertGreater(n_ev_pull, -3.)
        self.assertLess(n_ev_pull, 3.)

        # check plot output
        plot_path = persistence.io_path(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf(), 'voucher_redeem.pdf')
        self.assertTrue(os.path.exists(plot_path))
        statinfo = os.stat(plot_path)
        self.assertGreater(statinfo.st_size, 0)
Code Example #2
    def execute(self):
        """Execute TruncExpGen"""

        # get process manager and services
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        rfm = proc_mgr.service(RooFitManager)

        # get PDF from RooFitManager
        model = rfm.model(self.model_name)

        # check if dataset with upper bounds exists in data store
        if self.max_var_data_key not in ds:
            self.log().warning(
                'No range upper-bound data in data store; generating %d dummy bounds',
                NUM_DUMMY_EVENTS)
            ds[self.max_var_data_key] = gen_max_var_data(model)

        # get max-var data
        max_var_data = ds.get(self.max_var_data_key)
        if not isinstance(max_var_data, ROOT.RooAbsData):
            raise TypeError('data with key "{}" are not RooFit data'.format(
                self.max_var_data_key))

        # select max-var data
        mv_sel_data = sel_max_var_data(model, max_var_data, self.event_frac)

        # generate data
        proto_arg = RooFit.ProtoData(mv_sel_data, False, False)
        data = model.pdf.generate(model.var_set, proto_arg,
                                  *self._gen_cmd_args.values())
        ds[self.store_key] = data

        return StatusCode.Success
Code Example #3
    def test_execute_all_status_return(self, mock_execute, mock_import,
                                       mock_persist):
        from eskapade import StatusCode, ProcessManager

        pm = ProcessManager()
        pm.service(
            ConfigObject)['analysisName'] = 'test_execute_all_status_return'
        c1 = Chain('1')
        c2 = Chain('2')
        c3 = Chain('fail')
        c4 = Chain('4')
        pm.chains = [c1, c2, c3, c4]
        status = pm.execute_all()
        self.assertEqual(status, StatusCode.Failure)
        executed_chains = [arg[0][0] for arg in mock_execute.call_args_list]
        self.assertNotIn(c4, executed_chains)

        pm.reset()
        pm.service(
            ConfigObject)['analysisName'] = 'test_execute_all_status_return'
        mock_execute.reset_mock()
        c1 = Chain('1')
        c2 = Chain('2')
        c3 = Chain('skip')
        c4 = Chain('4')
        pm.chains = [c1, c2, c3, c4]
        status = pm.execute_all()
        self.assertEqual(status, StatusCode.Success)
        executed_chains = [arg[0][0] for arg in mock_execute.call_args_list]
        self.assertIn(c4, executed_chains)
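
The indexing into mock_execute.call_args_list above relies on each entry being an (args, kwargs) pair, so the chain passed as the first positional argument sits at arg[0][0]. A minimal standalone unittest.mock sketch of that pattern (illustrative only, not part of the Eskapade test):

import unittest.mock as mock

# each call records (args, kwargs); [0][0] is the first positional argument
m = mock.Mock()
m('chain_1', timeout=5)
m('chain_2')
first_args = [call_args[0][0] for call_args in m.call_args_list]
assert first_args == ['chain_1', 'chain_2']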
Code Example #4
    def execute(self):
        """Execute ConvertRooDataSet2RooDataHist"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)
        ws = proc_mgr.service(RooFitManager).ws

        # basic checks on contents of the input dataset
        if self.from_ws:
            rds = ws.data(self.read_key)
            if rds is None:
                raise RuntimeError('no data with key "{}" in workspace'.format(self.read_key))
        else:
            if self.read_key not in ds:
                raise KeyError('key "{}" not found in datastore'.format(self.read_key))
            rds = ds[self.read_key]
        if not isinstance(rds, ROOT.RooDataSet):
            raise TypeError('retrieved object "{0:s}" not of type RooDataSet (got "{1:s}")'.format(self.read_key,
                                                                                                   str(type(rds))))
        if rds.numEntries() == 0:
            raise AssertionError('RooDataSet "{}" is empty'.format(self.read_key))

        # check presence of all columns
        for col in self.columns:
            if not ws.var(col):
                raise RuntimeError('variable "{}" not found in workspace'.format(col))

        # create a temporary observables set of the columns
        temp_obs = uuid.uuid4().hex
        obs = ','.join(self.columns)
        failure = ws.defineSet(temp_obs, obs)
        if not failure:
            theobs = ws.set(temp_obs)
        else:
            raise RuntimeError('unable to retrieve (/create) observables with name "{}"'.format(obs))

        # do conversion from RooDataSet to RooDataHist
        self.log().debug('Converting roodataset "%s" into roodatahist "%s"', self.read_key, self.store_key)
        rdh = data_conversion.rds_to_rdh(rds, rf_varset=theobs, binning_name=self.binning_name)

        # remove original rds?
        if self.rm_original:
            if self.from_ws:
                # FIXME: can datasets be deleted from a RooWorkspace? Not clear how.
                pass
            else:
                del ds[self.read_key]

        # put object into the datastore
        ds[self.store_key] = rdh
        n_rdh = rdh.numEntries()
        ds['n_' + self.store_key] = n_rdh
        self.log().debug('Stored roodatahist "%s" with number of bins: %d', self.store_key, n_rdh)

        # cleanup of temporary observables set
        ws.removeSet(temp_obs)

        return StatusCode.Success
Code Example #5
    def test_esk603(self):
        """Test Esk-603: Write Spark data to CSV"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk603_write_spark_data_to_csv.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # read output data
        results_data_path = persistence.io_dir(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf())
        names = []
        headers = []
        contents = []
        csv_dirs = glob.glob('{}/*'.format(results_data_path))
        self.assertEqual(len(csv_dirs), 3,
                         'expected to find three CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            self.assertEqual(
                len(csv_files), 1,
                'expected to find only one CSV file in "{}"'.format(names[-1]))
            with open(csv_files[0]) as csv_file:
                contents.append([line.strip().split(',') for line in csv_file])
                self.assertEqual(
                    len(contents[-1]), 101,
                    'unexpected number of lines in "{}" CSV'.format(names[-1]))
                headers.append(contents[-1][0])
                contents[-1] = sorted(contents[-1][1:])

        # check output data
        self.assertListEqual(headers[0], ['index', 'foo', 'bar'],
                             'unexpected CSV header for "{}"'.format(names[0]))
        self.assertListEqual(
            contents[0],
            sorted([str(it), 'foo{:d}'.format(it),
                    str((it + 1) / 2.)] for it in range(100)),
            'unexpected CSV content for "{}"'.format(names[0]))
        for name, head, cont in zip(names[1:], headers[1:], contents[1:]):
            self.assertListEqual(
                head, headers[0],
                'CSV header of "{0:s}" differs from header of "{1:s}"'.format(
                    name, names[0]))
            self.assertListEqual(
                cont, contents[0],
                'CSV content of "{0:s}" differs from content of "{1:s}"'.
                format(name, names[0]))
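
A quick standalone check of what a local-mode pattern like r'local\[.*\]' accepts for spark.master (plain re, independent of Eskapade; the master strings are just examples):

import re

# illustrative only: which spark.master strings the local-mode pattern accepts
for master in ('local[*]', 'local[4]', 'yarn', 'spark://host:7077'):
    print(master, bool(re.search(r'local\[.*\]', master)))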
Code Example #6
File: link_template.py Project: jpavel/Eskapade
    def execute(self):
        """Execute LINKTEMPLATE"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        # --- your algorithm code goes here

        self.log().debug('Now executing link: %s', self.name)

        return StatusCode.Success
Code Example #7
    def setUp(self):
        """Setup test environment"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = 'DataConversionTest'
        # ensure local testing
        spark_settings = [('spark.app.name', settings['analysisName']),
                          ('spark.master', 'local[*]'),
                          ('spark.driver.host', 'localhost')]
        proc_mgr.service(SparkManager).create_session(
            eskapade_settings=settings, spark_settings=spark_settings)
Code Example #8
    def execute(self):
        """Execute SparkStreamingWriter"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        data = ds[self.read_key]

        if self.repartition:
            data = data.repartition(self.repartition)
        data.saveAsTextFiles(self.output_path, suffix=self.suffix)

        return StatusCode.Success
Code Example #9
    def execute(self):
        """Execute IPythonEmbed"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        self.log().info(
            "Starting interactive session ... press Ctrl+d to exit.\n")
        # this function calls the interactive ipython session
        # in this session ds, settings, and proc_mgr are available
        embed()

        return StatusCode.Success
Code Example #10
File: spark_df_reader.py Project: Patechoc/Eskapade
    def execute(self):
        """Execute SparkDfReader"""

        # create data-frame reader
        proc_mgr = ProcessManager()
        spark = proc_mgr.service(SparkManager).get_session()
        data = spark.read

        # call data-frame reader methods
        data = apply_transform_funcs(data, self._read_methods)

        # store data in data store
        proc_mgr.service(DataStore)[self.store_key] = data

        return StatusCode.Success
Code Example #11
    def test_esk602(self):
        """Test Esk-602: Read CSV files into a Spark data frame"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk602_read_csv_to_spark_df.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].rdd.getNumPartitions(), 5,
                         'unexpected number of partitions in data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(ds['spark_df'].columns, ['date', 'loc', 'x', 'y'],
                             'unexpected columns in data frame')
        self.assertSetEqual(
            set((r['date'], r['loc']) for r in ds['spark_df'].collect()),
            set([(20090101, 'a'), (20090102, 'b'), (20090103, 'c'),
                 (20090104, 'd'), (20090104, 'e'), (20090106, 'a'),
                 (20090107, 'b'), (20090107, 'c'), (20090107, 'd'),
                 (20090108, 'e'), (20090109, 'e'), (20090109, 'f')]),
            'unexpected values in date/loc columns')
Code Example #12
File: spark_df_writer.py Project: Patechoc/Eskapade
    def execute(self):
        """Execute SparkDfWriter"""

        # get process manager and data store
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check if data frame exists in data store
        if self.read_key not in ds:
            err_msg = 'no input data found in data store with key "{}"'.format(
                self.read_key)
            if not self.fail_missing_data:
                self.log().error(err_msg.capitalize())
                return StatusCode.Success
            raise KeyError(err_msg)

        # fetch data from data store
        data = ds[self.read_key]
        if not isinstance(data, pyspark.sql.DataFrame):
            spark = proc_mgr.service(SparkManager).get_session()
            self.log().debug(
                'Converting data of type "%s" to a Spark data frame',
                type(data))
            data = data_conversion.create_spark_df(spark,
                                                   data,
                                                   schema=self.schema)

        # create data-frame writer with requested number of partitions/output files
        df_writer = data.repartition(self.num_files).write

        # call data-frame writer methods
        df_writer = apply_transform_funcs(df_writer, self._write_methods)

        return StatusCode.Success
Code Example #13
    def test_esk605(self):
        """Test Esk-605: Create Spark data frame"""

        # run Eskapade
        self.run_eskapade('esk605_create_spark_df.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check created data frames
        cols = (StructField('index',
                            LongType()), StructField('foo', StringType()),
                StructField('bar', DoubleType()))
        rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.)
                for it in range(20, 100)]
        for key in ('rows_df', 'rdd_df', 'df_df', 'pd_df'):
            self.assertIn(key, ds,
                          'no object with key {} in data store'.format(key))
            df = ds[key]
            self.assertIsInstance(
                df, pyspark.sql.DataFrame,
                'object with key {0:s} is not a data frame (type {1:s})'.
                format(key, str(type(df))))
            self.assertTupleEqual(
                tuple(df.schema), cols,
                'unexpected data-frame schema for {}'.format(key))
            self.assertListEqual(
                sorted(tuple(r) for r in df.collect()), rows,
                'unexpected data-frame content for {}'.format(key))
            self.assertTrue(df.is_cached,
                            'data frame {} is not cached'.format(key))
            self.assertLessEqual(
                df.rdd.getNumPartitions(), 2,
                'unexpected number of data-frame partitions for {}'.format(
                    key))
Code Example #14
    def test_esk604(self):
        """Test Esk-604: Execute Spark-SQL query"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk604_spark_execute_query.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check data frame
        self.assertIn('spark_df_sql', ds,
                      'no object with key "spark_df_sql" in data store')
        self.assertIsInstance(ds['spark_df_sql'], pyspark.sql.DataFrame,
                              '"spark_df_sql" is not a Spark data frame')
        self.assertEqual(ds['spark_df_sql'].count(), 4,
                         'unexpected number of rows in filtered data frame')
        self.assertListEqual(ds['spark_df_sql'].columns,
                             ['loc', 'sumx', 'sumy'],
                             'unexpected columns in data frame')
        self.assertEqual(
            ds['spark_df_sql'].schema,
            proc_mgr.get_chain('ApplySQL').get_link('SparkSQL').schema,
            'schema of data frame does not correspond to schema stored in link'
        )
        self.assertSetEqual(
            set(tuple(r) for r in ds['spark_df_sql'].collect()),
            set([('e', 10, 15), ('d', 2, 11), ('b', 6, 16), ('a', 2, 18)]),
            'unexpected values in loc/sumx/sumy columns')
Code Example #15
    def setUp(self):
        """Set up test"""

        TutorialMacrosTest.setUp(self)
        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['macrosDir'] = '{0:s}/{1:s}'.format(
            utils.get_env_var('es_root'), 'tutorials')
        settings['analysisName'] = 'SparkAnalysisTutorialMacrosTest'

        # ensure local testing
        spark_settings = [('spark.app.name', settings['analysisName']),
                          ('spark.master', 'local[*]'),
                          ('spark.driver.host', 'localhost')]
        proc_mgr.service(SparkManager).create_session(
            eskapade_settings=settings, spark_settings=spark_settings)
Code Example #16
    def test_spark_setup(self):
        """Test if Spark setup is working properly"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = 'spark_setup'

        sm = proc_mgr.service(SparkManager)
        spark = sm.create_session(eskapade_settings=settings)

        df = spark.createDataFrame([(0, 'foo'), (1, 'bar')], ['id', 'value'])

        self.assertSetEqual(set(tuple(r) for r in df.collect()),
                            set([(0, 'foo'), (1, 'bar')]),
                            'unexpected values in columns')
        sm.finish()
Code Example #17
    def test_esk607(self):
        """Test Esk-607: Add column to Spark dataframe"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk607_spark_with_column.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check data frame
        self.assertIn('new_spark_df', ds,
                      'no object with key "new_spark_df" in data store')
        self.assertIsInstance(ds['new_spark_df'], pyspark.sql.DataFrame,
                              '"new_spark_df" is not a Spark data frame')
        self.assertEqual(ds['new_spark_df'].count(), 5,
                         'unexpected number of rows in filtered data frame')
        self.assertListEqual(
            ds['new_spark_df'].columns,
            ['dummy', 'date', 'loc', 'x', 'y', 'pow_xy1', 'pow_xy2'],
            'unexpected columns in data frame')
        self.assertSetEqual(
            set(tuple(r) for r in ds['new_spark_df'].collect()),
            set([('bla', 20090103, 'c', 5, 7, 78125.0, 78125.0),
                 ('bal', 20090102, 'b', 3, 8, 6561.0, 6561.0),
                 ('flo', 20090104, 'e', 3, 5, 243.0, 243.0),
                 ('bar', 20090101, 'a', 1, 9, 1.0, 1.0),
                 ('foo', 20090104, 'd', 1, 6, 1.0, 1.0)]),
            'unexpected values in columns')
Code Example #18
    def test_esk609(self):
        """Test Esk-609: Map data-frame groups"""

        # run Eskapade
        self.run_eskapade('esk609_map_df_groups.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check input data
        for key in ('map_rdd', 'flat_map_rdd'):
            self.assertIn(key, ds, 'no data found with key "{}"'.format(key))
            self.assertIsInstance(
                ds[key], pyspark.RDD,
                'object "{0:s}" is not an RDD (type "{1:s}")'.format(
                    key, str(type(ds[key]))))

        # sums of "bar" variable
        bar_sums = [(0, 27.5), (1, 77.5), (2, 127.5), (3, 177.5), (4, 227.5),
                    (5, 277.5), (6, 327.5), (7, 377.5), (8, 427.5), (9, 477.5)]
        flmap_rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.,
                       bar_sums[it // 10][1]) for it in range(100)]

        # check mapped data frames
        self.assertListEqual(sorted(ds['map_rdd'].collect()), bar_sums,
                             'unexpected values in "map_rdd"')
        self.assertListEqual(sorted(ds['flat_map_rdd'].collect()), flmap_rows,
                             'unexpected values in "flat_map_rdd"')
Code Example #19
    def checkCollectionSet(self):
        """ 
        Check existence of collection in either mongo or datastore, and check that they are not empty.
    
        Collections need to be both present and not empty.

        - For mongo collections a dedicated filter can be applied before doing the count. 
        - For pandas dataframes the additional option 'skip_chain_when_key_not_in_ds' exists. Meaning, skip the chain as well if the dataframe is not present in the datastore.
        """

        proc_mgr = ProcessManager()

        # check if collection names are present in datastore
        ds = proc_mgr.service(DataStore)
        for k in self.collectionSet:
            if k not in list(ds.keys()):
                if self.skip_chain_when_key_not_in_ds:
                    self.log().warning(
                        'Key <%s> not in DataStore. Sending skip chain signal.'
                        % k)
                    return StatusCode.SkipChain
                else:
                    raise Exception('Key <%s> not in DataStore.' % k)
            df = ds[k]
            if len(df.index) == 0:
                self.log().warning(
                    'pandas.DataFrame with datastore key <%s> is empty. Sending skip chain signal.'
                    % k)
                return StatusCode.SkipChain

        return StatusCode.Success
Code Example #20
File: run_eskapade.py Project: jpavel/Eskapade
def main():
    """Run Eskapade

    Top-level control function for an Eskapade run started from the
    command line.  Arguments specified by the user are parsed and
    converted to settings in the configuration object.  Optionally, an
    interactive IPython session is started when the run is finished.
    """

    # create parser for command-line arguments
    parser = create_arg_parser()
    user_args = parser.parse_args()

    # create config object for settings
    if not user_args.unpickle_config:
        # create new config
        settings = ConfigObject()
    else:
        # read previously persisted settings if pickled file is specified
        conf_path = user_args.config_files.pop(0)
        settings = ConfigObject.import_from_file(conf_path)
    del user_args.unpickle_config

    # set configuration macros
    settings.add_macros(user_args.config_files)

    # set user options
    settings.set_user_opts(user_args)

    # run Eskapade
    core.execution.run_eskapade(settings)

    # start interpreter if requested (--interactive on command line)
    if settings.get('interactive'):
        # create process manager, config object, and data store
        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        # set Pandas display options
        pd.set_option('display.width', 120)
        pd.set_option('display.max_columns', 50)

        # start interactive session
        log = logging.getLogger(__name__)
        log.info("Continuing interactive session ... press Ctrl+d to exit.\n")
        IPython.embed()
Code Example #21
    def execute(self):
        """Execute SparkStreamingWordCount"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        lines = ds[self.read_key]
        counts = lines.flatMap(lambda line: line.split(" "))\
            .map(lambda word: (word, 1))\
            .reduceByKey(lambda a, b: a + b)
        counts.pprint()

        if self.store_key is not None:
            ds[self.store_key] = counts

        return StatusCode.Success
Code Example #22
    def process_and_store(self):
        """Store (and possibly process) histogram objects"""

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        if self.store_key is not None:
            ds[self.store_key] = self._hists
Code Example #23
    def test_esk608(self):
        """Test Esk-608: Execute Spark histogram filling macro"""

        # check if required Python and Java libraries are made available to worker nodes
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )
        self.assertRegexpMatches(
            sc.getConf().get('spark.jars.packages', ''),
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail'
        )
        if re.search('spark://', sc.getConf().get('spark.master', '')):
            py_mods = utils.get_file_path('py_mods')
            self.assertRegexpMatches(
                sc.getConf().get('spark.submit.pyFiles', ''), py_mods,
                'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode'
            )
            self.assertRegexpMatches(
                sc.getConf().get('spark.files', ''), py_mods,
                'Eskapade modules missing from spark.files, needed in Spark cluster mode'
            )

        # run Eskapade
        self.run_eskapade('esk608_spark_histogrammar.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        settings = ProcessManager().service(ConfigObject)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(sorted(ds['spark_df'].columns),
                             sorted(['date', 'loc', 'x', 'y']),
                             'unexpected columns in data frame')

        # data-generation checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        col_names = ['date', 'x', 'y', 'loc', 'x:y']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

        # data-summary checks
        f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
        file_names = ['report.tex'
                      ] + ['hist_{}.pdf'.format(col) for col in f_bases]
        for fname in file_names:
            path = persistence.io_path('results_data', settings.io_conf(),
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertTrue(statinfo.st_size > 0)
Code Example #24
    def process_and_store(self):
        """Make, clean, and store ValueCount objects"""

        # nothing to do?
        if self.store_key_hists is None and self.store_key_counts is None:
            return

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # 1. construct value counts
        for col in self.columns:
            name = ':'.join(col)
            vc = ValueCounts(col, col, self._counts[name])
            # remove all items from Counters where the key is not of correct datatype.
            # e.g. in Counter dict of ints, remove any non-ints that may arise
            # from dq issues.
            if self.drop_inconsistent_key_types:
                vc = self.drop_inconsistent_keys(col, vc)
            self._valcnts[name] = vc

        if self.store_key_counts is not None:
            ds[self.store_key_counts] = self._valcnts

        # 2. construct hists from value counts
        if self.store_key_hists is None:
            return

        for col in self.columns:
            if len(col) != 1:
                continue
            name = ':'.join(col)
            dt = np.dtype(self.var_dtype[name]).type()
            is_number = isinstance(dt, np.number)
            is_timestamp = isinstance(dt, np.datetime64)

            # bin_specs is used for converting index back to original var in
            # histogram class.
            bin_specs = {}
            if is_number:
                bin_specs = self.bin_specs.get(name, self._unit_bin_specs)
            elif is_timestamp:
                bin_specs = self.bin_specs.get(name,
                                               self._unit_timestamp_specs)
            h = Histogram(self._valcnts[name],
                          variable=name,
                          datatype=self.var_dtype[name],
                          bin_specs=bin_specs)
            self._hists[name] = h
        # and store
        ds[self.store_key_hists] = self._hists

        # cleanup
        if self.store_key_counts is None:
            del self._valcnts
        if self.store_key_hists is None:
            del self._hists
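
The number/timestamp dispatch above hinges on instantiating the column's dtype and testing the instance against np.number and np.datetime64. A standalone NumPy sketch of that check (illustrative only; the dtype strings here are made-up examples):

import numpy as np

# illustrative only: instantiate each dtype and classify it as number or timestamp
for dtype_str in ('int64', 'float64', 'datetime64[ns]', 'str'):
    dt = np.dtype(dtype_str).type()
    print(dtype_str, isinstance(dt, np.number), isinstance(dt, np.datetime64))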
Code Example #25
File: test_bases.py Project: Patechoc/Eskapade
    def setUp(self):
        """Set up test"""

        execution.reset_eskapade()
        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = self.__class__.__name__
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['batchMode'] = True
Code Example #26
    def execute(self):
        """Execute PrintWs"""

        proc_mgr = ProcessManager()
        ws = proc_mgr.service(RooFitManager).ws

        ws.Print('v')

        return StatusCode.Success
Code Example #27
    def execute(self):
        """Execute SparkConfigurator"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        sm = proc_mgr.service(SparkManager)

        # stop running spark session, if any
        sm.finish()

        # start a new session
        spark = proc_mgr.service(SparkManager).create_session(
            eskapade_settings=settings, spark_settings=self.spark_settings)
        spark.sparkContext.setLogLevel(self.log_level)

        # check config
        self.log().info('New Spark session started with config: {}'.format(str(spark.sparkContext.getConf().getAll())))

        return StatusCode.Success
Code Example #28
    def do_storage(self):
        """Storage of the created RooDataHist object"""

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # 1. create pdf of dataset as well?
        if self.create_hist_pdf:
            hpdf_name = self.create_hist_pdf
            hist_pdf = ROOT.RooHistPdf(hpdf_name, hpdf_name, self._varset,
                                       self._rdh)

        # 2. remove original df?
        if self.rm_original:
            del ds[self.read_key]

        # 3a. put objects from the datastore into the workspace
        if self.into_ws:
            ws = proc_mgr.service(RooFitManager).ws
            try:
                ws.put(self._rdh, ROOT.RooFit.Rename(self.store_key))
                ws.defineSet(self.store_key_vars, self._varset)
                ws.defineSet(self.store_key_cats, self._catset)
                if self.create_hist_pdf:
                    ws.put(hist_pdf, RooFit.RecycleConflictNodes())
            except Exception:
                raise RuntimeError(
                    'could not import object "%s" into rooworkspace' %
                    self.read_key)
        # 3b. put objects into datastore
        else:
            ds[self.store_key] = self._rdh
            ds[self.store_key_vars] = self._varset
            ds[self.store_key_cats] = self._catset
            if self.create_hist_pdf:
                ds[hpdf_name] = hist_pdf

        n_rdh = int(self._rdh.sumEntries())
        ds['n_' + self.store_key] = n_rdh
        self.log().debug('Stored roodatahist "%s" with sum of weights: %d',
                         self.store_key, n_rdh)
        ds[self.sk_map_to_original] = self._mto
Code Example #29
    def test_udf_functionality(self):
        """Test if Spark setup is working properly for user-defined functions"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = 'spark_setup'

        sm = proc_mgr.service(SparkManager)
        spark = sm.create_session(includeEskapadeModules=True,
                                  eskapade_settings=settings)

        df = spark.createDataFrame([(0, 'foo'), (1, 'bar')], ['id', 'value'])

        udf_to_str = udf(dq_helper.to_str, StringType())
        df = df.withColumn('output', udf_to_str(df['value']))

        self.assertSetEqual(set(tuple(r) for r in df.collect()),
                            set([(0, 'foo', 'foo'), (1, 'bar', 'bar')]),
                            'unexpected values in columns')
        sm.finish()
Code Example #30
File: test_bases.py Project: Patechoc/Eskapade
    def run_eskapade(self,
                     macro,
                     return_status=definitions.StatusCode.Success):
        """Run Eskapade"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['macro'] = persistence.io_path('macros', settings.io_conf(),
                                                macro)
        status = execution.run_eskapade(settings)
        self.assertEqual(status, return_status)