def test_esk603(self):
    """Test Esk-603: Write Spark data to CSV"""

    # check if running in local mode
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')

    # run Eskapade
    self.run_eskapade('esk603_write_spark_data_to_csv.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # read output data
    results_data_path = persistence.io_dir(
        'results_data', proc_mgr.service(ConfigObject).io_conf())
    names = []
    headers = []
    contents = []
    csv_dirs = glob.glob('{}/*'.format(results_data_path))
    self.assertEqual(len(csv_dirs), 3, 'expected to find three CSV output directories')
    for csv_dir in csv_dirs:
        names.append(os.path.basename(csv_dir))
        csv_files = glob.glob('{}/part*'.format(csv_dir))
        self.assertEqual(
            len(csv_files), 1,
            'expected to find only one CSV file in "{}"'.format(names[-1]))
        with open(csv_files[0]) as csv_file:
            contents.append([l.strip().split(',') for l in csv_file])
        self.assertEqual(
            len(contents[-1]), 101,
            'unexpected number of lines in "{}" CSV'.format(names[-1]))
        headers.append(contents[-1][0])
        contents[-1] = sorted(contents[-1][1:])

    # check output data
    self.assertListEqual(headers[0], ['index', 'foo', 'bar'],
                         'unexpected CSV header for "{}"'.format(names[0]))
    self.assertListEqual(
        contents[0],
        sorted([str(it), 'foo{:d}'.format(it), str((it + 1) / 2.)] for it in range(100)),
        'unexpected CSV content for "{}"'.format(names[0]))
    for name, head, cont in zip(names[1:], headers[1:], contents[1:]):
        self.assertListEqual(
            head, headers[0],
            'CSV header of "{0:s}" differs from header of "{1:s}"'.format(name, names[0]))
        self.assertListEqual(
            cont, contents[0],
            'CSV content of "{0:s}" differs from content of "{1:s}"'.format(name, names[0]))
def tearDown(self):
    """Tear down test"""

    # remove persisted results for this test
    path = persistence.io_dir('ana_results')
    if os.path.exists(path):
        shutil.rmtree(path)

    # reset run process
    execution.reset_eskapade()
def tearDown(self):
    """Tear down test"""

    # remove persisted results for this test
    path = persistence.io_dir(
        'ana_results', ProcessManager().service(ConfigObject).io_conf())
    if os.path.exists(path):
        shutil.rmtree(path)

    # reset run process
    execution.reset_eskapade()
def persist_services(self, io_conf, chain=None):
    """Persist process services in files.

    :param dict io_conf: I/O config as returned by ConfigObject.io_conf
    :param str chain: name of chain for which data is persisted
    """
    # parse I/O config
    io_conf = ConfigObject.IoConfig(**io_conf)

    # parse specified chain
    if chain:
        # prepend underscore for output directory
        chain = '_{}'.format(chain)
    else:
        # use default directory if chain not specified
        chain = 'default'

    # get chain path and set link of latest data
    base_path = persistence.io_dir('proc_service_data', io_conf)
    chain_path = '{0:s}/{1:s}'.format(base_path, chain)
    persistence.create_dir(chain_path)
    self.logger.debug('Persisting process services in "{path}".', path=chain_path)
    try:
        # remove old symlink
        os.remove('{}/latest'.format(base_path))
    except OSError:
        pass
    try:
        # create new symlink
        os.symlink(chain, '{}/latest'.format(base_path))
    except OSError as exc:
        self.logger.fatal(
            'Unable to create symlink to latest version of services: <{path}/latest>.',
            path=base_path)
        raise exc

    # remove old data
    service_paths = glob.glob('{}/*.pkl'.format(chain_path))
    try:
        for path in service_paths:
            os.remove(path)
    except Exception as exc:
        self.logger.fatal('Unable to remove previously persisted process services.')
        raise exc

    # persist services
    for cls in self.get_services():
        self.service(cls).persist_in_file('{0:s}/{1!s}.pkl'.format(chain_path, cls))
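# Usage sketch for persist_services (not taken from the source): assuming the
# standard ConfigObject service, the call pickles every registered process
# service under the per-chain directory and refreshes the 'latest' symlink.
# The chain name 'Data' and the top-level imports are illustrative assumptions
# (adjust the import style to your Eskapade version).
from eskapade import ProcessManager, ConfigObject

proc_mgr = ProcessManager()
settings = proc_mgr.service(ConfigObject)

# writes one <ServiceClass>.pkl per registered service under
# .../proc_service_data/_Data and points .../proc_service_data/latest at it
proc_mgr.persist_services(settings.io_conf(), chain='Data')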
def execute_macro(self, filename, copyfile=True):
    """Execute an input python configuration file.

    A copy of the configuration file is stored for bookkeeping purposes.

    :param str filename: the path of the python configuration file
    :param bool copyfile: back up the macro for bookkeeping purposes
    :raise Exception: if input configuration file cannot be found
    """
    if not os.path.isfile(filename):
        raise Exception('ERROR. Configuration macro \'{}\' not found.'.format(filename))
    with open(filename) as macro_file:
        exec(compile(macro_file.read(), filename, 'exec'))

    # make copy of macro for bookkeeping purposes
    settings = self.service(ConfigObject)
    if not settings.get('doNotStoreResults') and copyfile:
        import shutil
        shutil.copy(filename, persistence.io_dir('results_config'))
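# Hypothetical call site for execute_macro (not from the source); the
# 'tutorials/' path prefix is an assumption, the macro file name comes from
# the tests above.
from eskapade import ProcessManager

proc_mgr = ProcessManager()
proc_mgr.execute_macro('tutorials/esk603_write_spark_data_to_csv.py')
# with copyfile=True (the default) a copy of the macro is stored in results_config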
def initialize(self):
    """Initialize the TruncExpFit execution"""

    # check input arguments
    self.check_arg_types(read_key=str, max_var_data_key=str, model_name=str, results_path=str)
    self.check_arg_vals('read_key', 'model_name')

    # create process-manager and service instances
    proc_mgr = ProcessManager()
    settings = proc_mgr.service(ConfigObject)
    rfm = proc_mgr.service(RooFitManager)

    # check if model exists
    model = rfm.model(self.model_name)
    if not model:
        self.log().warning(
            'Model "{}" does not exist; creating with default values'.format(self.model_name))
        model = rfm.model(self.model_name, model_cls=TruncExponential)

    # check if model PDF has been built
    if not model.is_built:
        model.build_model()

    # process command arguments for fit function
    self._fit_cmd_args = create_roofit_opts(
        'fit', ConditionalObservables=model.max_var_set, **self.kwargs)

    # get path to results directory
    if not self.results_path:
        self.results_path = persistence.io_dir('results_data', settings.io_conf())
    persistence.create_dir(self.results_path)

    return StatusCode.Success
def test_esk610(self):
    """Test Esk-610: Spark Streaming word count"""

    # this test relies on linux shell scripts to create file stream
    if (sys.platform != 'linux') and (sys.platform != 'darwin'):
        print('skipping test_esk610 for non-unix {} platform'.format(sys.platform))
        return

    # check if running in local mode
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''), r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')

    # create test dir
    tmpdir = '/tmp/eskapade_stream_test'
    os.mkdir(tmpdir)

    # create a file stream
    tmpfile = ''.join(random.choice(string.ascii_lowercase) for x in range(8))
    cmd = 'for i in $(seq -f \"%05g\" 0 1000); \
           do echo \'Hello world\' > "{}"/"{}"_$i.dummy; \
           sleep 1; done'.format(tmpdir, tmpfile)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # run eskapade
    self.run_eskapade('esk610_spark_streaming_wordcount.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # end file stream
    p.kill()

    # check if file stream was properly executed
    stdout, stderr = p.communicate()
    self.assertEqual(stdout, b'', 'unexpected stdout output {}'.format(stdout))
    self.assertEqual(stderr, b'', 'unexpected stderr output {}'.format(stderr))

    # check if stream was set up correctly (that's all we can do - the data itself is gone)
    self.assertIsInstance(ds['dstream'], pyspark.streaming.DStream)

    # read and check output data
    results_data_path = persistence.io_dir(
        'results_data', proc_mgr.service(ConfigObject).io_conf())
    names = []
    contents = []
    csv_dirs = glob.glob('{}/dstream/wordcount-*.txt'.format(results_data_path))
    self.assertGreater(len(csv_dirs), 0, 'expected to find CSV output directories')
    for csv_dir in csv_dirs:
        names.append(os.path.basename(csv_dir))
        csv_files = glob.glob('{}/part*'.format(csv_dir))
        # self.assertEqual(len(csv_files), 1, 'expected to find exactly one CSV file in "{}"'.format(names[-1]))
        if len(csv_files) > 0:
            with open(csv_files[0]) as csv_file:
                record = [l for l in csv_file]
            if record != []:
                # empty records are allowed (because of timing differences)
                self.assertRegexpMatches(
                    record[0], 'Hello', 'Expected \'Hello\' as in \'Hello world\'')
                self.assertRegexpMatches(
                    record[1], 'world', 'Expected \'world\' as in \'Hello world\'')
                contents.append(record[:])
    self.assertGreater(
        len(contents), 0,
        'expected ~ten items (each second a streaming RDD) - depending on timing')

    # clean up files
    shutil.rmtree(tmpdir)
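# Minimal stand-alone sketch of the word-count pattern this test exercises
# (an approximation for orientation, not the esk610 macro's actual links):
# read a directory-based file stream, count words per 1-second batch, and
# save each batch as a wordcount-<timestamp>.txt directory. The output path
# is illustrative only.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext('local[*]', 'wordcount_sketch')
ssc = StreamingContext(sc, batchDuration=1)

lines = ssc.textFileStream('/tmp/eskapade_stream_test')  # directory filled by the shell loop above
counts = (lines.flatMap(lambda line: line.split(' '))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))
counts.saveAsTextFiles('/tmp/eskapade_stream_test_results/wordcount', 'txt')

ssc.start()
ssc.awaitTerminationOrTimeout(30)
ssc.stop(stopSparkContext=True, stopGraceFully=True)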
def _process_results_path(self):
    """Process results_path argument."""
    if not self.results_path:
        self.results_path = persistence.io_dir('results_data')
    persistence.create_dir(self.results_path)
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk603_write_spark_data_to_csv'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = proc_mgr.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# --- CSV and data settings

output_dir = 'file:' + persistence.io_dir('results_data', settings.io_conf())
num_files = 1
separator = ','
write_header = True
columns = ['index', 'foo', 'bar']
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]

##########################################################################
# --- Spark data

ds = proc_mgr.service(DataStore)
ds['rdd'] = spark.sparkContext.parallelize(rows)
ds['df'] = spark.createDataFrame(ds['rdd'], schema=columns)
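##########################################################################
# --- for orientation: a plain-PySpark sketch of the CSV write that the
#     macro's subsequent links perform with the settings above (this is an
#     approximation, not the macro's actual link configuration; the 'df'
#     subdirectory name is illustrative)

(ds['df']
    .repartition(num_files)            # num_files = 1 -> one part file per output directory
    .write
    .csv('{}/df'.format(output_dir),
         sep=separator,
         header=write_header,
         mode='overwrite'))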
def import_services(self, io_conf, chain=None, force=None, no_force=None):
    """Import process services from files.

    :param dict io_conf: I/O config as returned by ConfigObject.io_conf
    :param str chain: name of chain for which data was persisted
    :param force: force import if service already registered
    :type force: bool or list
    :param list no_force: do not force import of services in this list
    """
    no_force = no_force or []

    # parse I/O config
    io_conf = ConfigObject.IoConfig(**io_conf)

    # get services for which import may be forced
    force_set = set()
    if force:
        try:
            # check if an iterable of forced services was provided
            force_set.update(force)
        except TypeError:
            # force all services if "force" was provided, but is not iterable
            force_set.update(self.get_services())
    force_set -= set(no_force)

    # parse specified chain
    if chain:
        # prepend underscore for output directory
        chain = '_{}'.format(chain)
    else:
        # use data from latest chain if not specified
        chain = 'latest'

    # get list of persisted files
    base_path = persistence.io_dir('proc_service_data', io_conf)
    service_paths = glob.glob('{0:s}/{1:s}/*.pkl'.format(base_path, chain))
    self.logger.debug(
        'Importing process services from "{path}/{chain}" (found {n:d} files).',
        path=base_path, chain=chain, n=len(service_paths))

    # read and register services
    for path in service_paths:
        try:
            # try to import service module
            cls_spec = os.path.splitext(os.path.basename(path))[0].split('.')
            mod = importlib.import_module('.'.join(cls_spec[:-1]))
            cls = getattr(mod, cls_spec[-1])
        except Exception as exc:
            # unable to import module
            self.logger.error(
                'Unable to import process-service module for path "{path}".', path=path)
            raise exc

        # check if service is already registered
        if cls in self.get_services():
            if cls in force_set:
                # remove old service instance if import is forced
                self.remove_service(cls)
            else:
                # skip import if not forced
                self.logger.debug(
                    'Service "{cls!s}" already registered; skipping import of "{path}".',
                    cls=cls, path=path)
                continue

        # read service instance from file
        inst = cls.import_from_file(path)
        if inst:
            self.service(inst)
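# Usage sketch for import_services (not taken from the source): re-import the
# services persisted for a chain, forcing re-registration of everything except
# the DataStore. The chain name 'Data', the force/no_force choices, and the
# top-level import style are illustrative assumptions.
from eskapade import ProcessManager, ConfigObject, DataStore

proc_mgr = ProcessManager()
settings = proc_mgr.service(ConfigObject)
proc_mgr.import_services(settings.io_conf(), chain='Data', force=True, no_force=[DataStore])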
##########################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk603_write_spark_data_to_csv'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# --- CSV and data settings

output_dir = 'file:' + persistence.io_dir('results_data')
num_files = 1
separator = ','
write_header = True
columns = ['index', 'foo', 'bar']
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]

##########################################################################
# --- Spark data

ds = process_manager.service(DataStore)
ds['rdd'] = spark.sparkContext.parallelize(rows)
ds['df'] = spark.createDataFrame(ds['rdd'], schema=columns)

##########################################################################
# --- now set up the chains and links based on configuration flags