Example #1
    def test_esk603(self):
        """Test Esk-603: Write Spark data to CSV"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk603_write_spark_data_to_csv.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # read output data
        results_data_path = persistence.io_dir(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf())
        names = []
        headers = []
        contents = []
        csv_dirs = glob.glob('{}/*'.format(results_data_path))
        self.assertEqual(len(csv_dirs), 3,
                         'expected to find three CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            self.assertEqual(
                len(csv_files), 1,
                'expected to find only one CSV file in "{}"'.format(names[-1]))
            with open(csv_files[0]) as csv:
                contents.append([l.strip().split(',') for l in csv])
                self.assertEqual(
                    len(contents[-1]), 101,
                    'unexpected number of lines in "{}" CSV'.format(names[-1]))
                headers.append(contents[-1][0])
                contents[-1] = sorted(contents[-1][1:])

        # check output data
        self.assertListEqual(headers[0], ['index', 'foo', 'bar'],
                             'unexpected CSV header for "{}"'.format(names[0]))
        self.assertListEqual(
            contents[0],
            sorted([str(it), 'foo{:d}'.format(it),
                    str((it + 1) / 2.)] for it in range(100)),
            'unexpected CSV content for "{}"'.format(names[0]))
        for name, head, cont in zip(names[1:], headers[1:], contents[1:]):
            self.assertListEqual(
                head, headers[0],
                'CSV header of "{0:s}" differs from header of "{1:s}"'.format(
                    name, names[0]))
            self.assertListEqual(
                cont, contents[0],
                'CSV content of "{0:s}" differs from content of "{1:s}"'.
                format(name, names[0]))
Example #2
    def tearDown(self):
        """Tear down test"""

        # remove persisted results for this test
        path = persistence.io_dir('ana_results')
        if os.path.exists(path):
            shutil.rmtree(path)

        # reset run process
        execution.reset_eskapade()
Example #3
    def tearDown(self):
        """Tear down test"""

        # remove persisted results for this test
        path = persistence.io_dir(
            'ana_results',
            ProcessManager().service(ConfigObject).io_conf())
        if os.path.exists(path):
            shutil.rmtree(path)

        # reset run process
        execution.reset_eskapade()
Example #4
    def persist_services(self, io_conf, chain=None):
        """Persist process services in files.

        :param dict io_conf: I/O config as returned by ConfigObject.io_conf
        :param str chain: name of chain for which data is persisted
        """
        # parse I/O config
        io_conf = ConfigObject.IoConfig(**io_conf)

        # parse specified chain
        if chain:
            # prepend underscore for output directory
            chain = '_{}'.format(chain)
        else:
            # use default directory if chain not specified
            chain = 'default'

        # get chain path and set link of latest data
        base_path = persistence.io_dir('proc_service_data', io_conf)
        chain_path = '{0:s}/{1:s}'.format(base_path, chain)
        persistence.create_dir(chain_path)
        self.logger.debug('Persisting process services in "{path}".',
                          path=chain_path)
        try:
            # remove old symlink
            os.remove('{}/latest'.format(base_path))
        except OSError:
            pass
        try:
            # create new symlink
            os.symlink(chain, '{}/latest'.format(base_path))
        except OSError as exc:
            self.logger.fatal(
                'Unable to create symlink to latest version of services: <{path}/latest>.',
                path=base_path)
            raise exc

        # remove old data
        service_paths = glob.glob('{}/*.pkl'.format(chain_path))
        try:
            for path in service_paths:
                os.remove(path)
        except Exception as exc:
            self.logger.fatal(
                'Unable to remove previously persisted process services.')
            raise exc

        # persist services
        for cls in self.get_services():
            self.service(cls).persist_in_file('{0:s}/{1!s}.pkl'.format(
                chain_path, cls))
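A minimal usage sketch, not part of the example above: how persist_services might be called from a configured process manager (the chain name 'Data' is hypothetical).

# hedged sketch: persist all registered process services for a chain
proc_mgr = ProcessManager()
settings = proc_mgr.service(ConfigObject)

# 'Data' is an illustrative chain name; pickles are written to the
# '_Data' sub-directory of the proc_service_data results directory,
# and the 'latest' symlink is pointed at it
proc_mgr.persist_services(settings.io_conf(), chain='Data')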
Example #5
    def execute_macro(self, filename, copyfile=True):
        """Execute an input python configuration file.

        A copy of the configuration file is stored for bookkeeping purposes.

        :param str filename: the path of the python configuration file
        :param bool copyfile: back up the macro for bookkeeping purposes
        :raise Exception: if input configuration file cannot be found
        """
        if not os.path.isfile(filename):
            raise Exception(
                'ERROR. Configuration macro \'{}\' not found.'.format(
                    filename))
        exec(compile(open(filename).read(), filename, 'exec'))

        # make copy of macro for bookkeeping purposes
        settings = self.service(ConfigObject)
        if not settings.get('doNotStoreResults') and copyfile:
            import shutil
            shutil.copy(filename, persistence.io_dir('results_config'))
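A brief usage sketch, assuming a process manager instance; the macro path below is illustrative only.

# hedged sketch: run a configuration macro and, unless doNotStoreResults
# is set, store a copy of it under the results_config directory
proc_mgr = ProcessManager()
proc_mgr.execute_macro('macros/esk101_helloworld.py', copyfile=True)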
Example #6
    def initialize(self):
        """Inititialize the TruncExpFit execution"""

        # check input arguments
        self.check_arg_types(read_key=str,
                             max_var_data_key=str,
                             model_name=str,
                             results_path=str)
        self.check_arg_vals('read_key', 'model_name')

        # create process-manager and service instances
        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        rfm = proc_mgr.service(RooFitManager)

        # check if model exists
        model = rfm.model(self.model_name)
        if not model:
            self.log().warning(
                'Model "{}" does not exist; creating with default values'.
                format(self.model_name))
            model = rfm.model(self.model_name, model_cls=TruncExponential)

        # check if model PDF has been built
        if not model.is_built:
            model.build_model()

        # process command arguments for fit function
        self._fit_cmd_args = create_roofit_opts(
            'fit', ConditionalObservables=model.max_var_set, **self.kwargs)

        # get path to results directory
        if not self.results_path:
            self.results_path = persistence.io_dir('results_data',
                                                   settings.io_conf())
        persistence.create_dir(self.results_path)

        return StatusCode.Success
Example #7
    def test_esk610(self):
        """Test Esk-610: Spark Streaming word count"""

        # this test relies on linux shell scripts to create file stream
        if (sys.platform != 'linux') and (sys.platform != 'darwin'):
            print('skipping test_esk610 for non-unix {} platform'.format(
                sys.platform))
            return

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # create test dir
        tmpdir = '/tmp/eskapade_stream_test'
        os.mkdir(tmpdir)

        # create a file stream
        tmpfile = ''.join(
            random.choice(string.ascii_lowercase) for x in range(8))
        cmd = 'for i in $(seq -f \"%05g\" 0 1000); \
                do echo \'Hello world\' > "{}"/"{}"_$i.dummy; \
                        sleep 1; done'.format(tmpdir, tmpfile)
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        # run eskapade
        self.run_eskapade('esk610_spark_streaming_wordcount.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # end file stream
        p.kill()

        # check if file stream was properly executed
        stdout, stderr = p.communicate()
        self.assertEqual(stdout, b'',
                         'unexpected stdout output {}'.format(stdout))
        self.assertEqual(stderr, b'',
                         'unexpected stderr output {}'.format(stderr))

        # check if stream was setup correctly (that's all we can do - the data itself is gone)
        self.assertIsInstance(ds['dstream'], pyspark.streaming.DStream)

        # read and check output data
        results_data_path = persistence.io_dir(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf())
        names = []
        contents = []
        csv_dirs = glob.glob(
            '{}/dstream/wordcount-*.txt'.format(results_data_path))
        self.assertGreater(len(csv_dirs), 0,
                           'expected to find CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            #self.assertEqual(len(csv_files), 1, 'expected to find exactly one CSV file in "{}"'.format(names[-1]))
            if len(csv_files) > 0:
                with open(csv_files[0]) as csv:
                    record = [l for l in csv]
                    if record != []:  # empty records are allowed (because of timing differences)
                        self.assertRegexpMatches(
                            record[0], 'Hello',
                            'Expected \'Hello\' as in \'Hello world\'')
                        self.assertRegexpMatches(
                            record[1], 'world',
                            'Expected \'world\' as in \'Hello world\'')
                    contents.append(record[:])
        self.assertGreater(
            len(contents), 0,
            'expected ~ten items (each second a streaming RDD) - depending on timing'
        )

        # clean up files
        shutil.rmtree(tmpdir)
Example #8
    def _process_results_path(self):
        """Process results_path argument."""
        if not self.results_path:
            self.results_path = persistence.io_dir('results_data')
        persistence.create_dir(self.results_path)
Example #9
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk603_read_csv_to_spark_df'
settings['version'] = 0


##########################################################################
# --- start Spark session

spark = proc_mgr.service(SparkManager).create_session(eskapade_settings=settings)


##########################################################################
# --- CSV and data settings

output_dir = 'file:' + persistence.io_dir('results_data', settings.io_conf())
num_files = 1
separator = ','
write_header = True
columns = ['index', 'foo', 'bar']
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]


##########################################################################
# --- Spark data

ds = proc_mgr.service(DataStore)
ds['rdd'] = spark.sparkContext.parallelize(rows)
ds['df'] = spark.createDataFrame(ds['rdd'], schema=columns)

Example #10
    def import_services(self, io_conf, chain=None, force=None, no_force=None):
        """Import process services from files.

        :param dict io_conf: I/O config as returned by ConfigObject.io_conf
        :param str chain: name of chain for which data was persisted
        :param force: force import if service already registered
        :type force: bool or list
        :param list no_force: do not force import of services in this list
        """
        no_force = no_force or []

        # parse I/O config
        io_conf = ConfigObject.IoConfig(**io_conf)

        # get services for which import may be forced
        force_set = set()
        if force:
            try:
                # check if an iterable of forced services was provided
                force_set.update(force)
            except TypeError:
                # force all services if "force" was provided, but is not iterable
                force_set.update(self.get_services())
        force_set -= set(no_force)

        # parse specified chain
        if chain:
            # prepend underscore for output directory
            chain = '_{}'.format(chain)
        else:
            # use data from latest chain if not specified
            chain = 'latest'

        # get list of persisted files
        base_path = persistence.io_dir('proc_service_data', io_conf)
        service_paths = glob.glob('{0:s}/{1:s}/*.pkl'.format(base_path, chain))
        self.logger.debug(
            'Importing process services from "{path}/{chain}" (found {n:d} files).',
            path=base_path,
            chain=chain,
            n=len(service_paths))

        # read and register services
        for path in service_paths:
            try:
                # try to import service module
                cls_spec = os.path.splitext(
                    os.path.basename(path))[0].split('.')
                mod = importlib.import_module('.'.join(cls_spec[:-1]))
                cls = getattr(mod, cls_spec[-1])
            except Exception as exc:
                # unable to import module
                self.logger.error(
                    'Unable to import process-service module for path "{path}".',
                    path=path)
                raise exc

            # check if service is already registered
            if cls in self.get_services():
                if cls in force_set:
                    # remove old service instance if import is forced
                    self.remove_service(cls)
                else:
                    # skip import if not forced
                    self.logger.debug(
                        'Service "{cls!s}" already registered; skipping import of "{path}"',
                        cls=cls,
                        path=path)
                    continue

            # read service instance from file
            inst = cls.import_from_file(path)
            if inst:
                self.service(inst)
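A minimal sketch of the complementary import call, assuming services were previously persisted for a chain named 'Data' (hypothetical), forcing re-import of everything except the DataStore.

# hedged sketch: restore persisted services, replacing already-registered
# instances, but never forcing the DataStore to be overwritten
proc_mgr = ProcessManager()
settings = proc_mgr.service(ConfigObject)
proc_mgr.import_services(settings.io_conf(), chain='Data',
                         force=True, no_force=[DataStore])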
Example #11
##########################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk603_read_csv_to_spark_df'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# --- CSV and data settings

output_dir = 'file:' + persistence.io_dir('results_data')
num_files = 1
separator = ','
write_header = True
columns = ['index', 'foo', 'bar']
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]

##########################################################################
# --- Spark data

ds = process_manager.service(DataStore)
ds['rdd'] = spark.sparkContext.parallelize(rows)
ds['df'] = spark.createDataFrame(ds['rdd'], schema=columns)

##########################################################################
# --- now set up the chains and links based on configuration flags
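The chain and link setup is not shown in this fragment. As a rough illustration only, the intended write of ds['df'] to CSV can be sketched with the plain PySpark DataFrameWriter API using the settings defined above; this is an assumption about the output, not the macro's actual link-based code.

# hedged sketch: write the DataFrame as a single CSV part file with a header
ds['df'].coalesce(num_files).write.csv(output_dir,
                                       sep=separator,
                                       header=write_header,
                                       mode='overwrite')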