def test_execute(self):
    """Check that DsToDs moves a key-value pair to a new datastore key."""
    from eskapade import ProcessManager, DataStore
    from eskapade.core_ops.links import DsToDs

    # seed the datastore with a single test value
    data_store = ProcessManager().service(DataStore)
    data_store['test'] = 1

    # configure the link to move 'test' -> 'moved_test' and run it
    link = DsToDs()
    link.readKey = 'test'
    link.storeKey = 'moved_test'
    link.execute()

    # the value must now live under the new key only
    self.assertIn('moved_test', list(data_store.keys()), 'new key not in datastore')
    self.assertNotIn('test', list(data_store.keys()), 'old key still in datastore')
    self.assertEqual(data_store['moved_test'], 1, 'key-value pair not consistent')
def test_execute(self):
    """Check that ApplyFuncToDf stores its output and adds configured columns."""
    from eskapade import ProcessManager, DataStore
    from eskapade.analysis import ApplyFuncToDf

    # --- setup a dummy data frame
    input_df = pd.DataFrame({
        'a': ['aap', 'noot', 'mies'],
        'b': [0, 1, 2],
        'c': [0, 1, 1],
        'd': [1, 'a', None],
    })

    # --- setup datastore
    data_store = ProcessManager().service(DataStore)
    data_store['test_input'] = input_df

    # --- setup the link
    link = ApplyFuncToDf()
    link.add_columns = {'foo': 'bar'}
    link.read_key = 'test_input'
    link.store_key = 'test_output'
    link.execute()

    # --- the actual detests
    # stored at all?
    self.assertIn('test_output', list(data_store.keys()), 'DataFrame not stored')
    # added a column?
    self.assertIn('foo', data_store['test_output'].columns, 'Column not added to DataFrame')
def execute(self):
    """ Execute WriteFromDf

    Pick up the dataframe(s) configured in self.dictionary and write each
    to disk at its associated path.

    :returns: status code of execution
    :rtype: StatusCode
    :raises RuntimeError: if the directory of an output path does not exist
    """
    ds = ProcessManager().service(DataStore)

    # check that all dataframes are present
    assert all(
        k in list(ds.keys()) for k in list(self.dictionary.keys())), 'key(s) not in DataStore.'

    # check that all ds items are dataframes
    assert all(isinstance(ds[k], pd.DataFrame) for k in list(self.dictionary.keys())), \
        'key(s) is not a pandas DataFrame.'

    # collect writer and store the dataframes
    for k in list(self.dictionary.keys()):
        df = ds[k]
        path = self.dictionary[k]
        if self.add_counter_to_name:
            # inject the execution counter before the file extension
            ps = os.path.splitext(path)
            path = ps[0] + '_' + str(self._counter) + ps[1]
        writer = pandasWriter(path, self.writer)
        folder = os.path.dirname(path)
        self.log().debug('Checking for directory: %s', folder)
        # an empty dirname means the current working directory, which always
        # exists -- only validate an explicitly given directory
        if folder and not os.path.exists(folder):
            self.log().fatal('Path given is invalid.')
            # fail fast: previously execution fell through to the write,
            # which then failed with an obscure I/O error
            raise RuntimeError('output directory of path "%s" does not exist' % path)
        self.log().debug('Writing file: %s', path)
        writer(df, path, **self.kwargs)
    self._counter += 1
    return StatusCode.Success
def execute(self):
    """Execute SparkExecuteQuery

    Register every DataStore object as a temporary SQL view, run the
    configured SQL query on the Spark session, optionally convert the
    result (RDD of tuples or Pandas dataframe), and store it under
    self.store_key.
    """
    self.log().debug('Applying following SQL-query to object(s) in DataStore: {0:s}'.format(self.query))

    ds = ProcessManager().service(DataStore)

    # register all objects in DataStore as SQL temporary views
    # NOTE(review): this assumes every datastore object is a Spark dataframe
    for ds_key in ds.keys():
        ds[ds_key].createOrReplaceTempView(ds_key)

    # get existing SparkSession
    spark = ProcessManager().service(spark_analysis.SparkManager).get_session()

    # apply SQL-query to temporary view(s)
    result = spark.sql(self.query)

    # store dataframe schema
    self.schema = result.schema

    # convert to different data format if required
    if self.output_format == 'rdd':
        # convert to RDD of tuples
        result = result.rdd.map(tuple)
    elif self.output_format == 'pd':
        # convert to Pandas dataframe
        result = result.toPandas()

    ds[self.store_key] = result

    return StatusCode.Success
def execute(self):
    """Execute DsObjectDeleter

    Remove objects from the datastore, in four modes:

    1. delete the keys listed in self.deletionKeys
    2. delete objects whose type matches self.deletionClasses
    3. keep only the keys listed in self.keepOnly (if non-empty)
    4. clear the entire datastore when self.clearAll is set

    In TESTING mode the datastore is left untouched.

    :returns: status code of execution
    :rtype: StatusCode
    """
    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    # used in code testing only
    if settings.get('TESTING'):
        self.log().warning(
            'Running in TESTING mode. NOT clearing datastore for testing purposes.'
        )
        return StatusCode.Success

    # delete specific items
    for key in self.deletionKeys:
        if key in ds:
            self.log().debug('Now deleting datastore object with key "%s"', key)
            del ds[key]

    # delete specific class types
    # iterate over a snapshot of the keys: deleting while iterating the
    # datastore itself raises "dictionary changed size during iteration"
    for cls in self.deletionClasses:
        for key in list(ds.keys()):
            if isinstance(ds[key], cls):
                self.log().debug(
                    'Now deleting datastore object with key "%s"', key)
                del ds[key]

    # delete all but specific items
    if len(self.keepOnly):
        for key in list(ds.keys()):
            if key not in self.keepOnly:
                self.log().debug(
                    'Now deleting datastore object with key "%s"', key)
                del ds[key]

    # delete all items in datastore
    if self.clearAll:
        for key in list(ds.keys()):
            self.log().debug('Now deleting datastore object with key "%s"', key)
            del ds[key]

    return StatusCode.Success
def test_execute(self):
    """Check that AssertInDs passes when all required keys are present."""
    from eskapade import ProcessManager, DataStore
    from eskapade.core_ops.links import AssertInDs

    # fill the datastore with three small dataframes
    data_store = ProcessManager().service(DataStore)
    for idx in (1, 2, 3):
        data_store['test%d' % idx] = pd.DataFrame([idx], columns=['data'])

    # run the full link life-cycle; it should not raise
    link = AssertInDs()
    link.keySet = ['test1', 'test2', 'test3']
    link.initialize()
    link.execute()
    link.finalize()

    # There is no output to test against.
    self.assertIn('test1', list(data_store.keys()), 'dataframe not in datastore')
    self.assertIn('test2', list(data_store.keys()), 'dataframe not in datastore')
    self.assertIn('test3', list(data_store.keys()), 'dataframe not in datastore')
def execute(self):
    """ Execute AssignRandomClass

    Randomly assign each record of the input dataframe to one of
    self.nclasses classes, writing the class index into a new column
    self.column. Class sizes come from self.nevents, or are derived
    from self.fractions; each class size is clipped so the total never
    exceeds the dataframe length.

    :returns: status code of execution
    :rtype: StatusCode
    :raises Exception: if the input is not a DataFrame or the target
        column already exists
    """
    ds = ProcessManager().service(DataStore)

    # basic checks on contensts of the data frame
    assert self.readKey in list(
        ds.keys()), 'Key %s not in DataStore.' % self.readKey
    df = ds[self.readKey]
    if not isinstance(df, DataFrame):
        raise Exception('Retrieved object not of type pandas DataFrame.')
    ndf = len(df.index)
    assert ndf > 0, 'dataframe %s is empty.' % self.readKey
    if self.column in df.columns:
        raise Exception(
            'Column name <%s> already used: <%s>. Will not overwrite.' %
            (self.column, str(df.columns)))

    # fix final number of events assigned per random class
    # ... each class gets at least one event
    if self.nevents is not None:
        if len(self.nevents) == self.nclasses - 1:
            # last class gets the remainder
            self.nevents.append(ndf - sum(n for n in self.nevents))
    if self.nevents is None:
        self.nevents = [int(ndf * f) for f in self.fractions]
    # clip class sizes so the cumulative total never exceeds ndf
    for i in range(self.nclasses):
        nsum = sum(n for n in self.nevents[:i + 1])
        ndiff = 0 if (nsum - ndf < 0) else (nsum - ndf)
        self.nevents[i] -= ndiff
        if self.nevents[i] < 0:
            self.nevents[i] = 0
    for i, n in enumerate(self.nevents):
        assert n >= 0, 'Random class <%d> assigned nevents <%d> needs to be greater than zero. %s' % \
            (i, n, str(self.nevents))
        self.log().info('Random class <%d> assigned n events <%d>.' % (i, n))

    # random reshuffling of dataframe indices
    settings = ProcessManager().service(ConfigObject)
    RNG = RandomState(settings['seed'])
    permute = RNG.permutation(df.index)

    # apply the random reshuffling, and assign records to the n classes
    df[self.column] = 0
    for i in range(self.nclasses):
        ib = sum(n for n in self.nevents[:i])
        ie = sum(n for n in self.nevents[:i + 1])
        # df.ix was removed in pandas 1.0; permute holds index labels,
        # so label-based .loc is the correct replacement
        df.loc[permute[ib:ie], self.column] = i

    return StatusCode.Success
def execute(self):
    """Create a report of the data frame variables

    * create statistics object for column
    * create overview table of column variable
    * plot histogram of column variable
    * store plot

    :returns: execution status code
    :rtype: StatusCode
    :raises RuntimeError: if no histogram dictionary is found in the datastore
    """
    # fetch and check input data frame
    hist_dict = ProcessManager().service(DataStore).get(
        self.read_key, None)
    if not isinstance(hist_dict, dict):
        self.log().critical(
            'no histograms "%s" found in data store for %s',
            self.read_key, str(self))
        raise RuntimeError('no input data found for %s' % str(self))
    if self.hist_keys is None:
        # materialize the keys: a live dict view would silently change if
        # the histogram dictionary is mutated later
        self.hist_keys = list(hist_dict.keys())

    # create report page for histogram
    self.pages = []
    for name in self.hist_keys:
        # histogram name
        self.log().info('processing histogram "%s"', name)

        # check if histogram is in dict
        if name not in hist_dict:
            self.log().warning('histogram "%s" not in dictionary "%s"',
                               name, self.read_key)
            continue
        hist = hist_dict[name]

        # process each histogram (plot and make summary table)
        if hist.n_dim == 1:
            self.process_1d_histogram(name, hist)
        elif hist.n_dim == 2:
            self.process_2d_histogram(name, hist)

    # write out accumulated histogram statistics into report file
    with open('{}/report.tex'.format(self.results_path), 'w') as report_file:
        report_file.write(
            self.report_template.replace('INPUT_PAGES', ''.join(self.pages)))

    return StatusCode.Success
def test_esk104(self):
    """Run tutorial esk104 and check the resulting datastore content."""
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings[
        'esRoot'] + '/tutorials/esk104_basic_datastore_operations.py'

    status = execution.run_eskapade(settings)

    # re-fetch the process services after the run
    pm = ProcessManager()
    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    self.assertTrue(status.isSuccess())
    # exactly one object should remain, under key 'a'
    self.assertEqual(1, len(ds.keys()))
    self.assertEqual(1, ds['a'])
def test_esk107(self):
    """Run tutorial esk107 and check the chain-looper repeat count."""
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings[
        'macro'] = settings['esRoot'] + '/tutorials/esk107_chain_looper.py'

    status = execution.run_eskapade(settings)

    # re-fetch the process services after the run
    pm = ProcessManager()
    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    # chain is repeated 10 times, with nothing put in datastore
    self.assertTrue(status.isSuccess())
    self.assertEqual(0, len(ds.keys()))
    self.assertEqual(10, pm.chains[0].links[1].maxcount)
def test_esk110(self):
    """Run tutorial esk110 and check the code-profiling settings."""
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings[
        'esRoot'] + '/tutorials/esk110_code_profiling.py'

    status = execution.run_eskapade(settings)

    # re-fetch the process services after the run
    pm = ProcessManager()
    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    self.assertTrue(status.isSuccess())
    # profiling run leaves no chains and an empty datastore
    self.assertEqual(0, len(pm.chains))
    self.assertEqual(0, len(ds.keys()))
    self.assertTrue('doCodeProfiling' in settings)
    self.assertEqual('cumulative', settings['doCodeProfiling'])
def test_esk105bc(self):
    """Run tutorials esk105 B then C and check chain persistence/pickup."""
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings[
        'esRoot'] + '/tutorials/esk105_B_store_each_chain.py'

    status = execution.run_eskapade(settings)

    # re-fetch the process services after the run
    pm = ProcessManager()
    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    # results of all three chains have been persisted
    self.assertTrue(status.isSuccess())
    path = '{0:s}/{1:s}/proc_service_data/v0/_chain{{:d}}/{2:s}.pkl'.format(
        settings['resultsDir'], settings['analysisName'], str(DataStore))
    for path_it in range(1, 4):
        self.assertTrue(os.path.exists(path.format(path_it)))

    execution.reset_eskapade()

    # second run starts at chain 3, picking up the persisted results
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings[
        'esRoot'] + '/tutorials/esk105_C_begin_at_chain3.py'

    status = execution.run_eskapade(settings)

    ds = ProcessManager().service(DataStore)

    # object from all three chains are present
    self.assertTrue(status.isSuccess())
    self.assertTrue('f' in ds)
    self.assertTrue('g' in ds)
    self.assertTrue('h' in ds)
    self.assertEqual(3, len(ds.keys()))
    self.assertEqual(7, ds['f']['n_favorite'])
    self.assertEqual(1, ds['g']['a'])
    self.assertEqual(7, ds['h'][1])
def execute(self):
    """Execute link

    Apply each function specification in self.apply_funcs to the dataframe
    stored under self.read_key. Each specification is a dict that must
    contain 'func' and may contain 'args'/'kwargs' plus dispatch keys:
    'groupby' (group-wise apply), 'storekey' (store result separately),
    or 'colout' (write result into a column). 'colin' selects input
    column(s); 'entire' passes the whole dataframe to the function.
    The (possibly modified) dataframe is stored back under self.store_key,
    or under self.read_key if no store key is set.

    :returns: status code of execution
    :rtype: StatusCode
    """
    ds = ProcessManager().service(DataStore)
    assert self.read_key in list(ds.keys()), 'key <%s> not in DataStore.' % self.read_key
    df = ds[self.read_key]
    for arr in self.apply_funcs:
        # get func input
        keys = list(arr.keys())
        assert 'func' in keys, 'function input is insufficient.'
        func = arr['func']
        self.log().debug('Applying function %s' % str(func))
        args = ()
        kwargs = {}
        if 'kwargs' in keys:
            kwargs = arr['kwargs']
        if 'args' in keys:
            args = arr['args']
        # apply func -- dispatch order matters: groupby wins over storekey,
        # which wins over the default column-output mode
        if 'groupby' in keys:
            groupby = arr['groupby']
            if 'groupbyColout' in keys:
                # forwarded to self.groupbyapply as a keyword argument
                kwargs['groupbyColout'] = arr['groupbyColout']
            df = self.groupbyapply(df, groupby, func, *args, **kwargs)
        elif 'storekey' in keys:
            # result is stored as a separate datastore object,
            # the working dataframe itself is left unchanged
            if 'entire' in keys:
                # function receives the whole dataframe
                result = func(df, *args, **kwargs)
            elif 'colin' in keys:
                # function is applied element-wise to one input column
                colin = arr['colin']
                assert colin in df.columns
                result = df[colin].apply(func, args=args, **kwargs)
            else:
                # function is applied over the dataframe (column-wise)
                result = df.apply(func, args=args, **kwargs)
            ds[arr['storekey']] = result
        else:
            # default mode: result is written into column 'colout'
            assert 'colout' in keys, 'function input is insufficient'
            colout = arr['colout']
            if 'entire' in keys:
                df[colout] = func(df, *args, **kwargs)
            elif 'colin' in keys:
                # 'colin' may be a single column name or a list of names
                colin = arr['colin']
                if isinstance(colin, list):
                    for c in colin:
                        assert c in df.columns
                else:
                    assert colin in df.columns
                df[colout] = df[colin].apply(func, args=args, **kwargs)
            else:
                df[colout] = df.apply(func, args=args, **kwargs)
    # add columns with constant values, if configured
    if self.add_columns is not None:
        for k, v in self.add_columns.items():
            df[k] = v
    # store result; fall back to overwriting the input key
    if self.store_key is None:
        ds[self.read_key] = df
    else:
        ds[self.store_key] = df
    return StatusCode.Success
def execute(self):
    """ Execute ApplySelectionToDf

    Applies queries or column selection to a pandas DataFrame.
    Input dataframe is not overwritten, unless told to do so in kwargs.

    1. Apply queries, in order of provided query list.
    2. Select columns (if provided).

    On failure, if self.continueIfFailure is set, an empty dataframe with
    the appropriate columns is stored instead; otherwise a ValueError is
    raised.

    :returns: status code of execution
    :rtype: StatusCode
    :raises ValueError: if a query or column selection fails and
        continueIfFailure is not set
    """
    ds = ProcessManager().service(DataStore)

    assert self.readKey in list(
        ds.keys()), 'Key %s not in DataStore.' % self.readKey
    assert isinstance(
        ds[self.readKey], pd.DataFrame
    ), 'Object with key %s is not a pandas DataFrame.' % self.readKey

    # sentinel: df stays None until a query or column selection produces one
    df = None

    # 1. apply queries to input dataframe, in order.
    # input dataframe is not overwritten, unless told to do so in kwargs.
    for query in self.querySet:
        # first query reads from the datastore, later ones chain on df
        source = ds[self.readKey] if df is None else df
        try:
            df = source.query(query, **self.kwargs)
        except Exception as exc:  # narrowed from bare except
            if not self.continueIfFailure:
                raise ValueError(
                    'Failed to apply query <%s> to dataframe <%s>.' %
                    (query, self.readKey)) from exc
            # best-effort mode: store an empty frame with the input columns
            # and skip any remaining queries
            df = pd.DataFrame(columns=(ds[self.readKey]).columns)
            break

    # 2. apply column selection to input dataframe.
    # input dataframe is not overwritten.
    if len(self.selectColumns):
        if df is None:
            df = (ds[self.readKey]).copy(deep=False)
        try:
            df = df[self.selectColumns]
        except Exception as exc:  # narrowed from bare except
            if not self.continueIfFailure:
                raise ValueError(
                    'Failed to select columns <%s> of dataframe <%s>.' %
                    (str(self.selectColumns), self.readKey)) from exc
            df = pd.DataFrame(columns=self.selectColumns)

    assert df is not None, 'No dataframe available for storage?'

    ds[self.storeKey] = df
    ds['n_' + self.storeKey] = len(df.index)
    self.log().info('Stored dataframe with key <%s> and length <%d>.'
                    % (self.storeKey, len(df.index)))
    return StatusCode.Success