def execute(self):
    """Execute MongoMoveCollection.

    Reads the documents of ``self.source_collection`` (restricted by
    ``self.filter`` when one is given), adds the constant columns listed in
    ``self.columnsToAdd`` and inserts the result into every collection in
    ``self.target_collections``. Unless ``self.copy`` is set, the documents
    are afterwards deleted from the source collection.

    ``self.filter`` may be None, a dict, or a datastore key (str) that holds
    a dict; a key is resolved and replaced by the dict it points to.

    :returns: status code of execution
    :rtype: StatusCode
    :raises AssertionError: if the filter key is not in the datastore or
        does not hold a dict
    :raises Exception: if the source collection does not exist, the filter
        has an unsupported type, or an insertion/deletion fails
    """
    mongo_conn = process_manager.service(MongoConnection)
    mongo_conn.set_config_info(process_manager.service(ConfigObject))
    self.mdb = mongo_conn.database

    # check if collection names are in database
    colls = self.mdb.collection_names()
    if self.source_collection not in colls:
        raise Exception("%s is not a collection in the mongo database" % self.source_collection)

    # resolve a filter that is given as a datastore key
    if self.filter is not None and not isinstance(self.filter, dict):
        if not isinstance(self.filter, str):
            raise Exception('Given filter of incorrect type.')
        filter_key = self.filter
        ds = process_manager.service(DataStore)
        # explicit raises instead of assert: the checks must survive ``python -O``
        if filter_key not in ds:
            raise AssertionError('Filter key <%s> not found in datastore.' % filter_key)
        resolved = ds[filter_key]
        if not isinstance(resolved, dict):
            # report the key, not the (non-dict) object it resolved to
            raise AssertionError('Filter with key <%s> is not a dict.' % filter_key)
        self.filter = resolved

    source = self.mdb[self.source_collection]
    data = source.find(self.filter) if self.filter is not None else source.find()
    df = pd.DataFrame(list(data))
    if len(df) == 0:
        self.logger.info('Source collection <{collection}> has zero length. Nothing to move.',
                         collection=self.source_collection)
        return StatusCode.Success

    if self.columnsToAdd is not None:
        for k, v in self.columnsToAdd.items():
            df[k] = v

    docs = list(df.T.to_dict().values())
    # selects exactly the documents handled in this call
    id_query = {'_id': {"$in": list(df._id)}}
    appliedstr = 'Copied' if self.copy else 'Moved'

    inserted = []
    for coll in self.target_collections:
        try:
            self.mdb[coll].insert_many(docs)
        except Exception as exc:
            # roll back the targets that were already filled
            for done in inserted:
                self.mdb[done].delete_many(id_query)
            raise Exception('Error in move: insertion in target collection %s failed' % coll) from exc
        inserted.append(coll)
        self.logger.info('{action} collection <{collection}> with length <{length}> to <{target}>.',
                         action=appliedstr, collection=self.source_collection,
                         length=len(docs), target=coll)

    if not self.copy:
        try:
            source.delete_many(id_query)
        except Exception as exc:
            # the move failed half-way: remove the copies from all targets again
            for done in inserted:
                self.mdb[done].delete_many(id_query)
            raise Exception('Error in move: deletion from source collection %s failed'
                            % self.source_collection) from exc

    return StatusCode.Success
def test_esk108reduce(self):
    """Run the esk108_reduce tutorial macro and check ``n_products``."""
    config = process_manager.service(ConfigObject)
    config['TESTING'] = True

    macro = resources.tutorial('esk108_reduce.py')
    self.eskapade_run(macro)

    datastore = process_manager.service(DataStore)
    self.assertEqual(20, datastore['n_products'])
def test_esk110(self):
    """Run the esk110_code_profiling tutorial macro and check the end state."""
    macro = resources.tutorial('esk110_code_profiling.py')
    self.eskapade_run(macro)

    config = process_manager.service(ConfigObject)
    datastore = process_manager.service(DataStore)

    # both the process manager and the datastore are expected to be empty
    for container in (process_manager, datastore):
        self.assertEqual(0, len(container))

    self.assertIn('doCodeProfiling', config)
    self.assertEqual('cumulative', config['doCodeProfiling'])
def eskapade_run():
    """Run Eskapade.

    Command-line entry point of an Eskapade run. The arguments given by the
    user are parsed and turned into settings of the configuration object,
    after which the run is executed. When the ``interactive`` setting is
    set, an interactive Python console is opened once the run has finished.
    """
    from escore import process_manager, ConfigObject, DataStore
    from escore.core import execution
    from escore.core.run_utils import create_arg_parser

    # NOTE: the local names in this function are handed to the interactive
    # console through locals() below, so they are part of its behaviour.

    # parse the command line
    parser = create_arg_parser()
    user_args = parser.parse_args()

    # obtain the config object: unpickle a persisted one or start fresh
    if user_args.unpickle_config:
        # the first "config file" is the pickled configuration itself
        conf_path = user_args.config_files.pop(0)
        settings = ConfigObject.import_from_file(conf_path)
    else:
        settings = ConfigObject()
    del user_args.unpickle_config

    # register macros and user options
    settings.add_macros(user_args.config_files)
    settings.set_user_opts(user_args)

    # run Eskapade; failures are logged and passed on
    try:
        execution.eskapade_run(settings)
    except Exception as exc:
        logger.error('{exc}', exc=exc)
        raise

    # nothing more to do unless --interactive was requested
    if not settings.get('interactive'):
        return

    # Pandas display options for the session
    import pandas as pd
    pd.set_option('display.width', 120)
    pd.set_option('display.max_columns', 50)

    # expose datastore and config in the interactive session
    ds = process_manager.service(DataStore)
    settings = process_manager.service(ConfigObject)

    # hand over to the interactive console
    from code import InteractiveConsole
    cons = InteractiveConsole(locals())
    cons.interact("\nContinuing interactive session ... press Ctrl+d to exit.\n")
def fork_and_store(self):
    """Fork and then store the input collection in the child process.

    The child opens its own mongo connection (a connection must be
    reopened after a fork), stores the datastore object under
    ``self.read_key`` into ``self.store_collections`` and then terminates
    with ``os._exit``. The child always terminates here, also when the
    storage fails, so that it never continues to run the parent's code;
    a failure is reported through a non-zero exit status.

    The parent optionally waits for the child, depending on
    ``self.wait_after_fork``.

    :raises OSError: if the child process could not be created
    """
    self.logger.debug("Process id before forking: {}".format(os.getpid()))

    # submit a new process
    try:
        pid = os.fork()
    except OSError as exc:
        raise OSError("Could not create a child process.") from exc

    if pid == 0:
        # --- child process ---
        exit_code = 1  # only switched to success after the storage went through
        mongo_connection = None
        try:
            self.logger.debug("In child process with PID {}".format(os.getpid()))

            # Need to open separate mongo connection after each fork
            settings = process_manager.service(ConfigObject)
            mongo_connection = MongoConnection()
            mongo_connection.set_config_info(settings)
            mdb = mongo_connection.database

            ds = process_manager.service(DataStore)
            docs = ds[self.read_key]

            # store docs
            etl_utils.dostorage(mdb, docs, self.store_collections, self.clearFirst,
                                self.logger, self.read_key)
            exit_code = os.EX_OK
        except Exception as exc:
            self.logger.error('Storage in child process {pid} failed: {exc}',
                              pid=os.getpid(), exc=exc)
        finally:
            try:
                # close connection
                if mongo_connection is not None:
                    mongo_connection.close()
            finally:
                # safe jupyter exit when forking; must be reached on every path
                os._exit(exit_code)

    # --- parent process ---
    self.logger.debug("Back in parent process after forking child {}".format(pid))
    child_pid_list = [pid]

    # can wait for fork to finish, or just go.
    if self.wait_after_fork:
        # check that fork is finished
        while child_pid_list:
            self.logger.debug("Waiting for child process to finish.")
            finished = os.waitpid(0, 0)
            if finished[0] in child_pid_list:
                self.logger.debug("Finished child process {} with status {}".format(
                    finished[0], finished[1]))
                if finished[1] != 0:
                    self.logger.warning(
                        "Child process {} ended with non-zero status {}.".format(
                            finished[0], finished[1]))
                child_pid_list.remove(finished[0])
        self.logger.debug('Finished fork.')
def initialize(self):
    """Initialize SkipChainIfCollectionEmpty.

    Configures the mongo connection service and keeps its database handle
    in ``self.mdb``. When ``self.checkAtInitialize`` is set, the result of
    ``self.checkCollectionSet()`` is returned.

    :returns: status code of initialization
    :rtype: StatusCode
    """
    connection = process_manager.service(MongoConnection)
    config = process_manager.service(ConfigObject)
    connection.set_config_info(config)
    self.mdb = connection.database

    if not self.checkAtInitialize:
        return StatusCode.Success
    return self.checkCollectionSet()
def execute(self):
    """Execute MongoCheckCollection.

    Verifies that every name in ``self.collectionSet`` is a collection of
    the mongo database.

    :returns: status code of execution
    :rtype: StatusCode
    :raises Exception: for the first name that is not a collection
    """
    connection = process_manager.service(MongoConnection)
    connection.set_config_info(process_manager.service(ConfigObject))
    self.mdb = connection.database

    # names of the collections present in the database
    present = self.mdb.collection_names()
    for name in self.collectionSet:
        if name in present:
            continue
        raise Exception("%s is not a collection in the mongo database" % name)

    return StatusCode.Success
def initialize(self):
    """Initialize MongoDeleteManyFromDF.

    Configures the mongo connection service, keeps its database handle in
    ``self.mdb`` and checks that ``self.read_key`` names a collection.

    :returns: status code of initialization
    :rtype: StatusCode
    :raises Exception: if ``self.read_key`` is not a collection
    """
    connection = process_manager.service(MongoConnection)
    connection.set_config_info(process_manager.service(ConfigObject))
    self.mdb = connection.database

    present = self.mdb.collection_names()
    if self.read_key in present:
        return StatusCode.Success
    raise Exception("%s is not a collection in the mongo database" % self.read_key)
def initialize(self):
    """Initialize MongoDFToCollection.

    Configures the mongo connection service and keeps its database handle
    in ``self.mdb``. When no store collections are given, ``self.read_key``
    is used as the only store collection.

    :returns: status code of initialization
    :rtype: StatusCode
    """
    connection = process_manager.service(MongoConnection)
    connection.set_config_info(process_manager.service(ConfigObject))
    self.mdb = connection.database

    assert isinstance(self.read_key, str) and self.read_key != '', 'read key not set.'

    # fall back on the read key as target collection
    if not len(self.store_collections):
        self.store_collections.append(self.read_key)

    return StatusCode.Success
def test_esk106(self):
    """Run the esk106_cmdline_options tutorial macro with ``do_chain0`` off."""
    # fake a setting from the cmd-line. picked up in the macro
    process_manager.service(ConfigObject)['do_chain0'] = False

    macro = resources.tutorial('esk106_cmdline_options.py')
    self.eskapade_run(macro)

    config = process_manager.service(ConfigObject)

    self.assertEqual(1, len(process_manager))
    first_chain = list(process_manager)[0]
    self.assertEqual('Chain1', first_chain.name)
    self.assertEqual(False, config.get('do_chain0', True))
    self.assertEqual(True, config.get('do_chain1', True))

    first_link = list(list(process_manager)[0])[0]
    self.assertEqual('Universe', first_link.hello)
def initialize(self):
    """Initialize the link.

    Configures the mongo connection service, keeps its database handle in
    ``self.mdb`` and checks the ``collection`` and ``store_key`` arguments.

    :returns: status code of initialization
    :rtype: StatusCode
    """
    config = process_manager.service(ConfigObject)
    connection = process_manager.service(MongoConnection)
    connection.set_config_info(config)
    self.mdb = connection.database

    # argument checks: types first, then values
    self.check_arg_types(collection=str, store_key=str)
    self.check_arg_vals('collection', 'store_key')

    return StatusCode.Success
def execute(self):
    """Execute MongoDeleteManyFromDF.

    Takes the dataframe stored in the datastore under ``self.read_key`` and
    deletes, from the mongo collection with the same name, all documents
    whose ``_id`` occurs in the dataframe's ``_id`` column.

    :returns: status code of execution
    :rtype: StatusCode
    :raises Exception: if the dataframe has no ``_id`` column, the column
        fails the type check, or the deletion fails
    """
    import bson

    ds = process_manager.service(DataStore)
    df = ds[self.read_key]

    if '_id' not in df.columns:
        raise Exception('No _id column in dataframe %s' % self.read_key)
    # NOTE(review): this compares the column dtype, not the individual values;
    # it presumably lets any object-dtype column through -- confirm.
    if df._id.dtype != bson.objectid.ObjectId:
        raise Exception(
            '_id column not of the correct type. Type should be bson.objectId.ObjectId')

    try:
        self.mdb[self.read_key].delete_many({'_id': {"$in": list(df._id)}})
    except Exception as exc:
        # do not blindly repeat the failed call; report the collection that
        # was actually involved and keep the original error as cause
        raise Exception('Deletion in collection %s failed' % self.read_key) from exc

    return StatusCode.Success
def execute(self):
    """Execute the link.

    Fetches the object stored under ``self.read_key`` from the datastore
    (with the dict/list type assertion and the length assertion enabled)
    and passes it to ``etl_utils.dostorage`` for ``self.store_collections``.

    :returns: status code of execution
    :rtype: StatusCode
    """
    # the config object is looked up here but not used any further
    _settings = process_manager.service(ConfigObject)

    datastore = process_manager.service(DataStore)
    payload = datastore.get(self.read_key, assert_type=(dict, list), assert_len=True)

    self.logger.debug('Started doing storage')
    etl_utils.dostorage(
        self.mdb,
        payload,
        self.store_collections,
        self.clear_first,
        self.logger,
        self.read_key,
    )

    return StatusCode.Success
def initialize(self):
    """Initialize the link.

    Normalizes ``self.keys`` in place: a string entry is expanded to a dict
    with ``key_fs``, ``key_ds`` and ``func`` (``unit_func``). Every entry
    must end up as a dict that contains ``key_ds``. The execution counter
    of this link in the fork store is reset to zero.

    :returns: status code of initialization
    :rtype: StatusCode
    :raises AssertionError: if an entry is not a dict/str or lacks ``key_ds``
    """
    if not self.keys:
        self.logger.warning('No functions to apply')

    # perform basic checks on input keys
    for pos, entry in enumerate(self.keys):
        if isinstance(entry, str):
            # shorthand: same key in fork store and datastore, no transformation
            entry = dict(key_fs=entry, key_ds=entry, func=unit_func)
            self.keys[pos] = entry
        if not isinstance(entry, dict):
            raise AssertionError('keys attribute is not a list of dict/str.')
        if 'key_ds' not in entry:
            raise AssertionError('key input is insufficient.')

    # will count number of times execute has been called.
    fork_store = process_manager.service(ForkStore)
    fork_store['n_' + self.name + '_executed'] = 0

    return StatusCode.Success
def test_esk101(self):
    """Run the esk101_helloworld tutorial macro and check its settings."""
    macro = resources.tutorial('esk101_helloworld.py')
    self.eskapade_run(macro)

    config = process_manager.service(ConfigObject)
    self.assertTrue(config['do_hello'])
    self.assertEqual(2, config['n_repeat'])
def execute(self):
    """Execute the link.

    When ``self.read_key`` is set, the corresponding object (for a string
    key) or list of objects (for a list of keys) is fetched from the
    datastore and passed as first argument to ``self.func``; otherwise
    ``self.func`` is called without an input object. The result is stored
    under ``self.store_key`` when that key is set.

    :returns: status code of execution
    :rtype: StatusCode
    :raises TypeError: if ``self.read_key`` is set but is neither a string
        nor a list
    """
    ds = process_manager.service(DataStore)

    def fetch(key):
        # fetch and check a single input object
        return ds.get(key, self.default, self.assert_type, self.assert_len,
                      self.assert_in)

    if self.read_key:
        if isinstance(self.read_key, str):
            obj = fetch(self.read_key)
        elif isinstance(self.read_key, list):
            obj = [fetch(key) for key in self.read_key]
        else:
            # previously this fell through and failed on an unbound local
            raise TypeError(
                'read_key must be a string or a list of strings, got <{}>.'.format(
                    type(self.read_key).__name__))
        # apply function
        trans_obj = self.func(obj, *self.args, **self.kwargs)
    else:
        # possibly function does not require object as input
        trans_obj = self.func(*self.args, **self.kwargs)

    if self.store_key:
        ds[self.store_key] = trans_obj

    return StatusCode.Success
def test_esk105a(self):
    """Run the esk105_A tutorial macro and check no results directory exists."""
    macro = resources.tutorial('esk105_A_dont_store_results.py')
    self.eskapade_run(macro)

    config = process_manager.service(ConfigObject)
    results_path = '/'.join([config['resultsDir'], config['analysisName']])
    self.assertFalse(os.path.exists(results_path))
def mongo_reset_collections():
    """Drop all MongoDB collections except ``proxy`` and ``roles``.

    Command-line entry point. The MongoDB configuration file can be given
    with ``--config``/``-c``; when it is not given, or the given path does
    not exist, the packaged ``mongo.cfg`` is used.
    """
    import argparse
    from escore import process_manager, ConfigObject
    from escore.core import persistence
    from mongodbtools import resources
    from mongodbtools import MongoConnection

    parser = argparse.ArgumentParser(
        'eskapade_mongo_reset_collections',
        description='Clean MongoDB collections.',
        epilog='Please note, only the collections \'proxy\' and \'roles\' will be kept.')
    parser.add_argument('--config', '-c', nargs='?',
                        help='Path to custom MongoDB configuration file.')
    args = parser.parse_args()

    path = resources.config('mongo.cfg')
    if args.config:
        if os.path.exists(args.config):
            path = args.config
        else:
            # do not fall back silently: the user asked for another configuration
            logger.warning(
                'MongoDB configuration {custom:s} not found, using default.'.format(
                    custom=args.config))
    logger.info('Using MongoDB configuration from {path:s}'.format(path=path))

    settings = process_manager.service(ConfigObject)
    settings['analysisName'] = 'mongo_reset_collections'
    settings['version'] = 0
    # use the configuration that was selected (and logged) above
    settings['mongodb'] = path

    process_manager.service(MongoConnection).set_config_info(settings)
    mdb = process_manager.service(MongoConnection).database

    # --- these collections you should keep (authorization data/proxy service)
    col_keep = ['proxy', 'roles']

    for name in mdb.collection_names(False):
        if name not in col_keep:
            # dropping a collection removes its documents as well
            mdb.drop_collection(name)

    logger.info('Cleared all MongoDB collections (except {cols})'.format(
        cols=col_keep))
def configure_mongo(self, lock: bool = False) -> None:
    """Configure mongo used during execute.

    This is the final part of initialization, and needs to be redone in
    case of forked processing. Hence it is split off into a separate
    function. The function can be locked once the configuration is final;
    a locked configuration makes this call a no-op.

    In fork mode (setting ``fork``) a separate mongo connection is opened
    and the cursor range of this fork is derived from ``fork_index``;
    otherwise the shared mongo connection service is used. A cursor on
    ``self.collection`` is then wrapped in ``self._reader``.

    :param bool lock: if True, lock this part of the configuration
    :raises NameError: if ``self.collection`` does not exist in the database
    :raises BufferError: if no cursor to the collection could be obtained
    """
    if self.config_lock:
        return
    self.config_lock = lock

    settings = process_manager.service(ConfigObject)
    if settings.get('fork', False):
        # during fork
        # Need to open separate mongo connection after each fork
        self.mongo_connection = MongoConnection()
        self.mongo_connection.set_config_info(settings)
        self.mdb = self.mongo_connection.database
        # set length of cursor of this fork
        fidx = settings['fork_index']
        self.skip = self.n_chunks_in_fork * fidx * self.chunk_size
        self.limit = self.n_chunks_in_fork * self.chunk_size
    else:
        # default (no fork)
        process_manager.service(MongoConnection).set_config_info(settings)
        self.mdb = process_manager.service(MongoConnection).database

    # check if collection names are in database
    colls = self.mdb.collection_names()
    if self.collection not in colls:
        message = 'Source collection <%s> does not exist in mongo db.' % self.collection
        self.logger.warning(message)
        raise NameError(message)

    try:
        kwargs = dict(filter=self.query, projection=self.use_cols)
        # skip/limit are only passed on when they are set to a non-zero value
        if self.skip:
            kwargs['skip'] = self.skip
        if self.limit:
            kwargs['limit'] = self.limit
        cursor = self.mdb[self.collection].find(**kwargs)
        self._reader = MongoCursorReader(cursor, self.chunk_size)
    except Exception as exc:
        message = ('Could not get cursor to source collection <%s> from mongo db.'
                   % self.collection)
        self.logger.critical(message)
        # keep the exception type, but carry a message and the original cause
        raise BufferError(message) from exc
def test_esk104(self):
    """Run the esk104 datastore-operations tutorial macro and check the datastore."""
    macro = resources.tutorial('esk104_basic_datastore_operations.py')
    self.eskapade_run(macro)

    datastore = process_manager.service(DataStore)
    # exactly one entry is expected: 'a' with value 1
    self.assertEqual(1, len(datastore))
    self.assertEqual(1, datastore['a'])
def test_esk107(self):
    """Run the esk107_chain_looper tutorial macro and check the repeat count."""
    macro = resources.tutorial('esk107_chain_looper.py')
    self.eskapade_run(macro)

    datastore = process_manager.service(DataStore)

    # chain is repeated 10 times, with nothing put in datastore
    self.assertEqual(0, len(datastore))
    first_chain = list(process_manager)[0]
    second_link = list(first_chain)[1]
    self.assertEqual(10, second_link.maxcount)
def finalize(self):
    """Finalize the link.

    Copies the objects listed in ``self.keys`` from the fork store back
    into the datastore, applying the optional ``func`` (with ``args`` and
    ``kwargs``) of each entry on the way. Nothing is done when the link
    has not been executed.

    :returns: status code of finalization
    :rtype: StatusCode
    :raises AssertionError: if a fork-store key is missing
    :raises Exception: if a transformation function fails
    """
    ds = process_manager.service(DataStore)
    fs = process_manager.service(ForkStore)

    # check if nothing to do
    n_executed = fs.get('n_' + self.name + '_executed', 0)
    if n_executed == 0:
        return StatusCode.Success

    # check number of times forkdatacollector has run
    n_fork = fs.get('n_fork', 0)
    if n_fork > 0 and n_executed % n_fork > 0:
        self.logger.warning(
            'Did not execute multiple of n_fork {0} times: {1}. Data may be missing.'
            .format(n_fork, n_executed))

    # putting (transformed) objects from forkstore back into datastore
    for arr in self.keys:
        key_ds = arr['key_ds']
        key_fs = arr.get('key_fs', key_ds)
        if key_fs not in fs:
            raise AssertionError('key {} not in forkstore.'.format(key_fs))

        # retrieve function to apply
        func = arr.get('func', unit_func)
        args = arr.get('args', ())
        kwargs = arr.get('kwargs', {})

        # apply transformation
        self.logger.debug('Applying function {function!s}.', function=func)
        obj = fs[key_fs]
        try:
            trans_obj = func(obj, *args, **kwargs)
        except Exception as exc:
            # Exception() takes no keyword arguments: format the message here
            raise Exception(
                'Failed to apply function {function!s} to object with {key}.'.format(
                    function=func, key=key_ds)) from exc

        # put transformed object back in datastore
        ds[key_ds] = trans_obj

    fs.Print()

    return StatusCode.Success
def initialize(self):
    """Initialize the link.

    Checks the ``read_key`` and ``store_collections`` arguments, configures
    the mongo connection service and keeps its database handle in
    ``self.mdb``.

    :returns: status code of initialization
    :rtype: StatusCode
    """
    # argument checks: container types, element types, then values
    self.check_arg_types(read_key=str, store_collections=list)
    self.check_arg_types(recurse=True, allow_none=True, store_collections=str)
    self.check_arg_vals('read_key')

    config = process_manager.service(ConfigObject)
    connection = process_manager.service(MongoConnection)
    connection.set_config_info(config)
    self.mdb = connection.database

    return StatusCode.Success
def setUp(self):
    """Set up test: reset Eskapade and apply the base test settings."""
    execution.reset_eskapade()

    config = process_manager.service(ConfigObject)
    base_settings = (
        ('analysisName', self.__class__.__name__),
        ('logLevel', LogLevel.DEBUG),
        ('batchMode', True),
    )
    for option, value in base_settings:
        config[option] = value
def execute(self):
    """Execute the link.

    Checks that every key in ``self.keySet`` is present in the datastore.

    :returns: status code of execution
    :rtype: StatusCode
    :raises AssertionError: for the first key that is not in the datastore
    """
    ds = process_manager.service(DataStore)

    for key in self.keySet:
        # explicit raise instead of assert: the check is the purpose of this
        # link and must not be stripped under ``python -O``
        if key not in ds:
            raise AssertionError('Key {} not in DataStore.'.format(key))

    return StatusCode.Success
def test_esk102(self):
    """Run the esk102_multiple_chains tutorial macro and check the chains."""
    macro = resources.tutorial('esk102_multiple_chains.py')
    self.eskapade_run(macro)

    config = process_manager.service(ConfigObject)
    for flag in ('do_chain0', 'do_chain1', 'do_chain2'):
        self.assertTrue(config[flag])
    self.assertEqual(3, len(process_manager))
def test_esk106_script(self, mock_argv):
    """Test Eskapade run with esk106 macro from script"""
    # base configuration and macro path
    config = process_manager.service(ConfigObject)
    config['analysisName'] = 'esk106_cmdline_options'
    base_settings = config.copy()
    macro = resources.tutorial('esk106_cmdline_options.py')

    # mocked command-line arguments; this list is mutated in place per run
    argv = []
    mock_argv.__getitem__ = lambda _mock, index: argv.__getitem__(index)

    # arguments and settings shared by all runs
    base_args = [macro, '-LDEBUG', '--batch-mode']
    base_settings['macro'] = macro
    base_settings['logLevel'] = LogLevel.DEBUG
    base_settings['batchMode'] = True

    def run_case(label, extra_args, extra_settings, expected_chains):
        # set arguments (in place, the mock reads from this very list)
        argv[:] = base_args + extra_args
        expected_settings = base_settings.copy()
        expected_settings.update(extra_settings)

        # run Eskapade
        process_manager.reset()
        entry_points.eskapade_run()
        actual_settings = process_manager.service(ConfigObject)

        # check results
        self.assertListEqual(
            [chain.name for chain in process_manager.chains],
            expected_chains,
            'unexpected chain names in "{}" test'.format(label))
        self.assertDictEqual(
            actual_settings,
            expected_settings,
            'unexpected settings in "{}" test'.format(label))

    cases = (
        # run both chains
        ('both chains',
         ['--store-all', '-cdo_chain0=True', '-cdo_chain1=True'],
         dict(storeResultsEachChain=True, do_chain0=True, do_chain1=True),
         ['Chain0', 'Chain1']),
        # run only last chain by skipping the first
        ('skip first',
         ['-bChain1', '-cdo_chain0=True', '-cdo_chain1=True'],
         dict(beginWithChain='Chain1', do_chain0=True, do_chain1=True),
         ['Chain0', 'Chain1']),
        # run only last chain by not defining the first
        ('no first',
         ['-cdo_chain0=False', '-cdo_chain1=True'],
         dict(do_chain0=False, do_chain1=True),
         ['Chain1']),
    )
    for label, extra_args, extra_settings, expected_chains in cases:
        run_case(label, extra_args, extra_settings, expected_chains)
def test_esk109(self):
    """Run the esk109_debugging_tips tutorial macro, which is expected to fail."""
    config = process_manager.service(ConfigObject)
    # this flag turns off python embed link
    config['TESTING'] = True

    macro = resources.tutorial('esk109_debugging_tips.py')
    self.eskapade_run(macro, StatusCode.Failure)

    first_chain = list(process_manager)[0]
    fourth_link = list(first_chain)[3]
    self.assertIsInstance(fourth_link, Break)
def import_and_update_datastore(self):
    """Import a datastore from file and merge or install it.

    The datastore is loaded from ``self.path``. With ``self.update`` set,
    the current datastore is updated with its contents; otherwise the
    current datastore service is replaced by the loaded one.

    :raises AssertionError: if the loaded object is not a DataStore
    """
    # loading external datastore
    loaded = DataStore.import_from_file(self.path)
    if not isinstance(loaded, DataStore):
        self.logger.fatal('Object in file "{path}" not of type DataStore.', path=self.path)
        raise AssertionError('Input object not of type DataStore.')

    if not self.update:
        # default: replace existing datastore
        process_manager.remove_service(DataStore)
        process_manager.service(loaded)
        return

    # update existing datastore
    current = process_manager.service(DataStore)
    current.update(loaded)
def test_esk103(self):
    """Run the esk103_printdatastore tutorial macro and check the datastore."""
    macro = resources.tutorial('esk103_printdatastore.py')
    self.eskapade_run(macro)

    datastore = process_manager.service(DataStore)
    self.assertEqual('world', datastore['hello'])
    for key, expected in (('a', 1), ('b', 2), ('c', 3)):
        self.assertEqual(expected, datastore['d'][key])