def initialize_algo_from_model(algo_options, searchinfo, namespace):
    """Load a saved algo when the options name a model, reconciling options.

    Args:
        algo_options (dict): algo options
        searchinfo (dict): information required for search
        namespace (string): namespace, 'user' or 'app'

    Returns:
        algo (object/None): loaded algo, or None when nothing was loaded
        algo_options (dict): the loaded model's options when an algo was
            loaded, otherwise the options that were passed in

    Raises:
        RuntimeError: the load failed for any reason other than the model
            not being found.
    """
    if 'model_name' not in algo_options:
        return None, algo_options

    try:
        loaded_algo_name, algo, loaded_options = models.base.load_model(
            algo_options['model_name'], searchinfo, namespace=namespace)
    except models.models_util.ModelNotFoundException:
        # A missing model is not an error here: report "no algo".
        return None, algo_options
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Failed to load model "%s". Exception: %s.' % (
            algo_options['model_name'], str(e)))

    if algo is None:
        return None, algo_options

    FitPartialProcessor.catch_model_discrepancies(
        algo_options, loaded_options, loaded_algo_name)
    # Pre 2.2 models do not save algo_name in their model options,
    # so we must re-add it here to be compatible with 2.2+ versions.
    loaded_options['algo_name'] = algo_options['algo_name']
    return algo, loaded_options
def copy_model(source_searchinfo, source_model_name, target_searchinfo, target_model_name):
    """Copy a model into the target searchinfo's space under a new name.

    The source model is first copied into the staging directory and then
    moved from staging into the target space (namespace 'user').

    Args:
        source_searchinfo: searchinfo used to locate the source model
        source_model_name: the name of the source model
        target_searchinfo: searchinfo used for the target model
        target_model_name: the name of the target model

    Returns:
        The reply returned by move_model_file_from_staging.

    Raises:
        Exception: the staged model file is not readable.
    """
    # Copy the model to the staging directory.
    staged_path = copy_model_to_staging(source_model_name, source_searchinfo)
    destination_file = model_name_to_filename(target_model_name)

    if not os.access(staged_path, os.R_OK):
        cexc.log_traceback()
        raise Exception(
            'The temp model file %s is missing or permission denied' % staged_path)

    # Send the staged model to the target space.
    return move_model_file_from_staging(
        destination_file,
        target_searchinfo,
        namespace='user',
        model_filepath=staged_path)
def _handle_clone_reply(replies): """ merge the 'messages' part of all replies into the last reply. Args: replies (list) : the replies from all splunk REST requests, with ['content']['messages'] modified by _clone_experiment_model_callback() Returns: (dict) a modified version of mltk clone reply, trimming all attributes in `content` except 'messages'. """ messages = [] merged_reply = None # set None to throw exception if replies is empty try: for reply in replies: messages.append(json.loads(reply['content'])['messages'][0]) merged_reply = reply if not reply['success']: break merged_reply['content'] = json.dumps({'messages': messages}) except Exception as e: cexc.log_traceback() raise Exception( "Invalid JSON response from REST API, Please check mlspl.log for more details." ) return merged_reply
def load_model(model_name, searchinfo, namespace):
    """Load a model, translating any failure into a RuntimeError.

    Args:
        model_name (str): model name
        searchinfo (dict): information required for search
        namespace (string): namespace, 'user' or 'app'

    Returns:
        algo_name (str): algo name
        algo (model object): algo object
        model_options (dict): model options

    Raises:
        RuntimeError: the model does not exist or could not be loaded.
    """
    try:
        algo_name, algo, model_options = models.base.load_model(
            model_name, searchinfo, namespace=namespace)
    except (OSError, IOError) as e:
        # A missing file gets its own, friendlier message.
        if e.errno == errno.ENOENT:
            reason = 'model "%s" does not exist.' % model_name
        else:
            reason = 'Failed to load model "%s": %s.' % (model_name, str(e))
        raise RuntimeError(reason)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Failed to load model "%s": %s.' % (model_name, str(e)))
    else:
        return algo_name, algo, model_options
def get_model_from_btool_result(btool_dict, model_name, user, app, roles, namespace):
    """Pick the path for `model_name` out of a btool result mapping.

    Lookup order:
      1. In the 'user' namespace, the user's own entry when its path ends
         with users/<user>/<app>/lookups/<model file>.
      2. The first role entry whose path ends with
         apps/<app>/lookups/<model file>.
      3. Unless the namespace is 'app', the greatest (by `<`) of the
         remaining role paths.

    Note that the entry for `model_name` is popped from each role's mapping
    as it is visited, so `btool_dict` is modified.

    Args:
        btool_dict (dict): maps a user or role name to a dict of
            model name -> path
        model_name (str): model to look up
        user (str): user name
        app (str): app name
        roles (iterable): role names to search
        namespace (str): namespace, e.g. 'user' or 'app'

    Returns:
        The matching path, or None when nothing matched.

    Raises:
        Exception: any unexpected error during the lookup.
    """
    try:
        if namespace == 'user' and user in btool_dict and model_name in btool_dict[user]:
            user_path = btool_dict[user][model_name]
            user_suffix = os.path.join(
                'users', user, app, 'lookups', model_name_to_filename(model_name))
            # Only models in the user namespace are checked here, because there is an
            # issue/bug with btool: if the username is also a role name in Splunk
            # (e.g. username=power and there is the "power" role), btool might return
            # objects that the user has no permission on but the role does.
            if user_path.endswith(user_suffix):
                return user_path

        app_suffix = os.path.join(
            'apps', app, 'lookups', model_name_to_filename(model_name))
        best_global = None
        for role in roles:
            try:
                candidate = btool_dict[role].pop(model_name)
                if candidate.endswith(app_suffix):
                    return candidate
                # If "app:" is not used, check global namespace
                elif namespace != 'app' and (best_global is None or best_global < candidate):
                    best_global = candidate
            except KeyError:
                # Role (or model under that role) is absent; go to next role.
                continue
    except Exception:
        cexc.log_traceback()
        raise Exception("Please check mlspl.log for more details.")
    return best_global
def fit(df, algo, algo_options):
    """Fit the algo on the data and report whether it also predicted.

    `algo.fit` may return a DataFrame (predictions made while fitting) or
    anything else (no predictions).

    Args:
        df (dataframe): dataframe to fit the algo
        algo (object): initialized/loaded algo object
        algo_options (dict): algo options

    Returns:
        algo (object): the algo object after fitting
        df (dataframe): the DataFrame returned by algo.fit when there is
            one, otherwise the input df
        has_applied (bool): True when the returned df came from algo.fit

    Raises:
        RuntimeError: algo.fit raised.
    """
    try:
        fit_result = algo.fit(df, options=algo_options)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Error while fitting "%s" model: %s' %
                           (algo_options['algo_name'], str(e)))

    if isinstance(fit_result, pd.DataFrame):
        return algo, fit_result, True
    return algo, df, False
def initialize_processor(processor_name, process_options, searchinfo):
    """Import a processor class by name and instantiate it.

    The class is looked up as attribute `processor_name` of the module
    `processors.<processor_name>`.

    Args:
        processor_name (str): processor name
        process_options (dict): process options
        searchinfo (dict): information required for search

    Returns:
        processor (object): initialized processor

    Raises:
        RuntimeError: the class attribute is missing from the module, or
            the processor's constructor raised.
    """
    module_path = 'processors.{}'.format(processor_name)
    try:
        processor_class = getattr(importlib.import_module(module_path), processor_name)
    except AttributeError:
        logger.debug('Failed to import ML-SPL processor "%s"' % processor_name)
        raise RuntimeError('Failed to import ML-SPL processor.')

    try:
        return processor_class(process_options, searchinfo)
    except Exception as e:
        cexc.log_traceback()
        logger.debug('Error while initializing processor "%s": %s' %
                     (processor_name, str(e)))
        raise RuntimeError(str(e))
def _fit(self, X):
    """Fit the ARIMA estimator on `self.time_series` and record time metadata.

    `X` is modified: fields not in `self.feature_variables` are dropped and
    the time-series column is cast to float.

    Args:
        X (dataframe): input data; must contain every field in
            `self.feature_variables`

    Raises:
        ValueError: the time-series column has object dtype, or the
            estimator raised a ValueError (with a friendlier message for
            non-stationary / non-invertible initial coefficients).
        RuntimeError: the data contains missing values, or any other error
            occurred while fitting.
    """
    for variable in self.feature_variables:
        df_util.assert_field_present(X, variable)
    df_util.drop_unused_fields(X, self.feature_variables)
    df_util.assert_any_fields(X)
    df_util.assert_any_rows(X)

    if X[self.time_series].dtype == object:
        raise ValueError(
            '%s contains non-numeric data. ARIMA only accepts numeric data.'
            % self.time_series)
    X[self.time_series] = X[self.time_series].astype(float)

    try:
        self.estimator = _ARIMA(
            X[self.time_series].values,
            order=self.out_params['model_params']['order'],
            missing=self.out_params['model_params']['missing']).fit(disp=False)
    except ValueError as e:
        # Use str(e) rather than e.message: the `message` attribute was
        # deprecated in Python 2.6 and does not exist on Python 3, where
        # reading it would raise AttributeError and hide the real error.
        error_text = str(e)
        if 'stationary' in error_text:
            raise ValueError(
                "The computed initial AR coefficients are not "
                "stationary. You should induce stationarity by choosing a different model order."
            )
        elif 'invertible' in error_text:
            raise ValueError(
                "The computed initial MA coefficients are not invertible. "
                "You should induce invertibility by choosing a different model order."
            )
        else:
            cexc.log_traceback()
            raise ValueError(e)
    except MissingDataError:
        raise RuntimeError('Empty or null values are not supported in %s. '
                           'If using timechart, try using a larger span.' %
                           self.time_series)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError(e)

    # Save the _time values, not as part of the ARIMA structure itself but
    # as a new attribute on the fitted estimator.
    if '_time' in self.feature_variables:
        freq = self._find_freq(X['_time'].values, self.freq_threshold)
        self.estimator.datetime_information = dict(
            ver=0,
            _time=X['_time'].values,
            freq=freq,  # in seconds (unix epoch)
            first_timestamp=X['_time'].values[0],
            last_timestamp=X['_time'].values[-1],
            length=len(X))
    else:
        # NOTE(review): this branch stores `first_time`/`last_time` while the
        # branch above stores `first_timestamp`/`last_timestamp` - confirm
        # which keys the readers of datetime_information expect.
        self.estimator.datetime_information = dict(ver=0,
                                                   _time=None,
                                                   freq=None,
                                                   first_time=None,
                                                   last_time=None,
                                                   length=len(X))
def get_file_path_from_content(content):
    """Extract the 'eai:data' value of the first entry in a REST reply body.

    Args:
        content (dict): parsed reply; read as
            content['entry'][0]['content']['eai:data']

    Returns:
        The value stored under 'eai:data'.

    Raises:
        Exception: the expected structure is missing.
    """
    try:
        first_entry = content['entry'][0]
        return first_entry['content']['eai:data']
    except Exception:
        cexc.log_traceback()
        raise Exception("Invalid JSON response from REST API, Please check mlspl.log for more details.")
def initialize_algo(algo_options, searchinfo):
    """Resolve the algo class named in the options and instantiate it.

    Args:
        algo_options (dict): algo options; must contain 'algo_name'
        searchinfo (dict): information required for search

    Returns:
        The algo instance built from `algo_options`.

    Raises:
        RuntimeError: resolving or constructing the algo failed.
    """
    algo_name = algo_options['algo_name']
    try:
        algo_class = initialize_algo_class(algo_name, searchinfo)
        algo = algo_class(algo_options)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Error while initializing algorithm "%s": %s' % (
            algo_name, str(e)))
    return algo
def callback(model_name):
    """Copy the experiment's draft model over the model named `model_name`.

    Uses `searchinfo` from the enclosing scope as both source and target.

    Args:
        model_name (str): name of the target model

    Returns:
        The reply returned by copy_model.

    Raises:
        SplunkRestProxyException: the draft model was not found.
    """
    draft_model_name = get_experiment_draft_model_name(model_name)
    try:
        reply = copy_model(searchinfo, draft_model_name, searchinfo, model_name)
    except ModelNotFoundException as e:
        cexc.log_traceback()
        logger.error(e)
        detail = "%s: %s" % (str(e), draft_model_name)
        raise SplunkRestProxyException(detail, logging.ERROR, httplib.NOT_FOUND)
    return reply
def _delete_models(request, url_parts): if len(url_parts) == 1: try: searchinfo = searchinfo_from_request(request) rest_proxy = rest_proxy_from_searchinfo(searchinfo) model_list = get_model_list_by_experiment( rest_proxy, namespace='user', experiment_id=url_parts[0]) for model_name in model_list: url = rest_url_util.make_get_lookup_url( rest_proxy, namespace='user', lookup_file=model_name) reply = rest_proxy.make_rest_call('DELETE', url) except Exception as e: cexc.log_traceback() pass
def setup_model(cls, process_options, searchinfo):
    """Load the temp model, fall back to the real model, and merge options.

    'tmp_dir' and 'namespace' are popped from `process_options` in the
    process.

    Args:
        process_options (dict): process_options; must contain 'tmp_dir'
            and 'model_name'
        searchinfo (dict): information required for search

    Returns:
        algo_name (str): algorithm name
        algo (object/None): algorithm object; None when only the temp
            model was loaded
        process_options (dict): loaded model options overridden by the
            given process options
        namespace (str/None): namespace popped from the process options

    Raises:
        RuntimeError: neither the temp model nor the real model could be
            loaded.
    """
    tmp_dir = process_options.pop('tmp_dir')
    searchinfo = search_util.add_distributed_search_info(process_options, searchinfo)
    namespace = process_options.pop('namespace', None)

    try:
        algo_name, _, model_options = models.base.load_model(
            process_options['model_name'],
            searchinfo,
            namespace=namespace,
            model_dir=tmp_dir,
            skip_model_obj=True,
            tmp=True
        )
        algo = None
        logger.debug('Using tmp model to set required_fields.')
    except Exception:
        # `except Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        # Try to load real model.
        try:
            algo_name, algo, model_options = models.base.load_model(
                process_options['model_name'], searchinfo, namespace=namespace)
        except (OSError, IOError) as e:
            if e.errno == errno.ENOENT:
                raise RuntimeError('model "%s" does not exist.' %
                                   process_options['model_name'])
            raise RuntimeError('Failed to load model "%s": %s.' % (
                process_options['model_name'], str(e)))
        except Exception as e:
            cexc.log_traceback()
            raise RuntimeError('Failed to load model "%s": %s.' % (
                process_options['model_name'], str(e)))

    # Process options override loaded model options.
    model_options.update(process_options)
    process_options = model_options
    return algo_name, algo, process_options, namespace
def save_temp_model(algo_options, tmp_dir):
    """Persist a temporary model (options only) for a follow-up apply.

    Does nothing when the options carry no 'model_name'.

    Args:
        algo_options (dict): algo options
        tmp_dir (str): temp directory to save model to

    Raises:
        RuntimeError: saving the temporary model failed.
    """
    if 'model_name' not in algo_options:
        return

    try:
        models.base.save_model(
            algo_options['model_name'],
            None,
            algo_options['algo_name'],
            algo_options,
            model_dir=tmp_dir,
            tmp=True)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError(
            'Error while saving temporary model "%s": %s' % (algo_options['model_name'], e))
def save_model(self):
    """Save the fitted model, then remove its temporary counterpart.

    Does nothing when `self.algo_options` carries no 'model_name'. A
    failure to delete the temporary model is only logged.

    Raises:
        RuntimeError: saving the model failed.
    """
    if 'model_name' not in self.algo_options:
        return

    try:
        models.base.save_model(
            self.algo_options['model_name'],
            self.algo,
            self.algo_options['algo_name'],
            self.algo_options,
            max_size=self.resource_limits['max_model_size_mb'],
            searchinfo=self.searchinfo,
            namespace=self.namespace)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Error while saving model "%s": %s' %
                           (self.algo_options['model_name'], e))

    try:
        models.base.delete_model(
            self.algo_options['model_name'], model_dir=self.tmp_dir, tmp=True)
    except Exception as e:
        # Cleanup failure is non-fatal: the real model is already saved.
        cexc.log_traceback()
        logger.warn('Exception while deleting tmp model "%s": %s',
                    self.algo_options['model_name'], e)
def load_scoring_function(module_name, func_name):
    """Import a module and return the named attribute from it.

    Args:
        module_name (str): name of the module to load (eg. sklearn.metrics)
        func_name (str): name of the scoring function to load

    Returns:
        scoring (function): attribute `func_name` of the imported module

    Raises:
        RuntimeError: the module cannot be imported or lacks `func_name`.
    """
    try:
        return getattr(importlib.import_module(module_name), func_name)
    except (ImportError, AttributeError):
        cexc.log_traceback()
        err_msg = 'Scoring method {} is not available'.format(func_name)
        raise RuntimeError(err_msg)
def score(df, score_method, scoring_options):
    """Run the scoring method on the data.

    Args:
        df (dataframe): input data
        score_method (object): initialized score_method object
        scoring_options (dict): scoring options

    Returns:
        score_df (dataframe): whatever score_method.score returns

    Raises:
        RuntimeError: score_method.score raised.
    """
    try:
        return score_method.score(df, scoring_options)
    except Exception as e:
        cexc.log_traceback()
        err_msg = 'Error while scoring "{}": {}'
        raise RuntimeError(
            err_msg.format(scoring_options['scoring_name'], str(e)))
def fit(algo, df, options):
    """Run one partial-fit pass of the algo over the data.

    Args:
        algo (object): algo object
        df (dataframe): dataframe to fit on
        options (dict): process options

    Returns:
        algo (object): the same algo object after partial fitting

    Raises:
        RuntimeError: the algo does not support partial fit, or
            partial_fit raised.
    """
    try:
        algo.partial_fit(df, options=options)
    except MLSPLNotImplementedError:
        raise RuntimeError('Algorithm "%s" does not support partial fit' %
                           options['algo_name'])
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Error while fitting "%s" model: %s' %
                           (options['algo_name'], str(e)))
    else:
        return algo
def setup_score_method(self, scoring_options, searchinfo):
    """Resolve the scoring class for the requested method and build it.

    Args:
        scoring_options (dict): scoring options; must contain
            'scoring_name'
        searchinfo (dict): information required for search

    Returns:
        score_method (object): scoring class instantiated with the options
        scoring_module_name (str): module name returned alongside the class

    Raises:
        RuntimeError: resolving or constructing the scoring method failed.
    """
    scoring_name = scoring_options['scoring_name']
    try:
        resolved = self.load_class_and_module_name(scoring_name, searchinfo)
        scoring_class, scoring_module_name = resolved
        score_method = scoring_class(scoring_options)
    except Exception as e:
        cexc.log_traceback()
        err_msg = 'Error while initializing scoring method "{}": {}'
        raise RuntimeError(err_msg.format(scoring_name, str(e)))
    return score_method, scoring_module_name
def save_experiment(experiment, update, searchinfo, experiment_dir=experiment_staging_dir, namespace='user'):
    """Write an experiment to a staging CSV and move it into place.

    The experiment is serialized as JSON into a one-column CSV (header
    'experiment') under a uniquely named staging file, which is then handed
    to move_experiment_file_from_staging.

    Args:
        experiment (dict): experiment to save; 'id' and 'type' are read
        update: passed through to move_experiment_file_from_staging
        searchinfo (dict): information required for search
        experiment_dir (str): staging directory; created if absent
        namespace (str): namespace passed through to the move

    Returns:
        The move reply, with reply['entry'][0] replaced by the result of
        get_experiment_from_lookup.

    Raises:
        Exception: the staging directory could not be created.
    """
    try:
        os.makedirs(experiment_dir)
    except OSError as e:
        already_present = e.errno == errno.EEXIST and os.path.isdir(experiment_dir)
        if not already_present:
            cexc.log_traceback()
            raise Exception("Error creating experiment: %s, %s" % (experiment["id"], e))

    staging_name = '_' + str(uuid.uuid1()).replace('-', '_')

    # raises if invalid
    type_long = experiment['type'].lower()
    type_short = EXPERIMENT_TYPES_MAP[type_long]

    staging_path = file_name_to_path(
        experiment_name_to_filename(staging_name, type_short), experiment_dir)

    logger.debug('Saving experiment: %s' % staging_path)

    with open(staging_path, mode='w') as f:
        writer = csv.writer(f)
        # TODO: Version attribute
        writer.writerow(['experiment'])
        writer.writerow([json.dumps(experiment)])

    final_filename = experiment_name_to_filename(experiment["id"], type_short)
    # File is closed at this point, but f.name is still accessible.
    reply = move_experiment_file_from_staging(
        final_filename, update, searchinfo, namespace, f.name)

    # Decorate the new lookup file with experiment-specific info.
    reply['entry'][0] = get_experiment_from_lookup(reply['entry'][0])
    return reply
def apply(df, algo, process_options):
    """Apply the algo to the data and return its output.

    A garbage collection pass is triggered after a successful apply.

    Args:
        df (dataframe): input data
        algo (object): initialized algo object
        process_options (dict): process options

    Returns:
        prediction_df (dataframe): whatever algo.apply returns

    Raises:
        RuntimeError: algo.apply (or the collection pass) raised; a
            warning message is emitted first.
    """
    try:
        prediction_df = algo.apply(df, options=process_options)
        gc.collect()
        return prediction_df
    except Exception as e:
        cexc.log_traceback()
        cexc.messages.warn('Error while applying model "%s": %s' %
                           (process_options['model_name'], str(e)))
        raise RuntimeError(e)
def get_output(self):
    """Override get_output from BaseProcessor.

    Applies the algo to `self.df` unless `self.has_applied` is already set,
    and substitutes an empty DataFrame (with a warning) when there is no
    result.

    Returns:
        (dataframe): output dataframe

    Raises:
        RuntimeError: the algo's apply method raised.
    """
    needs_apply = not self.has_applied
    if needs_apply:
        try:
            self.df = self.algo.apply(self.df, options=self.algo_options)
        except Exception as e:
            cexc.log_traceback()
            logger.debug('Error during apply phase of fit command. Check apply method of algorithm.')
            raise RuntimeError('Error while fitting "%s" model: %s' %
                               (self.algo_options['algo_name'], str(e)))

    if self.df is None:
        messages.warn('Apply method did not return any results.')
        self.df = pd.DataFrame()

    return self.df
def parse_reply_for_rest(reply):
    """Parse the JSON content of a successful Splunk REST reply.

    Simplified version of lookups_parse_reply: instead of throwing custom
    exceptions per failure type, an unsuccessful reply always produces a
    SplunkRestException wrapping the reply.

    Args:
        reply (dict): REST reply with a 'success' flag and a JSON string
            under 'content'

    Returns:
        The decoded JSON content.

    Raises:
        SplunkRestException: the reply's 'success' value is falsy.
        Exception: the reply is malformed or its content is not valid JSON.
    """
    try:
        if reply['success']:
            return json.loads(reply['content'])
        raise SplunkRestException(reply)
    except SplunkRestException:
        cexc.log_traceback()
        raise SplunkRestException(reply)
    except Exception:
        cexc.log_traceback()
        raise Exception(
            "Invalid JSON response from REST API, Please check mlspl.log for more details."
        )
def _add_model_name_to_reply(raw_reply, model_name):
    """Return a copy of a lookup-table-file reply tagged with the model name.

    The reply's JSON 'content' is decoded and its 'messages' list updated:
      1. if there is at least one message, the model name is stored on the
         first one under MODEL_NAME_ATTR;
      2. otherwise an 'INFO' message with empty text carrying the model
         name is appended.
    The input reply itself is left untouched.

    Args:
        raw_reply (dict): raw reply from a Splunk lookup-table-file request
        model_name: the model name to insert

    Returns:
        (dict) deep copy of the reply with re-encoded 'content'.

    Raises:
        Exception: the reply content is missing or malformed.
    """
    reply = copy.deepcopy(raw_reply)
    try:
        content = json.loads(raw_reply['content'])
        messages = content['messages']
        if len(messages) == 0:
            messages.append({
                'type': "INFO",
                'text': '',
                MODEL_NAME_ATTR: model_name,
            })
        else:
            messages[0][MODEL_NAME_ATTR] = model_name
        reply['content'] = json.dumps(content)
    except Exception:
        cexc.log_traceback()
        raise Exception(
            "Invalid JSON response from REST API, Please check mlspl.log for more details."
        )
    return reply
def _handle_all_experiment_models(reply, callback_handler): """ pass the callback_handler to each model for all search stages of an experiment, exit if handler returns failure. Args: reply (dict) : the reply object of an experiment GET request. callback_handler (func) : a callback handler for each model, it should return the reply of a REST request. Returns: (list): a list of replies from each handlers """ try: content = json.loads(reply['content']) entries = content['entry'] # a cache that stores the reply from the callback of each model reply_list = [] for entry in entries: ss_json = entry['content']['searchStages'] search_stages = json.loads(ss_json) for search_stage in search_stages: model_name = search_stage.get('modelName') if model_name is not None: reply = callback_handler(model_name) reply_list.append(reply) # if any of the reply is not successful, stop the process and return the current reply list if not reply.get('success'): return reply_list return reply_list except Exception: cexc.log_traceback() raise Exception( "Invalid JSON response from REST API, Please check mlspl.log for more details." )
def delete_model(process_options, searchinfo, namespace):
    """Delete the model named in the process options and report it.

    Args:
        process_options (dict): process options; must contain 'model_name'
        searchinfo (dict): information required for search
        namespace (string): namespace, 'user' or 'app'

    Raises:
        RuntimeError: the model does not exist or could not be deleted.
    """
    try:
        deletemodels.delete_model(
            process_options['model_name'], searchinfo, namespace=namespace)
    except (OSError, IOError) as e:
        # A missing file gets its own, friendlier message.
        if e.errno == errno.ENOENT:
            reason = 'model "%s" does not exist.' % process_options['model_name']
        else:
            reason = 'Failed to delete model "%s": %s.' % (
                process_options['model_name'], str(e))
        raise RuntimeError(reason)
    except Exception as e:
        cexc.log_traceback()
        raise RuntimeError('Failed to delete model "%s": %s.' %
                           (process_options['model_name'], str(e)))
    else:
        messages.info('Deleted model "%s"' % process_options['model_name'])
def setup_model(cls, process_options, searchinfo):
    """Load the temp model, fall back to the real model, and merge options.

    'tmp_dir', 'namespace' and 'mlspl_conf' are popped from
    `process_options` in the process.

    Args:
        process_options (dict): process_options; must contain 'tmp_dir',
            'mlspl_conf' and (outside parsetmp searches) 'model_name'
        searchinfo (dict): information required for search

    Returns:
        algo_name (str/None): algorithm name
        algo (object/None): algorithm object; None when only the temp
            model was loaded
        process_options (dict): updated process options
        namespace (str/None): namespace of the model

        In a parsetmp search no model is loaded and the result is
        (None, None, process_options, None).

    Raises:
        RuntimeError: neither the temp model nor the real model could be
            loaded.
    """
    tmp_dir = process_options.pop('tmp_dir')
    searchinfo = search_util.add_distributed_search_info(
        process_options, searchinfo)
    namespace = process_options.pop('namespace', None)
    mlspl_conf = process_options.pop('mlspl_conf')

    # For MLA-1989 we cannot properly load a model in parsetmp search
    if is_parsetmp(searchinfo):
        process_options['mlspl_limits'] = {}
        process_options['feature_variables'] = ['*']
        return None, None, process_options, None

    try:
        algo_name, _, model_options = models.base.load_model(
            process_options['model_name'],
            searchinfo,
            namespace=namespace,
            model_dir=tmp_dir,
            skip_model_obj=True,
            tmp=True)
        algo = None
        logger.debug('Using tmp model to set required_fields.')
    except Exception:
        # `except Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        # Try to load real model.
        try:
            algo_name, algo, model_options = models.base.load_model(
                process_options['model_name'],
                searchinfo,
                namespace=namespace)
        except (OSError, IOError) as e:
            if e.errno == errno.ENOENT:
                raise RuntimeError('model "%s" does not exist.' %
                                   process_options['model_name'])
            raise RuntimeError('Failed to load model "%s": %s.' %
                               (process_options['model_name'], str(e)))
        except Exception as e:
            cexc.log_traceback()
            raise RuntimeError('Failed to load model "%s": %s.' %
                               (process_options['model_name'], str(e)))

    # Process options override loaded model options.
    model_options.update(process_options)
    process_options = model_options
    process_options['mlspl_limits'] = mlspl_conf.get_stanza(algo_name)
    return algo_name, algo, process_options, namespace
def add_distributed_search_info(process_options, searchinfo):
    """Add information required for distributed search to `searchinfo`.

    Sets 'bundle_path' and 'is_remote' on `searchinfo` and, for remote
    searches, 'roles' parsed from the args.txt found by following
    `_root_sid` references between dispatch directories.

    Args:
        process_options (dict/None): the process options to pass to the
            processor; when None, `searchinfo` is used in its place
        searchinfo (dict): information required for search

    Returns:
        searchinfo (dict): the original input searchinfo dict updated with
            information for distributed search (returned unchanged in a
            parsetmp search)

    Raises:
        RuntimeError: any step of gathering the information failed.
    """
    # For MLA-1989, in parsetmp search, we do not add anything
    if is_parsetmp(searchinfo):
        return searchinfo

    # In the case we need this before process_options exists
    if process_options is None:
        process_options = searchinfo

    try:
        dispatch_dir = process_options.get('dispatch_dir')
        info = info_csv_to_dict(os.path.join(dispatch_dir, 'info.csv'))
        dispatch_base_folder = os.path.dirname(dispatch_dir)

        def get_root_from_info(dispatch_dir):
            """Recursively follow _root_sid from info.csv until args.txt is found.

            If _root_sid is present without a value it should be '' (empty
            string); if it is not present we default to None. Both are
            falsy and end the recursion.

            Args:
                dispatch_dir (str): the dispatch directory path or the
                    previous _root_sid value

            Returns:
                dispatch_dir (str): the dispatch directory path where we
                    can find args.txt
            """
            if not dispatch_dir.startswith(dispatch_base_folder):
                dispatch_dir = os.sep.join(
                    [dispatch_base_folder, dispatch_dir])
            try:
                if 'args.txt' in os.listdir(dispatch_dir):
                    return dispatch_dir
                some_info = info_csv_to_dict(
                    os.path.join(dispatch_dir, 'info.csv'))
                if some_info.get('_root_sid'):
                    return get_root_from_info(some_info['_root_sid'])
            except IOError:
                # Unreadable directory/info.csv: settle for the current dir.
                pass
            return dispatch_dir

        dispatch_dir = get_root_from_info(dispatch_dir)
        searchinfo['bundle_path'] = get_bundle_path(info)
        searchinfo['is_remote'] = is_remote_search(info)
        if searchinfo['is_remote']:
            searchinfo['roles'] = args_util.parse_roles(
                os.path.join(dispatch_dir, 'args.txt'))
    except Exception as e:
        logger.debug(e)
        cexc.log_traceback()
        # Use .get here: process_options may be the searchinfo fallback (or
        # otherwise lack 'model_name'), and a KeyError raised while building
        # the message would replace the RuntimeError callers expect.
        raise RuntimeError('Failed to load model "%s": ' %
                           (process_options.get('model_name')))
    return searchinfo