def apply(self, df, options):
    # Make a copy of the data so we do not alter the original dataframe
    logger = get_logger('IsolationForest Logger')
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )
    # Multiply the result by -1 so outliers are labeled 1 and inliers/normal points -1
    # (scikit-learn's IsolationForest predicts -1 for outliers and +1 for inliers).
    y_hat = self.estimator.predict(X.values) * -1

    # Log the percentage of points predicted as inliers (-1 after the sign flip)
    accuracy = "Accuracy: {}".format(str(round((list(y_hat).count(-1) * 100) / y_hat.shape[0], 2)))
    logger.debug(accuracy)

    y_hat = y_hat.astype('str')

    # Assign the output field name
    default_name = 'isOutlier'
    new_name = options.get('output_name', None)
    output_name = self.rename_output(default_names=default_name, new_names=new_name)

    # Create the output dataframe
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    # Merge with the original dataframe
    output = df_util.merge_predictions(df, output)
    return output
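
# The sign flip in apply() above relies on scikit-learn's label convention for
# anomaly detectors: IsolationForest.predict() returns -1 for outliers and +1
# for inliers. A minimal standalone sketch of that convention follows (plain
# scikit-learn, outside Splunk; the data and parameters are illustrative
# assumptions, not part of the original code).
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X_train = rng.normal(loc=0.0, scale=1.0, size=(200, 2))        # mostly "normal" points
X_test = np.vstack([X_train[:5], [[8.0, 8.0], [-9.0, 7.5]]])   # append two obvious outliers

est = IsolationForest(random_state=42).fit(X_train)
raw = est.predict(X_test)   # -1 = outlier, +1 = inlier (scikit-learn convention)
flipped = raw * -1          # +1 = outlier, -1 = inlier, as in the apply() method above
print(raw)
print(flipped)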
def make_rest_call(session_key, method, url, postargs):
    import json
    import os
    import subprocess

    import cexc

    logger = cexc.get_logger(__name__)

    payload = {
        'session_key': session_key,
        'url': url,
        'method': method,
        'postargs': postargs,
    }

    try:
        python_path = os.path.join(os.environ['SPLUNK_HOME'], 'bin', 'python')
        p = subprocess.Popen(
            [python_path, os.path.abspath(__file__)],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        (stdoutdata, stderrdata) = p.communicate(json.dumps(payload))
        p.wait()

        for errline in stderrdata.splitlines():
            logger.debug('> %s', errline)

        if p.returncode != 0:
            raise RuntimeError(
                "rest_bouncer subprocess exited with non-zero error code '%d'" % p.returncode)

        reply = json.loads(stdoutdata)
    except Exception as e:
        logger.warn('rest_bouncer failure: %s: %s', type(e).__name__, str(e))
        return False
    return reply
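
# make_rest_call() above hands the request to a child Python process over
# stdin/stdout as JSON (the "rest bouncer" pattern), so the REST call runs in
# Splunk's bundled interpreter rather than the current one. Below is a minimal
# standalone sketch of the same JSON round trip; the child code is inlined via
# -c purely for illustration, whereas the real code re-invokes this module.
import json
import subprocess
import sys

CHILD_SRC = """
import json, sys
payload = json.loads(sys.stdin.read())     # child: read the request from stdin
reply = {'echoed_url': payload['url'], 'success': True}
sys.stdout.write(json.dumps(reply))        # child: write the reply to stdout
"""

p = subprocess.Popen(
    [sys.executable, '-c', CHILD_SRC],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
)
out, err = p.communicate(json.dumps({'url': 'https://localhost:8089/services', 'method': 'GET'}))
print(json.loads(out))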
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import errno
import gc
import os

import pandas as pd

import cexc
import conf
import models.base
from BaseProcessor import BaseProcessor
from util import search_util

logger = cexc.get_logger(__name__)
messages = cexc.get_messages_logger()


class ApplyProcessor(BaseProcessor):
    """The apply processor receives and returns pandas DataFrames."""

    def __init__(self, process_options, searchinfo):
        """Initialize options for the processor.

        Args:
            process_options (dict): process options
            searchinfo (dict): information required for search
        """
        self.searchinfo = searchinfo
        self.algo_name, self.algo, self.process_options, self.namespace = self.setup_model(
            process_options, self.searchinfo)
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

import cexc
from cexc import BaseChunkHandler

from util.param_util import parse_args
from util.command_util import GeneratingCommand, is_getinfo_chunk

logger = cexc.get_logger('kvstorelookup')
messages = cexc.get_messages_logger()


class KVStoreLookupCommand(GeneratingCommand):
    """KVStoreLookupCommand uses the ChunkedController & KVStoreLookupProcessor
    to read a KVStore collection."""

    @staticmethod
    def handle_arguments(getinfo):
        """Check for invalid arguments and get controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """
        options = parse_args(getinfo['searchinfo']['args'])
        params = options.get('params', {})
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

import os

import cexc
from cexc import BaseChunkHandler

from util import command_util
from util.param_util import parse_args
from chunked_controller import ChunkedController

logger = cexc.get_logger('score')
messages = cexc.get_messages_logger()


class ScoreCommand(cexc.BaseChunkHandler):
    """ScoreCommand uses ChunkedController & processor(s) to score field(s)."""

    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk

        Returns:
            controller_options (dict): options to be passed to controller
        """
        if len(getinfo['searchinfo']['raw_args']) == 0:
            raise RuntimeError('First argument must be a scoring method')
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

import os

import conf
from cStringIO import StringIO
from util.param_util import is_truthy, parse_args, convert_params
from util import command_util

import cexc
from chunked_controller import ChunkedController
from cexc import BaseChunkHandler

logger = cexc.get_logger('fit')
messages = cexc.get_messages_logger()


class FitCommand(cexc.BaseChunkHandler):
    """FitCommand uses ChunkedController & one of two processors to fit models.

    The FitCommand can use either the FitBatchProcessor or the
    FitPartialProcessor, which is chosen based on the presence of the
    partial_fit parameter.
    """

    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import cexc
from cexc.cexc_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

from util import command_util, param_util
from util.command_util import GeneratingCommand

logger = cexc.get_logger('delete')
messages = cexc.get_messages_logger()


class DeleteModelCommand(GeneratingCommand):
    """DeleteModelCommand uses the ChunkedController & DeleteModelProcessor
    to delete models."""

    @staticmethod
    def handle_arguments(getinfo):
        """Check for invalid argument usage and return controller options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """
        if len(getinfo['searchinfo']['args']) != 1:
            raise RuntimeError('Usage: deletemodel <modelname>')

        controller_options = {}
        controller_options['namespace'], controller_options[
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import cexc
from cexc import BaseChunkHandler
from cexc.cexc_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

from util import param_util, command_util
from util.command_util import GeneratingCommand

logger = cexc.get_logger('summary')
messages = cexc.get_messages_logger()


class SummaryCommand(GeneratingCommand):
    """Summary command gets model summaries from ML-SPL models."""

    @staticmethod
    def handle_arguments(getinfo):
        """Catch invalid argument and return controller options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """
        if len(getinfo['searchinfo']['args']) == 0:
            raise RuntimeError('First argument must be a saved model')
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

import cexc
from cexc import BaseChunkHandler
from util import command_util
from util.command_util import GeneratingCommand

logger = cexc.get_logger('list')
messages = cexc.get_messages_logger()


class ListModelsCommand(GeneratingCommand):
    """ListModelsCommand uses the ChunkedController & ListModelsProcessor
    to list saved models."""

    @staticmethod
    def handle_arguments(getinfo):
        """Check for invalid arguments and get controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """
        if len(getinfo['searchinfo']['args']) > 0:
            raise RuntimeError(
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import cexc
from cexc import BaseChunkHandler
from cexc.cexc_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

import conf
from util.param_util import parse_args, is_truthy, parse_namespace_model_name
from util import command_util
from chunked_controller import ChunkedController

logger = cexc.get_logger('apply')
messages = cexc.get_messages_logger()


class ApplyCommand(BaseChunkHandler):
    """ApplyCommand uses the ChunkedController & ApplyProcessor to make predictions."""

    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): options to be sent to controller
        """
        if len(getinfo['searchinfo']['args']) == 0:
def fit(self, df, options):
    # df contains all the search results, including hidden fields,
    # but the requested fields are saved as self.feature_variables
    logger = get_logger('MyCustomLogging')

    X = df.copy()

    # It is always best practice to prepare your data.
    # Splunk has a number of hidden fields that are exposed as part of the search
    # protocol, and we really only want the features that are valid field names.
    # Make sure to turn off get_dummies.
    X, _, self.columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        get_dummies=False,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Check that the user field is present in the prepared data
    logger.debug("The user field is %s", self.user_field)
    try:
        my_list_index = X[self.user_field].values
    except KeyError:
        raise RuntimeError(
            'You must specify a user field that exists. You sent %s' % self.user_field)

    X = X.drop([self.user_field], axis=1)
    my_list_header = X.columns.values

    # Ratings as a matrix: clean that data up!
    X = X.replace([np.inf, -np.inf], "nan").replace("nan", "0")
    matrix = X.values
    # Force the type for NumPy math
    matrix = matrix.astype(np.float64)

    # Should consider erroring out when you have super sparse user data
    # TODO: add other methods via parameter
    user_sim = pairwise_distances(matrix, metric='cosine')
    item_sim = pairwise_distances(matrix.T, metric='cosine')

    # Item-based prediction
    item_sim = matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])

    # User-based prediction
    mean_user_rating = matrix.mean(axis=1)
    matrix_diff = matrix - mean_user_rating[:, np.newaxis]
    user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(matrix_diff) / np.array(
        [np.abs(user_sim).sum(axis=1)]).T

    # Add the header row and user index back onto the result
    if self.rating_type == "item":
        output_df = pd.DataFrame(item_sim, columns=my_list_header, index=my_list_index)
    if self.rating_type == "user":
        output_df = pd.DataFrame(user_sim, columns=my_list_header, index=my_list_index)

    output_df[self.user_field] = pd.Series(my_list_index).values
    return output_df
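
# A note on the user_sim / item_sim matrices in fit() above:
# sklearn.metrics.pairwise_distances with metric='cosine' returns cosine
# *distances* (1 - cosine similarity), so the weights fed into the dot
# products are distances rather than similarities. A minimal standalone check
# of that relationship follows (the toy ratings matrix is an illustrative
# assumption).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

ratings = np.array([[5.0, 3.0, 0.0],
                    [4.0, 0.0, 1.0],
                    [1.0, 1.0, 5.0]])

dist = pairwise_distances(ratings, metric='cosine')   # user-to-user cosine distance
sim = cosine_similarity(ratings)                      # user-to-user cosine similarity
print(np.allclose(dist, 1.0 - sim))                   # True: distance = 1 - similarity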