Example #1
    def params_checker(self):
        '''
        Checks every supplied parameter against its expected type and
        returns the corresponding error message when a mismatch is encountered.
        It also checks that the probability threshold 'pthres' lies between 0 and 1.
        '''
        error_codes1 = error_codes()
        kwargs = self.kwargs
        algo_params_type = self.ideal_args_type

        for key in kwargs:
            if key == 'pthres':
                # 'pthres' must be an int or float strictly between 0 and 1
                value = kwargs['pthres']
                if isinstance(value, (int, float)) and 0 < value < 1:
                    continue
                error_codes1['param']['data']['argument'] = 'pthres'
                error_codes1['param']['data']['value'] = value
                error_codes1['param']['message'] = ('probability must be between 0 and 1 '
                                                    'and it must be of type int or float')
                return error_codes1['param']

            # generic check against the ideal argument type for this key
            if kwargs[key] is not None and type(kwargs[key]) != algo_params_type[key]:
                error_codes1['param']['data']['argument'] = key
                error_codes1['param']['data']['value'] = kwargs[key]
                error_codes1['param']['message'] = 'should be of type {}'.format(algo_params_type[key])
                return error_codes1['param']
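A minimal usage sketch, assuming the Type_checker class that owns params_checker is available through the type_checker module (as in the train()/evaluate() wrappers further down); the argument values are illustrative:

# Hypothetical illustration: the kwargs and ideal_args_type values are made up.
checker = type_checker.Type_checker(
    kwargs={'pthres': 1.5, 'epochs': 4},
    ideal_args_type={'pthres': float, 'epochs': int})
res = checker.params_checker()
if res is not None:
    print(res['message'])   # e.g. 'probability must be between 0 and 1 ...'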
Example #2
    def read(self):

        try:
            response_data = pd.read_csv(self.filepath)

        except Exception as e:
            error_codes1 = error_codes()
            error_codes1['param']['message'] = '{},{}'.format(
                str(e), str(self.filepath))
            return error_codes1['param']

        print("Getting the dataset from the reader....\n")
        entire_data = self.parse_dict_to_dataframe(response_data)

        return entire_data
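A short usage sketch, assuming this read() method belongs to the Data_reader class used by the wrappers below; the file path is a placeholder:

# Illustrative only: 'input.csv' is a placeholder path.
reader = Data_reader(filepath='input.csv')
entire_data = reader.read()
if isinstance(entire_data, dict):
    # read() returned the error dictionary (error_codes()['param'])
    print(entire_data['message'])
else:
    # otherwise it is the parsed dataset returned by parse_dict_to_dataframe
    print(type(entire_data))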
Example #3
def save_model(
        model,
        metric_names,
        assetno,
        filename='som_trained_model',
        target_dir="../../Anomaly_Detection_Models/Machine_Learning_Models"):
    '''
    Function to save the model in the given relative path using pickle
    Arguments:
    Required Params:
        model : MODEL's class object which contains all model related info like weights and architecture
        metric_names : list of metric names used to form the filename for saving the model
        assetno : asset number, also used to form the filename
    Optional Params:
        filename : Default -> 'som_trained_model' (main part of the filename)
        target_dir : relative path to the target directory
    '''

    error_codes1 = error_codes()
    try:
        time_now = ts_to_unix(pd.to_datetime(dt.datetime.now()))
        metric_names = [
            ''.join(e for e in metric if e.isalnum())
            for metric in metric_names
        ]

        # Creating the filename with metricnames and assetno and current time
        filename = filename + '_{}_{}_{}'.format('_'.join(metric_names),
                                                 str(assetno), str(time_now))

        filepath = os.path.join(target_dir, filename)

        if (len(filepath) > 150):
            filepath = filepath[:150]

        # write the pickled model; 'with' guarantees the file handle is closed
        with open(filepath, 'wb') as filehandler:
            pickle.dump(model, filehandler)
        print("\nSaved model : {} in {},\nLast Checkpointed at: {}\n".format(
            filename, target_dir, time_now))
        return filepath

    except Exception as e:
        traceback.print_exc()
        print("Error occurred while saving model\n")
        error_codes1['unknown']['message'] = str(e)
        return error_codes1['unknown']
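A companion load sketch; load_model below is a hypothetical helper (not part of the original listing) showing how the pickled file written by save_model would be read back:

import pickle

def load_model(filepath):
    # hypothetical counterpart of save_model: restores the pickled model object
    with open(filepath, 'rb') as filehandler:
        return pickle.load(filehandler)

# usage (illustrative): model = load_model(save_model(model, ['temperature'], assetno='A1'))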
Example #4
def make_ack_json(anomaly_detectors):
    '''
    Function to make the acknowledgement output json.
    Arguments : list of anomaly detector objects, each holding info such as the anomaly indexes per metric per asset
    Returns   : dictionary of the acknowledgement json
    Logic     : The function builds the output json for two cases, univariate and multivariate, separately.
                If it is univariate, each anomaly detector object holds anomaly info for only one metric of an asset,
                so to build the output json we combine all the anomaly detector objects per asset and write them
                together under that asset. We split the total list of anomaly detectors into groups by assetno and
                then loop over the groups. For multivariate, each anomaly detector holds info about all metrics of
                an asset, so we just loop over the list and build the output json.
    Note      : The function also adds a feature called anom_counts under each asset json to indicate the number of
                anomalies detected for each metric in an asset, which can be used to check the no-anomaly case.
    '''

    bad_response = {
        "code": "204",
        "status": "No Content",
        "message": "Input Data is Empty"
    }
    no_anom_response = {
        "code": "200",
        "status": "OK",
        "message": "No Anomalies detected"
    }

    ack_json = lambda: {"header": '', "body": []}
    anom_per_asset = lambda: {
        "asset": "<asset_serial_number>",
        "anomalies": []
    }
    anom_per_metric = lambda: {"name": "<TagName>", "datapoints": []}
    Datapoint_keys = [
        'from_timestamp', 'to_timestamp', 'anomaly_timestamp', 'anomaly_code'
    ]

    ack_json1 = ack_json()
    overall_zero_anoms = 0
    zero_anomalies = 0
    total_anom_detectors = 0
    error_codes1 = error_codes()
    if (anomaly_detectors[0].algo_type == 'univariate'):

        no_assets = pd.unique([
            anomaly_detector.assetno for anomaly_detector in anomaly_detectors
        ]).size
        anomaly_detectors_per_asset = np.split(np.array(anomaly_detectors),
                                               no_assets)

        for i in range(no_assets):

            anom_per_asset1 = anom_per_asset()

            no_zero_anoms = 0

            for anomaly_detector in anomaly_detectors_per_asset[i]:

                data = anomaly_detector.data
                anom_indexes = anomaly_detector.anom_indexes

                if (len(data[anomaly_detector.metric_name]) != 0):
                    total_anom_detectors += 1
                    if (len(anom_indexes) != 0):
                        anom_per_asset1['asset'] = anomaly_detector.assetno
                        anom_per_metric1 = anom_per_metric()
                        anom_per_metric1['name'] = anomaly_detector.metric_name
                        anom_timestamps = data.index[anom_indexes].values
                        # .item() converts numpy scalars to native Python types
                        # (np.asscalar is deprecated in recent NumPy versions)
                        anom_timestamps = [t.item() for t in anom_timestamps]

                        anom_per_metric1['datapoints'] = [
                            dict(
                                list(
                                    zip(Datapoint_keys, [
                                        t, t, [t], anomaly_detector.algo_code
                                    ]))) for t in anom_timestamps
                        ]

                        anom_per_asset1['anomalies'].append(anom_per_metric1)
                        ack_json1['header'] = error_codes1['success']
                    else:
                        overall_zero_anoms += 1
                        no_zero_anoms += 1
                        zero_anomalies += 1
                else:
                    ack_json1['header'] = bad_response
                    ack_json1['body'] = []
                    return ack_json1

            if (no_zero_anoms != len(anomaly_detectors_per_asset[i])):
                ack_json1['body'].append(anom_per_asset1)

    else:

        for anomaly_detector in anomaly_detectors:

            data = anomaly_detector.data
            anom_indexes = anomaly_detector.anom_indexes
            if (len(data) != 0):
                total_anom_detectors += 1
                if (len(anom_indexes) == 0):
                    # no anomalies found by this detector
                    overall_zero_anoms += 1
                    zero_anomalies += 1
                else:
                    ack_json1['header'] = error_codes1['success']
                    anom_per_asset1 = anom_per_asset()
                    anom_per_asset1['asset'] = anomaly_detector.assetno

                    metric_names = anomaly_detector.metric_name

                    for metric_name in metric_names:

                        anom_per_metric1 = anom_per_metric()
                        anom_per_metric1['name'] = metric_name
                        anom_timestamps = data.index[anom_indexes].values
                        # .item() converts numpy scalars to native Python types
                        anom_timestamps = [t.item() for t in anom_timestamps]

                        anom_per_metric1['datapoints'] = [
                            dict(
                                list(
                                    zip(Datapoint_keys, [
                                        t, t, [t], anomaly_detector.algo_code
                                    ]))) for t in anom_timestamps
                        ]
                        anom_per_asset1['anomalies'].append(anom_per_metric1)

                    ack_json1['body'].append(anom_per_asset1)
            else:
                ack_json1['header'] = bad_response
                ack_json1['body'] = []
                return ack_json1

    if (overall_zero_anoms == total_anom_detectors):
        ack_json1['header'] = no_anom_response
        ack_json1['body'] = []

    return ack_json1
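For reference, the acknowledgement dictionary assembled above has roughly the following shape on the success path; the timestamps, asset and tag names are illustrative placeholders and the exact 'header' contents come from error_codes()['success']:

example_ack = {
    "header": {"code": "200", "status": "OK", "message": "Success"},  # assumed success header
    "body": [
        {
            "asset": "asset_serial_number",
            "anomalies": [
                {
                    "name": "TagName",
                    "datapoints": [
                        {
                            "from_timestamp": 1554109200,
                            "to_timestamp": 1554109200,
                            "anomaly_timestamp": [1554109200],
                            "anomaly_code": "algo_code"
                        }
                    ]
                }
            ]
        }
    ]
}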
Example #5
def train(filepath,
          network_shape=None,
          input_feature_size=None,
          time_constant=None,
          minNumPerBmu=2,
          no_of_neighbours=10,
          init_radius=None,
          init_learning_rate=0.01,
          N=100,
          diff_order=1,
          is_train=True,
          epochs=4,
          batch_size=4,
          to_plot=True,
          test_frac=0.5):
    '''
        Wrapper function which should be called in order to run the anomaly detection. It has four parts:
        *reader           - Class Data_reader defined in data_handler.py, which takes in the json string, parses the json
                            and gives a list of dataframes
        *preprocessor     - preprocessors are defined in preprocessors.py, which take in data and give out processed
                            data
        *anomaly detector - Class Som_Detector defined in som_knn_detector.py, which takes in
                            data and algorithm parameters as arguments, trains and saves the model, and returns
                            the file path where the model was saved
        *make_acknowledgement_json - function to make the acknowledgement json, imported from make_ackg_json.py


        Arguments :
        Required Parameter:
            filepath: The json object in the format of the input json given from the reader api

        Optional Parameters
                network_shape : (Type : tuple (x,y)) where x is the no of rows of the grid of the neuron layer and y is the no of columns of the grid, so the total no of neurons in a single layer is (x*y)
                    Default: (8,8)
                input_feature_size: positive integer, the no of features in the input data for which anomalies are to be detected
                    Default: the no of metrics given as input. For example, for two metrics the feature size is taken as 2, since this is a multivariate algorithm
                    Customised input : give the no of features to be extracted per metric (yet to do)
                    Note: do not give unrelated metrics together in the input data, since all metrics are analysed together, i.e. multivariate

                time_constant: positive float, exponential decay factor to decrease the neighbourhood radius around the BMU
                    Default: n_iterations/log(init_radius), calculated in the program

                minNumPerBmu: positive integer, the minimum no of BMU hits for a neuron; used to minimise the effect of noise in the data
                    Default : 2
                no_of_neighbours: positive integer, the no of neighbours for the KNN algorithm
                    Default: 10
                init_radius : positive float, initial radius to find the group of neurons around each BMU
                    Default: 0.4 (passed as None and resolved in the program)
                init_learning_rate : positive float, the learning rate for the algorithm
                    Default  : 0.01
                diff_order : positive integer, the order of differencing to be applied to the raw data
                    Default : 1
                    Note : use 1 or more for mean-shift datasets
                epochs: positive integer, no of epochs to train
                    Default: 4
                batch_size : positive integer, no of samples in the data to be processed simultaneously
                    Default: 4
                test_frac: positive float, ratio of test to train data
                    Default : 0.5
                to_plot : boolean, give True to see the plots of the change-points detected and False if there is no need for plotting
                    Default : True
        '''

    #algorithm arguments

    model_input_args = {
        'som_shape': network_shape,
        # pass the user-supplied values through; both default to None and are
        # resolved downstream when not given
        'input_feature_size': input_feature_size,
        'time_constant': time_constant,
        'minNumPerBmu': minNumPerBmu,
        'no_of_neighbors': no_of_neighbours,
        'initial_radius': init_radius,
        'initial_learning_rate': init_learning_rate,
        'n_iterations': None,
        'N': N,
        'diff_order': diff_order
    }

    #Training arguments
    training_args = {
        'is_train': is_train,
        'epochs': epochs,
        'batch_size': batch_size,
        'to_plot': to_plot,
        'test_frac': test_frac
    }

    #merging all algo arguments for params checking
    algo_kwargs = {**model_input_args, **training_args}
    # instantiating the error_codes to avoid overwriting:
    # error_codes is a python module imported as error_codes, which has the error_codes
    # dictionary mapping for different kinds of errors and a reset function to reset them.

    error_codes1 = error_codes()

    try:

        # type_checker is python file which has Type_checker class which checks given parameter types
        checker = type_checker.Type_checker(
            kwargs=algo_kwargs, ideal_args_type=ideal_train_kwargs_type)
        # res is None when no error raised, otherwise it stores the appropriate error message
        res = checker.params_checker()
        if res is not None:
            return json.dumps(res)

        # instantiating the reader class with reader arguments
        data_reader = Data_reader(filepath=filepath)
        #getting list of dataframes per asset if not empty
        #otherwise gives string 'Empty Dataframe'
        entire_data = data_reader.read()

        writer_data = []
        anomaly_detectors = []

        # check for None first to avoid a TypeError on len(None)
        if (entire_data is not None and type(entire_data) != dict
                and len(entire_data) != 0):
            '''
                looping over the data per assets and inside that looping over metrics per asset
                * Instantiates anomaly detector class with algo args and metric index to detect on
                * Stores the anomaly indexes and anomaly detector object to bulk write to db at once
                '''

            # Output format for training the model
            # models: a list of file paths where the trained models were saved
            out_json = {'header': '', 'models': []}

            for i, data_per_asset in enumerate(entire_data):
                if (len(data_per_asset) != 0):
                    assetno = pd.unique(data_per_asset['assetno'])[0]
                    data_per_asset[
                        data_per_asset.columns[1:]] = normalise_standardise(
                            data_per_asset[data_per_asset.columns[1:]])

                    print("Data of Asset no: {} \n {}\n".format(
                        assetno, data_per_asset.head()))
                    cols = list(data_per_asset.columns[1:])

                    anomaly_detector = som_detector.Som_Detector(
                        data=data_per_asset,
                        model_input_args=model_input_args,
                        training_args=training_args,
                        eval_args=None)

                    model_path = (anomaly_detector.detect_anomalies())

                    model = {anomaly_detector.assetno: model_path}
                    '''
                        TODO : Add code for saving the model into database here 
                        '''

                    out_json['models'].append(model)
            out_json['header'] = error_codes1['success']

            return json.dumps(out_json)
        elif (type(entire_data) == dict):
            return json.dumps(entire_data)
        else:
            '''
                Data empty error
                '''
            return json.dumps(error_codes1['data_missing'])
    except Exception as e:
        '''
            unknown exceptions are caught here and traceback used to know the source of the error
            '''
        traceback.print_exc()
        error_codes1['unknown']['message'] = str(e)
        return json.dumps(error_codes1['unknown'])
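A hedged usage sketch for train(); the file path and parameter values are placeholders, and the json string returned above is decoded to pick up the saved model path:

import json

# Illustrative only: 'train_input.csv' is a placeholder path.
result = json.loads(train(filepath='train_input.csv',
                          network_shape=(8, 8),
                          epochs=4,
                          to_plot=False))
if result.get('models'):
    # each entry maps an asset number to the file path of its saved model
    model_path = list(result['models'][0].values())[0]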
Example #6
def evaluate(filepath, model_path, to_plot=True, anom_thres=3.0):
    '''
        Wrapper function which should be called in order to run the anomaly detection. It has four parts:
        *reader           - Class Data_reader defined in data_handler.py, which takes in the filepath in string format
                            and parses a big dataframe into a list of dataframes per asset
        *preprocessor     - preprocessors are defined in preprocessors.py, which take in data and give out processed
                            data
        *anomaly detector - Class Som_Detector defined in som_knn_detector.py, which takes in
                            data and algorithm parameters as arguments, evaluates the data using the chosen model
                            and returns anomaly indexes.
        *make_acknowledgement_json - function to make the acknowledgement json, imported from make_ackg_json.py


        Arguments :

        Required Parameters:
            filepath: The json object in the format of the input json given from the reader api to evaluate the model
            model_path : saved model file path in string format
        Optional Parameters:
            anom_thres : (Type : positive float) anomaly threshold, applied to the anomaly scores estimated using the K nearest neighbours of the BMU of each input test sample
                Default: 3.0
            to_plot : boolean, give True to see the plots of the change-points detected and False if there is no need for plotting
                Default : True

        '''

    eval_args = {
        'model_path': model_path,
        'to_plot': to_plot,
        'anom_thres': anom_thres
    }
    # instantiating the error_codes to avoid overwriting:
    # error_codes is a python module imported as error_codes, which has the error_codes
    # dictionary mapping for different kinds of errors and a reset function to reset them.
    error_codes1 = error_codes()

    try:

        # type_checker is python file which has Type_checker class which checks given parameter types
        checker = type_checker.Type_checker(
            kwargs=eval_args, ideal_args_type=ideal_eval_kwargs_type)
        # res is None when no error raised, otherwise it stores the appropriate error message
        res = checker.params_checker()
        if res is not None:
            return json.dumps(res)

        # instantiating the reader class with reader arguments
        data_reader = Data_reader(filepath=filepath)
        #getting list of dataframes per asset if not empty
        #otherwise gives string 'Empty Dataframe'
        entire_data = data_reader.read()

        writer_data = []
        anomaly_detectors = []

        # check for None first to avoid a TypeError on len(None)
        if (entire_data is not None and type(entire_data) != dict
                and len(entire_data) != 0):
            '''
                looping over the data per assets and inside that looping over metrics per asset
                * Instantiates anomaly detector class with algo args and metric index to detect on
                * Stores the anomaly indexes and anomaly detector object to bulk write to db at once
                '''

            for i, data_per_asset in enumerate(entire_data):
                if (len(data_per_asset) != 0):
                    assetno = pd.unique(data_per_asset['assetno'])[0]
                    data_per_asset[
                        data_per_asset.columns[1:]] = normalise_standardise(
                            data_per_asset[data_per_asset.columns[1:]])

                    print("Data of Asset no: {} \n {}\n".format(
                        assetno, data_per_asset.head()))

                    # NOTE: model_input_args is not defined in this function;
                    # passing None assumes the saved model at model_path already
                    # carries the architecture and parameters.
                    anomaly_detector = som_detector.Som_Detector(
                        data=data_per_asset,
                        model_input_args=None,
                        training_args=None,
                        eval_args=eval_args)

                    anom_indexes = anomaly_detector.detect_anomalies()
                    anomaly_detectors.append(anomaly_detector)

            ack_json = make_ackg_json.make_ack_json(anomaly_detectors)

            return json.dumps(ack_json)

        else:
            '''
                Data empty error
                '''
            return json.dumps(error_codes1['data_missing'])
    except Exception as e:
        '''
            unknown exceptions are caught here and traceback used to know the source of the error
            '''
        traceback.print_exc()
        error_codes1['unknown']['message'] = str(e)
        return json.dumps(error_codes1['unknown'])
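A hedged usage sketch for evaluate(); the paths are placeholders and model_path would be the file path string produced by train()/save_model() above:

import json

# Illustrative only: both paths are placeholders.
ack = json.loads(evaluate(filepath='test_input.csv',
                          model_path='path/to/som_trained_model',
                          to_plot=False,
                          anom_thres=3.0))
print(ack['header'])
for asset in ack.get('body', []):
    print(asset['asset'], [m['name'] for m in asset['anomalies']])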