Example No. 1
 def dask_clusterMethod(self, cluster_method,
                        mname,
                        data
                        ):
     try:
         logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method)))
         # delattr(cluster_method, 'behaviour')
         # del cluster_method.__dict__['behaviour']
         for k, v in cluster_method.get_params().items():
             logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
         try:
             with joblib.parallel_backend('dask'):
                 logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                 clf = cluster_method.fit(data)
         except Exception as inst:
             logger.error('[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
             logger.warning('[{}] : [WARN] Using default process-based backend for user defined method'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
             clf = cluster_method.fit(data)
     except Exception as inst:
         logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method),
             type(inst), inst.args))
         sys.exit(1)
     predictions = clf.predict(data)
     logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predictions))
     fname = str(clf).split('(')[0]
     self.__serializemodel(clf, fname, mname)
     return clf

 def filterLowVariance(self, df):
     logger.info('[{}] : [INFO] Checking low variance columns ...'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
     uniques = df.apply(lambda x: x.nunique())
     rm_columns = []
     # Series.iteritems() was removed in pandas 2.0; items() is the replacement
     for uindex, uvalue in uniques.items():
         if uvalue == 1:
             rm_columns.append(uindex)
     logger.info('[{}] : [INFO] Found {} low variance columns, removing ...'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(rm_columns)))
     logger.debug('[{}] : [DEBUG] Found {} low variance columns: {}'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(rm_columns), rm_columns))
     df.drop(rm_columns, inplace=True, axis=1)
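
The two methods above belong to a larger engine class (the module-level logger and the private __serializemodel helper are defined elsewhere in that class). A minimal usage sketch, assuming a hypothetical wrapper class SDEngine that exposes these methods and a local Dask cluster:

# Hypothetical usage sketch; SDEngine stands in for the class that owns
# dask_clusterMethod and filterLowVariance above.
import numpy as np
import pandas as pd
from dask.distributed import Client
from sklearn.cluster import KMeans

client = Client()  # local Dask cluster; joblib.parallel_backend('dask') picks it up

df = pd.DataFrame(np.random.rand(100, 5), columns=list('abcde'))
df['constant'] = 1  # zero-variance column that filterLowVariance should drop

engine = SDEngine()           # assumed class holding the methods above
engine.filterLowVariance(df)  # drops 'constant' in place
clf = engine.dask_clusterMethod(KMeans(n_clusters=2), 'kmeans_demo', df)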
Example No. 3
    def dict2csv(self, response, query, filename, df=False):
        '''
        :param response: elasticsearch response
        :param query: elasticsearch query
        :param filename: name of file
        :param df: if set to True, the method returns a dataframe and doesn't save to file.
        :return: 0 if saved to file and dataframe if not
        '''
        requiredMetrics = []
        # In Python 3 dict views are not subscriptable, so resolve the nested
        # aggregation field once instead of chaining .values()[i] lookups
        # (relies on dict insertion order, as the original Python 2 code did).
        def _nth_value(d, n=0):
            return list(d.values())[n]
        agg_field = _nth_value(_nth_value(_nth_value(
            _nth_value(_nth_value(query['aggs']), 1))))
        logger.info('[%s] : [INFO] Started response to csv conversion',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        # print "This is the query _------------_-> %s" %query
        # print "This is the response _------------_-> %s" %response
        for key, value in response['aggregations'].items():
            for k, v in value.items():
                for r in v:
                    dictMetrics = {}
                    # print "This is the dictionary ---------> %s " % str(r)
                    for rKey, rValue in r.items():
                        if rKey == 'doc_count' or rKey == 'key_as_string':
                            pass
                        elif rKey == 'key':
                            logger.debug('[%s] : [DEBUG] Request has keys %s and  values %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey, rValue)
                            # print "%s -> %s"% (rKey, rValue)
                            dictMetrics['key'] = rValue
                        elif agg_field in ('type_instance.raw', 'type_instance'):
                            logger.debug('[%s] : [DEBUG] Detected Memory type aggregation', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                            # print "This is  rValue ________________> %s" % str(rValue)
                            # print "Keys of rValue ________________> %s" % str(rValue.keys())
                            try:
                                for val in rValue['buckets']:
                                    dictMetrics[val['key']] = val['1']['value']
                            except Exception as inst:
                                logger.error('[%s] : [ERROR] Failed to find key with %s and %s',
                                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                                sys.exit(1)
                        else:
                            # print "Values -> %s" % rValue
                            # print "rKey -> %s" % rKey
                            # print "This is the rValue ___________> %s " % str(rValue)
                            logger.debug('[%s] : [DEBUG] Request has keys %s and flattened values %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey, rValue['value'])
                            dictMetrics[rKey] = rValue['value']
                    requiredMetrics.append(dictMetrics)
        # print "Required Metrics -> %s" % requiredMetrics
        csvOut = os.path.join(self.dataDir, filename)
        cheaders = []
        if agg_field in ('type_instance.raw', 'type_instance'):
            logger.debug('[%s] : [DEBUG] Detected Memory type query', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            try:
                cheaders = list(requiredMetrics[0].keys())
            except IndexError:
                logger.error('[%s] : [ERROR] Empty response detected from DMon, stopping detection, check DMon.', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                print("Empty response detected from DMon, stopping detection, check DMon")
                sys.exit(1)
        else:
            kvImp = {}

            for qKey, qValue in query['aggs'].items():
                logger.info('[%s] : [INFO] Value aggs from query %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), qValue['aggs'])
                for v, t in qValue['aggs'].items():
                    kvImp[v] = t['avg']['field']
                    cheaders.append(v)

            cheaders.append('key')
            for key, value in kvImp.items():
                cheaders[cheaders.index(key)] = value
            for e in requiredMetrics:
                for krep, vrep in kvImp.items():
                    e[vrep] = e.pop(krep)
            logger.info('[%s] : [INFO] Dict translator %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(kvImp))
        logger.info('[%s] : [INFO] Headers detected %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cheaders))
        if not df:
            try:
                # csv.DictWriter needs a text-mode file in Python 3
                with open(csvOut, 'w', newline='') as csvfile:
                    w = csv.DictWriter(csvfile, cheaders)
                    w.writeheader()
                    for metrics in requiredMetrics:
                        if set(cheaders) != set(metrics.keys()):
                            logger.error('[%s] : [ERROR] Headers different from required metrics: headers -> %s, metrics ->%s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cheaders),
                                         str(list(metrics.keys())))
                            diff = list(set(metrics.keys()) - set(cheaders))
                            print("Headers different from required metrics with %s " % diff)
                            print("Check qInterval setting for all metrics. Try increasing it!")
                            sys.exit(1)
                        w.writerow(metrics)
            except EnvironmentError:
                logger.error('[%s] : [ERROR] File %s could not be created', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), csvOut)
                sys.exit(1)
            logger.info('[%s] : [INFO] Finished csv %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), filename)
            return 0
        else:
            df = pd.DataFrame(requiredMetrics)
            # df.set_index('key', inplace=True)
            logger.info('[%s] : [INFO] Created dataframe',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            return df
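
For orientation, dict2csv consumes Kibana-style aggregation payloads. The shapes below are illustrative only (bucket names and field names are assumptions, not taken from the source); the nested lookup in the code resolves to the avg field name, which is why 'type_instance' signals a memory-type query:

# Illustrative query/response shapes; real DMon/Elasticsearch payloads may differ.
query = {
    'aggs': {
        '2': {
            'date_histogram': {'field': '@timestamp', 'interval': '10s'},
            'aggs': {'1': {'avg': {'field': 'system.load.1'}}},
        },
    },
}
response = {
    'aggregations': {
        '2': {
            'buckets': [
                {'key': 1511874000000, 'key_as_string': '2017-11-28T13:00:00',
                 'doc_count': 12, '1': {'value': 0.42}},
            ],
        },
    },
}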
Example No. 4
    def detect(self, method,
               model,
               data):
        '''
        :param method: -> method name
        :param model: -> trained clusterer
        :param data: -> dataframe with data
        :return: -> dictionary that contains the list of anomalous timestamps
        '''
        smodel = self.__loadClusterModel(method, model)
        anomalieslist = []
        if not smodel:
            dpredict = 0
        else:
            if data.shape[0]:
                if isinstance(smodel, IsolationForest):
                    logger.info('[{}] : [INFO] Loading predictive model IsolationForest'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    for k, v in smodel.get_params().items():
                        logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                    # print("Contamination -> %s" % smodel.contamination)
                    # print("Max_Features -> %s" % smodel.max_features)
                    # print("Max_Samples -> %s" % smodel.max_samples_)
                    # print("Threashold -> %s " % smodel.threshold_)
                    try:
                        dpredict = smodel.predict(data)
                        logger.debug('[{}] : [DEBUG] IsolationForest prediction array: {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(dpredict)))
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while predicting with IsolationForest model with %s and %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                        dpredict = 0

                elif isinstance(smodel, DBSCAN):
                    logger.info('[{}] : [INFO] Loading predictive model DBSCAN'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    for k, v in smodel.get_params().items():
                        logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                    # print("Leaf_zise -> %s" % smodel.leaf_size)
                    # print("Algorithm -> %s" % smodel.algorithm)
                    # print("EPS -> %s" % smodel.eps)
                    # print("Min_Samples -> %s" % smodel.min_samples)
                    # print("N_jobs -> %s" % smodel.n_jobs)
                    try:
                        dpredict = smodel.fit_predict(data)
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting DBSCAN model to event with %s and %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                                     inst.args)
                        dpredict = 0
            else:
                dpredict = 0
                logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
                             str(data.shape[1]))
                print("Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
                             str(data.shape[1])))
            print("dpredict type is %s" % (type(dpredict)))
        if not isinstance(dpredict, int):
            anomalyarray = np.argwhere(dpredict == -1)
            for an in anomalyarray:
                anomalies = {}
                anomalies['utc'] = int(data.iloc[an[0]].name)
                anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
                anomalieslist.append(anomalies)
        anomaliesDict = {}
        anomaliesDict['anomalies'] = anomalieslist
        logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
        return anomaliesDict
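
ut2hum is referenced but not shown in this example; it evidently converts a UTC epoch timestamp into a human-readable string. A plausible stand-in, assuming second-resolution epochs (divide by 1000 first if the index stores milliseconds):

from datetime import datetime

def ut2hum(ut):
    # Assumed helper: epoch seconds -> human-readable UTC timestamp.
    return datetime.utcfromtimestamp(ut).strftime('%Y-%m-%d %H:%M:%S')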
Example No. 5
    def __decision_boundary(
        self,
        model,
        data,
        method,
        mname,
        anomaly_label=-1,
    ):
        """
        :param model: model to be refitted with 2 features (PCA)
        :param data: dataset after PCA
        :param method: method used for plotting decision boundary
        :param mname: name of the model to be displayed
        :param anomaly_label: label for anomaly instances (differs from method to method)
        """
        logger.info(
            '[{}] : [INFO] Computing PCA with 2 components for decision boundary ...'
            .format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        transformer = PCA(n_components=2)
        transformer.fit(data)
        data = transformer.transform(data)
        # print("PCA data shape: {}".format(data.shape))
        # fit model
        try:
            model.set_params(
                max_features=data.shape[-1]
            )  # because we now have only two features we must override the previous setting
        except ValueError:
            logger.debug(
                '[{}] : [DEBUG] Model not affected by max_features parameter, setting encoder and decoder sizes'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            model.set_params(encoder_neurons=[2, 64, 32],
                             decoder_neurons=[32, 64, 2])

        model.fit(data)
        y_pred_outliers = model.predict(data)

        # get anomaly index
        anomaly_index_rf = np.where(y_pred_outliers == anomaly_label)

        # Get anomalies based on index
        ano_rf = data[anomaly_index_rf]
        # plot the line, the samples, and the nearest vectors to the plane
        xx, yy = np.meshgrid(np.linspace(-15, 25, 80), np.linspace(-5, 20, 80))
        Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.title(f"Decision Boundary for {method} with name {mname}")
        plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
        plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
        b1 = plt.scatter(data[:, 0],
                         data[:, 1],
                         c='white',
                         s=20,
                         edgecolor='k')
        c = plt.scatter(ano_rf[:, 0],
                        ano_rf[:, 1],
                        c='red',
                        s=20,
                        edgecolor='k')
        plt.axis('tight')
        plt.xlim((-15, 25))
        plt.ylim((-5, 20))
        plt.legend([b1, c], [
            "normal",
            "anomaly",
        ], loc="upper left")
        plot_name = f"Decision_Boundary_{method}_{mname}.png"
        plt.savefig(os.path.join(self.modelDir, plot_name))
        plt.close()
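
A standalone sketch of the same plotting idea, using IsolationForest (the original hard-codes the axis limits -15..25 and -5..20; the sketch derives them from the data instead, and all parameters here are assumptions):

# Minimal reproduction of the decision-boundary plot shown above.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X2 = PCA(n_components=2).fit_transform(rng.randn(200, 5))  # 2-D projection

model = IsolationForest(random_state=0).fit(X2)
pred = model.predict(X2)  # -1 marks anomalies for IsolationForest

xx, yy = np.meshgrid(np.linspace(X2[:, 0].min() - 1, X2[:, 0].max() + 1, 80),
                     np.linspace(X2[:, 1].min() - 1, X2[:, 1].max() + 1, 80))
Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
plt.scatter(X2[:, 0], X2[:, 1], c='white', s=20, edgecolor='k')
plt.scatter(X2[pred == -1, 0], X2[pred == -1, 1], c='red', s=20, edgecolor='k')
plt.show()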
Example No. 6
    def dask_clusterMethod(self, cluster_method, mname, data):
        try:
            logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                type(cluster_method)))
            # delattr(cluster_method, 'behaviour')
            # del cluster_method.__dict__['behaviour']
            for k, v in cluster_method.get_params().items():
                logger.info(
                    '[{}] : [INFO] Method parameter {} set to {}'.format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
            try:
                with joblib.parallel_backend('dask'):
                    logger.info(
                        '[{}] : [INFO] Using Dask backend for user defined method'
                        .format(
                            datetime.fromtimestamp(
                                time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    clf = cluster_method.fit(data)
            except Exception as inst:
                logger.error(
                    '[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        type(inst), inst.args))
                logger.warning(
                    '[{}] : [WARN] Using default process-based backend for user defined method'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    type(cluster_method), type(inst), inst.args))
            sys.exit(1)
        predictions = clf.predict(data)
        if list(np.unique(predictions)) == [0, 1]:
            anomaly_marker = 1
            normal_marker = 0
        else:
            anomaly_marker = -1
            normal_marker = 1
        logger.info(
            '[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'
            .format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                list(predictions).count(anomaly_marker),
                len(list(predictions))))
        logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            predictions))
        fname = str(clf).split('(')[0]
        self.__serializemodel(clf, fname, mname)
        self.__plot_feature_sep(data,
                                predictions,
                                method=fname,
                                mname=mname,
                                anomaly_label=anomaly_marker,
                                normal_label=normal_marker)
        self.__decision_boundary(clf,
                                 data,
                                 method=fname,
                                 mname=mname,
                                 anomaly_label=anomaly_marker)

        return clf
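
The unique-label check above exists because libraries disagree on anomaly labels: scikit-learn outlier detectors return -1 for anomalies and 1 for inliers, while PyOD-style detectors return 1 for anomalies and 0 for inliers. A compact sketch of the same normalization (using a subset check so an all-normal 0/1 batch is also handled):

import numpy as np

def anomaly_markers(predictions):
    # PyOD convention: anomaly=1, normal=0; scikit-learn: anomaly=-1, normal=1.
    if set(np.unique(predictions)) <= {0, 1}:
        return 1, 0  # (anomaly_marker, normal_marker)
    return -1, 1

print(anomaly_markers(np.array([0, 1, 0, 0])))   # -> (1, 0)
print(anomaly_markers(np.array([1, -1, 1, 1])))  # -> (-1, 1)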