def filterLowVariance(self, df):
    logger.info('[{}] : [INFO] Checking low variance columns ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    uniques = df.apply(lambda x: x.nunique())
    rm_columns = []
    for uindex, uvalue in uniques.items():  # Series.iteritems() was removed in pandas 2.0
        if uvalue == 1:
            rm_columns.append(uindex)
    logger.info('[{}] : [INFO] Found {} low variance columns, removing ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(rm_columns)))
    logger.debug('[{}] : [DEBUG] Found {} low variance columns: {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(rm_columns), rm_columns))
    df.drop(rm_columns, inplace=True, axis=1)
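# Usage sketch (hypothetical names): columns with a single unique value carry no
# signal for clustering, so they are dropped in place. Assuming `engine` is an
# instance of this class:
#
#     df = pd.DataFrame({'constant': [1, 1, 1], 'metric': [0.1, 0.7, 0.3]})
#     engine.filterLowVariance(df)
#     list(df.columns)  # -> ['metric']; 'constant' had nunique() == 1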
def dict2csv(self, response, query, filename, df=False):
    '''
    :param response: elasticsearch response
    :param query: elasticsearch query
    :param filename: name of file
    :param df: if set to true method returns dataframe and doesn't save to file.
    :return: 0 if saved to file and dataframe if not
    '''
    requiredMetrics = []
    logger.info('[%s] : [INFO] Started response to csv conversion',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    # drill down to the innermost aggregation field name; dict.values() is not
    # subscriptable in Python 3, so each level is materialised with list()
    _lvl = list(query['aggs'].values())[0]
    _lvl = list(_lvl.values())[1]
    _lvl = list(_lvl.values())[0]
    _lvl = list(_lvl.values())[0]
    agg_field = list(_lvl.values())[0]
    for key, value in response['aggregations'].items():
        for k, v in value.items():
            for r in v:
                dictMetrics = {}
                for rKey, rValue in r.items():
                    if rKey == 'doc_count' or rKey == 'key_as_string':
                        pass
                    elif rKey == 'key':
                        logger.debug('[%s] : [DEBUG] Request has keys %s and values %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey, rValue)
                        dictMetrics['key'] = rValue
                    elif agg_field in ('type_instance.raw', 'type_instance'):
                        logger.debug('[%s] : [DEBUG] Detected Memory type aggregation',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                        try:
                            for val in rValue['buckets']:
                                dictMetrics[val['key']] = val['1']['value']
                        except Exception as inst:
                            logger.error('[%s] : [ERROR] Failed to find key with %s and %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey, rValue)
                            sys.exit(1)
                    else:
                        logger.debug('[%s] : [DEBUG] Request has keys %s and flattened values %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey, rValue['value'])
                        dictMetrics[rKey] = rValue['value']
                requiredMetrics.append(dictMetrics)
    csvOut = os.path.join(self.dataDir, filename)
    cheaders = []
    if agg_field in ('type_instance.raw', 'type_instance'):
        logger.debug('[%s] : [DEBUG] Detected Memory type query',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        try:
            cheaders = list(requiredMetrics[0].keys())
        except IndexError:
            logger.error('[%s] : [ERROR] Empty response detected from DMon, stopping detection, check DMon.',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            print("Empty response detected from DMon, stopping detection, check DMon")
            sys.exit(1)
    else:
        kvImp = {}
        for qKey, qValue in query['aggs'].items():
            logger.info('[%s] : [INFO] Value aggs from query %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), qValue['aggs'])
            for v, t in qValue['aggs'].items():
                kvImp[v] = t['avg']['field']
                cheaders.append(v)
        cheaders.append('key')
        for key, value in kvImp.items():
            cheaders[cheaders.index(key)] = value
        for e in requiredMetrics:
            for krep, vrep in kvImp.items():
                e[vrep] = e.pop(krep)
        logger.info('[%s] : [INFO] Dict translator %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(kvImp))
    logger.info('[%s] : [INFO] Headers detected %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cheaders))
    if not df:
        try:
            # csv in Python 3 requires text mode; 'wb' would raise a TypeError
            with open(csvOut, 'w', newline='') as csvfile:
                w = csv.DictWriter(csvfile, cheaders)
                w.writeheader()
                for metrics in requiredMetrics:
                    if set(cheaders) != set(metrics.keys()):
                        logger.error('[%s] : [ERROR] Headers different from required metrics: headers -> %s, metrics -> %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                     str(cheaders), str(list(metrics.keys())))
                        diff = list(set(metrics.keys()) - set(cheaders))
                        print("Headers different from required metrics with %s " % diff)
                        print("Check qInterval setting for all metrics. Try increasing it!")
                        sys.exit(1)
                    w.writerow(metrics)
        except EnvironmentError:
            logger.error('[%s] : [ERROR] File %s could not be created',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), csvOut)
            sys.exit(1)
        logger.info('[%s] : [INFO] Finished csv %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), filename)
        return 0
    else:
        df = pd.DataFrame(requiredMetrics)
        # df.set_index('key', inplace=True)
        logger.info('[%s] : [INFO] Created dataframe',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        return df
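# Illustrative sketch (hypothetical aggregation names '2', '1', '3'): the parser
# above expects an Elasticsearch-style aggregation response where each bucket
# carries a 'key' timestamp plus per-metric single-value aggregations, e.g.:
#
#     response = {'aggregations': {'2': {'buckets': [
#         {'key': 1508168965000, 'doc_count': 10,
#          '1': {'value': 0.35}, '3': {'value': 2048.0}}]}}}
#
# 'doc_count' and 'key_as_string' are skipped, 'key' is kept as the timestamp,
# and every other entry is flattened via its 'value' field. With df=True the
# flattened buckets are returned as a DataFrame instead of written to
# <dataDir>/<filename>.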
def detect(self, method, model, data):
    '''
    :param method: -> method name
    :param model: -> trained clusterer
    :param data: -> dataframe with data
    :return: -> dictionary that contains the list of anomalous timestamps
    '''
    smodel = self.__loadClusterModel(method, model)
    anomalieslist = []
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            if isinstance(smodel, IsolationForest):
                logger.info('[{}] : [INFO] Loading predictive model IsolationForest'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                try:
                    dpredict = smodel.predict(data)
                    logger.debug('[{}] : [DEBUG] IsolationForest prediction array: {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(dpredict)))
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while predicting with IsolationForest model with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                    dpredict = 0
            elif isinstance(smodel, DBSCAN):
                logger.info('[{}] : [INFO] Loading predictive model DBSCAN'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                try:
                    dpredict = smodel.fit_predict(data)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting DBSCAN model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                    dpredict = 0
        else:
            dpredict = 0
            logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                           str(data.shape[0]), str(data.shape[1]))
            print("Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]), str(data.shape[1])))
    print("dpredict type is %s" % (type(dpredict)))
    if type(dpredict) is not int:
        anomalyarray = np.argwhere(dpredict == -1)
        for an in anomalyarray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]].name)
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
            anomalieslist.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomalieslist
    logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
    return anomaliesDict
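# Usage sketch (hypothetical method and model names): detection on a
# timestamp-indexed DataFrame returns a dict of anomalous timestamps:
#
#     result = engine.detect('isoforest', 'iso_model_1', df)
#     # -> {'anomalies': [{'utc': 1508168965, 'hutc': ut2hum(1508168965)}, ...]}
#
# Both IsolationForest.predict and DBSCAN.fit_predict mark anomalies with -1,
# which is what np.argwhere(dpredict == -1) above relies on.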
def __decision_boundary(self, model, data, method, mname, anomaly_label=-1):
    """
    :param model: model to be refitted with 2 features (PCA)
    :param data: dataset after PCA
    :param method: method used for plotting decision boundary
    :param mname: name of the model to be displayed
    :param anomaly_label: label for anomaly instances (differs from method to method)
    """
    logger.info('[{}] : [INFO] Computing PCA with 2 components for decision boundary ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    transformer = PCA(n_components=2)
    transformer.fit(data)
    data = transformer.transform(data)
    # refit the model on the 2D projection
    try:
        # only two features remain, so max_features must override the previous setting
        model.set_params(max_features=data.shape[-1])
    except ValueError:
        logger.debug('[{}] : [DEBUG] Model not affected by max_features parameter, setting encoder and decoder sizes'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        model.set_params(encoder_neurons=[2, 64, 32], decoder_neurons=[32, 64, 2])
    model.fit(data)
    y_pred_outliers = model.predict(data)
    # get anomaly index
    anomaly_index_rf = np.where(y_pred_outliers == anomaly_label)
    # get anomalies based on index
    ano_rf = data[anomaly_index_rf]
    # plot the line, the samples, and the nearest vectors to the plane
    xx, yy = np.meshgrid(np.linspace(-15, 25, 80), np.linspace(-5, 20, 80))
    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.title(f"Decision Boundary for {method} with name {mname}")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
    b1 = plt.scatter(data[:, 0], data[:, 1], c='white', s=20, edgecolor='k')
    c = plt.scatter(ano_rf[:, 0], ano_rf[:, 1], c='red', s=20, edgecolor='k')
    plt.axis('tight')
    plt.xlim((-15, 25))
    plt.ylim((-5, 20))
    plt.legend([b1, c], ["normal", "anomaly"], loc="upper left")
    plot_name = f"Decision_Boundary_{method}_{mname}.png"
    plt.savefig(os.path.join(self.modelDir, plot_name))
    plt.close()
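# Note (assumption about intended use): the meshgrid limits (-15..25, -5..20)
# are hard-coded, so the plot is only meaningful when the PCA projection falls
# inside that window. anomaly_label is typically -1 for sklearn-style detectors
# (IsolationForest) and 1 for PyOD-style detectors (whose set_params accepts
# encoder_neurons/decoder_neurons), which is why dask_clusterMethod below
# derives the marker from np.unique(predictions).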
def dask_clusterMethod(self, cluster_method, mname, data):
    try:
        logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method)))
        # delattr(cluster_method, 'behaviour')
        # del cluster_method.__dict__['behaviour']
        for k, v in cluster_method.get_params().items():
            logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
        try:
            with joblib.parallel_backend('dask'):
                logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            logger.warning('[{}] : [WARN] using default process based backend for user defined method'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf = cluster_method.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(cluster_method), type(inst), inst.args))
        sys.exit(1)
    predictions = clf.predict(data)
    # PyOD-style predictors return {0, 1} (1 = anomaly); sklearn-style return {-1, 1}
    if list(np.unique(predictions)) == [0, 1]:
        anomaly_marker = 1
        normal_marker = 0
    else:
        anomaly_marker = -1
        normal_marker = 1
    logger.info('[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        list(predictions).count(anomaly_marker), len(list(predictions))))
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predictions))
    fname = str(clf).split('(')[0]
    self.__serializemodel(clf, fname, mname)
    self.__plot_feature_sep(data, predictions, method=fname, mname=mname,
                            anomaly_label=anomaly_marker, normal_label=normal_marker)
    self.__decision_boundary(clf, data, method=fname, mname=mname, anomaly_label=anomaly_marker)
    return clf
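# Usage sketch (hypothetical scheduler address and parameters): any estimator
# exposing fit/predict can be trained through the Dask joblib backend, e.g.:
#
#     from dask.distributed import Client
#     from sklearn.ensemble import IsolationForest
#
#     client = Client('tcp://scheduler:8786')  # assumed scheduler address
#     clf = engine.dask_clusterMethod(IsolationForest(contamination=0.1),
#                                     'iso_dask_1', df)
#
# If the Dask backend fails, the method falls back to joblib's default
# process-based (loky) backend, as logged above.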