def main():
    # 1) TWO BODIES (TBs)
    trainTBs = twoBBdf(path=train_path, dict=TB_dict)
    testTBs = twoBBdf(path=test_path, dict=TB_dict)
    trainpromisingTBs, testpromisingTBs = firstStage(
        train_TBs=trainTBs, test_TBs=testTBs, threshold=0.16, random_seed=42)

    # 2) EXTRA TRACKS (ETs)
    trainETs = twoBBdf(path=train_path, dict=ET_dict, specific_TBs=trainpromisingTBs.index)
    testETs = twoBBdf(path=test_path, dict=ET_dict, specific_TBs=testpromisingTBs.index)
    trainpromisingETs, testpromisingETs = secondStage(
        train_ETs=trainETs, test_ETs=testETs, threshold=0.5, random_seed=42)
    trainETs.specific_ETs = trainpromisingETs.index
    testETs.specific_ETs = testpromisingETs.index

    # 2.5) Combine TBs and ETs and apply the LOF calculation
    trainTAG_df = combine(TB_COM_df=LOF(trainTBs), ET_COM_df=LOF(trainETs))
    testTAG_df = combine(TB_COM_df=LOF(testTBs), ET_COM_df=LOF(testETs))
    trainTAG_df.to_csv('trainTAG_df.csv')
    testTAG_df.to_csv('testTAG_df.csv')

    # 3) LOF, combine TBs + ETs, then feed into the tagger
    TAGs = thirdStage(
        train_TAG_df=trainTAG_df, test_TAG_df=testTAG_df,
        train_TB_scores=trainpromisingTBs, test_TB_scores=testpromisingTBs,
        train_path=train_path, test_path=test_path, random_seed=42)
    return TAGs
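# The staged helpers (twoBBdf, firstStage, secondStage, thirdStage) are
# defined elsewhere. A minimal sketch of the selection pattern that
# firstStage and secondStage appear to follow: train a classifier on the
# training candidates, score both sets, and keep rows whose signal
# probability clears the threshold. The classifier choice and the 'label'
# column are illustrative assumptions, not the project's implementation.
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

def select_promising(train_df, test_df, threshold, random_seed):
    # hypothetical sketch: assumes a boolean 'label' column plus numeric features
    features = [c for c in train_df.columns if c != 'label']
    clf = GradientBoostingClassifier(random_state=random_seed)
    clf.fit(train_df[features], train_df['label'])
    train_scores = pd.Series(clf.predict_proba(train_df[features])[:, 1],
                             index=train_df.index)
    test_scores = pd.Series(clf.predict_proba(test_df[features])[:, 1],
                            index=test_df.index)
    # the returned Series keep their index, matching the .index usage above
    return (train_scores[train_scores > threshold],
            test_scores[test_scores > threshold])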
def lof_calculation():
    # try/except statement for error handling
    try:
        # start calculating, since no previous response is cached to the file system
        # check that the parameters exist before calculating
        # get the x axis (used for plotting only, not in the calculation)
        if responsee["x"]:
            x = responsee["x"]
        # get the y axis (used for plotting only, not in the calculation)
        if responsee["y"]:
            y = responsee["y"]
        print(x)
        print(y)

        """Retrieving feature types"""
        if responsee["result"]["fields"]:
            fields = responsee["result"]["fields"]
        # log the feature-type map
        logger.debug('fields : ' + json.dumps(fields))

        # initialize global variables that hold the features by data type
        # categorical features list
        global categorical_features_list
        categorical_features_list = []
        # numeric features list
        global numaric_features_list
        numaric_features_list = []
        # integer features list
        global int_features_list
        int_features_list = []

        # consume the provided features, or auto-detect them if none are provided
        # check whether the variable exists
        if "analysisFeatures" in responsee:
            features = responsee["analysisFeatures"]
            # check whether it contains values
            if len(features) != 0:
                for feature in features:
                    if feature:
                        print("feature is: " + feature)
                        for field in fields:
                            if field["id"] == feature:
                                if field["type"] == "text":
                                    categorical_features_list.append(field["id"])
                                if field["type"] == "int4":
                                    # exclude the ID from the features
                                    if field["id"] != "_id":
                                        int_features_list.append(field["id"])
                                if field["type"] == "numeric":
                                    numaric_features_list.append(field["id"])
            # auto-detect the features, since none were provided
            else:
                automatic_detection_of_features(fields)
        # auto-detect the features, since none were provided
        else:
            automatic_detection_of_features(fields)

        # log the detected features
        logger.debug('features considered for LOF analysis are : ')
        logger.debug(categorical_features_list)
        logger.debug(numaric_features_list)
        logger.debug(int_features_list)

        """Creating a global pre-analysis data set (with _ids) and a
        pre-response data set (which keeps track of the original data)"""
        # initialize global variables
        global analysis_ready_list
        analysis_ready_list = []
        global response_ready_full_list
        response_ready_full_list = []

        # populate the global variables
        """DATA VALIDATION, TRANSFORMATION, AND HANDLING OF DUPLICATES AND NULLS"""
        for rec in records:
            analysis_ready_list.append((rec["_id"], ))
            response_ready_full_list.append((rec["_id"], ))
        if len(categorical_features_list) > 0:
            textual_to_numarical_categorical(records, categorical_features_list)
        if len(int_features_list) > 0:
            numeric_to_float(records, int_features_list)
        if len(numaric_features_list) > 0:
            numeric_to_float(records, numaric_features_list)

        # log the accommodation results
        logger.debug('analysis_ready_list : ' + json.dumps(analysis_ready_list))
        logger.debug('response_ready_full_list : ' + json.dumps(response_ready_full_list))

        """real_analysis_ready_data after removing _id, since the ID is not
        used for the analysis"""
        # REMOVING-IDS STAGE: create the analysis data set
        real_analysis_ready_data = [tuple(cols[1:]) for cols in analysis_ready_list]
        # log the data going into the analysis
        logger.debug('analysis_ready_list : ' + json.dumps(analysis_ready_list))
        logger.debug('real_analysis_ready_data: ' + json.dumps(real_analysis_ready_data))

        """Preparing the LOF model"""
        # pass the data to the LOF model
        lof = LOF(real_analysis_ready_data)
        # prepare the LOF anomaly response list
        response_results_list = []

        """Using the LOF model"""
        # loop over the values (tuples of readings) to get each one's
        # anomaly score against the model
        i = 0
        for instance in real_analysis_ready_data:
            # 10 is the number of neighbours considered for the calculation
            value = lof.local_outlier_factor(10, instance)
            """
            # Sending only outliers
            # if (value > 1):
            """
            # tag each reading as normal, local, or global based on its anomaly score
            if i < len(records):
                if value <= 1:
                    response_results_list.append(
                        (value, records[i][x], records[i][y], "normal"))
                if value > 1 and value <= 2:
                    response_results_list.append(
                        (value, records[i][x], records[i][y], "local"))
                if value > 2:
                    response_results_list.append(
                        (value, records[i][x], records[i][y], "global"))
            i = i + 1

        # initialize and prepare the response
        response_data = {}
        response_data['success'] = True
        response_data['result'] = []
        # add the anomaly result messages
        for record in response_results_list:
            response_data['result'].append(record)
        # add the service messages
        response_data['messages'] = messages
        # cast the response map to JSON
        json_response = json.dumps(response_data)
        # log the response sent to the service client
        logger.debug('response : ' + json.dumps(response_data))
        # store the new response to the file system
        store_responses_to_file_system("lof", json_response)

    # in case an error was raised
    except Exception as e:
        # log the error
        logger.error(e)
        # add the error to the response message
        # (format_exc() returns the trace as a string; print_exc() returns None)
        json_response = error_resonse(traceback.format_exc())
        # print the full error trace
        traceback.print_exc()
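# For reference, the same normal/local/global tagging can be reproduced
# with scikit-learn's LocalOutlierFactor as a stand-in for the LOF class
# used above (an assumption: the original is a different implementation).
# scikit-learn exposes the negated LOF score via negative_outlier_factor_,
# so the sign is flipped back before thresholding.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def tag_lof_scores(data, n_neighbors=10):
    """Tag each point as normal (LOF <= 1), local (1 < LOF <= 2),
    or global (LOF > 2), mirroring the thresholds used above."""
    clf = LocalOutlierFactor(n_neighbors=n_neighbors)
    clf.fit(data)
    scores = -clf.negative_outlier_factor_  # undo sklearn's sign convention
    tags = []
    for value in scores:
        if value <= 1:
            tags.append((value, "normal"))
        elif value <= 2:
            tags.append((value, "local"))
        else:
            tags.append((value, "global"))
    return tags

# usage sketch: tag_lof_scores(np.random.randn(50, 2))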
# (inside a loop over candidate k values: find the k closest to len(rc))
value = (k - len(rc)) ** 2
if value < min_value:
    min_value = value
    k_min = k

# k_new = int((k_min + len(rc)) / 2)
k_new = len(rc)
if k_new not in kList:
    kList.append(k_new)
    kList.sort()
# print("New klist is \n", kList)

while True:
    print("New instances is \n", instances)
    lof = LOF(instances)
    M = [[] for i in range(len(instances))]
    pre_c1 = []
    kinf = []
    for i in range(len(instances)):
        pre_c1.append(0)
        kinf.append(0)
    # drop any k values larger than the number of remaining instances
    for k in kList:
        if k > len(instances):
            kList = kList[0:kList.index(k)]
            break
    if len(instances) == 1 and len(kList) == 0:
        kList.append(1)
    print("kList is", kList)
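# The fragment above maintains a sorted, duplicate-free list of
# neighbourhood sizes: it scans for the existing k closest (in squared
# distance) to the current cluster size len(rc), then inserts len(rc)
# itself as a new candidate k. A self-contained sketch of that update,
# with hypothetical names:
def update_k_list(k_list, cluster_size):
    """Insert cluster_size as a new neighbourhood size, keeping the list
    sorted and free of duplicates (mirrors the kList maintenance above)."""
    k_min, min_value = None, float('inf')
    for k in k_list:
        value = (k - cluster_size) ** 2
        if value < min_value:
            min_value, k_min = value, k
    k_new = cluster_size  # the fragment uses the cluster size directly
    if k_new not in k_list:
        k_list.append(k_new)
        k_list.sort()
    return k_list

# usage sketch: update_k_list([5, 10, 20], 12) -> [5, 10, 12, 20]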
import numpy as np
import matplotlib.pyplot as plt

# Generate some inliers (assumed: two Gaussian clusters, 200 points in
# total, to match the y = np.zeros(200) labels below)
X_inliers = 0.3 * np.random.randn(100, 2)
X_inliers = np.r_[X_inliers + 2, X_inliers - 2]

# Generate some outliers
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X_inliers, X_outliers]
n_outliers = len(X_outliers)
ground_truth = np.ones(len(X), dtype=int)
ground_truth[-n_outliers:] = -1
y = np.zeros(200, dtype=int)
y_outlier = np.ones(20, dtype=int)
y = np.append(y, y_outlier)

# use my class
lof = LOF()
coef = lof.fit_predict(X)
# normalise the outlier scores to [0, 1]
coef = (coef - coef.min()) / (coef.max() - coef.min())
# print(coef)

from PyDBOD.base import Base
'''
probedata = clf.fit_predict(data)
print(clf.threshold_)
'''

color = np.array(['k', 'b'])
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color=color[y], s=3., label='Data points')
# plot circles with radius proportional to the outlier scores
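# A plausible completion of the final plotting step, following the standard
# scikit-learn LOF demo: the circle radii are scaled from the normalised
# scores in `coef` above. The scaling factor (1000) is an assumption, not
# taken from the original source.
radius = coef  # already normalised to [0, 1]
plt.scatter(X[:, 0], X[:, 1], s=1000 * radius, edgecolors='r',
            facecolors='none', label='Outlier scores')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend()
plt.show()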