def read_connection(model_id):
    con_id = None
    try:
        insert_record = {}
        insert_record["Timestamp"] = datetime.now()
        insert_record["S_Model_id"] = model_id
        insert_record["Result"] = []
        update_collection("nnresults", 'S_Model_id', model_id, insert_record)
        read_model = read_collectionbyid("nnmodels", model_id)
        #print(read_counters)
        lines_count = read_model['NumOfRecords']
        con_id = read_model['ConId']
        read_con = read_collectionbyid("nnconnections", con_id)
        ConType = read_con['ConType']
        if (ConType == "1"):
            dir_path = read_con['FileLocation']
            #filestream(con_id,dir_path,lines_count)
            print("Reading DataSet Started")
            filestream(con_id, dir_path, lines_count, model_id)
        else:
            broker_endpoints = read_con['BrokerEndPoint']
            topics_list = read_con['TopicName']
            print(broker_endpoints)
            #kafkastream(con_id,broker_endpoints,topics_list,lines_count)
            print("Reading DataSet Started")
            raw_input_data = kafkastream(con_id, broker_endpoints, topics_list,
                                         lines_count)
            print("Reading DataSet Ended")
            print("Pre-Processing Data Started")
            Model_input_Data, text_field_LE, scaler, unique_val_list = pre_processing(
                raw_input_data)
            print("Pre-Processing Data Ended")
            print(Model_input_Data.head())
            print("Training Model Started")
            result, normal, abnormal = trainmodel(Model_input_Data)
            print("Training Model Ended")
            print(result.head(100))
            print("Post-Processing Data Started")
            final_result = postprocessing(Model_input_Data, abnormal, scaler,
                                          text_field_LE, unique_val_list)
            print("Post-Processing Data Ended")
            # print(normal.head(100))
            # print(final_result)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        logging.error("Model Creation Failed:" + str(sys.exc_info()))
        # con_id is only known once the model record has been read.
        if con_id is not None:
            update_collection(
                "nnconnections", "_id", ObjectId(con_id), {
                    "Status":
                    "Failed",
                    "Error_Message":
                    "Creating connection got error while making DB connection. Error Message <--> "
                    + str(sys.exc_info())
                })
        raise
def read_connection(model_id):
    #try:
    read_model = read_collectionbyid("nnmodels", model_id)
    lines_count = read_model['NumOfRecords']
    TrainConId = read_model['TrainConId']
    if TrainConId == "":
        TrainConId = read_model['ConId']
    read_con = read_collectionbyid("nnconnections", TrainConId)
    ConType = read_con['ConType']
    if (ConType == "1"):
        dir_path = read_con['FileLocation']
        print("Reading DataSet Started")
        raw_input_data = filestream(TrainConId, dir_path, lines_count)
    else:
        broker_endpoints = read_con['BrokerEndPoint']
        topics_list = read_con['TopicName']
        #print(broker_endpoints)
        raw_input_data = kafkastream(TrainConId, broker_endpoints, topics_list,
                                     lines_count)
    print("Reading DataSet Ended")
    print("Pre-Processing Data Started")
    Model_input_Data, text_field_LE, scaler, unique_val_list = pre_processing(
        raw_input_data, model_id)
    print("Pre-Processing Data Ended")
    print("Training Model Started")
    result, normal, abnormal = trainmodel(Model_input_Data, model_id)
    print("Training Model Ended")
    print("Post-Processing Data Started")
    postprocessing(result, abnormal, scaler, text_field_LE, unique_val_list,
                   model_id)
    print("Post-Processing Data Ended")
def read_connection(con_id):
    try:
        read_counters = read_collectionbyid("nnconnections", con_id)
        #print(read_counters)
        con_type = read_counters['ConType']
        if (con_type == "1"):
            dir_path = read_counters['FileLocation']
            #lines_count = read_counters['NoOfRecords']
            file_con_onerecord(con_id, dir_path, lines_count=1)
        else:
            broker_endpoints = read_counters['BrokerEndPoint']
            topics_list = read_counters['TopicName']
            #lines_count = read_counters['NoOfRecords']
            onerecord(con_id, broker_endpoints, topics_list, lines_count=1)
    except:
        logging.error("Unable to read connection details from DB.." +
                      str(sys.exc_info()))
        update_collection(
            "nnconnections", "_id", ObjectId(con_id), {
                "Status":
                "Failed",
                "Error_Message":
                "Creating connection got error while making DB connection. Error Message <--> "
                + str(sys.exc_info())
            })
def stream_model(data_processed=pd.DataFrame(), model_id=''):
    Local_File_Path = os.path.dirname(os.path.realpath(__file__))
    MODEL_PATH = Local_File_Path + "/Model/model.h5"
    TB_LOG_DIR = Local_File_Path + "/TB"
    MODEL_SAVE_PATH = Local_File_Path + "/Model/model.json"
    OUT_PATH = "/node-parser/data/Results/"
    RANDOM_STATE = 77
    THRESHOLD = 3  #2*STD away from mean will be
    logger = logging.getLogger(os.path.basename(__file__))
    #logger = __get_logger()
    db_record = read_collectionbyid('nnmodels', model_id)
    config_over_field_name = db_record['MainField']
    config_job_function = db_record['ModelType']
    config_job_sub_function = db_record['ModelMethod']
    X_test_index = data_processed.index
    test_columns = list(data_processed.columns)
    ix = test_columns.index(config_over_field_name)
    test = pd.DataFrame(data_processed.iloc[:, ix])
    # Build the Model (created before the branches so the SVM path can use it too)
    ad_obj = AD.AnomalyDetection(logger)
    if config_job_sub_function == 'NN':
        X_test = test.values.reshape(test.shape[0], 1)
        #Load Model
        model = ad_obj.load_nn_model(MODEL_PATH, MODEL_SAVE_PATH)
        model = ad_obj.compile_nn_model(model)
        result = ad_obj.predict_nn_model(model, X_test)
        # ad_obj.plot_nn_result(THRESHOLD,error_df,test)
        normal, abnormal = ad_obj.export_nn_result(THRESHOLD, result, test,
                                                   OUT_PATH)
        normal.set_index(X_test_index, inplace=True)
        abnormal.set_index(X_test_index, inplace=True)
    if config_job_sub_function == 'SVM':
        model = ad_obj.load_svm_model(MODEL_PATH)
        pred, result = ad_obj.predict_svm_model(model, test)
        #ad_obj.plot_svm_result(X_test, pred)
        normal, abnormal = ad_obj.export_svm_result(test, pred, OUT_PATH)
        normal.set_index(X_test_index, inplace=True)
        abnormal.set_index(X_test_index, inplace=True)
    #print(normal)
    return normal, abnormal
def postprocessing(data=pd.DataFrame(),
                   abnormal=pd.DataFrame(),
                   scaler=MinMaxScaler(copy=True, feature_range=(0, 1)),
                   text_field_LE={},
                   unique_val_list={},
                   model_id=''):
    data_index = data.index
    db_record = read_collectionbyid('nnmodels', model_id)
    Scaler_deco_data = pd.DataFrame(scaler.inverse_transform(data),
                                    columns=data.columns)
    Scaler_deco_data.set_index(data_index, inplace=True)
    #print('shape')
    #print(Scaler_deco_data.dtypes)

    def ldecoder(data, feature='', le=LabelEncoder()):
        #print(data.dtypes)
        new_col = str(feature) + "_Encoded"
        #print(new_col)
        data[feature] = le.inverse_transform(data[new_col].astype('int'))
        data.drop(new_col, axis=1, inplace=True)
        return data

    for key, tx_f in text_field_LE.items():
        #print(tx_f)
        data = ldecoder(Scaler_deco_data, key, tx_f)
    abnormal.loc[abnormal['Actual'].notnull(), 'Actual'] = 'YES'
    #abnormal['Actual'].set_value(abnormal['Actual'].notnull(),value='YES')
    abnormal['Actual'].fillna('NO', inplace=True)
    df_result = pd.concat([data, abnormal['Actual']], axis=1, copy=False)
    df_result.reset_index(level=0, inplace=True)
    df_result.sort_values(db_record['MainField'], inplace=True)
    result_json = df_result.to_json(orient='records')
    insert_record = {}
    insert_record["Timestamp"] = datetime.now()
    insert_record["Model_id"] = model_id
    insert_record["Result"] = result_json
    insert_record.update(unique_val_list)
    #print(insert_record)
    #print(model_id)
    #print(str(unique_val_list))
    insert_collection("nnresults", insert_record)
    #update_collection("nnresults","Model_id",model_id, unique_val_list)
    #print(result_json)
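
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): the decode round trip that
# postprocessing() relies on. A text column is label-encoded and everything is
# min-max scaled during pre-processing; inverse_transform() on both objects
# restores the original values. The 'Host'/'Bytes' columns below are made up
# for the example and are not from the real schema.
def _example_decode_round_trip():
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler

    df = pd.DataFrame({"Host": ["web1", "web2", "web1"],
                       "Bytes": [120.0, 350.0, 90.0]})

    # Encode + scale, mirroring lencoder()/minmaxscaler() in pre_processing().
    le = LabelEncoder()
    df["Host_Encoded"] = le.fit_transform(df["Host"].astype(str))
    df.drop("Host", axis=1, inplace=True)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    # Decode, mirroring scaler.inverse_transform() and ldecoder() above.
    decoded = pd.DataFrame(scaler.inverse_transform(scaled),
                           columns=scaled.columns)
    decoded["Host"] = le.inverse_transform(decoded["Host_Encoded"].astype("int"))
    decoded.drop("Host_Encoded", axis=1, inplace=True)
    return decoded
# ---------------------------------------------------------------------------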
def trainmodel(data_processed=pd.DataFrame(), model_id=''):
    Local_File_Path = os.path.dirname(os.path.realpath(__file__))
    MODEL_PATH = Local_File_Path + "/Model/model.h5"
    MODEL_SAVE_PATH = Local_File_Path + "/Model/model.json"
    TB_LOG_DIR = Local_File_Path + "/TB/" + model_id + "/"
    if not os.path.exists(TB_LOG_DIR):
        os.makedirs(TB_LOG_DIR)
        os.chmod(TB_LOG_DIR, 0o755)
    OUT_PATH = "/node-parser/data/Results/"
    RANDOM_STATE = 77
    THRESHOLD = 3  #2*STD away from mean will be
    logger = logging.getLogger(os.path.basename(__file__))
    #logger = __get_logger()
    db_record = read_collectionbyid('nnmodels', model_id)
    config_over_field_name = db_record['MainField']
    config_job_function = db_record['ModelType']
    config_job_sub_function = db_record['ModelMethod']
    if config_job_function == 'anomaly':
        # Split data into train/test
        test_columns = list(data_processed.columns)
        ix = test_columns.index(config_over_field_name)
        train, test = train_test_split(data_processed,
                                       test_size=0.2,
                                       random_state=77)
        X_test_pd = pd.DataFrame(test)
        #print('split')
        #print(X_test_pd)
        X_train = pd.DataFrame(train.iloc[:, ix])
        X_test = pd.DataFrame(test.iloc[:, ix])
        X_test_index = X_test.index
        # Build the Model
        ad_obj = AD.AnomalyDetection(logger)
        if config_job_sub_function == 'NN':
            X_train = X_train.values.reshape(X_train.shape[0], 1)
            X_test = X_test.values.reshape(X_test.shape[0], 1)
            encoding_dim = int(math.log(X_train.shape[0]))
            input_dim = X_train.shape[1]
            nn_model = ad_obj.build_nn_model(encoding_dim, input_dim)
            nn_model = ad_obj.compile_nn_model(nn_model)
            # Run Model
            nb_epoch = int(math.pow(X_train.shape[0], 3 / 5))
            batch_size = int(math.sqrt(X_train.shape[0]))
            nn_model, history = ad_obj.run_nn_model(
                nn_model,
                X_train,
                X_test,
                nb_epoch,
                batch_size,
                MODEL_PATH,
                TB_LOG_DIR,
            )
            # Visualize Training History
            # ad_obj.depict_nn_training_hist(history)
            ad_obj.save_nn_model(nn_model, MODEL_SAVE_PATH)
            nn_history_file = OUT_PATH + "nn_history.json"
            ad_obj.export_nn_training_result(history, nn_history_file)
            result = ad_obj.predict_nn_model(nn_model, X_test)
            result.set_index(X_test_index, inplace=True)
            # ad_obj.plot_nn_result(THRESHOLD,error_df,test)
            normal, abnormal = ad_obj.export_nn_result(THRESHOLD, result, test,
                                                       OUT_PATH)
            normal.set_index(X_test_index, inplace=True)
            abnormal.set_index(X_test_index, inplace=True)
        if config_job_sub_function == 'SVM':
            svm_model = ad_obj.build_svm_model(X_train)
            ad_obj.save_svm_model(MODEL_PATH)
            pred, result = ad_obj.predict_svm_model(svm_model, X_test)
            result.set_index(X_test_index, inplace=True)
            #ad_obj.plot_svm_result(X_test,pred)
            normal, abnormal = ad_obj.export_svm_result(X_test, pred, OUT_PATH)
            normal.set_index(X_test_index, inplace=True)
            abnormal.set_index(X_test_index, inplace=True)

    # test_columns = list(data_processed.columns)
    # ix = test_columns.index(config_over_field_name)
    # test = pd.DataFrame(data_processed.iloc[:, ix])
    # if config_job_sub_function == 'NeuralNet':
    #     X_test = test.values.reshape(X_test.shape[0], 1)
    #     # Build the Model
    #     ad_obj = AD.AnomalyDetection(logger)
    #     #Load Model
    #     model = ad_obj.load_nn_model(MODEL_PATH)
    #     model = ad_obj.compile_nn_model(model)
    #     result = ad_obj.predict_nn_model(nn_model, X_test)
    #     # ad_obj.plot_nn_result(THRESHOLD,error_df,test)
    #     normal, abnormal = ad_obj.export_nn_result(THRESHOLD, result, test, OUT_PATH)
    #
    # if config_job_sub_function == 'SVM':
    #     model = ad_obj.load_svm_model(MODEL_PATH)
    #     pred, result = ad_obj.predict_svm_model(svm_model, test)
    #     ad_obj.plot_svm_result(X_test, pred)
    #     normal, abnormal = ad_obj.export_svm_result(test, pred, OUT_PATH)

    return X_test_pd, normal, abnormal
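
# ---------------------------------------------------------------------------
# The autoencoder sizing in trainmodel() is driven by simple heuristics on the
# number of training rows: encoding_dim = int(log(n)), nb_epoch = int(n**(3/5))
# and batch_size = int(sqrt(n)). A quick worked example for a hypothetical
# 10,000-row training set (the row count is illustrative only):
def _example_nn_sizing(n_rows=10000):
    import math
    encoding_dim = int(math.log(n_rows))      # natural log -> 9
    nb_epoch = int(math.pow(n_rows, 3 / 5))   # 10000**0.6  -> 251
    batch_size = int(math.sqrt(n_rows))       # -> 100
    return encoding_dim, nb_epoch, batch_size
# ---------------------------------------------------------------------------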
def pre_processing(raw_input_data=pd.DataFrame(), model_id=''):
    db_record = read_collectionbyid('nnmodels', model_id)
    if db_record['TrainConId'] == '':
        con_id = db_record['ConId']
    else:
        con_id = db_record['TrainConId']
    field_list = read_collectionbyfield('nnfieldslist', 'Con_id', con_id)
    fiel_l = field_list['Field_list'].split('|')
    fiel_t = field_list['Field_Type_List'].split('|')
    #print(fiel_l)
    field_type = {}
    for i in range(len(fiel_l)):
        field_type[fiel_l[i]] = fiel_t[i]
    Main_Field = db_record['MainField']
    Main_Field_Type = field_type[Main_Field]
    Time_Field = db_record['TimeSeriesField']
    filter = ''
    filter_value = 200.02
    filter_value1 = 0
    Inf_Field = []
    IntervalSpan = db_record['IntervalSpan'].replace('~', '')
    if IntervalSpan == '':
        IntervalSpan = None
    for x in db_record['FieldsSelected'].split(','):
        Inf_Field.append(x)
    fun = db_record['AggregationFun']
    text_field_list = []
    unique_val_list = {}
    #print(raw_input_data.head())
    for Field in Inf_Field:
        #unique_val_list[Field] = list(raw_input_data[Field].unique())
        if field_type[Field] == 'str':
            text_field_list.append(Field)

    def lencoder(data, feature):
        le = LabelEncoder()
        new_col = str(feature) + "_Encoded"
        data[new_col] = le.fit_transform(data[feature].astype(str))
        #print(le.inverse_transform(data[new_col]))
        data.drop(feature, axis=1, inplace=True)
        return data, le

    def minmaxscaler(data, time_field):
        scaler = MinMaxScaler(copy=False, feature_range=(0, 1))
        data.set_index(time_field, inplace=True)
        data_index = data.index
        #data.drop(time,axis=1,inplace=True)
        #print(data.columns)
        data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
        data.set_index(data_index, inplace=True)
        return data, scaler

    raw_input_data = raw_input_data[list([Time_Field, Main_Field]) + Inf_Field]
    #raw_input_data = raw_input_data.filter()
    raw_input_data = raw_input_data.replace('', np.nan, inplace=False)
    raw_input_data = raw_input_data.dropna(inplace=False, axis=0)
    if Main_Field_Type == 'int' or Main_Field_Type == 'float':
        raw_input_data[Main_Field] = pd.to_numeric(raw_input_data[Main_Field],
                                                   errors='coerce')
    if filter == '=':
        raw_input_data = raw_input_data[raw_input_data[Main_Field] ==
                                        filter_value]
    elif filter == '!=':
        raw_input_data = raw_input_data[raw_input_data[Main_Field] !=
                                        filter_value]
    elif filter == '>':
        raw_input_data = raw_input_data[raw_input_data[Main_Field] >
                                        filter_value]
    elif filter == '<':
        raw_input_data = raw_input_data[raw_input_data[Main_Field] <
                                        filter_value]
    elif filter == '>=':
        raw_input_data = raw_input_data[raw_input_data[Main_Field] >=
                                        filter_value]
    elif filter == '<=':
        raw_input_data = raw_input_data[raw_input_data[Main_Field] <=
                                        filter_value]
    elif filter == 'like':
        raw_input_data = raw_input_data[
            raw_input_data[Main_Field].str.contains(filter_value)]
    elif filter == 'btw':
        # Parentheses are required: '&' binds tighter than the comparisons.
        raw_input_data = raw_input_data[
            (raw_input_data[Main_Field] >= filter_value)
            & (raw_input_data[Main_Field] <= filter_value1)]
    #print(raw_input_data)
    try:
        raw_input_data[Time_Field] = raw_input_data[Time_Field].apply(
            lambda x: pen.parse(x))
    except:
        raw_input_data[Time_Field] = raw_input_data[Time_Field].apply(
            lambda x: datetime.fromtimestamp(
                mktime(time.strptime(x, '%d/%b/%Y:%H:%M:%S %z'))))
    #raw_input_data[Time_Field] = pd.to_datetime(raw_input_data[Time_Field],format='%d/%b/%Y:%H:%M:%S %z')
    #print(raw_input_data.columns)
    #raw_input_data[Time_Field]=raw_input_data[Time_Field].apply(lambda x: datetime.datetime.strftime(x))
    #raw_input_data['time1'] = raw_input_data[Time_Field].apply(lambda x: dateparser.parse(x))
    if (fun == 'sum'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.sum, axis=0).reset_index()
    if (fun == 'count'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].agg('count').reset_index()
    if (fun == 'average'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.average, axis=0).reset_index()
    if (fun == 'mean'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.mean, axis=0).reset_index()
    if (fun == 'median'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.median, axis=0).reset_index()
    if (fun == 'product'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.prod, axis=0).reset_index()
    if (fun == 'maximum'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.max, axis=0).reset_index()
    if (fun == 'minimum'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.min, axis=0).reset_index()
    if (fun == 'cumsum'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.cumsum, axis=0).reset_index()
    if (fun == 'cumprod'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].apply(np.cumprod, axis=0).reset_index()
    if (fun == 'cumsum'):
        Grouped_input_data = raw_input_data.sort_values(
            Time_Field, ascending=True).groupby(
                list([pd.Grouper(key=Time_Field, freq=IntervalSpan)]) +
                Inf_Field)[Main_Field].cumsum().reset_index()
    #raw_input_data.replace('', np.nan, inplace=True)
    #print(Grouped_input_data)
    #raw_input_data.fillna(method='ffill')
    text_field_LE = {}
    # Fall back to the grouped frame when there are no text fields to encode.
    Encoded_input_data = Grouped_input_data
    for tx_f in text_field_list:
        Encoded_input_data, text_field_LE[tx_f] = lencoder(
            Grouped_input_data, tx_f)
    #print(Encoded_input_data)
    Model_input_Data, scaler = minmaxscaler(Encoded_input_data, Time_Field)
    #print(Model_input_Data)
    #print(scaler)
    # Callers unpack four values, so unique_val_list is returned as well.
    return Model_input_Data, text_field_LE, scaler, unique_val_list
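
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): every aggregation branch in
# pre_processing() shares the same groupby shape — bucket rows with pd.Grouper
# on the time field (IntervalSpan as the frequency) plus the informational
# fields, then aggregate the main field. The column names and the 5-minute
# interval below are made up for the example.
def _example_grouper_aggregation():
    import pandas as pd

    df = pd.DataFrame({
        "timestamp": pd.to_datetime(
            ["2021-01-01 00:01", "2021-01-01 00:03", "2021-01-01 00:07"]),
        "host": ["web1", "web1", "web2"],
        "bytes": [100, 250, 80],
    })
    # Rows falling in the same 5-minute bucket for the same host are summed.
    grouped = df.sort_values("timestamp", ascending=True).groupby(
        [pd.Grouper(key="timestamp", freq="5min")] +
        ["host"])["bytes"].agg("sum").reset_index()
    return grouped
# ---------------------------------------------------------------------------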
def file_stream(con_id, dir_path, lines_count, model_id):
    raw_input_data = pd.DataFrame()
    count = 0
    i = 0
    error_count = 0
    f_count = 0
    start = 0
    end = lines_count
    rem_rows = 0
    os.chdir(dir_path)
    files = sorted(os.listdir(dir_path), key=os.path.getmtime)
    oldest = files[0]
    newest = files[-1]
    # print("Oldest:",oldest)
    # print("Newest:", newest)
    print("List of JSON files Found:", files, ".........")
    df_l = []
    dic_l = []
    db_record = read_collectionbyid('nnmodels', model_id)
    TimeSeriesField = db_record['TimeSeriesField']
    IntervalSpan = db_record['IntervalSpan'].replace('~', '')
    if IntervalSpan == '':
        IntervalSpan = None
    span_timestamp = None
    counter = 0
    try:
        num_files = len([
            f for f in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, f))
        ])
        #print("num_files",num_files)
        while f_count < num_files:
            with open(files[f_count]) as myfile:
                try:
                    print("Reading file:", files[f_count])
                    chunk = list(islice(myfile, 0, None))
                    #print("chunk length:",len(chunk))
                    #print("old f_count,start,end:",f_count,start,end)
                    f_count = f_count + 1
                    for record in chunk:
                        try:
                            s = record
                            result = json.loads(s)  # try to parse...
                            if span_timestamp == None:
                                counter = 0
                            #print(record)
                            value, typ = convert_table(record, counter)
                            #print(value)
                            span_timestamp, df_l, dic_l = data_grouper(
                                value, typ, span_timestamp, model_id, df_l,
                                dic_l, TimeSeriesField, IntervalSpan)
                            counter += 1
                            count += 1
                            print("\r Records Read... " + str(count), end="")
                            # print("Original", raw_input_data)
                            # print(raw_input_data.head())
                            # break  # parsing worked -> exit loop
                        except Exception as e:
                            # print("Exception"+str(e))
                            # "Expecting , delimiter": position of unexpected
                            # character after '"', then position of the
                            # unescaped '"' before that.
                            #-unexp = int(re.findall(r'\(char (\d+)\)', str(e))[0])
                            #-unesc = s.rfind(r'"', 0, unexp)
                            #-s = s[:unesc] + r'\"' + s[unesc + 1:]
                            # position of corresponding closing '"' (+2 for inserted '\')
                            #-closg = s.find(r'"', unesc + 2)
                            #-s = s[:closg] + r'\"' + s[closg + 1:]
                            #-res = json.dumps(result)
                            # res.strip('\'"')
                            #-value, typ = convert_table(record, count)
                            #-if typ == 'dict':
                            #-    dic_l.append(value)
                            #-else:
                            #-    df_l.append(value)
                            #-count += 1
                            # print("Formatted", raw_input_data)
                            raise
                        except:
                            error_count += 1
                            logging.error("Error while parsing json" +
                                          str(sys.exc_info()))
                            raise
                except:
                    logging.info("Processed all records..")
                    raise
        if len(dic_l) > 0 and len(df_l) > 0:
            raw_input_data_dic = json_normalize(dic_l, sep='~')
            raw_input_data_df = pd.concat(df_l, axis=0)
            raw_input_data = raw_input_data_dic.append(raw_input_data_df,
                                                       ignore_index=False)
        elif len(df_l) > 0:
            raw_input_data = pd.concat(df_l, axis=0)
        elif len(dic_l) > 0:
            raw_input_data = json_normalize(dic_l, sep='~')
        #print(raw_input_data)
        return raw_input_data
    except:
        logging.error("Unable to open file.." + str(sys.exc_info()))
        raise
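
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): what the reading loop in
# file_stream() boils down to for a single newline-delimited JSON file, without
# the multi-file handling, time-span grouping and error accounting above. The
# file name is hypothetical.
def _example_read_ndjson(path="events.json"):
    import json
    from itertools import islice
    from pandas import json_normalize

    records = []
    with open(path) as fh:
        for line in islice(fh, 0, None):       # islice(fh, 0, None) reads every line
            try:
                records.append(json.loads(line))   # one JSON object per line
            except ValueError:                     # skip lines that fail to parse
                continue
    return json_normalize(records, sep="~")        # flatten nested keys with '~'
# ---------------------------------------------------------------------------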