def get_data():
    global x_scaled
    global df_data
    model_name = []
    clust_no = 0
    infilename = askopenfilename(initialdir="/",
                                 title="Select Training Data",
                                 filetypes=((".csv", "*.csv"),
                                            (".sav", "*.sav"),
                                            ("all files", "*.*")))
    blz.tic()
    print('Selected data input: ', infilename)
    if infilename[-4:] == '.csv':
        df_data = pd.read_csv(infilename)
        x = df_data.values  # returns a numpy array
    elif infilename[-4:] == '.sav':
        x = pickle.load(open(infilename, 'rb'))
    print('Data normalization ...')
    min_max_scaler = preprocessing.MinMaxScaler()  # normalization
    x_scaled = min_max_scaler.fit_transform(x)
    print("Data sample size: " + str(len(x_scaled)))
    # f = pandas.DataFrame(x_scaled)
    blz.toc()
    print('\x1b[1;33m' + 'Done with [Data Loading].' + '\x1b[0m')
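
# NOTE: a minimal, self-contained sketch of the min-max normalization step used in
# get_data(), handy for verifying the scaling outside the GUI flow. The toy array
# and the helper name (_demo_min_max_scaling) are illustrative, not part of the tool.
def _demo_min_max_scaling():
    import numpy as np
    from sklearn import preprocessing

    x = np.array([[1.0, 200.0],
                  [2.0, 400.0],
                  [3.0, 600.0]])
    scaler = preprocessing.MinMaxScaler()
    x_scaled = scaler.fit_transform(x)  # each column independently rescaled to [0, 1]
    print(x_scaled)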
def push2cube():
    global value_list
    global tst_out
    global out_filename1
    global this_prefix
    usrname = entry11.get()
    passwd = entry12.get()
    print('\nStarting ' + '\x1b[6;30;42m' + 'PUSH DATAFRAME TO MSTR CUBE: ' + '\x1b[0m')
    blz.tic()
    datasetName = this_prefix + '_cube'
    tableName = this_prefix + '_table'
    cubeinfo_name = this_prefix + '_cubeinfo'
    # Authentication request and connect to the Rally Analytics project
    conn = microstrategy.Connection(base_url=baseURL,
                                    login_mode=16,
                                    username=usrname,
                                    password=passwd,
                                    project_name=projName)
    conn.connect()
    print("Connect to " + baseURL)
    # If the cube does not exist, acquire the Data Set Id & Table Id and create a new cube
    newDatasetId, newTableId = conn.create_dataset(data_frame=df_data2,
                                                   dataset_name=datasetName,
                                                   table_name=tableName)
    # Store the Data Set Id and Table Id locally
    cubeInfoFile = open(cubeinfo_name, 'w')
    cubeInfoFile.write(newDatasetId + '\n')
    cubeInfoFile.write(newTableId)
    cubeInfoFile.close()
    print("CREATE Cube on URL: " + baseURL[:-25])
    print('[ Dataset Name: ' + datasetName + ' \ Cube ID = ' + newDatasetId +
          '] [Table Name: ' + tableName + ' \ Table ID = ' + newTableId + ' ]')
    blz.toc()
    print('\x1b[1;33m' +
          "Done with [Output to MSTR Cube for Dossier Reporting (without PA)]" +
          '\x1b[0m')
def do_clustering():
    global k
    global x_scaled
    global labels
    sel_model()
    blz.tic()
    # print(model_index)
    if model_index == 0:
        model_kmeans(k, x_scaled)
    elif model_index == 1:
        model_hc(k, x_scaled)
    elif model_index == 2:
        model_sc(k, x_scaled)
    elif model_index == 3:
        model_gmm(k, x_scaled)
    elif model_index == 4:
        model_dpgmm(k, x_scaled)
    else:
        print('Invalid Model')
    print('Final cluster numbers = ', len(np.unique(labels)))
    blz.toc()
    print('\x1b[1;33m' + 'Done with [Clustering].' + '\x1b[0m')
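
# NOTE: the model_* helpers called above (model_kmeans, model_hc, model_sc, model_gmm,
# model_dpgmm) are defined elsewhere in this module. The sketch below is a hypothetical,
# minimal KMeans-based helper that sets the global `labels`, shown only to illustrate the
# contract do_clustering() relies on; it is not the original implementation.
def _demo_model_kmeans(k, x_scaled):
    from sklearn.cluster import KMeans

    global labels
    kmeans = KMeans(n_clusters=k, random_state=0).fit(x_scaled)
    labels = kmeans.labels_  # one integer cluster tag per sample
    return labels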
def push2cube_pa():
    # global value_list
    global df_final
    global baseURL
    global projName
    global out_filename0
    print('\nStarting ' + '\x1b[6;30;42m' + 'PUSH DATAFRAME TO MSTR CUBE (W. PA): ' + '\x1b[0m')
    blz.tic()
    df_cube = pd.read_csv(out_filename0)
    df_cube['PID'] = df_cube['PID'].apply(str)
    datasetName = 'DemoTest_n' + value_list[6] + '_pa'
    tableName = 'ErrorRank_demo_n' + value_list[6] + '_pa'
    cubeinfoName = 'demoInfo_n' + value_list[6] + '_pa.txt'
    datasetName0 = 'DemoTest_' + value_list[3] + '_n' + value_list[6] + '_pa'
    isNewCube = False
    if value_list[2] == '':
        isNewCube = True
    # Authentication request and connect to the Rally Analytics project
    conn = microstrategy.Connection(base_url=baseURL,
                                    login_mode=16,
                                    username=value_list[0],
                                    password=value_list[1],
                                    project_name=projName)
    conn.connect()
    print("Connect to " + baseURL)
    if var1.get() == 1:
        # If the cube does not exist, acquire the Data Set Id & Table Id and create a new cube
        newDatasetId, newTableId = conn.create_dataset(data_frame=df_cube,
                                                       dataset_name=datasetName,
                                                       table_name=tableName)
        # Store the Data Set Id and Table Id locally
        cubeInfoFile = open(cubeinfoName, 'w')
        cubeInfoFile.write(newDatasetId + '\n')
        cubeInfoFile.write(newTableId)
        cubeInfoFile.close()
        print("CREATE Cube on URL: " + baseURL[:-25])
        print('[ Dataset Name: ' + datasetName + ' \ Cube ID = ' + newDatasetId +
              '] [Table Name: ' + tableName + ' \ Table ID = ' + newTableId + ' ]')
    else:
        # Read the saved cube IDs
        cubeInfoFile = open(cubeinfoName, 'r')
        datasetID = cubeInfoFile.read().splitlines()
        cubeInfoFile.close()
        # Establish the cube connection and append the new rows
        conn.update_dataset(data_frame=df_cube,
                            dataset_id=datasetID[0],
                            table_name=tableName,
                            update_policy='add')
        print("UPDATE Cube on URL: " + baseURL[:-25])
        print("Dataset Name " + datasetName + "[Cube ID: " + datasetID[0] +
              " Table Name: " + tableName + "]")
        print("CREATE a backup cube: " + datasetName0)
        newDatasetId0, newTableId0 = conn.create_dataset(data_frame=df_cube,
                                                         dataset_name=datasetName0,
                                                         table_name=tableName)
    blz.toc()
    print('\x1b[1;33m' +
          "Done with [Output to MSTR Cube for Dossier Reporting (with PA)]" +
          '\x1b[0m')
def push2cube_nopa():
    global value_list
    global tst_out
    global out_filename1
    global isLDAP
    global this_prefix
    FMT1 = '%Y-%m-%d %H:%M:%S'
    print('\nStarting ' + '\x1b[6;30;42m' + 'PUSH DATAFRAME TO MSTR CUBE (WO. PA): ' + '\x1b[0m')
    blz.tic()
    df_cube = pd.read_csv(out_filename1)
    df_cube['PID'] = df_cube['PID'].apply(str)
    # datasetName = this_prefix + 'err_n' + value_list[6] + '_nopa'
    # tableName = this_prefix + 'ErrorRank_n' + value_list[6] + '_nopa'
    # cubeinfo_name = 'Cube Info_' + this_prefix + 'n' + value_list[6] + '_nopa.txt'
    # datasetName0 = this_prefix + 'cube' + value_list[3] + '_n' + value_list[6] + '_nopa'
    datasetName = this_prefix + 'err_nopa'
    tableName = this_prefix + 'ErrorRank_nopa'
    cubeinfo_name = 'Cube Info_' + this_prefix + '_nopa.txt'
    datasetName0 = this_prefix + 'cube' + value_list[3] + '_nopa'
    # Authentication request and connect to the Rally Analytics project
    # isLDAP: LDAP login (1) or standard user (0)
    if isLDAP == 1:
        conn = microstrategy.Connection(base_url=baseURL,
                                        login_mode=16,
                                        username=value_list[0],
                                        password=value_list[1],
                                        project_name=projName)
    else:
        conn = microstrategy.Connection(base_url=baseURL,
                                        username=value_list[0],
                                        password=value_list[1],
                                        project_name=projName)
    conn.connect()
    print("Connect to " + baseURL)
    # Create a new cube or use the existing cube
    if var1.get() == 1:
        # If the cube does not exist, acquire the Data Set Id & Table Id and create a new cube
        newDatasetId, newTableId = conn.create_dataset(data_frame=df_cube,
                                                       dataset_name=datasetName,
                                                       table_name=tableName)
        # Store the Data Set Id and Table Id locally
        cubeInfoFile = open(cubeinfo_name, 'w')
        cubeInfoFile.write(newDatasetId + '\n')
        cubeInfoFile.write(newTableId)
        cubeInfoFile.close()
        print("CREATE Cube on URL: " + baseURL[:-25])
        print('[ Dataset Name: ' + datasetName + ' \ Cube ID = ' + newDatasetId +
              '] [Table Name: ' + tableName + ' \ Table ID = ' + newTableId + ' ]')
    else:
        # Read the saved cube IDs
        cubeInfoFile = open(cubeinfo_name, 'r')
        datasetID = cubeInfoFile.read().splitlines()
        cubeInfoFile.close()
        # Establish the cube connection and append the new rows
        conn.update_dataset(data_frame=df_cube,
                            dataset_id=datasetID[0],
                            table_name=tableName,
                            update_policy='add')
        print("UPDATE Cube on URL: " + baseURL[:-25])
        print("Dataset Name " + datasetName + "[Cube ID: " + datasetID[0] +
              " Table Name: " + tableName + "]")
        print("CREATE a backup cube: " + datasetName0)
        newDatasetId0, newTableId0 = conn.create_dataset(data_frame=df_cube,
                                                         dataset_name=datasetName0,
                                                         table_name=tableName)
    blz.toc()
    print('\x1b[1;33m' +
          "Done with [Output to MSTR Cube for Dossier Reporting (without PA)]" +
          '\x1b[0m')
def paTimeline():
    global envStr
    global envfilename
    global pafilename
    global tst_out
    # global prediction
    global df_err_pa
    global outfilename
    global out_filename0
    global df_final
    print('\nStarting ' + '\x1b[6;30;42m' + ' STEP 6 ' + '\x1b[0m')
    FMT1 = '%Y-%m-%d %H:%M:%S'
    df_pa0 = pd.read_csv(pafilename)
    df_ins = pd.read_csv(envfilename)
    pa_cols = list(itemgetter(2, 3, 4)(list(df_pa0.head(0))))
    instance_id = df_ins[df_ins['instance_name'] == envStr]['instance_id'].item()
    print('Instance ID: ', instance_id)
    df_pa = df_pa0[df_pa0['instance_id'] == instance_id][pa_cols]
    df_pa = df_pa.rename(
        {
            'metric_retrieve_time': 'ROUNDED TIME STAMP',
            'cpu_avg_usage': 'CPU',
            'mem_avg_usage': 'MEMORY'
        },
        axis='columns')
    pa_timestamp = pd.to_datetime(df_pa['ROUNDED TIME STAMP'])
    # t1: rounded-timestamp lower bound (error TS lower bound - 2 hours)
    # t2: rounded-timestamp upper bound (error TS upper bound + 2 hours)
    # Convert EST to UTC (+5 hours) and extend +/- 2 hours for the PA range
    tst_out2 = tst_out
    # Shift TIME STAMP from EST to the UTC time zone
    tst_out2['TIME STAMP'] = tst_out2['TIME STAMP'] + pd.DateOffset(hours=5)
    error_ts_lowbound = min(tst_out2['TIME STAMP'])
    error_ts_upperbound = max(tst_out2['TIME STAMP'])
    t1 = error_ts_lowbound - pd.DateOffset(hours=2)
    t2 = error_ts_upperbound + pd.DateOffset(hours=2)
    df_pa2 = df_pa[pa_timestamp.between(t1, t2)]
    # PA time sample rate (resolution) is 5 minutes
    delta_t = 5
    print('****** Aligning Error timestamp and PA rounded timestamp [down-sampling to ' +
          str(delta_t) + ' min interval] ...')
    blz.tic()
    pa_time = pd.to_datetime(df_pa2['ROUNDED TIME STAMP'])
    err_time = tst_out2['TIME STAMP']
    nb_error = len(tst_out2)
    time_align_idx = [None] * nb_error
    old_time = [None] * nb_error
    round_time = [None] * nb_error
    err_local_clust = list([0] * nb_error)
    count_in_time = list([0] * nb_error)
    norm_cpu = list([0] * nb_error)
    norm_memory = list([0] * nb_error)
    for i in range(nb_error):
        err_time = tst_out2.iloc[i]['TIME STAMP']
        old_time[i] = err_time
        # Time differences between the error outlier timestamp and all PA timestamps
        time_delta = err_time - pd.to_datetime(pa_time)
        # Find the closest PA timestamp; idx is the PA index
        idx = np.argmin(abs(np.array(time_delta)))
        # If the aligned timestamp is far below the lower bound (< t - 5 min) or beyond
        # the upper bound (> t + 5 min), do not assign the corresponding rounded timestamp
        if not (idx == 0 and
                (err_time < pa_time.iloc[0] - timedelta(minutes=delta_t) or
                 err_time > pa_time.iloc[-1] + timedelta(minutes=delta_t))):
            time_align_idx[i] = idx
        else:
            time_align_idx[i] = 0
        round_time[i] = pa_time.iloc[idx]
    blz.toc()
    print("\nArranging dataframe output ...")
    blz.tic()
    # x0: BLITZ TEST
    # x1: PID
    # x2: TIME STAMP
    # x3: PRODUCT
    # x4: ERROR
    # x5: IS FATAL
    # x6: EXTRA
    # x7: UID
    # x8: SID
    # x9: OID
    # x10: THR
    # x11: KEYWORD
    # x12: WEIGHT (select the highest if many)
    # x13: IDENTICALS (initialized to 1)
    # x14: CLUSTER_ID
    # x15: MODEL CLUSTER SIZE
    # x16: NO. OF ALIKE
    # x17: ERROR KPI
    # x18: BOOSTED ERROR KPI
    # x19: ROUNDED TIME STAMP
    # x20: CPU
    # x21: MEMORY
    # x22: COUNTS OF SIMILAR ERRORS FROM THE SAME CLUSTER
    # x23: POPULATION NORMALIZED KPI
    # x24: ERROR COUNT PER TIMESTAMP
    # x25: POPULATION NORMALIZED CPU
    # x26: POPULATION NORMALIZED MEMORY
    # Re-index all error outlier samples
    df_tstout = pd.DataFrame(tst_out2, index=range(nb_error))
    # Add rounded timestamps and reassign the index
    df_x19 = pd.DataFrame(round_time,
                          columns=['ROUNDED TIME STAMP'],
                          index=range(nb_error))
    # Use the aligned time index to extract the PA performance metrics
    df_x20 = df_pa2.iloc[time_align_idx][['CPU', 'MEMORY']]
    # Drop the index inherited from the error outlier data frame
    df_x21 = df_x20.reset_index(drop=True)
    time_uniq, loc4, counts4 = np.unique(df_x19,
                                         return_index=True,
                                         return_counts=True)
    nb_ts = len(time_uniq)
    for i in range(nb_ts):
        select_indices = list(np.where(df_x19 == time_uniq[i])[0])
        time_block = prediction[select_indices]
        # Number of samples in the time block
        nb_tb = len(time_block)
        # Find unique cluster tags and counts within a time block
        clust_uniq, loc5, counts5 = np.unique(time_block,
                                              return_index=True,
                                              return_counts=True)
        for j in range(nb_tb):
            count_in_time[select_indices[j]] = nb_tb
            norm_cpu[select_indices[j]] = df_x20.iloc[select_indices[0]]['CPU'] / nb_tb
            norm_memory[select_indices[j]] = df_x21.iloc[select_indices[0]]['MEMORY'] / nb_tb
            select_index = np.where(clust_uniq == time_block[j])[0][0]
            err_local_clust[select_indices[j]] = counts5[select_index]
    df_x22 = pd.DataFrame(err_local_clust,
                          columns=["SIMILAR ERROR COUNTS IN CLUSTER"])
    df_x23 = pd.DataFrame(tst_out2["ERROR KPI"].values /
                          err_local_clust / count_in_time,
                          columns=["NORMALIZED KPI"])
    df_x24 = pd.DataFrame({'ERROR COUNT PER TIMESTAMP': count_in_time})
    df_x25 = pd.DataFrame({'NORMALIZED CPU': norm_cpu})
    df_x26 = pd.DataFrame({'NORMALIZED MEMORY': norm_memory})
    df_err2 = pd.concat(
        [df_tstout, df_x19, df_x21, df_x22, df_x23, df_x24, df_x25, df_x26],
        axis=1)
    # Patch the CPU and Memory values for timestamps without errors
    df_final = df_pa2.append(df_err2)
    df_final = df_final.sort_values(by='BOOSTED ERROR KPI', ascending=False)
    df_final = df_final.reset_index(drop=True)
    df_length = len(df_final)
    for i in range(df_length):
        if np.isnan(df_final.iloc[i]['NORMALIZED CPU']):
            if np.isnan(df_final.iloc[i]['ERROR COUNT PER TIMESTAMP']):
                df_final.loc[i, 'NORMALIZED CPU'] = df_final['CPU'][i]
                df_final.loc[i, 'NORMALIZED MEMORY'] = df_final['MEMORY'][i]
    blz.toc()
    # Remove the rows without errors within the time interval
    ss = np.array(pd.to_datetime(df_final['ROUNDED TIME STAMP']) <= error_ts_upperbound)
    tt = np.array(pd.to_datetime(df_final['ROUNDED TIME STAMP']) >= error_ts_lowbound)
    uu = np.array(np.isnan(df_final['ERROR COUNT PER TIMESTAMP']))
    df_final = df_final.drop(df_final[ss & tt & uu].index)
    df_final = df_final.reset_index(drop=True)
    # OUTPUT RESULT AS .CSV FOR DATA CUBE
    blz.tic()
    print("Saving testing results ... ")
    out_filename0 = outfilename[0:-4] + '_datacube_pa.csv'
    print('Exporting ' + os.getcwd() + ' ' + out_filename0)
    df_final.to_csv(out_filename0, index=False)
    blz.toc()
    print('\x1b[1;33m' + "Done with [Time Stamp Correlation]." + '\x1b[0m')
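
# NOTE: paTimeline() aligns each error timestamp to the nearest PA rounded timestamp by
# an argmin over time deltas. The sketch below is a hedged alternative using
# pandas.merge_asof (nearest-match join with a tolerance), useful for cross-checking the
# loop on small samples. The toy frames and helper name are illustrative assumptions.
def _demo_align_timestamps():
    import pandas as pd

    errors = pd.DataFrame({'TIME STAMP': pd.to_datetime(
        ['2018-05-14 20:01:00', '2018-05-14 20:12:30'])})
    pa = pd.DataFrame({'ROUNDED TIME STAMP': pd.to_datetime(
        ['2018-05-14 20:00:00', '2018-05-14 20:05:00', '2018-05-14 20:10:00']),
        'CPU': [10.0, 20.0, 30.0]})
    aligned = pd.merge_asof(errors.sort_values('TIME STAMP'),
                            pa.sort_values('ROUNDED TIME STAMP'),
                            left_on='TIME STAMP',
                            right_on='ROUNDED TIME STAMP',
                            direction='nearest',
                            tolerance=pd.Timedelta(minutes=5))  # 5-min PA resolution
    print(aligned)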
def predict():
    global df_uniq_err
    global tst_out
    global prediction
    global out_filename1
    global vec_filename
    global pty_filename
    print('\nStarting ' + '\x1b[6;30;42m' + ' STEP 4 ' + '\x1b[0m')
    # nb_error is the number of unique errors
    nb_error = len(df_uniq_err['ERROR'])
    tst_clean = [None] * nb_error
    # Load the ML training model
    print("****** Vectorization gallery ... ")
    df_gallery = pd.read_csv(pty_filename, encoding="ISO-8859-1", engine='python')
    vectorizer2 = pickle.load(open(vec_filename, "rb"), encoding='iso-8859-1')
    x = vectorizer2.transform(df_gallery['ERROR TOKENS'])
    X = x.toarray()
    nb_gallery = len(X)
    print("****** Text Regular Expression: probe ... ")
    blz.tic()
    # Text regular-expression operations
    regex_pat123 = re.compile(r'[^a-zA-Z0-9\s]', flags=re.IGNORECASE)
    regex_pat = re.compile(r'[^a-zA-Z\s]', flags=re.IGNORECASE)
    for k in range(nb_error):
        if (k % 200 == 1) or (k + 200 >= nb_error):
            sg.OneLineProgressMeter('Vectorization', k + 1, nb_error, 'key')
        temp = df_uniq_err['ERROR'].iloc[k]
        temp = temp.replace("/", " ")
        temp = temp.replace("_", " ")
        temp = temp.replace("-", " ")
        temp = temp.replace("=", " ")
        temp = temp.replace(";", " ")
        temp = temp.replace(".", " ")
        temp = temp.replace("'", "")
        # Remove nonprintable characters
        # temp = temp.replace("\xc3", '')
        # temp = temp.replace("\xa4", '')
        # temp = temp.replace("\xe5", '')
        temp = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', temp)
        # tst_words = nltk.word_tokenize(temp)
        # tst_series = pd.Series(tst_words)
        tst_series = pd.Series(temp)
        # Keep only words
        tst_clean1 = tst_series.str.replace(regex_pat123, ' ')
        mask = ((tst_series.str.len() == 32) |
                (tst_series.str.len() == 33)) & (~tst_series.str.islower())
        tst_clean1.loc[mask.values] = 'GUID'
        tst_clean1 = tst_series.str.replace(regex_pat, ' ')
        # Join the cleaned words into one string
        tst_clean2 = tst_clean1.str.cat(sep=' ')
        tst_clean[k] = tst_clean2
    blz.toc()
    print("****** Cluster Prediction on testing samples ... ")
    blz.tic()
    print('Prediction: [learning Gallery: ' + pty_filename +
          '] [vector space: ' + vec_filename + ']')
    y = vectorizer2.transform(tst_clean)
    Y = y.toarray()
    # prediction = loaded_model.predict(Y)
    bestMatch_index = [0] * nb_error
    bestMatch_score = [0] * nb_error
    prediction = [0] * nb_error
    pred_str = [None] * nb_error
    frequency = [0] * nb_error
    for i in range(nb_error):
        # Pop up a progress bar
        if (i % 200 == 1) or (i + 200 >= nb_error):
            sg.OneLineProgressMeter('Similarity Matching', i + 1, nb_error, 'key')
        temp_similarity = [0] * nb_gallery
        for j in range(nb_gallery):
            temp_similarity[j] = 1 - distance.cosine(Y[i], X[j])
        bestMatch_index[i] = np.argmax(temp_similarity)
        bestMatch_score[i] = max(temp_similarity)
        prediction[i] = df_gallery.iloc[bestMatch_index[i]]['CLUSTER TAG']
        frequency[i] = df_gallery.iloc[bestMatch_index[i]]['FREQUENCY']
        # Assign the cluster tag
        pred_str[i] = ['C' + str(prediction[i])]
    vec_filename = outfilename[0:-4] + '_vsp.npz'
    np.savez(vec_filename, Y)
    print('Saving vector space projection as .npz file for internal algorithm evaluation: ' +
          os.getcwd() + '\\' + vec_filename)
    blz.toc()
    # Put the testing outputs together in a data frame
    print("Generating Error Ranking Output ... ")
    blz.tic()
    tst_uniq_df = df_uniq_err
    # MATCH SCORE
    df_sim = pd.DataFrame(bestMatch_score, columns=["SIMILARITY SCORE"])
    # CLUSTER_ID
    tst_df1 = pd.DataFrame(pred_str, columns=["CLUSTER_ID"])
    # NO. OF ALIKE
    tst_d2 = np.bincount(prediction)
    tst_d2 = tst_d2[prediction]
    tst_df2 = pd.DataFrame(tst_d2, columns=["NO. OF ALIKE"])
    # MODEL FREQUENCY (MODEL CLUSTER SIZE)
    # tst_d3 = frequency.astype('float', copy=True) / sum(frequency)
    # frequency is a Python list, so convert it to a float array before dividing
    tst_d3 = np.asarray(frequency, dtype=float) / sum(frequency)
    tst_df3 = pd.DataFrame(frequency, columns=["MODEL CLUSTER SIZE"])
    # MATCH SCORE
    # tst_d4 = bestMatch_score
    # tst_df4 = pd.DataFrame(tst_d4, columns=["MATCH SCORE"])
    # ERROR KPI
    # Probability of the test sample being in-cluster
    tst_d2p = tst_d2.astype('float', copy=True) / nb_error
    # Probability of the gallery sample being in-cluster
    tst_d3p = tst_d3.astype('float', copy=True) / sum(tst_d3)
    tst_d5 = abs(np.log(tst_d3p * tst_d2p))
    tst_df5 = pd.DataFrame(tst_d5, columns=["ERROR KPI"])
    # BOOSTED ERROR KPI
    tst_d6 = np.asarray((1 + tst_uniq_df['WEIGHT']) * tst_d5)
    tst_df6 = pd.DataFrame(tst_d6, columns=["BOOSTED ERROR KPI"])
    # Ignore the dataframe index before concatenation
    tst_uniq_df.reset_index(drop=True, inplace=True)
    # Concatenate with the additional attributes
    # tst_out = pd.concat([tst_uniq_df, df_sim, tst_df1, tst_df2, tst_df3, tst_df5, tst_df6], axis=1)
    tst_out = pd.concat(
        [tst_uniq_df, df_sim, tst_df1, tst_df2, tst_df3, tst_df5, tst_df6],
        axis=1)
    out_filename1 = outfilename[0:-4] + '_datacube_nopa.csv'
    # out_filename1 = 'out_DSSErr' + tst_filename_prefix + 'n' + str(node) + '_' + env_id + office + '_datacube.csv'
    tst_out.to_csv(out_filename1, index=False)
    print('Exporting ' + os.getcwd() + '\\' + out_filename1)
    blz.toc()
    print('\x1b[1;33m' + "Done with [Ranking]." + '\x1b[0m')
def parse():
    global infilename
    global outfilename
    global df_err
    global df_uniq_err
    global envStr
    errorCount = -1
    currentLine = 1
    filename_prefix = 'DSSErr'
    isFound = False
    FMT1 = '%Y-%m-%d %H:%M:%S'
    # FMT1 = '%m/%d/%Y %H:%M'
    date_str = value_list[3][0:4] + '-'
    print('\nStarting ' + '\x1b[6;30;42m' + ' STEP 3 ' + '\x1b[0m')
    # Find the total line count
    nb_lines = sum(1 for line in open(infilename, encoding="utf8"))
    # Read the keyword watchlist
    df_keyword = pd.read_csv("error_keywords.csv")
    nb_keyword = len(df_keyword)
    nexttoMainline = False
    # Identify env_id and node number (1, 2)
    fin = open(infilename, "r", encoding="utf8")
    tline = fin.readline()
    while not isFound:
        whereTIMESTAMP = tline[0:30].find(date_str)
        if whereTIMESTAMP >= 0:
            whereHOST, rightqHOST, HOSTval = getAtribute(
                '[HOST:env-', tline, whereTIMESTAMP + 24, whereTIMESTAMP + 24 + 30)
            if not (HOSTval == []):
                whereLaio = HOSTval.find('laio')
                if whereLaio >= 0:
                    # ENV_ID
                    value_list[5] = HOSTval[0:whereLaio]
                    # Node No.
                    value_list[6] = HOSTval[whereLaio + 4]
                    isFound = True
        tline = fin.readline()
    fin.close()
    envStr = 'env-' + HOSTval
    print('Environment ID: ' + envStr)
    outfilename = (filename_prefix + value_list[3] + 'n' + value_list[6] + '_' +
                   value_list[5] + '_' + value_list[4] + '.csv')
    print('Output filename: ', outfilename)
    # Open the log file
    fin = open(infilename, "r", encoding="utf8")
    tline = fin.readline()
    ary_err = []
    print("****** Parsing DSSError log text ... ")
    blz.tic()
    while tline:
        # Pop up a progress bar
        if (currentLine % 200 == 1) or (currentLine + 200 > nb_lines):
            sg.OneLineProgressMeter('Line Parsing', currentLine, nb_lines, 'key')
        isMainLine = False
        whereTIMESTAMP = tline[0:30].find(date_str)
        if whereTIMESTAMP >= 0:
            # A datetime string was found
            EXTRAval = ''
            rightTIMESTAMP = whereTIMESTAMP + 11
            # Check whether [Error] or [Fatal] exists
            whereERROR = tline[rightTIMESTAMP:].find('[Error]')
            whereFATAL = tline[rightTIMESTAMP:].find('[Fatal]')
            if (whereERROR > -1) or (whereFATAL > -1):
                isMainLine = True
                errorCount = errorCount + 1
                if whereFATAL > -1:
                    whereERROR = whereFATAL
                # Find the absolute position of [Error] by adding the TIMESTAMP offset
                whereERROR = whereERROR + rightTIMESTAMP
                # Record PID:
                wherePID, rightqPID, PIDval = getAtribute(
                    '[PID:', tline, rightTIMESTAMP, whereERROR)
                # Record THR:
                whereTHR, rightqTHR, THRval = getAtribute(
                    '[THR:', tline, rightqPID, whereERROR)
                # Record PRODUCT
                wherePPRODUCT, rightqPRODUCT, PRODUCTval = getAtribute(
                    '[', tline, rightqTHR, whereERROR)
                # Record UID
                whereUID, rightqUID, UIDval = getAtribute(
                    '[UID:', tline, whereERROR, whereERROR + 50)
                # Record SID
                whereSID, rightqSID, SIDval = getAtribute(
                    '[SID:', tline, rightqUID, rightqUID + 50)
                # Record OID
                if whereSID > -1:
                    whereOID, rightqOID, OIDval = getAtribute(
                        '[OID:', tline, rightqSID, rightqSID + 50)
                else:
                    whereOID, rightqOID, OIDval = getAtribute(
                        '[OID:', tline, rightqUID, rightqUID + 100)
                # Record TIME STAMP
                TIMESTAMPval = pd.to_datetime(
                    tline[whereTIMESTAMP:whereTIMESTAMP + 19])
                # Record ERROR
                if whereOID == -1:
                    rightqERROR = whereERROR + 6
                    if whereSID > -1:
                        # [To handle the case when SID exists]:
                        # 2018-05-14 20:35:34.745 [HOST:env-93835laio1use1][SERVER:CastorServer][PID:5949][THR:139643536082688]
                        # [Distribution Service][Error][UID:2ED12F4211E7409200000080EF755231]
                        # [SID:3723DC217D2D1C26E2AD86E25D5D1552] MSIDeliveryEngine::hDelivery(): Unknown Delivery Failed. Error string
                        # from ExecuteMultiProcess SSL Error: A failure in the SSL library occurred, usually a protocol error [Provider
                        # certificate may expire]. . <Subscription '' (ID = 00000000000000000000000000000000), Contact 'Monitoring, HeartBeat' (ID = A86D5DC6459DD1909C70188084201E1F) >
                        ERRORval = tline[whereSID + rightqSID + 2:]
                    else:
                        # [To handle the case when [SID: ....] does not exist: find the immediate right of [Error], then check if '[0x' (zeroX) exists]:
                        # 2018-05-14 20:35:34.800 [HOST:env-93835laio2use1][SERVER:CastorServer][PID:84762][THR:140030753163008]
                        # [Metadata Server][Error][0x8004140B] Object with ID '44F9CBE411E857B600000080EF854C73' and type 4 (Metric)
                        # is not found in metadata. It may have been deleted.
                        ERRORval = tline[rightqERROR + 1:]
                else:
                    # If [OID: ....] exists, find the immediate right of OID and extract the error string to the end of the line
                    ERRORval = tline[rightqOID + 1:]
                # Exceptional rule to take care of very long text led by an <rw_manipulations dump
                whereMANIPULATION = ERRORval.find('<rw_manipulations')
                if whereMANIPULATION > -1:
                    EXTRAval = ERRORval[whereMANIPULATION:whereMANIPULATION + 100] + ' ...'
                    ERRORval = ERRORval[1:whereMANIPULATION - 1]
                # Exceptional rule to take care of the Big Data team log dump defect
                whereCSV = ERRORval.find('[SimpleCSVParser]')
                if whereCSV > -1:
                    leftqDEXTRA = ERRORval[18:].find('[')
                    if leftqDEXTRA == -1:
                        leftqDEXTRA = ERRORval[18:].find('PROBLEM DESCRIPTION')
                    if leftqDEXTRA == -1:
                        leftqDEXTRA = ERRORval[18:].find('LATEST STATUS SUMMARY')
                    if leftqDEXTRA == -1:
                        leftqDEXTRA = ERRORval[18:].find('Cannot parse out numeric value from') + 36
                    if leftqDEXTRA == -1:
                        EXTRAval = ''
                    else:
                        EXTRAval = '[SimpleCSVParser]: ' + ERRORval[leftqDEXTRA + 17:]
                        ERRORval = ERRORval[1:(leftqDEXTRA + 17 - 1)]
                # Remove [TAB] and [NEW LINE] characters
                ERRORval = ERRORval.replace("\t", " ")
                ERRORval = ERRORval.replace("\n", "")
                isMainLine = True
                nexttoMainline = True
            else:
                nexttoMainline = False
            currentLine = currentLine + 1
            tline = fin.readline()
        else:
            # No datetime string: check whether the line carries extra information, i.e.
            # whether the previous line is a main [Error] line (contains [Error] or [Fatal])
            EXTRAtmp = ''
            if nexttoMainline:
                while (tline[0:20].find(date_str) == -1) and (not (tline == '')):
                    tline = tline.replace("\n", " ")
                    EXTRAtmp = EXTRAtmp + tline
                    tline = fin.readline()
                    currentLine = currentLine + 1
                EXTRAtmp = EXTRAtmp.replace("\t", " ")
                EXTRAtmp = EXTRAtmp.replace("\n", "")
                # Check whether any term matches a phrase from the truncation list,
                # which would indicate an XML or SQL dump that needs to be truncated
                if any(word in EXTRAtmp for word in truncat):
                    # Roll the extra text back into the error except under the following conditions
                    if ERRORval.find('[SimpleCSVParser]') > -1:
                        ERRORval = ERRORval + EXTRAtmp
                    else:
                        whereERRORTYPE = EXTRAtmp.find('Error type:')
                        if whereERRORTYPE > -1:
                            tempStr = EXTRAtmp[whereERRORTYPE + 12:]
                            leftqTYPE1 = [x for x in range(len(tempStr))
                                          if tempStr.startswith('[', x)]
                            leftqTYPE2 = [x for x in range(len(tempStr))
                                          if tempStr.startswith('(', x)]
                            leftqTYPE3 = [x for x in range(len(tempStr))
                                          if tempStr.startswith('.', x)]
                            if leftqTYPE1 == []:
                                leftqTYPE1 = [99999, 99999]
                            if leftqTYPE2 == []:
                                leftqTYPE2 = [99999, 99999]
                            if leftqTYPE3 == []:
                                leftqTYPE3 = [99999, 99999]
                            leftqTYPE = min(leftqTYPE1[0], leftqTYPE2[0], leftqTYPE3[0])
                            extra0 = EXTRAtmp[whereERRORTYPE:whereERRORTYPE + leftqTYPE + 10]
                            extra1 = EXTRAtmp[whereERRORTYPE + whereERRORTYPE + 12:]
                            EXTRAval = EXTRAval + ' ' + extra1
                            ERRORval = ERRORval + ' { ' + extra0 + ' }'
                        else:
                            EXTRAval = EXTRAval + ' { ' + EXTRAtmp + ' }'
                else:
                    ERRORval = ERRORval + ' {' + EXTRAtmp + '}'
                nexttoMainline = True
                isMainLine = True
            else:
                # It is NOT a legit message, so the line counter advances by 1
                nexttoMainline = False
                tline = fin.readline()
                currentLine = currentLine + 1
        # Put all attribute values together to form a vector
        if isMainLine or nexttoMainline:
            # Keyword search: search for keywords in both the ERROR and EXTRA columns.
            # If matched, write the keyword to the "KEYWORD" column
            max_kwd_weight = 0
            this_kwd_weight = 0
            keywordFound = False
            KWDval = ''
            which_keyword = 0
            # Keyword search against the keyword watch list
            while (not keywordFound) and (which_keyword < nb_keyword):
                loc_keyword = ERRORval.upper().find(
                    df_keyword['ERROR KEYWORD'][which_keyword])
                if loc_keyword > -1:
                    this_kwd_weight = df_keyword['WEIGHT'][which_keyword]
                    if this_kwd_weight > max_kwd_weight:
                        max_kwd_weight = this_kwd_weight
                        KWDval = df_keyword['ERROR KEYWORD'][which_keyword]
                else:
                    if EXTRAval != '':
                        loc_keyword = EXTRAval.upper().find(
                            df_keyword['ERROR KEYWORD'][which_keyword])
                        if loc_keyword > -1:
                            this_kwd_weight = df_keyword['WEIGHT'][which_keyword]
                            if this_kwd_weight > max_kwd_weight:
                                max_kwd_weight = this_kwd_weight
                                KWDval = df_keyword['ERROR KEYWORD'][which_keyword]
                which_keyword = which_keyword + 1
            # FATAL vs. ERROR check
            if whereFATAL > -1:
                isFatal = 1
            else:
                isFatal = 0
            # x0: BLITZ TEST
            # x1: PID
            # x2: TIME STAMP
            # x3: PRODUCT
            # x4: ERROR
            # x5: IS FATAL
            # x6: EXTRA
            # x7: UID
            # x8: SID
            # x9: OID
            # x10: THR
            # x11: KEYWORD
            # x12: WEIGHT (select the highest if many)
            # x13: IDENTICALS (initialized to 1)
            if len(EXTRAval) > 256:
                EXTRAval = EXTRAval[0:255]
            errorVec = [
                value_list[2], PIDval, TIMESTAMPval, PRODUCTval, ERRORval,
                isFatal, EXTRAval, UIDval, SIDval, OIDval, THRval, KWDval,
                max_kwd_weight, 1
            ]
            # Append the error vector to the error array
            ary_err.append(errorVec)
    fin.close()
    # Convert the list of rows to a dataframe
    df_err = pd.DataFrame(ary_err, columns=columns)
    # Sort the keyword weight in descending order so the higher-weight errors are
    # retained after finding the unique errors
    df_temp = df_err.sort_values(['WEIGHT'], ascending=False)
    # Use a unique operation to remove duplicates, where:
    # uniq_err = the unique error text; loc1 = the index of the chosen unique; counts = duplicate counts
    uniq_err, loc1, counts = np.unique(df_temp['ERROR'],
                                       return_index=True,
                                       return_counts=True)
    # q is the number of unique errors
    q = np.size(uniq_err)
    # Assemble the entire row of each chosen unique error
    df_uniq_err = df_temp.iloc[loc1]
    # Add the unique-count column to the data frame
    df_uniq_err['IDENTICALS'] = counts.tolist()
    blz.toc()
    # Write the unique error dataframe to .csv
    # out_filename1 = outfilename[0:-4] + '_dataframe.csv'
    out_filename2 = outfilename[0:-4] + '_parse.csv'
    print('Exporting ' + os.getcwd() + '\\' + out_filename2)
    df_uniq_err.to_csv(out_filename2, index=False)
    print('\x1b[1;33m' + 'Done with [Parsing].' + '\x1b[0m')
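
# NOTE: getAtribute() is called throughout parse() but defined elsewhere in this module.
# The sketch below is a hypothetical helper with a compatible shape, returning
# (tag position, position just past the closing ']', extracted value, or [] when the tag
# is not found). It only documents the apparent contract; the original helper's exact
# index conventions (absolute vs. relative offsets) may differ.
def _demo_getAtribute(tag, line, start, end):
    where = line.find(tag, start, end)
    if where == -1:
        return -1, start, []
    left = where + len(tag)            # first character of the attribute value
    right = line.find(']', left)       # closing bracket of the attribute
    return where, right + 1, line[left:right]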
def push2cube():
    global value_list
    global tst_out
    global out_filename1
    global this_prefix
    usrname = entry11.get()
    passwd = entry12.get()
    print('\nPush Dataframe to MSTR Cube: ')
    blz.tic()
    url_sel = var4.get()
    print(url_sel)
    print("Your selected MSTR URL: " + str(url_sel))
    baseURL = 'https://' + url_sel + '/MicroStrategyLibrary/api'
    isLDAP = var3.get()
    print('Project Name: ', projName)
    datasetName = this_prefix + '_cube'
    print('Cube Name: ', datasetName)
    tableName = this_prefix + '_table'
    print('Table Name: ', tableName)
    cubeinfo_name = this_prefix + '_cubeinfo'
    print('Cube Info Name: ', cubeinfo_name)
    # Authentication request and connect to the Rally Analytics project
    if isLDAP == 1:
        conn = microstrategy.Connection(base_url=baseURL,
                                        login_mode=16,
                                        username=usrname,
                                        password=passwd,
                                        project_name=projName)
    else:
        conn = microstrategy.Connection(base_url=baseURL,
                                        username=usrname,
                                        password=passwd,
                                        project_name=projName)
    conn.connect()
    print("Successfully connected to " + baseURL[:-25])
    # If the cube does not exist, acquire the Data Set Id & Table Id and create a new cube
    newDatasetId, newTableId = conn.create_dataset(data_frame=df_data2,
                                                   dataset_name=datasetName,
                                                   table_name=tableName)
    # Store the Data Set Id and Table Id locally
    cubeInfoFile = open(cubeinfo_name, 'w')
    cubeInfoFile.write(newDatasetId + '\n')
    cubeInfoFile.write(newTableId)
    cubeInfoFile.close()
    print("Successfully created a Cube on URL: ", baseURL)
    print('Project Name: ', '\x1b[6;30;42m' + projName + '\x1b[0m')
    print('Dataset/Cube Name: ' + '\x1b[6;30;42m' + datasetName + ' [Cube ID = ' +
          newDatasetId + ']' + '\x1b[0m')
    print('Table Name: ' + '\x1b[6;30;42m' + tableName + ' [Table ID = ' +
          newTableId + ']' + '\x1b[0m')
    blz.toc()
    print('\x1b[1;33m' + "Done with [Output to MSTR Cube for Dossier Reporting]" + '\x1b[0m')
def do_outlier():
    global x_scaled
    global outlier
    global outlier_x
    global outlier_pca_x
    global pca_x
    threshold = w.get()
    blz.tic()
    # Whitening transformation
    print('\nPCA and Whitening transform...')
    # Find the unique group labels
    uniq_label = np.unique(labels)
    nb_clusters = len(uniq_label)
    nb_samples = len(x_scaled)
    nb_dim = len(x_scaled[0])
    pca = PCA(n_components=nb_dim, svd_solver='full', whiten=True)
    # Initialize the variable arrays
    outlier = [0] * nb_samples
    pca_x = [a[:] for a in [[0] * nb_dim] * nb_samples]
    sigma_pca_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    sigma_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    mu_pca_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    mu_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    for i in range(nb_clusters):
        idx1 = np.where(labels == uniq_label[i])[0]
        x_inCluster = x_scaled[idx1]
        pca_x_inCluster = pca.fit_transform(x_inCluster)
        m = len(pca_x_inCluster)
        for j in range(m):
            pca_x[idx1[j]] = pca_x_inCluster[j].tolist()
        mu_x[i] = x_inCluster.mean(axis=0)
        sigma_x[i] = x_inCluster.std(axis=0)
        mu_pca_x[i] = pca_x_inCluster.mean(axis=0)
        sigma_pca_x[i] = pca_x_inCluster.std(axis=0)
        idx_temp = np.where(abs(pca_x_inCluster) >= threshold)
        idx2 = np.unique(idx_temp[0])
        for u in idx1[idx2].tolist():
            outlier[u] = 1
    nb_outliers = sum(outlier)
    print('Number of outliers: ' + str(nb_outliers))
    outlier_x = [x[:] for x in [[0] * nb_dim] * nb_outliers]
    outlier_pca_x = [x[:] for x in [[0] * nb_dim] * nb_outliers]
    c = 0
    for i in range(nb_samples):
        if outlier[i] == 1:
            outlier_x[c] = x_scaled[i].tolist()
            outlier_pca_x[c] = pca_x[i]
            c = c + 1
    do_dataframe()
    blz.toc()
    print('\x1b[1;33m' + 'Done with [Outlier Detection].' + '\x1b[0m')
    do_plotting()
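
# NOTE: a minimal, self-contained sketch of the per-cluster PCA-whitening outlier rule
# used in do_outlier(): after whitening, every component has unit variance, so any sample
# with |component| >= threshold is flagged. The toy data and helper name are illustrative.
def _demo_pca_outliers(threshold=3.0):
    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(0)
    x = rng.normal(size=(200, 3))
    x[0] = [10.0, 10.0, 10.0]          # plant an obvious outlier
    pca = PCA(n_components=3, svd_solver='full', whiten=True)
    pca_x = pca.fit_transform(x)
    outlier_idx = np.unique(np.where(np.abs(pca_x) >= threshold)[0])
    print('outliers:', outlier_idx)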