def get_data():
    global x_scaled
    global df_data

    model_name = []
    clust_no = 0

    infilename = askopenfilename(initialdir="/",
                                 title="Select Training Data",
                                 filetypes=((".csv", "*.csv"),
                                            (".sav", "*.sav"), ("all files",
                                                                "*.*")))

    blz.tic()
    print('Selected data input: ', infilename)

    if infilename.endswith('.csv'):
        df_data = pd.read_csv(infilename)
        x = df_data.values  # returns a numpy array
    elif infilename.endswith('.sav'):
        with open(infilename, 'rb') as f:
            x = pickle.load(f)
    else:
        print('Unsupported input file type: ' + infilename)
        return

    print('Data normalization ...')
    min_max_scaler = preprocessing.MinMaxScaler()
    # normalization
    x_scaled = min_max_scaler.fit_transform(x)
    print("Data sample size: " + str(len(x_scaled)))
    # f = pandas.DataFrame(x_scaled)
    blz.toc()
    print('\x1b[1;33m' + 'Done with [Data Loading].' + '\x1b[0m')
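# --- Illustrative sketch (not part of the original module) ---
# A minimal, hypothetical demo of the Min-Max normalization step performed in
# get_data(), shown on a tiny hard-coded array so the scaling behaviour is easy
# to verify. The function name is made up for illustration only.
def _demo_minmax_scaling():
    import numpy as np
    from sklearn import preprocessing

    x = np.array([[1.0, 200.0],
                  [2.0, 300.0],
                  [3.0, 400.0]])
    scaler = preprocessing.MinMaxScaler()
    # each column is rescaled independently to the [0, 1] range
    x_scaled = scaler.fit_transform(x)
    print(x_scaled)  # both columns become [0.0, 0.5, 1.0]
    return x_scaled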
def push2cube():

    global value_list
    global tst_out
    global out_filename1
    global this_prefix

    usrname = entry11.get()
    passwd = entry12.get()

    print('\nStarting ' + '\x1b[6;30;42m' + 'PUSH DATAFRAME TO MSTR CUBE: ' +
          '\x1b[0m')
    blz.tic()

    datasetName = this_prefix + '_cube'
    tableName = this_prefix + '_table'
    cubeinfo_name = this_prefix + '_cubeinfo'

    # Authentication request and connect to the Rally Analytics project
    conn = microstrategy.Connection(base_url=baseURL,
                                    login_mode=16,
                                    username=usrname,
                                    password=passwd,
                                    project_name=projName)
    conn.connect()

    print("Connect to " + baseURL)

    # Create a new cube (dataset) and capture its Dataset ID and Table ID
    newDatasetId, newTableId = conn.create_dataset(data_frame=df_data2,
                                                   dataset_name=datasetName,
                                                   table_name=tableName)
    # Store Data Set Id and Table Id locally
    cubeInfoFile = open(cubeinfo_name, 'w')
    cubeInfoFile.write(newDatasetId + '\n')
    cubeInfoFile.write(newTableId)
    cubeInfoFile.close()
    print("CREATE Cube on URL: " + baseURL[:-25])
    print('[ Dataset Name: ' + datasetName + ' \\ Cube ID = ' + newDatasetId +
          ']   [Table Name: ' + tableName + ' \\ Table ID = ' + newTableId +
          ' ]')
    blz.toc()
    print(
        '\x1b[1;33m' +
        "Done with [Output to MSTR Cube for Dossier Reporting (without PA)]" +
        '\x1b[0m')
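# --- Illustrative sketch (not part of the original module) ---
# push2cube() always creates a new cube. The sketch below shows the companion
# "update an existing cube" path, mirroring the update branch used later in
# push2cube_pa()/push2cube_nopa(): the dataset ID saved in the cube-info file is
# read back and the cube is updated with update_policy='add'. The function name
# and parameters are hypothetical; `conn` is assumed to be an already-connected
# microstrategy.Connection and `df` the dataframe to push.
def _update_existing_cube_sketch(conn, df, cubeinfo_name, tableName):
    with open(cubeinfo_name, 'r') as cubeInfoFile:
        ids = cubeInfoFile.read().splitlines()  # [dataset_id, table_id]
    conn.update_dataset(data_frame=df,
                        dataset_id=ids[0],
                        table_name=tableName,
                        update_policy='add')
    return ids[0]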
def do_clustering():
    global k
    global x_scaled
    global labels

    sel_model()
    blz.tic()
    # print(model_index)
    if model_index == 0:
        model_kmeans(k, x_scaled)
    elif model_index == 1:
        model_hc(k, x_scaled)
    elif model_index == 2:
        model_sc(k, x_scaled)
    elif model_index == 3:
        model_gmm(k, x_scaled)
    elif model_index == 4:
        model_dpgmm(k, x_scaled)
    else:
        print('Invalid Model')
    print('Final cluster numbers = ', len(np.unique(labels)))
    blz.toc()
    print('\x1b[1;33m' + 'Done with [Clustering].' + '\x1b[0m')
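# --- Illustrative sketch (not part of the original module) ---
# do_clustering() dispatches to helpers such as model_kmeans(k, x_scaled) that are
# defined elsewhere and set the global `labels`. A minimal K-Means variant of such
# a helper, assuming scikit-learn, could look like the sketch below; the real
# model_kmeans() in this project may differ.
def _model_kmeans_sketch(k, x_scaled):
    from sklearn.cluster import KMeans

    # fit_predict returns one integer cluster label per sample
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    return km.fit_predict(x_scaled)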
def push2cube_pa():

    # global value_list
    global df_final
    global baseURL
    global projName
    global out_filename0

    print('\nStarting ' + '\x1b[6;30;42m' +
          'PUSH DATAFRAME TO MSTR CUBE (W. PA): ' + '\x1b[0m')
    blz.tic()
    df_cube = pd.read_csv(out_filename0)
    df_cube['PID'] = df_cube['PID'].apply(str)

    datasetName = 'DemoTest_n' + value_list[6] + '_pa'
    tableName = 'ErrorRank_demo_n' + value_list[6] + '_pa'
    cubeinfoName = 'demoInfo_n' + value_list[6] + '_pa.txt'
    datasetName0 = 'DemoTest_' + value_list[3] + '_n' + value_list[6] + '_pa'

    isNewCube = False
    if value_list[2] == '':
        isNewCube = True

    # Authentication request and connect to the Rally Analytics project
    conn = microstrategy.Connection(base_url=baseURL,
                                    login_mode=16,
                                    username=value_list[0],
                                    password=value_list[1],
                                    project_name=projName)
    conn.connect()

    print("Connect to " + baseURL)

    if var1.get() == 1:
        # Create a new cube (dataset) and capture its Dataset ID and Table ID
        newDatasetId, newTableId = conn.create_dataset(
            data_frame=df_cube, dataset_name=datasetName, table_name=tableName)

        # Store Data Set Id and Table Id locally
        cubeInfoFile = open(cubeinfoName, 'w')
        cubeInfoFile.write(newDatasetId + '\n')
        cubeInfoFile.write(newTableId)
        cubeInfoFile.close()
        print("CREATE Cube on URL: " + baseURL[:-25])
        print('[ Dataset Name: ' + datasetName + ' \\ Cube ID = ' +
              newDatasetId + ']   [Table Name: ' + tableName +
              ' \\ Table ID = ' + newTableId + ' ]')
    else:
        # Read saved cube IDs
        cubeInfoFile = open(cubeinfoName, 'r')
        datasetID = cubeInfoFile.read().splitlines()
        cubeInfoFile.close()
        # Establish cube connection
        conn.update_dataset(data_frame=df_cube,
                            dataset_id=datasetID[0],
                            table_name=tableName,
                            update_policy='add')
        print("UPDATE Cube on URL: " + baseURL[:-25])
        print("Dataset Name " + datasetName + "[Cube ID: " + datasetID[0] +
              "   Table Name: " + tableName + "]")

    print("CREATE a backup cube: " + datasetName0)
    newDatasetId0, newTableId0 = conn.create_dataset(data_frame=df_cube,
                                                     dataset_name=datasetName0,
                                                     table_name=tableName)
    blz.toc()
    print('\x1b[1;33m' +
          "Done with [Output to MSTR Cube for Dossier Reporting (with PA)]" +
          '\x1b[0m')
def push2cube_nopa():

    global value_list
    global tst_out
    global out_filename1
    global isLDAP
    global this_prefix

    FMT1 = '%Y-%m-%d %H:%M:%S'

    print('\nStarting ' + '\x1b[6;30;42m' +
          'PUSH DATAFRAME TO MSTR CUBE (WO. PA): ' + '\x1b[0m')
    blz.tic()

    df_cube = pd.read_csv(out_filename1)
    df_cube['PID'] = df_cube['PID'].apply(str)

    # datasetName = this_prefix + 'err_n' + value_list[6] + '_nopa'
    # tableName = this_prefix + 'ErrorRank_n' + value_list[6] + '_nopa'
    # cubeinfo_name = 'Cube Info_' + this_prefix + 'n' + value_list[6] + '_nopa.txt'
    # datasetName0 = this_prefix + 'cube' + value_list[3] + '_n' + value_list[6] + '_nopa'

    datasetName = this_prefix + 'err_nopa'
    tableName = this_prefix + 'ErrorRank_nopa'
    cubeinfo_name = 'Cube Info_' + this_prefix + '_nopa.txt'
    datasetName0 = this_prefix + 'cube' + value_list[3] + '_nopa'

    # Authentication request and connect to the Rally Analytics project
    # is LDAP login (1) or standard user (0)
    if isLDAP == 1:
        conn = microstrategy.Connection(base_url=baseURL,
                                        login_mode=16,
                                        username=value_list[0],
                                        password=value_list[1],
                                        project_name=projName)
    else:
        conn = microstrategy.Connection(base_url=baseURL,
                                        username=value_list[0],
                                        password=value_list[1],
                                        project_name=projName)
    conn.connect()

    print("Connect to " + baseURL)

    # Create a new cube or use the existing cube
    if var1.get() == 1:
        # Create a new cube (dataset) and capture its Dataset ID and Table ID
        newDatasetId, newTableId = conn.create_dataset(
            data_frame=df_cube, dataset_name=datasetName, table_name=tableName)
        # Store Data Set Id and Table Id locally
        cubeInfoFile = open(cubeinfo_name, 'w')
        cubeInfoFile.write(newDatasetId + '\n')
        cubeInfoFile.write(newTableId)
        cubeInfoFile.close()
        print("CREATE Cube on URL: " + baseURL[:-25])
        print('[ Dataset Name: ' + datasetName + ' \\ Cube ID = ' +
              newDatasetId + ']   [Table Name: ' + tableName +
              ' \\ Table ID = ' + newTableId + ' ]')
    else:
        # Read saved cube IDs

        cubeInfoFile = open(cubeinfo_name, 'r')
        datasetID = cubeInfoFile.read().splitlines()
        cubeInfoFile.close()
        # Establish cube connection
        conn.update_dataset(data_frame=df_cube,
                            dataset_id=datasetID[0],
                            table_name=tableName,
                            update_policy='add')
        print("UPDATE Cube on URL: " + baseURL[:-25])
        print("Dataset Name " + datasetName + "[Cube ID: " + datasetID[0] +
              "   Table Name: " + tableName + "]")

    print("CREATE a backup cube: " + datasetName0)
    newDatasetId0, newTableId0 = conn.create_dataset(data_frame=df_cube,
                                                     dataset_name=datasetName0,
                                                     table_name=tableName)
    blz.toc()
    print(
        '\x1b[1;33m' +
        "Done with [Output to MSTR Cube for Dossier Reporting (without PA)]" +
        '\x1b[0m')
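# --- Illustrative sketch (not part of the original module) ---
# The push2cube* functions above display the server root by slicing off the last
# 25 characters of baseURL (the length of '/MicroStrategyLibrary/api'). A slightly
# more explicit way to derive the same root, shown only as a sketch with a
# hypothetical helper name, avoids the hard-coded length:
def _server_root_sketch(base_url):
    # 'https://host/MicroStrategyLibrary/api' -> 'https://host'
    return base_url.rsplit('/MicroStrategyLibrary/api', 1)[0]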
def paTimeline():
    global envStr
    global envfilename
    global pafilename
    global tst_out
    # global prediction
    global df_err_pa
    global outfilename
    global out_filename0
    global df_final

    print('\nStarting ' + '\x1b[6;30;42m' + ' STEP 6 ' + '\x1b[0m')
    FMT1 = '%Y-%m-%d %H:%M:%S'

    df_pa0 = pd.read_csv(pafilename)
    df_ins = pd.read_csv(envfilename)

    pa_cols = list(df_pa0.columns[[2, 3, 4]])
    instance_id = df_ins[df_ins['instance_name'] ==
                         envStr]['instance_id'].item()
    print('Instance ID: ', instance_id)

    df_pa = df_pa0[df_pa0['instance_id'] == instance_id][pa_cols]
    df_pa = df_pa.rename(
        {
            'metric_retrieve_time': 'ROUNDED TIME STAMP',
            'cpu_avg_usage': 'CPU',
            'mem_avg_usage': 'MEMORY'
        },
        axis='columns')

    pa_timestamp = pd.to_datetime(df_pa['ROUNDED TIME STAMP'])

    # t1: rounded timestamp lower bound (error TS lower bound - 2 hours);
    # t2: rounded timestamp upper bound (error TS upper bound + 2 hours)
    # EST to UTC (+5 hours), then +/- 2 hours for the PA range
    tst_out2 = tst_out.copy()
    # shift TIME STAMP from the EST to the UTC time zone
    tst_out2['TIME STAMP'] = tst_out2['TIME STAMP'] + pd.DateOffset(hours=5)
    error_ts_lowbound = min(tst_out2['TIME STAMP'])
    error_ts_upperbound = max(tst_out2['TIME STAMP'])
    t1 = error_ts_lowbound - pd.DateOffset(hours=2)
    t2 = error_ts_upperbound + pd.DateOffset(hours=2)
    df_pa2 = df_pa[pa_timestamp.between(t1, t2)]

    # PA time sample rate (resolution) is 5 min.
    delta_t = 5

    print(
        '****** Aligning error timestamps and PA rounded timestamps [down-sampling to '
        + str(delta_t) + ' min interval] ...')
    blz.tic()
    pa_time = pd.to_datetime(df_pa2['ROUNDED TIME STAMP'])
    err_time = tst_out2['TIME STAMP']

    nb_error = len(tst_out2)
    time_align_idx = [None] * nb_error
    old_time = [None] * nb_error
    round_time = [None] * nb_error
    err_local_clust = list([0] * nb_error)
    count_in_time = list([0] * nb_error)
    norm_cpu = list([0] * nb_error)
    norm_memory = list([0] * nb_error)

    for i in range(nb_error):
        err_time = tst_out2.iloc[i]['TIME STAMP']
        old_time[i] = err_time
        # find the time differences between the error outlier timestamp vs all PA timestamps
        time_delta = err_time - pd.to_datetime(pa_time)
        # find the index of the closest PA timestamp
        idx = np.argmin(abs(np.array(time_delta)))

        # if the aligned timestamp falls far below the PA lower bound (< t - 5 min) or
        # beyond the PA upper bound (> t + 5 min), do not assign the matched index
        if not (idx == 0 and
                (err_time < pa_time.iloc[0] - timedelta(minutes=delta_t)
                 or err_time > pa_time.iloc[-1] + timedelta(minutes=delta_t))):
            time_align_idx[i] = idx
        else:
            time_align_idx[i] = 0

        round_time[i] = pa_time.iloc[idx]
    blz.toc()

    print("\nArranging dataframe output  ...")
    blz.tic()
    #  x0: BLITZ TEST
    #  x1: PID
    #  x2: TIME STAMP
    #  x3: PRODUCT
    #  x4: ERROR
    #  x5: IS FATAL
    #  x6: EXTRA
    #  x7: UID
    #  x8: SID
    #  x9: OID
    #  x10: THR
    #  x11: KEYWORD
    #  x12: WEIGHT (select the highest if many)
    #  x13: IDENTICALS (initialized by value 1)
    #  x14: CLUSTER_ID
    #  x15: MODEL CLUSTER SIZE
    #  x16: NO.OF ALIKE
    #  x17: ERROR KPI
    #  x18: BOOSTED ERROR KPI
    #  x19: ROUNDED TIME STAMP
    #  x20: CPU
    #  x21: MEMORY
    #  x22: COUNTS OF SIMILAR ERRORS FROM THE SAME CLUSTER
    #  x23: POPULATION NORMALIZED KPI
    #  x24: ERROR COUNT PER TIMESTAMP
    #  x25: POPULATION NORMALIZED CPU
    #  x26: POPULATION NORMALIZED MEMORY

    # Re-index all error outlier samples
    df_tstout = pd.DataFrame(tst_out2, index=range(nb_error))

    # add rounded timestamps and reassign index
    df_x19 = pd.DataFrame(round_time,
                          columns=['ROUNDED TIME STAMP'],
                          index=range(nb_error))
    # Use the aligned time indices to extract the PA CPU and memory readings
    df_x20 = df_pa2.iloc[time_align_idx][['CPU', 'MEMORY']]
    # Reset the index so the rows line up with the error outlier data frame
    df_x21 = df_x20.reset_index(drop=True)
    #
    time_uniq, loc4, counts4 = np.unique(df_x19,
                                         return_index=True,
                                         return_counts=True)
    nb_ts = len(time_uniq)

    #
    for i in range(nb_ts):
        select_indices = list(np.where(df_x19 == time_uniq[i])[0])
        time_block = prediction[select_indices]
        # Number of samples in the time block
        nb_tb = len(time_block)

        # find unique cluster tags and counts within a time block
        clust_uniq, loc5, counts5 = np.unique(time_block,
                                              return_index=True,
                                              return_counts=True)
        for j in range(nb_tb):
            count_in_time[select_indices[j]] = nb_tb
            norm_cpu[select_indices[j]] = df_x20.iloc[
                select_indices[0]]['CPU'] / nb_tb
            norm_memory[select_indices[j]] = df_x21.iloc[
                select_indices[0]]['MEMORY'] / nb_tb
            #
            select_index = np.where(clust_uniq == time_block[j])[0][0]
            err_local_clust[select_indices[j]] = counts5[select_index]
    df_x22 = pd.DataFrame(err_local_clust,
                          columns=["SIMILAR ERROR COUNTS IN CLUSTER"])
    df_x23 = pd.DataFrame(tst_out2["ERROR KPI"].to_numpy() /
                          err_local_clust / count_in_time,
                          columns=["NORMALIZED KPI"])

    df_x24 = pd.DataFrame({'ERROR COUNT PER TIMESTAMP': count_in_time})
    df_x25 = pd.DataFrame({'NORMALIZED CPU': norm_cpu})
    df_x26 = pd.DataFrame({'NORMALIZED MEMORY': norm_memory})

    df_err2 = pd.concat(
        [df_tstout, df_x19, df_x21, df_x22, df_x23, df_x24, df_x25, df_x26],
        axis=1)
    # Append the PA rows so CPU and memory values exist for timestamps without errors
    df_final = pd.concat([df_pa2, df_err2])
    df_final = df_final.sort_values(by='BOOSTED ERROR KPI', ascending=False)
    df_final = df_final.reset_index(drop=True)

    df_length = len(df_final)
    for i in range(df_length):
        if np.isnan(df_final.iloc[i]['NORMALIZED CPU']):
            if np.isnan(df_final.iloc[i]['ERROR COUNT PER TIMESTAMP']):
                df_final.loc[i, 'NORMALIZED CPU'] = df_final.loc[i, 'CPU']
                df_final.loc[i, 'NORMALIZED MEMORY'] = df_final.loc[i, 'MEMORY']
    blz.toc()
    # remove the PA-only rows (no errors) that fall inside the error time interval
    ss = np.array(
        pd.to_datetime(df_final['ROUNDED TIME STAMP']) <= error_ts_upperbound)
    tt = np.array(
        pd.to_datetime(df_final['ROUNDED TIME STAMP']) >= error_ts_lowbound)
    uu = np.array(np.isnan(df_final['ERROR COUNT PER TIMESTAMP']))

    df_final = df_final.drop(df_final[ss & tt & uu].index)
    df_final = df_final.reset_index(drop=True)

    # OUTPUT RESULT AS .CSV FOR DATA CUBE
    blz.tic()
    print("Saving testing results ... ")
    out_filename0 = outfilename[0:-4] + '_datacube_pa.csv'
    print('Exporting ' + os.getcwd() + ' ' + out_filename0)
    df_final.to_csv(out_filename0, index=False)
    blz.toc()
    print('\x1b[1;33m' + "Done with [Time Stamp Correlation]." + '\x1b[0m')
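# --- Illustrative sketch (not part of the original module) ---
# paTimeline() aligns each error timestamp to the nearest 5-minute PA timestamp with
# a per-row argmin loop. A vectorized alternative, shown here only as a sketch and
# not as the module's actual method, is pandas.merge_asof with direction='nearest'
# and a tolerance of one sample interval. `df_err` and `df_pa` stand for the error
# and PA frames carrying the 'TIME STAMP' / 'ROUNDED TIME STAMP' columns used above.
def _align_errors_to_pa_sketch(df_err, df_pa, delta_t=5):
    import pandas as pd

    err = df_err.copy()
    pa = df_pa.copy()
    err['TIME STAMP'] = pd.to_datetime(err['TIME STAMP'])
    pa['ROUNDED TIME STAMP'] = pd.to_datetime(pa['ROUNDED TIME STAMP'])
    # merge_asof requires both frames to be sorted on the join keys
    err = err.sort_values('TIME STAMP')
    pa = pa.sort_values('ROUNDED TIME STAMP')
    # rows farther than delta_t minutes from any PA sample get NaN for CPU/MEMORY
    return pd.merge_asof(err, pa,
                         left_on='TIME STAMP',
                         right_on='ROUNDED TIME STAMP',
                         direction='nearest',
                         tolerance=pd.Timedelta(minutes=delta_t))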
def predict():
    global df_uniq_err
    global tst_out
    global prediction
    global out_filename1
    global vec_filename
    global pty_filename

    print('\nStarting ' + '\x1b[6;30;42m' + ' STEP 4 ' + '\x1b[0m')

    # nb_error is the number of unique errors
    nb_error = len(df_uniq_err['ERROR'])
    tst_clean = [None] * nb_error

    # Load ML Training model
    print("******  Vectorization gallery ... ")
    df_gallery = pd.read_csv(pty_filename,
                             encoding="ISO-8859-1",
                             engine='python')
    vectorizer2 = pickle.load(open(vec_filename, "rb"), encoding='iso-8859-1')

    x = vectorizer2.transform(df_gallery['ERROR TOKENS'])
    X = x.toarray()
    nb_gallery = len(X)

    print("****** Text Regular Expression: probe  ... ")
    blz.tic()
    # text regular expression operation
    regex_pat123 = re.compile(r'[^a-zA-Z0-9\s]', flags=re.IGNORECASE)
    regex_pat = re.compile(r'[^a-zA-Z\s]', flags=re.IGNORECASE)

    for k in range(nb_error):
        if (k % 200 == 1) or (k + 200 >= nb_error):
            sg.OneLineProgressMeter('Vectorization', k + 1, nb_error, 'key')
        temp = df_uniq_err['ERROR'].iloc[k]
        temp = temp.replace("/", " ")
        temp = temp.replace("_", " ")
        temp = temp.replace("-", " ")
        temp = temp.replace("=", " ")
        temp = temp.replace(";", " ")
        temp = temp.replace(".", " ")
        temp = temp.replace("'", "")
        # remove non-printable characters
        # temp = temp.replace("\xc3",'')
        # temp = temp.replace("\xa4",'')
        # temp = temp.replace("\xe5",'')
        temp = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', temp)
        #
        # tst_words = nltk.word_tokenize(temp)
        # tst_series = pd.Series(tst_words)
        tst_series = pd.Series(temp)
        # keep only words
        tst_clean1 = tst_series.str.replace(regex_pat123, ' ', regex=True)
        mask = ((tst_series.str.len() == 32) |
                (tst_series.str.len() == 33)) & (~tst_series.str.islower())
        tst_clean1.loc[mask] = 'GUID'
        # keep only alphabetic characters; the cleaned words are joined below
        tst_clean1 = tst_series.str.replace(regex_pat, ' ', regex=True)
        tst_clean2 = tst_clean1.str.cat(sep=' ')
        tst_clean[k] = tst_clean2
    blz.toc()

    print("****** Cluster Prediction on testing samples ... ")
    blz.tic()

    print('Prediction: [learning Gallery: ' + pty_filename +
          '] [vector space: ' + vec_filename + ']')
    y = vectorizer2.transform(tst_clean)
    Y = y.toarray()
    # prediction = loaded_model.predict(Y)

    bestMatch_index = [0] * nb_error
    bestMatch_score = [0] * nb_error
    prediction = [0] * nb_error
    pred_str = [None] * nb_error
    frequency = [0] * nb_error

    for i in range(nb_error):
        # Pop up a progress bar
        if (i % 200 == 1) or (i + 200 >= nb_error):
            sg.OneLineProgressMeter('Similarity Matching', i + 1, nb_error,
                                    'key')

        temp_similarity = [0] * nb_gallery
        for j in range(nb_gallery):
            temp_similarity[j] = 1 - distance.cosine(Y[i], X[j])

        bestMatch_index[i] = np.argmax(temp_similarity)
        bestMatch_score[i] = max(temp_similarity)
        prediction[i] = df_gallery.iloc[bestMatch_index[i]]['CLUSTER TAG']
        frequency[i] = df_gallery.iloc[bestMatch_index[i]]['FREQUENCY']

        # Assign the cluster tags
        pred_str[i] = ['C' + str(prediction[i])]

    vec_filename = outfilename[0:-4] + '_vsp.npz'
    np.savez(vec_filename, Y)
    print(
        'Saving vector space projection as .npz file for internal algorithm evaluation:  '
        + os.getcwd() + '\\' + vec_filename)
    blz.toc()

    # Put testing outputs together in a data frame format
    print("Generating Error Ranking Output ... ")
    blz.tic()

    tst_uniq_df = df_uniq_err

    # MATCH SCORE
    df_sim = pd.DataFrame(bestMatch_score, columns=["SIMILARITY SCORE"])

    # CLUSTER_ID
    tst_df1 = pd.DataFrame(pred_str, columns=["CLUSTER_ID"])

    # NO. OF ALIKE
    tst_d2 = np.bincount(prediction)
    tst_d2 = tst_d2[prediction]
    tst_df2 = pd.DataFrame(tst_d2, columns=["NO. OF ALIKE"])

    # MODEL FREQUENCY (MODEL CLUSTER SIZE)
    tst_d3 = np.asarray(frequency, dtype=float) / sum(frequency)
    tst_df3 = pd.DataFrame(frequency, columns=["MODEL CLUSTER SIZE"])

    # MATCH SCORE
    # tst_d4 = bestMatch_score
    # tst_df4 = pd.DataFrame(tst_d4, columns=["MATCH SCORE"])

    # ERROR KPI
    # prob. of test in-cluster
    tst_d2p = tst_d2.astype('float', copy=True) / nb_error
    # prob. of gallery in-cluster
    tst_d3p = tst_d3.astype('float', copy=True) / sum(tst_d3)

    tst_d5 = abs(np.log(tst_d3p * tst_d2p))
    tst_df5 = pd.DataFrame(tst_d5, columns=["ERROR KPI"])

    # BOOSTED ERROR KPI
    tst_d6 = np.asarray((1 + tst_uniq_df['WEIGHT']) * tst_d5)
    tst_df6 = pd.DataFrame(tst_d6, columns=["BOOSTED ERROR KPI"])

    # Ignore the dataframe index before concatenation
    tst_uniq_df.reset_index(drop=True, inplace=True)

    # Concatenate with additional attributes
    # tst_out = pd.concat([tst_uniq_df, df_sim, tst_df1, tst_df2, tst_df3, tst_df5, tst_df6], axis=1)
    tst_out = pd.concat(
        [tst_uniq_df, df_sim, tst_df1, tst_df2, tst_df3, tst_df5, tst_df6],
        axis=1)

    out_filename1 = outfilename[0:-4] + '_datacube_nopa.csv'
    # out_filename1 = 'out_DSSErr' + tst_filename_prefix + 'n' + str(node) + '_' + env_id + office + '_datacube.csv'
    tst_out.to_csv(out_filename1, index=False)
    print('exporting ' + os.getcwd() + '\\' + out_filename1)
    blz.toc()
    print('\x1b[1;33m' + "Done with [Ranking]." + '\x1b[0m')
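# --- Illustrative sketch (not part of the original module) ---
# predict() scores each test vector against the gallery with a double loop over
# scipy's distance.cosine. The same best-match index and score can be computed in
# one call with scikit-learn's pairwise cosine_similarity; this is only a sketch of
# the idea, with a hypothetical function name, not the module's implementation.
def _best_match_sketch(Y, X):
    from sklearn.metrics.pairwise import cosine_similarity

    sim = cosine_similarity(Y, X)    # shape: (nb_error, nb_gallery)
    best_index = sim.argmax(axis=1)  # index of the closest gallery error
    best_score = sim.max(axis=1)     # its cosine similarity (= 1 - cosine distance)
    return best_index, best_score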
def parse():
    global infilename
    global outfilename
    global df_err
    global df_uniq_err
    global envStr

    errorCount = -1
    currentLine = 1
    filename_prefix = 'DSSErr'
    isFound = False

    FMT1 = '%Y-%m-%d %H:%M:%S'
    # FMT1 = '%m/%d/%Y %H:%M'
    date_str = value_list[3][0:4] + '-'

    print('\nStarting ' + '\x1b[6;30;42m' + ' STEP 3 ' + '\x1b[0m')

    # find total line number
    with open(infilename, encoding="utf8") as f:
        nb_lines = sum(1 for line in f)

    # Read keyword watchlist
    df_keyword = pd.read_csv("error_keywords.csv")
    nb_keyword = len(df_keyword)
    nexttoMainline = False

    # Identify env_id and node number = (1,2)
    fin = open(infilename, "r", encoding="utf8")
    tline = fin.readline()
    while not isFound and tline:  # stop at EOF even if no HOST tag is found
        whereTIMESTAMP = tline[0:30].find(date_str)
        if whereTIMESTAMP >= 0:
            whereHOST, rightqHOST, HOSTval = getAtribute(
                '[HOST:env-', tline, whereTIMESTAMP + 24,
                whereTIMESTAMP + 24 + 30)
            if not (HOSTval == []):
                whereLaio = HOSTval.find('laio')
                if whereLaio >= 0:
                    # ENV_ID
                    value_list[5] = HOSTval[0:whereLaio]
                    # Node No
                    value_list[6] = HOSTval[whereLaio + 4]
                    isFound = True
        tline = fin.readline()
    fin.close()
    envStr = 'env-' + HOSTval
    print('Environment ID: ' + envStr)

    outfilename = filename_prefix + value_list[3] + 'n' + value_list[
        6] + '_' + value_list[5] + '_' + value_list[4] + '.csv'
    print('Output filename: ', outfilename)

    # Open file
    fin = open(infilename, "r", encoding="utf8")
    tline = fin.readline()

    ary_err = []
    print("****** Parsing DSSError log text ... ")
    blz.tic()
    while tline:
        # Pop up a progress bar
        if (currentLine % 200 == 1) or (currentLine + 200 > nb_lines):
            sg.OneLineProgressMeter('Line Parsing', currentLine, nb_lines,
                                    'key')

        isMainLine = False
        whereTIMESTAMP = tline[0:30].find(date_str)
        if whereTIMESTAMP >= 0:  # if datetime string can be found
            EXTRAval = ''
            rightTIMESTAMP = whereTIMESTAMP + 11
            whereERROR = tline[rightTIMESTAMP:].find(
                '[Error]')  # Check whether [Error] or [Fatal] exists
            whereFATAL = tline[rightTIMESTAMP:].find('[Fatal]')

            if ((whereERROR > -1) or (whereFATAL > -1)):
                isMainLine = True

                errorCount = errorCount + 1
                if (whereFATAL > -1):
                    whereERROR = whereFATAL

                # find the absolute position of [ERROR] by adding the TIMESTAMP offset
                whereERROR = whereERROR + rightTIMESTAMP
                # record PID:
                wherePID, rightqPID, PIDval = getAtribute(
                    '[PID:', tline, rightTIMESTAMP, whereERROR)
                # record THR:
                whereTHR, rightqTHR, THRval = getAtribute(
                    '[THR:', tline, rightqPID, whereERROR)
                # record PRODUCT
                wherePPRODUCT, rightqPRODUCT, PRODUCTval = getAtribute(
                    '[', tline, rightqTHR, whereERROR)
                # record UID
                whereUID, rightqUID, UIDval = getAtribute(
                    '[UID:', tline, whereERROR, whereERROR + 50)
                # record SID
                whereSID, rightqSID, SIDval = getAtribute(
                    '[SID:', tline, rightqUID, rightqUID + 50)
                # record OID
                if (whereSID > -1):
                    whereOID, rightqOID, OIDval = getAtribute(
                        '[OID:', tline, rightqSID, rightqSID + 50)
                else:
                    whereOID, rightqOID, OIDval = getAtribute(
                        '[OID:', tline, rightqUID, rightqUID + 100)
                # record TIME STAMP
                TIMESTAMPval = pd.to_datetime(
                    tline[whereTIMESTAMP:whereTIMESTAMP + 19])
                # record ERROR
                if (whereOID == -1):
                    rightqERROR = whereERROR + 6
                    if (whereSID > -1):
                        # [To handle the case when SID exists]:
                        # 2018-05-14 20:35:34.745 [HOST:env-93835laio1use1][SERVER:CastorServer][PID:5949][THR:139643536082688]
                        # [Distribution Service][Error][UID:2ED12F4211E7409200000080EF755231]
                        # [SID:3723DC217D2D1C26E2AD86E25D5D1552] MSIDeliveryEngine::hDelivery(): Unknown Delivery Failed. Error string
                        # from ExecuteMultiProcess SSL Error: A failure in the SSL library occurred, usually a protocol error [Provider
                        # certificate may expire]. . <Subscription '' (ID = 00000000000000000000000000000000), Contact 'Monitoring, HeartBeat' (ID = A86D5DC6459DD1909C70188084201E1F) >
                        ERRORval = tline[whereSID + rightqSID + 2:]
                    else:
                        # [To handle the case when [SID: ....] does not exist: find the immediate right of [ERROR], then check whether '[0x' (zero-x) exists]:
                        # 2018-05-14 20:35:34.800 [HOST:env-93835laio2use1][SERVER:CastorServer][PID:84762][THR:140030753163008]
                        # [Metadata Server][Error][0x8004140B] Object with ID '44F9CBE411E857B600000080EF854C73' and type 4 (Metric)
                        # is not found in metadata. It may have been deleted.
                        ERRORval = tline[rightqERROR + 1:]
                else:
                    # if [OID: ....] exists, find the immediate right of OID and extract the error string all the way to the end of the line
                    ERRORval = tline[rightqOID + 1:]

                # Exception rule to handle very long text led by an <rw_manipulations dump
                whereMANIPULATION = ERRORval.find('<rw_manipulations')
                if (whereMANIPULATION > -1):
                    EXTRAval = ERRORval[whereMANIPULATION:whereMANIPULATION +
                                        100] + ' ...'
                    ERRORval = ERRORval[1:whereMANIPULATION - 1]

                # Exceptional rule to take care of Big Data team log dump defect
                whereCSV = ERRORval.find('[SimpleCSVParser]')
                if (whereCSV > -1):
                    leftqDEXTRA = ERRORval[18:].find('[')
                    if leftqDEXTRA == -1:
                        leftqDEXTRA = ERRORval[18:].find('PROBLEM DESCRIPTION')
                    if leftqDEXTRA == -1:
                        leftqDEXTRA = ERRORval[18:].find(
                            'LATEST STATUS SUMMARY')
                    if leftqDEXTRA == -1:
                        leftqDEXTRA = ERRORval[18:].find(
                            'Cannot parse out numeric value from') + 36
                    if leftqDEXTRA == -1:
                        EXTRAval = ''
                    else:
                        EXTRAval = '[SimpleCSVParser]: ' + ERRORval[
                            leftqDEXTRA + 17:]
                        ERRORval = ERRORval[1:(leftqDEXTRA + 17 - 1)]

                # remove [TAB] and [NEW LINE] character
                ERRORval = ERRORval.replace("\t", " ")
                ERRORval = ERRORval.replace("\n", "")
                isMainLine = True
                nexttoMainline = True
            else:
                nexttoMainline = False
            currentLine = currentLine + 1
            tline = fin.readline()

        else:
            # if the datetime string does not exist, check whether the line carries extra information,
            # i.e. check whether the previous line is a main [ERROR] line (contains [Error] or [Fatal])
            EXTRAtmp = ''
            if nexttoMainline:

                while (tline[0:20].find(date_str)
                       == -1) and (not (tline == '')):
                    tline = tline.replace("\n", " ")
                    EXTRAtmp = EXTRAtmp + tline
                    tline = fin.readline()
                    currentLine = currentLine + 1
                EXTRAtmp = EXTRAtmp.replace("\t", " ")
                EXTRAtmp = EXTRAtmp.replace("\n", "")

                # Check whether any term matches a phrase from the truncation list, which may indicate an XML or SQL dump that needs to be truncated
                if any(word in EXTRAtmp for word in truncat):
                    # Roll the extra text back into the error, except in the following cases
                    if ERRORval.find('[SimpleCSVParser]') > -1:
                        ERRORval = ERRORval + EXTRAtmp
                    else:
                        whereERRORTYPE = EXTRAtmp.find('Error type:')
                        if whereERRORTYPE > -1:
                            tempStr = EXTRAtmp[whereERRORTYPE + 12:]
                            leftqTYPE1 = [
                                x for x in range(len(tempStr))
                                if tempStr.startswith('[', x)
                            ]
                            leftqTYPE2 = [
                                x for x in range(len(tempStr))
                                if tempStr.startswith('(', x)
                            ]
                            leftqTYPE3 = [
                                x for x in range(len(tempStr))
                                if tempStr.startswith('.', x)
                            ]
                            if leftqTYPE1 == []:
                                leftqTYPE1 = [99999, 99999]
                            if leftqTYPE2 == []:
                                leftqTYPE2 = [99999, 99999]
                            if leftqTYPE3 == []:
                                leftqTYPE3 = [99999, 99999]
                            leftqTYPE = min(leftqTYPE1[0], leftqTYPE2[0],
                                            leftqTYPE3[0])
                            extra0 = EXTRAtmp[whereERRORTYPE:whereERRORTYPE +
                                              leftqTYPE + 10]
                            extra1 = EXTRAtmp[whereERRORTYPE + whereERRORTYPE +
                                              12:]
                            EXTRAval = EXTRAval + ' ' + extra1
                            ERRORval = ERRORval + ' { ' + extra0 + ' }'
                        else:
                            EXTRAval = EXTRAval + ' { ' + EXTRAtmp + ' }'
                else:
                    ERRORval = ERRORval + ' {' + EXTRAtmp + '}'
                nexttoMainline = True
                isMainLine = True
            else:
                # it is NOT a legit message, so the line counter advances by 1
                nexttoMainline = False
                tline = fin.readline()
                currentLine = currentLine + 1

        # Put all attribute values together to form a vector
        if (isMainLine or nexttoMainline):
            # Keyword Search: Search keyword on both ERROR and EXTRA columns. If matched, write keyword to the "KEYWORD" column
            max_kwd_weight = 0
            this_kwd_weight = 0
            keywordFound = False
            KWDval = ''
            which_keyword = 0

            # Keyword search against the keyword watchlist
            while (not keywordFound) and (which_keyword < nb_keyword):
                loc_keyword = ERRORval.upper().find(
                    df_keyword['ERROR KEYWORD'][which_keyword])
                if loc_keyword > -1:
                    this_kwd_weight = df_keyword['WEIGHT'][which_keyword]
                    if this_kwd_weight > max_kwd_weight:
                        max_kwd_weight = this_kwd_weight
                        KWDval = df_keyword['ERROR KEYWORD'][which_keyword]
                else:
                    if EXTRAval != '':
                        loc_keyword = EXTRAval.upper().find(
                            df_keyword['ERROR KEYWORD'][which_keyword])
                        if loc_keyword > -1:
                            this_kwd_weight = df_keyword['WEIGHT'][
                                which_keyword]
                            if this_kwd_weight > max_kwd_weight:
                                max_kwd_weight = this_kwd_weight
                                KWDval = df_keyword['ERROR KEYWORD'][
                                    which_keyword]
                which_keyword = which_keyword + 1

            # FATAL vs. ERROR check
            if (whereFATAL > -1):
                isFatal = 1
            else:
                isFatal = 0

            #  x0: BLITZ TEST
            #  x1: PID
            #  x2: TIME STAMP
            #  x3: PRODUCT
            #  x4: ERROR
            #  x5: IS FATAL
            #  x6: EXTRA
            #  x7: UID
            #  x8: SID
            #  x9: OID
            #  x10: THR
            #  x11: KEYWORD
            #  x12: WEIGHT (select the highest if many)
            #  x13: IDENTICALS (initialized by value 1)

            if len(EXTRAval) > 256:
                EXTRAval = EXTRAval[0:255]

            errorVec = [
                value_list[2], PIDval, TIMESTAMPval, PRODUCTval, ERRORval,
                isFatal, EXTRAval, UIDval, SIDval, OIDval, THRval, KWDval,
                max_kwd_weight, 1
            ]

            # Append the error vector into error array
            ary_err.append(errorVec)

    fin.close()

    # Convert the data type series to dataframe
    df_err = pd.DataFrame(ary_err, columns=columns)

    # Sort keyword weight in descending order so the higher-weight errors are retained when duplicates are removed
    df_temp = df_err.sort_values(['WEIGHT'], ascending=False)
    # Use a unique operation to remove duplicates, where:
    # uniq_err = the unique error text, loc1 = the index of the chosen unique row, counts = duplicate counts
    uniq_err, loc1, counts = np.unique(df_temp['ERROR'],
                                       return_index=True,
                                       return_counts=True)

    # q is the number of unique errors
    q = np.size(uniq_err)

    # Assemble the entire row of chosen unique
    df_uniq_err = df_temp.iloc[loc1].copy()

    # Add unique count column into the data frame
    df_uniq_err['IDENTICALS'] = counts.tolist()
    blz.toc()

    # Out unique error dataframe to .csv
    # out_filename1 = outfilename[0:-4]+'_dataframe.csv'
    out_filename2 = outfilename[0:-4] + '_parse.csv'
    print('Exporting ' + os.getcwd() + '\\' + out_filename2)
    df_uniq_err.to_csv(out_filename2, index=False)
    print('\x1b[1;33m' + 'Done with [Parsing].' + '\x1b[0m')
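# --- Illustrative sketch (not part of the original module) ---
# parse() expects error_keywords.csv to expose at least an 'ERROR KEYWORD' and a
# 'WEIGHT' column (both referenced above). A tiny stand-in watchlist, useful for
# exercising the parser without the real file, could be written as below; the
# function name and the keywords are made up for illustration only.
def _write_demo_keyword_watchlist(path="error_keywords.csv"):
    import pandas as pd

    demo = pd.DataFrame({
        'ERROR KEYWORD': ['SSL ERROR', 'NOT FOUND IN METADATA'],
        'WEIGHT': [2, 1],
    })
    demo.to_csv(path, index=False)
    return path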
def push2cube():

    global value_list
    global tst_out
    global out_filename1
    global this_prefix

    usrname = entry11.get()
    passwd = entry12.get()

    print('\nPush Dataframe to MSTR Cube: ')
    blz.tic()

    url_sel = var4.get()
    print(url_sel)

    print("Your selected MSTR URL:  " + str(url_sel))
    baseURL = 'https://' + url_sel + '/MicroStrategyLibrary/api'

    isLDAP = var3.get()

    print('Project Name: ', projName)
    datasetName = this_prefix + '_cube'
    print('Cube Name: ', datasetName)
    tableName = this_prefix + '_table'
    print('Table Name: ', tableName)
    cubeinfo_name = this_prefix + '_cubeinfo'
    print('Cube Info Name: ', cubeinfo_name)

    # Authentication request and connect to the Rally Analytics project
    if isLDAP == 1:
        conn = microstrategy.Connection(base_url=baseURL,
                                        login_mode=16,
                                        username=usrname,
                                        password=passwd,
                                        project_name=projName)
    else:
        conn = microstrategy.Connection(base_url=baseURL,
                                        username=usrname,
                                        password=passwd,
                                        project_name=projName)
    conn.connect()
    print("Successfully Connect to " + baseURL[:-25])

    # Create a new cube (dataset) and capture its Dataset ID and Table ID
    newDatasetId, newTableId = conn.create_dataset(data_frame=df_data2,
                                                   dataset_name=datasetName,
                                                   table_name=tableName)
    # Store Data Set Id and Table Id locally
    cubeInfoFile = open(cubeinfo_name, 'w')
    cubeInfoFile.write(newDatasetId + '\n')
    cubeInfoFile.write(newTableId)
    cubeInfoFile.close()

    print("Succeefully Create a Cube on URL: ", baseURL)
    print('Project Name: ', '\x1b[6;30;42m' + projName + '\x1b[0m')
    print('Dataset/Cube Name: ' + '\x1b[6;30;42m' + datasetName +
          ' [Cube ID = ' + newDatasetId + ']' + '\x1b[0m')
    print('Table Name: ' + '\x1b[6;30;42m' + tableName + ' [Table ID = ' +
          newTableId + ']' + '\x1b[0m')
    blz.toc()
    print('\x1b[1;33m' +
          "Done with [Output to MSTR Cube for Dossier Reporting]." + '\x1b[0m')
def do_outlier():
    global x_scaled
    global outlier
    global outlier_x
    global outlier_pca_x
    global pca_x

    threshold = w.get()
    blz.tic()
    # Whitening Transformation
    print('\nPCA and Whitening transform...')

    # Find unique group labels
    uniq_label = np.unique(labels)

    nb_clusters = len(uniq_label)
    nb_samples = len(x_scaled)
    nb_dim = len(x_scaled[0])

    pca = PCA(n_components=nb_dim, svd_solver='full', whiten=True)

    # initialize variable arrays
    outlier = [0] * nb_samples
    pca_x = [a[:] for a in [[0] * nb_dim] * nb_samples]
    sigma_pca_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    sigma_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    mu_pca_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]
    mu_x = [a[:] for a in [[0] * nb_dim] * nb_clusters]

    for i in range(nb_clusters):
        idx1 = np.where(labels == uniq_label[i])[0]
        x_inCluster = x_scaled[idx1]
        pca_x_inCluster = pca.fit_transform(x_inCluster)
        m = len(pca_x_inCluster)
        for j in range(m):
            pca_x[idx1[j]] = pca_x_inCluster[j].tolist()

        mu_x[i] = x_inCluster.mean(axis=0)
        sigma_x[i] = x_inCluster.std(axis=0)

        mu_pca_x[i] = pca_x_inCluster.mean(axis=0)
        sigma_pca_x[i] = pca_x_inCluster.std(axis=0)

        idx_temp = np.where(abs(pca_x_inCluster) >= threshold)
        idx2 = np.unique(idx_temp[0])
        for u in idx1[idx2].tolist():
            outlier[u] = 1

    nb_outliers = sum(outlier)
    print('Number of outliers: ' + str(nb_outliers))

    outlier_x = [x[:] for x in [[0] * nb_dim] * nb_outliers]
    outlier_pca_x = [x[:] for x in [[0] * nb_dim] * nb_outliers]
    c = 0

    for i in range(nb_samples):
        if outlier[i] == 1:
            outlier_x[c] = x_scaled[i].tolist()
            outlier_pca_x[c] = pca_x[i]
            c = c + 1

    do_dataframe()
    blz.toc()
    print('\x1b[1;33m' + 'Done with [Outlier Detection].' + '\x1b[0m')
    do_plotting()
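# --- Illustrative sketch (not part of the original module) ---
# do_outlier() whitens each cluster with PCA and flags a sample as an outlier when any
# whitened coordinate exceeds the slider threshold (roughly a per-component z-score
# test, since whitened components have unit variance). A compact single-cluster version
# of the same test, assuming scikit-learn and a hypothetical function name:
def _pca_outlier_sketch(x_cluster, threshold=3.0):
    import numpy as np
    from sklearn.decomposition import PCA

    x = np.asarray(x_cluster, dtype=float)
    pca = PCA(n_components=x.shape[1], svd_solver='full', whiten=True)
    z = pca.fit_transform(x)
    # a sample is an outlier if any whitened component lies beyond +/- threshold
    return (np.abs(z) >= threshold).any(axis=1).astype(int)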