def cardio_re():
    """Sweep DBSCAN ``eps`` values over the cardio dataset and print metrics.

    For each candidate eps the data is reloaded, ``dbscan.dbscanp`` is run,
    its labels are recoded to a binary outlier indicator (noise ``-1`` -> 1,
    everything else -> 0), and weighted F1, false-alarm rate, adjusted Rand
    and Jaccard scores are printed.
    """
    for eps in [3, 4, 5]:
        # Reloaded inside the loop on purpose: the frame is handed straight
        # to dbscanp, which may modify it — a fresh copy keeps runs isolated.
        mat = loadmat('../data/cardio.mat')
        y = pd.DataFrame(mat['y'])[0].to_numpy()
        X_car = pd.DataFrame(mat['X'])
        print(X_car.head(12))
        print(y)
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(X_car, 21, eps=eps, minpts=10, factor=1)
        # Column 21 of the returned frame holds the cluster labels.
        y_t = clusterlmat[0][21].to_numpy()
        print(y_t)
        # Noise points (-1) are the predicted outliers.
        y_t = (y_t == -1).astype(int)
        print(y_t)
        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(eps)
def pima_re():
    """Sweep (minpts, factor, eps) triples of ``dbscan.dbscanp`` over the
    Pima Indians diabetes dataset and print clustering quality metrics.

    Noise labels (-1) are treated as predicted outliers (1), all other
    cluster assignments as normal (0).
    """
    # Each triple is (minpts, factor, eps).
    for minpts, factor, eps in [(45, 0.2, 0.9), (35, 0.35, 2),
                                (25, 0.2, 0.0005)]:
        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        print(data.head(12))
        # Column 8 is the ground-truth class.
        y = data[8].to_numpy()
        print(y)
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(
            data, 8, eps=eps, minpts=minpts, factor=factor,
            initialization=dbscan.Initialization.UNIFORM, plot=True)
        # print(clusterlmat[0][13])
        # Cluster labels live in column 9 of the returned frame.
        y_t = clusterlmat[0][9].to_numpy()
        # Noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (y_t == -1).astype(int)
        print(y_t)
        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(eps)
def pima_pre_norm_and_pca_re_dbm():
    """Min-max normalise the Pima data, project to 7 principal components,
    sweep ``dbscanppm.dbscanmp`` eps values, and append one result row per
    run to ``dbscanm.pima.pca.result.csv``.
    """
    # Previously-tried alternate sweeps (minpts 268 and 280):
    # [(268, 0.5, e) for e in (0.5, 0.4, 0.35, 0.3, 0.2, 0.1)]
    # [(280, 0.5, e) for e in (0.5, 0.4, 0.35, 0.3, 0.2, 0.1)]
    epss = [(260, 0.5, 0.5), (260, 0.5, 0.4), (260, 0.5, 0.35),
            (260, 0.5, 0.3), (260, 0.5, 0.2), (260, 0.5, 0.1)]
    # Each triple is (minpts, factor, eps).
    for minpts, factor, eps in epss:
        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        # Column 8 is the ground-truth class.
        y = data[8].to_numpy()
        print(y)
        # Scale to [0, 1] so PCA is not dominated by wide-range columns.
        data2 = pd.DataFrame(MinMaxScaler().fit_transform(data))
        pca = PCA(n_components=7)
        principledf = pd.DataFrame(data=pca.fit_transform(data2.iloc[:, 0:8]))
        clusterlmat = dbscanppm.dbscanmp(
            principledf, 7, eps=eps, minpts=minpts, factor=factor,
            threshold=0.06, initialization=dbscan.Initialization.NONE,
            plot=False)
        # Column 7 of the returned frame holds the cluster labels;
        # noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (clusterlmat[0][7].to_numpy() == -1).astype(int)
        print("cluster labels:", y_t)
        print("eps: ", eps)
        f_sc = f1_score(y, y_t, average='weighted')
        fa = assess.falsealarmrate(y, [0], y_t, 1)
        ard = adjusted_rand_score(y, y_t)
        js = jaccard_score(y, y_t)
        print(f_sc)
        print(fa)
        print(ard)
        print(js)
        print(eps)
        rr = [minpts, eps, factor, f_sc, fa, ard, js,
              dbscan.Initialization.NONE]
        # newline='' prevents the csv module from emitting blank rows on
        # Windows (per the csv module documentation).
        with open('../data/pima_pca/dbscanm.pima.pca.result.csv', 'a',
                  newline='') as fd:
            csv.writer(fd).writerow(rr)
def glass_re():
    """Sweep DBSCAN ``eps`` values over the glass dataset and print metrics.

    Noise labels (-1) are treated as predicted outliers (1).
    """
    for eps in [0.4, 0.9, 1, 1.2, 2]:
        mat = loadmat('../data/glass.mat')
        y = pd.DataFrame(mat['y'])[0].to_numpy()
        X_gla = pd.DataFrame(mat['X'])
        print(X_gla.head(12))
        print(y)
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        # NOTE(review): the label-column argument 21 looks copied from the
        # cardio experiment; glass has far fewer columns, and the labels are
        # read from column 9 below — verify against dbscan.dbscanp.
        clusterlmat = dbscan.dbscanp(
            X_gla, 21, eps=eps, minpts=8, factor=0.5,
            initialization=dbscan.Initialization.KCENTRE)
        y_t = clusterlmat[0][9].to_numpy()
        # Noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (y_t == -1).astype(int)
        print(y_t)
        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(eps)
def main():
    """Load the cardio dataset, cluster it with sklearn DBSCAN, and print a
    false-alarm rate and per-class Jaccard scores for the noise-as-outlier
    recoding.
    """
    mat = loadmat('../data/cardio.mat')
    print(mat)
    X_car = mat['X']
    y_car = pd.DataFrame(mat['y'])
    # print(list(y_car[0]))
    # y_car.hist()
    # NOTE(review): hist() above is commented out, so this title/show pair
    # displays an empty figure — confirm whether the histogram should be on.
    plt.title("Cardiotocography Data Class Distribution")
    plt.show()
    data = pd.DataFrame(X_car)
    # classes 2, 3, 6, 7 has the least number of frequency. Thus we will use
    # those classes as the outliers.
    clustering = DBSCAN(eps=45, min_samples=15).fit(data)
    print(clustering.labels_)
    print(type(clustering.labels_))
    print(np.array(y_car[0]))
    # DBSCAN noise (-1) -> predicted outlier (1), else normal (0).
    copy_clusterings = (clustering.labels_ == -1).astype(int)
    print('false alarm rate: ',
          metrics.falsealarmrate(np.array(y_car[0]), [1], copy_clusterings, 1))
    print(jaccard_score(np.array(y_car[0]), copy_clusterings, average=None))
def pima_pre_norm_and_pca_kmeans_re():
    """Min-max normalise the Pima data, project to 7 principal components,
    run ``km.kmeansm`` over a small parameter grid, and append one result
    row per run to ``kmeans.pima.pca.result.csv``.
    """
    # Previously-tried grids with first parameter 1 and 2:
    # [(1, n, 50, 0.0001) for n in (250, 268, 275)]
    # [(2, n, 50, 0.0001) for n in (250, 268, 275)]
    # Each tuple is (k, m, iters, tol); m is presumably a size/outlier-count
    # target — verify against km.kmeansm.
    epss3 = [(3, 250, 50, 0.0001), (3, 268, 50, 0.0001), (3, 275, 50, 0.0001)]
    for k, m, iters, tol in epss3:
        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        # Column 8 is the ground-truth class.
        y = data[8].to_numpy()
        print(y)
        # Scale to [0, 1] so PCA is not dominated by wide-range columns.
        data2 = pd.DataFrame(MinMaxScaler().fit_transform(data))
        pca = PCA(n_components=7)
        principledf = pd.DataFrame(data=pca.fit_transform(data2.iloc[:, 0:8]))
        # kmeansm returns the label vector directly.
        y_t = np.asarray(km.kmeansm(principledf, k, m, iters, tol, 7))
        # Noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (y_t == -1).astype(int)
        print("cluster labels:", y_t)
        print("eps: ", iters)
        f_sc = f1_score(y, y_t, average='weighted')
        fa = assess.falsealarmrate(y, [0], y_t, 1)
        ard = adjusted_rand_score(y, y_t)
        js = jaccard_score(y, y_t)
        print(f_sc)
        print(fa)
        print(ard)
        print(js)
        print(iters)
        rr = [k, iters, m, f_sc, fa, ard, js, "KMEANS--"]
        # newline='' prevents blank rows in the csv on Windows.
        with open('../data/pima_pca/kmeans.pima.pca.result.csv', 'a',
                  newline='') as fd:
            csv.writer(fd).writerow(rr)
def main():
    """Cluster the satellite dataset with DBSCAN and report results.

    Prints the raw cluster labels, a per-row "truth cluster" dump against
    the ground-truth column (36), and a false-alarm rate where DBSCAN noise
    (-1) is compared against the 'o' (outlier) class.
    """
    data = pd.read_csv('data/satellite-unsupervised-ad.csv', header=None)
    print(data.head())
    truth = data[36]
    truth.hist()
    plt.show()
    # Fit on feature columns only (columns 0..34).
    model = DBSCAN(eps=42, min_samples=7).fit(data.iloc[:, 0:35])
    print()
    print(model.labels_)
    print(type(model.labels_))
    # Ground truth alongside the assigned cluster, one row per line.
    for label, cluster in zip(data[36], model.labels_):
        print(label + " " + str(cluster))
    print('false alarm rate: ',
          metrics.falsealarmrate(data.iloc[:, 36].values, 'o',
                                 model.labels_, -1))
def pima_pre_norm_re():
    """Min-max normalise the Pima data and sweep (minpts, factor, eps)
    triples of ``dbscan.dbscanp``, printing clustering quality metrics for
    each run.
    """
    # Each triple is (minpts, factor, eps).
    for minpts, factor, eps in [(45, 0.02, 0.9), (35, 0.0001, 2),
                                (25, 0.2, 0.00005)]:
        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        print(data.head(12))
        # Column 8 is the ground-truth class.
        y = data[8].to_numpy()
        print(y)
        # Scale to [0, 1]; the PCA variant of this experiment lives in
        # pima_pre_norm_and_pca_re_dbm.
        data2 = pd.DataFrame(MinMaxScaler().fit_transform(data))
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(
            data2, 8, eps=eps, minpts=minpts, factor=factor,
            initialization=dbscan.Initialization.NONE, plot=False)
        # NOTE(review): labels are read from column 9 while 8 is passed to
        # dbscanp — verify which column dbscanp writes the labels to.
        y_t = clusterlmat[0][9].to_numpy()
        # Noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (y_t == -1).astype(int)
        print(y_t)
        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(eps)
def main():
    """Run sklearn DBSCAN on the Pima and Glass datasets and print
    outlier-detection metrics (false-alarm rate, Jaccard).

    Noise points (label -1) are recoded as the predicted outlier class (1).
    The Cardio and Wine experiments are kept below, commented out.
    """
    ## Pima Data set
    datapima = pd.read_csv("data/pimaindiansdiabetes.csv")
    col = [
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]
    datapima.columns = col
    # Last column ('class') is the ground truth; the rest are features.
    X = datapima.iloc[:, :-1]
    y = datapima.iloc[:, -1]
    y.hist()
    plt.title(" Pima Indians diabetes Data Class Distribution")
    plt.show()
    clusteringpima = DBSCAN(eps=0.9, min_samples=6).fit(X)
    print(clusteringpima.labels_)
    print(type(clusteringpima.labels_))
    # print(type(clustering.labels_))
    # Dump "truth:cluster" pairs row by row.
    # NOTE(review): row[8] relies on positional fallback since the columns
    # were renamed above — row['class'] would be the explicit spelling.
    for index, row in datapima.iterrows():
        print(str(row[8]) + ":" + str(clusteringpima.labels_[index]))
    print('false alarm rate: ',
          metrics.falsealarmrate(y, [1], clusteringpima.labels_, -1))
    pima = clusteringpima.labels_
    pima = pima.copy()
    # DBSCAN noise (-1) -> predicted outlier (1), else normal (0).
    preds = [1 if i == -1 else 0 for i in pima]
    print(jaccard_score(y, preds))
    print(preds)
    ## Cardio Data Set
    # mat = scipy.io.loadmat('data/cardio.mat')
    # X_car = mat['X']
    # y_car=mat['y']
    # y_car= pd.DataFrame(y_car)
    # print(list(y_car[0]))
    # y_car.hist()
    # plt.title("Cardiotocography Data Class Distribution")
    # plt.show()
    # X_car = pd.DataFrame(X_car)
    # clusteringcar = DBSCAN(eps=1.5, min_samples=10).fit(X_car)
    # print(clusteringcar.labels_)
    # print('false alarm rate: ', metrics.falsealarmrate(y_car[0], [1], clusteringcar.labels_, -1))
    # car= clusteringcar.labels_
    # car = car.copy()
    # predcar = [1 if i==-1 else 0 for i in car]
    # print(jaccard_score(y_car[0], predcar))
    # print(predcar)
    ## Wine Data Set
    # win = scipy.io.loadmat('data/wine.mat')
    # print(win)
    # X_win = win['X']
    # y_win=win['y']
    # print(y_win)
    # X_win = pd.DataFrame(X_win)
    # y_win= pd.DataFrame(y_win)
    # y_win.hist()
    # plt.title("Wine Data Class Distribution")
    # plt.show()
    # clusteringwin = DBSCAN(eps=1.5, min_samples=5).fit(X_win)
    # print(clusteringwin.labels_)
    # # print('false alarm rate: ', metrics.falsealarmrate(data.iloc[:, 36].values, 'o', clustering.labels_, -1))
    # win= clusteringwin.labels_
    # win = win.copy()
    # predwin = [ 1 if i==-1 else 0 for i in win]
    # print(predwin)
    ## Glass Data Set
    glass = scipy.io.loadmat('data/glass.mat')
    print(glass)
    X_gla = glass['X']
    y_gla = glass['y']
    X_gla = pd.DataFrame(X_gla)
    y_gla = pd.DataFrame(y_gla)
    y_gla.hist()
    plt.title("Glass Data Class Distribution")
    plt.show()
    clusteringgla = DBSCAN(eps=0.9, min_samples=10, n_jobs=-1).fit(X_gla)
    print(clusteringgla.labels_)
    print('false alarm rate: ',
          metrics.falsealarmrate(y_gla[0], [1], clusteringgla.labels_, -1))
    gla = clusteringgla.labels_
    gla = gla.copy()
    # Same noise -> outlier recoding as above (the extra .copy() is redundant
    # but harmless).
    predgla = [1 if i == -1 else 0 for i in gla.copy()]
    print(jaccard_score(y_gla[0], predgla))
    print(predgla)
def shuttle_re_db_ann_uniform():
    """Sweep (eps, minpts, factor) triples of the ANN-backed DBSCAN over the
    shuttle training set and append one score row per run to
    ``dbscan.dbann.uniform.shuttle.result.csv``.

    Rows with ground-truth class 4 are excluded from scoring; classes
    2, 3, 5, 6 and 7 are treated as the true outliers.
    """
    # Each triple is (eps, minpts, factor).
    epss = [(4.5, 10, 0.1), (4.8, 10, 0.1), (5, 10, 0.1), (5.3, 10, 0.1),
            (5.5, 10, 0.1), (5.8, 10, 0.1), (6, 10, 0.1), (6.8, 10, 0.1),
            (7, 10, 0.1), (9, 10, 0.1), (10, 10, 0.1), (28, 10, 0.1),
            (28.5, 10, 0.1)]
    for eps, minpts, factor in epss:
        data = pd.read_csv('../data/shuttle-unsupervised-trn.csv',
                           header=None)
        # Column 9 is the ground-truth class.
        y = data[9].to_numpy()
        print(y)
        clusterlmat = dbann.dbscanann(
            data, 9, eps=eps, minpts=minpts, factor=factor,
            initialization=dbann.Initialization.UNIFORM, plot=False)
        # Column 10 of the returned frame holds the cluster labels.
        y_t = clusterlmat[0][10].to_numpy()
        identified_noise = np.count_nonzero(y_t == -1)
        print("count of noise points", identified_noise)
        # Drop class-4 rows from the evaluation; apply the same mask to the
        # predictions so the two arrays stay aligned.
        keep = y != 4
        y = y[keep]
        y_t = y_t[keep]
        # Noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (y_t == -1).astype(int)
        # Classes 2, 3, 5, 6, 7 are the true outliers.
        y = np.isin(y, (2, 3, 5, 6, 7)).astype(int)
        f1_scored = f1_score(y, y_t, average='weighted')
        falarm = assess.falsealarmrate(y, [0], y_t, 1)
        arand = adjusted_rand_score(y, y_t)
        jacc = jaccard_score(y, y_t)
        rr = [minpts, eps, factor, f1_scored, falarm, arand, jacc,
              identified_noise]
        # newline='' prevents blank rows in the csv on Windows.
        with open('../data/ann/dbscan.dbann.uniform.shuttle.result.csv', 'a',
                  newline='') as fd:
            csv.writer(fd).writerow(rr)
def shuttle_re_kmeans():
    """Run ``km.kmeansm`` over the shuttle training set for a small (k, m)
    grid and append one score row per run to ``kmeans.shuttle.result.csv``.

    Rows with ground-truth class 4 are excluded from scoring; classes
    2, 3, 5, 6 and 7 are treated as the true outliers.
    """
    # Each tuple is (k, m); m is presumably a size/outlier-count target —
    # verify against km.kmeansm.
    par = [(1, 2500), (1, 2644), (1, 2700),
           (2, 2500), (2, 2644), (2, 2700)]
    for k, m in par:
        data = pd.read_csv('../data/shuttle-unsupervised-trn.csv',
                           header=None)
        # Column 9 is the ground-truth class.
        y = data[9].to_numpy()
        print(y)
        # kmeansm returns the label vector directly.
        y_t = np.asarray(km.kmeansm(data, k, m, 50, 0.001, 9))
        identified_noise = np.count_nonzero(y_t == -1)
        print("count of noise points", identified_noise)
        # Drop class-4 rows from the evaluation; apply the same mask to the
        # predictions so the two arrays stay aligned.
        keep = y != 4
        y = y[keep]
        y_t = y_t[keep]
        # Noise (-1) -> predicted outlier (1), else normal (0).
        y_t = (y_t == -1).astype(int)
        # Classes 2, 3, 5, 6, 7 are the true outliers.
        y = np.isin(y, (2, 3, 5, 6, 7)).astype(int)
        print("Actual number of outlier: ", np.count_nonzero(y == 1))
        f1_scored = f1_score(y, y_t, average='weighted')
        falarm = assess.falsealarmrate(y, [0], y_t, 1)
        arand = adjusted_rand_score(y, y_t)
        jacc = jaccard_score(y, y_t)
        rr = [k, m, 50, f1_scored, falarm, arand, jacc, identified_noise]
        # newline='' prevents blank rows in the csv on Windows.
        with open('../data/ann/kmeans.shuttle.result.csv', 'a',
                  newline='') as fd:
            csv.writer(fd).writerow(rr)