def main():
    """Run the dimensionality-reduction experiments (PCA / ICA / RP / Select-K)
    on the wine-quality and credit-card-default data sets."""
    # ---------- Wine quality data set (11 features) ----------
    wine_file = "data/winequality-red.csv"
    x, y = extractData(wine_file)
    reducer = Decomposing()
    reducer.pca_dim_reduction(x, y, 'wine', 11)
    reducer.pca_eval(x, y, 'wine', 5, 'Rating')
    reducer.ica_dim_reduction(x, y, 'wine', 11)
    reducer.ica_eval(x, y, 'wine', 11, 'Rating')
    # Randomized projection: the reduction step reports the best component count.
    n = reducer.rp_dim_reduction(x, y, 'wine', 11)
    print('Optimal n for RP(Wine data set) : {} '.format(str(n)))
    reducer.rp_eval(x, y, 'wine', n, 'Rating')
    k = reducer.sk_dim_reduction(x, y, 'wine', 11)
    # decompose.sk_eval(x, y, 'wine', k)

    # ---------- Credit-card default data set (24 features) ----------
    default_file = "data/default_of_credit_card_clients.csv"
    x, y = extractData(default_file)
    reducer = Decomposing()
    reducer.pca_dim_reduction(x, y, 'default', 24)
    reducer.pca_eval(x, y, 'default', 10, 'Defaulted')
    reducer.ica_dim_reduction(x, y, 'default', 24)
    reducer.ica_eval(x, y, 'default', 6, 'Defaulted')
    n = reducer.rp_dim_reduction(x, y, 'default', 24)
    print('Optimal n for RP(CC Default data set) : {} '.format(str(n)))
    reducer.rp_eval(x, y, 'default', n, 'Defaulted')
    k = reducer.sk_dim_reduction(x, y, 'default', 24)
def classify(self, data_file, encode, label):
    """Grid-search, train and evaluate an MLP neural-network classifier.

    data_file -- path to the CSV data set
    encode    -- when True, label-encode non-numeric feature columns and the target
    label     -- data-set label used in plot titles/filenames
    """
    X, Y = extractData(data_file)
    enc = LabelEncoder()
    if encode:
        _, n_cols = X.shape
        for col in range(n_cols):
            # Heuristic: treat a column as categorical when its sample value
            # (row 1) is not numeric, then label-encode the whole column.
            if not str(X.iloc[1][col]).isnumeric():
                enc.fit(X.iloc[:, col])
                X.iloc[:, col] = enc.transform(X.iloc[:, col])
        enc.fit(Y)
        Y = enc.transform(Y)

    train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3)

    nn = MLPClassifier()
    parameter_grid = {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'hidden_layer_sizes': [(5,), (10,), (15,), (20,)],
    }

    start = timer()
    classifier, grid_search = getBestModel(nn, parameter_grid, train_x, train_y)
    classify_model = classifier.fit(train_x, train_y)
    pred_y = classify_model.predict(test_x)
    end = timer()
    print('Elapsed time of train and test : ' + str(end - start))

    accuracy = metrics.accuracy_score(test_y, pred_y) * 100
    print('Accuracy of Neural Network = {:.2f}%'.format(accuracy))

    plotValidationCurve("Neural Network", label, grid_search,
                        train_x, train_y, parameter_grid)
    plotLearningCurve("Neural Network", label, classifier, X, Y)
    plotPerformance(test_y, pred_y, label, 'Algorithm: Neural Network')
def main():
    """Cluster both data sets with k-Means and EM, scoring each model by
    adjusted mutual information against the true labels."""
    results = pd.DataFrame(columns=[
        'Data Set', 'Cluster Algo.', '# Clusters', 'Mutual Info Score'
    ])
    cluster = Clustering()

    # ---------- Wine quality ----------
    x, y = extractData("data/winequality-red.csv")
    x = pd.DataFrame(preprocessing.scale(x), columns=x.columns)

    cluster.kMeansCluster('wine', x, y, 6, 10)
    kmeans_model = cluster.kmeansFitBestModel(6, 10, 'wine', x, 5)
    km_score = adjusted_mutual_info_score(kmeans_model.labels_, y)
    results = append_results(results, ['Wine Quality', 'k-Means', 5, km_score])

    cluster.emCluster('wine', x, y, 6, 10)
    em_model = cluster.emFitBestModel(6, 10, 'wine', 5, x)
    em_score = adjusted_mutual_info_score(em_model.predict(x), y)
    results = append_results(results, ['Wine Quality', 'Exp. Maximization', 5, em_score])

    # ---------- Credit-card default ----------
    x, y = extractData("data/default_of_credit_card_clients.csv")
    x = pd.DataFrame(preprocessing.scale(x), columns=x.columns)

    # Add aggregate bill/payment features before clustering.
    bill_cols = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
                 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
    pmt_cols = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
                'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
    num_cols = x.shape[1]
    x['total_bill'] = x.loc[:, bill_cols].sum(axis=1)
    x['total_pmt'] = x.loc[:, pmt_cols].sum(axis=1)

    cluster.kMeansCluster('default', x, y, num_cols, num_cols + 1)
    kmeans_model = cluster.kmeansFitBestModel(num_cols, num_cols + 1, 'default', x, 10)
    km_score = adjusted_mutual_info_score(kmeans_model.labels_, y)
    results = append_results(results, ['CC Default', 'k-Means', 10, km_score])

    cluster.emCluster('default', x, y, num_cols, num_cols + 1)
    em_model = cluster.emFitBestModel(num_cols, num_cols + 1, 'default', 15, x)
    em_score = adjusted_mutual_info_score(em_model.predict(x), y)
    results = append_results(results, ['CC Default', 'Exp. Maximization', 15, em_score])

    print(results)
    print(results.to_latex())
def classify(self, data_file, encode, label):
    """Grid-search, train and evaluate a decision-tree classifier.

    data_file -- path to the CSV data set
    encode    -- when True, label-encode non-numeric feature columns and the target
    label     -- data-set label used in plot titles/filenames
    """
    X, Y = extractData(data_file)
    enc = LabelEncoder()
    if encode:
        rows, cols = X.shape
        for c in range(cols):
            # Heuristic: treat a column as categorical when its sample value
            # (row 1) is not numeric, then label-encode the whole column.
            if not str(X.iloc[1][c]).isnumeric():
                enc.fit(X.iloc[:, c])
                X.iloc[:, c] = enc.transform(X.iloc[:, c])
        enc.fit(Y)
        Y = enc.transform(Y)

    # BUG FIX: this split was commented out, leaving train_x/test_x/train_y/test_y
    # undefined and crashing with NameError below. Restored with the original
    # parameters (20% hold-out, fixed seed for reproducibility).
    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=0.2, random_state=123)

    # Find best model
    dct = tree.DecisionTreeClassifier()
    parameter_grid = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': range(1, 10),
        'max_features': range(1, 5),
        'min_samples_leaf': range(1, 5)
    }

    start = timer()
    classifier, grid_search = getBestModel(dct, parameter_grid, train_x, train_y)
    classify_model = classifier.fit(train_x, train_y)
    pred_y = classify_model.predict(test_x)
    end = timer()
    print('Elapsed time of train and test : ' + str(end - start))

    # NOTE(review): this is the grid-search cross-validation accuracy, not the
    # hold-out accuracy the sibling classifiers report — confirm this is intended.
    accuracy = grid_search.best_score_ * 100
    print('Accuracy of Decision Tree(depth={}) = {:.2f}%'.format(
        classify_model.get_depth(), accuracy))

    plotValidationCurve("Decision Tree", label, grid_search,
                        train_x, train_y, parameter_grid)
    plotLearningCurve("Decision Tree", label, classifier, X, Y)
    plotPerformance(test_y, pred_y, label, 'Algorithm: Decision Tree')
def classifyWithBoost(self, data_file, encode, label):
    """Grid-search, train and evaluate a gradient-boosted tree classifier.

    data_file -- path to the CSV data set
    encode    -- when True, label-encode non-numeric feature columns and the target
    label     -- data-set label used in plot titles/filenames
    """
    X, Y = extractData(data_file)
    enc = LabelEncoder()
    if encode:
        rows, cols = X.shape
        for c in range(cols):
            # Heuristic: treat a column as categorical when its sample value
            # (row 1) is not numeric, then label-encode the whole column.
            if not str(X.iloc[1][c]).isnumeric():
                enc.fit(X.iloc[:, c])
                X.iloc[:, c] = enc.transform(X.iloc[:, c])
        enc.fit(Y)
        Y = enc.transform(Y)

    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=0.3, random_state=123)

    dctb = GradientBoostingClassifier(random_state=123)
    parameter_grid = {
        'learning_rate': [0.2, 0.3, 0.5],
        'max_depth': [2, 3, 4, 5]
    }

    start = timer()
    classifier, grid_search = getBestModel(dctb, parameter_grid, train_x, train_y)

    plotValidationCurve("Decision Tree with Boost", label, grid_search,
                        train_x, train_y, parameter_grid)
    plotLearningCurve("Decision Tree with Boost", label, classifier, X, Y)

    # BUG FIX: the model was previously fit on the FULL data (X, Y) and then
    # scored on test_x, which is part of X — test-set leakage that inflated
    # the reported accuracy. Fit on the training split only.
    classify_model = classifier.fit(train_x, train_y)
    pred_y = classify_model.predict(test_x)
    end = timer()
    print('Elapsed time of train and test : ' + str(end - start))

    accuracy = classify_model.score(test_x, test_y) * 100
    print('Accuracy of GradientBoostingClassifier = {:.2f}%'.format(
        accuracy))

    plotPerformance(test_y, pred_y, label,
                    'Algorithm: Decision Tree with Boost')
def classify(self, data_file, encode, label):
    """Grid-search, train and evaluate a k-nearest-neighbors classifier.

    data_file -- path to the CSV data set
    encode    -- when True, label-encode non-numeric feature columns and the target
    label     -- data-set label used in plot titles/filenames
    """
    X, Y = extractData(data_file)
    enc = LabelEncoder()
    if encode:
        rows, cols = X.shape
        for c in range(cols):
            # Heuristic: treat a column as categorical when its sample value
            # (row 1) is not numeric, then label-encode the whole column.
            if not str(X.iloc[1][c]).isnumeric():
                enc.fit(X.iloc[:, c])
                X.iloc[:, c] = enc.transform(X.iloc[:, c])
        enc.fit(Y)
        Y = enc.transform(Y)

    # BUG FIX: this split was commented out, leaving train_x/test_x/train_y/test_y
    # undefined and crashing with NameError below. Restored with the original
    # parameters (30% hold-out, fixed seed for reproducibility).
    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=0.3, random_state=123)

    knn = KNeighborsClassifier()
    parameter_grid = {'n_neighbors': range(1, 10)}

    start = timer()
    classifier, grid_search = getBestModel(knn, parameter_grid, train_x, train_y)
    classify_model = classifier.fit(train_x, train_y)
    pred_y = classify_model.predict(test_x)
    accuracy = classify_model.score(test_x, test_y) * 100
    end = timer()
    print('Elapsed time of train and test : ' + str(end - start))
    print('Accuracy of KNN = {:.2f}%'.format(accuracy))

    plotValidationCurve("KNeighbours", label, grid_search,
                        train_x, train_y, parameter_grid)
    plotLearningCurve("KNeighbours", label, classifier, X, Y)
    plotPerformance(test_y, pred_y, label, 'Algorithm: k-Nearest Neighbors')
def data_prep(self, test_size=0.3):
    """Load the credit-card-default data set, record the number of target
    classes, and store a train/test split on the instance.

    test_size -- fraction of rows held out for testing (default 0.3)
    """
    features, target = extractData("data/default_of_credit_card_clients.csv")
    self.num_classes = len(np.unique(target))
    split = train_test_split(features, target, test_size=test_size)
    self.train_x, self.test_x, self.train_y, self.test_y = split
def main():
    """Combine dimensionality reduction (PCA/ICA/RP/Select-K) with clustering
    (k-Means/EM) on both data sets, score each combination by adjusted mutual
    information, then compare neural-network accuracy on raw, reduced, and
    cluster-augmented features."""
    exp_results = pd.DataFrame(columns=[
        'Data Set', 'Dim. Red. Algo', 'Cluster Algo.',
        '# Clusters', 'Mutual Info Score'
    ])

    # ===================== Wine quality =====================
    data_file = "data/winequality-red.csv"
    x, y = extractData(data_file)
    x = pd.DataFrame(preprocessing.scale(x), columns=x.columns)
    dimreduce = Decomposing()
    cluster = Clustering()

    # --- PCA ---
    pca_result = pd.DataFrame(
        dimreduce.pca_eval(x, y, 'experiment/wine/pca', 5, 'Rating')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/pca', pca_result.copy(), 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : PCA - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'PCA', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/pca', 5, pca_result)
    score = adjusted_mutual_info_score(em.predict(pca_result), y)
    print('Wine : PCA - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'PCA', 'Exp. Maximization', 5, score])

    # --- ICA ---
    ica_result = pd.DataFrame(
        dimreduce.ica_eval(x, y, 'experiment/wine/ica', 11, 'Rating')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/ica', ica_result.copy(), 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : ICA - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'ICA', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/ica', 5, ica_result)
    score = adjusted_mutual_info_score(em.predict(ica_result), y)
    print('Wine : ICA - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'ICA', 'Exp. Maximization', 5, score])

    # --- Randomized projection ---
    rp_result = pd.DataFrame(
        dimreduce.rp_eval(x, y, 'experiment/wine/rp', 11, 'Rating')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/rp', rp_result.copy(), 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : RP - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'RP', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/rp', 5, rp_result)
    score = adjusted_mutual_info_score(em.predict(rp_result), y)
    print('Wine : RP - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'RP', 'Exp. Maximization', 5, score])

    # --- Select-K best ---
    sk_result = pd.DataFrame(dimreduce.sk_eval(x, y, 8)[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/sk', sk_result.copy(), 4)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : SK - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'Select-K', 'k-Means', 4, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/sk', 5, sk_result)
    score = adjusted_mutual_info_score(em.predict(sk_result), y)
    print('Wine : SK - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['Wine Quality', 'Select-K', 'Exp. Maximization', 5, score])

    # ===================== Credit-card default =====================
    data_file = "data/default_of_credit_card_clients.csv"
    x, y = extractData(data_file)
    x = pd.DataFrame(preprocessing.scale(x), columns=x.columns)

    # --- PCA ---
    pca_result = pd.DataFrame(
        dimreduce.pca_eval(x, y, 'experiment/default/pca', 10, 'Defaulted')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/pca', pca_result, 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : PCA - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'PCA', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/default/pca', 5, pca_result)
    score = adjusted_mutual_info_score(em.predict(pca_result), y)
    print('Default : PCA - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'PCA', 'Exp. Maximization', 5, score])

    # --- ICA ---
    ica_result = pd.DataFrame(
        dimreduce.ica_eval(x, y, 'experiment/default/ica', 6, 'Defaulted')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/ica', ica_result, 4)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : ICA - Kmeans : Score = {}'.format(score))
    # BUG FIX: this row previously recorded 6 clusters, but the k-Means model
    # above was fit with 4 clusters — record the actual cluster count.
    exp_results = append_results(
        exp_results, ['CC Default', 'ICA', 'k-Means', 4, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/default/ica', 4, ica_result)
    score = adjusted_mutual_info_score(em.predict(ica_result), y)
    print('Default : ICA - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'ICA', 'Exp. Maximization', 4, score])

    # --- Randomized projection ---
    rp_result = pd.DataFrame(
        dimreduce.rp_eval(x, y, 'experiment/default/rp', 24, 'Defaulted')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/rp', rp_result, 10)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : RP - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'RP', 'k-Means', 10, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/default/rp', 10, rp_result)
    score = adjusted_mutual_info_score(em.predict(rp_result), y)
    print('Default : RP - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'RP', 'Exp. Maximization', 10, score])

    # --- Select-K best ---
    sk_result = pd.DataFrame(dimreduce.sk_eval(x, y, 6)[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/sk', sk_result, 4)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : SK - Kmeans : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'Select-K', 'k-Means', 4, score])
    # BUG FIX: the EM model was previously fit and scored on rp_result (a
    # copy-paste from the RP section), so the reported "Select-K" EM score was
    # actually the RP score. Use sk_result throughout.
    em = cluster.emFitBestModel(1, 2, 'experiment/default/sk', 4, sk_result)
    score = adjusted_mutual_info_score(em.predict(sk_result), y)
    print('Default : SK - EM : Score = {}'.format(score))
    exp_results = append_results(
        exp_results, ['CC Default', 'Select-K', 'Exp. Maximization', 4, score])

    print(exp_results)
    print(exp_results.to_latex())

    # ===================== Neural-network experiments =====================
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3)
    accuracy_raw = nn_classifier(train_x, train_y, test_x, test_y)

    # NN on each reduced feature set: fit the reducer on the train split only,
    # then project the test split with the fitted transformer.
    pca_result, pca = dimreduce.pca_eval(train_x, train_y, 'nn', 10, 'Defaulted')
    test_data = pca.transform(test_x)
    accuracy_pca = nn_classifier(pca_result, train_y, test_data, test_y)

    ica_result, ica = dimreduce.ica_eval(train_x, train_y, 'nn', 6, 'Defaulted')
    test_data = ica.transform(test_x)
    accuracy_ica = nn_classifier(ica_result, train_y, test_data, test_y)

    rp_result, rp = dimreduce.rp_eval(train_x, train_y, 'nn', 24, 'Defaulted')
    test_data = rp.transform(test_x)
    accuracy_rp = nn_classifier(rp_result, train_y, test_data, test_y)

    sk_result, sk = dimreduce.sk_eval(train_x, train_y, 4)
    test_data = sk.transform(test_x)
    accuracy_sk = nn_classifier(sk_result, train_y, test_data, test_y)

    plot_results("Dimension Reduction Algo. Accuracy", 'dim_red_accuracy',
                 ['Original', 'PCA', 'ICA', 'RP', 'SK'],
                 [accuracy_raw, accuracy_pca, accuracy_ica,
                  accuracy_rp, accuracy_sk],
                 'Dim. Red. Algorithms', 'Accuracy')

    # NN on k-Means cluster-distance features.
    km = cluster.kmeansFitBestModel(1, 2, 'nn', train_x, 10)
    km_data = km.fit_transform(train_x)
    test_data = km.transform(test_x)
    accuracy_km = nn_classifier(km_data, train_y, test_data, test_y)

    # NN on features augmented with one-hot-encoded EM cluster labels.
    em = cluster.emFitBestModel(1, 2, 'nn', 15, train_x)
    em_train_labels = em.predict(train_x)
    em_train_ohc = one_hot_encode(em_train_labels, 15)
    em_train = np.concatenate((train_x, em_train_ohc), 1)
    # one hot encode cluster labels to val set
    em_test_labels = em.predict(test_x)
    em_test_ohc = one_hot_encode(em_test_labels, 15)
    em_test = np.concatenate((test_x, em_test_ohc), 1)
    # scale data
    scaler = preprocessing.StandardScaler().fit(em_train)
    em_data = scaler.transform(em_train)
    test_data = scaler.transform(em_test)
    accuracy_em = nn_classifier(em_data, train_y, test_data, test_y)

    plot_results("Clustering Accuracy", 'cluster_accuracy',
                 ['Original', 'k-Means', 'Ex. Maximization'],
                 [accuracy_raw, accuracy_km, accuracy_em],
                 'Clustering Algorithms', 'Accuracy')