import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.decomposition import FastICA

df = pd.read_csv("letter-recognition.csv", header=None)
# !!! type is DataFrame, not ndarray !!!
df = np.array(df)  # type conversion needed to use NumPy slicing
# Optionally keep only the letters A-O:
# dfs = df[(df[:, 0] == 'A') | (df[:, 0] == 'B') | (df[:, 0] == 'C') | (df[:, 0] == 'D') |
#          (df[:, 0] == 'E') | (df[:, 0] == 'F') | (df[:, 0] == 'G') | (df[:, 0] == 'H') |
#          (df[:, 0] == 'I') | (df[:, 0] == 'J') | (df[:, 0] == 'K') | (df[:, 0] == 'L') |
#          (df[:, 0] == 'M') | (df[:, 0] == 'N') | (df[:, 0] == 'O')]
data = df[:, 1:].astype(float)  # mixed array comes back as object dtype; cast features before scaling
data = scale(data)
labels = df[:, 0]
n_digits = len(np.unique(labels))
print(data.std())

ica = FastICA(n_components=16, max_iter=200).fit_transform(data)
print(ica.shape)
print(ica.std())

# #############################################################################
# Visualize the results on PCA-reduced data
# reduced_data = PCA(n_components=2).fit_transform(data)
# kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
# kmeans.fit(reduced_data)
#
# # Step size of the mesh. Decrease to increase the quality of the VQ.
# h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].
#
# # Plot the decision boundary. For that, we will assign a color to each
# x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
# y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
# xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
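# A minimal sketch completing the commented-out visualization above, following
# the standard scikit-learn "k-means on PCA-reduced data" recipe: label every
# point of the mesh with its nearest centroid and draw the resulting regions.
# The variable names match the commented code; the imshow/scatter calls below
# are an assumption based on that recipe, not the original script.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10).fit(reduced_data)

h = .02  # mesh step size; decrease to increase the quality of the VQ
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Assign every mesh point to its nearest centroid and draw the cluster regions
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            marker='x', s=169, linewidths=3, color='w', zorder=10)
plt.title("K-means clustering on PCA-reduced letter-recognition data")
plt.xticks(())
plt.yticks(())
plt.show()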
    tmp = tmp.kurt(axis=0)         # per-component kurtosis of the ICA projection
    kurt.append(tmp.abs().mean())  # track mean |kurtosis| for this dimension

# Plot ICA kurtosis vs. number of components
plt.figure()
plt.title("ICA Kurtosis: " + Dataset)
plt.xlabel("Independent Components")
plt.ylabel("Avg Kurtosis Across IC")
plt.plot(dims, kurt, 'b-')
plt.grid(False)
plt.show()

###############################################################################
# ICA Dimensionality Reduction
X1 = ICA(n_components=5, random_state=5).fit_transform(X1)
X1 /= X1.std(axis=0)  # rescale the independent components to unit variance

###############################################################################
# K-means sweep over even cluster counts from 2 to 48
Kclusters = range(2, 50, 2)
km_sil_scores = []
km_homo_scores = []
km_inertia_scores = []
km_fitness_times = []
for k in Kclusters:
    t1 = time.time()
    # n_jobs was deprecated and later removed from KMeans (scikit-learn 1.0)
    km = KMeans(n_clusters=k, n_init=10, random_state=100).fit(X1)
    t2 = time.time()
    km_fitness_times.append(t2 - t1)
    km_sil_scores.append(silhouette_score(X1, km.labels_))
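# A minimal sketch of how the sweep is typically visualized, using the scores
# actually collected above (silhouette and fit time); the plot styling is an
# assumption, not the original script. The two remaining lists would be filled
# inside the loop with km_homo_scores.append(homogeneity_score(Y1, km.labels_))
# (from sklearn.metrics, using the true labels Y1) and
# km_inertia_scores.append(km.inertia_).
plt.figure()
plt.title("K-means Silhouette: " + Dataset)
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.plot(Kclusters, km_sil_scores, 'b-')
plt.show()

plt.figure()
plt.title("K-means Fit Time: " + Dataset)
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Fit Time (s)")
plt.plot(Kclusters, km_fitness_times, 'r-')
plt.show()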
# https://scikit-learn.org/stable/datasets/index.html
import pandas as pd
from sklearn.datasets import load_digits, load_wine
from sklearn.decomposition import PCA, FastICA as ICA
# Assumed aliases, inferred from usage below: RCA as a Gaussian random
# projection and RFC as a random forest classifier.
from sklearn.random_projection import GaussianRandomProjection as RCA
from sklearn.ensemble import RandomForestClassifier as RFC

data_digits = load_digits()
# X1, Y1 = pd.DataFrame(data_digits["data"]), pd.Series(data_digits["target"])
# Dataset = "digits"

data_wine = load_wine()
X1 = pd.DataFrame(data_wine["data"], columns=data_wine.feature_names)
Y1 = pd.Series(data_wine["target"])
Dataset = "wine"

Xraw = X1
Xpca = PCA(n_components=5, random_state=5).fit_transform(X1)
Xica = ICA(n_components=5, random_state=5).fit_transform(X1)
Xica /= Xica.std(axis=0)  # rescale the independent components to unit variance
Xrca = RCA(n_components=5, random_state=5).fit_transform(X1)

# Run RFC feature selection: keep the features covering the first 35% of
# cumulative importance
# rfc = RFC(n_estimators=500, min_samples_leaf=round(len(X1)*.01), random_state=5, n_jobs=-1)
# imp = rfc.fit(X1, Y1).feature_importances_
# imp = pd.DataFrame(imp, columns=['Feature Importance'], index=X1.columns)  # index by name so X1[top_cols] works
# imp.sort_values(by=['Feature Importance'], inplace=True, ascending=False)
# imp['Cum Sum'] = imp['Feature Importance'].cumsum()
# imp = imp[imp['Cum Sum'] <= 0.35]
# top_cols = imp.index.tolist()
# Xrfc = X1[top_cols]


def MLP_classifier(X, Y, datasource):
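    # A hypothetical sketch of a body consistent with this function's name and
    # signature, not the original code: score an MLP on the given feature
    # space and report it under the datasource tag, e.g.
    #     from sklearn.neural_network import MLPClassifier
    #     from sklearn.model_selection import cross_val_score
    #     clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=5)
    #     print(datasource, cross_val_score(clf, X, Y, cv=5).mean())
    # It would then presumably be called once per projection built above:
    #     for name, Xp in [("raw", Xraw), ("pca", Xpca), ("ica", Xica), ("rca", Xrca)]:
    #         MLP_classifier(Xp, Y1, Dataset + "-" + name)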