# Start timer
start_time = time.time()

# Load the data
from income_data import X, y, X_train, X_test, y_train, y_test

# Scale the data. The scaler is fitted on, and applied to, the full data set X:
# the clustering below runs on every sample and y_inputs is the full label
# vector y.
scaler = StandardScaler()
scaler.fit(X)
X_train_std = scaler.transform(X)
# NOTE(review): this is also derived from X (not X_test), so it equals
# X_train_std; it is not used in the visible part of the script -- confirm
# whether later code relies on it.
X_test_std = scaler.transform(X)

X_toCluster = X_train_std
y_inputs = y

# Reduce Dimensionality (PCA)
projection = ProjectionAlgorithm(n_components=34)
X_toCluster = projection.fit_transform(X_toCluster)

######
# Run k-means clustering with 1:n clusters determine scores for each
######
scores = []
silhouette_avg = []
BIC = []  # not filled in the visible part of the script
maxClusters = 100
minClusters = 1

# i runs over minClusters..maxClusters-1 and each model is built with
# n_clusters = i + 1, so the sweep actually fits 2..maxClusters clusters.
for i in range(minClusters, maxClusters):
    kmeans = KMeans(n_clusters=i + 1, random_state=0)
    cluster_labels = kmeans.fit_predict(X_toCluster)
    scores.append(kmeans.score(X_toCluster))
    # Score the labels in the same (scaled + projected) space they were
    # fitted in. Using the raw, unscaled X here would measure distances in a
    # different space than the one k-means optimised.
    silhouette_avg.append(silhouette_score(X_toCluster, cluster_labels))
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from plot_learning_curve import drawLearningCurve

# Scale the data. The scaler is fitted on the training split only, so that
# the mean/variance of the held-out test samples cannot leak into the
# features used for model selection and evaluation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

X_toTransform = X_train_std

# Define the classifier
# NOTE(review): per the scikit-learn SVC docs, gamma applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so it presumably has no effect with
# kernel='linear' -- confirm before tuning it.
svm = SVC(random_state=1, kernel='linear', gamma=0.1, C=10)

# Projection followed by the SVM; the grid search selects how many projected
# components to keep.
pipe = Pipeline([('reduce_dim', ProjectionAlgorithm()), ('classify', svm)])

N_FEATURES_OPTIONS = range(2, 46)
parameters = {
    'reduce_dim__n_components': N_FEATURES_OPTIONS,
}
clf = GridSearchCV(pipe, cv=3, param_grid=parameters)

# Run the classifier
clf.fit(X_train_std, y_train)

# Identify training and test accuracy
# Predict on the test split once and reuse the result under both names.
y_pred_test = clf.predict(X_test_std)
y_pred = y_pred_test
print('Misclassified samples: %d' % (y_test != y_pred).sum())

y_pred_train = clf.predict(X_train_std)
train_accuracy = accuracy_score(y_train, y_pred_train)