# Start timer: record the current wall-clock time (seconds since the epoch).
# NOTE(review): presumably compared against a later time.time() call to
# report the script's elapsed run time -- that code is not in this section.
start_time = time.time()

# Load the data
from income_data import X, y, X_train,  X_test, y_train, y_test

# Scale the data.
# The clustering below is unsupervised and runs on the complete data set, so
# the scaler is both fitted on and applied to all of X (no train/test split
# is used in this section).
scaler = StandardScaler().fit(X)
# Standardize X a single time; both legacy names refer to this one result.
X_train_std = scaler.transform(X)
# Despite its name this is NOT a held-out split: it is the same standardized
# full data set as X_train_std, kept only so the name stays defined.
X_test_std = X_train_std
X_toCluster = X_train_std
y_inputs = y

# Reduce dimensionality: project the standardized data down to 22 components
# with the configured projection algorithm.
projection = ProjectionAlgorithm(n_components=22)
X_toCluster = projection.fit_transform(X_toCluster)

######
# Run EM clustering (Gaussian mixture) with 2 clusters
######
cluster = GaussianMixture(n_components=2, random_state=0)
cluster.fit(X_toCluster)
cluster_labels = cluster.predict(X_toCluster)

# Dot product of every projected sample with each fitted component mean.
X_transformed = np.dot(X_toCluster, cluster.means_.T)

# Print diagnostics: one "heading, value" line pair per item, in fixed order.
for _heading, _payload in (
    ('X_toCluster.shape \n', X_toCluster.shape),
    ('X_transformed.shape \n', X_transformed.shape),
    ('Labels \n', cluster_labels),
    ('Weights \n', cluster.weights_),
):
    print(_heading, _payload)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from plot_learning_curve import drawLearningCurve

# Scale the data.
# Fit the scaler on the training split only, so the mean and variance used
# for standardization are not influenced by the held-out test samples; the
# same fitted statistics are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
X_toTransform = X_train_std

# Define the classifier.
# Candidate output dimensionalities for the projection step of the pipeline.
N_FEATURES_OPTIONS = [2, 3]

# Linear-kernel support vector classifier.
# NOTE(review): gamma is documented as the coefficient for the 'rbf', 'poly'
# and 'sigmoid' kernels, so it presumably has no effect with
# kernel='linear' -- confirm before tuning it.
svm = SVC(kernel='linear', C=10, gamma=0.1, random_state=1)

# Two-stage pipeline: dimensionality reduction followed by classification.
pipe = Pipeline(steps=[
    ('reduce_dim', ProjectionAlgorithm()),
    ('classify', svm),
])

# Grid-search the number of projected components with 3-fold cross-validation.
parameters = {'reduce_dim__n_components': N_FEATURES_OPTIONS}
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=3)

# Run the classifier: fit the grid search on the standardized training split.
clf.fit(X_train_std, y_train)

# Identify training and test accuracy.
# The test split is predicted once; the result is reused under both names
# (y_pred and y_pred_test) instead of running the same prediction twice.
y_pred = clf.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
y_pred_train = clf.predict(X_train_std)
y_pred_test = y_pred
train_accuracy = accuracy_score(y_train, y_pred_train)