import os
import sys

import numpy as np
import matplotlib
# The backend must be selected before pyplot is imported so that figures
# can be created on a headless server
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

# Internal
sys.path.append(os.path.realpath("%s/.." % os.path.dirname(__file__)))
from util import data_accessor_util


def print_pca_variance():
    # Read the data from the SQL table using the get_all_data_sets method
    X_train, y_train, train_le, X_test, y_test, test_le = data_accessor_util.get_all_data_sets()
    classes = list(test_le.classes_)

    # Convert data to numpy
    (X_train, y_train, X_test, y_test) = data_accessor_util.convert_data_sets_to_numpy(
        X_train, y_train, X_test, y_test)
    n_features = X_test.shape[1]

    # This block only needs to run once to create the plots
    pca = PCA(n_components=n_features)
    pca.fit(X_train)
    var = pca.explained_variance_ratio_
    cumulative_var = np.cumsum(var)

    fig = plt.figure(1)
    plt.plot(var)
    plt.title('individual scree plot')
    plt.xlabel('principal components')
    plt.ylabel('proportion of variance explained')
    plt.savefig("proportion_variance.jpg")

    fig2 = plt.figure(2)
    plt.plot(cumulative_var)
    plt.title('cumulative scree plot')
    plt.xlabel('principal components')
    plt.ylabel('cumulative proportion of variance explained')
    plt.savefig("proportion_Variance_com.jpg")
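# The component-count cutoff used in read_data_perform_pca below can be read
# straight off the cumulative curve computed above. A small illustrative
# helper (the name components_for_variance is hypothetical, not part of the
# original file):
def components_for_variance(cumulative_var, threshold):
    # np.argmax returns the first index where the condition is True;
    # add 1 because component counts are 1-based while indices are 0-based
    return int(np.argmax(cumulative_var > threshold) + 1)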
def read_data_perform_pca(var_percentage=0.95):
    # Read the data from the SQL table using the get_all_data_sets method
    X_train, y_train, train_le, X_test, y_test, test_le = data_accessor_util.get_all_data_sets()
    classes = list(test_le.classes_)

    # Convert data to numpy
    (X_train, y_train, X_test, y_test) = data_accessor_util.convert_data_sets_to_numpy(
        X_train, y_train, X_test, y_test)
    n_features = X_test.shape[1]

    # Fit a full PCA once to obtain the cumulative explained-variance curve
    pca = PCA(n_components=n_features)
    pca.fit(X_train)
    var = pca.explained_variance_ratio_
    cumulative_var = np.cumsum(var)

    # argmax returns the first index where the cumulative variance exceeds
    # the threshold; add 1 because indices start at 0
    percentage_retained = var_percentage
    N_reduced = np.argmax(cumulative_var > percentage_retained) + 1

    # Refit with the reduced number of components and project both sets
    pca = PCA(n_components=N_reduced)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    return X_train, y_train, train_le, X_test, y_test, test_le
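# Illustrative usage sketch, not part of the original module: the __main__
# guard and the 0.9 threshold are example choices.
if __name__ == '__main__':
    # Write the two scree plots to disk, then reduce the feature space so
    # that 90% of the variance is retained
    print_pca_variance()
    X_train, y_train, train_le, X_test, y_test, test_le = read_data_perform_pca(var_percentage=0.9)
    print("Reduced feature dimension: %d" % X_train.shape[1])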
import os
import sys

import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

# Internal
sys.path.append(os.path.realpath("%s/.." % os.path.dirname(__file__)))
from util import data_accessor_util

#-------------------------
# Globals
#-------------------------

# Get data
(train_X, train_Y, train_le, test_X, test_Y, test_le) = data_accessor_util.get_all_data_sets()
classes = list(test_le.classes_)
print(test_le.inverse_transform([0, 1, 2, 3, 4, 5]))

# Convert to numpy
(train_X, train_Y, test_X, test_Y) = data_accessor_util.convert_data_sets_to_numpy(
    train_X, train_Y, test_X, test_Y)

# Grid of regularization strengths to search over
Cs = {"C": np.arange(10**-5, 10**-1, 0.005)}

# PCA
print("PCA")
pca = PCA(n_components=train_X.shape[1])
train_X = pca.fit_transform(train_X)
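# Where the script plausibly goes from here (an illustrative sketch, not the
# original continuation): Cs is shaped as a GridSearchCV parameter grid, so
# the natural next step is a cross-validated search over C. LinearSVC is an
# assumption; any estimator exposing a C parameter fits the same grid.
from sklearn.svm import LinearSVC

test_X = pca.transform(test_X)                # project test data with the fitted PCA
search = GridSearchCV(LinearSVC(), Cs, cv=5)  # 5-fold CV over the C grid
search.fit(train_X, train_Y)
print("Best C: %s" % search.best_params_)
print(classification_report(test_Y, search.predict(test_X), target_names=classes))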
import os
import sys

from matplotlib.colors import ListedColormap

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, log_loss

# Internal
sys.path.append(os.path.realpath("%s/.." % os.path.dirname(__file__)))
from util import data_accessor_util

DEBUG = 1

# Read the data from the SQL table using the get_all_data_sets method
X_train, y_train, train_le, X_test, y_test, test_le = data_accessor_util.get_all_data_sets()
# X = df.drop(['genre'], axis=1)
# y = df['genre']
# le = preprocessing.LabelEncoder()
# y = le.fit_transform(y)

# Drop the song ID column; it carries no predictive information
X_train = X_train.drop('songID', axis=1)
X_test = X_test.drop('songID', axis=1)

h = .02  # step size in the mesh

# List of method names; they correspond one-to-one with the classifiers list
methods = ["Logistic Regression", "Decision Tree", "Random Forest",
           "Linear SVM", "RBF SVM", "Neural Net, MLP", "Gaussian Process",
           "Gaussian Naive Bayes", "QDA", "AdaBoost"]
# TODO: add these two methods as well
# XGBoost
# Gradient boosting
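# The classifiers list that methods refers to is not shown in this excerpt;
# a sketch of what it plausibly contains, in the same order as methods. The
# hyperparameters are placeholders, not project settings, and MLPClassifier
# is imported here only because "Neural Net, MLP" appears in the list.
from sklearn.neural_network import MLPClassifier

classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(kernel='linear', C=1.0),
    SVC(kernel='rbf', gamma='auto'),
    MLPClassifier(),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
]

# Typical evaluation loop over the paired lists
for name, clf in zip(methods, classifiers):
    clf.fit(X_train, y_train)
    print("%s: accuracy = %.3f" % (name, accuracy_score(y_test, clf.predict(X_test))))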