def _plot_decision_regions(classifier, X_set, y_set, title):
    """Draw the classifier's decision regions and overlay one data split.

    Args:
        classifier: fitted estimator whose ``predict`` accepts two-column input.
        X_set: 2-column array of projected samples (LD1, LD2).
        y_set: class label for every row of ``X_set``.
        title: figure title.
    """
    palette = ListedColormap(('red', 'green', 'blue'))
    # Dense grid covering the data range padded by 1 on each side.
    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - 1,
                  stop=X_set[:, 0].max() + 1,
                  step=0.01),
        np.arange(start=X_set[:, 1].min() - 1,
                  stop=X_set[:, 1].max() + 1,
                  step=0.01))
    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    plt.contourf(X1,
                 X2,
                 classifier.predict(grid_points).reshape(X1.shape),
                 alpha=0.75,
                 cmap=palette)
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        # ``color=`` rather than ``c=``: a single RGBA tuple given as ``c`` is
        # indistinguishable from an array of values to be colormapped.
        plt.scatter(X_set[y_set == j, 0],
                    X_set[y_set == j, 1],
                    color=palette(i),
                    label=j)
    plt.title(title)
    plt.xlabel('LD1')
    plt.ylabel('LD2')
    plt.legend()
    plt.show()


def main():
    """Run LDA followed by logistic regression on 'Wine.csv' and plot the results.

    Reads 'Wine.csv' (columns 0-12 as features, column 13 as target), splits it,
    standardises the features, projects them onto two linear discriminants,
    fits a logistic regression and shows the decision regions for the training
    and the test split.
    """
    # Importing the dataset
    dataset = pd.read_csv('Wine.csv')
    X = dataset.iloc[:, 0:13].values
    y = dataset.iloc[:, 13].values

    # Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # Feature Scaling (scaler is fitted on the training split only)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Applying LDA
    lda = LDA(n_components=2)
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)

    # Fitting Logistic Regression to the Training set
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix (kept as in the original; not used further here)
    cm = confusion_matrix(y_test, y_pred)

    # Visualising the Training set and the Test set results
    _plot_decision_regions(classifier, X_train, y_train,
                           'Logistic Regression (Training set)')
    _plot_decision_regions(classifier, X_test, y_test,
                           'Logistic Regression (Test set)')
ind[0::2] = np.arange(c - 1, c // 2 - 1, -1) ind[1::2] = np.arange(0, c // 2) W = W[:, ind] csp_filters.append(W.T[:ncomp]) XT_CSP, XV_CSP = [], [] for i in range(nbands): YT = np.asarray([np.dot(csp_filters[i], ep) for ep in XT[i]]) XT_CSP.append(np.log(np.mean(YT**2, axis=2))) # Feature extraction # XT_CSP.append( np.log( np.var( YT, axis=2 ) ) ) #%% LDA SCORE_T = np.zeros((len(ZT), nbands)) lda_list = [] for i in range(nbands): lda = LDA() lda.fit(XT_CSP[i], tT) SCORE_T[:, i] = np.ravel( lda.transform(XT_CSP[i]) ) # classificações de cada época nas N sub bandas - auto validação lda_list.append(lda) #%% Bayesian Meta-Classifier SCORE_T0 = SCORE_T[tT == class_ids[0], :] SCORE_T1 = SCORE_T[tT == class_ids[1], :] p0 = norm(np.mean(SCORE_T0, axis=0), np.std(SCORE_T0, axis=0)) p1 = norm(np.mean(SCORE_T1, axis=0), np.std(SCORE_T1, axis=0)) META_SCORE_T = np.log(p0.pdf(SCORE_T) / p1.pdf(SCORE_T)) #%% Final classification clf_final = SVC(kernel='linear', C=10**(-4), probability=True)
def fit(self, X, y):
    """Fit an LDA estimator on ``(X, y)`` and store it on ``self.fit_``.

    The estimator is created with ``store_covariance=self.cov``.

    Returns:
        ``self``, so that calls can be chained.
    """
    self.fit_ = LDA(store_covariance=self.cov).fit(X, y)
    return self
def train(workDir, classifier="LinearSvm", ldaDim=-1):
    """Train a classifier on stored embeddings and pickle it into ``workDir``.

    Reads ``<workDir>/labels.csv`` and ``<workDir>/reps.csv``, encodes the
    labels with a ``LabelEncoder``, fits the selected classifier and writes
    the tuple ``(label_encoder, classifier)`` to ``<workDir>/classifier.pkl``.

    Args:
        workDir: directory holding ``labels.csv`` / ``reps.csv``; also the
            output directory for ``classifier.pkl``.
        classifier: one of 'LinearSvm', 'GridSearchSvm', 'GMM', 'RadialSvm',
            'DecisionTree', 'GaussianNB' or 'DBN'.
        ldaDim: when greater than 0, an LDA step with that many components is
            placed in front of the classifier.

    Raises:
        ValueError: if ``classifier`` is not one of the supported names.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(workDir)
    # ``.values`` instead of ``DataFrame.as_matrix()``, which newer pandas
    # no longer provides.
    labels = pd.read_csv(fname, header=None).values[:, 1]
    labels = list(
        map(itemgetter(1), map(os.path.split, map(os.path.dirname,
                                                  labels))))  # Get the directory.
    fname = "{}/reps.csv".format(workDir)
    embeddings = pd.read_csv(fname, header=None).values
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))
    print(type(embeddings[0]))

    if classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif classifier == 'GridSearchSvm':
        print(
            " Warning: In our experiences, using a grid search over SVM"
            " hyper-parameters only gives marginally better performance than"
            " a linear SVM with C=1 and is not worth the extra computations"
            " of performing a grid search. ")
        param_grid = [{
            'C': [1, 10, 100, 1000],
            'kernel': ['linear']
        }, {
            'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        }]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif classifier == 'GaussianNB':
        clf = GaussianNB()
    # ref: https://jessesw.com/Deep-Learning/
    elif classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            # i/p nodes, hidden nodes, o/p nodes. The output layer is sized by
            # the number of classes; the previous ``labelsNum[-1] + 1`` was
            # only right when the last sample carried the highest label.
            [embeddings.shape[1], 500, nClasses],
            learn_rates=0.3,
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer
            learn_rate_decays=0.9,
            # a factor the initial learning rate will be multiplied by
            # after each iteration of the training
            epochs=300,  # no of iterations
            # dropouts = 0.25, # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on ``clf``.
        raise ValueError("Unknown classifier: {!r}".format(classifier))

    if ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'wb') as f:
        pickle.dump((le, clf), f)
# Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # Applying LDA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda = LDA(n_components=2) X_train = lda.fit_transform(X_train, y_train) X_test = lda.transform(X_test) # Fitting Logistic Regression to the Training set from sklearn.linear_model import LogisticRegression classifier = LogisticRegression(random_state=0) classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred)
# train_indexes = flatten_repetitions(cross_validation_indexes[0]) test_indexes = flatten_repetitions(cross_validation_indexes[1]) # train_data_all_subject.append(np.asarray(all_data_per_char_as_matrix[train_indexes]).astype(np.float32)) test_data_all_subject.append( np.asarray( all_data_per_char_as_matrix[test_indexes]).astype( np.float32)) # train_tags_all_subject.append(target_per_char_as_matrix[train_indexes]) test_tags_all_subject.append( target_per_char_as_matrix[test_indexes]) break model = LDA() from scipy import stats # train_data = stats.zscore(np.vstack(train_data_all_subject),axis=1) # train_tags = np.vstack(train_tags_all_subject).flatten() test_data = stats.zscore(np.vstack(test_data_all_subject), axis=1) test_tags = np.vstack(test_tags_all_subject).flatten() import pandas as pd # final_train_matrix_with_tagging = np.hstack([train_data.reshape(train_data.shape[0] * train_data.shape[1], -1).astype(np.float32),train_tags.reshape(-1,1)]) final_test_matrix_with_tagging = np.hstack([ test_data.reshape(test_data.shape[0] * test_data.shape[1], -1).astype(np.float32), test_tags.reshape(-1, 1) ])
# Split the data into a training and a test set (test_size=0.2, random_state=0).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardise the features; the scaler is fitted on the training split only.
from sklearn.preprocessing import StandardScaler
scx = StandardScaler()
X_train = scx.fit_transform(X_train)
X_test = scx.transform(X_test)

# Project with LDA. n_components=None leaves the number of components to the
# library default; the per-component explained-variance ratio is kept for
# inspection.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=None)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
explained_variance = lda.explained_variance_ratio_

# Fit logistic regression on the projected training data and predict the test split.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted test labels.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
#use prior 2 days of return as predictor values, #with direction as response X = snpret[['Lag1', 'Lag2']] y = snpret['Direction'] #test data is split into 2 parts: before & after 2005,1,1 start_test = datetime.datetime(2005, 1, 1) #create training & data set X_train = X[X.index < start_test] X_test = X[X.index >= start_test] y_train = y[y.index < start_test] y_test = y[y.index >= start_test] #create parametrised models models = [('LR', LogisticRegression()), ('LDA', LDA()), ('QDA', QDA()), ('LSVC', LinearSVC()), ('RSVM', SVC(C=1000000.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)),
plt.figure() plot_points(x) plt.axis('square') plt.tight_layout() save_fig('gda_2d_data.pdf') plt.show() plt.figure() plot_points(x) plot_contours(xx, yy, x_range, y_range, u, sigma) plt.axis('square') plt.tight_layout() save_fig('gda_2d_contours.pdf') plt.show() for k, clf in enumerate((LDA(), QDA())): clf.fit(X, Y) z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) z = z.reshape(ngrid, ngrid) z_p = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()]) yhat = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Yhat = make_one_hot(yhat) plt.figure() #plot_dboundaries(xx, yy, z, z_p) plot_dboundaries(xx, yy, z, Yhat) plot_points(x) plot_contours(xx, yy, x_range, y_range, u, sigma) plt.title(model_names[k]) plt.axis('square') plt.tight_layout()
from collections import OrderedDict
from moabb.datasets.bnci import BNCI2014001
from moabb.datasets.alex_mi import AlexMI
from moabb.datasets.physionet_mi import PhysionetMI

# Datasets handed to the evaluation context below.
datasets = [
    AlexMI(with_rest=True),
    BNCI2014001(),
    PhysionetMI(with_rest=True, feets=False)
]

# Pipelines keyed by name. The same names are used below as CSV file names
# and ('CSP+LDA', 'TS') as plot columns.
pipelines = OrderedDict()
pipelines['MDM'] = make_pipeline(Covariances('oas'), MDM())
pipelines['TS'] = make_pipeline(Covariances('oas'), TSclassifier())
pipelines['CSP+LDA'] = make_pipeline(Covariances('oas'), CSP(8), LDA())

context = MotorImageryMultiClasses(datasets=datasets, pipelines=pipelines)
results = context.evaluate(verbose=True)

# Write one CSV per entry of `results`.
# NOTE(review): the relative output directory must already exist — confirm.
for p in results.keys():
    results[p].to_csv('../../results/MotorImagery/MultiClass/%s.csv' % p)

# Concatenate the per-entry frames and print the mean per 'Pipeline' group.
results = pd.concat(results.values())
print(results.groupby('Pipeline').mean())

# One column per pipeline, then scatter 'CSP+LDA' against 'TS' without a
# regression line.
res = results.pivot(values='Score', columns='Pipeline')
sns.lmplot(data=res, x='CSP+LDA', y='TS', fit_reg=False)
plt.xlim(0.25, 1)
plt.ylim(0.25, 1)
py.xlabel("False Positive Rate") py.ylabel("True Positive Rate") py.legend() py.show() confusion_matrix = ConfusionMatrix(testinglabel, predicted_test) sns.heatmap(confusion_matrix, annot=True) Accuracy = accuracy(testinglabel, predicted_test) print(Accuracy) Kfold(train_fold, newlabel, reduced_test, Test['label']) # In[160]: from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda = LDA(n_components=2) inbuilt_lda = lda.fit_transform(originaldata, Data['label']) # In[161]: inbuilt_lda = pd.DataFrame(inbuilt_lda) inbuilt_lda = pd.concat([inbuilt_lda, Data['label']], axis=1) # In[163]: from numpy.linalg import inv def LDA(lda_data, k): data = lda_data.drop(['label'], axis=1) LDA = pd.DataFrame()
plot_svc_decision_function(model, label = 'SVM', ax = ax) w_svm = np.hstack((model.coef_[0],model.intercept_)) plot_data.plot_lines(X,y, w_svm, ax, label = 'SVM', linestyle = 'dashed') # Applying LinearSVC from sklearn.svm import LinearSVC linearSVC = LinearSVC(C = 1e6) linearSVC.fit(X,y) linearSVC.coef_ linearSVC.intercept_ w_l_SVC = np.hstack((linearSVC.coef_[0],linearSVC.intercept_)) plot_data.plot_lines( X, y, w_l_SVC, ax, color = 'cyan', label = 'Lin SVC', linestyle = 'dashed') # Applying LDA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA for solver, color, linestyle in zip(['svd', 'eigen', 'lsqr'], ['red', 'green', 'blue'], ['dotted', 'dotted', 'dotted']): if (solver == 'svd'): lda = LDA(solver = solver, store_covariance=True) else: lda = LDA(solver = solver) lda.fit(X, y) lda.coef_ lda.intercept_ w_lda = np.hstack((lda.coef_[0],lda.intercept_)) plot_data.plot_lines( X, y, w_lda, ax, color = color, label = 'LDA_' + solver, linestyle = linestyle) plot_lda_decision_function(lda, label = 'lda') lda.predict_proba(X)
data[X] = data[X].apply(lambda x: x + np.random.rand()) data[X] = data[X].apply(lambda x: x + 1) data[X], _ = boxcox(data[X]) for i in skewed: normalizing(i) fe = False if fe: # Feature engineering print("Feature engineering...") y = train.iloc[:, -1] Severity = ['mvar3', 'mvar4', 'mvar5'] lda_Severity = LDA(n_components=5) lda_Severity = lda_Severity.fit(train[Severity], y) data['Severity'] = lda_Severity.transform(data[Severity]) No_of_active = ['mvar16', 'mvar17', 'mvar19', 'mvar20', 'mvar18'] lda_No_of_active = LDA(n_components=5) lda_No_of_active = lda_No_of_active.fit(train[No_of_active], y) data['No_of_active'] = lda_No_of_active.transform(data[No_of_active]) Average_utilization = ['mvar21', 'mvar22', 'mvar23', 'mvar24'] lda_Average_utilization = LDA(n_components=5) lda_Average_utilization = lda_Average_utilization.fit( train[Average_utilization], y) data['Average_utilization'] = lda_Average_utilization.transform( data[Average_utilization])
l1 = pd.DataFrame(l1) l1 = pd.concat([l1, test["Pclass"]], join='outer', axis=1) l1 x = l y = titanic["Survived"] x_test = l1 #lda analysis from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from sklearn.ensemble import RandomForestClassifier as RFC from sklearn.metrics import accuracy_score, confusion_matrix ldatemp, rfcdepth, acc = 0, 0, 0 var = [] for i in range(1, 2): lda = LDA(n_components=i) x = lda.fit_transform(x, y) x_test = lda.transform(x_test) for j in range(1, 20): cl = RFC(max_depth=j, random_state=0) cl.fit(x, y) y_pred = cl.predict(x) if (acc < accuracy_score(y_pred, y)): acc = accuracy_score(y_pred, y) ldatemp = i rfcdepth = j var = y_pred print(confusion_matrix(var, titanic["Survived"])) print(str(rfcdepth) + " " + str(ldatemp) + " " + str(acc)) lda = LDA(n_components=ldatemp) x = lda.fit_transform(x, y)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Preallocate the embedding matrix and fill it cell by cell from the text
# file (one whitespace-separated vector per line).
# NOTE(review): the shape (1216315, 50) is hard-coded and np.empty does not
# initialise memory — a shorter file leaves unwritten rows with arbitrary
# contents and a longer file raises IndexError. Confirm it matches the file.
old_paper_embedding = np.empty((1216315, 50), dtype=np.float32)
with open('data/arxiv_fasttext_vector.txt') as f:
    for i, line in enumerate(f):
        for j, x in enumerate(line.split()):
            x = float(x)
            old_paper_embedding[i, j] = x

# One label per line, stripped of surrounding whitespace.
# NOTE(review): presumably line k labels embedding row k — confirm.
labels = []
with open('save/arxiv_label.txt') as f:
    for line in f:
        label = line.strip()
        labels.append(label)

# Fit LDA on the embeddings and project them to two components.
clf = LDA(n_components=2)
clf.fit(old_paper_embedding, labels)
paper_embedding = clf.transform(old_paper_embedding)

# Wrap the projection in a non-trainable variable and save it as a checkpoint
# under 'save/best'.
paper_embedding = tf.Variable(paper_embedding,
                              trainable=False,
                              name='paper_embedding',
                              dtype=tf.float32)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
tf.train.Saver().save(sess, 'save/best')

# Projector configuration: point it at the saved tensor and at the metadata file.
config = projector.ProjectorConfig()
paper_projector = config.embeddings.add()
paper_projector.tensor_name = paper_embedding.name
paper_projector.metadata_path = 'label.txt'
# The estimators are imported from the public packages (sklearn.svm,
# sklearn.ensemble). The private module paths used before
# (sklearn.svm.classes, sklearn.ensemble.weight_boosting, ...) are not
# available in current scikit-learn releases; the public paths work on old
# and new versions alike.
# SVC : sklearn.svm.SVC
# NSVC : sklearn.svm.NuSVC
# OCSVM : sklearn.svm.OneClassSVM
from sklearn.svm import LinearSVC as LSVC, SVC, NuSVC as NSVC, OneClassSVM as OCSVM
# ABC : sklearn.ensemble.AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier as ABC
# GBC : sklearn.ensemble.GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
# RFC : sklearn.ensemble.RandomForestClassifier
# ETsC : sklearn.ensemble.ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETsC

# Registry of default-constructed candidate models, keyed by short name.
# Entries that are commented out are not instantiated.
_Models = {
    "LR": LR(),
    "LRCV": LRCV(),
    "LDA": LDA(),
    "QDA": QDA(),
    "KNC": KNC(),
    # "RNC" : RNC(),
    "DTC": DTC(),
    "ETC": ETC(),
    "GNB": GNB(),
    # "BDNB" : BDNB(),
    "MNB": MNB(),
    "BNB": BNB(),
    "LSVC": LSVC(),
    "SVC": SVC(),
    "NSVC": NSVC(),
    # "OCSVM" : OCSVM()
}

# Compare the evaluation results
fig.show() # sklearn_LDA_cell_line = LDA(n_components=2) # sklearn_LDA_cell_line.fit(X, y) else: print("unknown index for an example!") else: # ----------------------------------------------------- # 6. use sklearn LDA # ----------------------------------------------------- # apply sklearn LDA to iris data iris = load_iris() sklearn_LDA = LDA(n_components=2) sklearn_LDA_projection = sklearn_LDA.fit_transform(iris.data, iris.target) sklearn_LDA_projection = -sklearn_LDA_projection # plot the projections fig = plt.figure() ax = fig.add_subplot(1, 1, 1) ax.set_title('Results from applying sklearn LDA to iris') # ax.set_xlabel(r'$W_1$') # ax.set_ylabel(r'$W_2$') ax.scatter(sklearn_LDA_projection[0:50, 0], sklearn_LDA_projection[0:50, 1], marker='o', s=marker_size, color='blue', label='setosa') ax.scatter(sklearn_LDA_projection[50:100, 0], sklearn_LDA_projection[50:100, 1], marker='o', s=marker_size, color='red', label='versicolor') ax.scatter(sklearn_LDA_projection[100:150, 0], sklearn_LDA_projection[100:150, 1],
def run_trainer(cfg, ftrain, interactive=False):
    """Train a classifier on PSD features, optionally cross-validate and test it.

    Steps, in order: load and filter the recordings in ``ftrain``, epoch them,
    compute PSD features, optionally cross-validate (``cfg.CV_PERFORM``), fit
    the classifier on all windows, save it with ``qc.save_obj`` under
    ``<cfg.DATADIR>/classifier/``, optionally list the most important features
    (random forest only) and, if ``cfg.ftest`` is non-empty, score the
    classifier on that file.

    Args:
        cfg: configuration object; the attributes read here include DATADIR,
            SP_FILTER, TP_FILTER, NOTCH_FILTER, CHANNEL_PICKS, SP_CHANNELS,
            TRIGGER_DEF, tdef, EPOCH, PSD, CLASSIFIER, RF, CV_PERFORM,
            CV_FOLDS, CV_TEST_RATIO, USE_LOG, EXPORT_GOOD_FEATURES,
            FEAT_TOPN and ftest.
        ftrain: training file(s), passed to ``pu.load_multi``.
        interactive: when true, drop into ``pdb`` if epoching fails.

    Raises:
        RuntimeError: on an unsupported entry in CHANNEL_PICKS / SP_CHANNELS,
            an unknown classifier name, or a failure while epoching.

    The function also calls ``sys.exit(-1)`` for an out-of-range channel
    index, an unsupported CV method and multi-segment test epochs.
    """
    # feature selection?
    datadir = cfg.DATADIR
    feat_picks = None
    txt = 'all'
    do_balance = False

    # preprocessing, epoching and PSD computation
    n_epochs = {}
    spfilter = cfg.SP_FILTER
    tpfilter = cfg.TP_FILTER

    # Load multiple files
    multiplier = 1
    raw, events = pu.load_multi(ftrain, spfilter=spfilter, multiplier=multiplier)
    triggers = {cfg.tdef.by_value[c]: c for c in set(cfg.TRIGGER_DEF)}

    # Pick channels
    if cfg.CHANNEL_PICKS is None:
        picks = pick_types(raw.info, meg=False, eeg=True, stim=False,
                           eog=False, exclude='bads')
    else:
        picks = []
        for c in cfg.CHANNEL_PICKS:
            if isinstance(c, int):
                picks.append(c)
            elif isinstance(c, str):
                picks.append(raw.ch_names.index(c))
            else:
                # Call form of raise: valid on Python 2 and Python 3.
                raise RuntimeError(
                    'CHANNEL_PICKS is unknown format.\nCHANNEL_PICKS=%s' %
                    (cfg.CHANNEL_PICKS,))

    # ">=": an index equal to the channel count is already out of range.
    if max(picks) >= len(raw.info['ch_names']):
        print('ERROR: "picks" has a channel index %d while there are only %d channels.' %
              (max(picks), len(raw.info['ch_names'])))
        sys.exit(-1)

    # Spatial filter
    if cfg.SP_CHANNELS is None:
        spchannels = pick_types(raw.info, meg=False, eeg=True, stim=False,
                                eog=False, exclude='bads')
    else:
        spchannels = []
        for c in cfg.SP_CHANNELS:
            if isinstance(c, int):
                spchannels.append(c)
            elif isinstance(c, str):
                spchannels.append(raw.ch_names.index(c))
            else:
                raise RuntimeError(
                    'SP_CHANNELS is unknown format.\nSP_CHANNELS=%s' %
                    (cfg.SP_CHANNELS,))

    # Spectral filter
    if tpfilter is not None:
        raw = raw.filter(tpfilter[0], tpfilter[1], picks=picks,
                         n_jobs=mp.cpu_count())
    if cfg.NOTCH_FILTER is not None:
        raw = raw.notch_filter(cfg.NOTCH_FILTER, picks=picks,
                               n_jobs=mp.cpu_count())

    # Read epochs
    try:
        epochs_train = Epochs(raw, events, triggers, tmin=cfg.EPOCH[0],
                              tmax=cfg.EPOCH[1], proj=False, picks=picks,
                              baseline=None, preload=True, add_eeg_ref=False,
                              verbose=False, detrend=None)
    except Exception:
        # "except Exception" instead of a bare except so that Ctrl-C and
        # sys.exit() are not turned into a RuntimeError.
        print('\n*** (trainer.py) ERROR OCCURRED WHILE EPOCHING ***\n')
        traceback.print_exc()
        if interactive:
            print('Dropping into a shell.\n')
            pdb.set_trace()
        raise RuntimeError

    sfreq = raw.info['sfreq']

    # Compute features
    res = get_psd_feature(epochs_train, cfg.EPOCH, cfg.PSD, feat_picks)
    X_data = res['X_data']
    Y_data = res['Y_data']
    wlen = res['wlen']
    w_frames = res['w_frames']
    psde = res['psde']
    plot_pca_componet(X_data, Y_data)

    psdparams = cfg.PSD
    for ev in triggers:
        print(ev)
        # number of events carrying this trigger value
        n_epochs[ev] = len(np.where(events[:, -1] == triggers[ev])[0])

    # Init a classifier
    if cfg.CLASSIFIER == 'RF':
        # Make sure to set n_jobs=cpu_count() for training and n_jobs=1 for testing.
        cls = RandomForestClassifier(n_estimators=cfg.RF['trees'],
                                     max_features='auto',
                                     max_depth=cfg.RF['maxdepth'],
                                     n_jobs=mp.cpu_count(),
                                     class_weight='balanced')
    elif cfg.CLASSIFIER == 'LDA':
        cls = LDA()
    # elif cfg.CLASSIFIER=='rLDA':
    #     cls= rLDA(cfg.RLDA_REGULARIZE_COEFF)
    else:
        raise RuntimeError('*** Unknown classifier %s' % cfg.CLASSIFIER)

    # Cross-validation
    if cfg.CV_PERFORM is not None:
        ntrials, nsamples, fsize = X_data.shape

        if cfg.CV_PERFORM == 'LeaveOneOut':
            print('\n>> %d-fold leave-one-out cross-validation' % ntrials)
            cv = LeaveOneOut(len(Y_data))
        elif cfg.CV_PERFORM == 'StratifiedShuffleSplit':
            print('\n>> %d-fold stratified cross-validation with test set ratio %.2f' %
                  (cfg.CV_FOLDS, cfg.CV_TEST_RATIO))
            cv = StratifiedShuffleSplit(Y_data[:, 0], cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=0)
        else:
            print('>> ERROR: Unsupported CV method yet.')
            sys.exit(-1)
        print('%d trials, %d samples per trial, %d feature dimension' %
              (ntrials, nsamples, fsize))

        # Do it!
        scores = crossval_epochs(cv, X_data, Y_data, cls, cfg.tdef.by_value,
                                 do_balance)

        # Results
        print('\n>> Class information')
        for ev in np.unique(Y_data):
            print('%s: %d trials' %
                  (cfg.tdef.by_value[ev], len(np.where(Y_data[:, 0] == ev)[0])))
        if do_balance:
            print('The number of samples was balanced across classes. Method:',
                  do_balance)

        print('\n>> Experiment conditions')
        print('Spatial filter: %s (channels: %s)' % (spfilter, spchannels))
        print('Spectral filter: %s' % tpfilter)
        print('Notch filter: %s' % cfg.NOTCH_FILTER)
        print('Channels: %s' % picks)
        print('PSD range: %.1f - %.1f Hz' % (psdparams['fmin'], psdparams['fmax']))
        print('Window step: %.1f msec' % (1000.0 * psdparams['wstep'] / sfreq))
        if type(wlen) is list:
            for i, w in enumerate(wlen):
                print('Window size: %.1f sec' % (w))
                print('Epoch range: %s sec' % (cfg.EPOCH[i]))
        else:
            print('Window size: %.1f sec' % (psdparams['wlen']))
            print('Epoch range: %s sec' % (cfg.EPOCH))

        cv_mean, cv_std = np.mean(scores), np.std(scores)
        print('\n>> Average CV accuracy over %d epochs' % ntrials)
        if cfg.CV_PERFORM in ['LeaveOneOut', 'StratifiedShuffleSplit']:
            print("mean %.3f, std: %.3f" % (cv_mean, cv_std))
        print('Classifier: %s' % cfg.CLASSIFIER)
        if cfg.CLASSIFIER == 'RF':
            print(' %d trees, %d max depth' % (cfg.RF['trees'], cfg.RF['maxdepth']))

        if cfg.USE_LOG:
            logfile = '%s/result_%s_%s.txt' % (datadir, cfg.CLASSIFIER, txt)
            # "with" guarantees the log is closed even if the write fails.
            with open(logfile, 'a') as logout:
                logout.write('%s\t%.3f\t%.3f\n' %
                             (ftrain[0], np.mean(scores), np.var(scores)))

    # Train classifier
    archtype = platform.architecture()[0]
    clsfile = '%s/classifier/classifier-%s.pcl' % (datadir, archtype)
    print('\n>> Training classifier')
    # Merge along the first axis so that every window becomes one sample.
    X_data_merged = np.concatenate(X_data)
    Y_data_merged = np.concatenate(Y_data)
    timer = qc.Timer()
    cls.fit(X_data_merged, Y_data_merged)
    print('Trained %d samples x %d dimension in %.1f sec' %
          (X_data_merged.shape[0], X_data_merged.shape[1], timer.sec()))
    # set n_jobs = 1 for testing
    cls.n_jobs = 1

    classes = {c: cfg.tdef.by_value[c] for c in np.unique(Y_data)}
    # Everything stored next to the classifier in the saved object.
    data = dict(cls=cls, psde=psde, sfreq=sfreq, picks=picks, classes=classes,
                epochs=cfg.EPOCH, w_frames=w_frames,
                w_seconds=psdparams['wlen'], wstep=psdparams['wstep'],
                spfilter=spfilter, spchannels=spchannels, refchannel=None,
                tpfilter=tpfilter, notch=cfg.NOTCH_FILTER, triggers=cfg.tdef)
    qc.make_dirs('%s/classifier' % datadir)
    qc.save_obj(clsfile, data)

    # Show top distinctive features
    if cfg.CLASSIFIER == 'RF':
        print('\n>> Good features ordered by importance')
        keys, _ = qc.sort_by_value(list(cls.feature_importances_), rev=True)
        if cfg.EXPORT_GOOD_FEATURES:
            gfout = open('%s/good_features.txt' % datadir, 'w')

        # reverse-lookup frequency from fft
        if type(wlen) is not list:
            fq = 0
            fq_res = 1.0 / psdparams['wlen']
            fqlist = []
            while fq <= psdparams['fmax']:
                if fq >= psdparams['fmin']:
                    fqlist.append(fq)
                fq += fq_res

            for k in keys[:cfg.FEAT_TOPN]:
                ch, hz = qc.feature2chz(k, fqlist, picks, ch_names=raw.ch_names)
                print('%s, %.1f Hz (feature %d)' % (ch, hz, k))
                if cfg.EXPORT_GOOD_FEATURES:
                    gfout.write('%s\t%.1f\n' % (ch, hz))

            if cfg.EXPORT_GOOD_FEATURES:
                if cfg.CV_PERFORM is not None:
                    gfout.write('\nCross-validation performance: mean %.2f, std %.2f\n' %
                                (cv_mean, cv_std))
                gfout.close()
            print()
        else:
            print('Ignoring good features because of multiple epochs.')
            if cfg.EXPORT_GOOD_FEATURES:
                # The file was opened above; close it on this path as well.
                gfout.close()

    # Test file
    if len(cfg.ftest) > 0:
        raw_test, events_test = pu.load_raw('%s' % (cfg.ftest), spfilter)

        # TODO: implement multi-segment epochs
        if type(cfg.EPOCH[0]) is list:
            print('MULTI-SEGMENT EPOCH IS NOT SUPPORTED YET.')
            sys.exit(-1)

        epochs_test = Epochs(raw_test, events_test, triggers,
                             tmin=cfg.EPOCH[0], tmax=cfg.EPOCH[1], proj=False,
                             picks=picks, baseline=None, preload=True,
                             add_eeg_ref=False)

        # Test-set PSD is cached in the current working directory.
        psdfile = 'psd-test.pcl'
        if not os.path.exists(psdfile):
            print('\n>> Computing PSD for test set')
            X_test, y_test = pu.get_psd(epochs_test, psde, w_frames,
                                        int(sfreq / 8))
            qc.save_obj(psdfile, {'X': X_test, 'y': y_test})
        else:
            print('\n>> Loading %s' % psdfile)
            data = qc.load_obj(psdfile)
            X_test, y_test = data['X'], data['y']

        score_test = cls.score(np.concatenate(X_test), np.concatenate(y_test))
        print('Testing score', score_test)

        # running performance
        print('\nRunning performance over time')
        scores_windows = []
        timer = qc.Timer()
        for ep in range(y_test.shape[0]):
            scores = []
            frames = X_test[ep].shape[0]
            timer.reset()
            for t in range(frames):
                X = X_test[ep][t, :]
                y = [y_test[ep][t]]
                scores.append(cls.score(X, y))
            print('Tested epoch %d, %.3f msec per window' %
                  (ep, timer.sec() * 1000.0 / frames))
            scores_windows.append(scores)
        scores_windows = np.array(scores_windows)
def LDA_classification_aggregate_activity_scores(Data, labels):
    """Return the mean 5-fold cross-validation score of LDA on ``Data``.

    Args:
        Data: feature matrix passed to ``cross_val_score``.
        labels: one label per row of ``Data``; the labels are used in sorted
            order. The argument itself is left untouched.

    Returns:
        The mean of the five fold scores.
    """
    # Activity score features are sorted as label 0 then label 1, so the
    # labels are rearranged the same way (0s first, then 1s). sorted() builds
    # a new list; the previous in-place ``.sort()`` on an alias also
    # reordered the caller's ``labels``.
    ordered_labels = sorted(labels)
    scores = cross_val_score(LDA(solver='svd'), Data, ordered_labels, cv=5)
    return scores.mean()
# Map class index -> class name and count the samples per class.
# NOTE(review): `labels` is used as a 2-D indicator matrix with one column
# per class — confirm against where it is built.
num2class = dict(enumerate(label_names))
class_nums = labels.sum(axis=0)
for i in range(len(label_names)):
    print(num2class[i], ":", class_nums[i], "of samples.")

features = pd.read_csv('features.csv', index_col=0, header=[0, 1, 2])
print("Features shape:", features.shape)
assert features.shape[0] == len(file_names) == class_nums.sum()

# Coarser labelling: columns 0-1 are added element-wise to columns 2-3 and
# the last column is appended unchanged.
simple_labels = labels[:, :2] + labels[:, 2:4]
simple_labels = np.column_stack((simple_labels, labels[:, -1]))

# Column index of each non-zero entry = integer class id per sample.
y = np.nonzero(labels)[1]
# `classes` and `y` are passed by keyword: newer scikit-learn releases accept
# them only as keyword arguments, and the keyword form also works on older
# releases.
y_weights = dict(
    enumerate(compute_class_weight('balanced', classes=np.unique(y), y=y)))
X = LDA(n_components=2).fit_transform(features, y)

# Scatter the 2-D projection, one colour per class.
figsize = (8, 6)
plt.figure(figsize=figsize)
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
ax = []
for c, i in zip(colors[:y.max() + 1], range(y.max() + 1)):
    idx = np.where(y == i)
    ax.append(plt.scatter(X[idx, 0], X[idx, 1], c=c, alpha=0.5))
plt.legend(ax, [num2class[i] for i in range(y.max() + 1)])
plt.show()

# Same weights and projection for the simplified labels.
sy = np.nonzero(simple_labels)[1]
sy_weights = dict(
    enumerate(compute_class_weight('balanced', classes=np.unique(sy), y=sy)))
sX = LDA(n_components=2).fit_transform(features, sy)
if n_components_pca == False: n_components_pca = min(5*(cell_cluster_number-1), int(0.01*gene_number)) n_components_lda = parameters.nComponentsLDA if n_components_lda == False: n_components_lda = min(cell_cluster_number - 1, n_components_pca) if n_components_pca < n_components_lda: print("--nComponentsPCA(-np) should not be less than --nComponentsLDA(-nl)") sys.exit (0) expression_matrix = PCA (n_components=n_components_pca, svd_solver="full").fit_transform (expression_matrix) print("Matrix shape after PCA: ", expression_matrix.shape) oa = OAS(store_precision=False, assume_centered=False) expression_matrix = LDA (n_components=n_components_lda, covariance_estimator=OAS(store_precision=False, assume_centered=False), solver='eigen').fit_transform (expression_matrix, cell_type_array) print("Matrix shape after LDA: ", expression_matrix.shape) average_size_subclusters = parameters.sizeSubcluster celltype2subtype = {} for celltype in all_cell_types: idx = np.where (cell_type_array == celltype)[0] n_clu = int (len (idx) / average_size_subclusters) + 1 cells_of_this_celltype = expression_matrix[idx] predict_of_subtype = kmeans (cells_of_this_celltype, n_clusters=n_clu) subcelltype = np.array (["{}*SustechJinLab*{}".format (celltype, p) for p in predict_of_subtype.labels_]) celltype2subtype[celltype] = sorted (list (set (subcelltype)), key=sort_key) cell_subtype_array[idx] = subcelltype all_cell_subtypes = sorted (list (set (cell_subtype_array)), key=sort_key)
Model trained with all pokemon sightings """ trainFeatures = train[featureSelected] trainLabels = train['pokemonId'].as_matrix() testFeatures = test[featureSelected] testLabels = test['pokemonId'].as_matrix() #PCA pca = PCA(n_components=5) pca.fit(trainFeatures) pcaTrainFeatures = pca.transform(trainFeatures) pcaTestFeatures = pca.transform(testFeatures) #LDA lda = LDA(n_components=5) lda.fit(trainFeatures, trainLabels) ldaTrainFeatures = lda.transform(trainFeatures) ldaTestFeatures = lda.transform(testFeatures) #Logistic Regression #lrModel = LogisticRegression() #lrModel.fit(trainFeatures, trainLabels) #acc = lrModel.score(testFeatures, testLabels) # #lrModel = LogisticRegression() #lrModel.fit(pcaTrainFeatures, trainLabels) #acc1 = lrModel.score(pcaTestFeatures, testLabels) # #lrModel = LogisticRegression() #lrModel.fit(ldaTrainFeatures, trainLabels)
############################################################################## # Create Pipelines # ---------------- # # Pipelines must be a dict of sklearn pipeline transformer. # # The CSP implementation is based on the MNE implementation. We selected 8 CSP # components, as usually done in the literature. # # The Riemannian geometry pipeline consists in covariance estimation, tangent # space mapping and finally a logistic regression for the classification. pipelines = {} pipelines["CSP+LDA"] = make_pipeline(CSP(n_components=8), LDA()) pipelines["RG+LR"] = make_pipeline( Covariances(), TangentSpace(), LogisticRegression(solver="lbfgs") ) ############################################################################## # Evaluation # ---------- # # We define the paradigm (LeftRightImagery) and the dataset (BNCI2014001). # The evaluation will return a DataFrame containing a single AUC score for # each subject / session of the dataset, and for each pipeline. # # Results are saved into the database, so that if you add a new pipeline, it # will not run again the evaluation unless a parameter has changed. Results can
# Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling must be applied when you apply dimentionality techniques!!! from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # Applying LDA to extract new independent variables (linear discriminants) that separate the most classes of the dependent variable from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda = LDA(n_components=2) # called linear discriminants X_train = lda.fit_transform( X_train, y_train ) # LDA is a supervised model so need to include the dependent variable X_test = lda.transform(X_test) # Fitting Logistic Regression to the Training set from sklearn.linear_model import LogisticRegression classifier = LogisticRegression(random_state=0) classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix
datasets = [Zhou2016(), BNCI2014001()] subj = [1, 2, 3] for d in datasets: d.subject_list = subj ############################################################################## # The following lines go exactly as in the previous example, where we end up # obtaining a pandas dataframe containing the results of the evaluation. We # could set `overwrite` to False to cache the results, avoiding to restart all # the evaluation from scratch if a problem occurs. paradigm = LeftRightImagery() evaluation = WithinSessionEvaluation(paradigm=paradigm, datasets=datasets, overwrite=False) pipeline = make_pipeline(CSP(n_components=8), LDA()) results = evaluation.process({"csp+lda": pipeline}) ############################################################################## # Plotting Results # ---------------- # # We plot the results using the seaborn library. Note how easy it # is to plot the results from the three datasets with just one line. results["subj"] = [str(resi).zfill(2) for resi in results["subject"]] g = sns.catplot( kind="bar", x="score", y="subj", col="dataset",
def btnConvert_click(self):
    """Run LDA feature extraction for every requested fold and save the result.

    All settings are read from the module-level ``ui`` form.  For each fold in
    ``[FoldFrom, FoldTo]`` the ``$FOLD$`` placeholder in the input and output
    file names is replaced by the fold number, the input file is loaded with
    ``mainIO_load``, the selected variables are copied into the output
    dictionary, an LDA model is fitted on the train data and applied to the
    test data, and the dictionary is written with ``mainIO_save``.

    Returns:
        ``False`` after an error dialog was shown for an invalid form value;
        ``None`` when a problem was only printed to the console, or when all
        folds completed.
    """
    msgBox = QMessageBox()

    def show(text, icon=QMessageBox.Critical):
        """Show a modal message box; always returns False for ``return show(...)``."""
        msgBox.setText(text)
        msgBox.setIcon(icon)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()
        return False

    def names_missing(pairs, messages):
        """Show the matching message for the first empty variable name.

        ``pairs`` holds ``(input_combo, output_edit)`` widgets.  All inputs are
        checked before all outputs, which is the order ``messages`` follows.
        Returns True when a name was missing.
        """
        texts = [src.currentText() for src, _ in pairs]
        texts += [dst.text() for _, dst in pairs]
        for text, message in zip(texts, messages):
            if not text:
                show(message)
                return True
        return False

    def prompts(template, roles):
        """Expand one message template for the four train/test input/output roles."""
        return [template.format(role) for role in roles]

    try:
        FoldFrom = int(ui.txtFoldFrom.text())
        FoldTo = int(ui.txtFoldTo.text())
    except ValueError:
        print("Please check fold parameters!")
        return
    if FoldTo < FoldFrom:
        print("Please check fold parameters!")
        return

    # Fold-independent settings are parsed once.  The builtin float() replaces
    # np.float, an alias that was removed from NumPy in 1.24.
    try:
        Tol = float(ui.txtTole.text())
    except ValueError:
        return show("Tolerance is wrong!")
    Solver = ui.cbSolver.currentData()

    # The dialogs word the four roles in two different orders; keep both so
    # every message stays exactly as before.
    train_first = ("Train Input", "Test Input", "Train Output", "Test Output")
    input_first = ("Input Train", "Input Test", "Output Train", "Output Test")

    data_pairs = ((ui.txtITrData, ui.txtOTrData),
                  (ui.txtITeData, ui.txtOTeData))
    label_pairs = ((ui.txtITrLabel, ui.txtOTrLabel),
                   (ui.txtITeLabel, ui.txtOTeLabel))

    # Optional variables copied from input to output when their checkbox is set:
    # (checkbox, (input, output) widget pairs, reshape to 1-D vector?,
    #  dialog messages for empty names, console message when the copy fails)
    optional = (
        (ui.cbSubject,
         ((ui.txtITrSubject, ui.txtOTrSubject),
          (ui.txtITeSubject, ui.txtOTeSubject)),
         True,
         prompts("Please enter {} Subject variable name!", train_first),
         "Cannot load Subject IDs"),
        (ui.cbTask,
         ((ui.txtITrTask, ui.txtOTrTask), (ui.txtITeTask, ui.txtOTeTask)),
         True,
         prompts("Please enter {} Task variable name!", input_first),
         "Cannot load Tasks!"),
        (ui.cbRun,
         ((ui.txtITrRun, ui.txtOTrRun), (ui.txtITeRun, ui.txtOTeRun)),
         True,
         prompts("Please enter {} Run variable name!", train_first),
         "Cannot load Runs!"),
        (ui.cbCounter,
         ((ui.txtITrCounter, ui.txtOTrCounter),
          (ui.txtITeCounter, ui.txtOTeCounter)),
         True,
         prompts("Please enter {} Counter variable name!", train_first),
         "Cannot load Counters!"),
        (ui.cbmLabel,
         ((ui.txtITrmLabel, ui.txtOTrmLabel),
          (ui.txtITemLabel, ui.txtOTemLabel)),
         False,
         prompts("Please enter {} Matrix Label variable name!", train_first),
         "Cannot load matrix lables!"),
        (ui.cbDM,
         ((ui.txtITrDM, ui.txtOTrDM), (ui.txtITeDM, ui.txtOTeDM)),
         False,
         prompts("Please enter {} Design Matrix variable name!", train_first),
         "Cannot load design matrices!"),
        (ui.cbCol,
         ((ui.txtCol, ui.txtOCol),),
         False,
         ["Please enter Coordinator variable name!"] * 2,
         "Cannot load coordinator!"),
        (ui.cbCond,
         ((ui.txtCond, ui.txtOCond),),
         False,
         ["Please enter Condition variable name!"] * 2,
         "Cannot load conditions!"),
        (ui.cbFoldID,
         ((ui.txtFoldID, ui.txtOFoldID),),
         True,
         ["Please enter FoldID variable name!"] * 2,
         "Cannot load Fold ID!"),
        (ui.cbFoldInfo,
         ((ui.txtFoldInfo, ui.txtOFoldInfo),),
         False,
         ["Please enter FoldInfo variable name!"] * 2,
         "Cannot load Fold Info!"),
        (ui.cbNScan,
         ((ui.txtITrScan, ui.txtOTrScan), (ui.txtITeScan, ui.txtOTeScan)),
         True,
         prompts("Please enter Number of Scan variable name for {}!",
                 input_first),
         "Cannot load NScan!"),
    )

    for fold in range(FoldFrom, FoldTo + 1):
        # Output / input files ($FOLD$ is replaced by the fold number).
        OutFile = ui.txtOutFile.text().replace("$FOLD$", str(fold))
        if not OutFile:
            return show("Please enter out file!")

        InFile = ui.txtInFile.text().replace("$FOLD$", str(fold))
        if not InFile:
            return show("Please enter input file!")
        if not os.path.isfile(InFile):
            msgBox.setText("Input file not found!\n" + InFile)
            print(InFile + " - not found!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        InData = mainIO_load(InFile)
        OutData = dict()
        OutData["imgShape"] = reshape_1Dvector(InData["imgShape"])

        # Data
        if names_missing(data_pairs,
                         prompts("Please enter {} Data variable name!",
                                 input_first)):
            return False
        try:
            XTr = InData[ui.txtITrData.currentText()]
            XTe = InData[ui.txtITeData.currentText()]
            if ui.cbScale.isChecked():
                XTr = preprocessing.scale(XTr)
                XTe = preprocessing.scale(XTe)
                print("Whole of data is scaled X~N(0,1).")
        except Exception:
            print("Cannot load data")
            return

        # Number of components
        try:
            NumFea = int(ui.txtNumFea.text())
        except ValueError:
            return show("Number of features is wrong!")
        if NumFea < 1:
            return show("Number of features must be greater than zero!")
        if NumFea > np.shape(XTr)[1]:
            return show("Number of features is wrong!")

        # Labels
        if names_missing(label_pairs,
                         prompts("Please enter {} Label variable name!",
                                 train_first)):
            return False
        try:
            YTr = InData[ui.txtITrLabel.currentText()][0]
            YTe = InData[ui.txtITeLabel.currentText()][0]
            OutData[ui.txtOTrLabel.text()] = reshape_1Dvector(YTr)
            OutData[ui.txtOTeLabel.text()] = reshape_1Dvector(YTe)
        except Exception:
            print("Cannot load labels!")
            # Without labels the fit below cannot run (YTr would be unbound),
            # so stop here like every other load failure does.
            return

        # Optional variables
        for checkbox, pairs, flatten, messages, failure in optional:
            if not checkbox.isChecked():
                continue
            if names_missing(pairs, messages):
                return False
            try:
                for src, dst in pairs:
                    value = InData[src.currentText()]
                    OutData[dst.text()] = (reshape_1Dvector(value)
                                           if flatten else value)
            except Exception:
                print(failure)
                return

        print("Running LDA:")
        model = LDA(n_components=NumFea, solver=Solver, tol=Tol)
        print("Training ...")
        XTr_New = model.fit_transform(XTr, YTr)
        OutData[ui.txtOTrData.text()] = XTr_New
        print("Testing ...")
        XTe_New = model.transform(XTe)
        OutData[ui.txtOTeData.text()] = XTe_New
        print("Saving ...")
        mainIO_save(OutData, OutFile)
        print("Fold " + str(fold) + " is DONE: " + OutFile)

    print("LDA is done.")
    show("LDA is done.", QMessageBox.Information)
# ---------------- # # Pipelines must be a dict of sklearn pipeline transformer. processing_sampling_rate = 128 pipelines = {} # we have to do this because the classes are called 'Target' and 'NonTarget' # but the evaluation function uses a LabelEncoder, transforming them # to 0 and 1 labels_dict = {"Target": 1, "NonTarget": 0} # Riemannian geometry based classification pipelines["RG+LDA"] = make_pipeline( XdawnCovariances(nfilter=5, estimator="lwf", xdawn_estimator="scm"), TangentSpace(), LDA(solver="lsqr", shrinkage="auto"), ) pipelines["Xdw+LDA"] = make_pipeline(Xdawn(nfilter=2, estimator="scm"), Vectorizer(), LDA(solver="lsqr", shrinkage="auto")) ############################################################################## # Evaluation # ---------- # # We define the paradigm (P300) and use all three datasets available for it. # The evaluation will return a dataframe containing AUCs for each permutation # and dataset size. paradigm = P300(resample=processing_sampling_rate)
X_test_raw = np.concatenate( (np.array(hog_test), np.array(rgb_test), np.array(hsv_test)), axis=1) X_train1 = [] y_train1 = [] X_val = [] y_val = [] X_train1, X_val, y_train1, y_val = train_test_split(X_train_raw, y_train_raw, test_size=0.2, random_state=1) print("yay") #rf_class= RandomForestClassifier(n_estimators=200,max_depth=30, random_state=0,class_weight='balanced') clf = LDA() #print(cross_val_score(rf_class, X_train1, y_train1, scoring='accuracy', cv = 10)) accuracy = cross_val_score(clf, X_train1, y_train1, scoring='accuracy').mean() * 100 print("Accuracy of Random Forests is: ", accuracy) if (accuracy > 0.36): print("YAY") clf.fit(X_train1, y_train1) s = clf.score(X_val, y_val) print(s) if s > 0.36: clf.fit(X_train_raw, y_train_raw) predicted = clf.predict(X_test_raw) pickle.dump(clf, open(model_label, 'wb')) with open(test_label, 'w+') as csvfile:
# Kernel names accepted by KernelPCA.
# NOTE(review): this set is not read in the visible code; presumably a
# reminder of candidate values for tuning -- confirm before removing.
kernel = {'linear', 'poly', 'rbf', 'sigmoid'}

# Applying Kernel PCA
from sklearn.decomposition import KernelPCA
# KernelPCA has no `shrinkage` parameter; passing one raises TypeError, so it
# is dropped here.
kpca = KernelPCA(n_components=2, kernel='rbf', n_jobs=-1)
X_train = kpca.fit_transform(X_train)
X_task = kpca.transform(X_task)

# Solvers accepted by LinearDiscriminantAnalysis (same caveat as `kernel`).
solver = {'svd', 'lsqr', 'eigen'}

# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# The 'svd' solver does not support shrinkage (fit raises NotImplementedError
# when both are given), so shrinkage is left at its default of None.  To use
# shrinkage='auto' together with transform(), switch the solver to 'eigen'.
lda = LDA(n_components=2, solver='svd')
X_train = lda.fit_transform(X_train, y_target)
X_task = lda.transform(X_task)

# Candidate neighbour counts (same caveat as `kernel`).
neighbors = {3, 5, 10, 20}

# Training the K-NN model on the Training set.
# Minkowski with p=2 is equivalent to the standard Euclidean metric
# (these are the defaults).
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5,
                                  metric="minkowski",
                                  p=2,
                                  n_jobs=-1)
def _timed(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``."""
    started = time.time()
    result = func(*args)
    return result, time.time() - started


def _save_curve(x, y, filename, title, xlabel, ylabel):
    """Plot one line on a fresh figure, save it to *filename* and close it.

    *x* may be None to plot *y* against its index; *title* may be None.
    """
    fig = plt.figure()
    if x is None:
        plt.plot(y)
    else:
        plt.plot(x, y)
    if title is not None:
        plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.close(fig)


def _save_elbow(data, filename):
    """Save the K-means within-cluster sum of squares for k = 1..10."""
    ks = range(1, 11)
    wcss = [
        KMeans(n_clusters=k, init='k-means++',
               random_state=RANDOM_STATE).fit(data).inertia_ for k in ks
    ]
    _save_curve(ks, wcss, filename, 'Elbow Method', 'Number of clusters',
                'WCSS')


def _save_silhouette(data, filename):
    """Save the silhouette score of Gaussian-mixture clusterings for k = 2..11."""
    ks = range(2, 12)
    scores = []
    for k in ks:
        gmm = GaussianMixture(n_components=k, n_init=2,
                              random_state=RANDOM_STATE).fit(data)
        scores.append(
            metrics.silhouette_score(data, gmm.predict(data),
                                     metric='euclidean'))
    # The x axis is the component counts that were actually evaluated.
    _save_curve(ks, scores, filename, 'Elbow Method', 'Number of clusters',
                'silhouette_score')


def _scatter_panel(ax, title, points, colors, centers=None, cols=(0, 1)):
    """Scatter two columns of *points* (and optional cluster *centers*) on *ax*."""
    x_col, y_col = cols
    ax.set_title(title)
    ax.scatter(points[:, x_col], points[:, y_col], c=colors, alpha=0.5)
    if centers is not None:
        ax.scatter(centers[:, x_col], centers[:, y_col], c='red')


def _save_comparison(filename, left, right):
    """Save a two-panel scatter figure; *left*/*right* are `_scatter_panel` kwargs."""
    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(10, 6))
    _scatter_panel(ax1, **left)
    _scatter_panel(ax2, **right)
    fig.savefig(filename)
    plt.close(fig)


def _save_time_bars(names, times, title, filename):
    """Save a horizontal bar chart of *times* (seconds) labelled by *names*."""
    fig = plt.figure()
    plt.barh(np.array(names), times, align='center')
    plt.title(title)
    plt.ylabel('Name')
    plt.xlabel('Time (seconds)')
    plt.savefig(filename, bbox_inches="tight")
    plt.close(fig)


def _cluster_kmeans(n_clusters, train, test):
    """Fit K-means on *train* and label *test*.

    Returns ``(train_labels, test_labels, centers, fit_time, predict_time)``.
    """
    model, fit_time = _timed(
        KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit, train)
    test_labels, predict_time = _timed(model.predict, test)
    return (model.labels_, test_labels, model.cluster_centers_, fit_time,
            predict_time)


def _cluster_gmm(n_components, train, test):
    """Fit a Gaussian mixture on *train* and label both sets.

    Returns ``(train_labels, test_labels, fit_time, predict_time)``.
    """
    # Seeded like every other estimator in this experiment so that reruns
    # produce the same clustering.
    model, fit_time = _timed(
        GaussianMixture(n_components=n_components,
                        random_state=RANDOM_STATE).fit, train)
    train_labels = model.predict(train)
    test_labels, predict_time = _timed(model.predict, test)
    return train_labels, test_labels, fit_time, predict_time


def _reduce_pca(X_train, X_test, Y_train):
    """Save the cumulative explained-variance curve, then keep 50 components."""
    # Reference: https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html
    spectrum = PCA().fit(X_train)
    _save_curve(None, np.cumsum(spectrum.explained_variance_ratio_),
                "UL2_PCA_variance.png", None, 'number of components',
                'cumulative explained variance')
    pca = PCA(n_components=50, random_state=RANDOM_STATE).fit(X_train)
    return pca.transform(X_train), pca.transform(X_test)


def _reduce_ica(X_train, X_test, Y_train):
    """Save the mean absolute kurtosis for 1..105 components, then keep 95."""
    dims = range(1, 106)
    kurt = []
    for dim in dims:
        ica = FastICA(n_components=dim, tol=2.0, random_state=RANDOM_STATE)
        sources = pd.DataFrame(ica.fit_transform(X_train))
        kurt.append(sources.kurt(axis=0).abs().mean())
    _save_curve(dims, kurt, "UL2_ICA_kurtosis.png", "ICA Kurtosis",
                "Independent Components", "Avg Kurtosis Across IC")

    # NOTE(review): recent scikit-learn releases reportedly reject
    # whiten=True in favour of 'arbitrary-variance' -- confirm against the
    # installed version before changing.
    ica = FastICA(n_components=95, algorithm='parallel', whiten=True,
                  random_state=RANDOM_STATE).fit(X_train)
    return ica.transform(X_train), ica.transform(X_test)


def _reduce_rp(X_train, X_test, Y_train):
    """Project onto 106 sparse random components."""
    rp = SparseRandomProjection(n_components=106,
                                random_state=RANDOM_STATE).fit(X_train)
    return rp.transform(X_train), rp.transform(X_test)


def _reduce_lda(X_train, X_test, Y_train):
    """Project onto a single linear discriminant fitted with the train labels."""
    lda = LDA(n_components=1)
    reduced_train = lda.fit_transform(X_train, Y_train)
    return reduced_train, lda.transform(X_test)


def main():
    """Run the clustering / dimensionality-reduction experiment on adult.data.

    Steps, in order:

    1. K-means on the original features and on the PCA, ICA, random-projection
       and LDA reductions (elbow plots, metrics, before/after scatter plots).
    2. The same with Gaussian mixtures (expectation maximization).
    3. The neural network on the original data, on each reduction, and on each
       reduction with the K-means label appended as an extra feature.

    Every figure is written to a ``UL2_*.png`` file in the working directory
    and closed afterwards; metrics and query times go through
    ``display_metrics`` and ``print``.
    """
    X, y = load_dataset('data/adult.data')
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, random_state=RANDOM_STATE)

    # (name, reducer, K-means clusters, GMM components,
    #  columns of the reduced data shown in the scatter plots)
    # LDA yields a single component, so it is plotted against itself.
    reducers = (
        ("PCA", _reduce_pca, 3, 9, (0, 1)),
        ("ICA", _reduce_ica, 3, 3, (0, 1)),
        ("RP", _reduce_rp, 2, 3, (0, 1)),
        ("LDA", _reduce_lda, 2, 2, (0, 0)),
    )
    names = [spec[0] for spec in reducers]
    n_runs = len(reducers) + 1  # slot 0 is the original feature space

    kmeans_train_time = np.zeros(n_runs)
    kmeans_predict_time = np.zeros(n_runs)
    em_train_time = np.zeros(n_runs)
    em_predict_time = np.zeros(n_runs)

    # K-means --------------------------------------------------------------
    # Ref: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    # https://www.edupristine.com/blog/beyond-k-means
    # https://www.linkedin.com/pulse/finding-optimal-number-clusters-k-means-through-elbow-asanka-perera
    _save_elbow(X_train, "UL2_WCSS.png")
    (kmeans_label_train, kmeans_label_test, centroids, kmeans_train_time[0],
     kmeans_predict_time[0]) = _cluster_kmeans(3, X_train, X_test)
    _save_comparison(
        "UL2_kmeans.png",
        dict(title="Original", points=X_train, colors=Y_train),
        dict(title='K Means', points=X_train, colors=kmeans_label_train,
             centers=centroids))
    display_metrics("Original Kmeans Train", Y_train, kmeans_label_train)
    display_metrics("Original Kmeans Test", Y_test, kmeans_label_test)

    reduced = {}
    kmeans_labels = {}
    for slot, (name, reducer, n_clusters, _, cols) in enumerate(reducers,
                                                                start=1):
        train_r, test_r = reducer(X_train, X_test, Y_train)
        reduced[name] = (train_r, test_r)

        _save_elbow(train_r, "UL2_WCSS_After_{}.png".format(name))
        (labels_r, labels_r_test, centers_r, kmeans_train_time[slot],
         kmeans_predict_time[slot]) = _cluster_kmeans(n_clusters, train_r,
                                                      test_r)
        kmeans_labels[name] = (labels_r, labels_r_test)

        display_metrics("Kmeans Train after {}".format(name), Y_train,
                        labels_r)
        display_metrics("Kmeans Test after {}".format(name), Y_test,
                        labels_r_test)
        # Centroids are drawn in the same two columns as the data points.
        _save_comparison(
            "UL2_kmeans_aft_{}.png".format(name),
            dict(title="K means before {}".format(name), points=X_train,
                 colors=kmeans_label_train, centers=centroids),
            dict(title="K Means after {}".format(name), points=train_r,
                 colors=labels_r, centers=centers_r, cols=cols))

    kmeans_names = ['Kmeans'] + ['Kmeans with {}'.format(n) for n in names]
    _save_time_bars(kmeans_names, kmeans_train_time, 'Kmeans Train Time',
                    'UL2_Kmeans_Traintime.png')
    # Plot the K-means query times (not the EM ones, which are still unset
    # at this point).
    _save_time_bars(kmeans_names, kmeans_predict_time, 'Kmeans Query Time',
                    'UL2_Kmeans_Querytime.png')

    # Expectation Maximization ---------------------------------------------
    _save_silhouette(X_train, "UL2_SS.png")
    (gmm_label_train, gmm_label_test, em_train_time[0],
     em_predict_time[0]) = _cluster_gmm(9, X_train, X_test)
    display_metrics("Original EM Train", Y_train, gmm_label_train)
    display_metrics("Original EM Test", Y_test, gmm_label_test)
    _save_comparison(
        "UL2_EM.png",
        dict(title="Original", points=X_train, colors=Y_train),
        dict(title="Expectation Maximization", points=X_train,
             colors=gmm_label_train))

    for slot, (name, _, _, n_components, cols) in enumerate(reducers,
                                                            start=1):
        train_r, test_r = reduced[name]
        _save_silhouette(train_r, "UL2_SS_After_{}.png".format(name))
        (labels_r, labels_r_test, em_train_time[slot],
         em_predict_time[slot]) = _cluster_gmm(n_components, train_r, test_r)
        display_metrics("EM Train after {}".format(name), Y_train, labels_r)
        display_metrics("EM Test after {}".format(name), Y_test,
                        labels_r_test)
        _save_comparison(
            "UL2_EM_aft_{}.png".format(name),
            dict(title="EM before {}".format(name), points=X_train,
                 colors=gmm_label_train),
            dict(title="EM after {}".format(name), points=train_r,
                 colors=labels_r, cols=cols))

    em_names = ['EM'] + ['EM with {}'.format(n) for n in names]
    _save_time_bars(em_names, em_train_time, 'EM Train Time',
                    'UL2_EM_Traintime.png')
    _save_time_bars(em_names, em_predict_time, 'EM Query Time',
                    'UL2_EM_Querytime.png')

    # Neural network --------------------------------------------------------
    def run_nn(label, train, test):
        """Train/evaluate the network and print the query time it reports."""
        querytime = neural_network(label, train, Y_train, test, Y_test)
        print(label, querytime)

    # 4. Neural network with projected data
    run_nn("Original NN", X_train, X_test)
    for name in names:
        train_r, test_r = reduced[name]
        run_nn("NN with {}".format(name), train_r, test_r)

    # 5. Neural network with projected data plus the K-means cluster label
    #    appended as one extra feature column
    for name in names:
        train_r, test_r = reduced[name]
        labels_r, labels_r_test = kmeans_labels[name]
        run_nn("NN with {} clustering".format(name),
               np.column_stack((train_r, labels_r)),
               np.column_stack((test_r, labels_r_test)))