def quadratic_discriminant_analysis(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Train a Quadratic Discriminant Analysis (QDA) classifier on x_train and
    predict labels for x_test.

    The unfitted model is first handed to cross_val_analysis together with the
    training data; afterwards it is fitted on the whole training set.

    x_train, x_test: DataFrames of shape data x features.
    y_train: labels for x_train.
    y_test: accepted but not used inside this function.
    compute_threshold: when this is exactly True, a threshold is obtained from
        get_best_thresh(y_train, <training probabilities>) and applied to
        column 1 of the test probabilities, yielding defs.posCode or
        defs.negCode per row. Otherwise model.predict(x_test) is returned.

    Returns the tuple (predTest, metricsCV, model).
    '''
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    model = QuadraticDiscriminantAnalysis()
    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    # Guard clause: anything other than the literal True uses the default rule.
    if compute_threshold is not True:
        return model.predict(x_test), metricsCV, model

    test_proba = model.predict_proba(x_test)
    train_proba = model.predict_proba(x_train)
    cutoff = get_best_thresh(y_train, train_proba)
    predTest = np.where(test_proba[:, 1] >= cutoff, defs.posCode, defs.negCode)
    return predTest, metricsCV, model
def QDA(trainX, trainY, validX, testX):
    """Fit a regularised QDA model and score three data splits.

    Args:
        trainX: training features, also used for fitting.
        trainY: training labels.
        validX: validation features.
        testX: test features.

    Returns:
        Tuple (train_pred_Y, valid_pred_Y, test_pred_Y) holding column 1 of
        ``predict_proba`` for each split.
    """
    # NOTE(review): the original call was QuadraticDiscriminantAnalysis(0.1).
    # The first positional parameter of the estimator is ``priors``, and
    # recent scikit-learn releases accept keyword arguments only, so the
    # positional form either mis-binds or raises TypeError. 0.1 is assumed to
    # be the covariance regularisation strength -- confirm with the author.
    model = QuadraticDiscriminantAnalysis(reg_param=0.1)
    model.fit(trainX, trainY)

    train_pred_Y, valid_pred_Y, test_pred_Y = (
        model.predict_proba(split)[:, 1] for split in (trainX, validX, testX)
    )
    return train_pred_Y, valid_pred_Y, test_pred_Y
class myQDABinary(myModel):
    """Wrapper around QuadraticDiscriminantAnalysis.

    DataFrame inputs are cast to float32 before being passed to the wrapped
    estimator; any other input is passed through unchanged.
    """

    def make(self, make_params):
        """Create the wrapped estimator from ``make_params`` and return self."""
        self.model = QuadraticDiscriminantAnalysis(**make_params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the wrapped estimator.

        ``xtest`` and ``ytest`` are accepted but not used here.
        ``fit_params`` is an optional dict of extra keyword arguments that is
        forwarded to the estimator's ``fit``.
        """
        # A fresh dict per call: a ``{}`` default would be shared by all calls.
        fit_params = {} if fit_params is None else fit_params
        # isinstance (rather than an exact type comparison) also routes
        # DataFrame subclasses through the float32 cast.
        if isinstance(xtrain, pd.DataFrame):
            self.model.fit(xtrain.astype('float32'), ytrain.astype('float32'), **fit_params)
        else:
            self.model.fit(xtrain, ytrain, **fit_params)

    def predict(self, xs, threshold=0.5):
        """Return the estimator's class predictions.

        ``threshold`` is accepted but not used by this implementation.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict(xs.astype('float32'))
        return self.model.predict(xs)

    def predict_proba(self, xs):
        """Return probabilities from the wrapped estimator.

        NOTE(review): the return shape depends on the input type. A DataFrame
        yields only column 1 of ``predict_proba``; other inputs yield the full
        probability matrix (1-D inputs are reshaped to a single row first).
        This is kept as-is because callers may rely on it -- confirm whether
        it is intentional.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict_proba(xs.astype('float32'))[:, 1]
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
def BuildFinalModel():
    """Second-stage QDA: retrain per 'wheezy-copper-turtle-magic' value with
    pseudo-labelled test rows, print the CV AUC and write submission.csv.

    Reads the module-level ``train``, ``test`` and ``cols`` (``test`` must
    already carry a 'target' column), plus ``dataDirPath`` and
    ``scriptDirPath`` for the input/output files.
    """
    global train, test, cols
    magic = 'wheezy-copper-turtle-magic'
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    # One group of models per value of the magic column.
    for k in range(512):
        train2 = train[train[magic] == k]
        idx1 = train2.index
        test2 = test[test[magic] == k]
        validTestData = len(test2) != 0
        if not validTestData:
            print("WARNING_PREDICTION : Zero length test data for "" k: ", k,
                  " Length(train2): ", len(train2),
                  " Length(test2): ", len(test2))

        # Pseudo labels: keep only test rows with a very confident 'target'
        # and snap them to 1 / 0.
        confident = (test2['target'] <= 0.01) | (test2['target'] >= 0.99)
        test2p = test2[confident].copy()
        test2p.loc[test2p['target'] >= 0.5, 'target'] = 1
        test2p.loc[test2p['target'] < 0.5, 'target'] = 0
        train2p = pd.concat([train2.copy(), test2p], axis=0).reset_index(drop=True)

        # Feature selection by variance, fitted on train + pseudo rows.
        sel = VarianceThreshold(threshold=1.5).fit(train2p[cols])
        train3p = sel.transform(train2p[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols]) if validTestData else None

        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for fit_rows, held_out in skf.split(train3p, train2p['target']):
            # Rows past len(train3) are pseudo-labelled; keep them out of oof.
            real_held_out = held_out[held_out < len(train3)]

            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(train3p[fit_rows, :], train2p.loc[fit_rows]['target'])
            oof[idx1[real_held_out]] = clf.predict_proba(train3[real_held_out, :])[:, 1]
            if validTestData:
                preds[test2.index] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    # CV AUC over the out-of-fold predictions.
    auc = roc_auc_score(train['target'], oof)
    print('Pseudo Labeled QDA scores CV =', round(auc, 5))

    # Write the submission file.
    sub = pd.read_csv(os.path.join(dataDirPath, 'sample_submission.csv'))
    sub['target'] = preds
    sub.to_csv(os.path.join(scriptDirPath, 'submission.csv'), index=False)
def BuildFirstModel():
    """First-stage QDA: one cross-validated model group per
    'wheezy-copper-turtle-magic' value.

    Reads the module-level ``train``, ``test`` and ``cols``. Prints the CV AUC
    of the out-of-fold predictions and stores the averaged test predictions in
    ``test['target']``.
    """
    global train, test, cols
    magic = 'wheezy-copper-turtle-magic'
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    for i in range(512):
        # Restrict both frames to the rows whose magic value equals i.
        train2 = train[train[magic] == i]
        assert len(train2) != 0
        test2 = test[test[magic] == i]
        validTestData = len(test2) != 0
        if not validTestData:
            print("WARNING : Zero length test data for " " i: ", i,
                  " Length(train2): ", len(train2),
                  " Length(test2): ", len(test2))

        # Remember the original row positions before renumbering train2.
        idx1, idx2 = train2.index, test2.index
        train2 = train2.reset_index(drop=True)

        # Feature selection by variance.
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols]) if validTestData else None

        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for fit_rows, held_out in skf.split(train3, train2['target']):
            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(train3[fit_rows, :], train2.loc[fit_rows]['target'])
            oof[idx1[held_out]] = clf.predict_proba(train3[held_out, :])[:, 1]
            if validTestData:
                # Average the test predictions over the folds.
                preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    auc = roc_auc_score(train['target'], oof)
    print('QDA scores CV =', round(auc, 5))

    # Make the first-stage predictions available as the test 'target' column.
    test['target'] = preds
class FaceClassifier():
    """Thin wrapper that picks a scikit-learn classifier from a
    FaceClassifierModels member and exposes fit / predict."""

    def __init__(self, classifier=FaceClassifierModels.DEFAULT):
        # Ordered (member, builder) pairs; the first member equal to
        # `classifier` wins. Without a match, _clf stays None.
        builders = (
            (FaceClassifierModels.LINEAR_SVM,
             lambda: SVC(C=1.0, kernel="linear", probability=True)),
            (FaceClassifierModels.NAIVE_BAYES,
             lambda: GaussianNB()),
            (FaceClassifierModels.RBF_SVM,
             lambda: SVC(C=1, kernel='rbf', probability=True, gamma=2)),
            (FaceClassifierModels.NEAREST_NEIGHBORS,
             lambda: KNeighborsClassifier(1)),
            (FaceClassifierModels.DECISION_TREE,
             lambda: DecisionTreeClassifier(max_depth=5)),
            (FaceClassifierModels.RANDOM_FOREST,
             lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
            (FaceClassifierModels.NEURAL_NET,
             lambda: MLPClassifier(alpha=1)),
            (FaceClassifierModels.ADABOOST,
             lambda: AdaBoostClassifier()),
            (FaceClassifierModels.QDA,
             lambda: QuadraticDiscriminantAnalysis()),
        )

        self._clf = None
        for member, build in builders:
            if classifier == member:
                self._clf = build()
                break

        print("classifier={}".format(FaceClassifierModels(classifier)))

    def fit(self, embeddings, labels):
        """Fit the selected classifier on the given embeddings and labels."""
        self._clf.fit(embeddings, labels)

    def predict(self, vec):
        """Return the classifier's predict_proba output for ``vec``."""
        return self._clf.predict_proba(vec)
class QuadraticDiscriminantAnalysisImpl():
    """Holds QDA hyperparameters and builds the underlying SKLModel on fit."""

    def __init__(self, priors=None, reg_param=0.0, store_covariance=False, tol=0.0001, store_covariances=None):
        # Stored verbatim; they are only handed to SKLModel when fit() runs.
        self._hyperparams = dict(
            priors=priors,
            reg_param=reg_param,
            store_covariance=store_covariance,
            tol=tol,
            store_covariances=store_covariances,
        )

    def fit(self, X, y=None):
        """Create a new SKLModel from the stored hyperparameters, fit it and
        return self. ``y`` is only forwarded when it is not None."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's predict."""
        fitted = self._sklearn_model
        return fitted.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's predict_proba."""
        fitted = self._sklearn_model
        return fitted.predict_proba(X)
def qda(np_train_x, np_train_y, np_test_x, np_test_y, verified_num, rejected_num, p):
    """Fit QDA once and print error rates for every probability threshold.

    Args:
        np_train_x, np_train_y: training features and labels.
        np_test_x, np_test_y: test features and labels; the code compares the
            labels against 1 and 0.
        verified_num: divisor for the false-negative count.
        rejected_num: divisor for the false-positive count.
        p: iterable of probability thresholds.

    Returns:
        None. One report is printed per threshold.
    """
    model_QDA = QuadraticDiscriminantAnalysis()
    model_QDA.fit(np_train_x, np_train_y)

    # The probabilities do not depend on the threshold, so they are computed
    # once instead of once per element of p.
    positive_proba = model_QDA.predict_proba(np_test_x)[:, 1]

    for prob in p:
        predicted_values_QDA = np.where(positive_proba > prob, 1, 0)

        total_miss_classified_QDA = 0
        reject_wrong_QDA = 0
        verify_wrong_QDA = 0
        for actual, predicted in zip(np_test_y, predicted_values_QDA):
            total_miss_classified_QDA += abs(actual - predicted)
            if actual == 1 and predicted == 0:
                reject_wrong_QDA += 1  # actual 1 predicted as 0
            if actual == 0 and predicted == 1:
                verify_wrong_QDA += 1  # actual 0 predicted as 1

        # NOTE(review): the "type1"/"type2" wording below is printed exactly
        # as before; by the usual convention a false negative is a type II
        # error -- confirm before changing the user-visible text.
        print("\n----------------------Quadratic Discriminant Analysis prob:", prob, "--------------------")
        print("miss-classification rate :", total_miss_classified_QDA / (rejected_num + verified_num),
              "\nFalse negative rate (type1 error) :", reject_wrong_QDA / verified_num,
              "\nFalse positive rate (type2 error) :", verify_wrong_QDA / rejected_num)
def test_qda():
    # QDA classification.
    # Exercises fit / predict / predict_proba / predict_log_proba on the
    # module-level toy data (X6, X7, y4, y6, y7).
    model = QuadraticDiscriminantAnalysis()

    # The labels of the toy set are recovered exactly.
    model.fit(X6, y6)
    assert_array_equal(model.predict(X6), y6)

    # Same check on the X7 data ("1D data" in the original note).
    model.fit(X7, y6)
    assert_array_equal(model.predict(X7), y6)

    # Thresholding column 1 of the probabilities at 0.5 reproduces the labels.
    proba = model.predict_proba(X7)
    assert_array_equal((proba[:, 1] > 0.5) + 1, y6)

    # exp(log-probabilities) matches the probabilities to 8 decimals.
    log_proba = model.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(log_proba), proba, 8)

    # QDA shouldn't be able to separate those.
    model.fit(X6, y7)
    unseparable_pred = model.predict(X6)
    assert np.any(unseparable_pred != y7)

    # Classes should have at least 2 elements.
    with pytest.raises(ValueError):
        model.fit(X6, y4)
def _plot_probability_panel(model, panel_title, cho_false, cho_true, headers,
                            false_size, true_size, true_marker, y_offset=0.0):
    """Draw one panel on the current axes: scatter of both groups plus the
    model's column-1 probability as a colour mesh with the 0.5 contour."""
    # Per-axes title; plt.suptitle is figure-wide and would be overwritten by
    # the next panel.
    plt.title(panel_title)
    plt.scatter(cho_false[headers[0]], cho_false[headers[1]],
                label='CHO false', s=false_size, marker='o')
    plt.scatter(cho_true[headers[0]], cho_true[headers[1]],
                label='CHO true', s=true_size, marker=true_marker)

    # Evaluate the model on a grid spanning the current axis limits.
    nx, ny = 200, 100
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                         np.linspace(y_min, y_max, ny))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel() + y_offset])
    Z = Z[:, 1].reshape(xx.shape)

    plt.pcolormesh(xx, yy, Z, cmap='RdBu', norm=colors.Normalize(0., 1.), zorder=0)
    plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
    plt.legend()


def lda(df, headers, title):
    """Fit LDA and QDA on the first 80% of ``df`` and evaluate on the rest.

    Args:
        df: DataFrame containing the ``headers`` columns plus 'cho2_b' and
            'cho_b'. Missing values are replaced with 0 after the split.
        headers: feature column names. With exactly two headers, a two-panel
            figure of the class-1 probability surfaces is drawn as well.
        title: suffix for the evaluation titles.

    Returns:
        Tuple (lda, qda) with the two fitted estimators.
    """
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)

    split = int(len(df) * 0.8)
    df_train = df[:split].reset_index(drop=True).fillna(0)
    df_test = df[split:].reset_index(drop=True).fillna(0)

    # NOTE(review): the models are trained on 'cho2_b' but evaluated against
    # 'cho_b', and the plot below mixes both columns as well. Left unchanged
    # because the intended column cannot be determined here -- confirm.
    lda.fit(df_train[headers], df_train['cho2_b'])
    qda.fit(df_train[headers], df_train['cho2_b'])

    y = df_test['cho_b']
    for label, model in (('LDA ', lda), ('QDA ', qda)):
        y_pred = model.predict(df_test[headers])
        utils.evaluate(y, y_pred, 0, label + title)
        utils.plot_eval(df_test, y, y_pred, title=label + title)

    # plot areas
    if len(headers) == 2:
        cho_true = df_test[df_test['cho2_b'] == True]
        cho_false = df_test[df_test['cho_b'] == False]

        plt.figure(figsize=(12, 8))

        plt.subplot(2, 1, 1)
        # The tiny y offset on the LDA grid is kept from the original code.
        _plot_probability_panel(lda, 'LDA', cho_false, cho_true, headers,
                                false_size=8, true_size=15, true_marker='o',
                                y_offset=1/1000000000000)

        plt.subplot(2, 1, 2)
        _plot_probability_panel(qda, 'QDA', cho_false, cho_true, headers,
                                false_size=3, true_size=5, true_marker='x')

    return lda, qda
class FaceClassifier():
    """Thin wrapper that picks a scikit-learn estimator by comparing the
    ``.value`` of a FaceClassifierModels member, and exposes fit / predict /
    score."""

    def __init__(self, classifier=FaceClassifierModels.DEFAULT):
        def grid_searched_svc():
            # Scaled SVC tuned by a 5-fold accuracy grid search over linear
            # and RBF kernels.
            pipe_svc = make_pipeline(StandardScaler(),
                                     SVC(random_state=1, probability=True))
            param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
            param_grid = [
                {'svc__C': param_range, 'svc__kernel': ['linear']},
                {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']},
            ]
            return GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                                scoring='accuracy', cv=5, n_jobs=-1)

        # Ordered (member, builder) pairs; the first member whose value equals
        # classifier.value wins. Without a match, _clf stays None.
        builders = (
            (FaceClassifierModels.LINEAR_SVM,
             lambda: SVC(C=1.0, kernel="linear", probability=True)),
            (FaceClassifierModels.NAIVE_BAYES,
             lambda: GaussianNB()),
            (FaceClassifierModels.RBF_SVM,
             grid_searched_svc),
            (FaceClassifierModels.NEAREST_NEIGHBORS,
             lambda: KNeighborsClassifier(1)),
            (FaceClassifierModels.DECISION_TREE,
             lambda: DecisionTreeClassifier(max_depth=5)),
            (FaceClassifierModels.RANDOM_FOREST,
             lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
            (FaceClassifierModels.NEURAL_NET,
             lambda: MLPClassifier(solver='lbfgs', alpha=1e-2,
                                   hidden_layer_sizes=(512, 100), random_state=1)),
            (FaceClassifierModels.ADABOOST,
             lambda: AdaBoostClassifier()),
            (FaceClassifierModels.QDA,
             lambda: QuadraticDiscriminantAnalysis()),
        )

        self._clf = None
        for member, build in builders:
            if classifier.value == member.value:
                self._clf = build()
                break

        print("classifier={}".format(self._clf))

    def fit(self, embeddings, labels):
        """Fit the selected estimator on the given embeddings and labels."""
        self._clf.fit(embeddings, labels)

    def predict(self, vec):
        """Return the estimator's predict_proba output for ``vec``."""
        return self._clf.predict_proba(vec)

    def score(self, X, y):
        """Return the estimator's score on (X, y)."""
        return self._clf.score(X, y)
class QDA(object):
    """Delegating wrapper around QuadraticDiscriminantAnalysis."""

    def __init__(self, priors=None, reg_param=0., store_covariance=False, tol=1.0e-4):
        """
        :param priors: class priors, array, optional, shape=[n_classes]
        :param reg_param: float, optional, regularizes the covariance estimate
        :param store_covariance: boolean, if True the covariance matrices are
            computed and stored in the model's covariance_ attribute
        :param tol: threshold used for rank estimation
        """
        self.model = QuadraticDiscriminantAnalysis(
            priors=priors,
            reg_param=reg_param,
            store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        """Fit the wrapped model; returns None."""
        self.model.fit(X=x, y=y)

    def get_params(self, deep=True):
        """Return the wrapped model's parameters."""
        return self.model.get_params(deep=deep)

    def predict(self, x):
        """Return the wrapped model's class predictions."""
        return self.model.predict(X=x)

    def predict_log_dict(self, x):
        """Return the wrapped model's predict_log_proba output (the method
        name is kept as-is for existing callers)."""
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        """Return the wrapped model's class probabilities."""
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        """Return the wrapped model's score on (x, y)."""
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def set_params(self, **params):
        """Forward parameters to the wrapped model; returns None."""
        self.model.set_params(**params)

    def decision_function(self, x):
        """Apply the decision function to an array of samples."""
        return self.model.decision_function(X=x)

    def get_attribute(self):
        """Return (covariance, means, priors, rotations, scalings, classes)
        read from the fitted model.

        Reading ``covariance_`` presumably requires the model to have been
        created with store_covariance=True -- confirm against the installed
        scikit-learn version.
        """
        # Per-class covariance matrices, list of (n_features, n_features).
        covariance = self.model.covariance_
        # Class means, (n_classes, n_features). The fitted attribute carries a
        # trailing underscore; the previous ``self.model.means`` raised
        # AttributeError.
        means = self.model.means_
        # Class proportions summing to 1, (n_classes,).
        priors = self.model.priors_
        # Per-class rotation of the Gaussian distribution; list of arrays with
        # n_k = min(n_features, number of elements in class k).
        rotations = self.model.rotations_
        # Per-class scaling of the Gaussian, i.e. the variance in the rotated
        # coordinate system; list of arrays of shape [n_k].
        scalings = self.model.scalings_
        # Distinct class labels, (n_classes,).
        classes = self.model.classes_
        return covariance, means, priors, rotations, scalings, classes
class QDA(Model):
    """Model that wraps QuadraticDiscriminantAnalysis; declared with table
    input and vector output."""

    # TODO investigate NaN in results
    def __init__(self):
        super().__init__(input_type=NumericalDataTypesEnum.table,
                         output_type=NumericalDataTypesEnum.vector)
        self.__model = QuadraticDiscriminantAnalysis()

    def predict(self, data: InputData):
        """Return column 1 of predict_proba for ``data.features``."""
        probabilities = self.__model.predict_proba(data.features)
        return probabilities[:, 1]

    def fit(self, data: InputData):
        """Fit on the first part returned by train_test_data_setup(data)."""
        train_part, _unused = train_test_data_setup(data=data)
        self.__model.fit(train_part.features, train_part.target)

    def tune(self, data):
        """No tuning is performed; always returns 1."""
        return 1
class QuadraticDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Quadratic Discriminant Analysis predictor: fits a default
    QuadraticDiscriminantAnalysis and returns its bundled probabilities.
    '''

    def __init__(self):
        self.clf = QuadraticDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        '''Fit the classifier on the training data.'''
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        '''Return bundle_predictions applied to predict_proba(X_test).'''
        return self.bundle_predictions(self.clf.predict_proba(X_test))

    def get_k_best_k(self):
        '''Always returns 4.'''
        return 4
class QuadraticDiscriminantAnalysisImpl:
    """Builds ``Op(**hyperparams)`` at construction and delegates the
    estimator methods to it."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return self; ``y`` is only forwarded when
        it is not None."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's predict."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's predict_proba."""
        wrapped = self._wrapped_model
        return wrapped.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's decision_function."""
        wrapped = self._wrapped_model
        return wrapped.decision_function(X)
class QuadraticDiscriminant(AbstractModel):
    """AbstractModel implementation backed by a default
    QuadraticDiscriminantAnalysis."""

    def __init__(self, optimised):
        self.create_model(optimised)

    def create_model(self, optimised):
        """Create the estimator; ``optimised`` is accepted but not used."""
        self.model = QuadraticDiscriminantAnalysis()

    def fit_model(self, x_train, y_train):
        """Fit the estimator on the training data."""
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        """Return the estimator's class predictions."""
        return self.model.predict(x_test)

    def get_model(self):
        """Return the underlying estimator."""
        return self.model

    def predict_proba(self, x_test):
        """Return the estimator's class probabilities."""
        return self.model.predict_proba(x_test)

    def print(self):
        """Does nothing."""
        pass
class QDAClf(ClassifierHolderBase):
    """Classifier holder that uses a default QuadraticDiscriminantAnalysis
    and is marked as not retrainable."""

    def __init__(self, clf_params):
        super().__init__(clf_params)
        self.clf_obj = QuadraticDiscriminantAnalysis()
        self.clf_retrainable = False

    def normalize_data(self, vals_to_normalize):
        """Append op_subtr_input(v[i], v[i + 1]) for every adjacent pair to
        self.normalized_data."""
        pair_count = len(vals_to_normalize) - 1
        for left in range(pair_count):
            delta = self.op_subtr_input(vals_to_normalize[left],
                                        vals_to_normalize[left + 1])
            self.normalized_data.append(delta)

    def train_clf(self):
        """Fit on the accumulated training data and record the classes."""
        self.can_be_fit = False
        self.clf_obj.fit(self.train_X, self.train_y)
        self.train_classes = self.clf_obj.classes_

    def predict_feature_probability(self):
        """Return predict_proba for self.test_X."""
        return self.clf_obj.predict_proba(self.test_X)

    def enough_min_train_data(self):
        """True once accumulated_samples has reached num_train_samples."""
        return self.accumulated_samples >= self.num_train_samples

    def add_train_y_value(self, val_list):
        """Count one more sample and append op_bin_input(v[i], v[i + 1]) for
        every adjacent pair to self.train_y."""
        self.accumulated_samples += 1
        pair_count = len(val_list) - 1
        for left in range(pair_count):
            self.train_y.append(self.op_bin_input(val_list[left], val_list[left + 1]))

    def add_new_train_item(self, vals_to_normalize_x, vals_to_normalize_y):
        """Normalize the x values, store them, then add the y values."""
        self.normalize_data(vals_to_normalize_x)
        self.add_train_x_values()
        self.add_train_y_value(vals_to_normalize_y)

    def get_actual_next_value(self):
        """Return the most recently appended training target."""
        return self.train_y[-1]
def runModel(X_train, X_test, Y_train, Y_test):
    """
    Create one QDA model per class slot and run predictions.

    Args:
        X_train, Y_train: indexable collections; entry ``i`` holds the
            training features / labels for model ``i``.
        X_test: indexable collection; entry ``i`` holds the test features for
            model ``i``.
        Y_test: only ``Y_test[0].shape[0]`` is read, to size the output.

    Returns:
        Tuple (Y_predict, models). Column ``i`` of Y_predict is column 1 of
        model ``i``'s predict_proba output; models is the list of fitted
        estimators.
    """
    numberClasses = 4
    # The ``np.float`` alias was removed in NumPy 1.24; it was the builtin
    # float all along, so the resulting dtype is unchanged.
    Y_predict = np.zeros((Y_test[0].shape[0], numberClasses), dtype=float)
    models = []

    for i in range(numberClasses):
        print('Model: ', i + 1)
        model = QuadraticDiscriminantAnalysis()
        model.fit(X_train[i], Y_train[i])
        Y_predict[:, i] = model.predict_proba(X_test[i])[:, 1]
        models.append(model)

    return Y_predict, models
def test_qda():
    # QDA classification.
    # Exercises fit / predict / predict_proba / predict_log_proba on the
    # module-level toy data (X6, X7, y4, y6, y7).
    model = QuadraticDiscriminantAnalysis()

    # The labels of the toy set are recovered exactly.
    model.fit(X6, y6)
    assert_array_equal(model.predict(X6), y6)

    # Same check on the X7 data ("1D data" in the original note).
    model.fit(X7, y6)
    assert_array_equal(model.predict(X7), y6)

    # Thresholding column 1 of the probabilities at 0.5 reproduces the labels.
    proba = model.predict_proba(X7)
    assert_array_equal((proba[:, 1] > 0.5) + 1, y6)

    # exp(log-probabilities) matches the probabilities to 8 decimals.
    log_proba = model.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(log_proba), proba, 8)

    # QDA shouldn't be able to separate those.
    model.fit(X6, y7)
    unseparable_pred = model.predict(X6)
    assert np.any(unseparable_pred != y7)

    # Classes should have at least 2 elements.
    assert_raises(ValueError, model.fit, X6, y4)
def main():
    """Two-stage QDA pipeline.

    Step 1 fits cross-validated QDA models per 'wheezy-copper-turtle-magic'
    value and predicts the test set. Step 2 adds confidently predicted test
    rows as pseudo labels, derives 3 + 3 cluster labels per target class with
    Gaussian mixtures, fits QDA on the six cluster labels and writes
    'submission.csv'.
    """
    magic = 'wheezy-copper-turtle-magic'

    # ============================================
    # === Loading data
    # ============================================
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    cols = [name for name in train.columns if name not in ['id', 'target']]
    cols.remove(magic)

    # ============================================
    # === Step 1 - Build first QDA model and predict test
    # ============================================
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    # One group of models per magic value.
    for i in range(512):
        train2 = train[train[magic] == i]
        test2 = test[test[magic] == i]
        idx1, idx2 = train2.index, test2.index
        train2 = train2.reset_index(drop=True)

        # Feature selection by variance.
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for fit_rows, held_out in skf.split(train3, train2['target']):
            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(train3[fit_rows, :], train2.loc[fit_rows]['target'])
            oof[idx1[held_out]] = clf.predict_proba(train3[held_out, :])[:, 1]
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    auc = roc_auc_score(train['target'], oof)
    print('QDA scores CV =', round(auc, 5))

    # ============================================
    # === Step2 - Add pseudo label data and build second model
    # ============================================
    n_init = 10
    test['target'] = preds

    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    for k in range(512):
        train2 = train[train[magic] == k]
        idx1 = train2.index
        test2 = test[test[magic] == k]

        # Pseudo labels: keep only confidently predicted test rows and snap
        # their 'target' to 1 / 0.
        confident = (test2['target'] <= 0.01) | (test2['target'] >= 0.99)
        test2p = test2[confident].copy()
        test2p.loc[test2p['target'] >= 0.5, 'target'] = 1
        test2p.loc[test2p['target'] < 0.5, 'target'] = 0
        train2p = pd.concat([train2.copy(), test2p], axis=0).reset_index(drop=True)

        # Feature selection by variance, fitted on train + pseudo rows.
        sel = VarianceThreshold(threshold=1.5).fit(train2p[cols])
        train3p = sel.transform(train2p[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        # Row positions of each target class inside train2p.
        target_values = train2p["target"].values
        target_0 = np.argwhere(target_values == 0).reshape(-1)
        target_1 = np.argwhere(target_values == 1).reshape(-1)

        n_cols = train3.shape[1]
        # Start at -1 so a row that was never assigned a cluster stands out.
        cluster_labels = np.zeros_like(target_values) - 1
        proba_x_0 = np.zeros((len(target_0), n_cols * 2))
        proba_x_1 = np.zeros((len(target_1), n_cols * 2))

        # Per-column 2-component GMM. The mixtures have no random_state, so
        # the fit order (class 0, then class 1, for each column) is kept.
        for j in range(n_cols):
            for rows, proba_block in ((target_0, proba_x_0), (target_1, proba_x_1)):
                column_gmm = GaussianMixture(n_components=2,
                                             max_iter=10000,
                                             n_init=n_init,
                                             means_init=[[-1], [1]],
                                             init_params="kmeans")
                column_gmm.fit(train3p[rows, j:j + 1])
                proba_block[:, j * 2:(j + 1) * 2] = column_gmm.predict_proba(train3p[rows, j:j + 1])

        # 3-component GMM on the stacked per-column probabilities.
        kms_0 = GaussianMixture(n_components=3, max_iter=10000,
                                n_init=n_init, init_params="kmeans")
        kms_0.fit(proba_x_0)
        kms_1 = GaussianMixture(n_components=3, max_iter=10000,
                                n_init=n_init, init_params="kmeans")
        kms_1.fit(proba_x_1)

        # Cluster ids 0-2 belong to target 0, ids 3-5 to target 1.
        cluster_labels[target_0] = kms_0.predict(proba_x_0)
        cluster_labels[target_1] = kms_1.predict(proba_x_1) + 3

        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for fit_rows, held_out in skf.split(train3p, cluster_labels):
            # Rows past len(train3) are pseudo-labelled; keep them out of oof.
            real_held_out = held_out[held_out < len(train3)]

            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(train3p[fit_rows, :], cluster_labels[fit_rows])

            # P(target == 1) is the sum over the three target-1 clusters.
            val_six = clf.predict_proba(train3p[real_held_out, :])
            oof[idx1[real_held_out]] = val_six[:, 3] + val_six[:, 4] + val_six[:, 5]

            test_six = clf.predict_proba(test3)
            test_prediction = test_six[:, 3] + test_six[:, 4] + test_six[:, 5]
            preds[test2.index] += test_prediction / skf.n_splits

    auc = roc_auc_score(train['target'], oof)
    print('Pseudo Labeled QDA scores CV =', round(auc, 5))

    # ============================================
    # === Make Submission
    # ============================================
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['target'] = preds
    sub.to_csv('submission.csv', index=False)
# Flatten the third label set to 1-D.
Ty3 = np.ravel(Ty3)
# Turn the third feature table into a plain array (the index becomes a
# column), then drop the first two columns.
TX3 = TX3.reset_index().values
TX3 = np.delete(TX3, [0, 1], axis=1)

# LOG prediction probabilities (log1..log3 are fitted elsewhere in the file;
# presumably logistic-regression models -- confirm).
log1Prob = log1.predict_proba(TX1)
log2Prob = log2.predict_proba(TX2)
log3Prob = log3.predict_proba(TX3)

# LDA prediction probabilities.
lda1Prob = lda1.predict_proba(TX1)
lda2Prob = lda2.predict_proba(TX2)
lda3Prob = lda3.predict_proba(TX3)

# QDA prediction probabilities.
qda1Prob = qda1.predict_proba(TX1)
qda2Prob = qda2.predict_proba(TX2)
qda3Prob = qda3.predict_proba(TX3)

# getROCdata is meant to return a dataframe with the 11 columns listed
# (the list itself is not part of this excerpt).
# truthVals is a column vector containing the correct classification.
# probs is a column vector of the probability that the model believes the
# datapoint to be of class 1.
# thresholds is a vector of probability thresholds to use when deciding
# which class to predict.
# NOTE(review): only the first lines of the function body are visible here.
def getROCdata(truthVals, probs, thresholds):
    # One output row of 11 values per threshold.
    row_array = np.zeros([len(thresholds), 11])
    for j in range(len(thresholds)):
        # Scratch row for the current threshold.
        add_array = np.zeros([1, 11])
        # Per-sample prediction buffer, initialised to zeros.
        Predict = np.zeros(len(probs))
What is the training misclassification rate? """
# Fit LDA on the full data set and cross-tabulate predictions vs. actuals.
lda1 = LDA(solver="svd", store_covariance=True)
lda1.fit(warX,warY)
my_lda_pred = pd.DataFrame()
# Map the 0/non-0 predictions and the 'start' column to "No"/"Yes" labels.
my_lda_pred["pred"] = ["No" if x == 0 else "Yes" for x in lda1.predict(warX)]
my_lda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]]
conf_lda = pd.crosstab(my_lda_pred["pred"], my_lda_pred["actual"])
conf_lda
# Misclassification rate: off-diagonal counts divided by the number of rows.
(1/(war.shape[0])) * (conf_lda.iloc[1,0] + conf_lda.iloc[0,1])
""" 6.69% """
# Same procedure with QDA.
# NOTE(review): `store_covariances` is the older spelling of this option;
# newer scikit-learn releases use `store_covariance` -- confirm against the
# version this script targets.
qda1 = QDA(store_covariances=True)
qda1.fit(warX,warY)
# Class probabilities; not used again in the lines shown here.
test = qda1.predict_proba(warX)
my_qda_pred = pd.DataFrame()
# The predicted labels themselves are compared against .5 here.
my_qda_pred["pred"] = ["No" if x < .5 else "Yes" for x in qda1.predict(warX)]
my_qda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]]
conf_qda = pd.crosstab(my_qda_pred["pred"], my_qda_pred["actual"])
conf_qda
# Misclassification rate: off-diagonal counts divided by the number of rows.
(1/(war.shape[0])) * (conf_qda.iloc[1,0] + conf_qda.iloc[0,1])
def run(self, verbose):
    """Train six level-0 classifiers and a logistic-regression stacker.

    Every level-0 model is fitted on ``self.attrs.X_train`` / ``Y_train`` and
    its predict_proba outputs for the test, train, testsub and blindsub sets
    are stacked column-wise as features for the final model. The final model
    and its three prediction arrays are stored on ``self.attrs``; the stacked
    AUC on the test set is printed.

    Progress messages are printed only when ``verbose`` is greater than 0.
    """
    attrs = self.attrs
    chatty = verbose > 0

    def announce(message):
        if chatty:
            print(message)

    def build_forest():
        forest = RandomForestClassifier(
            n_estimators=attrs.rf_n_estimators,
            max_depth=attrs.rf_max_depth,
            min_samples_split=attrs.rf_min_samples_split,
            min_samples_leaf=attrs.rf_min_samples_leaf,
            random_state=1,
            n_jobs=attrs.rf_n_jobs)
        if attrs.rf_use_rfe:
            announce(" using RFE")
            forest = RFE(forest, n_features_to_select=150, step=20)
        return forest

    announce("Running ensemble")

    # Level 0: (progress message, builder) in the order the models are
    # trained and stacked.
    level0 = (
        ("Running Random Forest Classifier", build_forest),
        ("Running SVM Classifier",
         lambda: SVC(C=attrs.svc_C,
                     gamma=attrs.svc_gamma,
                     kernel=attrs.svc_kernel,
                     probability=attrs.svc_probability,
                     random_state=2)),
        ("Quadratic Discriminant Analysis Classifier",
         lambda: QuadraticDiscriminantAnalysis()),
        ("GaussianNB Classifier",
         lambda: GaussianNB()),
        ("KNeighbors Classifier",
         lambda: KNeighborsClassifier(n_neighbors=attrs.kn_n_neighbors,
                                      weights=attrs.kn_weights)),
        ("Logistic Regression Classifier",
         lambda: LogisticRegression(C=attrs.lr_C, random_state=6)),
    )

    test_parts, train_parts, testsub_parts, blindsub_parts = [], [], [], []
    for banner, build in level0:
        announce(banner)
        model = build()
        model.fit(attrs.X_train, attrs.Y_train)
        # Predict on X_test, X_train, X_testsub, X_blindsub (in that order).
        test_parts.append(model.predict_proba(attrs.X_test))
        train_parts.append(model.predict_proba(attrs.X_train))
        testsub_parts.append(model.predict_proba(attrs.X_testsub))
        blindsub_parts.append(model.predict_proba(attrs.X_blindsub))

    # Level 1: logistic regression on the stacked level-0 probabilities.
    announce("Stacked Classifier")
    FeaturesTrain1 = np.hstack(train_parts)
    ModelFinal = LogisticRegression(random_state=49)
    ModelFinal.fit(FeaturesTrain1, attrs.Y_train)

    # Save the final model in case we want to work with it later.
    attrs.final_model = ModelFinal

    Features_test1 = np.hstack(test_parts)
    Features_testsub1 = np.hstack(testsub_parts)
    Features_blindsub1 = np.hstack(blindsub_parts)

    # Final predictions.
    attrs.final_pred = ModelFinal.predict_proba(Features_test1)
    attrs.final_pred_testsub = ModelFinal.predict_proba(Features_testsub1)
    attrs.final_pred_blindsub = ModelFinal.predict_proba(Features_blindsub1)

    # AUC of the stacked model on the test set.
    announce("Calculating AUC")
    fpr, tpr, thresholds = roc_curve(attrs.Y_test, attrs.final_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    print("AUC with Stacking: ", roc_auc)
# Finish and save the LightGBM split-importance figure started above.
plt.title('LightGBM Feature Importance Based on Split')
plt.tight_layout()
plt.savefig('../output/lgbm_importances_split.jpg')
plt.show()

# X_train_df, X_valid_df, X_test_df = normalize_data(X_train_df, X_valid_df, X_test_df, features)

# Fit QDA on the training features; fit() returns the estimator, so
# qda_model and qda refer to the same fitted object.
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda_model = qda.fit(X_train_df[features], y_train_df.values)

# Class predictions for the three splits.
qda_valid_predict = qda.predict(X_valid_df[features])
qda_train_predict = qda.predict(X_train_df[features])
qda_test_predict = qda.predict(X_test_df[features])

# Class probabilities for the three splits.
qda_train_prob = qda.predict_proba(X_train_df[features])
qda_valid_prob = qda.predict_proba(X_valid_df[features])
qda_test_prob = qda.predict_proba(X_test_df[features])

# Accuracy per split.
qda_train_acc = accuracy_score(y_train_df.values, qda_train_predict)
qda_valid_acc = accuracy_score(y_valid_df.values, qda_valid_predict)
qda_test_acc = accuracy_score(y_test_df.values, qda_test_predict)
print('Train Accuracy is {}'.format(qda_train_acc))
print('Valid Accuracy is {}'.format(qda_valid_acc))
print('Test Accuracy is {}'.format(qda_test_acc))

# Backtest using the test predictions and column 1 of the test probabilities.
qda_profit, qda_cum_profit = utils.backtest(qda_test_predict, qda_test_prob[:, 1], 'qda')

# LDA estimator for the next section of the script.
lda = LinearDiscriminantAnalysis(store_covariance=True)
def _plotProbabilityPanel(subplotCode, yPred, cClasses, dimVal, meshShape, Xrr,
                          cValGood, nClasses, panelTitle, removeTickLabels):
    """Draw one winner-take-all probability surface with the data scattered on top.

    yPred is the (nMeshPoints x nClasses) output of predict_proba on the mesh,
    cClasses the (nClasses x 4) RGBA colour of each class and meshShape the
    2-D shape of the plotting mesh.
    """
    # Winner takes all: each mesh point gets the colour of its most probable
    # class scaled by that probability; alpha encodes the relative confidence.
    rowMax = yPred.max(axis=1)
    cWinner = (yPred == rowMax[:, np.newaxis]).astype('float')
    Xmc = np.dot(cWinner*yPred, cClasses)
    Xmc[:, 3] = (rowMax/yPred.max())*dimVal

    # Plot the surface of probability
    plt.subplot(subplotCode)
    Zplot = Xmc.reshape(meshShape[0], meshShape[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:, 0], Xrr[:, 1], c=cValGood, s=40, zorder=1)
    else:
        # Only one discriminant function: spread the points randomly along y.
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0, c=cValGood, s=40, zorder=1)
    plt.title(panelTitle)
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    if removeTickLabels:
        ax = plt.gca()
        ax.set_xticklabels(['']*len(ax.get_xticklabels()))
        ax.set_yticklabels(['']*len(ax.get_yticklabels()))


def discriminatePlot(X, y, cVal, titleStr='', figdir='.', Xcolname = None, plotFig = False, removeTickLabels = False, testInd = None):
    """Frederic's robust wrapper for discriminant analysis.

    Performs LDA, QDA and random forest classification after error checking,
    optionally generates plots and returns cross-validated performance.

    Parameters
    ----------
    X : np.array, n rows x p parameters.
    y : np.array of n group labels.
    cVal : np.array with one colour code per data point; it should be the
        same for all data points belonging to the same group.
    titleStr : title for the plots; also used as the saved figure file name.
    figdir : directory (folder name) in which the figure is saved.
    Xcolname : np.array or list of strings with column names for the
        printout of the LDA weights; 'C<index>' is used when None.
    plotFig : when True, plot (and save) the three decision surfaces.
    removeTickLabels : when True, blank the tick labels of each panel.
    testInd : optional boolean np.array over the rows of X selecting a fixed
        test set; when None a stratified k-fold cross-validation is used.

    Returns
    -------
    ldaYes, qdaYes, rfYes : number of correctly classified test samples.
    cvCount : number of test samples that were scored.
    ldaP, qdaP, rfP : binomial p-value of doing at least that well by chance.
    nClasses : number of classes kept for the analysis.
    weights : LDA weights of the first discriminant functions expressed in
        the original feature space (n components x p parameters).

    Nine values of -1 are returned when there is not enough data.
    """

    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])

    if testInd is not None:
        # Check for goodInd - should be an np.array of dtype=bool
        # Transform testInd into an index inside xGood and yGood
        testIndx = testInd.nonzero()[0]
        goodIndx = goodInd.nonzero()[0]
        testInd = np.hstack([ np.where(goodIndx == testval)[0] for testval in testIndx])
        testSet = set(testInd.tolist())   # O(1) membership instead of scanning the array
        trainInd = np.asarray([i for i in range(len(goodIndx)) if i not in testSet])

    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]

    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print ('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1 , -1, -1, -1, -1, -1

    if testInd is None:
        cvFolds = min(min(classesCount), CVFOLDS)
        if (cvFolds < CVFOLDS):
            print ('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))
    else:
        cvFolds = 1

    # Data size and color values
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code for each class
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)

    # Use a uniform prior
    myPrior = np.ones(nClasses)*(1.0/nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    nDmax = int(np.fix(np.sqrt(nX//5)))
    if nDmax < nD:
        print ('Warning: Insufficient data for', nD, 'parameters. PCA projection to', nDmax, 'dimensions.' )
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print ('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaYes = 0
    qdaYes = 0
    rfYes = 0
    cvCount = 0

    if testInd is None:
        # NOTE(review): this is the pre-0.18 scikit-learn `cross_validation`
        # API (labels passed to the constructor); newer releases need
        # model_selection.StratifiedKFold(n_splits).split(X, y).
        skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    else:
        skf = [(trainInd,testInd)]

    for train, test in skf:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specity the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue

        # Only test samples whose class was seen in training can be scored.
        goodIndTest = np.array([b in trainClasses for b in yGood[test]], dtype=bool)
        nTest = int(goodIndTest.sum())
        if nTest == 0:
            continue

        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        # score() is the fraction correct over the nTest scored samples, so
        # the number correct is score*nTest (not score*len(test)).
        XrTest = Xr[test[goodIndTest]]
        yTest = yGood[test[goodIndTest]]
        ldaYes += np.around(ldaMod.score(XrTest, yTest)*nTest)
        qdaYes += np.around(qdaMod.score(XrTest, yTest)*nTest)
        rfYes += np.around(rfMod.score(XrTest, yTest)*nTest)
        cvCount += nTest

    # Without a single scored test sample there is no performance to report
    # (and the percentages below would divide by zero).
    if cvCount == 0:
        print ('Error in ldaPlot: No test data could be scored in the cross-validation')
        return -1, -1, -1, -1 , -1, -1, -1, -1, -1

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)

    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print ('Error in ldaPlot: labels do not match')

    # Print the five largest coefficients of first 3 DFA
    MAXCOMP = 3        # Maximum number of DFA componnents
    MAXWEIGHT = 5      # Maximum number of weights printed for each componnent

    ncomp = min(MAXCOMP, nClasses-1)
    nweight = min(MAXWEIGHT, nD)

    # The scalings_ has the eigenvectors of the LDA in columns and the pca.componnents has the eigenvectors of PCA in columns
    weights = np.dot(ldaMod.scalings_[:,0:ncomp].T, pca.components_)

    print('LDA Weights:')
    for ic in range(ncomp):
        idmax = np.argsort(np.abs(weights[ic,:]))[::-1]
        print('DFA %d: '%ic, end = '')
        for iw in range(nweight):
            if Xcolname is None:
                colstr = 'C%d' % idmax[iw]
            else:
                colstr = Xcolname[idmax[iw]]
            print('%s %.3f; ' % (colstr, float(weights[ic, idmax[iw]]) ), end='')
        print()

    if plotFig:
        dimVal = 0.8    # Overall diming of background so that points can be seen

        # Obtain fits in this rotated space for display purposes
        ldaMod.fit(Xrr, yGood)
        qdaMod.fit(Xrr, yGood)
        rfMod.fit(Xrr, yGood)

        XrrMean = Xrr.mean(0)

        # Make a mesh for plotting: the first two DFs span the mesh, any
        # further DF is held at its mean value.
        x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
        xm1 = np.reshape(x1, -1)
        xm2 = np.reshape(x2, -1)
        nxm = np.size(xm1)
        Xm = np.zeros((nxm, Xrr.shape[1]))
        Xm[:,0] = xm1
        if Xrr.shape[1] > 1 :
            Xm[:,1] = xm2
        for ix in range(2,Xrr.shape[1]):
            Xm[:,ix] = XrrMean[ix]

        # Predict values on mesh for plotting based on the first two DFs
        yPredLDA = ldaMod.predict_proba(Xm)
        yPredQDA = qdaMod.predict_proba(Xm)
        yPredRF = rfMod.predict_proba(Xm)

        plt.figure(facecolor='white', figsize=(10,4))
        meshShape = np.shape(x1)
        _plotProbabilityPanel(131, yPredLDA, cClasses, dimVal, meshShape, Xrr, cValGood, nClasses,
                              '%s: LDA %d/%d' % (titleStr, ldaYes, cvCount), removeTickLabels)
        _plotProbabilityPanel(132, yPredQDA, cClasses, dimVal, meshShape, Xrr, cValGood, nClasses,
                              '%s: QDA %d/%d' % (titleStr, qdaYes, cvCount), removeTickLabels)
        _plotProbabilityPanel(133, yPredRF, cClasses, dimVal, meshShape, Xrr, cValGood, nClasses,
                              '%s: RF %d/%d' % (titleStr, rfYes, cvCount), removeTickLabels)

        # Save before showing: once show() returns the figure may already
        # have been closed and savefig would write an empty image.
        plt.savefig('%s/%s.png' % (figdir,titleStr), format='png', dpi=1000)
        plt.show()

    # Results
    ldaYes = int(ldaYes)
    qdaYes = int(qdaYes)
    rfYes = int(rfYes)

    # Probability of getting at least that many correct by chance:
    # P(K >= k) = sf(k - 1) for a binomial with success rate 1/nClasses.
    p = 1.0/nClasses
    ldaP = binom.sf(ldaYes - 1, cvCount, p)
    qdaP = binom.sf(qdaYes - 1, cvCount, p)
    rfP = binom.sf(rfYes - 1, cvCount, p)

    print ("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0/nClasses))
    print ("%s LDA: %.2f %% (%d/%d p=%.4f)" % (titleStr, 100.0*ldaYes/cvCount, ldaYes, cvCount, ldaP))
    print ("%s QDA: %.2f %% (%d/%d p=%.4f)" % (titleStr, 100.0*qdaYes/cvCount, qdaYes, cvCount, qdaP))
    print ("%s RF: %.2f %% (%d/%d p=%.4f)" % (titleStr, 100.0*rfYes/cvCount, rfYes, cvCount, rfP))
    return ldaYes, qdaYes, rfYes, cvCount, ldaP, qdaP, rfP, nClasses, weights
# Fit a QDA classifier with default settings on the training split.
QDAModel = QDA()
QDAModel.fit(X_train, y_train)

# ----------------------------------------------------
# Calculating Details
# score() reports the mean accuracy on the given data and labels.
print('QDAModel Train Score is : ', QDAModel.score(X_train, y_train))
print('QDAModel Test Score is : ', QDAModel.score(X_test, y_test))
print("=" * 10)
# ---------------
# Fitted attributes: per-class feature means and the class labels seen in fit.
print('QDAModel means are : ', QDAModel.means_)
print('QDAModel classes are : ', QDAModel.classes_)
print("=" * 25)
# ----------------------------------------------------
# Calculating Prediction
y_pred = QDAModel.predict(X_test)
y_pred_prob = QDAModel.predict_proba(X_test)
# Show only the first five rows of each for a quick sanity check.
print('Prediction Probabilities Value for QDAModel is : \n', y_pred_prob[:5])
print('Pred Value for QDAModel is : ', y_pred[:5])
print('True Value for QDAModel is : ', y_test[:5])
print("=" * 25)
# ----------------------------------------------------
# Per-class precision / recall / F1 summary on the test split.
ClassificationReport = classification_report(y_test, y_pred)
print(ClassificationReport)
print("=" * 10)
# ---------------
# Confusion matrix of true versus predicted test labels.
CM = confusion_matrix(y_test, y_pred)
print(CM)
print("=" * 10)
# ---------------
# plt.figure()
# sns.heatmap(CM, center = True, annot=True, fmt="d")
def _read_fold_indices(path):
    """Parse an index file into ``{repetition: [index_list, ...]}``.

    Each non-blank line has the form ``<repetition>\\t<i,j,k,...>``; the
    index lists of one repetition are kept in file order.
    """
    fold_indices = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            row = line.split('\t')
            fold_indices.setdefault(int(row[0]), []).append(
                list(map(int, row[1].split(","))))
    return fold_indices


def validation(cancer, local, signature_file, datafile='_data_mRNA.txt',
               trainfile='_train_idx.txt', testfile='_test_idx.txt',
               dtype='mRNA'):
    '''Score every selected signature with QDA and write one row per subject.

    For each signature in ``signature_file`` the test indices of its fold are
    split in two halves. Each half is scored twice: once as "test" (model
    trained on the training indices only) and once as "validation" (model
    trained on the training indices plus the *other* half). The per-subject
    class probabilities are written, in tidy form, to
    ``<local>/<signature file name up to its first dot>.tidy``.

    Parameters
    ----------
    cancer : name of the cancer; selects the ``<local>/<cancer>`` directory
        and is the prefix of the data / index file names.
    local : root directory of the data; also where the output is written.
    signature_file : CSV file with one signature per row. The columns read
        here are nodes, fold, train, cancer, dtype, bc, perf_couple, score.
    datafile, trainfile, testfile : suffixes appended to ``cancer`` to build
        the data and index file names.
    dtype : not used by this function (the value written out is taken from
        the signature row).

    Returns
    -------
    None
    '''
    print('Validation step for cancer {}'.format(cancer), end='')
    # read signature database
    signatures = pd.read_csv(signature_file, sep=",")

    # read data and labels (labels are the first level of the column index)
    loc = os.path.join(local, cancer)
    data = pd.read_table(os.path.join(loc, cancer + datafile), sep="\t",
                         index_col=0, header=[0, 1])
    lbl = np.asarray(
        data.columns.get_level_values(0).astype(float).astype(int))

    train_idx = _read_fold_indices(os.path.join(loc, cancer + trainfile))
    test_idx = _read_fold_indices(os.path.join(loc, cancer + testfile))

    cls = QDA()
    final_db = []

    # loop over each selected signature
    for i, signature in signatures.iterrows():
        n = np.asarray(signature.nodes.split(';'), dtype=int)
        # Note the naming of the signature table: `fold` selects the
        # repetition and `train` the fold inside that repetition.
        repetition = int(signature.fold)
        fold = int(signature.train)
        idx_train = train_idx[repetition][fold]
        idx_test = test_idx[repetition][fold]
        Train = data.iloc[n, idx_train].T

        # take the half of indices for temporary training and the other half for validation
        half = len(idx_test) // 2
        idx_test1 = idx_test[:half]
        idx_test2 = idx_test[half:]

        Test1 = data.iloc[n, idx_test1].T
        Test2 = data.iloc[n, idx_test2].T
        lbl_train = lbl[idx_train]
        lbl_test1 = lbl[idx_test1]
        lbl_test2 = lbl[idx_test2]

        # train over training and predict over test
        cls.fit(Train, lbl_train)
        lbl_pred_test1_as_test = cls.predict_proba(Test1)
        lbl_pred_test2_as_test = cls.predict_proba(Test2)

        # train over training + test and predict over validation set
        cls.fit(pd.concat([Train, Test1], ignore_index=True),
                np.concatenate([lbl_train, lbl_test1]))
        lbl_pred_test2_as_val = cls.predict_proba(Test2)

        cls.fit(pd.concat([Train, Test2], ignore_index=True),
                np.concatenate([lbl_train, lbl_test2]))
        lbl_pred_test1_as_val = cls.predict_proba(Test1)

        for val_scores, test_scores, val_labels, test_labels, val_idx, test_idx_ in zip(
                [lbl_pred_test1_as_val, lbl_pred_test2_as_val],
                [lbl_pred_test2_as_test, lbl_pred_test1_as_test],
                [lbl_test1, lbl_test2], [lbl_test2, lbl_test1],
                [idx_test1, idx_test2], [idx_test2, idx_test1]):
            for group_name, scores, labels, idxs in zip(
                    ['validation', 'test'], [val_scores, test_scores],
                    [val_labels, test_labels], [val_idx, test_idx_]):
                for subj_score, subj_label, subj_idx in zip(
                        scores, labels, idxs):
                    # Two-class problem: one probability per class.
                    prob_cl0, prob_cl1 = subj_score
                    # Key order fixes the column order of the output file.
                    final_db.append({
                        'fold': fold,
                        'repetition': signature.fold,
                        'cancer': signature.cancer,
                        'dtype': signature['dtype'],
                        'signature_nodes': signature.nodes,
                        'signatureID': i,
                        'betweenness_centrality': signature.bc,
                        'performance_chunck_couples': signature.perf_couple,
                        'training_accuracy_score_signature': signature.score,
                        'subjectID': subj_idx,
                        'subject_label': subj_label,
                        'subject_proba_cl0': prob_cl0,
                        'subject_proba_cl1': prob_cl1,
                        'subject_group': group_name,
                    })

    pd.DataFrame(data=final_db).to_csv(os.path.join(
        local, signature_file.split(os.sep)[-1].split(".")[0] + ".tidy"),
        sep=",", index=False)
    print('[done]')
    return
completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1],
predictions_RF = RF.predict_proba(Xtest) # AdaBoost AD = AdaBoostClassifier() AD.fit(Xtrain, Ytrain) predictions_AD = AD.predict_proba(Xtest) # Naive Bayes NB = GaussianNB() NB.fit(Xtrain, Ytrain) predictions_NB = NB.predict_proba(Xtest) # QDA QDA = QuadraticDiscriminantAnalysis() QDA.fit(Xtrain, Ytrain) predictions_QDA = QDA.predict_proba(Xtest) # Voting from sklearn.ensemble import VotingClassifier eclf1 = VotingClassifier(estimators=[('MLP', MLP), ('SGD', SGD), ('NN', NN), # ('GP', GP), ('DT', DT), ('RF', RF), ('AD', AD), # ('NB', NB), ('QDA', QDA)], voting='soft', n_jobs=-1)
# Finish the ROC figure for the previous (LDA) model.
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# NOTE(review): this plots `roc_auc_lda` itself; if that is the scalar AUC
# rather than curve data the call draws nothing useful -- confirm intent.
plt.plot(roc_auc_lda)

# In[27]:

# finally lets see how quadratic discriminant analysis performs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
# ravel() flattens the label array to 1-D before fitting.
qda.fit(X_train, Y_train.ravel())

# In[28]:

# Probability of the second class column, thresholded at 0.5 into 0/1 labels.
Y_prob_qda = qda.predict_proba(X_test)[:, 1]
Y_pred_qda = np.where(Y_prob_qda > 0.5, 1, 0)

# In[29]:

qda_confusion_matrix = confusion_matrix(Y_test, Y_pred_qda)
qda_confusion_matrix  # bare expression: displayed as the notebook cell output

# In[30]:

# ROC curve from the class probabilities and the area under it.
false_positive_rate_qda, true_positive_rate_qda, thresholds_qda = roc_curve(
    Y_test, Y_prob_qda)
roc_auc_qda = auc(false_positive_rate_qda, true_positive_rate_qda)
roc_auc_qda  # bare expression: displayed as the notebook cell output

# In[31]:
def _plotWinnerPanel(subplotCode, yPred, cClasses, meshShape, Xrr, cValGood,
                     nClasses, panelTitle):
    """Draw one winner-take-all decision surface with the data scattered on top.

    yPred is the (nMeshPoints x nClasses) output of predict_proba on the mesh,
    cClasses the (nClasses x 4) RGBA colour of each class and meshShape the
    2-D shape of the plotting mesh.
    """
    # Winner takes all: each mesh point gets the colour of its most probable
    # class; alpha encodes the probability relative to the largest one.
    rowMax = yPred.max(axis=1)
    cWinner = (yPred == rowMax[:, np.newaxis]).astype('float')
    Xmc = np.dot(cWinner, cClasses)
    Xmc[:, 3] = rowMax/yPred.max()

    # Plot the surface of probability
    plt.subplot(subplotCode)
    Zplot = Xmc.reshape(meshShape[0], meshShape[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:, 0], Xrr[:, 1], c=cValGood, s=40, zorder=1)
    else:
        # Only one discriminant function: spread the points randomly along y.
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0, c=cValGood, s=40, zorder=1)
    plt.title(panelTitle)
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')


def discriminatePlot(X, y, cVal, titleStr=''):
    """Frederic's robust wrapper for discriminant analysis.

    Performs LDA, QDA and random forest classification after error checking,
    generates plots and returns cross-validated performance.

    Parameters
    ----------
    X : np.array, n rows x p parameters.
    y : np.array of n group labels.
    cVal : np.array with one rgb colour code per data point; it should be
        the same for all data points belonging to the same group.
    titleStr : title for the plots.

    Returns
    -------
    ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
        Mean cross-validated accuracy (in percent) of each classifier, each
        followed by the standard deviation of the per-fold accuracies (in
        percent; despite the "SE" name it is not divided by sqrt(folds)),
        and the number of classes kept. Seven values of -1 are returned when
        there is not enough data.
    """
    # All print calls take a single pre-formatted string so that the output
    # is the same under the Python 2 print statement and Python 3 print().

    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]

    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1 , -1, -1, -1
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))

    # Data size and color values
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code for each class
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # (// keeps the integer division the Python 2 original relied on.)
    nDmax = int(np.fix(np.sqrt(nX//5)))
    if nDmax < nD:
        print('Warning: Insufficient data for %s parameters. PCA projection to %s dimensions.' % (nD, nDmax))
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)

    # NOTE(review): pre-0.18 scikit-learn `cross_validation` API (labels
    # passed to the constructor).
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0

    for train, test in skf:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specity the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        # Only test samples whose class was seen in training can be scored;
        # skip the fold when none is left.
        goodIndTest = np.array([b in trainClasses for b in yGood[test]], dtype=bool)
        if not goodIndTest.any():
            continue

        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        XrTest = Xr[test[goodIndTest]]
        yTest = yGood[test[goodIndTest]]
        ldaScores[iskf] = ldaMod.score(XrTest, yTest)
        qdaScores[iskf] = qdaMod.score(XrTest, yTest)
        rfScores[iskf] = rfMod.score(XrTest, yTest)

        iskf += 1

    if (iskf != cvFolds):
        # Some folds were skipped: keep only the scores actually filled in,
        # otherwise the trailing zeros would bias the mean and std.
        cvFolds = iskf
        ldaScores = ldaScores[:cvFolds]
        qdaScores = qdaScores[:cvFolds]
        rfScores = rfScores[:cvFolds]

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)

    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print('Error in ldaPlot: labels do not match')

    # Print the coefficients of first 3 DFA
    print('LDA Weights:')
    print('DFA1: %s' % (ldaMod.coef_[0,:],))
    if nClasses > 2:
        print('DFA2: %s' % (ldaMod.coef_[1,:],))
    if nClasses > 3:
        print('DFA3: %s' % (ldaMod.coef_[2,:],))

    # Obtain fits in this rotated space for display purposes
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)

    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting: the first two DFs span the mesh, any further
    # DF is held at its mean value.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2
    for ix in range(2,Xrr.shape[1]):
        Xm[:,ix] = XrrMean[ix]

    # Predict values on mesh for plotting based on the first two DFs
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    plt.figure(facecolor='white', figsize=(10,3))
    meshShape = np.shape(x1)
    _plotWinnerPanel(131, yPredLDA, cClasses, meshShape, Xrr, cValGood, nClasses,
                     '%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    _plotWinnerPanel(132, yPredQDA, cClasses, meshShape, Xrr, cValGood, nClasses,
                     '%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    _plotWinnerPanel(133, yPredRF, cClasses, meshShape, Xrr, cValGood, nClasses,
                     '%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.show()

    # Results
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0

    print("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0/nClasses))
    print("%s LDA: %.2f (+/- %0.2f) %%" % (titleStr, ldaScore, ldaScoreSE))
    print("%s QDA: %.2f (+/- %0.2f) %%" % (titleStr, qdaScore, qdaScoreSE))
    print("%s RF: %.2f (+/- %0.2f) %%" % (titleStr, rfScore, rfScoreSE))
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
# check.append(label) check = tst.columns for i in range(512): train2 = train[train['wheezy-copper-turtle-magic'] == i] test2 = test[test['wheezy-copper-turtle-magic'] == i] index1 = train2.index index2 = test2.index print(index1) train2.reset_index(drop=True, inplace=True) lowvardrop = VarianceThreshold(threshold=1.5).fit(train2[check]) trainupdted = lowvardrop.transform(train2[check]) testupdted = lowvardrop.transform(test2[check]) splits = 11 X_train, X_test, Y_train, Y_test = train_test_split(trainupdted, train2['target'], test_size=0.1, random_state=42) model = model.fit(X_train, Y_train) pree[index2] += (model.predict_proba(testupdted)[:, 1]) / splits # folds = StratifiedKFold(n_splits=splits) # for traindex, testdex in folds.split(trainupdted, train2['target']): # print('hi') # print(len(traindex)) # model.fit(trainupdted[traindex,:],train2.loc[traindex]['target']) # res[index1[testdex]] = model.predict_proba(trainupdted[testdex,:])[:,1] # pree[index2] += (model.predict_proba(testupdted)[:,1])/splits lmao = pd.read_csv('../input/sample_submission.csv') lmao['target'] = pree.reshape((-1, 1)) lmao.to_csv('submission.csv', index=False)
# LDA model lda = QuadraticDiscriminantAnalysis() lda.fit(comps, labels) y_pred = lda.predict(comps) print(labels) print(y_pred) mcc = matthews_corrcoef(labels,y_pred) print("MCC="+str(mcc)) # Plotting LDA contour nx, ny = 200, 100 x_min, x_max = np.amin(comps[:,0]), np.amax(comps[:,0]) y_min, y_max = np.amin(comps[:,1]), np.amax(comps[:,1]) xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),np.linspace(y_min, y_max, ny)) Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()]) Z = Z[:, 1].reshape(xx.shape) plt.contour(xx, yy, Z, [0.5], linewidths=5, colors = 'k', linestyles = 'dashed') # Plotting LDA means #plt.plot(lda.means_[0][0], lda.means_[0][1],'o', color='black', markersize=10) #plt.plot(lda.means_[1][0], lda.means_[1][1],'o', color='black', markersize=10) plt.title('PCA with QDA') # Plot red and green data output_red = comps[0:26] output_green = comps[27:52] #plt.scatter(output_red[:, 0], output_red[:,1], color='r') #plt.scatter(output_green[:, 0], output_green[:, 1],color='g') plt.show()
ys = dataset[1:, 3].astype(np.float64).astype(np.int32) # train classifier = QDA() classifier.fit(xs, ys) # error rate prediction = classifier.predict(xs) error_rate = np.sum(prediction != ys) / ys.shape[0] print("Error rate: %.2f" % error_rate) # visualize # partition nx, ny = 100, 100 xx, yy = np.meshgrid(np.linspace(0, 1, nx), np.linspace(0, 1, ny)) Z = classifier.predict_proba(np.c_[xx.ravel(), yy.ravel()]) Z = Z[:, 1].reshape(xx.shape) plt.pcolormesh(xx, yy, Z, cmap='BuPu') plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='k') # plot data positive_xs = xs[ys == 1] negative_xs = xs[ys == 0] plt.scatter(positive_xs[:, 0], positive_xs[:, 1], c='#00CED1', s=60, label='Great (positive)') plt.scatter(negative_xs[:, 0], negative_xs[:, 1], c='#DC143C',
# Explained-variance levels swept by every PCA experiment in main().
_VARIANCES = (.95, .96, .97, .98, .99, 1.)


def _make_pca(variance):
    """Return an unfitted PCA for the requested explained-variance level.

    A level of 1 uses ``PCA()`` with its default ``n_components``; any other
    level is passed through unchanged as ``n_components``.
    """
    if variance == 1:
        return PCA()
    return PCA(n_components=variance)


def _save_error_table(errs, rows, cols, file_name, title):
    """Draw a table of error rates, save it as a PDF and show it.

    errs: list of rows, each a list of already formatted error strings.
    rows, cols: row and column labels of the table.
    file_name: output path relative to the directory of this file.
    title: figure title.
    """
    plt.figure(figsize=(12, 2))
    plt.table(cellText=errs, rowLabels=rows, colLabels=cols, loc='upper left')
    plt.axis('off')
    save_path = os.path.join(os.path.dirname(__file__), file_name)
    plt.savefig(save_path, bbox_inches='tight')
    # The title is set after saving, so it shows up in the displayed figure
    # but not in the saved PDF.
    plt.title(title)
    plt.show()


def _usps_error_table(X_train, y_train, X_test, y_test, name,
                      make_classifier, file_name, title):
    """Compare 3-NN with one other classifier on PCA-reduced USPS data.

    For every level in ``_VARIANCES`` the data is projected with PCA, both
    classifiers are fitted on the projected training set and their test
    error (1 - accuracy) is recorded. The result is rendered as a table.

    name: column label of the second classifier.
    make_classifier: zero-argument callable returning a fresh estimator.
    file_name, title: forwarded to ``_save_error_table``.
    """
    knn_name = "$k$NN ($k$ = 3)"
    cols = [knn_name, name]
    rows = []
    errs = []
    for var in _VARIANCES:
        pca = _make_pca(var)
        pca.fit(X_train)
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        rows.append("{0:.0f} components ({1:.0f}% variance)".format(
            pca.n_components_, var * 100))

        # Build the estimators here so they are re-initialized for each
        # variance level.
        classifiers = {
            knn_name: KNeighborsClassifier(n_neighbors=3),
            name: make_classifier(),
        }
        var_errs = []
        for cls in classifiers.values():
            cls_err = 1 - cls.fit(X_train_pca, y_train).score(
                X_test_pca, y_test)
            var_errs.append("{0:.4f}".format(cls_err))
        errs.append(var_errs)

    _save_error_table(errs, rows, cols, file_name, title)


def _plot_decision_boundary(model, X1, X2, file_name, title):
    """Scatter two classes and overlay the model's 0.5-probability contour.

    model: fitted classifier exposing ``predict_proba``.
    X1, X2: 2-column arrays with the points of each class.
    file_name: output path relative to the directory of this file.
    title: figure title (set before saving, so it is part of the PDF).
    """
    plt.figure(figsize=(12, 12))
    plt.plot(X1[:, 0], X1[:, 1], 'o', color='red', label='1')
    plt.plot(X2[:, 0], X2[:, 1], 'o', color='blue', label='2')

    # Evaluate the model on a grid spanning the current axis limits.
    nx, ny = 200, 100
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                         np.linspace(y_min, y_max, ny))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='k')

    save_path = os.path.join(os.path.dirname(__file__), file_name)
    plt.title(title)
    plt.savefig(save_path, bbox_inches='tight')
    plt.show()


def main():
    """Run all homework experiments and produce their tables and figures.

    Problems 1-3: error rates of 3-NN versus LDA, QDA and Gaussian naive
    Bayes on PCA-reduced USPS digits (one table per classifier).
    Problem 4: LDA and QDA error rates on several MNIST digit subsets.
    Problem 5: LDA and QDA decision boundaries on the two-gaussians data.

    All input data is read from ``data/`` and all PDFs are written to
    ``hw4/``, both relative to the directory containing this file.
    """
    base_dir = os.path.dirname(__file__)

    # PROBLEMS 1, 2, 3 (USPS Data)
    ####################################################################
    # read in usps data
    # training data
    # The last column is dropped for the training file only.
    # NOTE(review): presumably a trailing-delimiter artifact of zip.train
    # that zip.test does not have - confirm against the data files.
    data = pd.read_csv(os.path.join(base_dir, 'data', 'usps', 'zip.train'),
                       header=None,
                       delimiter=' ').iloc[:, :-1]
    y_train = data.pop(0).values
    X_train = data.values

    # test data
    data = pd.read_csv(os.path.join(base_dir, 'data', 'usps', 'zip.test'),
                       header=None,
                       delimiter=' ')
    y_test = data.pop(0).values
    X_test = data.values

    _usps_error_table(X_train, y_train, X_test, y_test, "LDA", LDA,
                      'hw4/usps_err_rates_lda.pdf', "USPS Error Rates, LDA")
    _usps_error_table(X_train, y_train, X_test, y_test, "QDA", QDA,
                      'hw4/usps_err_rates_qda.pdf', "USPS Error Rates, QDA")
    _usps_error_table(X_train, y_train, X_test, y_test, "GNB", GaussianNB,
                      'hw4/usps_err_rates_gnb.pdf', "USPS Error Rates, GNB")

    # PROBLEM 4 (MNIST Data)
    ####################################################################
    # read in mnist data (resolved next to this file, like the USPS data)
    mndata = MNIST(os.path.join(base_dir, 'data', 'mnist'))

    # training data
    images, labels = mndata.load_training()
    X_train = np.array(images)
    y_train = np.array(labels)

    # test data
    images, labels = mndata.load_testing()
    X_test = np.array(images)
    y_test = np.array(labels)

    classifiers = {"LDA": LDA(), "QDA": QDA()}
    subsets = {
        "0, 1": [0, 1],
        "4, 9": [4, 9],
        "0, 1, 2": [0, 1, 2],
        "3, 5, 8": [3, 5, 8]
    }

    cols = list(subsets)
    rows = []
    errs = []
    for components in _VARIANCES:
        errs_by_cls = {cls_key: [] for cls_key in classifiers}
        first_n_comps = None
        for i, digits in enumerate(subsets.values()):
            train_mask = np.isin(y_train, digits)
            test_mask = np.isin(y_test, digits)
            X_train_sub = X_train[train_mask]
            y_train_sub = y_train[train_mask]
            X_test_sub = X_test[test_mask]
            y_test_sub = y_test[test_mask]

            # The projection depends only on (variance, subset), so it is
            # fitted once and shared by every classifier.
            pca = _make_pca(components)
            pca.fit(X_train_sub)
            X_train_pca = pca.transform(X_train_sub)
            X_test_pca = pca.transform(X_test_sub)
            if i == 0:
                # Row labels report the component count of the first
                # subset only.
                first_n_comps = pca.n_components_

            for cls_key, cls in classifiers.items():
                cls.fit(X_train_pca, y_train_sub)
                cls_err = 1 - cls.score(X_test_pca, y_test_sub)
                errs_by_cls[cls_key].append("{0:.4f}".format(cls_err))

        # One table row per (variance, classifier), classifiers in
        # dictionary order.
        for cls_key in classifiers:
            rows.append(
                "{0:.0f} components ({1:.0f}% variance) "
                "+ {2}".format(first_n_comps, components * 100, cls_key))
            errs.append(errs_by_cls[cls_key])

    # make a table of the results
    _save_error_table(errs, rows, cols, 'hw4/mnist_err_rates.pdf',
                      "MNIST Error Rates")

    # PROBLEM 5
    ###################################################################
    gaussians = loadmat(
        os.path.join(base_dir, 'data', 'twogaussians', 'twogaussians.mat'))
    X_train = gaussians['Xtr']
    y_train = np.ravel(gaussians['ytr'])
    X_test = gaussians['Xtst']
    y_test = np.ravel(gaussians['ytst'])

    lda = LDA()
    lda.fit(X_train, y_train)
    lda_err_rate = 1 - lda.score(X_test, y_test)
    print('{0:.4f}'.format(lda_err_rate))

    qda = QDA()
    qda.fit(X_train, y_train)
    qda_err_rate = 1 - qda.score(X_test, y_test)
    print('{0:.4f}'.format(qda_err_rate))

    # Training points split by class label (1 and 2).
    X1 = X_train[y_train == 1]
    X2 = X_train[y_train == 2]

    _plot_decision_boundary(
        lda, X1, X2, 'hw4/lda_decision_boundary.pdf',
        "LDA Decision Boundary (Error Rate = {0:.4f})".format(lda_err_rate))
    _plot_decision_boundary(
        qda, X1, X2, 'hw4/qda_decision_boundary.pdf',
        "QDA Decision Boundary (Error Rate = {0:.4f})".format(qda_err_rate))