class myQDABinary(myModel):
    """Adapter around ``QuadraticDiscriminantAnalysis`` with make/fit/predict methods."""

    def make(self, make_params):
        """Create the underlying estimator from a dict of keyword arguments.

        Returns ``self`` so calls can be chained.
        """
        self.model = QuadraticDiscriminantAnalysis(**make_params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the estimator on the training data.

        ``xtest`` and ``ytest`` are accepted but not used. ``fit_params`` is
        an optional dict of extra keyword arguments forwarded to ``fit``.
        DataFrame inputs are cast to float32 before fitting.
        """
        # A fresh dict per call avoids sharing one mutable default between calls.
        fit_params = {} if fit_params is None else fit_params
        if isinstance(xtrain, pd.DataFrame):
            self.model.fit(
                xtrain.astype('float32'),
                ytrain.astype('float32'),
                **fit_params
            )
        else:
            self.model.fit(xtrain, ytrain, **fit_params)

    def predict(self, xs, threshold=0.5):
        """Return the estimator's class predictions for ``xs``.

        ``threshold`` is accepted but not used by this implementation.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict(xs.astype('float32'))
        return self.model.predict(xs)

    def predict_proba(self, xs):
        """Return predicted probabilities for ``xs``.

        For a DataFrame only column 1 of the probability matrix is returned;
        for any other input the full matrix is returned, and a 1-D input is
        first reshaped to a single row.
        """
        # NOTE(review): the two branches return different shapes; confirm
        # which one callers expect before unifying them.
        if isinstance(xs, pd.DataFrame):
            return self.model.predict_proba(xs.astype('float32'))[:, 1]
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
def plot_qda(X, y):
    """Fit QDA on (X, y), print two scores and scatter-plot the data by class.

    Prints the ``compare_y`` result for the training data and for the
    module-level ``genre_x_test`` / ``genre_y_test`` pair, passes the test
    predictions to ``plot_errors``, then plots columns 0 and 1 of ``X``
    for class ids 0..3.
    """
    print('QDA: ')
    model = QuadraticDiscriminantAnalysis()
    model.fit(X, y)

    train_pred = model.predict(X)
    print("Train error rate: " + str(compare_y(train_pred, y)))

    test_pred = model.predict(genre_x_test)
    print("Test error rate: " + str(compare_y(test_pred, genre_y_test)))
    plot_errors(test_pred, "QDA")

    colors = ['navy', 'turquoise', 'darkorange', 'red']
    target_names = ['jazz', 'rock', 'hip_hop', 'classical']

    plt.figure()
    # Class ids are the positions 0..3 of the colour/name pairs.
    for class_id, (color, target_name) in enumerate(zip(colors, target_names)):
        members = y == class_id
        plt.scatter(X[members, 0], X[members, 1], alpha=.8, color=color,
                    label=target_name, s=.5)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('QDA of music dataset')
    plt.xlabel("Dancebility")
    plt.ylabel("Energy")
    plt.show()
def test_qda_regularization():
    """Check QDA warnings and predictions with and without regularization."""
    # With the default reg_param=0. a constant variable causes trouble:
    # fitting on such data emits a UserWarning.
    collinear_msg = "Variables are collinear"
    unregularized = QuadraticDiscriminantAnalysis()
    with pytest.warns(UserWarning, match=collinear_msg):
        unregularized.fit(X2, y6)

    # XXX: a RuntimeWarning is also raised at predict time because of
    # divisions by zero when the model is fit with a constant feature and
    # without regularization: should this be considered a bug? Options are
    # making the fit-time message more informative, raising an exception
    # instead of a warning, or changing predict to avoid the division.
    with pytest.warns(RuntimeWarning, match="divide by zero"):
        y_pred = unregularized.predict(X2)
    assert np.any(y_pred != y6)

    # A little regularization removes the division by zero at predict time,
    # but the UserWarning at fit time persists.
    lightly_regularized = QuadraticDiscriminantAnalysis(reg_param=0.01)
    with pytest.warns(UserWarning, match=collinear_msg):
        lightly_regularized.fit(X2, y6)
    y_pred = lightly_regularized.predict(X2)
    assert_array_equal(y_pred, y6)

    # The UserWarning is also expected when n_samples_in_a_class < n_features.
    regularized = QuadraticDiscriminantAnalysis(reg_param=0.1)
    with pytest.warns(UserWarning, match=collinear_msg):
        regularized.fit(X5, y5)
    y_pred5 = regularized.predict(X5)
    assert_array_equal(y_pred5, y5)
def m2_QDA(X, y, score_method='default', verbose=False):
    """Train QDA on an 80/20 split and return the validation score.

    The classifier is fitted on the first element of every training label.
    Training rows whose first-element label occurs only once are removed
    before fitting. ``score_method`` is accepted but not used here.

    Returns the value of ``score`` on the validation predictions.
    """
    # split data
    Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2)
    yt1 = yt.apply(lambda g: g[0])

    # Drop labels that occur exactly once. ``keep`` must be passed by name:
    # pandas >= 2.0 no longer accepts it positionally.
    ug = yt1.drop_duplicates(keep=False).index
    Xt = Xt.drop(ug)
    yt = yt.drop(ug)
    yt1 = yt1.drop(ug)

    # fitting
    qda = QuadraticDiscriminantAnalysis(tol=10**-10)
    qda.fit(Xt, yt1)

    # predictions, scored against the full (untruncated) labels
    pt = qda.predict(Xt)
    terr = score(pt, yt)
    pv = qda.predict(Xv)
    verr = score(pv, yv)

    if verbose:
        print_scores(terr, verr, 'QDA')
    return verr
def crossValidate(attributes, outcomes, foldCount, ownFunction=True):
    """Run k-fold cross-validation and collect per-fold metrics.

    ``attributes`` and ``outcomes`` are split with ``getFolds``. In each
    round one fold is the test set and the rest form the training set.
    When ``ownFunction`` is true the estimate comes from ``getParams`` and
    ``gdaNDEstimate``; otherwise from scikit-learn's QDA.

    Returns five lists, one entry per fold, taken from indices 0..4 of the
    ``getMetrics`` result: accuracy, precision, recall, F-measure and AUC.
    """
    featLen = 4  # feature rows are reshaped to this fixed width
    otcmVal = list(set(outcomes))
    attrFolds = getFolds(attributes, foldCount)
    otcmFolds = getFolds(outcomes, foldCount)

    accrList, presList, recallList, fMeasList, aucList = [], [], [], [], []

    for itr in range(foldCount):
        # Every fold except the current one goes into the training set.
        trainDataList = [f for idx, f in enumerate(attrFolds) if idx != itr]
        trainOtcmList = [f for idx, f in enumerate(otcmFolds) if idx != itr]

        trainDataArr = np.array(trainDataList).reshape(-1, featLen)
        trainOtcmArr = np.array(trainOtcmList).reshape(-1)
        testDataArr = np.array(attrFolds[itr]).reshape(-1, featLen)
        testOtcmArr = np.array(otcmFolds[itr]).reshape(-1)

        if ownFunction:
            params = getParams(trainDataArr, trainOtcmArr, otcmVal, featLen)
            testingEstimate = gdaNDEstimate(testDataArr, params, otcmVal)
        else:
            # clf = LinearDiscriminantAnalysis()
            clf = QuadraticDiscriminantAnalysis()
            clf.fit(trainDataArr, trainOtcmArr)
            # Only test-set predictions are evaluated; the earlier
            # training-set prediction was discarded and is no longer made.
            testingEstimate = clf.predict(testDataArr)

        if itr == 0 and len(otcmVal) == 2:
            # The plot is shown once, for the first fold of a two-class run.
            addTitle = "Own" if ownFunction else "Inbuilt"
            metric = getMetrics(
                testOtcmArr, testingEstimate, otcmVal, showPlot=True,
                title="GDA2D Versicolor,Virginica - %s" % addTitle)
        else:
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal)

        accrList.append(metric[0])
        presList.append(metric[1])
        recallList.append(metric[2])
        fMeasList.append(metric[3])
        aucList.append(metric[4])

    return accrList, presList, recallList, fMeasList, aucList
def qda_predictor(x_train, y_train, x_test, y_test, give_clf=False):
    """Fit QDA and report weighted precision/recall/F-score on the test set.

    The full ``precision_recall_fscore_support`` tuple is always printed.
    Returns ``(accuracy, f1)`` by default, or the fitted classifier when
    ``give_clf`` is true.
    """
    clf = QuadraticDiscriminantAnalysis()
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)

    # Predict and score once; the same result is used for f1 and the print.
    prfs = precision_recall_fscore_support(
        y_test, clf.predict(x_test), average='weighted')
    f1 = prfs[2]
    print(prfs)

    if give_clf:
        return clf
    return (accuracy, f1)
def train_l1_qda(x_train, x_test, y_train, y_test):
    """Fit QDA and return column-vector predictions for both inputs.

    Prints the score on the test data when ``y_test`` is given, otherwise
    the score on the training data.

    Returns ``[predictions on x_train, predictions on x_test]``, each
    reshaped to a single column.
    """
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)

    if y_test is None:
        print('QuadraticDiscriminantAnalysis:', model.score(x_train, y_train))
    else:
        print('QuadraticDiscriminantAnalysis:', model.score(x_test, y_test))

    on_train = np.reshape(model.predict(x_train), (-1, 1))
    on_test = np.reshape(model.predict(x_test), (-1, 1))
    return [on_train, on_test]
def Quadratic_Discriminant_Analysis():
    """Fit QDA on the module-level training data and print train/test reports.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from module
    scope. Prints accuracy and a classification report for each set.
    """
    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    print('train: ', accuracy_score(y_train, train_pred))
    print('train: ', classification_report(y_train, train_pred))

    test_pred = model.predict(X_test)
    print('test: ', accuracy_score(y_test, test_pred))
    print('test: ', classification_report(y_test, test_pred))
def qda(train_size=None):
    """Fit QDA on the data from ``dataset()`` and report on the test split.

    When ``train_size`` is truthy the training data is first subsampled
    with ``train_test_split``. Results are handed to the ``mae`` and
    ``confusion_matrix`` helpers in scope.
    """
    _, _, X_train, X_test, y_train, y_test = dataset()
    if train_size:
        X_train, _, y_train, _ = train_test_split(
            X_train, y_train, train_size=train_size)

    classifier = QDA()
    classifier.fit(X_train, y_train)

    mae(y_test, classifier.predict(X_test))
    confusion_matrix(
        y_test,
        classifier.predict(X_test),
        classifier.score(X_test, y_test),
    )
def test_qda(data):
    """Fit QDA on ``data.train_*`` and print train and test accuracy."""
    model = QDA()
    model.fit(data.train_x, data.train_y)

    train_pred = model.predict(data.train_x)
    print('QDA')
    train_acc = metrics.accuracy_score(data.train_y, train_pred)
    print('Classification accuracy for train data = {:.2%}'.format(train_acc))

    test_pred = model.predict(data.test_x)
    test_acc = metrics.accuracy_score(data.test_y, test_pred)
    print('Classification accuracy for test data = {:.2%}'.format(test_acc))
def test_quadratic_discriminant_analysis(data):
    """Train QDA on the training split of ``data`` and print both accuracies."""
    classifier = QDA()
    classifier.fit(data.train_x, data.train_y)

    fitted_labels = classifier.predict(data.train_x)
    print('QDA')
    print('Classification accuracy for train data = {:.2%}'.format(
        metrics.accuracy_score(data.train_y, fitted_labels)))

    held_out_labels = classifier.predict(data.test_x)
    print('Classification accuracy for test data = {:.2%}'.format(
        metrics.accuracy_score(data.test_y, held_out_labels)))
def lda():
    """Train QDA on the data from ``train_test_data()`` and analyse both splits.

    Despite the function name, the estimator used is
    ``QuadraticDiscriminantAnalysis``. Predictions for each split are
    passed to ``result_analysis``.
    """
    X_train_feature, X_test_feature, y_train, y_test = train_test_data()

    print('Start training')
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(X_train_feature, y_train)
    train_pred = classifier.predict(X_train_feature)
    result_analysis(train_pred, y_train)

    print('Start predicting')
    test_pred = classifier.predict(X_test_feature)
    result_analysis(test_pred, y_test)
def crossValidate(attributes, outcomes, foldCount, ownFunction=True):
    """Run k-fold cross-validation and collect per-fold metrics.

    ``attributes`` and ``outcomes`` are split with ``getFolds``. In each
    round one fold is the test set and the rest form the training set.
    When ``ownFunction`` is true the estimate comes from ``getParams`` and
    ``gdaNDEstimate``; otherwise from scikit-learn's QDA.

    Returns five lists, one entry per fold, taken from indices 0..4 of the
    ``getMetrics`` result: accuracy, precision, recall, F-measure and AUC.
    """
    featLen = 4  # feature rows are reshaped to this fixed width
    otcmVal = list(set(outcomes))
    attrFolds = getFolds(attributes, foldCount)
    otcmFolds = getFolds(outcomes, foldCount)

    accrList, presList, recallList, fMeasList, aucList = [], [], [], [], []

    for itr in range(foldCount):
        # Every fold except the current one goes into the training set.
        trainDataList = [f for idx, f in enumerate(attrFolds) if idx != itr]
        trainOtcmList = [f for idx, f in enumerate(otcmFolds) if idx != itr]

        trainDataArr = np.array(trainDataList).reshape(-1, featLen)
        trainOtcmArr = np.array(trainOtcmList).reshape(-1)
        testDataArr = np.array(attrFolds[itr]).reshape(-1, featLen)
        testOtcmArr = np.array(otcmFolds[itr]).reshape(-1)

        if ownFunction:
            params = getParams(trainDataArr, trainOtcmArr, otcmVal, featLen)
            testingEstimate = gdaNDEstimate(testDataArr, params, otcmVal)
        else:
            # clf = LinearDiscriminantAnalysis()
            clf = QuadraticDiscriminantAnalysis()
            clf.fit(trainDataArr, trainOtcmArr)
            # Only test-set predictions are evaluated; the earlier
            # training-set prediction was discarded and is no longer made.
            testingEstimate = clf.predict(testDataArr)

        if itr == 0 and len(otcmVal) == 2:
            # The plot is shown once, for the first fold of a two-class run.
            addTitle = "Own" if ownFunction else "Inbuilt"
            metric = getMetrics(
                testOtcmArr, testingEstimate, otcmVal, showPlot=True,
                title="GDA2D Versicolor,Virginica - %s" % addTitle)
        else:
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal)

        accrList.append(metric[0])
        presList.append(metric[1])
        recallList.append(metric[2])
        fMeasList.append(metric[3])
        aucList.append(metric[4])

    return accrList, presList, recallList, fMeasList, aucList
def train_quadratic_discriminant_analysis(data_train, data_test, class_train, class_test):
    """Fit QDA on the training data and print accuracy for both data sets."""
    classifier = QDA()
    classifier.fit(data_train, class_train)

    train_labels = classifier.predict(data_train)
    print('Quadratic discriminant analysis')
    print('The accuracy of the classification on the training set of data')
    train_accuracy = metrics.accuracy_score(class_train, train_labels)
    print('{:.2%}'.format(train_accuracy))

    test_labels = classifier.predict(data_test)
    print('The accuracy of classification on the test data set')
    test_accuracy = metrics.accuracy_score(class_test, test_labels)
    print('{:.2%}'.format(test_accuracy))
def QDA_classify(params, dataset, seed, classify):
    """Evaluate a QDA model with the given ``reg_param`` and return its loss.

    Args:
        params: mapping with a ``"reg_param"`` entry (rounded to 4 decimals).
        dataset, seed: forwarded to ``gen_train_test_data``.
        classify: ``"test"`` scores accuracy on the test split; ``"cv"``
            averages ``cross_val_score`` on the training split.

    Returns:
        A dict with ``'auc'`` (the accuracy-based score), ``'loss'``
        (``1 - auc``) and ``'status'``.

    Raises:
        ValueError: if ``classify`` is neither ``"test"`` nor ``"cv"``.
    """
    # Reject unknown modes up front; they used to fail later on an unbound
    # ``auc`` after the data had already been generated.
    if classify not in ("test", "cv"):
        raise ValueError(
            "classify must be 'test' or 'cv', got {!r}".format(classify))

    model_name = "QDA"
    print(model_name, params, dataset, seed)
    np.random.seed(108)
    start_time = timeit.default_timer()
    train_X, train_y, test_X, test_y = gen_train_test_data(dataset, seed)

    # build a classifier based on selected parameters
    # reg_param = UniformFloatHyperparameter('reg_param', 0.0, 1.0, default_value=0.0)
    model = QuadraticDiscriminantAnalysis(
        reg_param=round(params["reg_param"], 4))

    if classify == "test":
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)
        # maximize accuracy
        auc = accuracy_score(test_y, pred_y)
    else:
        scores = cross_val_score(model, train_X, train_y, cv=cv_train)
        auc = np.mean(scores)

    # minimize loss
    loss = 1.0 - auc
    end_time = timeit.default_timer()
    print("{}_runtime: {}(s)".format(model_name, round(end_time - start_time, 2)))

    # dictionary with information for evaluation
    return {'auc': auc, 'loss': loss, 'status': STATUS_OK}
def qda(X, y, X_train, y_train, X_test, y_test):
    """Fit QDA, print its mean cross-validation score and a confusion matrix.

    The cross-validation score is computed on the full (X, y); the
    confusion matrix comes from the model fitted on the training split.
    """
    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train, y_train)

    mean_cv_score = cross_val_score(model, X, y).mean()
    print('Score: QDA {}'.format(mean_cv_score))

    test_pred = model.predict(X_test)
    print(confusion_matrix(y_test, test_pred))
def qda_lol(X, y, normalize=True, n_components=None, n_splits=5, n_repeats=5):
    """Cross-validate a LOL projection followed by QDA.

    For every split of a repeated stratified k-fold the data is optionally
    scaled, projected with ``LOL`` and classified with QDA. The per-split
    confusion matrices are combined by ``compute_cm``, whose first return
    value is returned.
    """
    splitter = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)
    fold_matrices = []

    for train_idx, test_idx in splitter.split(X, y):
        y_train, y_test = y[train_idx], y[test_idx]
        X_train, X_test = X[train_idx], X[test_idx]
        if normalize:
            # Each split is scaled on its own, train and test separately.
            X_train, X_test = scale(X_train), scale(X_test)

        projector = LOL(n_components=n_components)
        X_train = projector.fit_transform(X_train, y_train)
        X_test = projector.transform(X_test)

        classifier = QuadraticDiscriminantAnalysis()
        classifier.fit(X_train, y_train)
        fold_matrices.append(
            confusion_matrix(y_test, classifier.predict(X_test)))

    cm, _ = compute_cm(fold_matrices)
    return cm
class QuadraticDiscriminantAnalysisImpl():
    """Wrapper that stores hyperparameters and builds ``SKLModel`` on fit."""

    def __init__(self, priors=None, reg_param=0.0, store_covariance=False, tol=0.0001, store_covariances=None):
        """Record the hyperparameters; no model is created yet."""
        self._hyperparams = dict(
            priors=priors,
            reg_param=reg_param,
            store_covariance=store_covariance,
            tol=tol,
            store_covariances=store_covariances,
        )

    def fit(self, X, y=None):
        """Build a new ``SKLModel`` from the stored hyperparameters and fit it.

        ``y`` is only passed to the model when it is not ``None``.
        Returns ``self``.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X``."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Return the fitted model's probability estimates for ``X``."""
        return self._sklearn_model.predict_proba(X)
def qda(trainx, trainy, valx, valy):
    """Fit QDA and evaluate it on the validation data.

    Returns ``(confusion matrix, accuracy)``, where accuracy is the number
    of matching predictions divided by ``len(valy)``.
    """
    model = QuadraticDiscriminantAnalysis()
    model.fit(trainx, trainy)
    predicted = model.predict(valx)

    matrix = confusion_matrix(valy, predicted)
    n_correct = sum(valy == predicted)
    return matrix, n_correct / len(valy)
def lda_window(df, header, width, title):
    """Fit LDA and QDA on windowed values of one column and evaluate both.

    The first 80% of ``df`` is used for training and the rest for testing,
    with missing values replaced by 0. Features come from ``window_stack``
    over column ``header``. ``title`` is accepted but not used.

    Returns the fitted ``(lda, qda)`` pair.
    """
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)

    cut = int(len(df) * 0.8)
    df_train = df[:cut].reset_index(drop=True).fillna(0)
    df_test = df[cut:].reset_index(drop=True).fillna(0)

    X = window_stack(df_train[[header]], width=width)
    # NOTE(review): training targets come from 'cho2_b' while test targets
    # below come from 'cho_b' - confirm the two columns are intended.
    y = df_train['cho2_b'][width-1:]
    print("Input shape" + str(X.shape))
    lda.fit(X, y)
    qda.fit(X, y)

    X = window_stack(df_test[[header]], width=width)
    y = df_test['cho_b'][width-1:]

    lda_pred = lda.predict(X)
    utils.evaluate(y, lda_pred, 0, f'LDA window of {header}')
    utils.plot_eval(df_test, y, lda_pred, title=f'LDA window of {header}')

    qda_pred = qda.predict(X)
    utils.evaluate(y, qda_pred, 0, f'QDA window of {header}')
    utils.plot_eval(df_test, y, qda_pred, title=f'QDA window of {header}')

    return lda, qda
def create_symbol_forecast_model(self):
    """Fit QDA on lagged returns of the first symbol and return the model.

    Predictors are the "Lag1" and "Lag2" columns and the response is
    "Direction". Rows before ``self.model_start_test_date`` train the model;
    the rest are used to print an error rate.
    """
    lagged = create_lagged_series(
        self.symbol_list[0],
        self.model_start_date,
        self.model_end_date,
        lags=5,
    )

    # Prior two days of returns as predictors, direction as the response.
    predictors = lagged[["Lag1", "Lag2"]]
    direction = lagged["Direction"]

    # Split both series at the start of the test period.
    test_start = self.model_start_test_date
    x_train = predictors[predictors.index < test_start]
    x_test = predictors[predictors.index >= test_start]
    y_train = direction[direction.index < test_start]
    y_test = direction[direction.index >= test_start]

    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)

    predicted = model.predict(x_test)
    n_wrong = (y_test != predicted).sum()
    print("Error Rate is {0}".format(n_wrong * 1. / len(y_test)))
    return model
def classifier_qda(features, targets):
    """
    Classifier for Quadratic Discriminant Analysis.

    Fits QDA on 90% of the data, prints the confusion matrix and success
    rate on the remaining 10%, then refits on all the data and pickles
    that model to disk.

    :param features: bag of words representation of our users' tweets
    :param targets: labels of these users
    :return: predictions made on the 10% hold-out split
    """
    print('---------- QDA ------------')

    # Hold-out evaluation on a 90/10 split.
    holdout_model = QuadraticDiscriminantAnalysis()
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features, targets, test_size=0.1)
    holdout_model.fit(X_fit, y_fit)
    prediction = holdout_model.predict(X_holdout)

    # Display the confusion matrix and the share of diagonal entries.
    matrix = confusion_matrix(y_holdout, prediction)
    total = np.sum(matrix)
    print(matrix)
    print('Success: %f out of %d points' % (((matrix[0, 0] + matrix[1, 1]) / total), total))

    # The serialized model is trained on the full data set.
    final_model = QuadraticDiscriminantAnalysis()
    final_model.fit(features, targets)
    with open("C:/data/serialized/qda.p", "wb") as f:
        pickle.dump(final_model, f)

    return prediction
def trainingModels(self):
    """Train the classifier selected by ``self.no`` and advance the counter.

    For ``self.no`` from -1 to 4 one classifier is fitted on the training
    data, its test predictions are stored in ``self.predictiontest``,
    ``self.no`` is incremented and ``displayAccuracy`` is called with the
    classifier's name. When ``self.no`` is 5 the final score is printed,
    two CSV files are written and the process exits. Any other value is a
    no-op.
    """
    # Stage number -> (display name, factory). Factories are lambdas so
    # that only the selected classifier is ever constructed.
    stages = {
        -1: ("RandomForestClassifier",
             lambda: RandomForestClassifier(criterion="gini", random_state=60,
                                            max_depth=20, n_estimators=200)),
        0: ("GradientBoostingClassifier", lambda: GradientBoostingClassifier()),
        1: ("AdaBoostClassifier", lambda: AdaBoostClassifier()),
        2: ("LinearDiscriminantAnalysis", lambda: LinearDiscriminantAnalysis()),
        3: ("QuadraticDiscriminantAnalysis",
            lambda: QuadraticDiscriminantAnalysis()),
        4: ("DecisionTreeClassifier",
            lambda: DecisionTreeClassifier(criterion="gini", random_state=50,
                                           max_depth=20, min_samples_leaf=10)),
    }

    stage = stages.get(self.no)
    if stage is not None:
        display_name, build = stage
        classifier = build()
        classifier.fit(self.featuretrain, self.labeltrain)
        self.predictiontest = classifier.predict(self.featuretest)
        self.no += 1
        self.displayAccuracy(display_name)
        return

    if self.no == 5:
        print("Score is:")
        print(((self.score) / 16281) * 100)
        self.finalresult.to_csv("Predicted_result.csv", sep=',', encoding='utf-8')
        self.richpeople.to_csv("Rich_people.csv", sep=',', encoding='utf-8')
        exit()
def da_classify(X_train, y_train, X_test, y_test):
    """Fit QDA, print the fit time and print a score on the test set.

    The score is ``1 - sum(|prediction - y_test|) / len(y_test)``.
    """
    t0 = time.time()
    clf = QuadraticDiscriminantAnalysis()
    clf.fit(X_train, y_train)
    print("da done in %0.3fs" % (time.time() - t0))

    # Normalise by the test-set size. The previous denominator,
    # ``0.3 * len(y)``, relied on a name not defined in this function and
    # on a fixed 30% split.
    # NOTE(review): the formula equals accuracy only for 0/1 labels - confirm.
    errors = np.sum(np.abs(clf.predict(X_test) - y_test))
    print(1 - errors / float(len(y_test)))
def get_qda_oof_prediction(x_train, y_train, x_test):
    """Produce out-of-fold QDA predictions for train and test data.

    Uses the module-level ``skf`` splitter and the sizes ``ntrain``,
    ``ntest`` and ``NFOLDS``. Each fold's model predicts its held-out
    training rows and the whole test set. The test prediction is the
    per-sample mode across folds.

    Returns ``(oof_train, oof_test)``, both reshaped to one column.
    """
    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(skf.split(x_train, y_train)):
        fold_model = QuadraticDiscriminantAnalysis()
        fold_model.fit(x_train[fit_idx], y_train[fit_idx])

        oof_train[holdout_idx] = fold_model.predict(x_train[holdout_idx])
        oof_test_skf[fold, :] = fold_model.predict(x_test)
        print("Test score {} ".format(
            f1_score(y_train[holdout_idx], oof_train[holdout_idx])))

    # Majority vote over the folds for every test sample.
    oof_test = stats.mode(oof_test_skf, axis=0)[0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
def QDA(self, ):
    """Fit QDA on the instance's training data and print three test metrics.

    Returns ``(ROC-AUC, average precision, weighted F1)``, all computed
    from the predicted labels of ``self.X_test``.
    """
    rule = '------------------------------------------------------------------'

    model = QuadraticDiscriminantAnalysis()
    model.fit(self.X_train, self.y_train)

    # Predict the test set results
    # NOTE(review): ROC-AUC and average precision are computed from hard
    # labels here rather than from probabilities - confirm this is intended.
    y_pred = model.predict(self.X_test)

    # Performance: AUC-ROC
    roc_auc_QDA = roc_auc_score(self.y_test, y_pred)
    print('QDA completed: ROC-AUC: {}'.format(roc_auc_QDA) + '\n' + rule)

    # Performance: AUC-PRC
    auc_prc_QDA = average_precision_score(self.y_test, y_pred, average='weighted')
    print('QDA completed: AUC-PRC: {}'.format(auc_prc_QDA) + '\n' + rule)

    # Performance: F1 Metric
    f1_QDA = f1_score(self.y_test, y_pred, average='weighted')
    print('QDA completed: F1 metric: {}'.format(f1_QDA) + '\n' + rule)

    return roc_auc_QDA, auc_prc_QDA, f1_QDA
class QuadraticDiscriminantAnalysiscls(object):
    """Wrapper around ``QuadraticDiscriminantAnalysis``.

    Every method catches exceptions, prints the traceback and returns
    ``None`` instead of propagating the error.
    """

    def __init__(self):
        self.qda_cls = QuadraticDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None
        # Set by predict(); initialised so the attribute always exists.
        self.test_x = None

    def train_model(self, train_x, train_y):
        """Store the training data and fit the classifier on it."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.qda_cls.fit(train_x, train_y)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            print(traceback.format_exc())

    def predict(self, test_x):
        """Store ``test_x``, predict on it and return the predictions.

        Returns ``None`` if prediction fails.
        """
        try:
            self.test_x = test_x
            self.prediction = self.qda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Return the classifier's score on the last ``test_x`` against ``test_y``.

        Returns ``None`` if scoring fails.
        """
        try:
            # return r2_score(test_y, self.prediction)
            return self.qda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
def quadratic_discriminant_analysis(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Train a Quadratic Discriminant Analysis classifier on x_train and
    predict on x_test.

    Cross-validation metrics are obtained from cross_val_analysis before the
    final fit. When compute_threshold is exactly True, test predictions are
    made by comparing the probability in column 1 with the threshold from
    get_best_thresh; otherwise model.predict is used. y_test is not used.

    Returns (test predictions, cross-validation metrics, fitted model).
    '''
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    model = QuadraticDiscriminantAnalysis()
    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    if compute_threshold is not True:
        return model.predict(x_test), metricsCV, model

    test_proba = model.predict_proba(x_test)
    train_proba = model.predict_proba(x_train)
    threshold = get_best_thresh(y_train, train_proba)
    predTest = np.where(test_proba[:, 1] >= threshold, defs.posCode, defs.negCode)
    return predTest, metricsCV, model
_K_FOLDS_MODELS = ("QDA", "MLP", "ADA", "DT", "KNN", "GPC", "MNB")


def _make_k_folds_classifier(model):
    """Return a fresh, unfitted estimator for the given model key."""
    if model == "QDA":
        return QuadraticDiscriminantAnalysis()
    if model == "MLP":
        return MLPClassifier()
    if model == "ADA":
        return AdaBoostClassifier()
    if model == "DT":
        return DecisionTreeClassifier()
    if model == "KNN":
        return KNeighborsClassifier()
    if model == "GPC":
        return GaussianProcessClassifier(1.0 * RBF(1.0))
    if model == "MNB":
        return MultinomialNB()
    raise ValueError("unsupported model: {!r}".format(model))


def _intersect_top_indices(current, new):
    """Intersect ``new`` with ``current``, treating ``None`` as 'no set yet'."""
    return new if current is None else new.intersection(current)


def k_folds_testing(training_df, label_df, model, splits=10, bestfeatures=None):
    """Evaluate a classifier with shuffled k-fold cross-validation.

    Args:
        training_df: feature DataFrame, indexed positionally per fold.
        label_df: label DataFrame aligned with ``training_df``.
        model: one of "QDA", "MLP", "ADA", "DT", "KNN", "GPC", "MNB".
        splits: number of folds.
        bestfeatures: accepted but not used.

    Returns:
        ``[mean accuracy, std of accuracy, weights_set]``. ``weights_set``
        is the intersection over folds of the top-ranked feature indices.
        It stays ``None`` for models that expose neither ``coefs_``
        handling (MLP) nor ``feature_importances_``.

    Raises:
        NotImplementedError: for "xgboost", which has no estimator wired in.
        ValueError: for any other unknown model key.
    """
    # Fail early with a clear error instead of hitting an unbound ``reg``.
    if model == "xgboost":
        raise NotImplementedError("xgboost is not supported yet")
    if model not in _K_FOLDS_MODELS:
        raise ValueError("unsupported model: {!r}".format(model))

    kf = model_selection.KFold(n_splits=splits, shuffle=True)
    results = []
    weights_set = None

    for train_index, test_index in kf.split(training_df):
        reg = _make_k_folds_classifier(model)
        reg.fit(training_df.iloc[train_index, :],
                label_df.iloc[train_index, :].values)
        predictions = reg.predict(training_df.iloc[test_index, :])

        if model == "MLP":
            # First column of the absolute last-layer coefficients.
            weights = [row[0] for row in abs(reg.coefs_[-1])]
            top = set(np.argsort(weights)[-90:].flat)
            weights_set = _intersect_top_indices(weights_set, top)
            print(weights_set)
        elif hasattr(reg, "feature_importances_"):
            # Only some estimators expose importances; the others
            # (e.g. QDA, KNN) are skipped instead of raising.
            print(reg.feature_importances_)
            top = set(np.argsort(reg.feature_importances_)[-15:].flat)
            weights_set = _intersect_top_indices(weights_set, top)
            print(weights_set)

        # determine accuracy
        accuracy = metrics.accuracy_score(
            label_df.iloc[test_index, :].values, predictions)
        results.append(accuracy)

    return [np.mean(results), np.std(results), weights_set]
class SNPForecastingStrategy(Strategy):
    """Forecasting strategy that predicts direction with a QDA model.

    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the
        US stock market index (^GSPC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)

        # Use the prior two days of returns as
        # predictor values, with direction as the response
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]

        # Create training and test sets
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Create the predicting factors for use
        # in direction forecasting
        self.predictors = X[X.index >= self.start_test]

        # Create the Quadratic Discriminant Analysis model
        # and the forecasting strategy
        self.model = QuadraticDiscriminantAnalysis()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0

        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)

        # Zero the first five signal entries to eliminate NaN issues.
        # A single positional .iloc assignment replaces the chained
        # ``signals['signal'][0:5] = 0.0``, which may write to a temporary
        # copy and leave ``signals`` unchanged.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals
def qda(X, y, plot=False):
    """Fit QDA on (X, y) and return the fitted classifier.

    When ``plot`` is true the decision boundary is drawn with
    ``plot_decision_boundary`` and shown under the title "QDA".
    """
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(X, y)
    if not plot:
        return classifier

    plot_decision_boundary(lambda points: classifier.predict(points), X, y)
    plt.title("QDA")
    plt.show()
    return classifier
def da_classify(X_train, y_train, X_cv, y_cv, X_test, y_test):
    """Fit QDA and print metrics for the train, cv and test sets.

    Prints the ``PRF`` result for each of the three sets, then
    precision/recall/F-score support, accuracy and a classification report
    for the test set. Returns the fitted classifier.
    """
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    clf = QuadraticDiscriminantAnalysis()
    clf.fit(X_train, y_train)

    train_pred = clf.predict(X_train)
    cv_pred = clf.predict(X_cv)
    test_pred = clf.predict(X_test)

    print("da train Metrics : {0}".format(PRF(y_train, train_pred)))
    print("da cv Metrics : {0}".format(PRF(y_cv, cv_pred)))
    print("da test Metrics : {0}".format(PRF(y_test, test_pred)))

    test_prf = precision_recall_fscore_support(y_test, test_pred)
    print("Test PRF : {0}".format(test_prf))
    print('The Accuracy of ' + 'da' + ' is :', clf.score(X_test, y_test))
    print(classification_report(y_test, test_pred))
    return clf
def make_qda(X_train, X_test, y_train, y_test,):
    """Fit QDA, report test metrics and return the fitted model.

    Test predictions are passed to ``get_classification_metrics`` together
    with ``y_test``.
    """
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    get_classification_metrics(predicted, y_test)
    return classifier
def doQDA(x, digits, s):
    """Classify the test images in an s-dimensional PCA space.

    A classifier is fitted on the first ``s`` columns of ``x.PCA``. The
    test images are centred with ``x.centers``, projected with the first
    ``s`` rows of ``x.V`` and classified.

    Returns the result of ``class_error_rate`` on the predicted labels.
    """
    # NOTE(review): despite the function name, the estimator is ``LDA`` as
    # in the original code - confirm which one is intended.
    myLDA = LDA()
    myLDA.fit(x.PCA[:, :s], digits.train_Labels)

    newtest = digits.test_Images - x.centers
    # This statement was mangled into "[email protected](...)" by an e-mail
    # obfuscation filter. It is restored as the matrix product that maps the
    # centred test images onto the same s components used for training.
    newtest = newtest @ np.transpose(x.V[:s, :])

    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1, labels.shape[0]),
                              digits.test_Labels)
    return errors
def quadraticdiscriminant(X_train, X_test, Y_train, Y_test):
    """Fit QDA and append its test result to the module-level ``accuracies``.

    The value appended is whatever ``result(prediction, Y_test)`` returns.
    """
    print("Quadratic Discriminant Analysis")
    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train, Y_train)
    predicted = model.predict(X_test)
    accuracies.append(result(predicted, Y_test))
def confusion(digits):
    """Print the confusion matrix of a classifier in a 50-component PCA space.

    The training images are decomposed with ``center_matrix_SVD``. A
    classifier is fitted on the first 50 PCA columns, and the centred test
    images are projected onto the same 50 components before prediction.
    """
    import sklearn.metrics as f

    myLDA = LDA()
    x = center_matrix_SVD(digits.train_Images)
    myLDA.fit(x.PCA[:, :50], digits.train_Labels)

    newtest = digits.test_Images - x.centers
    # This statement was mangled into "[email protected](...)" by an e-mail
    # obfuscation filter. It is restored as the matrix product that maps the
    # centred test images onto the 50 components used for training.
    newtest = newtest @ np.transpose(x.V[:50, :])

    labels = myLDA.predict(newtest)
    print(f.confusion_matrix(digits.test_Labels, labels))
def test_qda_regularization():
    """Check that regularization fixes QDA predictions on degenerate data."""
    # The default reg_param=0. causes issues when there is a constant
    # variable: the predictions do not all match the labels.
    unregularized = QuadraticDiscriminantAnalysis()
    with ignore_warnings():
        y_pred = unregularized.fit(X2, y6).predict(X2)
    assert np.any(y_pred != y6)

    # Adding a little regularization fixes the problem.
    lightly_regularized = QuadraticDiscriminantAnalysis(reg_param=0.01)
    with ignore_warnings():
        lightly_regularized.fit(X2, y6)
    y_pred = lightly_regularized.predict(X2)
    assert_array_equal(y_pred, y6)

    # Case n_samples_in_a_class < n_features
    regularized = QuadraticDiscriminantAnalysis(reg_param=0.1)
    with ignore_warnings():
        regularized.fit(X5, y5)
    y_pred5 = regularized.predict(X5)
    assert_array_equal(y_pred5, y5)
def train_DA(self, X, y, lda_comp, qda_reg):
    '''
    Find the best discriminant-analysis classifier by 10-fold CV.

    Input:
        qda_reg - reg_param for QDA
        lda_comp - n_components for LDA
        X - data matrix (train_num, feat_num)
        y - target labels matrix (train_num, label_num)
    Output:
        best_clf - best classifier (QDA or LDA), refitted on all of X, y
        best_score - mean CV score of that classifier
    '''
    n_samples, _ = X.shape
    cv_folds = 10
    # NOTE(review): this call form and the direct iteration below match the
    # legacy sklearn.cross_validation.KFold API - confirm the import in use.
    kf = KFold(n_samples, cv_folds, shuffle=False)

    lda = LinearDiscriminantAnalysis(n_components=lda_comp)
    qda = QuadraticDiscriminantAnalysis(reg_param=qda_reg)
    score_total_lda = 0  # running total of metric score over all cv runs
    score_total_qda = 0  # running total of metric score over all cv runs

    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The metric is looked up by name; eval reads ``y_test`` and the
        # prediction arrays from this local scope.
        # NOTE(review): eval on self.metric / self.task is only safe while
        # both are trusted, internally set strings.
        lda.fit(X_train, y_train)
        cv_pred_lda = lda.predict(X_test)
        score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
        score_total_lda += score_lda

        qda.fit(X_train, y_train)
        cv_pred_qda = qda.predict(X_test)
        # Score QDA on its own predictions. The LDA predictions were used
        # here before, which made both scores identical.
        score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
        score_total_qda += score_qda

    score_lda = score_total_lda / cv_folds
    score_qda = score_total_qda / cv_folds

    # We keep the best one
    if score_qda > score_lda:
        qda.fit(X, y)
        return qda, score_qda
    lda.fit(X, y)
    return lda, score_lda
class road_estimation:
    """Train, validate and test one of several classifiers on road data.

    The data sets are loaded once from ``data_load()``. ``model_selection``
    is one of "svm", "nb", "knn", "ada", "rf" or "qda".
    """

    def __init__(self, model_selection):
        (self._train_data, self._train_targets,
         self._valid_data, self._valid_targets,
         self._test_data, self._test_targets) = data_load()
        self._model_selection = model_selection
        self._classifier = []

    def train(self):
        """Create the selected classifier, fit it and print a validation report.

        Raises:
            ValueError: if ``model_selection`` names no known classifier.
        """
        if self._model_selection == "svm":
            # selected the svc in svm
            self._classifier = svm.SVC()
        elif self._model_selection == "nb":
            self._classifier = GaussianNB()
        elif self._model_selection == "knn":
            # parameter n_jobs can be set to -1 to enable parallel calculating
            self._classifier = KNeighborsClassifier(n_neighbors=7)
        elif self._model_selection == "ada":
            # Bunch of parameters, n_estimators, learning_rate
            self._classifier = AdaBoostClassifier()
        elif self._model_selection == "rf":
            # many parameters including n_jobs
            self._classifier = RandomForestClassifier(
                max_depth=5, n_estimators=10, max_features=1)
        elif self._model_selection == "qda":
            # complicated array like parameters, perhaps leave it default
            self._classifier = QuadraticDiscriminantAnalysis()
        else:
            # Printing and carrying on used to crash on ``[].fit`` below.
            raise ValueError("Please refer to one classifier")

        self._classifier.fit(self._train_data, self._train_targets)

        # predict on valid data
        prediction_valid = self._classifier.predict(self._valid_data)

        # print validation result for selected model.
        print("Classification report for classifier %s on valid_data:\n%s\n"
              % (self._model_selection,
                 metrics.classification_report(self._valid_targets, prediction_valid)))

    def test(self):
        """Predict on the test data and print a classification report."""
        # The attribute is ``_test_data``; ``self.test_data`` does not exist.
        prediction_test = self._classifier.predict(self._test_data)

        # print test result for selected model.
        print("Classification report for classifier %s on test_data:\n%s\n"
              % (self._model_selection,
                 metrics.classification_report(self._test_targets, prediction_test)))

    def showPredictionImage(self):
        """Show the predictions for "um_000000.png" as an image overlay.

        Superpixels predicted as class 1 get channel values (1, 1, 0). The
        image is shown once plain and once with superpixel boundaries.
        """
        f = Feature()
        f.loadImage("um_000000.png")
        f.extractFeatures()
        fea_matrix = f.getFeaturesVectors()
        predict = self._classifier.predict(fea_matrix)
        image = np.copy(f.image)

        num_superpixels = np.max(f.superpixel) + 1
        for i in range(0, num_superpixels):
            indices = np.where(f.superpixel == i)
            if predict[i] == 1:
                image[indices[0], indices[1], 0] = 1
                image[indices[0], indices[1], 1] = 1
                image[indices[0], indices[1], 2] = 0
        plt.imshow(image)
        plt.show()

        # show prediction image with superpixels; the label image is
        # ``f.superpixel`` (a bare ``superpixels`` name was undefined).
        plt.imshow(mark_boundaries(image, f.superpixel))
        plt.show()
# Nine samples each for classes 2 and 3 (positions 9..17 and 18..26).
labels.extend([2] * 9)
labels.extend([3] * 9)

# Disabled alternative: creation of random labels
# for i in range(0,27):
#     labels.append(int(random.random() * 3) + 1)
# print (labels)

# QDA model, fitted and evaluated on the same components
qda = QuadraticDiscriminantAnalysis()
qda.fit(comps, labels)

# MCC Calculation
y_pred = qda.predict(comps)
mcc = multimcc(labels, y_pred)
print("MCC=" + str(mcc))

# Disabled: plotting the QDA contour
# nx, ny = 200, 100
# x_min, x_max = np.amin(comps[:,0]), np.amax(comps[:,0])
# y_min, y_max = np.amin(comps[:,1]), np.amax(comps[:,1])
# xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),np.linspace(y_min, y_max, ny))
# Z = qda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
# Z = Z[:, 1].reshape(xx.shape)
# plt.contour(xx, yy, Z, [0.5], linewidths=5, colors = 'k', linestyles = 'dashed')
plt.plot(pca.components_.reshape((2, data.shape[0], data.shape[1])))

# Creation of labels: 27 samples of class 1 followed by 26 of class 2
labels = [1] * 27 + [2] * 26

# Discriminant model (the variable is named ``lda`` but holds a QDA estimator)
lda = QuadraticDiscriminantAnalysis()
lda.fit(comps, labels)
y_pred = lda.predict(comps)
print(labels)
print(y_pred)
mcc = matthews_corrcoef(labels, y_pred)
print("MCC=" + str(mcc))

# Plotting the decision contour over the range of the first two components
nx, ny = 200, 100
x_min, x_max = np.amin(comps[:, 0]), np.amax(comps[:, 0])
y_min, y_max = np.amin(comps[:, 1]), np.amax(comps[:, 1])
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, nx),
    np.linspace(y_min, y_max, ny),
)
Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z[:, 1].reshape(xx.shape)
# The 0.5 level of the column-1 probability is drawn as the boundary.
plt.contour(xx, yy, Z, [0.5], linewidths=5, colors='k', linestyles='dashed')
# NOTE(review): these two label calls look like the tail of a plotting
# routine defined above this fragment - confirm their nesting.
plt.ylabel('True label')
plt.xlabel('Predicted label')

# define X y: every column except 'state' is a feature, 'state' is the target
X, y = data.loc[:, data.columns != 'state'].values, data.loc[:, data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# smoteen: resample the training split only
sme = SMOTEENN(random_state=42)
os_X, os_y = sme.fit_sample(X_train, y_train)

# QDA
clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1_score, precision, recall, specifity, G score
# Single-argument print(...) calls behave the same on Python 2 and 3.
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Recall : %.4g" % recall)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
specifity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("Specifity:  %s" % specifity)
# The G score is the geometric mean of recall and specificity, so the two
# are multiplied; they were divided before.
print("G score:  %s" % math.sqrt(recall * specifity))
#Plot non-normalized confusion matrix
What is the training misclassification rate? """
# Fit LDA on the war data and tabulate predicted against actual labels.
lda1 = LDA(solver="svd", store_covariance=True)
lda1.fit(warX,warY)
my_lda_pred = pd.DataFrame()
# Map numeric classes to "No"/"Yes" for a readable cross-tabulation.
my_lda_pred["pred"] = ["No" if x == 0 else "Yes" for x in lda1.predict(warX)]
my_lda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]]
conf_lda = pd.crosstab(my_lda_pred["pred"], my_lda_pred["actual"])
# Bare expressions: their values are only visible in an interactive session.
conf_lda
# Misclassification rate: off-diagonal counts divided by the number of rows.
(1/(war.shape[0])) * (conf_lda.iloc[1,0] + conf_lda.iloc[0,1])
""" 6.69% """
# Same procedure with QDA.
qda1 = QDA(store_covariances=True)
qda1.fit(warX,warY)
# ``test`` holds the class probabilities; it is not used below.
test = qda1.predict_proba(warX)
my_qda_pred = pd.DataFrame()
# Predicted labels are thresholded at .5 here, unlike the ``== 0`` test above.
my_qda_pred["pred"] = ["No" if x < .5 else "Yes" for x in qda1.predict(warX)]
my_qda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]]
conf_qda = pd.crosstab(my_qda_pred["pred"], my_qda_pred["actual"])
conf_qda
(1/(war.shape[0])) * (conf_qda.iloc[1,0] + conf_qda.iloc[0,1])
# # CREATE MODEL # ########################################################################### # Define the estimator: quadratic discriminant analysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis qda = QuadraticDiscriminantAnalysis() qda.fit(training_data[0], training_data[1]) from sklearn.metrics import accuracy_score # record the best result accuracies[i] = accuracy_score(test_data[1], qda.predict(test_data[0])) mean_accuracy = accuracies.mean() print("\n\nmean accuracy: %f" % mean_accuracy) ############################################################################### # # VISUALIZE # ############################################################################### import matplotlib.pyplot as plt mean_accuracies = np.zeros(shape=(n,)) for i in range(n): mean_accuracies[i] = accuracies[: i + 1].mean()
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis trans = LinearDiscriminantAnalysis(n_components=3) trans.fit(X,y) X = trans.transform(X) """ # Split Up Data x_train,x_valid,y_train,y_valid = train_test_split(X,y,test_size=0.3,random_state=None) # Train classifier from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis clf = QuadraticDiscriminantAnalysis(reg_param=0.00001) clf.fit(x_train,y_train) # Run Predictions from sklearn.metrics import confusion_matrix, accuracy_score y_preds = clf.predict(x_valid) print( confusion_matrix(y_valid,y_preds) ); print( "Accuracy: %f" % (accuracy_score(y_valid,y_preds)) ); f = open('qda_take1.txt', 'w') f.write( str(confusion_matrix(y_valid,y_preds)) ); f.write( "\nAccuracy: %f" % (accuracy_score(y_valid,y_preds)) ); f.write( "\nclf = QuadraticDiscriminantAnalysis(0.00001)" ); # Now on to final submission x_final = testing.iloc[:,1:].values y_final = clf.predict(x_final).reshape([62096,]); y_final = pd.DataFrame(y_final); numbahs = testing['id'] df = pd.concat([numbahs,y_final],axis=1) df.columns = ['id','country'] df.to_csv("qda_take1.csv",index=False)
logreg = LogisticRegression().fit(X_train, y_train) y_pred = logreg.predict(X_test) y_pred_train = logreg.predict(X_train) log_acc = accuracy_score(y_pred, y_test) #0.64 highest clf = DecisionTreeClassifier().fit(X_train, y_train) y_pred = clf.predict(X_test) clf_acc = accuracy_score(y_pred, y_test) #0.61 neigh = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train) y_pred = neigh.predict(X_test) nn_acc = accuracy_score(y_pred, y_test) #0.61 quad = QuadraticDiscriminantAnalysis().fit(X_train, y_train) y_pred = quad.predict(X_test) quad_acc = accuracy_score(y_pred, y_test) # 0.19 very low ldaC = LDA(solver='lsqr', shrinkage='auto').fit(X_train, y_train) #LDA with shrinkage y_pred = ldaC.predict(X_test) lda_acc = accuracy_score(y_pred, y_test) #0.58 ######################################### from sklearn.cross_validation import KFold from sklearn.cross_validation import StratifiedKFold import matplotlib.pyplot as plt def calc_params(X, y, clf, param_values, param_name, K, metric = 'accuracy'): '''This function takes the classfier, the training data and labels, the name of the parameter to vary, a list of values to vary by, and a number of folds needed for cross validation and returns a the test and train scores (accuracy or recall) and also