def test_model_ridge_classifier_int(self):
    model, X = fit_classification_model(
        linear_model.RidgeClassifier(), 5, is_int=True)
    model_onnx = convert_sklearn(
        model,
        "multi-class ridge classifier",
        [("input", Int64TensorType([None, X.shape[1]]))],
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnRidgeClassifierInt",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )
def lines(x_train, x_test, y_train, y_test):
    res = []
    m = linear_model.RidgeClassifier()
    m.fit(x_train, y_train)
    predictions = m.predict(x_test)
    acc = accuracy_score(y_test, predictions)
    res.append((acc, "RidgeClassifier"))
    m = linear_model.SGDClassifier()
    m.fit(x_train, y_train)
    predictions = m.predict(x_test)
    acc = accuracy_score(y_test, predictions)
    res.append((acc, "SGDClassifier"))
    return res
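# --- Added example (not from the original source): a minimal usage sketch for
# lines(), assuming scikit-learn's iris data and that linear_model and
# accuracy_score are already imported where lines() is defined.
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
for acc, name in lines(x_train, x_test, y_train, y_test):
    print(f"{name}: {acc:.3f}")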
def get_model(self):
    if self.problem_type == "regression":
        if self.regularization == "l1":
            model = lm.LassoLars(eps=1e-8)
        elif self.regularization == "l2":
            # NOTE: the `normalize` argument was removed in scikit-learn 1.2;
            # on newer versions, standardize the features in a Pipeline.
            model = lm.Ridge(normalize=True, random_state=self.random_state)
        else:
            raise ValueError(f"Unknown regularization {self.regularization}")
    elif self.problem_type == "classification":
        if self.regularization == "l1":
            model = lm.LogisticRegression(penalty="l1",
                                          solver="saga",
                                          class_weight="balanced",
                                          random_state=self.random_state)
        elif self.regularization == "l2":
            model = lm.RidgeClassifier(normalize=True,
                                       random_state=self.random_state)
        else:
            raise ValueError(f"Unknown regularization {self.regularization}")
    else:
        raise ValueError("Unknown problem_type %r - not performing noise "
                         "filtering." % self.problem_type)
    return model
def build_pipeline(hp):
    n_components = hp.Choice("n_components", [2, 5, 10], default=5)
    pca = decomposition.PCA(n_components=n_components)
    model_type = hp.Choice("model_type", ["random_forest", "ridge"])
    if model_type == "random_forest":
        with hp.conditional_scope("model_type", "random_forest"):
            model = ensemble.RandomForestClassifier(
                n_estimators=hp.Int("n_estimators", 10, 50, step=10),
                max_depth=hp.Int("max_depth", 3, 10),
            )
    elif model_type == "ridge":
        with hp.conditional_scope("model_type", "ridge"):
            model = linear_model.RidgeClassifier(
                alpha=hp.Float("alpha", 1e-3, 1, sampling="log"))
    else:
        raise ValueError("Unrecognized model_type")
    skpipeline = pipeline.Pipeline([("pca", pca), ("clf", model)])
    return skpipeline
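# --- Added sketch (not from the original source): how build_pipeline might be
# driven by KerasTuner's scikit-learn tuner. Assumes the keras_tuner package;
# X_train/y_train and the directory/project names are illustrative.
import keras_tuner as kt
from sklearn import model_selection

tuner = kt.SklearnTuner(
    oracle=kt.oracles.BayesianOptimizationOracle(
        objective=kt.Objective("score", "max"), max_trials=10),
    hypermodel=build_pipeline,
    cv=model_selection.StratifiedKFold(5),
    directory="tuner_dir",
    project_name="ridge_vs_forest",
)
tuner.search(X_train, y_train)
best_pipeline = tuner.get_best_models(num_models=1)[0]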
def r_classifier(X, y, alpha=1.0, fit_intercept=True, normalize=True,
                 solver='auto', max_iter=1000, tol=0.0001):
    reg = linear_model.RidgeClassifier(alpha=alpha,
                                       fit_intercept=fit_intercept,
                                       normalize=normalize,
                                       max_iter=max_iter,
                                       tol=tol,        # was hardcoded to 0.001
                                       solver=solver,  # was hardcoded to 'auto'
                                       random_state=30)
    print_performance(reg, X, y, model='Ridge Classifier',
                      scores=['accuracy'])
    reg.fit(X, y)
    return reg
def classification_analysis(self):
    tmp = dict()
    # linear
    tmp['logic'] = feature_selection.RFECV(
        lm.LogisticRegression(), cv=5,
        n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
    tmp['ridge'] = feature_selection.RFECV(
        lm.RidgeClassifier(), cv=5,
        n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
    tmp['SGD'] = feature_selection.RFECV(
        lm.SGDClassifier(), cv=5,
        n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
    tmp['lm_svm'] = feature_selection.RFECV(
        svm.LinearSVC(), cv=5,
        n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
    # non-linear
    tmp['ADABoost'] = feature_selection.RFECV(
        ensemble.AdaBoostClassifier(), cv=5,
        n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
    tmp['RandomForest'] = feature_selection.RFECV(
        ensemble.RandomForestClassifier(), cv=5,
        n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
    # stats
    chi = feature_selection.chi2(self.x, self.y)
    tmp['chi2'] = chi[0]
    tmp['chi2_pval'] = chi[1]
    fscore = feature_selection.f_classif(self.x, self.y)
    tmp['f_score'] = fscore[0]
    tmp['f_pval'] = fscore[1]
    tmp['MIC'] = feature_selection.mutual_info_classif(self.x, self.y)
    return tmp
def main():
    # Load the training and testing data
    train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"),
                           index_col="id")
    test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), index_col="id")

    count_vectorizer = feature_extraction.text.CountVectorizer()
    train_vectors = count_vectorizer.fit_transform(train_df["text"])
    test_vectors = count_vectorizer.transform(test_df["text"])

    clf = linear_model.RidgeClassifier()
    scores = model_selection.cross_val_score(clf, train_vectors,
                                             train_df["target"], cv=3,
                                             scoring="f1")
    print(scores)

    clf.fit(train_vectors, train_df["target"])
    save_model(clf, 'tutorial')
    preds_test = clf.predict(test_vectors)
    print(preds_test)
    create_submission("sample_submission.csv", preds_test, test_df)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingClassifier(clf, n_estimators=20, max_samples=0.8,
                                max_features=1.0, bootstrap=True,
                                bootstrap_features=False)
bagging_clf.fit(X_train, y_train)

treemodel = DecisionTreeClassifier()
# treemodel = BaggingClassifier(treemodel, n_estimators=20, max_samples=0.8,
#                               max_features=1.0, bootstrap=True,
#                               bootstrap_features=False)
treemodel.fit(X_train, y_train)

# Note: despite the name, this is a bagged RidgeClassifier, not a tree model.
randomtree = linear_model.RidgeClassifier()
randomtree = BaggingClassifier(randomtree, n_estimators=20, max_samples=0.8,
                               max_features=1.0, bootstrap=True,
                               bootstrap_features=False)
randomtree.fit(X_train, y_train)

sgd = linear_model.SGDClassifier(tol=1e-3)
sgd = BaggingClassifier(sgd, n_estimators=20, max_samples=0.8,
                        max_features=1.0, bootstrap=True,
                        bootstrap_features=False)
def run_classifier(path, model='logistic_regression', seven_features=False,
                   filterSpO2=True, load_model=False):
    if model == 'multi_class':
        data, df = read_data(path, multi_class=True,
                             seven_features=seven_features,
                             fitlerSpO2=filterSpO2)  # sic: read_data's kwarg
    else:
        data, df = read_data(path, seven_features=seven_features,
                             fitlerSpO2=filterSpO2)
    if filterSpO2:
        subSpO2Folder = 'FilteredSpO2/'
    else:
        subSpO2Folder = 'NoSpO2Filtering/'
    if seven_features:
        n_features = 7
    else:
        n_features = 3
    plot_data_file = ('Plots/data/precision_recall_' + str(n_features) +
                      'Features_' + 'filtered_' + str(filterSpO2) + '.csv')
    if not os.path.exists(plot_data_file):
        with open(plot_data_file, 'w', newline='') as file:
            csvwriter = csv.writer(file)
            csvwriter.writerow(['model', 'precisions', 'recalls',
                                'thresholds'])

    print("-----------------------------------------------")
    if model == 'multi_class':
        print("            MULTI CLASS CLASSIFIER")
    elif model == 'svc':
        print("          SUPPORT VECTOR CLASSIFIER")
    elif model == 'mlp':
        print("           MULTI-LAYER PERCEPTRON")
    else:
        print("             LOGISTIC REGRESSION")
    print("-----------------------------------------------")

    X = data[:, 0:-1]
    y = data[:, -1]
    n_split = 10
    kFold = model_selection.KFold(n_splits=n_split, shuffle=True,
                                  random_state=1)
    f1_scores = []
    bic_scores = []
    i = 0
    res = np.zeros((8, n_split))
    tprs = []
    prec_recall_curves = {'precision': [], 'recall': [], 'threshold': []}
    mean_fpr = np.linspace(0, 1, 100)
    for train_index, test_index in kFold.split(X, y):
        print('Fold:', i + 1)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        y_preds_prob_total = []
        y_trues = []
        X_train_scale, X_test_scale = normalize(
            X_train, X_test, seven_features=seven_features)
        print('x train:', X_train_scale.shape)
        if model == 'svc':
            model_name = 'SVC'
            classifier = svm.SVC(gamma='auto', kernel='rbf', probability=True)
        elif model == 'sgd':
            model_name = 'SGD Classifier'
            classifier = linear_model.SGDClassifier(loss='log')
        elif model == 'ridge':
            model_name = 'Ridge Classifier'
            # Note: RidgeClassifier has no predict_proba; the probability-based
            # metrics below will fail for this branch as written.
            classifier = linear_model.RidgeClassifier()
        elif model == 'mlp':
            model_name = 'Neural Network'
            if seven_features:
                classifier = MLPClassifier((12, 8, 6, 4, 4), max_iter=200,
                                           activation='tanh', solver='adam',
                                           random_state=1,
                                           momentum=0.8)  # current BEST, 0.7798 accuracy
            else:
                classifier = MLPClassifier((6, 3), max_iter=200,
                                           activation='tanh', solver='adam',
                                           random_state=1,
                                           momentum=0.6)  # current best
        else:
            model_name = 'Logistic Regression'
            classifier = linear_model.LogisticRegression()
        final_classifier = base.clone(classifier)

        if load_model:
            # load a pretrained model
            model_path = ('saved_models/classifer/' + subSpO2Folder +
                          model_name.replace(' ', '_') + '_' +
                          str(n_features) + '_features.joblib')
            print('load file from:', model_path)
            classifier = load(model_path)
        else:
            # train a new model
            classifier.fit(X_train_scale, y_train)
        y_pred = classifier.predict(X_test_scale)
        y_pred_prob = classifier.predict_proba(X_test_scale)[:, 1]
        y_preds_prob_total.append(y_pred_prob)
        y_trues.append(y_test)
        f1_scores.append(metrics.f1_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        res[0][i] = tp / (tp + fp)  # PPV
        res[1][i] = tp / (tp + fn)  # sensitivity
        res[2][i] = tn / (tn + fp)  # specificity
        res[3][i] = tn / (fn + tn)  # NPV
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        res[4][i] = roc_auc
        res[5][i] = res[1][i] / (1 - res[2][i])  # positive likelihood ratio
        res[6][i] = (1 - res[1][i]) / res[2][i]  # negative likelihood ratio
        res[7][i] = metrics.accuracy_score(y_test, y_pred)
        bic_val = bic.bic(y_test, y_pred_prob, n_features)
        bic_scores.append(bic_val)
        aupr = metrics.average_precision_score(y_test, y_pred_prob)
        fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
        tprs.append(interp(mean_fpr, fpr, tpr))
        # plt.plot(fpr, tpr, lw=2, alpha=0.3,
        #          label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        print('current f1:', f1_scores[-1], ' PPV:', res[0][i],
              ' Sensitivity:', res[1][i], ' Specificity:', res[2][i],
              ' NPV:', res[3][i], ' BIC:', bic_val,
              'ROAUC:', res[4][i], 'Accuracy:', res[7][i])
        i += 1

    # train a final model using the entire set
    if not load_model:
        X, X_copy = normalize(X, X, seven_features=seven_features)
        final_classifier.fit(X, y)
        final_pred = final_classifier.predict(X)
        print('Final F1 ', metrics.f1_score(y, final_pred))
        if not os.path.exists('saved_models'):
            os.mkdir('saved_models')
        if not os.path.exists('saved_models/classifer'):
            os.mkdir('saved_models/classifer')
        model_output_path = ('saved_models/classifer/' + subSpO2Folder +
                             model_name.replace(' ', '_') + '_' +
                             str(n_features) + '_features.joblib')
        dump(final_classifier, model_output_path)

    res = np.mean(res, axis=1)
    print('\n=================================================')
    print('                    RESULT')
    print("--------------------------------------------------")
    print('              ', model_name)
    print('   Total cases:', y.shape)
    print('Seven Features:', seven_features, ' FilterSpO2:', filterSpO2)
    print('       Mean F1:', np.mean(np.array(f1_scores)))
    print('           PPV:', res[0])
    print('   Sensitivity:', res[1])
    print('   Specificity:', res[2])
    print('           NPV:', res[3])
    print('         ROAUC:', res[4])
    print('   Positive LR:', res[5])
    print('   Negative LR:', res[6])
    print('           BIC:', np.mean(np.array(bic_scores)))
    print('      Accuracy:', res[7])
    print('          AUPR:', aupr)
    print("\nBaseline")
    get_baseline(np.array([df['Spo2'], df['Fio2']]).T, y)
    print('==================================================')
    if model == 'multi_class':
        return
    precision_array, recall_array, thresholds = metrics.precision_recall_curve(
        np.array(y_trues).flatten(), np.array(y_pred_prob).flatten())
    with open(plot_data_file, 'a+', newline='') as file:
        csvwriter = csv.writer(file)
        csvwriter.writerow([model_name, str(precision_array),
                            str(recall_array), str(thresholds)])
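# --- Added sketch (not from the original source): RidgeClassifier exposes
# decision_function but no predict_proba, so the 'ridge' branch above breaks
# at the probability step; one workaround is probability calibration.
from sklearn.calibration import CalibratedClassifierCV

calibrated_ridge = CalibratedClassifierCV(linear_model.RidgeClassifier(),
                                          method='sigmoid', cv=5)
calibrated_ridge.fit(X_train_scale, y_train)
y_pred_prob = calibrated_ridge.predict_proba(X_test_scale)[:, 1]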
regression(linear_model.OrthogonalMatchingPursuit()),
regression(linear_model.OrthogonalMatchingPursuitCV()),
regression(linear_model.Ridge(random_state=RANDOM_SEED)),
regression(linear_model.RidgeCV()),
regression(linear_model.BayesianRidge()),
regression(linear_model.ARDRegression()),
regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
regression(
    linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),

# Logistic Regression
classification(
    linear_model.LogisticRegression(random_state=RANDOM_SEED)),
classification(
    linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
classification(linear_model.RidgeClassifierCV()),
classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
classification_binary(
    linear_model.LogisticRegression(random_state=RANDOM_SEED)),
classification_binary(
    linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
classification_binary(
    linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
classification_binary(linear_model.RidgeClassifierCV()),
classification_binary(
    linear_model.SGDClassifier(random_state=RANDOM_SEED)),

# Decision trees
regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
regression(tree.ExtraTreeRegressor(**TREE_PARAMS)),
def train(data, **kwargs):
    clf = lm.RidgeClassifier(**kwargs)
    clf.fit(data[:, :-1], data[:, -1])
    return clf
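# --- Added example (not from the original source): minimal usage of train(),
# assuming `data` is a NumPy array whose last column holds the labels; the
# kwargs pass straight through to RidgeClassifier.
import numpy as np
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
clf = train(np.column_stack([X, y]), alpha=10.0)
print(clf.score(X, y))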
    return [y_clf_train, y_clf_test, acc_clf_train, acc_clf_test,
            loss_clf_train, loss_clf_test]

def get_classifier_results():
    return pandas.DataFrame({'classifier': classifier_list,
                             'classifier_name': classifier_names,
                             'clf_dataset': clf_datasets,
                             'acc_train': acc_train, 'acc_test': acc_test,
                             'loss_train': loss_train,
                             'loss_test': loss_test})

classifier_list, classifier_names, clf_datasets = [], [], []
acc_train, acc_test, loss_train, loss_test = [], [], [], []
df_list = ['classifier_name', 'acc_train', 'acc_test',
           'loss_train', 'loss_test']
clf = [linear_model.LogisticRegression(solver='liblinear', multi_class='ovr'),
       linear_model.LogisticRegressionCV(solver='liblinear',
                                         multi_class='ovr'),
       linear_model.SGDClassifier(max_iter=1000, tol=0.00001),
       linear_model.RidgeClassifier(), linear_model.RidgeClassifierCV(),
       LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(),
       svm.LinearSVC(), svm.SVC(gamma='scale', C=10.0, kernel='poly'),
       svm.NuSVC(gamma='scale', kernel='poly'),
       KNeighborsClassifier(), RadiusNeighborsClassifier(radius=30),
       NearestCentroid(),
       DecisionTreeClassifier(), ExtraTreeClassifier(), GaussianNB(),
       BernoulliNB(), MultinomialNB(),
       BaggingClassifier(), RandomForestClassifier(n_estimators=64),
       AdaBoostClassifier(), GradientBoostingClassifier(),
       linear_model.Perceptron(max_iter=1000, tol=0.00001),
       linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.00001),
       GaussianProcessClassifier(), LabelPropagation(), LabelSpreading()]
list3clf = ['LogisticRegression', 'LogisticRegressionCV', 'SGDClassifier',
            'RidgeClassifier', 'RidgeClassifierCV',
def ridge_classifiers():
    # RidgeClassifier handles multiclass targets natively; the explicit
    # one-vs-rest wrapper mainly standardises the estimator interface.
    ridge = OneVsRestClassifier(linear_model.RidgeClassifier())
    return ridge
def ridgeclassifier(self):
    clf = linear_model.RidgeClassifier()
    return clf
def __init__(self, X, Y, alpha=2):
    super(RidgeClassifier, self).__init__(X, Y)
    self.alpha = alpha
    self.classifier = linear_model.RidgeClassifier(alpha=self.alpha)
feature_selection_performance = []

# Classification (without feature selection):
print('Without feature selection')
X = data.iloc[:, 1:34]
y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
no_selection_performance = []

print('Ridge')
lin_reg = linear_model.RidgeClassifier(alpha=1000, fit_intercept=True,
                                       normalize=False, solver='lsqr',
                                       tol=1e-2)
lin_reg.fit(X_train, y_train)
y_test_pred = lin_reg.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = lin_reg.score(X_test, y_test)
no_selection_performance.append(('Ridge', score, matrix))

print('SGD')
sgdClassifier = linear_model.SGDClassifier(fit_intercept=True, loss='log',
                                           max_iter=1000, penalty='l1',
                                           shuffle=False, tol=0.01)
    subsample=1, colsample_bytree=1, colsample_bylevel=1,
    colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
    base_score=0.5, random_state=0, missing=None)

Ridge_C = linear_model.RidgeClassifier(
    alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
    max_iter=None, tol=0.001, class_weight=None, solver='auto',
    random_state=None)

LogisticR_C = linear_model.LogisticRegression(
    penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
    intercept_scaling=1, class_weight=None, random_state=None,
    solver='lbfgs',
import math

from sklearn.utils.extmath import safe_sparse_dot
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore", category=FutureWarning)

linkF = 'D:/DATA/CODE/GraduationProject/Features_Data/'
case = 'blur'
dirTrain = linkF
dirTest = linkF
dataPre.loadDataTrain(dirTrain, case)
dataPre.loadDataTest(dirTest, case)
matTrain, matVal, labelTrain, labelVal = train_test_split(
    dataPre.matTrain, dataPre.labelTrain, test_size=0.2, random_state=1)
dataPre.FindMaxMin(matTrain)
dataPre.StandardData(matTrain)
dataPre.StandardData(matVal)
dataPre.StandardData(dataPre.matTest)

# Despite the name, this is a ridge classifier, not logistic regression.
logreg = linear_model.RidgeClassifier(alpha=0.1)
# If num_Features == 9, set alpha = 70; if num_Features == 11, set alpha = 33.
logreg.fit(matTrain, labelTrain)
print(case.upper())
labelVal_pred = logreg.predict(matVal)
print("Accuracy: %.2f %%" % (100 * accuracy_score(labelVal, labelVal_pred)))
labelTest_pred = logreg.predict(dataPre.matTest)
print("Accuracy: %.2f %%" % (100 * accuracy_score(dataPre.labelTest,
                                                  labelTest_pred)))
# NOTE: drop() is not in-place; assign the result or pass inplace=True.
df_1325_red.drop(columns=i)

x_train = dateex(df_1325_red, start_date='03-10-2008',
                 end_date='06-01-2012')[X_col_red].drop(columns='daily_return')
y_train = dateex(df_1325_red, start_date='03-10-2008',
                 end_date='06-01-2012')['sign_daily_return']
dates_train = dateex(df_1325_red, start_date='03-10-2008',
                     end_date='06-01-2012')['date']
x_test = dateex(df_1325_red, start_date='08-01-2012',
                end_date='06-01-2014')[X_col_red].drop(columns='daily_return')
y_test = dateex(df_1325_red, start_date='08-01-2012',
                end_date='06-01-2014')['sign_daily_return']
dates = dateex(df_1325_red, start_date='08-01-2012',
               end_date='06-01-2014')['date']

"""Fit a ridge classifier with an optimised alpha."""
from sklearn import linear_model
from sklearn import metrics

# Grid-search alpha by in-sample MSE (note: selecting on training error
# favours the smallest alpha; a validation set would be more robust).
alpha_good = 0.5
reg = linear_model.RidgeClassifier(alpha=0.5)
reg.fit(x_train, y_train)
y_train_hat = reg.predict(x_train)
w_old = metrics.mean_squared_error(y_train, y_train_hat)
for x in range(10000):
    reg = linear_model.RidgeClassifier(alpha=x * 0.0001)
    reg.fit(x_train, y_train)
    y_train_hat = reg.predict(x_train)
    w = metrics.mean_squared_error(y_train, y_train_hat)
    if w < w_old:
        alpha_good = x * 0.0001
        w_old = w
best_alpha_reg = linear_model.RidgeClassifier(alpha=alpha_good)
best_alpha_reg.fit(x_train, y_train)
y_train_hat_rid = best_alpha_reg.predict(x_train)
print('In-sample Mean Squared Error:', w_old)
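# --- Added sketch (not from the original source): the loop above selects alpha
# on in-sample error; scikit-learn's RidgeClassifierCV performs the same search
# with cross-validation in one call (x_train/y_train as defined above).
import numpy as np

cv_reg = linear_model.RidgeClassifierCV(alphas=np.logspace(-4, 0, 50), cv=5)
cv_reg.fit(x_train, y_train)
print('best alpha:', cv_reg.alpha_)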
              [np.sin(theta), np.cos(theta)]])
xx, yy = np.dot(R, [xx, yy])
## scaling
xx /= max(np.absolute(xx))
yy /= max(np.absolute(yy))
## assign into X
X[row, ::2] = xx
X[row, 1::2] = yy

## Hold out data for the later accuracy / confusion-matrix computation
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, stratify=y)

## CREATE THE CLASSIFIER OBJECT
clf = linear_model.RidgeClassifier(alpha=0.14026845637583893,
                                   fit_intercept=False)

## CROSS-VALIDATION
scores = model_selection.cross_validate(clf, X_train, y_train,
                                        return_estimator=True, n_jobs=-1)
print('The score array for test scores on each cv split:',
      scores['test_score'])
print('Mean of above:', scores['test_score'].mean())

## SELECT THE BEST ESTIMATOR AND PREDICT ON THE FINAL SET
best_clf = scores['estimator'][np.argmax(scores['test_score'])]
print('Accuracy on final set:', best_clf.score(X_test, y_test))
                      param_grid={
                          "fit_intercept": [True, False],
                          "selection": ["cyclic", "random"]
                      },
                      cv=3)
lassoc.fit(x_train, y_train)

print(" [ ] Algorithm 2: Least Angle Regression classifier...")
larsc = GridSearchCV(estimator=linear_model.Lars(copy_X=True),
                     param_grid={"fit_intercept": [True, False]},
                     cv=3)
larsc.fit(x_train, y_train)

print(" [ ] Algorithm 3: Ridge regression classifier...")
rrc = GridSearchCV(
    estimator=linear_model.RidgeClassifier(),
    param_grid={
        "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg",
                   "sag", "saga"],
        "fit_intercept": [True, False]
    },
    cv=3)
rrc.fit(x_train, y_train)

print(" [ ] Algorithm 4: Stochastic Gradient Descent classifier...")
sgdc = GridSearchCV(estimator=linear_model.SGDClassifier(tol=0.001,
                                                         max_iter=1000,
                                                         n_jobs=-1),
                    param_grid={
                        "loss": ["hinge", "log", "modified_huber",
                                 "perceptron"],
res += ", C = " + str(params[0]) + ", gamma = " + str(params[1]) if name == POLY: res += ", C = " + str(params[0]) + ", gamma = " + str( params[1]) + ", degree = " + str(params[2]) if name == NN: res += ", k = " + str(params[0]) return res if __name__ == "__main__": # for parallelism under windows # Build Classifiers for C in penalties: # Add Ridge Classifier (One vs Rest approach) classifiers[RIDGE, (C, )] = linear_model.RidgeClassifier(alpha=C) for gamma in bandwidths: # Add RBF SVM Classifiers (One vs One approach) classifiers[RBF, (C, gamma)] = svm.SVC( kernel='rbf', C=C, # regularization -> SVM gamma=gamma, # bandwidth decision_function_shape='ovo') # # Add Laplacian SVM Classifiers (One vs One approach) # always giving Accuracy = 0.34349763744093603 -> almost random # classifiers[LAP, (C, gamma)] = svm.SVC( # need to turn off parallelism # kernel=lambda X,Y: laplacian_kernel(X,Y, gamma), # C=C, # decision_function_shape='ovo')
    return dbc.Row([dbc.Col(title, md=8), dbc.Col(link, md=4)])

# CONSTANTS
numeric_cols = ['Age', 'Infrared Scan Results', 'Loading']

# LOAD DATA
spreadsheets = pd.read_excel('./data.xlsx', sheet_name=list(range(5)))
df = pd.concat(spreadsheets.values()).drop(columns=["ID", 'Heath Index'])
df.columns = df.columns.str.strip()

# CREATE ENCODER AND MODELS
oh_enc = OneHotEncoder(sparse=False)
models = {
    'Ridge': linear_model.RidgeClassifier(),
    'Logistic (L-BFGS)': linear_model.LogisticRegression(),
    'Logistic (SAGA)': linear_model.LogisticRegression(solver='saga'),
    'SGD': linear_model.SGDClassifier(),
}

# PREPROCESS DATASET
X = np.hstack([oh_enc.fit_transform(df[['Visual Conditions']]),
               df[numeric_cols].values])
y = df['Oil Leak'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

# TRAIN MODELS
for name, model in models.items():
    model.fit(X_train, y_train)
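# --- Added example (not from the original source): evaluate each fitted model
# on the held-out split created above.
for name, model in models.items():
    print(name, model.score(X_test, y_test))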
**3. Split the sample into train and test using the train_test_split method
of the model_selection library in a 70-30 proportion, with random_state = 1**
"""
data_df = data[["Total day charge", "Customer service calls"]].copy()  # select the feature columns to analyse
labels_df = data["Churn"].copy()  # select the target column
data_list = data_df.to_numpy().tolist()  # convert to a list of lists
data_labels = labels_df.to_numpy().tolist()  # convert to a list
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    data_list, data_labels, test_size=0.3, random_state=1)

"""**4. Create a ridge classifier object, fit it on the training set and
apply it to the test data. Look at the result. Why did we get a constant
model?**"""
ridge_classifier = linear_model.RidgeClassifier(random_state=1)
ridge_classifier.fit(train_data, train_labels)
ridge_predictions = ridge_classifier.predict(test_data)
print(ridge_predictions)

"""*Analysis of the result:* the constant model, which outputs 0 for any
combination of the features "Total day charge" and "Customer service calls",
indicates that there is no dependence between them and the churn target
("Churn"), confirming the conclusions obtained graphically.

**5. Repeat step 4 for logistic regression; for the test data, compute the
probability of assigning each object to each class**
"""
log_regressor = linear_model.LogisticRegression(random_state=1)
log_regressor.fit(train_data, train_labels)
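# --- Added completion (not from the original source): step 5 asks for the
# per-class probabilities on the test data, which LogisticRegression provides
# via predict_proba.
log_predictions_proba = log_regressor.predict_proba(test_data)
print(log_predictions_proba)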
        print('predict test 3')
        testlabels = testlabeling(testdata, testouts, betinterval, testres3)
        countTAs, countwins, pwin1, moneynow, perhour = AItester(
            testdata, testlabels, predicted, initmoney, bet, payrate, 1)
else:
    pwin = 0
pwinmax = pwin
if testmode != 1:
    print('current max prob:', pwinmax)
for i in range(0, 1000):
    print(i)
    if noAI == 1:
        print('now fitting...')
        if fitSVC == 1:
            clf = linear_model.RidgeClassifier(alpha=1e-15, copy_X=False,
                                               tol=1e-16)
            # clf = svm.LinearSVC(decision_function_shape='ovr', verbose=1)
            clf.fit(traindata, labels.ravel())
        elif fitSVC == 2:
            clf = ensemble.BaggingClassifier(ExtraTreeClassifier(),
                                             n_estimators=numDTC)
            clf.fit(traindata, labels.ravel())
        elif fitSVC == 3:
            clf = neighbors.KNeighborsClassifier(n_neighbors=20)
            clf.fit(traindata, labels.ravel())
        else:
            clf = DecisionTreeClassifier(max_depth=25, min_samples_leaf=10)
            clf.fit(traindata, labels)
        print('done!')
    else:
        clf = joblib.load(outname)
def get_skl_estimator(self, **default_parameters):
    return linear_model.RidgeClassifier(**default_parameters)
def predict(train_list, train_result, test_list, method_list, **kwargs):
    def fit_predict_each_output(model, target):
        __predict_result = []
        for idx in range(np.size(target, 1)):
            model.fit(train_list, target[:, idx])
            __predict_result.append(model.predict(test_list))
        return np.transpose(np.asarray(__predict_result))

    def fit_predict(model, target):
        model.fit(train_list, target)
        return model.predict(test_list)

    from_bins_idx = kwargs["from_bins_idx"]
    to_bins_idx = kwargs["to_bins_idx"]
    _binned_train_result = to_bins_idx(train_result)
    _predict_result = []
    if "current" in method_list:
        rbm = neural_network.BernoulliRBM(n_components=512, verbose=False,
                                          n_iter=100, learning_rate=1e-2,
                                          random_state=0)
        rbm.fit(train_list)
        rbm.fit(test_list)  # refitting on test_list is likely unintended
        # FIXME: __predict_result is not defined in this scope; this branch
        # raises NameError as written.
        _predict_result.append(np.transpose(np.asarray(__predict_result)))
    elif "knn" in method_list:
        _ = knn_predict(train_list, _binned_train_result, test_list,
                        k=kwargs["k"])
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "dt" in method_list:
        _ = fit_predict(
            tree.DecisionTreeClassifier(max_depth=kwargs["max_depth"]),
            _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "rf" in method_list:
        _ = fit_predict(
            ensemble.RandomForestClassifier(
                n_estimators=kwargs["n_estimators"],
                max_depth=kwargs["max_depth"],
                n_jobs=kwargs["n_jobs"]),
            _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "average" in method_list:
        _ = average_predict(train_result, test_list)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "adaboost" in method_list:
        _ = fit_predict_each_output(ensemble.AdaBoostClassifier(),
                                    _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "ridge" in method_list:
        _ = fit_predict_each_output(linear_model.RidgeClassifier(),
                                    _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "linear" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.LinearRegression(), train_result))
    elif "huber" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.HuberRegressor(), train_result))
    elif "theilsen" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.TheilSenRegressor(), train_result))
    elif "lasso" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.Lasso(), train_result))
    elif "par" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.PassiveAggressiveRegressor(
                C=kwargs["par_C"], epsilon=kwargs["par_eps"]),
            train_result))
    elif "ridge_reg" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.Ridge(), train_result))
    elif "dt_reg" in method_list:
        _predict_result.append(fit_predict(
            tree.DecisionTreeRegressor(max_depth=kwargs["max_depth"]),
            train_result))
    elif "rf_reg" in method_list:
        _predict_result.append(fit_predict(
            ensemble.RandomForestRegressor(
                max_depth=kwargs["max_depth"],
                n_jobs=kwargs["n_jobs"],
                n_estimators=kwargs["n_estimators"]),
            train_result))
    elif "xgboost" in method_list:
        _predict_result.append(fit_predict_each_output(
            xgb.XGBClassifier(max_depth=kwargs["max_depth"],
                              n_estimators=kwargs["n_estimators"],
                              nthread=kwargs["nthread"]),
            _binned_train_result))
    elif "xgboost_reg" in method_list:
        _predict_result.append(fit_predict_each_output(
            xgb.XGBRegressor(max_depth=kwargs["max_depth"],
                             n_estimators=kwargs["n_estimators"],
                             nthread=kwargs["nthread"]),
            train_result))
    elif "svr" in method_list:
        _predict_result.append(fit_predict_each_output(
            svm.SVR(C=kwargs["C"], epsilon=kwargs["epsilon"]),
            train_result))
    elif "linear_svr" in method_list:
        _predict_result.append(fit_predict_each_output(
            svm.LinearSVR(C=kwargs["C"], epsilon=kwargs["epsilon"]),
            train_result))
    else:
        assert False, "invalid method"
    return np.asarray(_predict_result)
print("First 5 rows in test data:") print(test_df['text'].head(5)) print("First 5 rows in test data - cleaned:") print(test_df['text_clean'].head(5)) # Apply Count Vectorizer count_vect = CountVectorizer(analyzer=text_utils.clean_text) # create vectors for all training tweets train_vect = count_vect.fit_transform(train_df["text"]) # create vectors for all test tweets test_vect = count_vect.transform(test_df["text"]) # build a linear model for classification using Ridge regression clf = linear_model.RidgeClassifier() scores = model_selection.cross_val_score(clf, train_vect, train_df["target"], cv=3, scoring="accuracy") # [0.714342 0.65602837 0.69846275] # scores = model_selection.cross_val_score(clf, train_vect, train_df["target"], cv=3, scoring="f1") # [0.59878251 0.53089737 0.60949464] print(scores) # fit the train datapredict labels for test tweets clf.fit(train_vect, train_df["target"]) sample_submission = pd.read_csv("dataset/sample_submission.csv")
def getEngines(self):
    return [(svm.LinearSVC(), 'linear-svm'),
            (svm.SVC(kernel='rbf'), 'rbf-svm'),
            (ensemble.RandomForestClassifier(), 'random-forest'),
            (linear_model.RidgeClassifier(alpha=2.0), 'ridge-regression')]