# Tail of a preprocessing+model Pipeline definition -- the Pipeline([...
# call opens on an earlier line outside this chunk: lasso-based feature
# selection, then PCA, then a KNN regressor as the final estimator.
('feature_selector', feature_selection.SelectFromModel(linear_model.Lasso())),
('pca', decomposition.PCA()),
('regressor', neighbors.KNeighborsRegressor())
])
# Hyper-parameter grid; '__' paths address nested pipeline steps.
# PCA n_components in (0, 1) means "retain this fraction of variance".
pipeline_grid = {
    'preprocess__num__imputer__strategy': ['mean', 'median'],
    'pca__n_components': [0.90, 0.95],
    'regressor__n_neighbors': list(range(5, 15))
}
#build model with pipeline
# greater_is_better=False: log_rmse is an error, so grid search negates it.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
pipeline_generated = utils.grid_search_best_model(complete_pipeline, pipeline_grid, house_train1, house_train['SalePrice'], scoring=scoring)
#read test data
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()
# Placeholder target column so train and test frames share the same schema.
house_test['SalePrice'] = None
#apply preprocessing required before pipeline
utils.cast_to_cat(house_test, features_to_cast)
house_test1 = utils.drop_features(house_test, features_to_drop)
house_test1.info()
#get predictions on test data with constructed pipeline
# Nonlinear 2-D classification dataset. The second generator call overwrites
# the first (kept from the original as a quick way to switch patterns).
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#perceptron algorithm
# Polynomial feature expansion lets the linear perceptron fit the
# nonlinear decision boundary.
stages = [
    ('features', preprocessing.PolynomialFeatures()),
    ('clf', linear_model.Perceptron(max_iter=1000))
]
perceptron_pipeline = pipeline.Pipeline(stages)
# BUG FIX: PolynomialFeatures has no 'gamma' parameter, so grid-searching
# 'features__gamma' raises ValueError in GridSearchCV. The tunable knob is
# the polynomial degree.
perceptron_pipeline_grid = {'features__degree': [2, 3]}
pipeline_object = comutils.grid_search_best_model(perceptron_pipeline, perceptron_pipeline_grid, X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)

#logistic regression algorithm
stages = [
    ('features', preprocessing.PolynomialFeatures()),
    ('clf', linear_model.LogisticRegression())
]
lr_pipeline = pipeline.Pipeline(stages)
# Same fix as above: tune the polynomial degree, not a nonexistent 'gamma'.
lr_pipeline_grid = {'features__degree': [2, 3]}
pipeline_object = comutils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False) #linear pattern in 2d X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200, n_features=1) X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.1, random_state=1) rutils.plot_data_2d_regression(X_train, y_train) lr_pipeline = pipeline.Pipeline([('features', preprocessing.PolynomialFeatures()), ('reg', linear_model.LinearRegression())]) lr_pipeline_grid = {'features__degree': [2, 3, 5, 10]} pipeline_object = utils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train, scoring=scoring) final_linear_model = pipeline_object.named_steps['reg'] print(final_linear_model.coef_) print(final_linear_model.intercept_) rutils.plot_model_2d_regression(pipeline_object, X_train, y_train) rutils.regression_performance(pipeline_object, X_test, y_test) lr_pipeline = pipeline.Pipeline([('features', kutils.GaussianFeatures()), ('reg', linear_model.LinearRegression())]) lr_pipeline_grid = {'features__n_centres': [15, 20, 30, 36]} pipeline_object = utils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train,
# Project the lasso-selected training features onto the fitted PCA basis.
pca_data = lpca.transform(X_train1)
print(pca_data.shape)

# Visualise the PCA representation via a 2-D t-SNE embedding.
tsne = manifold.TSNE(n_components=2)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

# Negated log-RMSE scorer (grid search maximises, so errors are negated).
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

# The target is right-skewed: model log1p(price) and invert with expm1 later.
sns.distplot(y_train)
y_trans = np.log1p(y_train)
sns.distplot(y_trans)

# Tune a KNN regressor over the PCA features.
knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': list(range(5, 15))}
final_model = utils.grid_search_best_model(knn_estimator, knn_grid,
                                           pca_data, y_trans,
                                           scoring=scoring)

# Test rows sit after the training rows in the combined frame house3;
# apply the identical selection + PCA transforms.
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

# Undo the log1p transform and write the submission file.
house_test['SalePrice'] = np.expm1(final_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                  columns=["Id", "SalePrice"], index=False)
# Hold out 10% of the data for evaluation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)
# Corrupt every 10th sample to demonstrate sensitivity to outliers.
#add outliers in features
X_train[::10] = 4
#add outliers in target
y_train[::10] = 250
rutils.plot_data_2d_regression(X_train, y_train)
# Fit linear model
# Ordinary least squares: strongly pulled toward the injected outliers.
lr_estimator = linear_model.LinearRegression()
lr_grid = {'normalize': [True, False]}
lr_model = utils.grid_search_best_model(lr_estimator, lr_grid,
                                        X_train, y_train,
                                        scoring=scoring)
rutils.plot_model_2d_regression(lr_model, X_train, y_train,
                                title="LinearRegression")
rutils.regression_performance(lr_model, X_test, y_test)
# Robustly fit linear model with Huber Regressor algorithm
# Huber loss down-weights large residuals; epsilon controls how early the
# loss switches from squared to linear.
# NOTE(review): this call continues past the end of this chunk.
hr_estimator = linear_model.HuberRegressor()
hr_grid = {'epsilon': [1.1, 1.2, 1.3, 1.5]}
hr_model = utils.grid_search_best_model(hr_estimator, hr_grid,
                                        X_train, y_train,
# Linearly separable 2-class data, mildly imbalanced (40/60).
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=2,
    weights=[0.4, 0.6], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

# Logistic regression: search regularisation type and inverse strength C.
lr_estimator = linear_model.LogisticRegression()
lr_grid = {'penalty': ['l1', 'l2'],
           'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = comutils.grid_search_best_model(lr_estimator, lr_grid,
                                                  X_train, y_train,
                                                  scoring='accuracy')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_binary_classification(final_estimator,
                                                      X_eval, y_eval)

#multi class classification
# Same workflow on a 4-class problem.
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=4,
    weights=[0.3, 0.3, 0.2, 0.2], class_sep=1.5)
import common_utils as comutils
import classification_utils as cutils
from sklearn import model_selection, naive_bayes, preprocessing
import seaborn as sns

#2-d classification pattern
# The nonlinear generator below overwrites the linear one, so the
# nonlinear pattern is what actually gets modelled.
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=2,
    weights=[0.5, 0.5], class_sep=2)
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

# Inspect per-feature marginal distributions.
sns.distplot(X_train[:, 0], hist=False)
sns.distplot(X_train[:, 1], hist=False)

# Gaussian naive Bayes has no real hyper-parameters; the one-point grid
# just reuses the common grid-search helper for a consistent workflow.
gnb_estimator = naive_bayes.GaussianNB()
gnb_grid = {'priors': [None]}
final_estimator = comutils.grid_search_best_model(gnb_estimator, gnb_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
final_estimator.predict_proba(X_test)
# t-SNE view of the PCA-reduced training features.
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

# Negated log-RMSE scorer for grid search.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

# Model log1p(SalePrice): the raw target is right-skewed.
sns.distplot(y_train)
y_trans = np.log1p(y_train)
sns.distplot(y_trans)

# RBF-kernel support vector regression over the PCA features.
kernel_svm = svm.SVR(kernel="rbf")
kernel_svm_grid = {
    'C': [0.2, 0.5, 10, 20, 50],
    'gamma': [0.0001, 0.0005, 0.001, 0.005]
}
final_kernel_svm_model = utils.grid_search_best_model(kernel_svm,
                                                      kernel_svm_grid,
                                                      pca_data, y_trans,
                                                      scoring=scoring)

# Test rows follow the training rows in house3; apply identical transforms.
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

# Invert the log1p transform and write the submission file.
house_test['SalePrice'] = np.expm1(
    final_kernel_svm_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                  columns=["Id", "SalePrice"], index=False)
# Negated RMSE: grid search maximises the score, so the error is negated.
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

#linear pattern in 2d
X, y = rutils.generate_linear_synthetic_data_regression(n_samples=100,
                                                        n_features=1,
                                                        n_informative=1,
                                                        noise=200)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

# Baseline: ordinary least squares.
linear_estimator = linear_model.LinearRegression()
linear_grid = {'normalize': [False]}
final_linear_model = utils.grid_search_best_model(linear_estimator,
                                                  linear_grid,
                                                  X_train, y_train,
                                                  scoring=scoring)
print(final_linear_model.coef_)
print(final_linear_model.intercept_)
rutils.plot_model_2d_regression(final_linear_model, X_train, y_train)
rutils.regression_performance(final_linear_model, X_test, y_test)

# L1-regularised fit.
# BUG FIX: alpha=0 removed from the grid. scikit-learn documents that
# Lasso with alpha=0 is numerically ill-conditioned (coordinate descent
# may not converge) and advises using LinearRegression instead -- which
# is exactly the baseline already fitted above.
lasso_estimator = linear_model.Lasso(max_iter=5000)
lasso_grid = {'alpha': [0.1, 0.5, 1.0, 10]}
final_lasso_model = utils.grid_search_best_model(lasso_estimator, lasso_grid,
                                                 X_train, y_train,
                                                 scoring=scoring)
print(final_lasso_model.coef_)
import sys

# Make the project utility modules importable.
sys.path.append("E:/utils")

import common_utils as comutils
import classification_utils as cutils
from sklearn import preprocessing, linear_model, pipeline

# The second generator call overwrites the first, so pattern 3 is modelled.
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)

# Polynomial expansion gives the linear perceptron a nonlinear boundary;
# tune regularisation and expansion degree together.
stages = [('features', preprocessing.PolynomialFeatures()),
          ('perceptron', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(stages)
perceptron_pipeline_grid = {
    'perceptron__penalty': ['l1'],
    'perceptron__alpha': [0, 0.1, 0.3, 0.5],
    'features__degree': [2, 3]
}
pipeline_object = comutils.grid_search_best_model(perceptron_pipeline,
                                                  perceptron_pipeline_grid,
                                                  X, y)
final_estimator = pipeline_object.named_steps['perceptron']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X, y)
# Keep only the lasso-selected features and inspect their correlations.
X_train1 = utils.select_features(lasso_selector, X_train)
utils.corr_heatmap(X_train1)

# PCA retaining 95% of the variance (fractional n_components).
lpca = decomposition.PCA(0.95)
lpca.fit(X_train1)
print(np.cumsum(lpca.explained_variance_ratio_))
pca_data = lpca.transform(X_train1)
print(pca_data.shape)

# 2-D t-SNE embedding for visualisation.
tsne = manifold.TSNE(n_components=2)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

# Negated log-RMSE scorer; model log1p of the skewed target.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
sns.distplot(y_train)
y_trans = np.log1p(y_train)
sns.distplot(y_trans)

# Random-forest regressor tuned over the PCA features.
rf_estimator = ensemble.RandomForestRegressor(random_state=100)
rf_grid = {'n_estimators': list(range(100, 501, 200)),
           'max_features': [8, 10, 15],
           'max_depth': [3, 5, 7]}
final_rf_model = utils.grid_search_best_model(rf_estimator, rf_grid,
                                              pca_data, y_trans,
                                              scoring=scoring)

# Transform the test rows identically and write the submission file.
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape
house_test['SalePrice'] = np.expm1(final_rf_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                  columns=["Id", "SalePrice"], index=False)
# Nonlinear single-feature regression sample.
X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200,
                                                           n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

# Random forest: tune ensemble size and tree depth.
rf_estimator = ensemble.RandomForestRegressor()
rf_grid = {'n_estimators': list(range(10, 200, 20)),
           'max_depth': list(range(3, 6))}
final_rf_model = utils.grid_search_best_model(rf_estimator, rf_grid,
                                              X_train, y_train,
                                              scoring=scoring)
rutils.plot_model_2d_regression(final_rf_model, X_train, y_train)
rutils.regression_performance(final_rf_model, X_test, y_test)

# Extra-trees: same grid, more randomised split thresholds.
et_estimator = ensemble.ExtraTreesRegressor()
et_grid = {'n_estimators': list(range(10, 200, 20)),
           'max_depth': list(range(3, 6))}
final_et_model = utils.grid_search_best_model(et_estimator, et_grid,
                                              X_train, y_train,
                                              scoring=scoring)
# Tail of a synthetic-data call that opens on an earlier line outside this
# chunk (2-class, 2-feature, balanced, well separated).
n_features=2, n_classes=2, weights=[0.5, 0.5], class_sep=2)
# The nonlinear generator below overwrites X, y with a nonlinear pattern.
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)
#grid search for parameter values
# Decision tree: tune split criterion and depth (1-8).
dt_estimator = tree.DecisionTreeClassifier()
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(1, 9))}
final_estimator = comutils.grid_search_best_model(dt_estimator, dt_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
# KNN: tune neighbourhood size and vote weighting.
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}
final_estimator = comutils.grid_search_best_model(knn_estimator, knn_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
# Random forest grid.
# NOTE(review): this dict literal continues past the end of this chunk.
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(5, 10)),
    'n_estimators': list(range(1, 100, 20))
# 20-feature binary dataset: 2 informative, 4 redundant, the rest noise.
X, y = datasets.make_classification(n_samples=100, n_features=20,
                                    n_informative=2, n_redundant=4,
                                    n_repeated=0, n_classes=2)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
# Feature-feature correlation matrix (rowvar=False: columns are variables).
np.corrcoef(X_train, rowvar=False)

#overfit
# Unregularised perceptron (alpha=0): free to overfit.
perceptron_estimator = linear_model.Perceptron(max_iter=1000)
perceptron_grid = {'alpha': [0]}
final_estimator = cutils.grid_search_best_model(perceptron_estimator,
                                                perceptron_grid,
                                                X_train, y_train)
print(final_estimator.intercept_)
print(final_estimator.coef_)

#overfit control
# Regularised perceptron: search penalty type and strength.
perceptron_estimator = linear_model.Perceptron(max_iter=1000)
perceptron_grid = {
    'penalty': ['l1', 'l2'],
    'alpha': [0, 0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 3]
}
final_estimator = cutils.grid_search_best_model(perceptron_estimator,
                                                perceptron_grid,
                                                X_train, y_train)
print(final_estimator.intercept_)
print(final_estimator.coef_)
train1, y, test_size=0.1, random_state=1) #perceptron algorithm stages = [('imputer', impute.SimpleImputer()), ('zv_filter', feature_selection.VarianceThreshold()), ('classifier', linear_model.LogisticRegression())] pipeline = pipeline.Pipeline(stages) pipeline_grid = { 'imputer__strategy': ['mean', 'median'], 'zv_filter__threshold': [0, 0.5], 'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5], 'classifier__penalty': ['l1', 'l2'] } pipeline_generated = utils.grid_search_best_model(pipeline, pipeline_grid, X_train, y_train, scoring="roc_auc") final_estimator = pipeline_generated.named_steps['classifier'] print(pipeline_generated.score(X_eval, y_eval)) test = pd.read_csv(os.path.join(dir, 'test.csv')) print(test.info()) print(test.columns) test1 = test.iloc[:, 1:] test['Survived'] = np.round(pipeline_generated.predict_proba(test1)[:, 1], 2) test.to_csv(os.path.join(dir, 'submission.csv'), columns=['PassengerId', 'Survived'], index=False)
import kernel_utils as kutils
from sklearn import metrics, linear_model, svm, model_selection, preprocessing, pipeline

# Negated RMSE scorer (grid search maximises the score).
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

# Nonlinear single-feature regression sample.
X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200,
                                                           n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

# Polynomial basis + ordinary least squares.
stages = [('features', preprocessing.PolynomialFeatures()),
          ('reg', linear_model.LinearRegression())]
lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid = {'reg__normalize': [True, False],
                    'features__degree': [2, 3, 5, 10]}
pipeline_object = utils.grid_search_best_model(lr_pipeline, lr_pipeline_grid,
                                               X_train, y_train,
                                               scoring=scoring)
final_linear_model = pipeline_object.named_steps['reg']
print(final_linear_model.coef_)
print(final_linear_model.intercept_)
rutils.plot_model_2d_regression(pipeline_object, X_train, y_train)
rutils.regression_performance(pipeline_object, X_test, y_test)

# Gaussian basis features (project transformer) + ordinary least squares.
stages = [('features', kutils.GaussianFeatures()),
          ('reg', linear_model.LinearRegression())]
lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid = {'reg__normalize': [True, False],
                    'features__n_centres': [15, 20, 30, 36]}
pipeline_object = utils.grid_search_best_model(lr_pipeline, lr_pipeline_grid,
                                               X_train, y_train,
                                               scoring=scoring)
final_linear_model = pipeline_object.named_steps['reg']
print(final_linear_model.coef_)
# Negated RMSE scorer for grid search.
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

# Nonlinear single-feature regression sample.
X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200,
                                                           n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

# Kernel ridge regression with an RBF kernel: tune ridge strength and
# kernel width.
kernel_lr = kernel_ridge.KernelRidge(kernel="rbf")
kernel_lr_grid = {'alpha': [0.0001, 0.01, 0.05, 0.2, 0.5, 1],
                  'gamma': [0.01, 0.1, 1, 2, 3, 4, 5, 10]}
final_kernel_lr_model = utils.grid_search_best_model(kernel_lr,
                                                     kernel_lr_grid,
                                                     X_train, y_train,
                                                     scoring=scoring)
rutils.plot_model_2d_regression(final_kernel_lr_model, X_train, y_train)
rutils.regression_performance(final_kernel_lr_model, X_test, y_test)

# RBF-kernel SVR on the same data for comparison.
kernel_svm = svm.SVR(kernel="rbf")
kernel_svm_grid = {'C': [0.2, 0.5, 10, 20, 50, 100],
                   'gamma': [0.01, 0.1, 1, 2, 3, 4, 5, 10]}
final_kernel_svm_model = utils.grid_search_best_model(kernel_svm,
                                                      kernel_svm_grid,
                                                      X_train, y_train,
                                                      scoring=scoring)
# One-hot encode the categorical columns. One-hot (dummy/indicator)
# variables are used here because these categories carry no ordinal
# relationship that could be mapped to a single numeric scale.
X_train = cutils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

# Scale the features, then classify with KNN; tune the neighbourhood size.
knn_pipelines_stages = [('scaler', preprocessing.StandardScaler()),
                        ('knn', neighbors.KNeighborsClassifier())]
knn_pipeline = pipeline.Pipeline(knn_pipelines_stages)
knn_pipeline_grid = {'knn__n_neighbors': list(range(1, 10))}
knn_pipeline_model = cutils.grid_search_best_model(knn_pipeline,
                                                   knn_pipeline_grid,
                                                   X_train, y_train)

# Apply the identical preprocessing to the test set.
titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
titanic_test1 = cutils.drop_features(
    titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])
cutils.cast_to_cat(titanic_test1, ['Sex', 'Embarked', 'Pclass'])
cont_features = cutils.get_continuous_features(titanic_test1)
cat_features = cutils.get_categorical_features(titanic_test1)
titanic_test1[cat_features] = cat_imputers.transform(
    titanic_test1[cat_features])
titanic_test1[cont_features] = con_imputers.transform(
    titanic_test1[cont_features])
# NOTE(review): if cutils.ohe is get_dummies-based, encoding train and test
# separately can produce mismatched columns when category sets differ --
# confirm ohe aligns columns between the two calls.
X_test = cutils.ohe(titanic_test1, cat_features)
titanic_test['Survived'] = knn_pipeline_model.predict(X_test)