# Logistic regression on synthetic 2-d data, evaluated with soft
# (probability-based) metrics; a second, imbalanced experiment starts below.
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2, weights=[0.8, 0.2], class_sep=1.0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)
lr_estimator = linear_model.LogisticRegression()
# Penalty toggles L1/L2 regularization; C mixes several magnitudes.
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
# roc_auc selects by probability ranking rather than hard 0/1 predictions.
final_estimator = cutils.grid_search_best_model(lr_estimator, lr_grid, X_train, y_train, scoring='roc_auc')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
final_estimator.predict_proba(X_test)  # NOTE(review): return value is discarded
cutils.performance_metrics_soft_binary_classification(final_estimator, X_test, y_test)
#imbalanced binary classification
# NOTE(review): the call below is truncated in this chunk; its remaining
# keyword arguments continue outside the visible source.
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2,
# Make the shared classification helpers importable, then run two demos:
# a binary and a 4-class logistic-regression fit on synthetic data.
sys.path.append("E:/New Folder/utils")
import classification_utils as cutils
from sklearn import model_selection, linear_model, dummy

#binary classification
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=2,
    weights=[0.4, 0.6], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

# Tune regularization type and strength, selecting by accuracy.
lr_estimator = linear_model.LogisticRegression()
lr_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1],
}
final_estimator = cutils.grid_search_best_model(
    lr_estimator, lr_grid, X_train, y_train, scoring='accuracy')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_binary_classification(final_estimator, X_eval, y_eval)

#multi class classification
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=4,
    weights=[0.3, 0.3, 0.2, 0.2], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)
lr_estimator = linear_model.LogisticRegression()
lr_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1],
}
# Titanic: impute, one-hot encode, scale, select features with a random
# forest, fit an RBF SVM on the reduced features, and write a submission.
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

#one hot encoding
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(titanic)
titanic = pd.DataFrame(scaled_values, columns=titanic.columns)

# Train rows come first in the combined frame; split them back out.
titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

#feature selection
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100)),
}
rf_final_estimator = cutils.grid_search_best_model(
    rf_estimator, rf_grid, titanic_train1, y_train)
# Keep features whose importance is at or above the median importance.
X_train = utils.select_features(rf_final_estimator, titanic_train1, threshold='median')

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {
    'gamma': [0.001, 0.01, 0.05, 0.1, 1],
    'C': [10, 100],
}
svm_final_estimator = cutils.grid_search_best_model(
    kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

# Apply the identical feature selection to the test rows before predicting.
titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1, threshold='median')
titanic_test['Survived'] = svm_final_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(dir, 'submission.csv'),
                    columns=['PassengerId', 'Survived'], index=False)
# Compare cross-validation vs training score for a kNN model, show an
# underfitted/overfitted variant at k=1, then tune k with grid search.
cv_scores = model_selection.cross_val_score(knn_estimator, X_train, y_train, cv=10)
print(np.mean(cv_scores))
train_score = knn_estimator.score(X_train, y_train)
print(train_score)
cutils.plot_model_2d_classification(knn_estimator, X_train, y_train)

#underfitted learning in knn
# k=1 memorizes the training set: high train score, lower CV score.
knn_estimator = neighbors.KNeighborsClassifier(n_neighbors=1)
knn_estimator.fit(X_train, y_train)
cv_scores = model_selection.cross_val_score(knn_estimator, X_train, y_train, cv=10)
print(np.mean(cv_scores))
train_score = knn_estimator.score(X_train, y_train)
print(train_score)
cutils.plot_model_2d_classification(knn_estimator, X_train, y_train)

#grid search tuning over the number of neighbors
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {'n_neighbors': list(range(1, 200))}
cutils.grid_search_plot_one_parameter_curves(knn_estimator, knn_grid, X_train, y_train)
cutils.grid_search_plot_models_classification(knn_estimator, knn_grid, X_train, y_train)
final_estimator = cutils.grid_search_best_model(knn_estimator, knn_grid, X_train, y_train)
# Kaggle-style workflow: load train data, build an impute -> variance-filter
# -> k-best -> logistic-regression pipeline, tune it by ROC AUC, then score
# the test set and write a probability submission.
train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(train.info())
print(train.columns)
sns.countplot(x='target', data=train)

#filter unique value features
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    train1, y, test_size=0.1, random_state=1)

# NOTE(review): preprocessing.Imputer is the pre-0.22 sklearn API — this
# script presumably targets an older sklearn; confirm before upgrading.
stages = [
    ('imputer', preprocessing.Imputer()),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector', feature_selection.SelectKBest(score_func=feature_selection.f_classif)),
    ('classifier', linear_model.LogisticRegression()),
]
pipeline_ml = pipeline.Pipeline(stages)
pipeline_grid = {
    'feature_selector__k': [70, 75, 100],
    'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__class_weight': ['balanced', None],
}
pipeline_generated = cutils.grid_search_best_model(
    pipeline_ml, pipeline_grid, X_train, y_train, scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))

test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(test.info())
print(test.columns)
test1 = test.iloc[:, 1:]
# Submit the positive-class probability, rounded to two decimals.
test['target'] = np.round(pipeline_generated.predict_proba(test1)[:, 1], 2)
test.to_csv(os.path.join(dir, 'submission.csv'), columns=['id', 'target'], index=False)
# Heavily imbalanced (5%/95%), barely separable data: tune logistic
# regression by F1 and compare against a majority-class dummy baseline.
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=2,
    weights=[0.05, 0.95], class_sep=0.1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

lr_estimator = linear_model.LogisticRegression()
lr_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1],
}
# f1 scoring: accuracy is misleading at this class ratio.
final_estimator = cutils.grid_search_best_model(
    lr_estimator, lr_grid, X_train, y_train, scoring='f1')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_binary_classification(final_estimator, X_test, y_test)

# Baseline: always predict the most frequent class.
dummy_estimator = dummy.DummyClassifier(strategy='most_frequent')
dummy_estimator.fit(X_train, y_train)
dummy_estimator.predict(X_test)
cutils.performance_metrics_hard_binary_classification(dummy_estimator, X_test, y_test)
# Nonlinear classification via explicit polynomial feature expansion feeding
# a linear model (perceptron, then logistic regression).
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#perceptron algorithm
stages = [('features', preprocessing.PolynomialFeatures()),
          ('clf', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(stages)
# BUG FIX: PolynomialFeatures has no 'gamma' hyperparameter (that belongs to
# RBF kernel transformers) — GridSearchCV raised "Invalid parameter gamma".
# Tune the expansion 'degree' instead, matching the sibling scripts.
perceptron_pipeline_grid = {'features__degree': [2, 3, 4]}
pipeline_object = cutils.grid_search_best_model(perceptron_pipeline,
                                                perceptron_pipeline_grid,
                                                X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)

#logistic regression algorithm
stages = [('features', preprocessing.PolynomialFeatures()),
          ('clf', linear_model.LogisticRegression())]
lr_pipeline = pipeline.Pipeline(stages)
# Same fix as above: search over polynomial degree, not gamma.
lr_pipeline_grid = {'features__degree': [2, 3, 4]}
pipeline_object = cutils.grid_search_best_model(lr_pipeline,
                                                lr_pipeline_grid,
                                                X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
# Titanic: encode and scale, select features via a tuned random forest,
# then hand the reduced training set to TPOT's automated model search.
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(titanic)
titanic = pd.DataFrame(scaled_values, columns=titanic.columns)

titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100)),
}
rf_final_estimator = cutils.grid_search_best_model(
    rf_estimator, rf_grid, titanic_train1, y_train)
# Keep features with importance at or above the mean importance.
X_train = utils.select_features(rf_final_estimator, titanic_train1, threshold='mean')

# TPOT genetic search; early_stop halts after 3 stagnant generations and
# intermediate pipelines are checkpointed to disk.
tpot_estimator = tpot.TPOTClassifier(
    generations=10,
    population_size=40,
    verbosity=2,
    early_stop=3,
    random_state=100,
    cv=5,
    scoring='accuracy',
    periodic_checkpoint_folder='E:/checkpoint')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
import classification_utils as cutils
from sklearn import model_selection, ensemble, tree
import xgboost as xgb

# Boosting comparison — AdaBoost vs gradient boosting vs XGBoost — on a
# nonlinear synthetic 2-d classification problem.
X_train, y_train = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=500, noise=0.25)
cutils.plot_data_2d_classification(X_train, y_train)

# AdaBoost over shallow trees; base_estimator__ routes max_depth to them.
dt_estimator = tree.DecisionTreeClassifier()
ada_estimator = ensemble.AdaBoostClassifier(dt_estimator)
ada_grid = {
    'base_estimator__max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 150, 30)),
    'learning_rate': [0.1, 0.2, 0.5, 1.0]
}
final_estimator = cutils.grid_search_best_model(ada_estimator, ada_grid, X_train, y_train)

gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 150, 30)),
    'learning_rate': [0.1, 0.2, 0.5, 1.0]
}
final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid, X_train, y_train)

xgb_estimator = xgb.XGBClassifier()
# NOTE(review): this grid is truncated in the visible source and continues
# outside this chunk.
xgb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 150, 30)),
    'learning_rate': [0.1, 0.2, 0.5, 1.0],
# NOTE(review): the first line below is the tail of a cross_val_score call
# whose opening continues from outside this chunk.
y_train, cv=10)
print(np.mean(cv_scores))
train_score = dt_estimator.score(X_train, y_train)
print(train_score)

#visualize the decision tree: export graphviz dot text, render to PDF
X_df = pd.DataFrame(X_train, columns=['X0', 'X1'])
dot_data = io.StringIO()
tree.export_graphviz(dt_estimator, out_file=dot_data, feature_names=X_df.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
dir = 'E:/'  # NOTE(review): shadows the builtin `dir`
graph.write_pdf(os.path.join(dir, "tree.pdf"))

# Tune max_depth with grid search and plot the parameter curve / models.
dt_estimator = tree.DecisionTreeClassifier()
dt_grid = {'max_depth': list(range(1, 9))}
cutils.grid_search_plot_one_parameter_curves(dt_estimator, dt_grid, X_train, y_train)
cutils.grid_search_plot_models_classification(dt_estimator, dt_grid, X_train, y_train)
final_estimator = cutils.grid_search_best_model(dt_estimator, dt_grid, X_train, y_train)
# NOTE(review): bare attribute reference below is a no-op (nothing is called).
cutils.generate_linear_synthetic_data_classification
dt_grid_estimator = model_selection.GridSearchCV(dt_estimator, dt_grid, cv=10, refit=True)
dt_grid_estimator.fit(X_train, y_train)
# Gaussian naive Bayes on synthetic 2-d data, with per-feature density plots.
import classification_utils as cutils
from sklearn import model_selection, naive_bayes, preprocessing
import seaborn as sns

#2-d classification pattern
# The nonlinear dataset immediately overwrites the linear one — only the
# second generator's output is actually used downstream.
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000, n_features=2, n_classes=2,
    weights=[0.5, 0.5], class_sep=2)
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

# Marginal density of each feature (GaussianNB assumes per-class normality).
sns.distplot(X_train[:, 0], hist=False)
sns.distplot(X_train[:, 1], hist=False)

#grid search for parameter values — GaussianNB has essentially nothing to
# tune, so the grid is a single default setting.
gnb_estimator = naive_bayes.GaussianNB()
gnb_grid = {'priors': [None]}
final_estimator = cutils.grid_search_best_model(gnb_estimator, gnb_grid, X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
final_estimator.predict_proba(X_test)
# Perceptron on nonlinear data made linearly separable by a polynomial
# feature expansion inside a pipeline.
import sys
sys.path.append("E:/New Folder/utils")
import classification_utils as cutils
from sklearn import preprocessing, linear_model, pipeline

# The second generator's output overwrites the first; only dataset 3 is used.
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)

pipeline_stages = [('features', preprocessing.PolynomialFeatures()),
                   ('perceptron', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(pipeline_stages)
# Jointly tune the regularization and the expansion degree; the step-name
# double-underscore prefix routes each key to its pipeline stage.
perceptron_pipeline_grid = {
    'perceptron__penalty': ['l1'],
    'perceptron__alpha': [0, 0.1, 0.3, 0.5],
    'features__degree': [2, 3],
}
pipeline_object = cutils.grid_search_best_model(
    perceptron_pipeline, perceptron_pipeline_grid, X, y)
final_estimator = pipeline_object.named_steps['perceptron']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X, y)
# NOTE(review): the first line below closes an imputer transform call whose
# opening continues from outside this chunk.
titanic_train1[cont_features])
#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])
#one hot encoding
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#build model: scale then kNN, tuning k inside the pipeline
knn_pipeline_stages = [('scaler', preprocessing.StandardScaler()),
                       ('knn', neighbors.KNeighborsClassifier())]
knn_pipeline = pipeline.Pipeline(knn_pipeline_stages)
knn_pipeline_grid = {'knn__n_neighbors': list(range(1, 10))}
knn_pipeline_model = cutils.grid_search_best_model(knn_pipeline, knn_pipeline_grid, X_train, y_train)

# Load and preprocess the test split the same way as training data.
titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test1 = utils.drop_features(titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])
utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])
cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
# NOTE(review): the first line below closes a data-generation call whose
# opening continues from outside this chunk.
class_sep=2)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#Perceptron algorithm
perceptron_estimator = linear_model.Perceptron()
perceptron_grid = {
    'penalty': ['l1', 'l2'],
    'alpha': [0, 0.01, 0.02, 0.1, 0.3, 0.5, 0.7, 1]
}
final_estimator = cutils.grid_search_best_model(perceptron_estimator, perceptron_grid, X_train, y_train)
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

#predict distances and classes for test data
print(final_estimator.decision_function(X_test))
print(final_estimator.predict(X_test))

#logistic regression algorithm
lr_estimator = linear_model.LogisticRegression()
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = cutils.grid_search_best_model(lr_estimator, lr_grid, X_train, y_train)
print(final_estimator.intercept_)
# Linear classifiers made nonlinear via an explicit RBF kernel feature
# transform step in a pipeline (perceptron, then logistic regression).
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#perceptron algorithm
perceptron_stages = [('features', kutils.KernelTransformer('rbf')),
                     ('clf', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(perceptron_stages)
# Tune the RBF width of the kernel transform step.
perceptron_pipeline_grid = {'features__gamma': [0.1, 0.01, 0.2]}
pipeline_object = cutils.grid_search_best_model(
    perceptron_pipeline, perceptron_pipeline_grid, X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)

#logistic regression algorithm
lr_stages = [('features', kutils.KernelTransformer('rbf')),
             ('clf', linear_model.LogisticRegression())]
lr_pipeline = pipeline.Pipeline(lr_stages)
lr_pipeline_grid = {'features__gamma': [0.1, 1, 5, 10]}
pipeline_object = cutils.grid_search_best_model(
    lr_pipeline, lr_pipeline_grid, X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
# NOTE(review): the first line below is the tail of a data-generation call
# whose opening continues from outside this chunk.
n_features=2, n_classes=2, weights=[0.5, 0.5], class_sep=2)
# This nonlinear dataset overwrites the linear one generated above.
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#grid search for parameter values — decision tree
dt_estimator = tree.DecisionTreeClassifier()
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(1, 9))}
final_estimator = cutils.grid_search_best_model(dt_estimator, dt_grid, X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

# k-nearest neighbors
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}
final_estimator = cutils.grid_search_best_model(knn_estimator, knn_grid, X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

# random forest — NOTE(review): this grid dict is truncated in the visible
# source and continues outside this chunk.
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {'max_depth': list(range(5, 10)), 'n_estimators': list(range(1, 100, 20))
# NOTE(review): the first line below closes an imputer transform call whose
# opening continues from outside this chunk.
titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(titanic_train1[cont_features])

#one hot encoding
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#embedded feature selectors: tune a model, then keep features whose
# importance is at or above the mean importance (SelectFromModel).
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, X_train, y_train)
# prefit=True: reuse the already-fitted forest instead of refitting.
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator, prefit=True, threshold='mean')
X_train1 = embedded_selector.transform(X_train)
utils.plot_feature_importances(rf_final_estimator, X_train)

gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 300, 100)),
    'learning_rate': [0.001, 0.1, 1.0]
}
gb_final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid, X_train, y_train)
# NOTE(review): the call below is truncated in this chunk; its remaining
# arguments continue outside the visible source.
embedded_selector = feature_selection.SelectFromModel(gb_final_estimator,
#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(titanic_train1[cont_features])

#one hot encoding
titanic_train2 = utils.ohe(titanic_train1, cat_features)

# Standardize all columns, then fit several model families for comparison.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(titanic_train2)
y_train = titanic_train['Survived']

#build model: k-nearest neighbors
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {'n_neighbors': list(range(1, 10))}
knn_final_estimator = cutils.grid_search_best_model(knn_estimator, knn_grid, X_train, y_train)

# RBF support vector machine
kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {
    'gamma': [0.001, 0.01, 0.1, 1],
    'C': [0.001, 0.01, 1, 10, 100],
}
svm_final_estimator = cutils.grid_search_best_model(
    kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

# Logistic regression over explicit RBF kernel features
lr_stages = [('features', kutils.KernelTransformer('rbf')),
             ('clf', linear_model.LogisticRegression())]
lr_pipeline = pipeline.Pipeline(lr_stages)
lr_pipeline_grid = {
    'features__gamma': [0.001, 0.01, 0.1, 1],
    'clf__C': [0.001, 0.01, 1, 10, 100],
}
lr_final_pipeline_estimator = cutils.grid_search_best_model(
    lr_pipeline, lr_pipeline_grid, X_train, y_train)

# Random forest
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(5, 10)),
    'n_estimators': list(range(1, 500, 100)),
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, X_train, y_train)
# NOTE(review): the lines below complete a (name, estimator) list whose
# opening bracket continues from outside this chunk.
('gb', gb_estimator),
    ('knn', knn_estimator),
    ('rf', rf_estimator),
    ('svm', kernel_svm_estimator),
    ('lr', lr_pipeline)
]
# Hard-voting ensemble over the five base models; grid keys are routed to
# each member via its name prefix (double underscore).
hvoting_estimator = ensemble.VotingClassifier(estimators)
hvoting_grid = {
    'gb__max_depth': [2],
    'gb__n_estimators': list(range(300, 500, 100)),
    'gb__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'knn__n_neighbors': list(range(6, 10)),
    'rf__max_depth': list(range(6, 8)),
    'rf__n_estimators': list(range(200, 400, 100)),
    'lr__features__gamma': [0.001, 0.01],
    'lr__clf__C': [1, 10],
    'svm__gamma': [0.001, 0.01],
    'svm__C': [0.001, 0.01, 1, 10]
}
voting_final_estimator = cutils.grid_search_best_model(hvoting_estimator, hvoting_grid, X_train, y_train)

# Load and preprocess the test split the same way as training data.
titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test1 = utils.drop_features(titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])
utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])
cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
print(cont_features)
#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])
#one hot encoding
titanic_train2 = utils.ohe(titanic_train1, cat_features)

# Standardize, then tune an RBF SVM on the encoded training data.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(titanic_train2)
y_train = titanic_train['Survived']

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {
    'gamma': [0.001, 0.01, 0.1, 1],
    'C': [0.001, 0.01, 1, 10, 100],
}
svm_final_estimator = cutils.grid_search_best_model(
    kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

# Load and preprocess the test split the same way as training data.
titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test1 = utils.drop_features(titanic_test,
                                    ['PassengerId', 'Name', 'Ticket', 'Cabin'])
utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])
cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
# Two-stage stacking: five base (stage-1) models feed a logistic-regression
# meta-classifier; mlxtend names grid keys after each estimator's class.
stage1_estimators = [
    gb_estimator,
    knn_estimator,
    rf_estimator,
    kernel_svm_estimator,
    lr_pipeline
]
stage2_estimator = linear_model.LogisticRegression()
stacking_estimator = mlxtnd.StackingClassifier(stage1_estimators, stage2_estimator)

stacking_grid = {
    'gradientboostingclassifier__max_depth': [2],
    'gradientboostingclassifier__n_estimators': list(range(300, 500, 100)),
    'gradientboostingclassifier__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'kneighborsclassifier__n_neighbors': list(range(6, 10)),
    'randomforestclassifier__max_depth': list(range(6, 8)),
    'randomforestclassifier__n_estimators': list(range(200, 400, 100)),
    'svc__gamma': [0.001, 0.01],
    'svc__C': [0.001, 0.01, 1, 10],
    # meta_classifier__ routes to the stage-2 logistic regression.
    'meta_classifier__C': [0.1, 10.0]
}
stacking_final_estimator = cutils.grid_search_best_model(
    stacking_estimator, stacking_grid, X_train, y_train)

# Load and preprocess the test split the same way as training data.
titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test1 = utils.drop_features(titanic_test,
                                    ['PassengerId', 'Name', 'Ticket', 'Cabin'])
utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])
cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
print(cont_features)