def predict():
    """Flask endpoint: score incoming Titanic passenger records.

    Reads a JSON payload from the request, rebuilds the training-time
    preprocessing (drop ids, impute, align category levels, one-hot encode,
    scale) using the persisted model objects, and returns the model's
    prediction as JSON.

    Returns:
        flask.Response: JSON body ``{"prediction": "<label>"}``.
    """
    data = flask.request.json
    print("in service")
    print(data)
    titanic_test = pd.DataFrame(data)

    # NOTE(review): `dir` shadows the builtin — presumably a module-level
    # path variable set elsewhere in this file; confirm.
    model_objects = joblib.load(os.path.join(dir, 'titanic_model_1.pkl'))

    # Identifier-like columns carry no signal and were dropped at train time.
    titanic_test1 = utils.drop_features(
        titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])
    utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

    cat_features = utils.get_categorical_features(titanic_test1)
    cont_features = utils.get_continuous_features(titanic_test1)

    # Apply the imputers that were FITTED on the training data.
    titanic_test1[cat_features] = model_objects.get('cat_imputers').transform(
        titanic_test1[cat_features])
    titanic_test1[cont_features] = model_objects.get('cont_imputers').transform(
        titanic_test1[cont_features])
    utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

    # Make sure every training-time category level exists so the one-hot
    # columns line up with the training matrix.
    # BUG FIX: `cat.add_categories` raises ValueError when a level is already
    # present (which it usually is after cast_to_cat) — add only the levels
    # that are actually missing.
    expected_levels = {
        'Sex': ['male', 'female'],
        'Pclass': [1, 2, 3],
        'Embarked': ['S', 'Q', 'C'],
    }
    for col, levels in expected_levels.items():
        missing = [lv for lv in levels
                   if lv not in titanic_test1[col].cat.categories]
        if missing:
            titanic_test1[col] = titanic_test1[col].cat.add_categories(missing)

    titanic_test2 = utils.ohe(titanic_test1, cat_features)
    print(titanic_test2.shape)

    X_test = model_objects.get('scaler').transform(titanic_test2)
    result = model_objects.get('estimator').predict(X_test)
    print(result)

    # BUG FIX: previously returned the hard-coded constant "1" regardless of
    # the model output.  Return the actual predicted label for the (first)
    # submitted record instead.
    return flask.jsonify(prediction=str(result[0]))
# House-price preprocessing fragment: drop sparse features, impute, one-hot
# encode, scale, then split back out the training rows and inspect the
# target distribution before model selection.
# NOTE(review): this chunk starts and ends mid-script; `features_to_drop`,
# `house`, `house_train`, `log_rmse` and the helper functions are defined
# elsewhere in the file — confirm against the full source.

# Also drop any feature missing in more than 25% of rows.
features_to_drop.extend(get_features_to_drop_on_missingdata(house, 0.25))
house1 = drop_features(house, features_to_drop)
house1.info()

# Impute categorical and continuous columns separately with imputers fitted
# on the combined frame.
imputable_cat_features = get_categorical_features(house1)
cat_imputer = get_categorical_imputers(house1, imputable_cat_features)
house1[imputable_cat_features] = cat_imputer.transform(
    house1[imputable_cat_features])
imputable_cont_features = get_continuous_features(house1)
cont_imputer = get_continuous_imputers(house1, imputable_cont_features)
house1[imputable_cont_features] = cont_imputer.transform(
    house1[imputable_cont_features])
house1.info()

# One-hot encode, scale, and restore a DataFrame (transform returns ndarray).
house2 = ohe(house1, imputable_cat_features)
scaler = get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

# First house_train.shape[0] rows are the labelled training portion —
# presumably `house` was built by concatenating train and test; verify.
X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']

# Visual check: SalePrice is right-skewed; log1p makes it closer to normal.
sns.distplot(y_train, hist=True)
y_trans = np.log1p(y_train)
sns.distplot(y_trans, hist=True)

# greater_is_better=False because log_rmse is an error (lower is better).
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

#union of 3 feature selectors
lasso_estimator = linear_model.Lasso()
# Titanic SVM training fragment: impute, one-hot encode, scale, then
# grid-search an RBF-kernel SVC and load the test set for scoring.
# NOTE(review): `titanic_train1`, `titanic_train`, `cat_features`, `utils`,
# `cutils` and `dir` are defined earlier in the file — confirm.

cont_features = utils.get_continuous_features(titanic_train1)
print(cont_features)

#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(
    titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(
    titanic_train1[cont_features])

#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])

#one hot encoding
titanic_train2 = utils.ohe(titanic_train1, cat_features)

# SVMs are distance-based, so standardize features before fitting.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(titanic_train2)
y_train = titanic_train['Survived']

# Grid-search kernel width (gamma) and regularization strength (C).
kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {
    'gamma': [0.001, 0.01, 0.1, 1],
    'C': [0.001, 0.01, 1, 10, 100]
}
svm_final_estimator = cutils.grid_search_best_model(kernel_svm_estimator,
                                                    kernel_svm_grid, X_train,
                                                    y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
# Titanic random-forest fragment: preprocess the combined train+test frame
# (impute, one-hot encode, scale), split out the training rows, and start a
# random-forest grid search.
# NOTE(review): `titanic` here presumably concatenates train and test —
# slicing by titanic_train.shape[0] below relies on that; verify.

utils.cast_to_cat(
    titanic, ['Sex', 'Pclass', 'Embarked', 'Title', 'FamilyGroup', 'Cabin'])
cat_features = utils.get_categorical_features(titanic)
print(cat_features)
cont_features = utils.get_continuous_features(titanic)
print(cont_features)

#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

#one hot encoding
titanic = utils.ohe(titanic, cat_features)

#scale the data
# transform returns an ndarray; rebuild the DataFrame to keep column names.
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

# First titanic_train.shape[0] rows are the labelled training portion.
titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

# Grid-search tree depth and ensemble size.
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
# NOTE(review): statement continues beyond this chunk — remaining arguments
# are outside the visible source.
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
# House-price Lasso feature-selection fragment: drop sparse features,
# impute, one-hot encode, scale, then fit a Lasso whose zeroed coefficients
# identify droppable features.
# NOTE(review): `features_to_drop`, `missing_features_above_th`, `house`,
# `house_train` and `utils` come from earlier in the file — confirm.

features_to_drop.extend(missing_features_above_th)
house1 = utils.drop_features(house, features_to_drop)
house1.info()

# Impute categorical and continuous columns separately.
imputable_cat_features = utils.get_non_continuous_features(house1)
cat_imputer = utils.get_categorical_imputers(house1, imputable_cat_features)
house1[imputable_cat_features] = cat_imputer.transform(
    house1[imputable_cat_features])
imputable_cont_features = utils.get_continuous_features(house1)
cont_imputer = utils.get_continuous_imputers(house1, imputable_cont_features)
house1[imputable_cont_features] = cont_imputer.transform(
    house1[imputable_cont_features])
house1.info()

# One-hot encode, scale, and restore a DataFrame (transform returns ndarray).
house2 = utils.ohe(house1, imputable_cat_features)
scaler = utils.get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

# First house_train.shape[0] rows are the labelled training portion.
X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']

# L1 regularization drives uninformative coefficients to zero; keep only
# the features the Lasso considers important.
lasso_selector = linear_model.Lasso()
lasso_selector.fit(X_train, y_train)
print(lasso_selector.coef_)
utils.plot_feature_importances(lasso_selector, X_train, 40)
X_train1 = utils.select_features(lasso_selector, X_train)
# Titanic wrapper feature-selection fragment: impute, one-hot encode, then
# run recursive feature elimination (RFE) with three different estimators.
# NOTE(review): each RFE result overwrites X_train1 — only the LinearSVC
# selection survives; the first two fits appear to be exploratory.

cat_features = utils.get_categorical_features(titanic_train1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_train1)
print(cont_features)

#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(
    titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(
    titanic_train1[cont_features])

#one hot encoding
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#wrapper feature selectors
# Positional args are (estimator, n_features_to_select=30, step=10) —
# NOTE(review): positional use is deprecated in newer scikit-learn; confirm
# the pinned version.
rf_estimator = ensemble.RandomForestClassifier()
wrapper_selector = feature_selection.RFE(rf_estimator, 30, 10)
X_train1 = wrapper_selector.fit_transform(X_train, y_train)

gb_estimator = ensemble.GradientBoostingClassifier()
wrapper_selector = feature_selection.RFE(gb_estimator, 30, 10)
X_train1 = wrapper_selector.fit_transform(X_train, y_train)

svm_estimator = svm.LinearSVC()
wrapper_selector = feature_selection.RFE(svm_estimator, 30, 10)
X_train1 = wrapper_selector.fit_transform(X_train, y_train)
#If however we want the output of the mapper to be a dataframe, we can do so using the parameter df_out titanic_train1[cat_features] = cat_imputers.transform( titanic_train1[cat_features]) con_imputers = cutils.get_continuous_imputers(titanic_train1, cont_features) titanic_train1[cont_features] = con_imputers.transform( titanic_train1[cont_features]) #adding new levels #titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5]) #one -hot encoding #One hot encoding is used when there exists no ordinal relationship in column #Ordinal variables are variables that are categorized in an ordered format, so that the different categories can be ranked #from smallest to largest or from less to more on a particular characteristic X_train = cutils.ohe(titanic_train1, cat_features) #get_dummies Convert categorical variable into dummy/indicator variables #A dummy variable (aka, an indicator variable) is a numeric variable that represents categorical data, such as gender, race, political affiliation, etc. y_train = titanic_train['Survived'] #build model knn_pipelines_stages = [('scaler', preprocessing.StandardScaler()), ('knn', neighbors.KNeighborsClassifier())] knn_pipeline = pipeline.Pipeline(knn_pipelines_stages) knn_pipeline_grid = {'knn__n_neighbors': list(range(1, 10))} knn_pipeline_model = cutils.grid_search_best_model(knn_pipeline, knn_pipeline_grid, X_train, y_train) titanic_test = pd.read_csv(os.path.join(dir, 'test.csv')) titanic_test1 = cutils.drop_features(