Example #1
#Flask endpoint: scores incoming JSON records with the persisted Titanic model
def predict():
    data = flask.request.json
    print("in service")
    print(data)
    titanic_test = pd.DataFrame(data)
    #print(titanic_test.info()) 
    
    model_objects = joblib.load(os.path.join(dir, 'titanic_model_1.pkl'))
    
    titanic_test1 = utils.drop_features(titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])
    utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

    cat_features = utils.get_categorical_features(titanic_test1)
    #print(cat_features)
    cont_features = utils.get_continuous_features(titanic_test1)
    #print(cont_features)

    titanic_test1[cat_features] = model_objects.get('cat_imputers').transform(titanic_test1[cat_features])
    titanic_test1[cont_features] = model_objects.get('cont_imputers').transform(titanic_test1[cont_features])

    utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

    #ensure all expected levels exist so one-hot encoding yields the same
    #columns as at training time; set_categories is used because
    #add_categories raises if a level is already present in the data
    titanic_test1['Sex'] = titanic_test1['Sex'].cat.set_categories(['male', 'female'])
    titanic_test1['Pclass'] = titanic_test1['Pclass'].cat.set_categories([1, 2, 3])
    titanic_test1['Embarked'] = titanic_test1['Embarked'].cat.set_categories(['S', 'Q', 'C'])
    #print(titanic_test1.info())

    titanic_test2 = utils.ohe(titanic_test1, cat_features)
    print(titanic_test2.shape)
    X_test = model_objects.get('scaler').transform(titanic_test2)
    result = model_objects.get('estimator').predict(X_test)
    print(result)
    return flask.jsonify(prediction=result.tolist())
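#A minimal sketch (assumed, not from the original snippet) of wiring this
#handler into a Flask app; the app object, route name, and port are
#illustrative. A client could then POST a list of JSON records, e.g.
#  requests.post('http://localhost:5000/predict', json=records)
app = flask.Flask(__name__)
app.add_url_rule('/predict', 'predict', predict, methods=['POST'])
if __name__ == '__main__':
    app.run(port=5000)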
Example #2
features_to_drop.extend(get_features_to_drop_on_missingdata(house, 0.25))
house1 = drop_features(house, features_to_drop)
house1.info()

imputable_cat_features = get_categorical_features(house1)
cat_imputer = get_categorical_imputers(house1, imputable_cat_features)
house1[imputable_cat_features] = cat_imputer.transform(
    house1[imputable_cat_features])

imputable_cont_features = get_continuous_features(house1)
cont_imputer = get_continuous_imputers(house1, imputable_cont_features)
house1[imputable_cont_features] = cont_imputer.transform(
    house1[imputable_cont_features])
house1.info()

house2 = ohe(house1, imputable_cat_features)

scaler = get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']
sns.histplot(y_train, kde=True)
y_trans = np.log1p(y_train)
sns.histplot(y_trans, kde=True)
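#illustrative check (not in the original): expm1 inverts the log1p transform,
#so predictions made on the log scale can be mapped back to sale prices
assert np.allclose(np.expm1(y_trans), y_train)

#log_rmse (used below) is a project helper defined elsewhere; a plausible
#sketch of such a metric (an assumption, not the original definition):
#RMSE on log1p-scaled values, the usual house-prices evaluation metric
def log_rmse(y_true, y_pred):
    #clip negative predictions so the log transform stays well defined
    y_pred = np.maximum(y_pred, 0)
    return np.sqrt(metrics.mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))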

scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

#union of 3 feature selectors
lasso_estimator = linear_model.Lasso()
cont_features = utils.get_continuous_features(titanic_train1)
print(cont_features)

#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(
    titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(
    titanic_train1[cont_features])

#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])

#one hot encoding
titanic_train2 = utils.ohe(titanic_train1, cat_features)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(titanic_train2)
y_train = titanic_train['Survived']

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {
    'gamma': [0.001, 0.01, 0.1, 1],
    'C': [0.001, 0.01, 1, 10, 100]
}
svm_final_estimator = cutils.grid_search_best_model(kernel_svm_estimator,
                                                    kernel_svm_grid, X_train,
                                                    y_train)
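#cutils.grid_search_best_model is a project helper; a minimal sketch of what
#it plausibly does (assumed, not the project's actual code):
def grid_search_best_model_sketch(estimator, grid, X, y, cv=10):
    from sklearn import model_selection
    #exhaustive grid search with cross-validation, refit on the best params
    grid_estimator = model_selection.GridSearchCV(estimator, grid, cv=cv)
    grid_estimator.fit(X, y)
    print(grid_estimator.best_params_, grid_estimator.best_score_)
    return grid_estimator.best_estimator_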

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
Example #4
utils.cast_to_cat(
    titanic, ['Sex', 'Pclass', 'Embarked', 'Title', 'FamilyGroup', 'Cabin'])

cat_features = utils.get_categorical_features(titanic)
print(cat_features)
cont_features = utils.get_continuous_features(titanic)
print(cont_features)

#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

#one hot encoding
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   titanic_train1, y_train)
Example #5
features_to_drop.extend(missing_features_above_th)
house1 = utils.drop_features(house, features_to_drop)
house1.info()

imputable_cat_features = utils.get_non_continuous_features(house1)
cat_imputer = utils.get_categorical_imputers(house1, imputable_cat_features)
house1[imputable_cat_features] = cat_imputer.transform(
    house1[imputable_cat_features])

imputable_cont_features = utils.get_continuous_features(house1)
cont_imputer = utils.get_continuous_imputers(house1, imputable_cont_features)
house1[imputable_cont_features] = cont_imputer.transform(
    house1[imputable_cont_features])
house1.info()

house2 = utils.ohe(house1, imputable_cat_features)

scaler = utils.get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']

lasso_selector = linear_model.Lasso()
lasso_selector.fit(X_train, y_train)
print(lasso_selector.coef_)
utils.plot_feature_importances(lasso_selector, X_train, 40)

X_train1 = utils.select_features(lasso_selector, X_train)
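#utils.select_features presumably keeps the columns the fitted selector deems
#important; one way to do that (an assumption, not the project's code) is
#sklearn's SelectFromModel:
from sklearn import feature_selection
sfm = feature_selection.SelectFromModel(lasso_selector, prefit=True)
X_train1_alt = X_train.loc[:, sfm.get_support()]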
cat_features = utils.get_categorical_features(titanic_train1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_train1)
print(cont_features)

#handle missing data(imputation)
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(
    titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(
    titanic_train1[cont_features])

#one hot encoding
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#wrapper feature selectors
rf_estimator = ensemble.RandomForestClassifier()
wrapper_selector = feature_selection.RFE(rf_estimator, n_features_to_select=30, step=10)
X_train1 = wrapper_selector.fit_transform(X_train, y_train)

gb_estimator = ensemble.GradientBoostingClassifier()
wrapper_selector = feature_selection.RFE(gb_estimator, n_features_to_select=30, step=10)
X_train1 = wrapper_selector.fit_transform(X_train, y_train)

svm_estimator = svm.LinearSVC()
wrapper_selector = feature_selection.RFE(svm_estimator, n_features_to_select=30, step=10)
X_train1 = wrapper_selector.fit_transform(X_train, y_train)
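#the columns RFE kept can be inspected through its boolean support mask; an
#illustrative follow-up (not in the original):
print(X_train.columns[wrapper_selector.get_support()])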
#If we want the mapper's output to be a dataframe instead of an array, we can set the parameter df_out=True
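#the comment above refers to sklearn-pandas' DataFrameMapper; an illustrative
#sketch of df_out (assumed context, the mapper is not part of this snippet;
#the 'Age' column is used purely for illustration):
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
mapper = DataFrameMapper([(['Age'], SimpleImputer(strategy='median'))],
                         df_out=True)
age_imputed = mapper.fit_transform(titanic_train1)  #a DataFrame, not an ndarray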
titanic_train1[cat_features] = cat_imputers.transform(
    titanic_train1[cat_features])

con_imputers = cutils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = con_imputers.transform(
    titanic_train1[cont_features])

#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])

#one-hot encoding
#One-hot encoding is used when there is no ordinal relationship among a column's categories
#Ordinal variables are categorical variables whose categories have a natural order,
#so they can be ranked from smallest to largest or from less to more on a particular characteristic
X_train = cutils.ohe(titanic_train1, cat_features)
#get_dummies converts categorical variables into dummy/indicator variables
#A dummy variable (aka an indicator variable) is a numeric variable that represents categorical data, such as gender, race, or political affiliation
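#tiny illustration of get_dummies (not from the original): one 0/1 indicator
#column is created per category level present in the data
demo = pd.get_dummies(pd.DataFrame({'Embarked': ['S', 'Q', 'S']}))
print(demo.columns.tolist())  #['Embarked_Q', 'Embarked_S']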
y_train = titanic_train['Survived']

#build model
knn_pipelines_stages = [('scaler', preprocessing.StandardScaler()),
                        ('knn', neighbors.KNeighborsClassifier())]
knn_pipeline = pipeline.Pipeline(knn_pipelines_stages)
knn_pipeline_grid = {'knn__n_neighbors': list(range(1, 10))}
knn_pipeline_model = cutils.grid_search_best_model(knn_pipeline,
                                                   knn_pipeline_grid, X_train,
                                                   y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
titanic_test1 = cutils.drop_features(
    titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])