def do_generate_logistic_simple_model(X_train, y_train, parameters):
    """Grid-search a LogisticRegression over *parameters* and return the fitted grid.

    Args:
        X_train, y_train: training features and labels.
        parameters: param_grid mapping for GridSearchCV.

    Returns:
        The fitted GridSearchCV instance.
    """
    model = LogisticRegression(random_state=my_constants.RANDOM_VALUE)
    model_grid = GridSearchCV(model, param_grid=parameters, cv=3, verbose=3,
                              n_jobs=3)
    # Some parameter combinations won't converge within max_iter; suppress
    # those expected warnings to keep the grid-search output readable.
    with ignore_warnings(category=ConvergenceWarning):
        model_grid.fit(X_train, y_train)
    # BUG FIX: the original concatenated the GridSearchCV object directly to a
    # str, which raises TypeError; convert it to str first.
    file_operations.write_logs(
        FILENAME, "Calculate logistic simple model" + str(model_grid))
    return model_grid
def do_generate_rf_optimazed_model(X_train, y_train, parameters):
    """Run a 3-fold grid search over a RandomForestClassifier.

    Args:
        X_train, y_train: training features and labels.
        parameters: param_grid mapping for GridSearchCV.

    Returns:
        The fitted GridSearchCV instance.
    """
    file_operations.write_logs(
        FILENAME, 'Starting RF Grid Search with parameters:' + str(parameters))
    forest = RandomForestClassifier(
        random_state=my_constants.RANDOM_VALUE, oob_score=True)
    grid_search = GridSearchCV(
        forest, param_grid=parameters, cv=3, verbose=3, n_jobs=3)
    # Suppress expected convergence warnings during the search.
    with ignore_warnings(category=ConvergenceWarning):
        grid_search.fit(X_train, y_train)
    file_operations.write_logs(FILENAME, 'RF Grid search completed')
    return grid_search
def do_generate_metrics_logistic_simple_model(X_train, y_train, X_test, y_test, grid):
    """Refit a LogisticRegression with the grid's best params and compute metrics.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: hold-out features and labels for evaluation.
        grid: a fitted GridSearchCV exposing best_params_ / best_score_.

    Returns:
        (model, metrics): the refit estimator and the calculate_metrics result.
    """
    file_operations.write_logs(FILENAME, "do_generate_metrics_logistic_simple_model")
    model = LogisticRegression(random_state=my_constants.RANDOM_VALUE)
    file_operations.write_logs(FILENAME, "grid Best params")
    file_operations.write_logs(FILENAME, str(grid.best_params_))
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    # BUG FIX: model.score is a bound method — the original logged its repr
    # instead of an accuracy value; call it with the test data.
    file_operations.write_logs(
        FILENAME, 'model params:' + str(model.get_params())
        + " model score:" + str(model.score(X_test, y_test)))
    # BUG FIX: the line labeled 'grid.best_params_' logged model.get_params();
    # log the grid's actual best parameters instead.
    file_operations.write_logs(
        FILENAME, 'model grid.best_params_:' + str(grid.best_params_)
        + " grid.best_score_:" + str(grid.best_score_))
    return model, metrics
def do_generate_lgbm_optimazed_model(X_train, y_train, parameters):
    """Grid-search an LGBMClassifier over *parameters*.

    Args:
        X_train, y_train: training features and labels.
        parameters: param_grid mapping for GridSearchCV.

    Returns:
        The fitted GridSearchCV instance.
    """
    file_operations.write_logs(FILENAME, 'Starting LGBM Grid Search with parameters:')
    file_operations.write_logs(FILENAME, str(parameters))
    classifier = LGBMClassifier(random_state=0)
    # Distinct name for the search object instead of rebinding 'model'.
    grid_search = GridSearchCV(classifier, param_grid=parameters, cv=3,
                               verbose=3, n_jobs=3)
    grid_search.fit(X_train, y_train)
    file_operations.write_logs(FILENAME, "LGBM grid search completed")
    return grid_search
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test, grid):
    """Refit an LGBMClassifier with the grid's best params and compute metrics.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: hold-out features and labels for evaluation.
        grid: a fitted GridSearchCV exposing best_params_ / best_score_.

    Returns:
        (model, metrics): the refit estimator and the calculate_metrics result.
    """
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")
    best_params = grid.best_params_
    classifier = LGBMClassifier(random_state=0)
    classifier.set_params(**best_params)
    classifier.fit(X_train, y_train)
    metrics = calculate_metrics(classifier, X_test, y_test)
    model_summary = ("Generated model params and results\n params:"
                     + str(classifier.get_params())
                     + "\nscore " + str(classifier.score(X_test, y_test)))
    file_operations.write_logs(FILENAME, model_summary)
    grid_summary = ("Search grid best params and results\n params:"
                    + str(best_params)
                    + "\nscore " + str(grid.best_score_))
    file_operations.write_logs(FILENAME, grid_summary)
    return classifier, metrics
def do_generate_metrics_logistic_simple_model(X_train, y_train, X_test, y_test, grid):
    """Refit a LogisticRegression with the grid's best params and compute metrics.

    NOTE(review): this silently redefines the earlier function of the same name
    in this file — only this later definition is reachable; confirm which
    version is intended and delete the other.

    Returns:
        (model, metrics): the refit estimator and the calculate_metrics result.
    """
    model = LogisticRegression(random_state=my_constants.RANDOM_VALUE)
    # BUG FIX: best_params_ is a dict; str-concatenating it raised TypeError.
    file_operations.write_logs(
        FILENAME,
        "Calculate logistic simple model best params" + str(grid.best_params_))
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    # BUG FIX: get_params() returns a dict and model.score is a bound method;
    # the original raised TypeError on '+' — call score() and str() everything.
    file_operations.write_logs(
        FILENAME, "model params" + str(model.get_params())
        + " scores:" + str(model.score(X_test, y_test)))
    # BUG FIX: same str-concatenation problem for the dict / float here.
    file_operations.write_logs(
        FILENAME, "Grid params" + str(grid.best_params_)
        + " scores:" + str(grid.best_score_))
    return model, metrics
def do_generate_metrics_rf_optimazed_model(X_train, y_train, X_test, y_test, grid):
    """Refit a RandomForestClassifier with the grid's best params and compute metrics.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: hold-out features and labels for evaluation.
        grid: a fitted GridSearchCV exposing best_params_ / best_score_.

    Returns:
        (model, metrics): the refit estimator and the calculate_metrics result.
    """
    file_operations.write_logs(FILENAME, 'Starting metrics calculation')
    best_params = grid.best_params_
    forest = RandomForestClassifier(
        random_state=my_constants.RANDOM_VALUE, oob_score=True)
    forest.set_params(**best_params)
    forest.fit(X_train, y_train)
    metrics = calculate_metrics(forest, X_test, y_test)
    file_operations.write_logs(
        FILENAME,
        "Generated model params and results\n params:" + str(forest.get_params())
        + "\nscore " + str(forest.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME,
        "Search grid best params and results\n params:" + str(best_params)
        + "\nscore " + str(grid.best_score_))
    return forest, metrics
def predictions():
    """Train base/logistic models (raw and standardized) and write submissions.

    Reads the processed train/test CSVs, trains each model variant, logs its
    metrics, and emits a Kaggle submission file per model.

    NOTE(review): a second predictions() later in this file redefines this
    one, so only the later definition is reachable — confirm which version is
    intended and delete the other.
    """
    train_df = file_operations.read_data('processed', 'train.csv', 'PassengerId')
    competition_df = file_operations.read_data('processed', 'test.csv', 'PassengerId')
    X = train_df.loc[:, 'Age':].values.astype('float')
    y = train_df['Survived'].ravel()
    shape = X.shape
    # BUG FIX: '&' binds tighter than '==', so the original evaluated
    # shape[0] == (891 & shape[1]) > 36; use boolean 'and'.
    if shape[0] == 891 and shape[1] > 36:
        # BUG FIX: shape[1] is an int — convert before concatenating.
        file_operations.write_logs(
            FILENAME,
            "Dataset has " + str(shape[1]) + " and right amount amount of rows")
    file_operations.write_logs(FILENAME, 'Creating test and train dataset')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=my_constants.TEST_SIZE,
        random_state=my_constants.RANDOM_VALUE)

    # Linear base dummy model
    file_operations.write_logs(FILENAME, 'Creating linear model')
    base_model = create_base_model(X_train, y_train, X_test, y_test)
    # BUG FIX: metrics is not guaranteed to be a str — str() before '+'.
    file_operations.write_logs(
        FILENAME, "Metrics BaseModel: " + str(base_model['metrics']))
    file_operations.get_submission_file(base_model['model'],
                                        '01_base_model.csv', competition_df)

    # Logistic regression model
    file_operations.write_logs(FILENAME, 'Creating logistic simple model')
    lg_simple_model = create_logistic_simple_model(X_train, y_train, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Metrics lg_simple_model: " + str(lg_simple_model['metrics']))
    file_operations.get_submission_file(lg_simple_model['model'],
                                        '02_lg_model.csv', competition_df)

    # Logistic regression model with hyp optimization
    file_operations.write_logs(FILENAME, 'Creating logistic optimazed model')
    lg_optimazed_model = create_logistic_optimazed_model(
        X_train, y_train, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Metrics lg_optimazed_model: " + str(lg_optimazed_model['metrics']))
    file_operations.get_submission_file(lg_optimazed_model['model'],
                                        '03_lg_model_optimized.csv',
                                        competition_df)

    # (removed dead commented-out rf / lgbm / svc experiments)

    # Feature standarization
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # linear base dummy model
    file_operations.write_logs(FILENAME, 'Creating dummy scaled model')
    base_model_scaled = create_base_model(X_train_scaled, y_train,
                                          X_test_scaled, y_test)
    file_operations.write_logs(
        FILENAME, "Metrics base_model_scaled: " + str(base_model_scaled['metrics']))
    file_operations.get_submission_file_with_standardization(
        base_model_scaled['model'], '01_base_model_scaled.csv', scaler,
        competition_df)

    # Logistic regression model
    file_operations.write_logs(FILENAME, 'Creating logitic optimazed scaled model')
    # BUG FIX: the "scaled" model was trained on the unscaled matrices while
    # its submission applied the scaler; train on the scaled data.
    lg_simple_model_scaled = create_logistic_simple_model(
        X_train_scaled, y_train, X_test_scaled, y_test)
    file_operations.write_logs(
        FILENAME,
        "Metrics lg_simple_model_scaled: " + str(lg_simple_model_scaled['metrics']))
    file_operations.get_submission_file_with_standardization(
        lg_simple_model_scaled['model'], '02_lg_model_scaled.csv', scaler,
        competition_df)

    # Logistic regression model with hyp optimization
    # CONSISTENCY: log via write_logs like the rest of this function (was print).
    file_operations.write_logs(FILENAME, 'Creating logitic optimazed scaled model')
    # BUG FIX: same unscaled-training issue as above — use the scaled data.
    lg_optimazed_model_scaled = create_logistic_optimazed_model(
        X_train_scaled, y_train, X_test_scaled, y_test)
    file_operations.write_logs(
        FILENAME,
        "Metrics lg_optimazed_model_scaled: "
        + str(lg_optimazed_model_scaled['metrics']))
    file_operations.get_submission_file_with_standardization(
        lg_optimazed_model_scaled['model'], '03_lg_model_optimized_scaled.csv',
        scaler, competition_df)
def predictions():
    """Train the base model and LGBM variants and write submission files.

    NOTE(review): this redefines the earlier predictions() in this file and
    silently replaces it — confirm which version is intended and delete the
    other.
    """
    train_df = file_operations.read_data('processed', 'train.csv', 'PassengerId')
    competition_df = file_operations.read_data('processed', 'test.csv', 'PassengerId')
    X = train_df.loc[:, 'Age':].values.astype('float')
    y = train_df['Survived'].ravel()
    shape = X.shape
    # BUG FIX: '&' binds tighter than '==', so the original evaluated
    # shape[0] == (891 & shape[1]) > 36; use boolean 'and'.
    if shape[0] == 891 and shape[1] > 36:
        # BUG FIX: every other call passes (filename, message); the original
        # passed four positional args here. Build one message string.
        file_operations.write_logs(
            FILENAME,
            "Dataset has " + str(shape[1]) + " and right amount amount of rows")
    file_operations.write_logs(FILENAME, 'Creating test and train dataset')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=my_constants.TEST_SIZE,
        random_state=my_constants.RANDOM_VALUE)

    # Linear base dummy model
    file_operations.write_logs(FILENAME, 'Creating linear model')
    base_model = create_base_model(X_train, y_train, X_test, y_test)
    file_operations.write_logs(FILENAME, "Metrics base_model: ")
    file_operations.write_logs(FILENAME, str(base_model['metrics']))
    file_operations.get_submission_file(base_model['model'],
                                        '01_base_model.csv', competition_df)

    file_operations.write_logs(FILENAME, 'Creating lgbm model')
    lgbm_model = create_lgbm_optimized_model(X_train, y_train, X_test, y_test)
    file_operations.write_logs(FILENAME, "Metrics lgbm_model: ")
    file_operations.write_logs(FILENAME, str(lgbm_model['metrics']))
    file_operations.get_submission_file(lgbm_model['model'],
                                        '04_lgbm_model_optimized.csv',
                                        competition_df)

    # Feature standarization
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    file_operations.write_logs(FILENAME, 'Creating lgbm scaled model')
    lgbm_model_scaled = create_lgbm_optimized_model(X_train_scaled, y_train,
                                                    X_test_scaled, y_test)
    file_operations.write_logs(FILENAME, "Metrics lgbm_model_scaled: ")
    file_operations.write_logs(FILENAME, str(lgbm_model_scaled['metrics']))
    # BUG FIX: the original passed str(model) — a string — as the estimator.
    # Also route through the standardization-aware writer so the competition
    # features get the same scaling the model was trained on.
    file_operations.get_submission_file_with_standardization(
        lgbm_model_scaled['model'], '04_lgbm_model_optimized_scaled.csv',
        scaler, competition_df)