import logging
import pickle

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline


def create_best_gradboost(data_path, output_path):
    # prepare_data is defined elsewhere in this module; the test split is
    # unused here.
    X, y, X_test, _ = prepare_data(data_path)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=42)

    # Univariate feature selection feeding a gradient-boosted classifier.
    pipe = Pipeline([
        ('feature_selection', SelectKBest(f_classif)),
        ('clf', GradientBoostingClassifier(random_state=2)),
    ])
    params = {
        'feature_selection__k': [12, 8, 4],
        'clf__n_estimators': [50, 100, 500],
    }
    grid_search = GridSearchCV(pipe, param_grid=params, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    logging.info("Best params: {}".format(grid_search.best_params_))

    # Evaluate the best pipeline on the held-out validation split.
    y_pred = grid_search.best_estimator_.predict(X_valid)
    tn, fp, fn, tp = confusion_matrix(y_valid, y_pred).ravel()
    logging.info("= Gradient Boosting Classifier =")
    logging.info("Accuracy: {}".format((tp + tn) / (tn + fp + fn + tp)))
    logging.info("Recall: {}".format(tp / (fn + tp)))
    logging.info("Specificity: {}".format(tn / (tn + fp)))
    logging.info("F1 Score: {}".format(f1_score(y_valid, y_pred)))
    logging.info("AUC: {}".format(roc_auc_score(y_valid, y_pred)))

    # Persist the best estimator and return its path and validation AUC.
    model_path = "{0}{1}".format(output_path, 'gradboost.pkl')
    with open(model_path, 'wb') as file:
        pickle.dump(grid_search.best_estimator_, file)

    return model_path, roc_auc_score(y_valid, y_pred)
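# A minimal usage sketch for create_best_gradboost; the directory paths are
# hypothetical placeholders. Note that output_path is concatenated directly
# with the file name above, so it should end with a path separator.
def example_create_best_gradboost():
    model_path, valid_auc = create_best_gradboost(
        data_path='data/', output_path='models/')
    print("Best model saved to {} (validation AUC: {:.4f})".format(
        model_path, valid_auc))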
from functools import partial


def build_data_loaders(args, tokenizer):
    # prepare_data, collate, TrainLoaderFactory, and MODE are defined
    # elsewhere in this repository. Bind the collate options once so every
    # loader shares the same collation behaviour.
    snli, mnli = prepare_data(tokenizer)
    injected_collate = partial(
        collate,
        span_drop=args.span_drop,
        max_spans=args.max_spans,
        padding_token_id=tokenizer.pad_token_id)
    train_loader_factory = TrainLoaderFactory(
        args.batch_size, snli, mnli, injected_collate)

    train_all = train_loader_factory.get_loader(MODE.TRAIN_ALL)
    train_snli = train_loader_factory.get_loader(MODE.TRAIN_SNLI)
    train_mnli = train_loader_factory.get_loader(MODE.TRAIN_MNLI)
    test_snli = train_loader_factory.get_loader(MODE.TEST_SNLI)
    test_mnli_matched = train_loader_factory.get_loader(
        MODE.TEST_MNLI_MATCHED)
    test_mnli_mismatched = train_loader_factory.get_loader(
        MODE.TEST_MNLI_MISMATCHED)

    # Return one train loader plus the matching evaluation loaders.
    if args.dataset == 'all':
        return train_all, {
            'snli': test_snli,
            'mnli_matched': test_mnli_matched,
            'mnli_mismatched': test_mnli_mismatched,
        }
    elif args.dataset == 'snli':
        return train_snli, {'snli': test_snli}
    elif args.dataset == 'mnli':
        return train_mnli, {
            'mnli_matched': test_mnli_matched,
            'mnli_mismatched': test_mnli_mismatched,
        }
    else:
        raise ValueError("--dataset must be one of: all, snli, mnli")
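# Example wiring for build_data_loaders, shown as a sketch. The argparse
# fields mirror those read above with illustrative values; the Hugging Face
# tokenizer is an assumption, as this repository's tokenizer setup lives
# elsewhere.
import argparse

from transformers import AutoTokenizer


def example_build_data_loaders():
    args = argparse.Namespace(
        dataset='snli',   # one of: all, snli, mnli
        batch_size=32,
        span_drop=0.1,    # illustrative value
        max_spans=16,     # illustrative value
    )
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    train_loader, test_loaders = build_data_loaders(args, tokenizer)
    return train_loader, test_loaders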
import pickle

import pandas as pd


def get_predictions(data_path="", model_path=""):
    # Load the champion model selected during training.
    champion_path = "{0}{1}".format(model_path, 'champion.pkl')
    print("Loading champion model from " + champion_path)
    with open(champion_path, 'rb') as file:
        champion = pickle.load(file)

    # Future extension: data path or data retrieval process for
    # consistently updated data.
    _, _, X_test, test_id = prepare_data(data_path)

    # Future extension: write to another file, database, etc.
    predictions = champion.predict(X_test)
    print(predictions)

    submission_path = "{0}{1}".format(model_path, 'submission.csv')
    print(submission_path)
    submission = pd.DataFrame({'PassengerId': test_id,
                               'Survived': predictions})
    # Kaggle expects only the two columns, so drop the index.
    submission.to_csv(submission_path, index=False)
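# End-to-end sketch tying training and inference together: train a model,
# promote it to champion.pkl, then write a submission. The paths and the
# copy-based promotion step are assumptions for illustration; in practice
# the champion may be chosen by comparing several candidate models' AUCs.
import shutil


def example_train_and_predict():
    model_path, _ = create_best_gradboost('data/', 'models/')
    shutil.copyfile(model_path, 'models/champion.pkl')
    get_predictions(data_path='data/', model_path='models/')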