def traintest(self, classifier, X_train, y_train, X_test, y_test, sys_out, model_name, identifier):
    """Fit ``classifier`` on the training split, score it on the test split and save it.

    The predictions for ``X_test`` are handed to ``util.save_scores`` in both
    the first and the third positional slot (paired with ``y_test`` each
    time), together with ``model_name``, ``self.task_name``, ``identifier``,
    the literal ``2`` and ``sys_out``.  The fitted classifier is then written
    to ``<sys_out>/models/svml-<identifier>.m`` via
    ``util.save_classifier_model``.

    NOTE(review): this method does not create ``<sys_out>/models`` itself;
    presumably the directory already exists or the save helper creates it.
    """
    fitted = classifier.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    util.save_scores(
        predictions, y_test,
        predictions, y_test,
        model_name, self.task_name, identifier, 2, sys_out,
    )

    # Build the target path in two steps: folder first, then the file name.
    models_dir = sys_out + "/models"
    file_name = "/svml-%s.m" % identifier
    util.save_classifier_model(fitted, models_dir + file_name)

    finished_at = datetime.datetime.now()
    ec.logger.info("complete, {}".format(finished_at))
def cross_eval_dnn(dataset_name, outfolder, model_descriptor: str,
                   cpus, nfold, X_data, y_data,
                   embedding_layer_max_index, pretrained_embedding_matrix=None,
                   instance_data_source_tags=None, accepted_ds_tags: list = None):
    """Cross-validate a Keras model built by ``create_model`` and save the scores.

    Args:
        dataset_name: Passed through to ``util.save_scores``.
        outfolder: Output folder; ``<outfolder>/models`` is created if missing.
        model_descriptor: Forwarded to ``create_model`` and ``util.save_scores``.
        cpus: Unused in this function; kept for interface compatibility.
        nfold: Forwarded as ``cv`` to ``cross_val_predict``.
        X_data: Feature data used for fitting and cross-validation.
        y_data: Labels matching ``X_data``.
        embedding_layer_max_index: Forwarded to ``create_model`` as ``max_index``.
        pretrained_embedding_matrix: Forwarded to ``create_model`` as
            ``wemb_matrix``.
        instance_data_source_tags: Passed through to ``util.save_scores``.
        accepted_ds_tags: Passed through to ``util.save_scores``.
    """
    print("== Perform ANN ...")

    subfolder = outfolder + "/models"
    # Previously `os.stat` guarded by a bare `except:` followed by `os.mkdir`.
    # That swallowed every exception (including KeyboardInterrupt) and could
    # fail if the directory appeared between the two calls.
    os.makedirs(subfolder, exist_ok=True)

    create_model_with_args = functools.partial(
        create_model,
        max_index=embedding_layer_max_index,
        wemb_matrix=pretrained_embedding_matrix,
        model_descriptor=model_descriptor,
    )
    model = KerasClassifier(build_fn=create_model_with_args, verbose=0, batch_size=100)

    # NOTE(review): cross_val_predict is expected to fit its own copies of the
    # estimator per fold, so this full fit looks redundant -- confirm before
    # removing, since it is kept here to preserve the existing behaviour.
    model.fit(X_data, y_data)

    nfold_predictions = cross_val_predict(model, X_data, y_data, cv=nfold)

    # No held-out set in this routine: held-out predictions/truth are None.
    util.save_scores(nfold_predictions, y_data, None, None,
                     model_descriptor, dataset_name, 3, outfolder,
                     instance_data_source_tags, accepted_ds_tags)
def learn_general(cpus, nfold, task, load_model, model, feature_vocbs: dict,
                  X_train, y_train, X_test, y_test, index_train, index_test,
                  outfolder, classifier_gridsearch=True,
                  dr_option=0, dr_gridsearch=True, fs_option=0, fs_gridsearch=True,
                  instance_data_source_tags=None, accepted_ds_tags: list = None):
    """Train (or load) a classifier pipeline, evaluate it and save the scores.

    ``create_classifier`` supplies a searchable estimator (element 0) and the
    model file path (element 1).  When ``load_model`` is true the estimator is
    loaded from that path and no cross-validation predictions are produced
    (``None`` is passed to ``util.save_scores``).  Otherwise the estimator is
    fitted on the training data, n-fold predictions are computed with its
    ``best_estimator_``, the best estimator is saved, and -- if the pipeline
    has an ``"fs"`` step -- the selected feature indices are exported to
    ``<model_file>.features.csv``.

    If ``X_test`` is not ``None`` the best estimator also predicts the
    held-out set; otherwise ``None`` is reported as held-out predictions.

    Args:
        cpus, nfold, task, model, outfolder, classifier_gridsearch, dr_option,
        dr_gridsearch, fs_option, fs_gridsearch: Forwarded to
            ``create_classifier``; ``nfold`` is also the ``cv`` argument of
            ``cross_val_predict`` and ``model``/``task``/``outfolder`` are
            forwarded to ``util.save_scores``.
        load_model: Load the estimator from disk instead of training it.
        feature_vocbs: Passed to ``util.save_selected_features``.
        X_train, y_train: Training data and labels.
        X_test, y_test: Held-out data and labels; ``X_test`` may be ``None``.
        index_train, index_test: Passed through to ``util.save_scores``.
        instance_data_source_tags, accepted_ds_tags: Passed through to
            ``util.save_scores``.
    """
    c = create_classifier(outfolder, model, task, nfold, classifier_gridsearch,
                          dr_option, dr_gridsearch,
                          fs_option, fs_gridsearch, cpus)
    piped_classifier = c[0]
    model_file = c[1]

    nfold_predictions = None

    if load_model:
        ec.logger.info("model is loaded from [%s]" % str(model_file))
        best_estimator = util.load_classifier_model(model_file)
    else:
        piped_classifier.fit(X_train, y_train)
        nfold_predictions = cross_val_predict(piped_classifier.best_estimator_,
                                              X_train, y_train, cv=nfold)

        best_estimator = piped_classifier.best_estimator_
        best_param = piped_classifier.best_params_
        ec.logger.info("+ best params for {} model are:{}".format(model, best_param))
        util.save_classifier_model(best_estimator, model_file)

        # Export the selected features for inspection when the pipeline
        # contains a feature-selection step.
        # NOTE(review): placed in the training branch; the original layout was
        # ambiguous about whether this should also run for loaded models.
        if 'fs' in best_estimator.named_steps:
            finalFeatureIndices = best_estimator.named_steps["fs"].get_support(
                indices=True)
            util.save_selected_features(finalFeatureIndices, feature_vocbs,
                                        model_file + ".features.csv")

    if X_test is not None:
        heldout_predictions_final = best_estimator.predict(X_test)
    else:
        heldout_predictions_final = None

    # One call for both cases.  The previous no-held-out branch omitted
    # index_train/index_test, which shifted every following positional
    # argument relative to the held-out branch.
    # NOTE(review): assumes the index-carrying argument layout is the current
    # util.save_scores signature -- confirm against util.
    util.save_scores(nfold_predictions, y_train,
                     heldout_predictions_final, y_test,
                     index_train, index_test,
                     model, task, 2, outfolder,
                     instance_data_source_tags, accepted_ds_tags)
def grid_search_dnn(dataset_name, outfolder, model_descriptor: str,
                    cpus, nfold, X_train, y_train, X_test, y_test,
                    X_train_index, X_test_index,
                    embedding_layer_max_index, pretrained_embedding_matrix=None,
                    word_dist_matrix=None,
                    instance_tags_train=None, instance_tags_test=None,
                    accepted_ds_tags: list = None):
    """Grid-search a Keras model, cross-validate the best one and save the scores.

    A ``KerasClassifier`` wrapping ``create_model`` is tuned with
    ``GridSearchCV`` over a fixed grid (``batch_size=[100]``,
    ``nb_epoch=[10]``) using stratified folds.  The best estimator then
    produces n-fold predictions on the training data and, when ``X_test`` is
    not ``None``, predictions for the held-out set.  Everything is reported
    through ``util.save_scores``.

    Args:
        dataset_name: Passed through to ``util.save_scores``.
        outfolder: Output folder; ``<outfolder>/models`` is created if missing.
        model_descriptor: Forwarded to ``create_model`` and ``util.save_scores``.
        cpus: Forwarded to ``GridSearchCV`` as ``n_jobs``.
        nfold: Number of folds for the grid search and ``cross_val_predict``.
        X_train, y_train: Training data and labels.
        X_test, y_test: Held-out data and labels; ``X_test`` may be ``None``.
        X_train_index, X_test_index: Passed through to ``util.save_scores``.
        embedding_layer_max_index: Forwarded to ``create_model`` as ``max_index``.
        pretrained_embedding_matrix: Forwarded to ``create_model`` as
            ``wemb_matrix``.
        word_dist_matrix: Forwarded to ``create_model`` as ``wdist_matrix``.
        instance_tags_train, instance_tags_test, accepted_ds_tags: Passed
            through to ``util.save_scores``.
    """
    print("\t== Perform ANN ...")

    subfolder = outfolder + "/models"
    # Previously `os.stat` guarded by a bare `except:` followed by `os.mkdir`.
    # That swallowed every exception (including KeyboardInterrupt) and could
    # fail if the directory appeared between the two calls.
    os.makedirs(subfolder, exist_ok=True)

    create_model_with_args = functools.partial(
        create_model,
        max_index=embedding_layer_max_index,
        wemb_matrix=pretrained_embedding_matrix,
        wdist_matrix=word_dist_matrix,
        model_descriptor=model_descriptor,
    )
    model = KerasClassifier(build_fn=create_model_with_args, verbose=0)

    # Define the grid search parameters.
    batch_size = [100]
    epochs = [10]
    param_grid = dict(batch_size=batch_size, nb_epoch=epochs)

    # The default GridSearchCV splitting can have problems with stratified
    # k-fold on the "w" and "ws" datasets once "mixed_data" is added, so the
    # folds are built explicitly instead of passing cv=nfold.
    # NOTE(review): `n_folds=`/`y=` is the legacy StratifiedKFold constructor;
    # kept as-is because it must match whichever class this file imports.
    fold = StratifiedKFold(n_folds=nfold, y=y_train)
    _classifier = GridSearchCV(estimator=model, param_grid=param_grid,
                               n_jobs=cpus, cv=fold)

    print("\tfitting model...{}".format(datetime.datetime.now()))
    _classifier.fit(X_train, y_train)

    print("\tcrossfold running...{}".format(datetime.datetime.now()))
    nfold_predictions = cross_val_predict(_classifier.best_estimator_,
                                          X_train, y_train, cv=nfold)
    best_param_ann = _classifier.best_params_
    print("\tdone {}".format(datetime.datetime.now()))
    print("\tbest params for {} model are:{}".format(model_descriptor, best_param_ann))
    best_estimator = _classifier.best_estimator_

    # NOTE(review): the model is not persisted here.  The previously disabled
    # save call referenced `ann_model_file`, which is never defined in this
    # function.

    if X_test is not None:
        print("\tpredicting...{}".format(datetime.datetime.now()))
        heldout_predictions_final = best_estimator.predict(X_test)
    else:
        heldout_predictions_final = None

    print("\tsaving...{}".format(datetime.datetime.now()))
    util.save_scores(nfold_predictions, y_train,
                     heldout_predictions_final, y_test,
                     X_train_index, X_test_index,
                     model_descriptor, dataset_name, 3, outfolder,
                     instance_tags_train, instance_tags_test, accepted_ds_tags)