# Beispiel #1  (scraped-example separator; commented out so the file parses)
# 0
    def traintest(self, classifier, X_train, y_train, X_test, y_test, sys_out,
                  model_name, identifier):
        """Fit *classifier* on the training split, predict the held-out split,
        then persist the scores and the fitted model under *sys_out*.

        NOTE(review): the same (predictions, y_test) pair is passed twice to
        util.save_scores — presumably filling both the n-fold and held-out
        slots of its signature; confirm against util.save_scores.
        """
        fitted = classifier.fit(X_train, y_train)
        predictions = fitted.predict(X_test)

        util.save_scores(predictions, y_test, predictions, y_test, model_name,
                         self.task_name, identifier, 2, sys_out)

        # Persist the trained model as <sys_out>/models/svml-<identifier>.m
        model_dir = sys_out + "/models"
        model_path = model_dir + "/svml-%s.m" % identifier
        util.save_classifier_model(fitted, model_path)
        ec.logger.info("complete, {}".format(datetime.datetime.now()))
# Beispiel #2  (scraped-example separator; commented out so the file parses)
# 0
def cross_eval_dnn(dataset_name, outfolder, model_descriptor: str,
                   cpus, nfold, X_data, y_data,
                   embedding_layer_max_index, pretrained_embedding_matrix=None,
                   instance_data_source_tags=None, accepted_ds_tags: list = None):
    """Fit a Keras DNN built from *model_descriptor* on the full data set,
    run n-fold cross-validated prediction, and save the scores.

    Results are written under *outfolder*; nothing is returned.
    NOTE(review): *cpus* is accepted but unused in this function.
    """
    print("== Perform ANN ...")
    subfolder = outfolder + "/models"
    # Create the model output directory if missing. os.makedirs(exist_ok=True)
    # replaces the old racy stat-then-mkdir guarded by a bare `except:` that
    # swallowed every error (including permission problems).
    os.makedirs(subfolder, exist_ok=True)

    # Bind the network-construction arguments so KerasClassifier can rebuild
    # the model for each fit.
    create_model_with_args = \
        functools.partial(create_model, max_index=embedding_layer_max_index,
                          wemb_matrix=pretrained_embedding_matrix,
                          model_descriptor=model_descriptor)
    model = KerasClassifier(build_fn=create_model_with_args, verbose=0, batch_size=100)
    model.fit(X_data, y_data)

    # Out-of-fold predictions used for scoring (independent of the fit above).
    nfold_predictions = cross_val_predict(model, X_data, y_data, cv=nfold)

    util.save_scores(nfold_predictions, y_data, None, None,
                     model_descriptor, dataset_name, 3,
                     outfolder, instance_data_source_tags, accepted_ds_tags)
def learn_general(cpus,
                  nfold,
                  task,
                  load_model,
                  model,
                  feature_vocbs: dict,
                  X_train,
                  y_train,
                  X_test,
                  y_test,
                  index_train,
                  index_test,
                  outfolder,
                  classifier_gridsearch=True,
                  dr_option=0,
                  dr_gridsearch=True,
                  fs_option=0,
                  fs_gridsearch=True,
                  instance_data_source_tags=None,
                  accepted_ds_tags: list = None):
    """Train (or load) a classical-ML classifier pipeline, evaluate it with
    n-fold cross validation and an optional held-out set, and persist the
    model, its selected features and its scores under *outfolder*.

    When *load_model* is true the previously saved model is reused and no
    n-fold predictions are produced (None is passed to util.save_scores).
    Returns None — all results are written to disk.
    """
    # create_classifier returns (grid-search pipeline, path for the model file).
    c = create_classifier(outfolder, model, task, nfold, classifier_gridsearch,
                          dr_option, dr_gridsearch, fs_option, fs_gridsearch,
                          cpus)
    piped_classifier = c[0]
    model_file = c[1]

    nfold_predictions = None

    if load_model:
        # Reuse a previously trained model instead of re-fitting.
        ec.logger.info("model is loaded from [%s]" % str(model_file))
        best_estimator = util.load_classifier_model(model_file)
    else:
        piped_classifier.fit(X_train, y_train)
        # Cross-validated predictions of the tuned pipeline on the training set.
        nfold_predictions = cross_val_predict(piped_classifier.best_estimator_,
                                              X_train,
                                              y_train,
                                              cv=nfold)

        best_estimator = piped_classifier.best_estimator_
        ec.logger.info("+ best params for {} model are:{}".format(
            model, piped_classifier.best_params_))
        util.save_classifier_model(best_estimator, model_file)

        # If the pipeline has a feature-selection step, dump the indices of
        # the surviving features for manual inspection.
        if 'fs' in best_estimator.named_steps.keys():
            finalFeatureIndices = best_estimator.named_steps["fs"].get_support(
                indices=True)
            util.save_selected_features(finalFeatureIndices, feature_vocbs,
                                        model_file + ".features.csv")

    if X_test is not None:
        heldout_predictions_final = best_estimator.predict(X_test)
        util.save_scores(nfold_predictions, y_train, heldout_predictions_final,
                         y_test, index_train, index_test, model, task, 2,
                         outfolder, instance_data_source_tags,
                         accepted_ds_tags)
    else:
        # BUGFIX: this branch previously omitted index_train/index_test,
        # shifting every later positional argument relative to the branch
        # above (and to grid_search_dnn, which passes the indices in both
        # branches). Pass them so both calls share one signature.
        util.save_scores(nfold_predictions, y_train, None, y_test,
                         index_train, index_test, model, task, 2,
                         outfolder, instance_data_source_tags,
                         accepted_ds_tags)
# Beispiel #4  (scraped-example separator; commented out so the file parses)
# 0
def grid_search_dnn(dataset_name, outfolder, model_descriptor: str,
                    cpus, nfold, X_train, y_train, X_test, y_test, X_train_index, X_test_index,
                    embedding_layer_max_index, pretrained_embedding_matrix=None,
                    word_dist_matrix=None,
                    instance_tags_train=None, instance_tags_test=None,
                    accepted_ds_tags: list = None):
    """Grid-search batch size/epochs for a Keras DNN built from
    *model_descriptor*, evaluate the best estimator with n-fold CV (and on
    the held-out set when *X_test* is given), and save the scores.

    Results are written under *outfolder*; nothing is returned.
    """
    print("\t== Perform ANN ...")
    subfolder = outfolder + "/models"
    # Ensure the models directory exists. os.makedirs(exist_ok=True) replaces
    # the old racy stat-then-mkdir guarded by a bare `except:` that swallowed
    # every error (including permission problems).
    os.makedirs(subfolder, exist_ok=True)

    # Bind the network-construction arguments so KerasClassifier can rebuild
    # the model for each grid-search candidate.
    create_model_with_args = \
        functools.partial(create_model, max_index=embedding_layer_max_index,
                          wemb_matrix=pretrained_embedding_matrix,
                          wdist_matrix=word_dist_matrix,
                          model_descriptor=model_descriptor)
    model = KerasClassifier(build_fn=create_model_with_args, verbose=0)

    # Hyper-parameter grid. NOTE(review): `nb_epoch` is the legacy Keras
    # spelling; newer wrappers expect `epochs` — confirm against the
    # installed Keras version before changing.
    batch_size = [100]
    epochs = [10]
    param_grid = dict(batch_size=batch_size, nb_epoch=epochs)

    # The default GridSearchCV with cv=int can fail with StratifiedKFold on
    # some datasets (w and ws when "mixed_data" is added), so an explicit
    # fold object is built here (old sklearn n_folds/y API, kept on purpose).
    fold = StratifiedKFold(n_folds=nfold, y=y_train)
    _classifier = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=cpus,
                               cv=fold)

    print("\tfitting model...{}".format(datetime.datetime.now()))
    _classifier.fit(X_train, y_train)
    print("\tcrossfold running...{}".format(datetime.datetime.now()))
    # Out-of-fold predictions of the tuned estimator for scoring.
    nfold_predictions = cross_val_predict(_classifier.best_estimator_, X_train, y_train, cv=nfold)
    best_param_ann = _classifier.best_params_
    print("\tdone {}".format(datetime.datetime.now()))
    print("\tbest params for {} model are:{}".format(model_descriptor, best_param_ann))
    best_estimator = _classifier.best_estimator_

    if X_test is not None:
        print("\tpredicting...{}".format(datetime.datetime.now()))
        heldout_predictions_final = best_estimator.predict(X_test)
        print("\tsaving...{}".format(datetime.datetime.now()))
        util.save_scores(nfold_predictions, y_train, heldout_predictions_final, y_test,
                         X_train_index, X_test_index,
                         model_descriptor, dataset_name,
                         3, outfolder, instance_tags_train, instance_tags_test, accepted_ds_tags)
    else:
        print("\tsaving...{}".format(datetime.datetime.now()))
        util.save_scores(nfold_predictions, y_train, None, y_test, X_train_index, X_test_index,
                         model_descriptor, dataset_name, 3,
                         outfolder, instance_tags_train, instance_tags_test, accepted_ds_tags)