Example #1
    def fit(self, X, y, savepath=None, refit=True):
        rst = dict()
        param_dict = self._get_bayesian_param_dict()

        if savepath is None:
            savepath = os.getcwd()

        estimator_name = self._estimator_name

        if self.cv is None:
            self.cv = ms.RepeatedKFold()

        model = BayesSearchCV(estimator=self.estimator,
                              search_spaces=param_dict,
                              n_iter=self.n_iter,
                              scoring=self.scoring,
                              cv=self.cv,
                              refit=refit)

        try:
            rst[estimator_name] = model.fit(X, y)
        except Exception:
            log.error(
                'Hyperparameter optimization failed, likely due to an inappropriate domain of values to'
                ' optimize one or more parameters over. Please check your input file and the sklearn docs'
                ' for the model you are optimizing to find the correct domain of values')
            exit()

        best_estimator = rst[estimator_name].best_estimator_

        self._save_output(savepath, rst)
        return best_estimator
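For reference, the core of the method above is plain BayesSearchCV driven by a RepeatedKFold splitter. A minimal standalone sketch, assuming scikit-optimize is installed; the Ridge estimator and the alpha search space are placeholders, not taken from the class above:

# Standalone sketch of the same pattern (placeholder estimator and search space).
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold

opt = BayesSearchCV(
    estimator=Ridge(),
    search_spaces={'alpha': Real(1e-3, 1e1, prior='log-uniform')},
    n_iter=16,
    cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=0),
    refit=True,
)
# opt.fit(X, y); best = opt.best_estimator_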
Example #2
def set_custom_scorer_cv(n_splits=5, n_repeats=2):
    """Define custom scorer and Cross-Validation strategy

    Args:
        n_splits (int, optional): Num. of splits in Cross-Validation strategy.
        Defaults to 5.
        n_repeats (int, optional): Num. of repeats for repeated CV.
        Defaults to 2.

    Returns:
        objects: custom scorer, model_selection.RepeatedKFold
    """
    print(f"\nCreate custom scorer...")
    scorer = make_scorer(
        score_func=gscreen.utils.accuracy,
        greater_is_better=True,  # Whether score_func is a score function (default),
        # meaning high is good, or a loss function, meaning low is good.
    )
    #%% Define cross-validation parameters
    print(f"Define Cross-Validation strategy...")
    cv = model_selection.RepeatedKFold(
        n_splits=n_splits,
        # Repeats K-Fold:  n times with different randomization in each repetition.
        n_repeats=n_repeats,
        random_state=rnd_state,
    )
    return scorer, cv
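A possible call site for the helper above, sketched under the assumption that an estimator and data X, y are defined elsewhere (they are placeholders, not part of the original module):

scorer, cv = set_custom_scorer_cv(n_splits=5, n_repeats=2)
# estimator, X and y are assumed to exist in the calling code.
scores = model_selection.cross_val_score(estimator, X, y, scoring=scorer, cv=cv, n_jobs=-1)
print(f"Mean CV score: {scores.mean():.3f} +/- {scores.std():.3f}")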
Example #3
def KfoldValidation(model, train_set, train_label):
    cv = skl_ms.RepeatedKFold(n_splits=7, random_state=4, n_repeats=10)
    model.fit(train_set, train_label)
    # Note: precision/recall/f1 below use scikit-learn's default (binary) averaging.
    scoring = {'accuracy': make_scorer(accuracy_score),
               'precision': make_scorer(precision_score),
               'recall': make_scorer(recall_score),
               'f1_score': make_scorer(f1_score)}
    score = skl_ms.cross_validate(model, train_set, train_label, scoring=scoring, cv=cv, n_jobs=-1)
    print('Accuracy : %.3f' % (np.mean(score['test_accuracy'])))
    print('Precision : %.3f' % (np.mean(score['test_precision'])))
    print('Recall : %.3f' % (np.mean(score['test_recall'])))
    print('F1 Score : %.3f' % (np.mean(score['test_f1_score'])))
    return model
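A possible call site for KfoldValidation; the RandomForestClassifier and the train_set/train_label arrays are placeholders, not part of the original snippet:

from sklearn.ensemble import RandomForestClassifier
trained_model = KfoldValidation(RandomForestClassifier(n_estimators=100), train_set, train_label)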
Example #4
    def fit(self, X, y, model, cv=None, savepath=None):
        rst = dict()
        param_dict = self._get_grid_param_dict()

        if savepath is None:
            savepath = os.getcwd()

        estimator_name = model.model.__class__.__name__
        param_dict = self._search_space_generator(param_dict)

        if cv is None:
            cv = ms.RepeatedKFold()

        metrics = Metrics(metrics_list=None)._metric_zoo()
        if self.scoring is None:
            scoring = make_scorer(
                metrics['mean_absolute_error'][1],
                greater_is_better=metrics['mean_absolute_error'][0]
            )  # greater_is_better comes from the metric definition; if False, sklearn negates the score
        else:
            scoring = make_scorer(
                metrics[self.scoring][1],
                greater_is_better=metrics[self.scoring][0]
            )  # greater_is_better comes from the metric definition; if False, sklearn negates the score

        model = GridSearchCV(model.model,
                             param_dict,
                             scoring=scoring,
                             cv=cv,
                             refit=True,
                             n_jobs=self.n_jobs,
                             verbose=0)

        try:
            rst[estimator_name] = model.fit(X, y)
        except Exception:
            print(
                'Hyperparameter optimization failed, likely due to an inappropriate domain of values to'
                ' optimize one or more parameters over. Please check your input file and the sklearn docs'
                ' for the model you are optimizing to find the correct domain of values')
            exit()

        best_estimator = rst[estimator_name].best_estimator_

        self._save_output(savepath, rst)

        # Need to rebuild the estimator as SklearnModel
        best_estimator = SklearnModel(model=best_estimator.__class__.__name__,
                                      **best_estimator.get_params())

        return best_estimator
Example #5
def cv(df, cols_to_drop, xgboost_params=config.xgboost_params):
    X = df.drop(columns=['target'] + cols_to_drop)
    y = df.target

    started = dt.datetime.now()
    kf = model_selection.RepeatedKFold(n_repeats=1, n_splits=10)
    cv_perf = {'kf train': [], 'kf test': [], 'evals_result': []}

    for i, (train_index, test_index) in enumerate(kf.split(X)):

        kf_X_train, kf_X_test = X.iloc[train_index], X.iloc[test_index]
        kf_y_train, kf_y_test = y.iloc[train_index], y.iloc[test_index]

        model = xgboost.XGBClassifier(**xgboost_params)
        _ = model.fit(kf_X_train,
                      kf_y_train,
                      verbose=False,
                      eval_metric=["error"],
                      eval_set=[
                          (kf_X_train, kf_y_train), (kf_X_test, kf_y_test)
                      ])  # early_stopping_rounds=100

        kf_train_pred = model.predict(kf_X_train)
        kf_test_pred = model.predict(kf_X_test)

        evals_result = {
            k: v['error']
            for k, v in zip(['train', 'test'],
                            model.evals_result().values())
        }

        cv_perf['kf train'].append(
            metrics.accuracy_score(kf_y_train, kf_train_pred, normalize=True))
        cv_perf['kf test'].append(
            metrics.accuracy_score(kf_y_test, kf_test_pred, normalize=True))
        cv_perf['evals_result'].append(evals_result)

        tr_cummean = np.mean(cv_perf['kf train'])
        te_cummean = np.mean(cv_perf['kf test'])

        print(
            f'Iteration #{i+1:02}. Elapsed: {dt.datetime.now()-started}. Cum. Accuracy: test: {te_cummean:.2%}, train: {tr_cummean:.2%}'
        )
Example #6
def plot_cross_validated_coefs(
        pipe,
        numerical_columns,
        nominal_columns,
        X_train,
        X_test,
        y_train,
        y_test,
        scorer,
        n_repeats=5,
        n_splits=5,
        axis_tick_label_fontsize=12,
        fig_size=(8, 12),
):
    feature_names = (pipe.named_steps["preprocessor"].
                     named_transformers_["onehot"].get_feature_names(
                         input_features=nominal_columns))
    feature_names = np.concatenate([numerical_columns, feature_names])
    cv_model = ms.cross_validate(
        pipe,
        X=pd.concat([X_train, X_test]),
        y=pd.concat([y_train, y_test]),
        cv=ms.RepeatedKFold(n_splits=n_splits,
                            n_repeats=n_repeats,
                            random_state=42),
        scoring=scorer,
        return_train_score=True,
        return_estimator=True,
        n_jobs=-1,
    )
    coefs = pd.DataFrame(
        [
            est.named_steps["clf"].coef_.flatten()
            for est in cv_model["estimator"]
        ],
        columns=feature_names,
    )
    coefs = coefs[coefs.mean(axis=0).sort_values(ascending=False).index]
    plot_coefs(coefs, "Coefficient variability", axis_tick_label_fontsize,
               fig_size)
Example #7
        for dataset_ele in dataset_list:

            DATA_URL1 = PRJ_FOLDER + dataset_ele[1]
            print(DATA_URL1)
            GIST_utility.set_data_params(PRJ_FOLDER, dataset_ele[0], DATA_URL1)
            print('\n\nProcessing the Dataset ############# '+ des_i+ GIST_utility.DATA_NAME + ' #############')

            X, y = GIST_utility.summary_data(DATA_URL1,des_i)
            print('Dataset: {0} - {1}'.format(X.shape, y.shape))
            results = []
            algNames = []
            classifierNum = 0
            seed = 5
            testing_porc = 0.2
            scoring = 'accuracy'
            k_fold = model_selection.RepeatedKFold(n_splits=5, n_repeats=5, random_state=seed)
            # print(y)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testing_porc, random_state=seed)

            print('\n\nComparing the algorithms')
            print('Algorithm - Accuracy -  recall_micro - precision_micro -f1 - Time (HH:MM:SS.mmm)')
            csv_data_header = ["Algorithm", "Accuracy",  "recall_micro", "precision_micro","f1_micro", "Time_(HH:MM:SS.mmm)"]
            csv_data = []

            for name, clf in zip(names, classifiers):
                time1 = dtm.datetime.now()
                clf.fit(X_train, y_train)
                cv_results = model_selection.cross_val_score(clf, X_train, y_train, cv=k_fold)
                print(name,cv_results.mean())
                cv_f1 = model_selection.cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
Example #8
import scipy.io as sio
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection

data = sio.loadmat('ex3data1.mat')

A = data['X']
B = data['y']

rkf = model_selection.RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)
for train_index, test_index in rkf.split(A):
    X_train, X_test = A[train_index], A[test_index]
    y_train, y_test = B[train_index], B[test_index]
#X_train,X_test,y_train,y_test = train_test_split(A,B,test_size=0.3,random_state = 42)
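# Note: only the last train/test split produced by the loop above reaches the code
# below; to evaluate every repetition, the scaling, fitting and prediction steps
# would normally sit inside the loop.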

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = MLPClassifier(hidden_layer_sizes=(25,), max_iter=500)
clf.fit(X_train, y_train)
p = clf.predict(X_test)
print('This is the confusion matrix')
print(confusion_matrix(y_test, p))
Example #9
    return history, test_acc


# In[21]:

epochs = 150  # maximum number of training epochs
batch_size = 10
folds = 5  # the number of folds for k-fold cross validation
n_repeats = 1  # the number of repeats for repeated k-fold cross validation

# In[22]:

test_accs = []

stratified_folds = model_selection.RepeatedKFold(n_splits=folds,
                                                 n_repeats=n_repeats).split(
                                                     graph_labels,
                                                     graph_labels)
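# Note: despite the variable name, RepeatedKFold does not stratify by class;
# model_selection.RepeatedStratifiedKFold would preserve label proportions per fold.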
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path,
                                                  monitor='loss',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  save_weights_only=True)

for i, (train_index, test_index) in enumerate(stratified_folds):
    print(
        f"Training and evaluating on fold {i+1} out of {folds * n_repeats}...")
    train_gen, test_gen = get_generators(train_index,
                                         test_index,
                                         graph_labels,
                                         batch_size=batch_size)
Example #10
def _train_and_score(file_path, folds, iterations=None):
    """
    Use the given log_entries to score the classifier in a <fold>-fold cross-validation.
    :param iterations: Optionally specify to repeat <iterations> times.
    """

    if iterations is not None and iterations <= 1:
        iterations = None

    log_entries = _read_file_flow(file_path)

    if len(log_entries) < 10000:
        raise IOError(
            "Insufficient number of entries found in the file. Need >= 10,000."
        )

    scores = {}
    for app_id in ids_data.get_app_ids():
        scores[app_id] = []

    printer = util.prtr.Printer()

    printer.prt("Using {}-fold cross-validation".format(folds) +
                "" if iterations is None else " with {} iteration{}.".
                format(iterations, "s" if iterations > 1 else ""))

    folds = None
    if iterations:
        folds = sk_mod.RepeatedKFold(n_splits=folds, n_repeats=iterations)
    else:
        folds = sk_mod.KFold(n_splits=folds)

    current_round = 1

    for train_indices, score_indices in folds.split(log_entries):
        printer.prt("Round {} of {}.".format(current_round,
                                             folds * iterations))
        current_round += 1

        # Selecting items based on the given indices
        printer.prt("Splitting... ", newline=False)
        training_entries = [log_entries[i] for i in train_indices]
        scoring_entries = [log_entries[i] for i in score_indices]

        preconditions_msg = "Please make sure that all preconditions are met and rerun."

        # Train
        printer.prt("Training... ", newline=False)
        training_succeeded = _train_entries(training_entries,
                                            squelch_output=True)
        if not training_succeeded:
            printer.prt("")
            printer.prt("Training failed. " + preconditions_msg)
            continue

        # Score
        printer.prt("Scoring... ", newline=False)
        scoring_result = _score_entries(scoring_entries, squelch_output=True)
        if not scoring_result:
            printer.prt("")
            printer.prt("Scoring failed. " + preconditions_msg)
            # Don't continue; reset needs to happen in order to allow for the next iteration

        for app_id in scoring_result:
            scores[app_id].append(scoring_result[app_id])

        # Reset
        printer.prt("Resetting... ", newline=False)
        IntrusionClassifier.reset_models(purge=True)
        printer.prt("Done.")

    _print_scores(scores, printer)
Example #11
)
X_train_org, X_test_org, y_train_org, y_test_org = train_test_split(
    x, data['Method'], test_size=0.2, shuffle=True)
X_train_org1, X_test_org1, y_train_org1, y_test_org1 = train_test_split(
    x, data['Fighter1Result'], test_size=0.2, shuffle=True)

# In[ ]:

################################LOGISTIC REGRESSION#####################################
print(
    "######################Logistic Regression#####################################"
)
method_model = lm.LogisticRegression(
    warm_start=True, verbose=1)  #model selection Logistic Regression
kf = model_selection.RepeatedKFold(
    n_splits=5, n_repeats=4,
    random_state=None)  #applying k-fold cross-validations
y = y_train_org  #choosing output data column
lgreg_method_acc = []  #list for populating accuracies of validation phase

for train_index, test_index in kf.split(X_train_org):
    X_train, X_test = X_train_org.iloc[train_index], X_train_org.iloc[
        test_index]  #splitting of input data rows for training and testing
    y_train, y_test = y.iloc[train_index], y.iloc[
        test_index]  #splitting of output data rows for training and testing
    method_model.fit(X_train,
                     y_train)  #fitting the model on the training split
    pred = method_model.predict(
        X_test)  #calculating predictions on the testing split
    acc = accuracy_score(y_test, pred)  #computing accuracy
    lgreg_method_acc.append(acc)
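# The per-fold accuracies collected above would typically be summarized afterwards,
# for example (not part of the original snippet):
# print('Mean validation accuracy: %.3f' % (sum(lgreg_method_acc) / len(lgreg_method_acc)))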
Example #12
                                                          Y,
                                                          test_size=valid_size,
                                                          random_state=1)
if valid_size == 0: del X_valid, Y_valid, valid_size

#%% Cross Validation
n_lambdas = 31  #32
n_shuffles = 100  #100
n_folds = 100  #20
lambdas = np.linspace(.2, .5, num=n_lambdas)
score_lambdas = np.zeros(n_lambdas)
k = 0
for alpha in lambdas:  #Go over all possible lambdas
    score_cv = pd.DataFrame(np.zeros(shape=(n_shuffles, n_folds)))
    kfold = model_selection.RepeatedKFold(n_splits=n_folds,
                                          n_repeats=n_shuffles,
                                          random_state=42)
    i = 0
    j = 0
    for train_index, test_index in kfold.split(
            X, Y):  #Cross Validation for evaluation lambda
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

        model = linear_model.Lasso(alpha=alpha,
                                   fit_intercept=False,
                                   normalize=True,
                                   precompute=False,
                                   copy_X=True,
                                   max_iter=1000,
                                   tol=0.0001,
Example #13
def solveRegressor(rfConfig, X, y, saveLoc=None, initModel=None, CV=False):
    '''Generate and run an RF model
    
    This function is used for generating an RF model, and then running
    it. If an initial model is provided, then this will load the 
    model provided, and then use that model as an initializer. The 
    model generated will then be given a hot start from the initial 
    model. 
    
    Arguments:
        rfConfig {dict} -- the dictionary of hyperparameters
        X {numpy 2d array} -- The array of values that will
            be used for generating a prediction. 
        y {numpy 1d array} -- The expected result that we want
            the model to train to. 
    
    Keyword Arguments:
        saveLoc {str} -- Location where the model should be saved.
            This assumes that the location where the model is to be
            saved will be writable and exists. Remember, at this time
            the function just does not do any error checking. (default:
            {None}, in which case, the model is not saved.)
        initModel {RandomForestRegressor() model} -- This is the result
            of an earlier fitted model. In case one is provided, the
            current model will be restarted from this model (default: {None}
            in which case, a new model will be generated.)
        CV {bool} -- Whether to run a repeated k-fold cross-validation,
            configured from ../config/RepeatedKFold.json, before the final
            fit on the full data (default: {False})
    
    Returns:
        RandomForestRegressor() -- This is the result of a fitted model,
            given the data and the rest of the parameters. 
    '''

    if initModel is None:
        rfModel = RandomForestRegressor(**rfConfig)
    else:
        rfModel = initModel
        rfModel.set_params(warm_start=True)

    if CV:

        # We want to make sure that the information is meaningful
        # for all splits. Otherwise, its pretty meaningless ...
        # Obtain hyperparameters from the JSON file. This obviously
        # Takes a long time. So, we shall use this for testing only
        # -----------------------------------------------------------
        rkfFact = json.load(open('../config/RepeatedKFold.json'))
        rkf = MS.RepeatedKFold(**rkfFact)

        scores = []
        for train_index, test_index in tqdm(rkf.split(X),
                                            total=rkfFact['n_splits'] *
                                            rkfFact['n_repeats']):

            # We want to make sure that we start with
            # the provided model in every split. Otherwise
            # we will be training on top of the other models
            # as warm_start is True.
            if initModel is not None:
                rfModel = initModel
                rfModel.set_params(warm_start=True)

            rfModel.fit(X[train_index, :], y[train_index])
            yHat = rfModel.predict(X[test_index])
            score = 0
            score = np.sqrt(((yHat - y[test_index])**2).mean())

            scores.append(score)
            tqdm.write('Score = ({}) {}'.format(np.mean(scores), score))

        # Refitting the model with the whole data
        if initModel is not None:
            rfModel = initModel
            rfModel.set_params(warm_start=True)

        print('Score summary: {} +-({})'.format(np.mean(scores),
                                                np.std(scores)))
        print('Percentage difference: {}'.format(100 * np.mean(scores) /
                                                 np.mean(y)))

    rfModel.fit(X, y)

    return rfModel
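For context, ../config/RepeatedKFold.json is unpacked directly into MS.RepeatedKFold(**rkfFact), and its n_splits/n_repeats keys feed the tqdm total, so it presumably holds nothing but RepeatedKFold keyword arguments. A plausible file content, stated as an assumption rather than taken from the repository:

# ../config/RepeatedKFold.json (assumed shape, not from the source repo):
# {"n_splits": 5, "n_repeats": 10, "random_state": 42}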
Example #14
from sklearn import metrics, model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from helper import prepare_data

df = prepare_data()

y = df["Berri1"]

X = df[[
    "day", "month", "day_of_week", "Mean Temp (°C)", "Total Precip (mm)",
    "Snow on Grnd (cm)", "Min Temp (°C)", "Max Temp (°C)"
]]

regr = RandomForestRegressor(n_estimators=100)

rkf = model_selection.RepeatedKFold()

score, mse, r2 = [], [], []

for train_index, test_index in rkf.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    regr.fit(X_train, y_train)

    score.append(regr.score(X_test, y_test))

    y_pred = regr.predict(X_test)

    mse.append(metrics.mean_squared_error(y_test, y_pred))
Example #15
def learn():
    # np.warnings.simplefilter(action='ignore', category=UserWarning)
    overlaped = 5
    # windows_size = 10
    # clusters = 5
    data_set = process_from_files('test')
    print('get data set')
    classes_names_as_is_in_data = create_classes_names_list(data_set)
    print(f'get {len(classes_names_as_is_in_data)} classes')

    files_as_nested_list = get_files_as_list_of_lists(data_set)
    print(
        f"extract data for {len(files_as_nested_list)} files with {len(files_as_nested_list[0])} columns"
    )
    for clusters in [5, 10, 20]:
        windows_sizes = [5, 10, 20]
        for windows_size in windows_sizes:
            if windows_size == 5:
                overlaps = [1, 4]
            elif windows_size == 10:
                overlaps = [1, 5, 9]
            elif windows_size == 15:
                overlaps = [1, 7, 14]
            elif windows_size == 20:
                overlaps = [1, 10, 19]
            elif windows_size == 25:
                overlaps = [1, 13, 24]
            elif windows_size == 30:
                overlaps = [1, 15, 29]
            elif windows_size == 35:
                overlaps = [1, 19, 34]
            elif windows_size == 40:
                overlaps = [1, 20, 39]
            for overlaped in overlaps:
                X_train, X_test, _, y_test = train_test_split(
                    files_as_nested_list,
                    classes_names_as_is_in_data,
                    test_size=0.9,
                    random_state=4564567,
                    shuffle=True)

                files_as_windows_test = get_overlapped_chunks_separated_for_files(
                    X_test, windows_size, overlaped)
                all_sliding_windows = get_all_overlapped_chunks(
                    X_train, windows_size, overlaped)
                print(
                    f'Generate {len(all_sliding_windows)} windows to create codebook'
                )
                kmeans_models = prepare_codebook(all_sliding_windows, clusters)
                print(f'create {len(kmeans_models)} models')
                histograms_test = get_histogram_basic_on_kmean(
                    clusters, kmeans_models, files_as_windows_test)

                # find_the_best(X_train, X_test, y_train1, y_test1)
                models = get_models()

                for name, model in models:
                    kfold = model_selection.RepeatedKFold(n_splits=5,
                                                          random_state=7,
                                                          n_repeats=10)
                    # selection = svc_param_selection(histograms_test, y_test, kfold, model, name)
                    # print(selection)
                    cv_results = model_selection.cross_val_score(
                        model,
                        histograms_test,
                        y_test,
                        cv=kfold,
                        scoring='accuracy')
                    msg = "%s: %f (%f)" % (name, cv_results.mean(),
                                           cv_results.std())
                    print(msg)