Example #1
    def __init__(self, categorical_features_indices, params):
        super().__init__(params)
        self.categorical_features_indices = categorical_features_indices
        self.name = 'CatBoostRegressor'
        self.cv = KFold(5, shuffle=True, random_state=1)
        self.metrics = {'MAE': mae_score, 'RMSE': rmse_score}

    res = []
    for md in max_depths:
        # print()
        for nest in n_estimators:

            mdl = RandomForestClassifier(max_depth=md,
                                         n_estimators=nest,
                                         oob_score=True,
                                         max_features='sqrt',  # 'auto' is deprecated in newer scikit-learn; 'sqrt' is the classifier equivalent
                                         random_state=88)
            # K-fold cross validation
            scores_test, scores_train = [], []
            kf = KFold(n_splits=k_fold)

            for train_index, test_index in kf.split(X):
                X_train = X[train_index]
                X_test = X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                mdl.fit(X_train, y_train)
                score_test = mdl.score(X_test, y_test)
                score_train = mdl.score(X_train, y_train)
                # print("\t score test {:.3f} train {:.3f}  ".format(score_test, score_train))
                scores_test.append(score_test)
                scores_train.append(score_train)

            # print("md {}, nest {},  test {:.3f} +- {:.2f} train {:.3f}".format(md,nest,np.mean(scores_test), np.std(scores_test), np.mean(scores_train)  ))
            res.append({
                # the original snippet is truncated here; keys assumed from the
                # commented-out print above
                'max_depth': md,
                'n_estimators': nest,
                'test_mean': np.mean(scores_test),
                'test_std': np.std(scores_test),
                'train_mean': np.mean(scores_train),
            })

    def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if self.n_splits > 1:
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)
Example #4
def train_LSTM(X,
               y,
               model,
               inp_dim,
               weights,
               epochs=EPOCHS,
               batch_size=BATCH_SIZE):
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print(cv_object)
    p, r, f1 = 0., 0., 0.
    p1, r1, f11 = 0., 0., 0.
    sentence_len = X.shape[1]
    for train_index, test_index in cv_object.split(X):
        if INITIALIZE_WEIGHTS_WITH == "glove":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print "ERROR!"
            return
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in range(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                class_weights = None
                if SCALE_LOSS_FUN:
                    class_weights = {}
                    class_weights[0] = np.where(
                        y_temp == 0)[0].shape[0] / float(len(y_temp))
                    class_weights[1] = np.where(
                        y_temp == 1)[0].shape[0] / float(len(y_temp))
                    class_weights[2] = np.where(
                        y_temp == 2)[0].shape[0] / float(len(y_temp))

                try:
                    y_temp = np_utils.to_categorical(y_temp, num_classes=3)  # num_classes was nb_classes in Keras 1.x
                except Exception as e:
                    print(e)
                    print(y_temp)
                print(x.shape, y.shape)
                loss, acc = model.train_on_batch(x,
                                                 y_temp,
                                                 class_weight=class_weights)
                print(loss, acc)

        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print(classification_report(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))
        print(y_pred)
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')

    print "macro results are"
    print "average precision is %f" % (p / NO_OF_FOLDS)
    print "average recall is %f" % (r / NO_OF_FOLDS)
    print "average f1 is %f" % (f1 / NO_OF_FOLDS)

    print "micro results are"
    print "average precision is %f" % (p1 / NO_OF_FOLDS)
    print "average recall is %f" % (r1 / NO_OF_FOLDS)
    print "average f1 is %f" % (f11 / NO_OF_FOLDS)
X = X_train_df.values.astype('float32')
X_scaler = MinMaxScaler(feature_range=(0, 1))
X = X_scaler.fit_transform(X)
X_test = X_df.loc[test_dates_in_X_df]

Y_df = pd.read_csv('swell_Y.csv', index_col=[0])
Y_train_df = Y_df.loc[set(X_train_df.index.values)]
Y = Y_train_df.values  # Y values for the 24 hours, in a form like 100101011...

number_of_var = len(X_train_df.columns)
first_layer_node_cnt = int(number_of_var * (number_of_var - 1) / 2)
print("first_layer_node_cnt %d" % first_layer_node_cnt)
epochs = 300
patience_num = 100
n_fold = 10
kf = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

# empty accuracy list
accuracy = []
filename = os.path.basename(os.path.realpath(sys.argv[0]))

# model definition, compilation, and training
for train_index, validation_index in kf.split(X):  # train the model on each fold, then evaluate it below.
    print("TRAIN:", train_index, "TEST:", validation_index)
    X_train, X_Validation = X[train_index], X[validation_index]
    Y_train, Y_Validation = Y[train_index], Y[validation_index]
    model = Sequential()
    model.add(
        Dense(first_layer_node_cnt, input_dim=number_of_var,
              activation='relu'))
    model.add(Dense(int(first_layer_node_cnt / 2), activation='relu'))
Example #6
def calc_model():
    global word_features, classifier, word_features_2gram
    # documents = [(list(movie_reviews.words(fileid)), category)
    #              for category in movie_reviews.categories()
    #              for fileid in movie_reviews.fileids(category)]

    documents = []
    documents2gram = []

    with open("positive.txt", 'r') as csv_file:
        pos = 1
        for record in csv_file:
            documents.append((word_tokenize(record), pos))
            # sixgrams = get_ngrams(record, 2)
            # documents2gram.append((get_ngrams(record, 2), pos))

    with open("negative.txt", 'r') as csv_file:
        for record in csv_file:
            documents.append((word_tokenize(record), 0))

            # documents2gram.append((get_ngrams(record, 2), 0))


    random.shuffle(documents)
    # random.shuffle(documents2gram)

    all_words = []
    for lst in documents:
        for w in lst[0]:
            all_words.append(w.lower())

    # all_words_2gram = []
    # for lst in documents2gram:
    #     for w in lst[0]:
    #         all_words_2gram.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    print("getting features")
    word_features = list(all_words.keys())[:5000]

    # all_words_2gram = nltk.FreqDist(all_words_2gram)
    # print("getting features")
    # word_features_2gram = list(all_words_2gram.keys())[:5000]

    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per tweet")
    feature_sets = [(find_features(rev), category) for (rev, category) in documents]
    # feature_sets_2gram = [(find_features(rev), category) for (rev, category) in documents2gram]



    k = 10
    cv = KFold(k)  # note: this KFold object is never used below; the split is a simple hold-out
    accur = []
    i = 0

    training_set = feature_sets[:1900]  # + feature_sets_2gram[:1900]
    testing_set = feature_sets[1900:]  # + feature_sets_2gram[1900:]

    linear_svc_classifier = SklearnClassifier(LinearSVC())
    # classifier = nltk.NaiveBayesClassifier.train(training_set)
    classifier = linear_svc_classifier.train(training_set)
    accur.insert(i, nltk.classify.util.accuracy(classifier, testing_set))

    print('LinearSVC_classifier average accuracy:', sum(accur) / len(accur))
Example #7
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(nthread=4,
                             n_estimators=10000,
                             learning_rate=0.02,
                             num_leaves=32,
                             colsample_bytree=0.9497036,
                             subsample=0.8715623,
                             max_depth=8,
                             reg_alpha=0.04,
                             reg_lambda=0.073,
                             min_split_gain=0.0222415,
                             min_child_weight=40,
                             silent=-1,
                             verbose=-1)

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name,
                                                 index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
Example #8
def train_classifier(name,
                     classifier,
                     parameters,
                     data,
                     seed,
                     k=10,
                     plot=False):
    """
    Train the given classifier using a 10-fold cross validation.
    The given hyper-parameters are chosen using a 5-fold cross validation.
    :param name: Name of the classifier to train.
    :param classifier: Classifier to train.
    :param parameters: Hyper-parameters to choose from.
    :param data: Samples to use for the training and test.
    :param seed: Seed to use for the random generator, used for the k-fold split.
    :param k: Number of folds for the cross validation (default 10).
    :param plot: If True, a plot for the hyper-parameters will be generated.
    :return: Performances (tuple of accuracy, f1, roc_auc) of the classifiers
             trained with the best hyper-parameters for each k-fold.
             A short usage sketch follows the function definition below.
    """

    # save results of the 10-folds
    folds = []

    # store the performances of the best 10 trained classifiers
    accuracy = []
    f1 = []
    roc_auc = []

    # divide features and classes
    X, y = data

    # split the data for the 10-fold cross validation
    kf = KFold(n_splits=k, shuffle=True, random_state=seed)
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        print("Training {}... Fold {}".format(name, i + 1))

        # extract training and test data
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # tune the hyper-parameters (5-fold cross validation, F1 scoring function)
        best_classifier = GridSearchCV(classifier,
                                       parameters,
                                       scoring="f1",
                                       cv=5,
                                       n_jobs=5,
                                       refit=True)
        best_classifier.fit(X_train, y_train)

        # store the classifier for the plot (tuning of hyper-parameter)
        folds.append(best_classifier)

        # measure the performances
        prediction = best_classifier.predict(X_test)
        accuracy.append(accuracy_score(y_test, prediction))
        f1.append(f1_score(y_test, prediction))
        roc_auc.append(roc_auc_score(y_test, prediction))

    # plot the choice of the best hyper-parameters
    if plot is True:

        # extract the parameter
        if len(parameters.keys()) != 1:
            raise NotImplementedError(
                "The number of hyper-parameters is not equal to 1. "
                "I do not know how to plot them.")
        parameter_name = list(parameters.keys())[0]

        # extract the hyper-parameter values
        assert len(folds) == k
        x_axis_values = folds[0].cv_results_["param_" + parameter_name]
        for i in range(k):
            np.testing.assert_array_equal(
                x_axis_values, folds[i].cv_results_["param_" + parameter_name])

        # extract the F1 scores
        y_axis_values = list(
            map(lambda x: x.cv_results_["mean_test_score"], folds))

        # plot the graph
        for i in range(k):
            plt.semilogx(x_axis_values,
                         y_axis_values[i],
                         label="Fold " + str(i + 1))
        plt.title(name + " - tuning of the best value for " + parameter_name)
        plt.xlabel(parameter_name + " parameter")
        plt.ylabel("F1 scores (on the test set)")
        plt.legend(loc=4)
        plt.savefig(name.lower().replace(" ", "_") + ".png")
        plt.close()

    # return the performances
    return accuracy, f1, roc_auc
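
# A minimal usage sketch for train_classifier (not part of the original example).
# The dataset and the single-parameter grid below are illustrative assumptions;
# note that plot=True requires exactly one hyper-parameter in the grid.
from sklearn.datasets import load_breast_cancer
from sklearn.svm import LinearSVC

X_demo, y_demo = load_breast_cancer(return_X_y=True)  # illustrative binary dataset
param_grid = {"C": [0.01, 0.1, 1, 10]}                 # single hyper-parameter to tune
accuracy, f1, roc_auc = train_classifier("Linear SVC",
                                         LinearSVC(),
                                         param_grid,
                                         (X_demo, y_demo),
                                         seed=42,
                                         k=10,
                                         plot=True)
print("mean accuracy over the 10 folds:", sum(accuracy) / len(accuracy))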
# The following figure illustrates k-fold cross-validation with k=4. There are some other schemes for dividing the training set; we'll look at them briefly later.
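
# The figure itself is not reproduced in this export; as an illustrative stand-in,
# the short sketch below prints the k=4 splits that KFold produces on a toy array.

from sklearn.model_selection import KFold
import numpy as np

toy_X = np.arange(8)  # 8 samples, purely for illustration
for fold, (train_idx, test_idx) in enumerate(KFold(n_splits=4).split(toy_X)):
    print("fold", fold + 1, "train:", train_idx, "test:", test_idx)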

# ### K-Fold Cross Validation

# It is a statistical technique which enables us to make extremely efficient use of the available data. It divides the data into several pieces, or 'folds', and uses each piece as the test set one at a time.

# In[103]:

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# In[104]:

xgb = XGBClassifier(n_estimators=18)
scores = cross_val_score(xgb, X_smote, y_smote, scoring='r2', cv=5)
scores

# In[105]:

# the other way of doing the same thing (more explicit)

# create a KFold object with 5 splits
folds = KFold(n_splits=5, shuffle=True, random_state=100)
scores_1 = cross_val_score(xgb, X_smote, y_smote, scoring='r2', cv=folds)
scores_1
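
# Under the hood, cross_val_score clones the estimator, fits the clone on each
# training split produced by the KFold object, and scores the held-out split.
# A minimal manual equivalent (illustrative; assumes X_smote and y_smote are
# array-like, as returned by SMOTE earlier in the notebook):
from sklearn.base import clone
from sklearn.metrics import r2_score
import numpy as np

manual_scores = []
X_arr, y_arr = np.asarray(X_smote), np.asarray(y_smote)
for train_idx, test_idx in folds.split(X_arr):
    est = clone(xgb)  # fresh, unfitted copy of the XGBClassifier
    est.fit(X_arr[train_idx], y_arr[train_idx])
    manual_scores.append(r2_score(y_arr[test_idx], est.predict(X_arr[test_idx])))
manual_scores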

# We used several methods to predict default. The best result came from XGBoost trained on data resampled with SMOTE: its accuracy on the test set is 0.981, and the most important features are V4, V14, V12, V16 and V11. Logistic regression also gave a good score, with Accuracy: 97.6780%, Recall: 87.8378% and ROC AUC: 92.7664% for the classification model.

# In[ ]: