Exemple #1
0
def get_model(inputSize, classWeight):
    """Build an untrained model for the backend named by the global ``aiLib``.

    Args:
        inputSize: Number of input features. Only the Keras branch uses it,
            as the ``input_shape`` of the first dense layer.
        classWeight: Forwarded as ``class_weight`` to the ``SVC`` and ``XGB``
            estimators. The Keras branch does not use it.

    Returns:
        A compiled Keras ``Sequential`` network when ``aiLib == 'keras'``,
        an ``SVC`` when ``aiLib == 'sklearn'`` or an ``XGB`` estimator when
        ``aiLib == 'xgboost'``.

    Raises:
        ValueError: If ``aiLib`` names none of the supported backends
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if aiLib == 'keras':
        model = Sequential()
        model.add(Dense(units=1000,
                        activation='tanh',
                        input_shape=(inputSize,),
                        kernel_initializer='lecun_normal',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
        model.add(Dropout(rate=0.5))
        # NOTE(review): this layer configures the *bias* initializer and
        # regularizer while the first layer configures the *kernel* ones --
        # confirm that the asymmetry is intended.
        model.add(Dense(units=1000,
                        activation='tanh',
                        bias_initializer='lecun_normal',
                        bias_regularizer=regularizers.l2(0.01)
                        ))
        model.add(Dropout(rate=0.5))
        # Ten-way softmax output, matching num_class=10 of the XGB branch.
        model.add(Dense(units=10, activation='softmax'))
        # The network is compiled with Adam. An SGD optimizer used to be
        # constructed here but was never passed to compile(), so that dead
        # local has been removed.
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy')
        return model
    if aiLib == 'sklearn':
        return SVC(probability=True, class_weight=classWeight)
    if aiLib == 'xgboost':
        # NOTE(review): `probability` and `class_weight` look like SVC
        # options carried over to XGB -- confirm the estimator accepts them.
        return XGB(probability=True, class_weight=classWeight,
                   eta=1e-3, objective='multi:softprob', num_class=10,
                   max_depth=20)
    raise ValueError(
        "Unsupported aiLib {!r}; expected 'keras', 'sklearn' or "
        "'xgboost'".format(aiLib))
Exemple #2
0
def combinedTwo(X_train, X_test, y_train):
    """Combine an XGB model on log-FFT features with an RF on mean/std features.

    Both classifiers are fitted on ``X_train``/``y_train`` and produce class
    probabilities for ``X_test``. For every sample and class the larger of
    the two probabilities is kept, and the index of the class with the
    highest kept probability is returned.

    Args:
        X_train: Training array indexed as (sample, channel, time).
        X_test: Test array with the same layout.
        y_train: Training labels.

    Returns:
        Array with one predicted class *index* (a column position in the
        ``predict_proba`` output) per test sample.
    """

    def _log_fft_features(samples):
        # Channels 4 and up only; keep the first 63 FFT bins of each channel.
        magnitude = np.abs(np.fft.fft(samples[:, 4:]))[:, :, :63]
        log_magnitude = np.log10(magnitude + 1)
        dims = np.shape(log_magnitude)
        # Flatten (channel, bin) into a single feature axis.
        return log_magnitude.reshape([dims[0], dims[1] * dims[2]])

    def _mean_std_features(samples):
        # Per-channel mean and standard deviation over the last axis.
        return np.hstack([np.mean(samples, 2), np.std(samples, 2)])

    # Model 1: gradient boosting on the log10 FFT magnitudes.
    fft_model = XGB(n_estimators=1000, gamma=0.87)
    fft_train = _log_fft_features(X_train)
    fft_test = _log_fft_features(X_test)
    fft_model.fit(fft_train, y_train)
    fft_proba = fft_model.predict_proba(fft_test)

    # Model 2: random forest on the mean/std summary statistics.
    stats_model = RFC(n_estimators=1000)
    stats_train = _mean_std_features(X_train)
    stats_test = _mean_std_features(X_test)
    stats_model.fit(stats_train, y_train)
    stats_proba = stats_model.predict_proba(stats_test)

    # Trust whichever classifier is most confident for each sample.
    stacked = np.stack([fft_proba, stats_proba])
    return np.argmax(np.max(stacked, 0), 1)
Exemple #3
0
def try_params(n_iterations, params, data, get_predictions=False):
    """Train and evaluate an XGB classifier for one sampled configuration.

    Args:
        n_iterations: Resource budget; multiplied by the module-level
            ``trees_per_iteration`` and rounded to get the number of trees.
        params: Keyword arguments forwarded to the ``XGB`` constructor.
        data: Dataset handed to ``train_and_eval_sklearn_classifier``.
        get_predictions: Accepted for interface compatibility; not used here.

    Returns:
        Whatever ``train_and_eval_sklearn_classifier`` returns.
    """
    tree_count = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", tree_count)
    pprint(params)

    return train_and_eval_sklearn_classifier(
        XGB(n_estimators=tree_count, nthread=-1, **params), data)
Exemple #4
0
def try_params(n_iterations, params, data, get_predictions=False):
    """Train and evaluate an XGB regressor for one sampled configuration.

    Args:
        n_iterations: Resource budget; multiplied by the module-level
            ``trees_per_iteration`` and rounded to get the number of trees.
        params: Keyword arguments forwarded to the ``XGB`` constructor.
        data: Dataset handed to ``train_and_eval_sklearn_regressor``.
        get_predictions: Accepted for interface compatibility; not used here.

    Returns:
        Whatever ``train_and_eval_sklearn_regressor`` returns.
    """
    tree_count = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", tree_count)
    pprint(params)

    return train_and_eval_sklearn_regressor(
        XGB(n_estimators=tree_count, nthread=-1, **params), data)
Exemple #5
0
def fast_gbtree_classifier(
    X,
    y,
    *,
    learning_rate: float = 1.0,
    n_estimators: int = 100,
    subsample: float = 0.8,
    max_depth: Optional[int] = None,
    reg_alpha: Optional[float] = None,  # L1
    reg_lambda: Optional[float] = 1e-05,  # L2
    gamma: Optional[float] = None,
    missing: Optional[Any] = np.nan,
    objective: Objectives = 'binary:logistic',
    grow_policy: Literal['depthwise', 'lossguide'] = 'depthwise',
    tree_method: Literal['auto', 'exact', 'approx', 'hist',
                         'gpu_hist'] = 'auto',
    importance_type: Literal['gain', 'weight', 'cover', 'total_gain',
                             'total_cover'] = 'gain',
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'xgboost', 'sklearn'] = 'auto',
    **kwargs,
) -> GradientBoostingClassifier:
    """Fit a boosted-tree classifier through one interface for two backends.

    Unless ``framework == 'sklearn'``, ``xgboost.XGBRFClassifier`` is used
    when it can be imported; otherwise the function warns and falls back to
    sklearn's ``GradientBoostingClassifier``. For the sklearn estimator only
    the keyword arguments that appear in its ``__init__`` signature are
    forwarded; the xgboost estimator receives all of them.

    Args:
        X: Training features, passed to ``fit``.
        y: Training targets, passed to ``fit``.
        framework: ``'sklearn'`` forces the sklearn estimator; any other
            value tries xgboost first.
        **kwargs: Extra constructor options; they override the named
            keyword arguments on a name clash.

    All remaining keyword arguments are constructor options for the
    chosen estimator.

    Returns:
        The fitted estimator.
    """
    # Must stay the first statement: snapshot the call arguments before any
    # other local name is created.
    options = dict(locals())
    extra = options.pop('kwargs')
    X = options.pop('X')
    y = options.pop('y')
    options.update(extra)
    framework = options.pop('framework')

    # Pick the estimator class.
    use_xgboost = False
    if framework == 'sklearn':
        booster_cls = GradientBoostingClassifier
    else:
        try:
            from xgboost import XGBRFClassifier as booster_cls
            use_xgboost = True
        except ImportError:
            warn('Run `pip install xgboost` to get significant '
                 'faster GradientBoostingTree')
            booster_cls = GradientBoostingClassifier

    # The sklearn constructor rejects unknown keywords, so keep only the
    # options named in its signature.
    if not use_xgboost:
        signature = inspect.getfullargspec(booster_cls.__init__)
        accepted = signature.args + signature.kwonlyargs
        options = {name: options[name] for name in accepted
                   if name in options}

    # Build and train.
    fitted = booster_cls(**options)
    fitted.fit(X, y)
    return fitted
Exemple #6
0
def baseline(X_train, X_test, y_train):
    """Predict test labels with a single XGB model on log10 FFT features.

    Args:
        X_train: Training array indexed as (sample, channel, time).
        X_test: Test array with the same layout.
        y_train: Training labels.

    Returns:
        The output of ``predict`` for the test samples.
    """

    def _log_fft_features(samples):
        # Channels 4 and up only; keep the first 63 FFT bins of each channel.
        magnitude = np.abs(np.fft.fft(samples[:, 4:]))[:, :, :63]
        log_magnitude = np.log10(magnitude + 1)
        dims = np.shape(log_magnitude)
        # Flatten (channel, bin) into a single feature axis.
        return log_magnitude.reshape([dims[0], dims[1] * dims[2]])

    booster = XGB(n_estimators=300, gamma=0.87)

    train_features = _log_fft_features(X_train)
    test_features = _log_fft_features(X_test)

    booster.fit(train_features, y_train)
    return booster.predict(test_features)
Exemple #7
0
def main(args):
    """Load a pickled dataset and run k-fold validation or a single split.

    Column 0 of the loaded array is used as the label and the remaining
    columns as features.

    Args:
        args: Namespace providing ``all_feats``, ``pruned_ds``, ``full_ds``,
            ``num_folds``, ``algorithm`` (``'NB'``, ``'RF'`` or ``'XGB'``)
            and ``seed``.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: In single-split mode, if ``args.algorithm`` is not one of
            the supported names (previously an ``UnboundLocalError``).
    """
    start = time.time()

    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at dataset files from a trusted source.
    dataset_path = args.full_ds if args.all_feats else args.pruned_ds
    with open(dataset_path, 'rb') as infile:  # close the handle promptly
        data = pickle.load(infile)
    if args.all_feats:
        # The full dataset stores the feature vector at index 1 of each entry.
        data = np.array([feats[1] for feats in data])

    X = data[:, 1:]
    y = data[:, 0]

    if args.num_folds > 0:
        print(f'Performing {args.num_folds}-fold validation')
        f_scores = kfold_validation(X, y, algorithm=args.algorithm, num_folds=args.num_folds)
        accs = kfold_scores(f_scores)
        print(f_scores)
        print(f'Average accuracy of {args.num_folds}-folds: {100*accs[0]:.2f}%')
        print(f'Best accuracy of {args.num_folds}-folds: {100*accs[1]:.2f}%')
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=args.seed)
        print(f'Train data: {X_train.shape}, train labels: {y_train.shape}')
        # Fixed: this line used to report y_train.shape for the test labels.
        print(f'Test data: {X_test.shape}, test labels: {y_test.shape}')

        if args.algorithm == 'NB':
            model = BernoulliNB()
            model.fit(X_train, y_train)
        elif args.algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10, n_jobs=os.cpu_count(), verbose=2)
            model.fit(X_train, y_train)
        elif args.algorithm == 'XGB':
            model = XGB(verbosity=1, n_estimators=1000, max_depth=8, reg_lambda=1e-2, reg_alpha=4)
            # NOTE(review): the test split doubles as the early-stopping
            # set, so the reported test score is optimistic.
            model.fit(X_train, y_train, eval_set=[(X_test,y_test)], eval_metric='logloss', verbose=True, early_stopping_rounds=20)
        else:
            raise ValueError(
                f"Unknown algorithm {args.algorithm!r}; expected 'NB', 'RF' or 'XGB'")

        # test model
        test_model(model, X_test, y_test)

    print(f'Script completed in {time.time()-start:.2f} secs')

    return 0
Exemple #8
0
def try_params(n_iterations, params):
    """Evaluate one of the sampled hyper-parameter configurations.

    :param n_iterations:
        Resource budget; multiplied by the module-level
        ``trees_per_iteration`` and rounded to get the number of trees.

    :param params:
        Configuration (constructor keyword arguments) for the ``XGB`` model.

    :returns:
        The result of ``train_and_eval_sklearn_classifier`` for the
        configured classifier. The dataset is read from the name ``data``
        in the enclosing scope, not from a parameter.
    """
    tree_count = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", tree_count)
    pprint(params)

    return train_and_eval_sklearn_classifier(
        XGB(n_estimators=tree_count, nthread=-1, **params), data)
Exemple #9
0
def kfold_validation(features, labels, algorithm='XGB', num_folds=2):
    """Train ``algorithm`` on each K-fold split and collect the fold scores.

    Args:
        features: Feature array, indexable by the fold index arrays.
        labels: Label array aligned with ``features``.
        algorithm: ``'NB'`` (BernoulliNB), ``'RF'`` (RandomForest) or
            ``'XGB'``.
        num_folds: Number of folds passed to ``KFold``.

    Returns:
        Dict ``{'train': [...], 'val': [...]}`` holding ``model.score`` on
        the training and validation part of every fold, in fold order.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.

    Side effects:
        After every fold the scores collected so far are pickled to
        ``fold_accs_random_forest.npy`` in the working directory. The file
        name is kept for compatibility; the content is a pickle and the
        same name is used for every algorithm.
    """
    # Fail before any training instead of hitting an unbound `model` later.
    if algorithm not in ('NB', 'RF', 'XGB'):
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected 'NB', 'RF' or 'XGB'")

    kf = KFold(n_splits=num_folds)

    fold_scores = {'train':[], 'val':[]}

    for fold_num, (train_idx, val_idx) in enumerate(kf.split(features), start=1):
        print(f'Training on fold {fold_num}')
        X_train, y_train = features[train_idx], labels[train_idx]
        X_val, y_val = features[val_idx], labels[val_idx]

        # Fixed: dispatch on the `algorithm` parameter. The old code read
        # the global `args.algorithm`, which ignored the argument and broke
        # whenever the function was called without a global `args`.
        if algorithm == 'NB':
            model = BernoulliNB()
            model.fit(X_train, y_train)
        elif algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10, n_jobs=os.cpu_count(), verbose=2)
            model.fit(X_train, y_train)
        else:  # 'XGB'
            model = XGB(verbosity=1, n_estimators=1000, max_depth=3, reg_lambda=1, reg_alpha=1e-4)
            # NOTE(review): the validation fold doubles as the
            # early-stopping set, so its score is optimistic.
            model.fit(X_train, y_train, eval_set=[(X_val,y_val)], eval_metric='logloss', verbose=True, early_stopping_rounds=20)

        train_score = model.score(X_train, y_train)
        fold_scores['train'].append(train_score)

        val_score = model.score(X_val, y_val)
        fold_scores['val'].append(val_score)

        print(f'Fold {fold_num}: training score = {train_score}, validation score = {val_score}')

        # Rewritten after each fold so partial results survive a crash.
        with open('fold_accs_random_forest.npy', 'wb') as outfile:
            pickle.dump(fold_scores, outfile)

    return fold_scores
Exemple #10
0
        # Body of a branch whose if/elif header lies above this fragment.
        # Every branch wraps a different estimator in a GridSearchCV with
        # cv=7, its own parameter grid and the metric named by `score`.
        clf = GridSearchCV(RFC(),
                           RFC_tuned_parameter,
                           cv=7,
                           scoring='%s' % score)
    elif (x == 9):
        clf = GridSearchCV(ABC(),
                           ABC_tuned_parameter,
                           cv=7,
                           scoring='%s' % score)
    elif (x == 10):
        clf = GridSearchCV(GBC(),
                           GBC_tuned_parameter,
                           cv=7,
                           scoring='%s' % score)
    elif (x == 11):
        clf = GridSearchCV(XGB(),
                           XGB_tuned_parameter,
                           cv=7,
                           scoring='%s' % score)

    # Progress marker printed just before the grid search is fitted.
    print("Check Point")

    # NOTE(review): `clf` is only bound if one of the branches matched `x`;
    # no fallback is visible in this fragment.
    clf.fit(X_train, y_train)

    print()
    print("Grid scores on development set:")
    print()
    # Per-candidate mean and standard deviation of the cross-validated score.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # Disabled per-candidate report (the statement is cut off in this view):
    #    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    #        print("%0.3f (+/-%0.03f) for %r"
Exemple #11
0
# Histogram bin edges: 0, NUM, 2*NUM, ..., NUM*NUM (NUM + 1 float edges).
BINS = NUM * np.arange(NUM + 1, dtype=float)

# Load the data set.
X, Y = Loader.data_load(CLASS, PARTS, PATH)

# Replace the samples by their colour histograms.
X = Loader.histogram(X, BINS, NUM)

# Preprocess and obtain the train/test index sets.
X, TRAIN_IND, TEST_IND = Loader.preproc(X)
print(X.shape)
print(X[TRAIN_IND].shape, X[TEST_IND].shape)
# Held-out pair; not passed to fit() below.
eval_set = [X[TEST_IND], Y[TEST_IND]]

# Create and fit the model.
model = XGB(
    max_depth=DEPTH,
    n_estimators=ESTIMATORS,
    learning_rate=RATE,
    nthread=4,
)
model.fit(X[TRAIN_IND], Y[TRAIN_IND])

# Predict on a copy of the test rows.
NANI = np.copy(X[TEST_IND])
# Predictions for train data (disabled):
#Y_P = model.predict(X[TRAIN_IND])
#accuracy = score(Y[TRAIN_IND], Y_P.round())
#print('TRAIN ACCURACY = ', accuracy*100, '%')
# Predictions for test data.
Y_P = model.predict(NANI)
accuracy = score(Y[TEST_IND], Y_P.round())
print('TEST ACCURACY = ', accuracy * 100, '%')
Exemple #12
0
# Persist the fitted vectorizer `cv` (created above this fragment) so the
# same vocabulary can be reloaded later.
print(f"Saving tfidf count vector to {file_name}")
joblib.dump(cv, file_name)


# Feature Scaling (disabled)
# =============================================================================
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train_scaled = sc.fit_transform(X_train_cv)
# X_test_scaled = sc.transform(X_test_cv)
# =============================================================================

# Candidate classifiers, keyed by the name used for reporting.
classifiers = { 'MultinomialNB' : MNB(),
               'RandomForest': RF(n_jobs=-1),
               'GradientBoosting': GBC(),
               'xgb': XGB()}

# Column names for the per-model score rows, and the list collecting them.
score_list_columns = ['model_name', 'accuracy', 'precision', 'recall', 'f1_score']
score_list = []

best_score = 0
for model_name, model in classifiers.items():
    # Fit the current classifier (every entry of `classifiers` in turn).
    print("="*60)
    # NOTE(review): despite the name, `y_score` receives the return value of
    # fit(), which for sklearn-style estimators is the estimator itself.
    y_score = model.fit(X_train, y_train)
    print(model)
    # Predict labels and class probabilities for the test set.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Cost/benefit values handed to get_model_profits below.
    # NOTE(review): presumably laid out like a confusion matrix -- confirm
    # the cell order against get_model_profits.
    cost_benefit = np.array([[100, -10], [0, 0]])

    # Raw train/test split loaded from the corpus file.
    X_train_raw, X_test_raw, y_train, y_test = get_train_test(corpus_filepath)

    print("Training & Test", X_train_raw.shape, X_test_raw.shape,
          y_train.shape, y_test.shape)

    # TF-IDF model limited to 1000 features with English stop words removed.
    cv = TfidfVectorizer(max_features=1000, stop_words="english")
    print(cv)

    # The vectorizer is fitted on the training split only; both splits are
    # converted to dense arrays.
    X_train = cv.fit_transform(X_train_raw).toarray()
    X_test = cv.transform(X_test_raw).toarray()

    #models = [RF(n_jobs=-1), LR(n_jobs=-1), GBC(), SVC(probability=True)]
    models = [MultinomialNB(), GaussianNB(), RF(n_jobs=-1), GBC(), XGB()]

    # One (model, profits, thresholds) tuple per candidate model.
    model_profits = []
    for model in models:
        print(model.__class__.__name__)
        profits, thresholds = get_model_profits(model, cost_benefit, X_train,
                                                X_test, y_train, y_test)
        model_profits.append((model, profits, thresholds))

    plot_model_profits(model_profits, "./presentation/proft_curve.png")
    #plot_model_profits(model_profits)

    max_model, max_thresh, max_profit, summary_list = find_best_threshold(
        model_profits)
    # NOTE(review): the threshold is compared against the whole
    # predict_proba output; if that has one column per class, the mean below
    # mixes both classes -- confirm whether a `[:, 1]` selection is missing.
    max_labeled_positives = max_model.predict_proba(X_test) >= max_thresh
    proportion_positives = max_labeled_positives.mean()
Exemple #14
0

    # LSTM binary classifier: 32 units over a single time step with
    # x_train.shape[1] features, followed by a sigmoid output.
    # NOTE(review): model2 is not placed in `clfs` below within this
    # fragment -- confirm where it is used.
    model2 = Sequential()
    model2.add(LSTM(units=32, input_shape=(1, x_train.shape[1])))
    model2.add(Dense(1, activation='sigmoid'))
    model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    ### First-layer (base) models of the stack
    clfs = [

         # GBDT(n_estimators=100),

        # RF(n_estimators=100),

        model1,
        XGB(n_estimators=100)

        # SVM()

    ]

    # One column per base model; filled by the stacking loop below.
    X_train_stack  = np.zeros((x_train.shape[0], len(clfs)))
    X_test_stack = np.zeros((x_test.shape[0], len(clfs)))


    # 5-fold stacking
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1)
    print("longding...")
    for i,clf in enumerate(clfs):
        # print("classifier: {}".format(clf))
Exemple #15
0
# Labels and group ids come from the loaded data set; `X` itself is defined
# above this fragment.
y = data[1]
groups = data[2]
limit = 'all'
feature = 'mean_std'
X = getRelevantData(X, limit)
X_feats = getFeatures(X, feature, False, 0)

# Candidate values sampled by the randomized search.
params = {
    'max_depth': np.arange(2, 10, 1),
    'min_child_weight': [0.1, 0.5, 1, 2],
    'gamma': [0, 0.001, 0.01, 0.1, 0.2],
    'learning_rate': [0.1, 0.2, 0.3, 0.5, 0.7, 1]
}

# Group-aware splits keep every group entirely in train or in test.
cv = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
clf = RandomizedSearchCV(XGB(n_jobs=-1), params, cv=cv)
# `groups` is keyword-only in current scikit-learn; passing it by name works
# on old and new releases alike.
clf.fit(X_feats, y, groups=groups)

printScores(clf)

#%% test best found estimator
bestclf = clf.best_estimator_
bestclf.n_estimators = 200
# Re-score the best estimator on 30 fresh group-aware splits.
testcv = GroupShuffleSplit(n_splits=30, test_size=0.2, random_state=0)
accuracies = cross_val_score(bestclf, X_feats, y, groups=groups, cv=testcv)

print('Real score for best found estimator is {}'.format(np.mean(accuracies)))

#%% Make submission

chosenclf = XGB(base_score=0.5,
def XGB_ModelBuilder(X_train, y_train, X_test, y_test, X_unknown=None):
    """Select features, tune an XGBoost classifier and score it with log loss.

    Steps: recursive feature elimination with cross-validation (RFECV), a
    randomized hyper-parameter search on the selected features, log-loss
    evaluation on the held-out set and, optionally, a refit on train + test
    to predict ``X_unknown``.

    Args:
        X_train: Training features; must expose ``.columns`` (used to label
            the feature ranking) and be accepted by ``pd.concat``.
        y_train: Training labels.
        X_test: Held-out features with the same columns as ``X_train``.
        y_test: Held-out labels.
        X_unknown: Optional unlabeled features to predict after refitting on
            train + test. ``None`` or an empty container skips that step.

    Returns:
        Tuple ``(predictions, predictions_final)``: the class probabilities
        for ``X_test`` and the predicted classes for ``X_unknown`` (an empty
        list when no ``X_unknown`` was given).
    """
    # XGB_ModelBuilder.py
    # Created by KAC on 02/12/2020

    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import log_loss
    from xgboost import XGBClassifier as XGB
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV

    XGB_model = XGB()
    selector = RFECV(estimator=XGB_model, scoring='neg_log_loss', cv=5)
    selector.fit(X_train, y_train)
    CV_score = cross_val_score(selector,
                               X_train,
                               y_train,
                               scoring='neg_log_loss',
                               cv=5)
    scr = np.mean(CV_score)
    print(
        pd.DataFrame({
            'Variable': X_train.columns,
            'Importance': selector.ranking_
        }).sort_values('Importance', ascending=True).head(50))
    print("Optimal number of features: ", selector.n_features_)
    print("Log Loss for All Features: ", scr)

    if selector.n_features_ < len(X_train.columns):
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)

        # NOTE(review): this cross-validates the whole RFECV again on the
        # already reduced feature set -- confirm that scoring XGB_model
        # directly was not intended.
        CV_score = cross_val_score(selector,
                                   X_train_transformed,
                                   y_train,
                                   scoring='neg_log_loss',
                                   cv=5)
        scr = np.mean(CV_score)
        print("Log Loss for Selected Features on Training Data: ", scr)
    else:
        X_train_transformed = X_train
        X_test_transformed = X_test
        print(
            "Not optimal to remove features. Proceeding to parameter tuning.")

    # Current Best: {'subsample': 0.9, 'n_estimators': 250, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.02, 'colsample_bytree': 0.85}
    parameters = {
        "learning_rate": [0.01, 0.015, 0.02, 0.025, 0.03],  #[0.01, 0.05, 0.1],
        "n_estimators": [250, 500, 600],  #[500, 750, 1000],
        "max_depth": [8, 9, 10, 12],  #[3, 6, 9],
        "min_child_weight": [2, 5, 8],  #[1, 2],
        "colsample_bytree": [0.7, 0.75, 0.8, 0.85],  #[0.5, 0.75, 1],
        "subsample": [0.9, 1]  #[0.5, 0., 1]
    }
    rsearch = RandomizedSearchCV(estimator=XGB_model,
                                 param_distributions=parameters,
                                 scoring='neg_log_loss',
                                 n_iter=250,
                                 cv=5)
    rsearch.fit(X_train_transformed, y_train)
    print(rsearch.best_params_)

    CV_score = cross_val_score(rsearch,
                               X_train_transformed,
                               y_train,
                               scoring='neg_log_loss',
                               cv=5)
    scr = np.mean(CV_score)
    print(
        "Log Loss for Selected Features and Parameter Tuning on Training Data: ",
        scr)

    predictions = rsearch.predict_proba(X_test_transformed)

    pred_scr = round(log_loss(y_test, predictions), 5)
    print("2019 Score: ", pred_scr)

    # The default used to be a shared mutable `[]`, which always entered the
    # branch below and then failed. `None` and empty inputs now skip it.
    if X_unknown is not None and len(X_unknown) > 0:
        X_final = pd.concat([X_train, X_test])
        y_final = pd.concat([y_train, y_test])

        # Fixed: transform with the fitted `selector`; the old code called
        # `RFECV.transform` on the class itself, which raises a TypeError.
        X_final = selector.transform(X_final)
        X_unknown = selector.transform(X_unknown)

        rsearch.fit(X_final, y_final)
        predictions_final = rsearch.predict(X_unknown)

    else:
        predictions_final = []

    return predictions, predictions_final
from os import getcwd  # fixed: getcwd() was called without being imported

import numpy as np
from own_functions import loadData, getRelevantData, getFeatures, getMaxpeaks
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier as XGB
from xgboost import plot_importance
import matplotlib.pyplot as plt

# The data set is expected in ./robotsurface/ below the working directory.
folder = getcwd() + '/robotsurface/'
data = loadData(folder)

# data[0] holds the samples and data[1] the labels.
X = data[0]
X = getRelevantData(X, 'all')
X_f = getFeatures(X, 'mean_std')
y = data[1]
clf = XGB()
clf.fit(X_f, y)
# %%
# Plot the feature importances of the fitted model on a wide figure.
plt.figure(figsize=(20, 10))
ax = plt.axes()
plot_importance(clf, ax)
plt.show()

# %% see feature order

# Magnitude spectrum along the last axis, first 63 bins only.
fft = np.abs(np.fft.fft(X))[:, :, :63]
# Mean/std along axis 2, with that axis restored as length 1.
fftmean = np.expand_dims(np.mean(fft, 2), axis=2)
fftstd = np.expand_dims(np.std(fft, 2), axis=2)
mean = np.expand_dims(np.mean(X, 2), axis=2)
std = np.expand_dims(np.std(X, 2), axis=2)
peaks = getMaxpeaks(fft, 2)
Exemple #18
0
# Silence convergence warnings raised while the grid searches run.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

##################################
## 3.1 train and test models using GridSearchCV
# Default-constructed estimators keyed by the short name that is also used
# in `param_dict`.
models = dict(
    DT=DTC(),
    LR=LR(),
    MLP=MLPC(),
    SVC=SVC(),
    NB=NB(),
    KNN=KNNC(),
    Bagging=BaggingC(),
    RF=RFC(),
    AdaBoost=AdaBoostC(),
    GB=GBC(),
    XGB=XGB(),
)

param_dict = {
    # 0.67 {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
    'DT': {
        'max_depth': [1,2,3,None],
        'max_leaf_nodes': [4,6,8,10,None],
        'min_samples_leaf': [1,2,3],
        'min_samples_split': [2,4,6]
    },
    # LR 0.64 {'C': 5.0, 'class_weight': None, 'fit_intercept': False, 'penalty': 'l2', 'solver': 'sag'}
    'LR': {
        "solver": ['lbfgs', 'liblinear', 'sag', 'saga'],
        "penalty": ['l2'],
        "C": [1.0, 1.5, 2.0, 5.0, 10],
          y_train.shape, y_test.shape)

    # TF-IDF model limited to 1000 features with English stop words removed.
    cv = TfidfVectorizer(max_features=1000, stop_words="english")
    print(cv)

    # The vectorizer is fitted on the training split only; both splits are
    # converted to dense arrays.
    X_train = cv.fit_transform(X_train_raw).toarray()
    X_test = cv.transform(X_test_raw).toarray()

    #models = [RF(n_jobs=-1), LR(n_jobs=-1), GBC(), SVC(probability=True)]
    models = [
        MultinomialNB(),
        GaussianNB(),
        RF(n_jobs=-1),
        GBC(),
        XGB(n_jobs=-1)
    ]

    # One (model, profits, thresholds) tuple per candidate model.
    model_profits = []
    for model in models:
        print(model.__class__.__name__)
        profits, thresholds = get_model_profits(model, cost_benefit, X_train,
                                                X_test, y_train, y_test)
        model_profits.append((model, profits, thresholds))

    plot_model_profits(model_profits, "./presentation/proft_curve.png")
    #plot_model_profits(model_profits)

    max_model, max_thresh, max_profit, summary_list = find_best_threshold(
        model_profits)
    # NOTE(review): the threshold is compared against the whole
    # predict_proba output; if that has one column per class, confirm
    # whether a `[:, 1]` selection is missing.
    max_labeled_positives = max_model.predict_proba(X_test) >= max_thresh
from xgboost import XGBClassifier as XGB

# Load the training table.
data = pd.read_csv("Train.csv")

# Features: everything except the id, the date and the target column.
X = data.drop(columns=['INCIDENT_ID', 'DATE', 'MULTIPLE_OFFENSE'])
Y = data['MULTIPLE_OFFENSE']

# Plain arrays for the split and the model.
x, y = X.values, Y.values

# Reproducible 80/20 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

# Seeded XGBoost classifier fitted on the training part.
clf = XGB(seed=0)
clf.fit(x_train, y_train)
# rfc = RandomForestClassifier()
# rfc.fit(x_train,y_train)

# y_pred = rfc.predict(x_test)

# from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt
# import seaborn as sns

# LABELS = ['Normal', 'Fraud']
# conf_matrix = confusion_matrix(y_test, y_pred)
# plt.figure(figsize =(12, 12))
# sns.heatmap(conf_matrix, xticklabels = LABELS,
#             yticklabels = LABELS, annot = True, fmt ="d");

# In[86]:

# Balance the training set, then report the size of each data set.
trainB = resample_data(train, target=target)
print('Number of clients in the dataset is : {}'.format(len(dataset)))
print('Number of clients in the balanced train set is : {}'.format(len(trainB)))
print('Number of clients in the test set is : {}'.format(len(test)))

# In[88]:

# Configure the booster first, then fit it on the balanced training data.
model_XGB = XGB(
    max_depth=6,
    learning_rate=.1,
    n_estimators=100,
    reg_lambda=0.5,
    reg_alpha=0,
    verbosity=1,
    n_jobs=-1,
    tree_method='exact',
)
model_XGB.fit(trainB[features], trainB[target])

# Hard labels and the probability in column 1 for the test set.
pred = model_XGB.predict(test[features])
predp = model_XGB.predict_proba(test[features])[:, 1]

# Feature importances in ascending order, so the largest bar ends up on top.
importances = model_XGB.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(15, 8))
plt.title('Feature Importances: Balanced Extreme Gradient Boosting (XGBoost)')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
def XGB_ModelBuilder(X_train, y_train, X_test, y_test, X_unknown):
    """Select features, tune an XGBoost regressor and report its MAE.

    Steps: recursive feature elimination with cross-validation, a
    randomized hyper-parameter search on the selected features, evaluation
    on the held-out set and, when ``X_unknown`` is not ``None``, a refit on
    train + test to predict ``X_unknown``.

    The cross-validated values printed as "MAE" come from a scorer built
    with ``greater_is_better=False``, so they are negated errors.

    Args:
        X_train: Training features; must expose ``.columns`` and be accepted
            by ``pd.concat``.
        y_train: Training targets.
        X_test: Held-out features.
        y_test: Held-out targets.
        X_unknown: Features to predict after the final refit, or ``None``.

    Returns:
        Tuple ``(predictions, predictions_final)``: predictions for
        ``X_test`` and for ``X_unknown`` (an empty list when ``X_unknown``
        is ``None``).
    """
    # XGB_ModelBuilder.py
    # Created by KAC on 02/12/2020

    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import mean_absolute_error
    from xgboost import XGBRegressor as XGB
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV
    from sklearn.metrics import make_scorer

    scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    def _mean_cv_score(estimator, features):
        # Average cross-validated score of `estimator` on the training targets.
        return np.mean(
            cross_val_score(estimator, features, y_train, scoring=scorer))

    # Feature selection.
    XGB_model = XGB(objective='reg:squarederror')
    selector = RFECV(estimator=XGB_model, scoring=scorer)
    selector.fit(X_train, y_train)
    scr = _mean_cv_score(selector, X_train)
    ranking = pd.DataFrame({
        'Variable': X_train.columns,
        'Importance': selector.ranking_
    })
    print(ranking.sort_values('Importance', ascending=True).head(50))
    print("Optimal number of features: ", selector.n_features_)
    print("MAE for All Features: ", scr)

    X_train_transformed = selector.transform(X_train)
    X_test_transformed = selector.transform(X_test)

    scr = _mean_cv_score(selector, X_train_transformed)
    print("MAE for Selected Features on Training Data: ", scr)

    # Hyper-parameter search on the reduced feature set.
    parameters = {
        "learning_rate": [0.01, 0.015, 0.02],
        "n_estimators": [650, 700, 750],
        "max_depth": [8, 9, 10],
        "min_child_weight": [1, 2],
        "gamma": [0.15, 0.2, 0.25],
        "colsample_bytree": [0.75, 0.8, 0.85],
        "subsample": [0.3, 0.4, 0.5]
    }
    rsearch = RandomizedSearchCV(estimator=XGB_model,
                                 param_distributions=parameters,
                                 n_iter=250)
    rsearch.fit(X_train_transformed, y_train)
    # print(rsearch.best_params_)

    scr = _mean_cv_score(rsearch, X_train_transformed)
    print("MAE for Selected Features and Parameter Tuning on Training Data: ",
          scr)

    # Held-out evaluation.
    predictions = rsearch.predict(X_test_transformed)
    pred_scr = round(mean_absolute_error(y_test, predictions), 3)
    print("MAE for Selected Features and Parameter Tuning on 2019 Data: ",
          pred_scr)

    if X_unknown is None:
        predictions_final = []
    else:
        # Refit on all labelled data before predicting the unknown samples.
        X_final = pd.concat([X_train, X_test])
        X_final = selector.transform(X_final)
        y_final = pd.concat([y_train, y_test])

        X_unknown = selector.transform(X_unknown)

        rsearch.fit(X_final, y_final)
        predictions_final = rsearch.predict(X_unknown)

    return predictions, predictions_final