Example #1
def saving_trees(model_ehr, logs_file):
    """ Function to visualize XGB trees. The ranges generated are compared with RENASCA """
    from matplotlib.pylab import rcParams
    rcParams['figure.figsize'] = 60, 50
    xgb.plot_tree(model_ehr, ax=plt.gca())
    features_url = os.path.join(logs_file, 'plot_tree_.svg')
    plt.savefig(features_url)
Example #2
def plot_BDT(bst):
    '''
    Produces two different plots:
        a) Plot specified tree
        b) Plot importance based on fitted trees.

    Parameters
    ----------
    bst : trained booster model
        Booster.

    Returns
    -------
    None.

    '''
    xgb.plot_importance(bst)
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    fig.savefig('plotImportanceHiggs.pdf')
    plt.clf()

    xgb.plot_tree(bst, num_trees=4)
    fig = plt.gcf()
    fig.set_size_inches(15, 10)
    fig.savefig('plotTreeHiggs.pdf', dpi=300)
    plt.clf()
Example #3
def boost(data: pd.DataFrame):
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.05,
                                                        random_state=42)

    xg_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        colsample_bytree=0.3,
        learning_rate=0.25,
        max_depth=40,
        alpha=50,
        n_estimators=100,
        reg_lambda=30,
    )
    xg_reg.fit(X_train, y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f"[boost] RMSE: {rmse}")

    xgb.plot_tree(xg_reg, num_trees=0)
    plt.savefig(f"{data_root}/figs/tree_development.svg", format="svg")
    # Set the figure size before plotting so it actually takes effect
    plt.rcParams['figure.figsize'] = [5, 5]
    xgb.plot_importance(xg_reg)
    plt.savefig(f"{data_root}/figs/importance_development.svg", format="svg")

    bundle = data.copy().iloc[y_test.index]
    bundle['TARGET'] = preds
    bundle['DIFF'] = abs(bundle['TARGET'] - bundle['HOURSINDEVELOPMENT'])
    return bundle.sort_values(by='DIFF')
Example #4
def multiple_run(model,
                 dtrain,
                 predictors,
                 cv_folds=4,
                 early_stopping_rounds=100):
    print("running a multiple fit with CV to check predictors")
    xgtrain = xgb.DMatrix(dtrain[predictors].values,
                          label=dtrain['target'].values)
    cvresult = xgb.cv(model.get_xgb_params(),
                      xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=10,
                      feval=gini_xgb,
                      maximize=True)
    gc.collect()
    # fit the algorithm on the data
    model.fit(dtrain[predictors], dtrain['target'])

    # plot feature importances
    fig, ax = plt.subplots(figsize=(20, 20))
    xgb.plot_importance(model, ax=ax)
    plt.savefig("importance.pdf")

    # plot tree
    fig, ax = plt.subplots(figsize=(20, 20))
    xgb.plot_tree(model, ax=ax)
    plt.savefig("tree.pdf")
Example #5
def get_plot_tree(model, num_trees=2):
    from xgboost import plot_tree
    from matplotlib.pylab import rcParams

    ##set up the parameters
    rcParams['figure.figsize'] = 20, 10
    plot_tree(model, num_trees=num_trees)
Example #6
    def plot_trees(self, step):
        """ Plot the trees via an internal function

        Arguments
        ---------
        step : int
            Number of steps between two plotted trees.

        Returns
        -------
        fig_trees : list
            List of figure objects containing the graphs of the trees.
        """
        fig_trees = []
        for i in range(len(self.trees)):
            if i % step == 0:
                print("Plotting tree {} / {}...".format(i, len(self.trees)))
                try:
                    xgboost.plot_tree(self.model, num_trees=i)
                    fig = matplotlib.pyplot.gcf()
                    fig.set_size_inches(50, 25)
                    ax = plt.gca()
                    ax.set_title("Tree {}".format(i))

                    fig_trees.append(fig)
                except ValueError:
                    print("Bad tree {}".format(i))

        return fig_trees
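A brief usage sketch for the method above. The owning class is not shown here, so `booster_wrapper` and the output file names are placeholders for whatever object actually exposes plot_trees, trees, and model:

import matplotlib.pyplot

# Plot every 10th tree and write each returned figure to disk.
figures = booster_wrapper.plot_trees(step=10)
for n, fig in enumerate(figures):
    fig.savefig("tree_{}.png".format(n), bbox_inches="tight")
    matplotlib.pyplot.close(fig)  # free the figure once it has been saved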
Example #7
def train():
    # Read the csv file
    stpData = pd.read_csv('StudentsPerformance-encoded.csv')

    # split and train
    X, y = stpData.iloc[:, :-3], stpData.iloc[:, -3]
    data_dmatrix = xgb.DMatrix(data=X, label=y)

    # split and test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=123)
    xg_reg = xgb.XGBRegressor(objective='reg:squarederror',
                              learning_rate=0.1,
                              max_depth=7,
                              alpha=10,
                              n_estimators=1000)

    # test the model accuracy
    xg_reg.fit(X_train, y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % (rmse))

    # Plot the feature importance and the tree
    xgb.plot_tree(xg_reg, num_trees=100)
    xgb.plot_importance(xg_reg)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(150, 100)
    fig.savefig('tree.png')
Example #8
def showTree(bst, ntree):
    xgb.plot_tree(bst,
                  num_trees=ntree,
                  fontsize='24',
                  rankdir='LR',
                  size="7.75,10.25")
    plt.savefig('tree.png')
Example #9
    def train_predict_all(self,
                          x,
                          y,
                          column_id,
                          x_all,
                          feature_names=None,
                          column_names=None):
        if self.balance:
            ratio = float(np.sum(y == False)) / np.sum(y == True)
            print "weight ratio: " + str(ratio)
            self.params[column_id]['scale_pos_weight'] = ratio

        xgdmat = xgb.DMatrix(x, y, feature_names=feature_names)
        self.model[column_id] = xgb.train(self.params[column_id],
                                          xgdmat,
                                          num_boost_round=3000,
                                          verbose_eval=False)

        if feature_names is not None:
            all_trees = self.model[column_id].get_dump()
            print("number trees:" + str(len(all_trees)))

            plot_tree(self.model[column_id])

            fig = plt.gcf()
            fig.set_size_inches(150, 100)
            plt.savefig('out/' + str(column_id) + "_" +
                        column_names[column_id] + '.pdf')

        # predict
        all_records = xgb.DMatrix(x_all, feature_names=feature_names)
        probability_prediction = self.model[column_id].predict(all_records)
        class_prediction = (probability_prediction > 0.5)

        return probability_prediction, class_prediction
Example #10
 def viz_trees(self):
     self.xg_regressor = xgb.train(params=self.params,
                                   dtrain=self.Dmatrix,
                                   num_boost_round=10)
     plt.rcParams['figure.figsize'] = [50, 10]
     xgb.plot_tree(self.xg_regressor, num_trees=0)
     plt.show()
Example #11
def xgb_plot():
    """
    Function for plotting the first tree and the feature importance
    (how many times the feature appears in the trees) with the
    XGBoost classifier.
    """
    xg_clf.fit(X_train, y_train)
    plt.rcParams["figure.figsize"] = [40, 40]
    xgb.plot_tree(xg_clf, num_trees=0)
    plt.title("Plot of the first tree with XGBoost")
    plt.tight_layout()
    plt.savefig("Figures/xgb_tree.png")
    plt.show()

    fonts = {
        "font.size": 18,
        "legend.fontsize": "medium",
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "axes.titlesize": 18
    }
    plt.rcParams.update(fonts)
    plt.rcParams["figure.figsize"] = [10, 10]
    xgb.plot_importance(xg_clf)
    plt.savefig("Figures/Importance.png")
    plt.show()
Example #12
 def plot_estimator(self, item):
     """
     plot critical information of an XGB regressor
     if item == "importance", plot feature importance
     if item == "tree", plot the tree structure of listed in tree_index
     :param item:  "importance" or "tree"
     :return: None, save plots
     """
     if item == "importance":
         plot_importance(self.best_estimator,
                         importance_type="gain",
                         xlabel="Feature contribution",
                         ylabel=None,
                         title=None,
                         grid=False,
                         xlim=(0, 35000))
         plt.tight_layout()
         plt.savefig("feature_importance_by_gain.jpg", dpi=600)
         plt.show()
     else:
         tree_index = [0, 1, 100, 300,
                       400]  # tree index used to plot structure
         for index in tree_index:
             fig, ax = plt.subplots()
             fig.set_size_inches(300, 150)
             plot_tree(self.best_estimator, num_trees=index, ax=ax)
             plt.savefig('./tree structure/tree_' + str(index) + ".jpg")
             plt.show()
Example #13
 def get_tree_plot(self):
     file_path = gu.get_target_path([self.local_folder, 'tree'],
                                    file_extension='png')
     plt.rcParams.update(plt.rcParamsDefault)
     plt.rcParams['figure.figsize'] = [50, 10]
     xgb.plot_tree(self.model, num_trees=0)
     plt.savefig(file_path, bbox_inches='tight')
     plt.close()
Example #14
def XGModelExperiment():
    X_train, y_train, X_val, y_val, test_train, test_val = getAllCleanedDataExperiment(
        binning=1)
    X_train_dummies = pd.get_dummies(X_train)
    X_val_dummies = pd.get_dummies(X_val)
    test_train_dummies = pd.get_dummies(test_train)

    printFullDf(X_train.head())
    # DMatrix
    train = xgb.DMatrix(data=X_train_dummies, label=y_train)
    val = xgb.DMatrix(data=X_val_dummies, label=y_val)
    test = xgb.DMatrix(data=test_train_dummies, label=test_val)

    params = {
        "base_score": 0.5,
        "booster": 'gbtree',
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 0.4,
        "gamma": 0.1,
        "gpu_id": -1,
        "importance_type": 'gain',
        "interaction_constraints": '',
        "learning_rate": 0.2,
        'max_delta_step': 0,
        'max_depth': 5,
        'min_child_weight': 1,
        'monotone_constraints': '()',
        'n_estimators': 50,
        'n_jobs': 0,
        'num_parallel_tree': 1,
        'random_state': 0,
        'reg_alpha': 0,
        'reg_lambda': 1,
        'scale_pos_weight': 1,
        'subsample': 1,
        'tree_method': 'exact',
        'validate_parameters': 0,
        'verbosity': None,
        'objective': 'binary:hinge'
    }
    params['eval_metric'] = 'auc'

    evallist = [(val, 'eval'), (train, 'train')]
    num_rounds = 20
    bst = xgb.train(params,
                    train,
                    num_rounds,
                    evallist,
                    early_stopping_rounds=4)
    ypred = bst.predict(test, ntree_limit=bst.best_ntree_limit)
    print(f1_score(test_val, ypred))
    # print(bst.get_score(importance_type='gain'))
    # print(bst.get_score(importance_type='weight'))
    xgb.plot_importance(bst, importance_type='gain')
    xgb.plot_importance(bst, importance_type='weight')
    xgb.plot_tree(bst)
    plt.show()
Example #15
def single_tree(cols=['乳酸脱氢酶', '淋巴细胞(%)', '超敏C反应蛋白']):
    print('single_tree:\n')
    # Get the data for the 375 patients (data_df_unna) and the 110 patients (data_pre_df)
    data_df_unna, data_pre_df = data_preprocess()
    # Drop rows with missing values; the 375 patients are reduced to 351
    data_df_unna = data_df_unna.dropna(subset=cols, how='any')

    cols = cols + ['Type2']  # copy so the default argument list is not mutated
    # Get the patients' outcome labels
    Tets_Y = data_pre_df.reset_index()[['PATIENT_ID', '出院方式']].copy()
    # Rename the dataframe columns
    Tets_Y = Tets_Y.rename(columns={'PATIENT_ID': 'ID', '出院方式': 'Y'})
    # Get the labels of the 110 patients
    y_true = Tets_Y['Y'].values

    x_col = cols[:-1]
    y_col = cols[-1]
    # Get the three-feature data of the 351 patients
    x_np = data_df_unna[x_col].values
    # Get the labels of the 351 patients
    y_np = data_df_unna[y_col].values
    # Get the three-feature data of the 110 patients
    x_test = data_pre_df[x_col].values
    # Split the 351 patients into training and validation sets; the 110 patients are the test set
    X_train, X_val, y_train, y_val = train_test_split(x_np,
                                                      y_np,
                                                      test_size=0.3,
                                                      random_state=6)
    # Restrict to a single-tree xgb model
    model = xgb.XGBClassifier(
        max_depth=3,
        n_estimators=1,
    )
    model.fit(X_train, y_train)

    # Confusion matrix on the training set
    pred_train = model.predict(X_train)
    show_confusion_matrix(y_train, pred_train)
    print(classification_report(y_train, pred_train))

    # Confusion matrix on the validation set
    pred_val = model.predict(X_val)
    show_confusion_matrix(y_val, pred_val)
    print(classification_report(y_val, pred_val))
    # Confusion matrix on the test set

    pred_test = model.predict(x_test)
    print('True test label:', y_true)
    print('Predict test label:', pred_test.astype('int32'))
    show_confusion_matrix(y_true, pred_test)
    print(classification_report(y_true, pred_test))

    plt.figure(dpi=300, figsize=(8, 6))
    plot_tree(model)
    plt.show()

    graph = xgb.to_graphviz(model)
    graph.render(filename='single-tree.dot')
Example #16
def tree_example(info_train, target_train):
    # Create and fit a classifier to plot an example of a tree
    classifier_xgb = xgboost.XGBClassifier(max_depth=3,
                                           random_state=1,
                                           use_label_encoder=False)
    classifier_xgb.fit(info_train, target_train)

    # Plot one of the resulting trees to examine what happens inside
    xgboost.plot_tree(classifier_xgb, rankdir='LR')
Example #17
def viz_tree(model):
    rcParams['figure.figsize'] = 80, 120

    # xgb.plot_tree(xg_reg, rankdir='LR'); plt.show()
    xgb.plot_tree(model, num_trees=0, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(150, 100)
    fig.savefig('tree.png')
    plt.show()
Example #18
def fig_fixing(xlf, fmp=None, save: bool = True):
    fig, ax = pyplot.subplots()
    fig.set_size_inches(100, 100)
    if fmp is not None:
        xgb.plot_tree(xlf, ax=ax, fmap="{}.fmap".format(fmp))
    else:
        xgb.plot_tree(xlf, ax=ax)
    # print(fmp)
    if save: fig.savefig("{}.png".format(fmp))
Example #19
 def classifi_plot(self):
     fig = plt.figure(figsize = (19.20,10.80))
     ax1 = fig.add_subplot(224)
     ax2 = fig.add_subplot(211)
     ax3 = fig.add_subplot(223)
     cm = pd.DataFrame(metrics.confusion_matrix(self.test_y, self.predictions), columns=self.target_names, index=self.target_names)
     sns.heatmap(cm, annot=True, ax=ax1)
     xgb.plot_tree(self.gbm, num_trees=0,ax=ax2)
     xgb.plot_importance(self.gbm, ax=ax3)
     plt.show()
Example #20
    def model_with_sklearn_api(self, cv_routine='grid_search', param_grid=None,
                               plot=True, use_lightgbm=False):
        self.logger.info('modelling with sklearn api...')

        # specify model using sklearn api
        if use_lightgbm:
            xg_reg = lgb.LGBMRegressor(objective='regression',
                                       random_state=self.random_state)
        else:
            xg_reg = xgb.XGBRegressor(objective='reg:linear',
                                      random_state=self.random_state)
        # note: at time of writing at least (2018-12), it seems that early
        # stopping (e.g. by adding early_stopping_rounds=50) is not supported
        # with sklearn's hyper-parameter optimisers like GridSearchCV

        # note: xgb.XGBRegressor and lgb.LGBMRegressor support many types of
        # regressions (i.e. many different loss functions), e.g. Poisson
        # regression, via the 'objective' argument

        if param_grid is None:
            param_grid = self._get_default_param_gird(cv_routine=cv_routine,
                                                      use_lightgbm=use_lightgbm)

        xg_reg = self._fit_model(xg_reg=xg_reg, cv_routine=cv_routine,
                                 param_grid=param_grid)

        if cv_routine in ['grid_search', 'randomised_search']:
            # use xgboost's api to further optimise n_estimators via early
            # stopping (see the sketch after this example)
            # note: no scaling here (but that is fine for tree weak learners)
            xg_reg = self._update_n_estimators(est=xg_reg,
                                               use_lightgbm=use_lightgbm)
        else:
            assert (cv_routine in ['bayes_search'])
            # note: Bayes searches should be capable of optimising
            # n_estimators well without early stopping

        # get predictions for test set
        preds = xg_reg.predict(self._X_test)

        # get root mean-square-error in test set
        rmse = np.sqrt(mean_squared_error(y_true=self._y_test, y_pred=preds))
        self.logger.info('RMSE in test set: {}'.format(rmse))

        if plot:
            # visualise model (using xgboost functionality)
            plt.rcParams['figure.figsize'] = [50, 10]
            xgb.plot_tree(xg_reg.named_steps['model'], num_trees=0)
            plt.show()

            plt.rcParams['figure.figsize'] = [5, 5]
            xgb.plot_importance(xg_reg.named_steps['model'])
            plt.show()
            # TODO: ensure names of features are used

        return xg_reg
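The early-stopping comments in the example above can be made concrete with xgboost's native cross-validation API. A minimal sketch, assuming plain arrays X_train and y_train and a squared-error objective (these names are placeholders and are not part of the class above):

import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
cv_result = xgb.cv({'objective': 'reg:squarederror', 'learning_rate': 0.1},
                   dtrain,
                   num_boost_round=1000,
                   nfold=5,
                   metrics='rmse',
                   early_stopping_rounds=50)
# xgb.cv truncates its result at the best iteration, so the number of rows is
# a reasonable value for n_estimators in the sklearn-style regressor.
best_n_estimators = len(cv_result)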
Example #21
def optmodelcomp():
    """
    Compares performanceo of optimal models, including ROC and PR curves,
    as well as area under curves. Also plots an example decision tree from
    tree based xgb models, along with importance of the features.

    The optimization of the xgb models are performed in this function, while
    the NN model is previously optimized and the LogReg model has no hyperparameters.
    """
    loader = pulsardat()
    model1 = xgbtreeopter('dart')
    model2 = xgbtreeopter('gbtree')
    model3 = xgblinearopter()
    model4 = LogReg()
    model5 = NNmodel()
    model5.paramchanger('layers', [64, 2])  #selected values from NNopter
    model5.paramchanger('batch_size', 64)
    models = [model1, model2, model3, model4, model5]
    A = analyze(models, loader)
    Ks = [3, 4, 5]

    funcs = [PRcurve, metrics.roc_curve]
    figs = A.kfold_analysis(Ks, funcs)

    plt.figure(figs[0].number)
    plt.xlabel("Recall", fontsize=14)
    plt.ylabel("Precision", fontsize=14)
    plt.grid()
    plt.legend()
    plt.savefig("Auc_PR.png")

    plt.figure(figs[1].number)
    plt.xlabel("False positive ratio", fontsize=14)
    plt.ylabel("True positive ratio", fontsize=14)
    plt.grid()
    plt.legend()
    plt.savefig("Auc_ROC.png")

    plt.figure()
    A.models[1].model.fit(A.df[A.xlabels].values, A.df[A.ylabels].values)
    xgb.plot_tree(A.models[1].model)
    fig = plt.gcf()
    fig.set_size_inches(150, 100)
    plt.savefig("tree_plot.pdf")

    plt.figure()

    importances = A.models[1].feature_importances()
    plt.bar(np.arange(len(A.xlabels)), importances)
    plt.ylabel("Importance", fontsize=14)
    plt.xlabel("Predictor number", fontsize=14)
    plt.savefig("importance.png")

    plt.show()
    """
Example #22
    def plot(self):
        fig, ax = plt.subplots()
        # weight, gain, or cover
        # importance_type='weight'
        xgb.plot_importance(self.model, ax=ax, fmap=feature_file)
        plt.savefig(plots_folder /
                    'XGB_importance_{}.png'.format(self.accesses))

        fig, ax = plt.subplots(constrained_layout=True, figsize=(150, 100))
        xgb.plot_tree(self.model, ax=ax, fmap=feature_file, rankdir='LR')
        plt.savefig(plots_folder / 'XGB_tree_{}.png'.format(self.accesses))
Example #23
    def visualize_model(self, model_filepath):
        """
        Visualizes one tree of a model for reporting purposes
        """

        bst = xgb.Booster()
        bst.load_model(model_filepath)

        #plot the model
        xgb.plot_tree(bst, num_trees=1, rankdir='LR')
        plt.show()
Example #24
    def XGBoost(self, X, yy, y):
        # Reshape from 3D to 2D: https://stackoverflow.com/questions/61573260/reshape-3d-numpy-array-of-images-to-2d-numpy-array-for-xgboost-dmatrix-input
        X = X.reshape(X.shape[0], -1)
        print(X.shape)

        # PCA dimenstionality reduction
        #pca = PCA(n_components=2)
        #principalComponents = pca.fit_transform(X)
        #principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2'])
        #X = np.array(principalDf['pc1'].tolist())
        #X = pca.components_

        # Split the dataset
        x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state=42, stratify=y)
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        # Reshape from 3D to 2D: https://stackoverflow.com/questions/61573260/reshape-3d-numpy-array-of-images-to-2d-numpy-array-for-xgboost-dmatrix-input
        # x_train = x_train.reshape(x_train.shape[0], -1)
        # x_test = x_test.reshape(x_test.shape[0], -1)
        # print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        dtrain = xgb.DMatrix(data=x_train, label=y_train)
        dtest = xgb.DMatrix(data=x_test, label=y_test)
        eval_list = [(dtest, 'eval')]

        # Train the model
        params = {
            'max_depth': 3,
            'objective': 'multi:softmax',  # error evaluation for multiclass training
            'num_class': 3,
            'tree_method':'gpu_hist'
        }
        model = xgb.train(params, dtrain, evals=eval_list, early_stopping_rounds=20, verbose_eval=True)

        # Evaluate predictions
        y_pred = model.predict(dtest)
        predictions = [round(value) for value in y_pred]

        accuracy = accuracy_score(y_test.argmax(axis=1), predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))

        # Plots
        xgb.plot_tree(model)
        fig = matplotlib.pyplot.gcf()
        fig.set_size_inches(150, 150)
        fig.savefig('xgboost/tree.png')

        # Confusion matrix
        #cm = confusion_matrix(y_train.argmax(axis=1), y_pred.argmax(axis=0))
        #self.plot_confusion_matrix(cm, self.target_names)

        # Save the model
        model.save_model('./saved_models/xgboost_audio_classifier.hdf5')
        print('Complete.')
Example #25
def main(_):

    # creating training data
    data = np.random.rand(5, 10)  # 5 entities, each contains 10 features
    label = np.random.randint(2, size=5)  # binary target
    dtrain = xgb.DMatrix(data, label=label)

    #csr = scipy.sparse.csr_matrix((dat, (row, col))) #  data creation using scipy
    #dtrain = xgb.DMatrix(csr)

    # Booster parameters
    param = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    param['nthread'] = 4
    param['eval_metric'] = 'auc'

    # Evaluation parameters
    #param['eval_metric'] = ['auc', 'ams@0']
    plst = list(param.items())
    plst += [('eval_metric', 'ams@0')]

    # Testing data
    data = np.random.rand(7, 10)  # 7 entities, each contains 10 features
    dtest = xgb.DMatrix(data)

    # Specify validations set to watch performance
    evallist = [(dtest, 'eval'), (dtrain, 'train')]

    # Training
    num_round = 10
    bst = xgb.train(plst, dtrain, num_round, evallist)
    bst.save_model('0001.model')  # Saving the model
    bst.dump_model('dump.raw.txt')  # dump model
    bst.dump_model('dump.raw.txt',
                   'featmap.txt')  # dump model with feature map
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model('0001.model')  # load the model saved above

    # Testing
    ypred = bst.predict(dtest)
    #ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit) # Use this one only if early stopping is enabled in training

    # Plotting
    xgb.plot_importance(bst)
    xgb.plot_tree(bst, num_trees=2)
    xgb.to_graphviz(bst, num_trees=2)

    file = open("results.txt", "w")
    file.write(ypred)
    file.close()
Example #26
 def train(self, dataset, model, logger=print):
     dtrain = xgb.DMatrix(dataset.data(), label=dataset.label())
     param = self.config['train'].get('param') or {}
     num_round = self.config['train']['epochs']
     evallist = [(dtrain, 'train')]
     model.set_model(xgb.train(param, dtrain, num_round, evallist))
     model.save()
     xgb.plot_importance(model.model())
     plt.savefig('importance.png')
     xgb.plot_tree(model.model(), num_trees=2)
     plt.savefig('tree.png')
Example #27
 def PlotModelTree(self, model, model_name):
     # plot tree
     plot_tree(model, rankdir='LR')
     fig = plt.gcf()
     fig.set_size_inches(25, 15)
     fig.savefig(self.getOutputFolder() + '/model_tree.png')
     # plot importances
     plot_importance(model)
     fig = plt.gcf()
     fig.set_size_inches(25, 25)
     fig.savefig(self.getOutputFolder() + '/feat_importances.png')
Example #28
 def tree_pic(self, features, fmap_filename, path_1):
     outfile = open(fmap_filename, 'w')
     i = 0
     for feat in features:
         outfile.write('{0}\t{1}\tq\n'.format(i, feat))
         i = i + 1
     outfile.close()
     from xgboost import plot_tree
     plot_tree(self.model, num_trees=0, fmap=fmap_filename)
     fig = plt.gcf()
     fig.set_size_inches(15, 10)
     fig.savefig(path_1)
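For reference, each line written to the feature-map file above follows xgboost's fmap convention: a feature index, a tab, the feature name, a tab, and a feature type, where 'q' marks a quantitative feature ('i' is used for 0/1 indicators and 'int' for integer features). A standalone sketch of the same idea; the helper name and feature list below are illustrative only:

def write_fmap(feature_names, fmap_path):
    # One "<index>\t<name>\t<type>" line per feature, as expected by the
    # fmap argument of plot_tree / plot_importance / get_fscore.
    with open(fmap_path, 'w') as f:
        for i, name in enumerate(feature_names):
            f.write('{0}\t{1}\tq\n'.format(i, name))

# write_fmap(['age', 'income', 'score'], 'features.fmap')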
Example #29
def model_plot():
    import matplotlib.pyplot as plt
    bst = xgb.Booster({"nthread": 4})  # init model
    bst.load_model("../data/model/xgb.model")  # load data
    # xgb.plot_importance(bst)
    # plt.show()
    # To plot the output tree via matplotlib, use plot_tree, specifying the ordinal number of the target tree.
    xgb.plot_tree(bst, num_trees=2)
    plt.show()
    # When using IPython, you can use the to_graphviz function, which converts the target tree to a graphviz instance.
    # The graphviz instance is automatically rendered in IPython.
    xgb.to_graphviz(bst, num_trees=2)
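Outside IPython the graphviz object returned by to_graphviz is not displayed automatically, but it can still be written to disk. A minimal sketch, assuming a loaded Booster `bst` like the one in the function above (the output name is a placeholder):

graph = xgb.to_graphviz(bst, num_trees=2)
graph.render('xgb_tree_2')  # writes the DOT source and a rendered PDF next to it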
Example #30
def feature_importance_xgboost():
    params = dict()
    params['eta'] = 0.3
    params['min_child_weight'] = 10
    params['colsample_bytree'] = 0.8
    params['max_depth'] = 5
    params['subsample'] = 0.5
    params['gamma'] = 2.0
    params['alpha'] = 1.0

    config = {
        'eval_metric': 'rmse',
        'objective': 'reg:linear',
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1
    }

    config = {**config, **params}

    xtrain, xtest, ytrain, ytest, fname = get_data_boston_with_fname()

    dtrain = xgb.DMatrix(xtrain, label=ytrain)
    dtest = xgb.DMatrix(xtest, label=ytest)

    evallist = [(dtrain, 'train'), (dtest, 'test')]
    num_boost_round = 10
    model = xgb.train(config,
                      dtrain,
                      num_boost_round,
                      evals=evallist,
                      early_stopping_rounds=100,
                      verbose_eval=True)

    fmap_fp = 'fmap.txt'
    f = open(fmap_fp, 'w')
    for i, feature in enumerate(fname):
        f.write('{0}\t{1}\tq\n'.format(i, feature))
    f.close()

    feature_weights = model.get_fscore(fmap=fmap_fp)
    feature_weights = sorted(feature_weights.items(),
                             key=lambda x: x[1],
                             reverse=True)
    print(feature_weights)

    model.save_model('model.bin')
    model.dump_model('desc.txt', fmap=fmap_fp)

    xgb.plot_tree(model, fmap=fmap_fp, num_trees=2)
    plt.show()
Example #31
    def plot_tree(self, num_trees=0, rankdir='UT', ax=None, **kwargs):

        """Plot specified tree.

        Parameters
        ----------
        booster : Booster, XGBModel
            Booster or XGBModel instance
        num_trees : int, default 0
            Specify the ordinal number of target tree
        rankdir : str, default "UT"
            Passed to graphviz via graph_attr
        ax : matplotlib Axes, default None
            Target axes instance. If None, new figure and axes will be created.
        kwargs :
            Other keywords passed to to_graphviz

        Returns
        -------
        ax : matplotlib Axes

        """

        import xgboost as xgb

        if not isinstance(self._df.estimator, xgb.XGBModel):
            raise ValueError('estimator must be XGBRegressor or XGBClassifier')
        return xgb.plot_tree(self._df.estimator.booster(),
                             num_trees=num_trees, rankdir=rankdir, **kwargs)
Example #32
def test_sklearn_plotting():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris

    iris = load_iris()

    classifier = xgb.XGBClassifier()
    classifier.fit(iris.data, iris.target)

    import matplotlib
    matplotlib.use('Agg')

    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(classifier)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    g = xgb.to_graphviz(classifier, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(classifier, num_trees=0)
    assert isinstance(ax, Axes)
Example #33
    def test_plotting(self):
        bst2 = xgb.Booster(model_file='xgb.model')
        # plotting

        import matplotlib
        matplotlib.use('Agg')

        from matplotlib.axes import Axes
        from graphviz import Digraph

        ax = xgb.plot_importance(bst2)
        assert isinstance(ax, Axes)
        assert ax.get_title() == 'Feature importance'
        assert ax.get_xlabel() == 'F score'
        assert ax.get_ylabel() == 'Features'
        assert len(ax.patches) == 4

        ax = xgb.plot_importance(bst2, color='r',
                                 title='t', xlabel='x', ylabel='y')
        assert isinstance(ax, Axes)
        assert ax.get_title() == 't'
        assert ax.get_xlabel() == 'x'
        assert ax.get_ylabel() == 'y'
        assert len(ax.patches) == 4
        for p in ax.patches:
            assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red


        ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
                                 title=None, xlabel=None, ylabel=None)
        assert isinstance(ax, Axes)
        assert ax.get_title() == ''
        assert ax.get_xlabel() == ''
        assert ax.get_ylabel() == ''
        assert len(ax.patches) == 4
        assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # red
        assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0) # red
        assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0) # blue
        assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # blue

        g = xgb.to_graphviz(bst2, num_trees=0)
        assert isinstance(g, Digraph)
        ax = xgb.plot_tree(bst2, num_trees=0)
        assert isinstance(ax, Axes)
Example #34
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit the model on the training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree
plot_tree(model)
pyplot.show()
Example #35
param['objective'] = 'multi:softprob'
param['eta'] = 1.3
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 3
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 100

bst = xgb.train(param, xg_train, num_round)
yprob = bst.predict( xg_test ).reshape( y_test.shape[0], 3 )
ylabel = np.argmax(yprob, axis=1)
print ('predicting, classification error=%f' % (sum( int(ylabel[i]) != y_test[i] for i in range(len(y_test))) / float(len(y_test)) ))

fig, ax = plt.subplots(1, 1)
xgb.plot_tree(bst, ax=ax)
fig.savefig('analysis/output/xg.png', dpi=600)


ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
total_score = 0.0
total_f1 = 0.0
runs = 0
for train, test in ss.split(X, y):
    X_train = np.array(X)[train]
    y_train = y[train]

    X_test = np.array(X)[test]
    y_test = y[test]

    count_vect = CountVectorizer(ngram_range=(1, 3))
Example #36
testreader = csv.reader(open("../avtest.csv",'r'), delimiter=",")
test = []
i = 0
for row in testreader:
    i += 1
    #convert strings to floats
    converted = []
    #remove old converted every 5000 cycles
    if i % 5000 == 0:
        print("clean up" + str(i))
        gc.collect()
    for j in row:
        if (len(j) > 0):
            converted.append(float(j))
        else:
            converted.append(float("nan"))
    test.append(converted)

print "done looping"
test = numpy.array(test)
print test[0]
print test.shape
dfintest = xgboost.DMatrix(test,missing=float("nan"))
finpred = model.predict(dfintest)
print finpred
to_output.to_output(finpred,"xpredictions.csv")
xgboost.plot_importance(model)
plt.show()
xgboost.plot_tree(model)
plt.show()
Example #37
#Predict on Test Set
X_test4_new = X_test[cols4]
dtest4_new = xgb.DMatrix(X_test4_new, label=y3_test, weight=wt4_test)
dtest4_new.save_binary('/Users/wangbruce/Google Drive/test4_new.buffer')
y4_pred_p = bst4.predict(dtest4_new)
y4_pred = [0 if x <0.5 else 1 for x in y4_pred_p]
(y4_pred == y3_test).mean()


#############################################################################
#plot

xgb.plot_importance(bst4)
plt.show()

xgb.plot_tree(bst4)
plt.show()



def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(range(0,2)))
    plt.xticks(tick_marks, range(0,2), rotation=45)
    plt.yticks(tick_marks, range(0,2))
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
                              reg_alpha=0.05,
                              reg_lambda=2,
                              subsample=1.0,
                              colsample_bytree=1.0,
                              max_delta_step=1,
                              scale_pos_weight=1,
                              objective='multi:softprob',
                              nthread=8,
                              seed=0  # ,
                              # silent = False
                              )
    print('training...')
    xgb_model.fit(training, label)
    print('predicting...')
    predicted = xgb_model.predict_proba(testing)
    predicted = pandas.DataFrame(predicted)
    predicted.columns = xgb_model.classes_
    # Name index column.
    predicted.index.name = 'Id'
    # Write csv.
    print('Saving prediction...')
    predicted.to_csv('Prediction.csv')
    # feature importance
    feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    matplotlib.pyplot.show()
    plot_importance(xgb_model, title='Feature importance')
    matplotlib.pyplot.show()
    plot_tree(xgb_model, num_trees=0)
    matplotlib.pyplot.show()
Example #39
                                      colsample_bytree=colsample,
                                      subsample=subsample)
                m.fit(Xtr, ytr)
                pp = m.predict_proba(Xts)[:, 1]
                if FINAL_SUBMISSION:
                    import datetime
                    timestamp = datetime.datetime.now().strftime(
                        '%Y-%m-%d-%H:%M')
                    scores = np.c_[np.arange(len(pp)), pp]
                    np.savetxt('../out/vilab-submission-%s.csv' % timestamp,
                               scores, '%d,%.8f', ',', header='id,probability',
                               comments='')
                    toc()
                else:
                    toc('cs=%.2f md=%2d lr=%.2f mcw=%1d g=%d score=%.4f' % (
                        colsample, max_depth, learning_rate, min_child_weight,
                        gamma, roc_auc_score(yts, pp)))
                sys.stdout.flush()

import matplotlib.pyplot as plt
plt.ioff()
xgb.plot_importance(m, tick_label=names)
plt.savefig('xgb-features.pdf')
plt.show()

'''
xgb.plot_tree(m)
plt.savefig('xgb-tree.pdf', dpi=900)
plt.show()
'''
Example #40
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit the model on the training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree
plot_tree(model, num_trees=0, rankdir='LR')
pyplot.show()
Example #41
#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
from sklearn import datasets
from sklearn import tree
import graphviz
import xgboost as xgb
import matplotlib.axes as axes

iris = datasets.load_iris()
X = iris.data
y = iris.target

#clf = tree.DecisionTreeClassifier()
#clf = clf.fit(X, y)

#dot_data = tree.export_graphviz(clf, out_file=None)
#graph = graphviz.Source(dot_data)
#graph.render("iris")

clf = xgb.XGBClassifier()
clf = clf.fit(X, y)
xgb.plot_tree(clf)

Example #42
##              feval=None, maximize=False, early_stopping_rounds=None, 
##              evals_result=None, verbose_eval=True, learning_rates=None, 
##              xgb_model=None)
#
evallist  = [(dtest,'eval'), (dtrain,'train')]

watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
evals_result = {}
num_round = 10
bst = xgb.train(param,xg_train, num_round, evals_result=evals_result)
pred = bst.predict(xg_test)

print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))

xgb.plot_importance(bst)
xgb.plot_tree(bst, num_trees=2)

#=============Logistic Regression==============================================================

#Define sigmoid function
def sigmoid(z):
    return 1 / (1 + e**(-z))

# Calculate the cost to be minimized, using the sigmoid function
def cost(theta, X, y, l):
    m = X.shape[0] #Number of rows in the data
    z = X.dot(theta)
    O = (-1 / m) * (log(sigmoid(z)).T.dot(y)  +  log(1-sigmoid(z)).T.dot((1-y)))
#    print(m)
#    print(theta)
#    print(theta[1:])
# In[126]:

gbb = gb.booster()
gbb.dump_model('trees.txt')


# In[118]:

import matplotlib.pyplot as plt

# %matplotlib inline
fig = plt.figure(figsize=[10,10])
ax = fig.gca()
gb = gbm[0]
xgb.plot_tree(gb, num_trees=13, ax=ax)


# In[ ]:

import matplotlib.pyplot as plt
import numpy as np

dic = {'lgt mean':'light intensity mean', 'lgt std':'light intensity variance', 'lgt off':'darkness duration', 'lgt zcrossing':'light change',       'lgt skew':'light intensity skewness', 'lgt kurt':'light intensity kurtosis', 'aud mean':'sound amplitude mean',        'aud std':'sound amplitude variance', 'aud skew':'sound amplitude skewness', 'aud kurt':'sound amplitude kurtosis',       'aud frq mean':'sound frequency mean', 'aud frq std':'sound frequency variance', 'aud frq skew':'sound frequency skewness',       'aud frq kurt':'sound frequency kurtosis', 'scr frq':'screen on/off frequency', 'scr dur mean':'screen on time',        'scr dur std':'screen on time variance', 'still':'stillness time', 'tilting':'tilting time', 'walking':'walking time',       'unknown act':'unknown activity time', 'still-walking':'still/walking transition', 'still-tilting':'still/tilting transition',       'still-unknown':'still/unknown transition', 'walking-unknown':'walking/unknown transition', 'call in':'no incoming calls', 'call out':       'no outgoing calls', 'sms in':'no incoming sms', 'sms out':'no outgoing sms', 'call missed':'no missed calls', 'n wifi':       'no wifi nets', 'temperature':'outside temperature', 'dew point':'outside windchill', 'weather':'outside weather',        'lat mean':'latitude mean', 'lng mean':'longitude mean', 'loc var':'location variance', 'duration':'visit timespan',       'midtime':'visit timestamp', 'midhour':'visit time of day', 'dow start':'arrive day of week', 'dow end':       'leave day of week', 'fsq 0':'Foursquare Nightlife Spot', 'fsq 1':'Foursquare Outdoors & Recreation', 'fsq 2':'Foursquare Arts & Entertainment'       , 'fsq 3':'Foursquare Professional or Medical Office', 'fsq 4':'Foursquare Food', 'fsq 5':'Foursquare Home',        'fsq 6':'Foursquare Shop or Store', 'fsq 7':'Foursquare Travel or Transport', 'fsq 8':'Foursquare Unknown', 'fsq distance':       'Foursquare distance', 'LT frequency':'visit frequency', 'LT interval mean':'mean time between visits', 'n gps':'visit duration'}

# extracting means and CIs
feature_label = x_train.columns

fscore = pd.DataFrame(index=np.arange(n_bootstrap), columns=feature_label)
for i in range(n_bootstrap):
    keys = np.array(list(gbm[i].booster().get_fscore().keys()))