Example #1
def test_create_tree_digraph(breast_cancer_split):
    X_train, _, y_train, _ = breast_cancer_split

    constraints = [-1, 1] * int(X_train.shape[1] / 2)
    gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, monotone_constraints=constraints)
    gbm.fit(X_train, y_train)

    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(gbm, tree_index=3,
                                    show_info=['split_gain', 'internal_value', 'internal_weight'],
                                    name='Tree4', node_attr={'color': 'red'})
    graph.render(view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == 'Tree4'
    assert len(graph.node_attr) == 1
    assert graph.node_attr['color'] == 'red'
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = ''.join(graph.body)
    assert 'leaf' in graph_body
    assert 'gain' in graph_body
    assert 'value' in graph_body
    assert 'weight' in graph_body
    assert '#ffdddd' in graph_body
    assert '#ddffdd' in graph_body
    assert 'data' not in graph_body
    assert 'count' not in graph_body
Example #2
 def fit(self, X, y=None):
     render = False
     zero_update = True
     logging.debug("Starting KiGB fit")
     lgb_train = lgb.Dataset(X, y, free_raw_data=False)
     param = self.get_params().copy()
     param.pop('trees')
     param.pop('lamda')
     param.pop('epsilon')
     param.pop('advice')
     # Learn first tree
     kigb_gbm = lgb.train(param,
                         lgb_train,
                         num_boost_round=1)
     if render: # Render tree in pdf for debugging
         graph = lgb.create_tree_digraph(kigb_gbm, tree_index=0, name='before_update_' + str(0))
         graph.render('./render/lgbm/before_update_' + str(0))
     # Update penalty values
     update = kigb_penalty_update(kigb_gbm, self.advice, lamda=self.lamda, epsilon=self.epsilon)
     if update:
         zero_update = False
         kigb_gbm.model_from_string(update, verbose=False)
         if render: # Render tree in pdf for debugging
             graph = lgb.create_tree_digraph(kigb_gbm, tree_index=0, name='after_update_' + str(0))
             graph.render('./render/lgbm/after_update_' + str(0))
     # iterate over trees.
     for h in range(1, self.trees + 1):
         lgb_train = lgb.Dataset(X, y, free_raw_data=False) # Bug in Lightgbm, need to initialize data
         # Learn next tree with initial model
         kigb_gbm = lgb.train(param,
                             lgb_train,
                             num_boost_round=1,
                             init_model=kigb_gbm)
         # If trees are not learnt further, break the loop
         if kigb_gbm.num_trees() <= h:
             logging.info("Trees are not learnt further")
             break
         if render: # Render tree for debugging
             graph = lgb.create_tree_digraph(kigb_gbm, tree_index=h, name='before_update_'+str(h))
             graph.render('./render/lgbm/before_update_'+str(h))
         # Update the penalty
         update = kigb_penalty_update(kigb_gbm, self.advice, h, lamda=self.lamda, epsilon=self.epsilon)
         if update:
             zero_update = False
             kigb_gbm.model_from_string(update, verbose=False)
             if render: # Render tree for debugging
                 graph = lgb.create_tree_digraph(kigb_gbm, tree_index=h, name='after_update_' + str(h))
                 graph.render('./render/lgbm/after_update_' + str(h))
     self.kigb = kigb_gbm
     if zero_update:
         logging.info("ZERO UPDATES")
     logging.debug("finished KiGB fit")
     return self
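The fit above relies on LightGBM's ability to keep growing an existing booster by passing it back in as init_model, one boosting round at a time. A minimal, self-contained sketch of that pattern (the synthetic data and parameters here are illustrative, not taken from the original code):

import numpy as np
import lightgbm as lgb

X = np.random.rand(200, 5)
y = (X[:, 0] + 0.2 * np.random.rand(200) > 0.6).astype(int)
params = {'objective': 'binary', 'num_leaves': 3, 'verbose': -1}

booster = None
for h in range(5):
    # rebuild the Dataset each round, as the fit() above does
    train_set = lgb.Dataset(X, y, free_raw_data=False)
    booster = lgb.train(params, train_set, num_boost_round=1, init_model=booster)

print(booster.num_trees())  # one tree is added per round, so this prints 5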
Example #3
    def test_create_tree_digraph(self):
        constraints = [-1, 1] * int(self.X_train.shape[1] / 2)
        gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True, monotone_constraints=constraints)
        gbm.fit(self.X_train, self.y_train, verbose=False)

        self.assertRaises(IndexError, lgb.create_tree_digraph, gbm, tree_index=83)

        graph = lgb.create_tree_digraph(gbm, tree_index=3,
                                        show_info=['split_gain', 'internal_value', 'internal_weight'],
                                        name='Tree4', node_attr={'color': 'red'})
        graph.render(view=False)
        self.assertIsInstance(graph, graphviz.Digraph)
        self.assertEqual(graph.name, 'Tree4')
        self.assertEqual(graph.filename, 'Tree4.gv')
        self.assertEqual(len(graph.node_attr), 1)
        self.assertEqual(graph.node_attr['color'], 'red')
        self.assertEqual(len(graph.graph_attr), 0)
        self.assertEqual(len(graph.edge_attr), 0)
        graph_body = ''.join(graph.body)
        self.assertIn('leaf', graph_body)
        self.assertIn('gain', graph_body)
        self.assertIn('value', graph_body)
        self.assertIn('weight', graph_body)
        self.assertIn('#ffdddd', graph_body)
        self.assertIn('#ddffdd', graph_body)
        self.assertNotIn('data', graph_body)
        self.assertNotIn('count', graph_body)
    def __use_model__(self):
        #
        # Create a submission
        #

        submission = pd.read_csv('test.csv')
        ids = submission['id'].values
        submission.drop('id', inplace=True, axis=1)

        x = submission.values
        y = self.model.predict(x)

        # note: anything above .5 is rounded up
        binY = [round(i) for i in y]

        time_label = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        output = pd.DataFrame({'id': ids, 'target': binY})
        output.to_csv("output{0}{1}_submission.csv".format(
            self.sep, time_label),
                      index=False)

        auc = max(self.myoutput["valid_0"]["auc"])
        params = ",".join(self.model.model_to_string().split("parameters:\n")
                          [1].split("\n\n")[0].split("\n"))
        with open("submission_list.csv", "a") as csv:
            csv.write("{0},{1},{2},{3}\n".format(time_label, auc, params,
                                                 self.test_size))

        if self.save_graph:
            graph = lightgbm.create_tree_digraph(self.model)
            graph.format = "png"
            graph.render("output{0}{1}".format(self.sep, time_label))
    def test_create_tree_digraph(self):
        gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm.fit(self.X_train, self.y_train, verbose=False)

        self.assertRaises(IndexError, lgb.create_tree_digraph, gbm, tree_index=83)

        graph = lgb.create_tree_digraph(gbm, tree_index=3,
                                        show_info=['split_gain', 'internal_value'],
                                        name='Tree4', node_attr={'color': 'red'})
        graph.render(view=False)
        self.assertIsInstance(graph, graphviz.Digraph)
        self.assertEqual(graph.name, 'Tree4')
        self.assertEqual(graph.filename, 'Tree4.gv')
        self.assertEqual(len(graph.node_attr), 1)
        self.assertEqual(graph.node_attr['color'], 'red')
        self.assertEqual(len(graph.graph_attr), 0)
        self.assertEqual(len(graph.edge_attr), 0)
        graph_body = ''.join(graph.body)
        self.assertIn('threshold', graph_body)
        self.assertIn('split_feature_name', graph_body)
        self.assertNotIn('split_feature_index', graph_body)
        self.assertIn('leaf_index', graph_body)
        self.assertIn('split_gain', graph_body)
        self.assertIn('internal_value', graph_body)
        self.assertNotIn('internal_count', graph_body)
        self.assertNotIn('leaf_count', graph_body)
Example #6
def test_plot_example():
    print('Loading data...')
    # load or create your dataset
    df_train = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.train',
        header=None,
        sep='\t')
    df_test = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.test',
        header=None,
        sep='\t')

    y_train = df_train[0]
    y_test = df_test[0]
    X_train = df_train.drop(0, axis=1)
    X_test = df_test.drop(0, axis=1)

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # specify your configurations as a dict
    params = {'num_leaves': 5, 'metric': ('l1', 'l2'), 'verbose': 0}

    evals_result = {}  # to record eval results for plotting

    print('Starting training...')
    # train
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_test],
        feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
        categorical_feature=[21],
        evals_result=evals_result,
        verbose_eval=10)

    print('Plotting metrics recorded during training...')
    ax = lgb.plot_metric(evals_result, metric='l1')
    plt.show()

    print('Plotting feature importances...')
    ax = lgb.plot_importance(gbm, max_num_features=10)
    plt.show()

    print('Plotting 84th tree...')  # one tree uses a categorical feature to split
    ax = lgb.plot_tree(gbm,
                       tree_index=83,
                       figsize=(20, 8),
                       show_info=['split_gain'])
    plt.show()

    print('Plotting 84th tree with graphviz...')
    graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
    graph.render(view=True)
Example #7
    def plot_lgb_n_graphviz_trees(self, n: int):
        for idx in range(n):
            graph = lgb.create_tree_digraph(
                booster=self.model,
                tree_index=idx,
                show_info=['split_gain', 'leaf_count', 'internal_value'])

            graph.render(view=False,
                         directory=self.folder_structure.dir_tree_graphviz,
                         filename=str(idx) + "_tree",
                         cleanup=True)
Example #8
def show_model_performance(gbm, evals_result):
    # show model importance
    # lgb.plot_importance(gbm)
    # Show Decision Tree
    if config.can_plot_tree:
        graph = lgb.create_tree_digraph(gbm, name='Decision Tree')
        graph.render(view=True)
    if config.can_show_metric:
        fig, axs = plt.subplots(2, 1, figsize=(8, 10))
        for index in range(len(config.metric)):
            lgb.plot_metric(evals_result,
                            config.metric[index],
                            title=config.metric[index],
                            ax=axs[index])
    plt.show()
def plot_tree(model_path, tree_index, save_plot_path):
    '''
    Visualize a single tree of the model.
    :param model_path: path to the saved LightGBM model file
    :param tree_index: index of the tree to plot
    :param save_plot_path: path to which the rendered graph is saved
    :return:
    '''
    if not os.path.exists(model_path):
        print("file no exists! {}".format(model_path))
        sys.exit(0)
    gbm = lgb.Booster(model_file=model_path)
    graph = lgb.create_tree_digraph(gbm,
                                    tree_index=tree_index,
                                    name='tree' + str(tree_index))
    graph.render(filename=save_plot_path, view=True)  # save the rendered graph to save_plot_path
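A possible call to this helper, assuming a booster was previously written to disk with save_model() (the paths are placeholders):

plot_tree('model.txt', 0, './tree_0')  # renders tree #0 of model.txt to ./tree_0.pdf and opens it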
Example #10
    def train(self):
        train_x, valid_x, train_y, valid_y = train_test_split(
            self.train_features,
            self.train_labels,
            test_size=0.2,
            shuffle=False,
            random_state=712)
        if self.use_sparse_matrix:
            train_x, valid_x, self.test_features = csr_matrix(train_x, dtype='float32'), \
                                                   csr_matrix(valid_x, dtype='float32'), \
                                                   csr_matrix(self.test_features, dtype='float32')

        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_eval = lgb.Dataset(valid_x, valid_y)
        param = read_json(self.config.LIGHTGBM_BEST_PARAM)
        param = self.config.PARAM
        gbm = lgb.train(param,
                        lgb_train,
                        valid_sets=[lgb_eval],
                        categorical_feature=self.config.CATEGORY_VARIABLES)
        print('Predicting...')
        test_predictions = gbm.predict(self.test_features,
                                       num_iteration=gbm.best_iteration)
        # save model to file
        gbm.save_model(self.config.MODEL_SAVING_PATH)

        if not self.use_sparse_matrix:
            self.feature_importance = pd.DataFrame({
                'feature':
                self.train_features.columns,
                'importance':
                gbm.feature_importance()
            })
            self.plot_feature_importance()
            print('Saving model...')
        print_tree = False
        if print_tree:
            print('Plotting 1st tree with graphviz...')
            graph = lgb.create_tree_digraph(gbm, tree_index=0, name='Tree1')
            graph.render(filename='assets/tree_graph')
        submission(self.config, test_predictions, True,
                   '%.5f' % gbm.best_score['valid_0']['auc'])

        return test_predictions
Example #11
def get_model_tree_digraph(model, model_name="default", outputpath="./"):
    '''
    Draw the tree structure of the model.
    :param model: trained LightGBM model
    :param model_name: name used for the output file
    :param outputpath: directory for the rendered graph
    :return:
    '''
    try:
        outputpath = outputpath + model_name + "_tree_digraph.gv"
        graph = lgb.create_tree_digraph(model, name=model_name)
        graph.render(filename=outputpath)
    except Exception:
        logger.error("create model tree_digraph failed.")
        return False
    else:
        logger.info("create model tree_digrap sucess.")
        return True
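Because the helper returns True/False instead of raising, callers can branch on the result; a hedged usage sketch (the model file name is a placeholder):

booster = lgb.Booster(model_file='model.txt')
if not get_model_tree_digraph(booster, model_name="lgbm", outputpath="./"):
    print("rendering failed, see the log for details")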
    def train_light_gbm(self, dts):
        # create dataset for lightgbm
        lgb_train = lgb.Dataset(dts.trainX, dts.trainY)
        lgb_test = lgb.Dataset(dts.testX, dts.testY, reference=lgb_train)

        # specify your configurations as a dict
        params = {
            'num_leaves': 5,
            'metric': ('l1', 'l2'),
            'verbose': 0
        }

        evals_result = {}  # to record eval results for plotting

        print('Starting training...')
        # train
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=100,
                        valid_sets=[lgb_train, lgb_test],
                        feature_name=['close', 'open', 'high', 'low', 'volume'],
                        categorical_feature=[21],
                        evals_result=evals_result,
                        verbose_eval=10)

        print('Plotting metrics recorded during training...')
        ax = lgb.plot_metric(evals_result, metric='l1')
        plt.show()

        print('Plotting feature importances...')
        ax = lgb.plot_importance(gbm, max_num_features=10)
        plt.show()

        print('Plotting 84th tree...')  # one tree uses a categorical feature to split
        ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
        plt.show()

        print('Plotting 84th tree with graphviz...')
        graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
        graph.render(view=True)
Example #13
    'verbose': 0
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plot metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
Example #14
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.datasets import  make_classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score
# Load data
print('Load data...')

X_train, X_test, y_train, y_test = train_test_split(x, l, test_size=0.2)


print('Start training...')
# Create and train the model
lgbm= lgb.LGBMClassifier()
lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1', early_stopping_rounds=5)
lgb.create_tree_digraph(lgbm, tree_index=1)
import matplotlib.pyplot as plt
import matplotlib
fig2 = plt.figure(figsize=(20, 20))
ax = fig2.subplots()
lgb.plot_tree(lgbm._Booster, tree_index=1, ax=ax)
plt.show()   

print('Start predicting...')
# Predict on the test set
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_)

# feature importances
print('Feature importances:', list(lgbm.feature_importances_))

recall = recall_score(y_test, y_pred)
Example #15
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
Example #16
def plot_tree(est_or_grower,
              est_lightgbm=None,
              tree_index=0,
              view=True,
              **kwargs):
    """Plot the i'th predictor tree of a GBM or a grower tree

    est_or_grower can either be a GradientBoostingMachine instance or a
    TreeGrower. In the latter case tree_index is ignored, and more debugging
    info is displayed. Trees displayed from a TreeGrower have additional
    profiling information that is not kept in the predictor trees that
    result from fitting a GradientBoostingMachine.

    tree_index corresponds to the ith built tree. In a multiclass setting,
    e.g. with 3 classes, tree_index=5 will print the third tree of the
    second iteration.

    Can also plot a LightGBM estimator (on the left) for comparison.

    Requires matplotlib and graphviz (both python package and binary program).

    kwargs are passed to graphviz.Digraph()

    Example: plotting.plot_tree(est_pygbm, est_lightgbm, view=False,
    filename='output') will silently save output to output.pdf
    """
    def make_pygbm_tree():
        def add_predictor_node(node_idx, parent=None, decision=None):
            iteration = tree_index // est_or_grower.n_trees_per_iteration_
            k = tree_index % est_or_grower.n_trees_per_iteration_
            predictor_tree = est_or_grower.predictors_[iteration][k]
            node = predictor_tree.nodes[node_idx]
            name = 'split__{}'.format(node_idx)
            label = 'split_feature_index: {}'.format(node['feature_idx'])
            label += r'\nthreshold: {:.3f}'.format(node['threshold'])
            label += r'\ngain: {:.3E}'.format(node['gain'])
            label += r'\nvalue: {:.3f}'.format(node['value'])
            label += r'\ncount: {:,}'.format(node['count'])

            graph.node(name, label=label)
            if not node['is_leaf']:
                add_predictor_node(node['left'], name, decision='<=')
                add_predictor_node(node['right'], name, decision='>')

            if parent is not None:
                graph.edge(parent, name, decision)

        def add_grower_node(node, parent=None, decision=None):
            name = 'split__{0}'.format(id(node))
            si = node.split_info
            if si is None:
                feature_idx = 0
                bin_idx = 0
                gain = 0.
                sum_gradients = 0.
                sum_hessians = 0.
            else:
                feature_idx = si.feature_idx
                gain = 0. if si.gain is None else si.gain
                bin_idx = si.bin_idx
                sum_gradients = si.gradient_left + si.gradient_right
                sum_hessians = si.hessian_left + si.hessian_right

            value = 0. if node.value is None else node.value
            label = 'split_feature_index: {}'.format(feature_idx)
            label += r'\nbin threshold: {}'.format(bin_idx)
            label += r'\ngain: {:.3E}'.format(gain)
            label += r'\nvalue: {:.3f}'.format(value)
            label += r'\ncount: {:,}'.format(node.sample_indices.shape[0])
            label += r'\nhist subtraction: {}'.format(node.hist_subtraction)
            label += r'\nhist speed: {:.3E}'.format(node.construction_speed)
            label += r'\nfind split time: {:.4f}'.format(node.find_split_time)
            label += r'\napply split time: {:.4f}'.format(
                node.apply_split_time)
            label += r'\nsum gradients: {:.3E}'.format(sum_gradients)
            label += r'\nsum hessians: {:.3E}'.format(sum_hessians)

            graph.node(name, label=label)
            if node.value is None:  # not a leaf node
                add_grower_node(node.left_child, name, decision='<=')
                add_grower_node(node.right_child, name, decision='>')

            if parent is not None:
                graph.edge(parent, name, decision)

        if isinstance(est_or_grower, BaseGradientBoostingMachine):
            add_predictor_node(0)
        elif isinstance(est_or_grower, pygbm.grower.TreeGrower):
            add_grower_node(est_or_grower.root)

    # make lightgbm tree
    if est_lightgbm is not None:
        import lightgbm as lb
        graph = lb.create_tree_digraph(est_lightgbm,
                                       tree_index=tree_index,
                                       show_info=[
                                           'split_gain', 'internal_value',
                                           'internal_count', 'leaf_count'
                                       ],
                                       **kwargs)
    else:
        graph = Digraph(**kwargs)

    # make pygbm tree
    make_pygbm_tree()

    graph.render(view=view)
# -*- coding: utf-8 -*-
__author__ = 'lijingjie'

import sys
sys.path.insert(0, 'src/models/')
sys.path.insert(0, 'src/conf/')
sys.path.insert(0, '../conf/')
sys.path.insert(0, '../models')
sys.path.insert(0, '../')
import graphviz
import warnings
warnings.filterwarnings("ignore")

import os
os.environ['LIGHTGBM_EXEC'] = "/Users/jacklee/LightGBM/lightgbm"
# os.environ["PATH"] += os.pathsep + 'E:/Program Files (x86)/Graphviz2.38/bin'

import lightgbm as lgb

bst = lgb.Booster(model_file='lightgbm/20190512-1047/lgb-lgb-tst1-fold-0-0.dump')

image = lgb.create_tree_digraph(bst, tree_index=1, show_info=['split_gain', 'internal_value', 'internal_count', 'leaf_count'])

image.render('lightgbm/20190512-1047/lgb-lgb-tst1-fold-0-0.gv', view=True)

print('Checking done!')
Example #18
print('Plot feature importances...')
ax = lgb.plot_importance(lgbm, max_num_features=10)


# In[73]:

print('Plot 4th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(lgbm, tree_index=3, figsize=(20, 8), show_info=['split_gain'])


# In[70]:

import graphviz
print('Plot 4th tree with graphviz...')
graph = lgb.create_tree_digraph(lgbm, tree_index=3, name='Tree4')
graph.render(view=True)


# In[60]:

from sklearn.model_selection import GridSearchCV
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
estimator = lgb.LGBMRegressor(num_boost_round=20, early_stopping_rounds=5)
grid_lgbm = GridSearchCV(estimator, param_grid)
grid_lgbm.fit(train_all[features], train_all['kda_ratio'])
print('Best parameters found by grid search are:', grid_lgbm.best_params_)
Example #19
    yi = dfval.loc[n]['y']
    n += 1
print("top n: ", n)
print('test:', dtest[dtest['y'] >= yi].shape[0])
# Predict on the training set
ytrain_pred = gbm.predict(X_train)
dftrain['y'] = ytrain_pred
dftrain = dftrain.sort_values('y', ascending=False).reset_index(drop=True)
dftrain.to_csv('../datas/{0}_ytrain_pred'.format(xstr), index=None)
#y_pred = gbm.predict(X_test)
#xtest['y'] = y_pred
#xtest = xtest.sort_values('y',ascending=False).reset_index(drop=True)

cols = df_train.columns.tolist()
scores = gbm.feature_importance()
df = pd.DataFrame({'cols': cols, 'scores': scores})
df = df.sort_values('scores', ascending=False).reset_index(drop=True)

df.to_csv('../datas/a', index=None, header=None)

#print xtest.head()
#xtest.to_csv('../datas/xtest',index=None)
#print evals_result
#lgb.plot_metric(evals_result,metric='auc')
# #lgb.plot_metric(evals_result,metric='binary_logloss')
#lgb.plot_importance(gbm, max_num_features=50)
#
graph = lgb.create_tree_digraph(gbm, tree_index=0, name='Tree0')
graph.render(view=True)
#plt.show()
Example #20
print("Starting training . . .")

categoricals = []
for col in cat_columns:
    try:
        categoricals.append(df_train.columns.tolist().index(col))
    except ValueError:
        continue
print(categoricals)
print(df_train.columns.tolist())

# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_valid,
                early_stopping_rounds=15,
                categorical_feature=categoricals)
# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))

graph = lgb.create_tree_digraph(gbm)
graph.view(cleanup=True)
# predict
#y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
    # 2020/05/30 Analyze the data where the gap between predicted and true values is large - End

    # Predict
    pred_list.append(model.predict(test_x))

va_pred_list = np.array(va_pred_list)
pred_list = np.array(pred_list)

print(va_weight_list)

# Create the submission file
submission = pd.DataFrame({
    'Id':
    test_id,
    'SalePrice':
    np.average(pred_list, axis=0, weights=va_weight_list)
})
submission.to_csv('/kaggle/output/submission_ensemble.csv', index=False)

printTime('Finished building the models')
# -

# #### Plot which features were most important for the analysis

lgb.plot_importance(model, figsize=(10, 30), max_num_features=100)

# #### Visualize a decision tree used in the analysis

lgb.create_tree_digraph(model)
Example #22
os.environ["PATH"] += os.pathsep + 'C:\\Program Files\\Graphviz 2.44.1\\bin'
import lightgbm as lgb
from src.data.sk_data import Iris
from src.utils import data_split
import numpy as np

feature, label = Iris.features, Iris.label
feature = feature[label <= 1]
label = label[label <= 1]
train_feature, test_feature, train_label, test_label = data_split.split(feature, label)
train_data = lgb.Dataset(data=train_feature, label=train_label)
test_data = lgb.Dataset(data=test_feature, label=test_label)
param = {'num_leaves': 31, 'num_trees': 100, 'objective': 'binary', 'num_class': 1}
param['metric'] = 'binary_logloss'

num_round = 10
bst = lgb.train(param, train_data, num_round, valid_sets=[test_data])
bst.save_model('model.txt')
# A saved model can be loaded:
bst = lgb.Booster(model_file='model.txt')

ypred = bst.predict(test_feature, num_iteration=bst.best_iteration)
print(np.array([1 if score > 0.5 else 0 for score in ypred]))
print(test_label)

for i in range(0, num_round):
    img = lgb.create_tree_digraph(bst, tree_index=i)
    with open('trees-{}.svg'.format(i), 'w') as f:
        f.write(img._repr_svg_())
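The private _repr_svg_() hook is tied to graphviz internals and has been deprecated in newer releases of the graphviz package; recent versions can instead be asked for SVG output directly through render(). A hedged equivalent of the loop above (same bst and num_round):

for i in range(num_round):
    g = lgb.create_tree_digraph(bst, tree_index=i)
    g.render(filename='trees-{}'.format(i), format='svg', cleanup=True)  # writes trees-<i>.svg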
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves,will be used in feature transformation

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                categorical_feature=categorical_cols,
                valid_sets=lgb_train)
lgb.create_tree_digraph(gbm)
print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

print(np.array(y_pred).shape)
print(y_pred[:10])

print('Writing transformed training data')
transformed_training_matrix = np.zeros(
    [len(y_pred), len(y_pred[0]) * num_leaf],
    dtype=np.int64)  # N * num_trees * num_leaves
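The snippet stops right after allocating the matrix. In the usual leaf-index transformation recipe, the next step is to one-hot encode each sample's leaf assignment per tree; roughly (num_leaf comes from the truncated configuration above):

for i in range(len(y_pred)):
    # global one-hot position = tree_index * num_leaf + leaf_index
    positions = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][positions] += 1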
    def classifier_lgbm_general(self, X_DDTpd, X_eval, features):
        best_params_ = {
            'objective': 'binary',
            'num_leaves': 50,
            'min_data_in_leaf': 10,
            'max_depth': 10,
            'max_bin': 50,
            'learning_rate': 0.01,
            'dart': False,
            'reg_alpha': 0.1,
            'reg_lambda': 0,
            'n_estimators': 1000,
            'bootstrap': True
        }

        scaler = StandardScaler().fit(X_DDTpd)
        X_DDTpd = scaler.transform(X_DDTpd)
        X_eval = scaler.transform(X_eval)
        #poly = PolynomialFeatures(2)
        #X_DDTpd = poly.fit_transform(X_DDTpd)
        #X_eval = poly.fit_transform
        #pca = PCA(n_components=50).fit(X_DDTpd)
        #print(pca.explained_variance_ratio_)
        #X_DDTpd = pca.transform(X_DDTpd)
        #X_eval = pca.transform(X_eval)

        final_model = lgb.LGBMClassifier(**best_params_,
                                         random_state=self.args.seed)
        #cv_score_best = cross_val_score(final_model, X_DDTpd, self.Y_train_proba, cv=5, verbose=6)
        #print(cv_score_best.mean(), cv_score_best.std())

        final_model.fit(X_DDTpd, self.Y_train_proba)

        self.plot_feat_importance(
            final_model, features,
            self.path_save_model + "features_importances_LGBM_nbrefeat_" +
            str(len(features)) + ".png")
        y_pred = final_model.predict(X_eval)

        y_pred_df = pd.DataFrame(y_pred)
        y_pred_df.to_csv(self.path_save_model + "Y_pred_val.csv", index=False)

        #print(self.nn_model_ref.outputs_pred_val[:,0].shape, y_pred.shape)
        #print(self.nn_model_ref.outputs_pred_val[:,0], y_pred)

        if self.args.eval_nn_ref:

            same_output = self.nn_model_ref.outputs_pred_val[:, 0] == y_pred

            #print(same_output)

            p2 = 100 * np.sum(same_output) / len(same_output)
            print("Proportion des prediction identiques: " + str(p2))

            index_interext = np.logical_and(same_output,
                                            self.Y_eval_proba == y_pred)
            p22 = 100 * np.sum(index_interext) / len(index_interext)
            print("Proportion des prediction identiques et egal au label: " +
                  str(p22))
        else:
            p22 = None
            p2 = None
            same_output = None
        #print(ok)
        cm = confusion_matrix(y_pred=y_pred,
                              y_true=self.Y_eval_proba,
                              normalize="true")
        res = np.array([
            accuracy_score(self.Y_eval_proba, y_pred), cm[0][0], cm[1][1], p2,
            p22
        ])
        print(res)
        np.save(self.path_save_model + "res_" + str(self.cpt) + ".npy", res)

        self.save_logs(
            self.path_save_model + "logs_lgbm_" + str(len(features)) + ".txt",
            y_pred, self.Y_eval_proba)
        lgb.create_tree_digraph(final_model).save(
            directory=self.path_save_model,
            filename="tree_LGBM_nbrefeat_" + str(len(features)) + ".dot")
        os.system("dot -Tpng " + self.path_save_model + "tree_LGBM_nbrefeat_" +
                  str(len(features)) + ".dot > " + self.path_save_model +
                  "tree_LGBM_nbrefeat_" + str(len(features)) + ".png")
        del X_DDTpd
        self.importances = final_model.feature_importances_
        self.indices = np.argsort(self.importances)[::-1]
        with open(
                self.path_save_model + "features_impotances_order_nbrefeat_" +
                str(len(features)) + ".txt", "w") as file:
            file.write(
                str(np.array(features)[self.indices]) +
                str(self.importances[self.indices]))
            file.write("\n")
        if self.masks_infos_score is None:
            self.masks_infos_score = self.importances.copy()
            self.masks_infos_rank = np.array([
                np.where(self.indices == x)[0][0]
                for x in range(len(self.importances))
            ])
        return final_model
Example #25
lgb_train = lgb.Dataset(X, y)
lightgbm = lgb.train(lgb_params, lgb_train)

lgb.plot_importance(lightgbm)
plt.title("Feature importances by LightGBM")
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)
plt.show()

# ### Create Tree digraph using
# `create_tree_digraph`
#

# In[ ]:

lgb.create_tree_digraph(lightgbm)

# - Contradiction
#
#     1. Gender should be important, also.
#
# - The important features from the __whole dataset__ are totally different from the stacking method; refer to [Anisotropic](https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python)
#     1. From [Faron's](https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867) script, he did k-fold training, and I just used the entire dataset.
#
#
#
# ## New content coming Soon

# # Acknowledgements
# 1. [Oscar Takeshita](https://www.kaggle.com/pliptor) for pointing my XGB feature importance typo.
def create_booster_summary(
    booster: Union[lgb.Booster, lgb.sklearn.LGBMModel],
    log_importances: bool = True,
    max_num_features: int = 10,
    list_trees: list = None,
    log_trees_as_dataframe: bool = True,
    log_pickled_booster: bool = True,
    log_trees: bool = False,
    tree_figsize: int = 30,
    log_confusion_matrix: bool = False,
    y_true: np.ndarray = None,
    y_pred: np.ndarray = None,
):
    """Create model summary after training that can be assigned to the run namespace.

    See guide with examples in the `Neptune-LightGBM docs`_.

    You can log multiple types of metadata:
        - pickled model
        - feature importance chart
        - visualized trees
        - trees represented as DataFrame
        - confusion matrix (only for classification problems)

    See the Args section for more info on how to parametrize the behaviour of this function.

    Note:
        You can log the summary to a new run, or to the same run that you used for logging model training.
        The second option can be very useful because you have all the information in a single run.

    Args:
        booster (:obj:`lgb.Booster` or :obj:`lgb.sklearn.LGBMModel`): Trained LightGBM model.
        log_importances (bool): Defaults to True. Log feature importance charts.
        max_num_features (int): Defaults to 10. Max number of top features on the importance charts.
            Works only if ``log_importances`` is set to ``True``.
            If None or <1, all features will be displayed.
            See `lightgbm.plot_importance`_ for details.
        list_trees (list): Defaults to None. Indices of the target tree to visualize.
            Works only if ``log_trees`` is set to ``True``.
            See `lightgbm.plot_tree`_ for details.
        log_trees_as_dataframe (bool): Defaults to True.
            Parse the model and log trees in the easy-to-read pandas DataFrame format.
            Works only for ``lgb.Booster``.
            See `lightgbm.Booster.trees_to_dataframe`_ for details.
        log_pickled_booster (bool): Defaults to True. Log model as pickled file.
        log_trees (bool): Defaults to False. Log visualized trees.
            This requires graphviz to work. Learn about setup in the `Neptune-LightGBM installation`_ docs.
        tree_figsize (int): Defaults to 30. Controls the size of the visualized tree image.
            Increase this in case you work with large trees.
            Works only if ``log_trees`` is set to ``True``.
        log_confusion_matrix (bool): Defaults to False. Log confusion matrix.
            If set to True, you need to pass ``y_true`` and ``y_pred``.
        y_true (:obj:`np.ndarray`): Defaults to None. True labels on the test set.
            Needed only if ``log_confusion_matrix`` is set to True.
        y_pred (:obj:`np.ndarray`): Defaults to None. Predictions on the test set.
            Needed only if ``log_confusion_matrix`` is set to True.

    Returns:
        dict: Python dictionary with all metadata, that can be assigned to the run namespace.
            ``run["booster_summary"] = create_booster_summary(...)``

    Examples:
        For more examples visit `example scripts`_.

        Full script that does logging during model training and logs booster summary after training::

            import lightgbm as lgb
            import neptune.new as neptune
            import numpy as np
            from neptune.new.integrations.lightgbm import NeptuneCallback, create_booster_summary
            from sklearn.datasets import load_digits
            from sklearn.model_selection import train_test_split

            # Create run
            run = neptune.init(
                project="common/lightgbm-integration",
                api_token="ANONYMOUS",
                name="train-cls",
                tags=["lgbm-integration", "train", "cls"]
            )

            # Create neptune callback
            neptune_callback = NeptuneCallback(run=run)

            # Prepare data
            X, y = load_digits(return_X_y=True)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

            # Define parameters
            params = {
                "boosting_type": "gbdt",
                "objective": "multiclass",
                "num_class": 10,
                "metric": ["multi_logloss", "multi_error"],
                "num_leaves": 21,
                "learning_rate": 0.05,
                "feature_fraction": 0.9,
                "bagging_fraction": 0.8,
                "bagging_freq": 5,
                "max_depth": 12,
            }

            # Train the model and log metadata to the run in Neptune
            gbm = lgb.train(
                params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_train, lgb_eval],
                valid_names=["training", "validation"],
                callbacks=[neptune_callback],
            )

            y_pred = np.argmax(gbm.predict(X_test), axis=1)

            # Log summary metadata to the same run under the "lgbm_summary" namespace
            run["lgbm_summary"] = create_booster_summary(
                booster=gbm,
                log_trees=True,
                list_trees=[0, 1, 2, 3, 4],
                log_confusion_matrix=True,
                y_pred=y_pred,
                y_true=y_test
            )

    .. _Neptune-LightGBM docs:
        https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm
    .. _lightgbm.plot_importance:
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_importance.html#lightgbm-plot-importance
    .. _lightgbm.plot_tree:
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_tree.html#lightgbm-plot-tree
    .. _lightgbm.Booster.trees_to_dataframe:
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.trees_to_dataframe
    .. _Neptune-LightGBM installation:
        https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm#install-requirements
    .. _example scripts:
        https://github.com/neptune-ai/examples/tree/main/integrations-and-supported-tools/lightgbm/scripts

    """
    results_dict = {}
    visuals_path = "visualizations/"
    if log_importances:
        split_plot = lgb.plot_importance(booster,
                                         importance_type="split",
                                         title="Feature importance (split)",
                                         max_num_features=max_num_features)
        gain_plot = lgb.plot_importance(booster,
                                        importance_type="gain",
                                        title="Feature importance (gain)",
                                        max_num_features=max_num_features)
        results_dict["{}feature_importances/split".format(visuals_path)] \
            = neptune.types.File.as_image(split_plot.figure)
        results_dict["{}feature_importances/gain".format(visuals_path)] \
            = neptune.types.File.as_image(gain_plot.figure)

    if log_trees:
        try:
            subprocess.call(["dot", "-V"],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
        except OSError:
            log_trees = False
            message = "Graphviz executables not found, so trees will not be logged. " \
                      "Make sure the Graphviz executables are on your systems' PATH"
            warnings.warn(message)

    if log_trees:
        trees_series = []
        for i in list_trees:
            digraph = lgb.create_tree_digraph(booster,
                                              tree_index=i,
                                              show_info="data_percentage")
            _, ax = plt.subplots(1, 1, figsize=(tree_figsize, tree_figsize))
            s = BytesIO()
            s.write(digraph.pipe(format="png"))
            s.seek(0)
            ax.imshow(image.imread(s))
            ax.axis("off")
            trees_series.append(neptune.types.File.as_image(ax.figure))
        results_dict["{}trees".format(
            visuals_path)] = neptune.types.FileSeries(trees_series)

    if log_trees_as_dataframe:
        if isinstance(booster, lgb.Booster):
            df = booster.trees_to_dataframe()
            html_df = neptune.types.File.as_html(df)
            results_dict["trees_as_dataframe"] = html_df
            if not df.empty and not html_df.content:
                warnings.warn(
                    "'trees_as_dataframe' wasn't logged. Probably generated dataframe was to large."
                )
        else:
            warnings.warn(
                "'trees_as_dataframe' won't be logged."
                " `booster` must be instance of `lightgbm.Booster` class.")

    if log_pickled_booster:
        results_dict["pickled_model"] = neptune.types.File.as_pickle(booster)

    if log_confusion_matrix:
        ax = plot_confusion_matrix(y_true=y_true, y_pred=y_pred)
        results_dict[
            f"{visuals_path}confusion_matrix"] = neptune.types.File.as_image(
                ax.figure)

    return results_dict
Example #27
def plot_tree(thing, est_lightgbm=None, tree_index=0, view=True,
              **kwargs):
    """Plot the i'th predictor tree of an estimator, a grower's tree, or
    directly a predictor tree.

    Trees displayed from TreeGrower have additional information like sum of
    gradients, etc.

    tree_index corresponds to the ith built tree (only used when thing is an
    estimator). In a multiclass setting, the ith tree isn't necessarily the
    tree built during the ith iteration because there are K trees per
    iteration. For example with 3 classes, tree_index=5 will print the third
    tree of the second iteration.

    Can also plot a LightGBM estimator (on the left) for comparison.

    kwargs are passed to graphviz.Digraph()

    Example: plotting.plot_tree(est_sklearn, est_lightgbm, view=False,
    filename='output') will silently save output to output.pdf
    """
    def make_sklearn_tree(est):
        def add_predictor_node(node_idx, parent=None, decision=None):
            node = predictor_tree.nodes[node_idx]
            name = 'split__{}'.format(node_idx)
            label = 'split_feature_index: {}'.format(
                node['feature_idx'])
            label += r'\nthreshold: {:.3f}'.format(node['threshold'])
            label += r'\ngain: {:.3E}'.format(node['gain'])
            label += r'\nvalue: {:.3f}'.format(node['value'])
            label += r'\ncount: {:,}'.format(node['count'])

            graph.node(name, label=label)
            if not node['is_leaf']:
                add_predictor_node(node['left'], name, decision='<=')
                add_predictor_node(node['right'], name, decision='>')

            if parent is not None:
                graph.edge(parent, name, decision)

        def add_grower_node(node, parent=None, decision=None):
            name = 'split__{0}'.format(id(node))
            si = node.split_info
            if si is None:
                feature_idx = 0
                bin_idx = 0
                gain = 0.
                sum_gradients = 0.
                sum_hessians = 0.
            else:
                feature_idx = si.feature_idx
                gain = 0. if si.gain is None else si.gain
                bin_idx = si.bin_idx
                sum_gradients = si.sum_gradient_left + si.sum_gradient_right
                sum_hessians = si.sum_hessian_left + si.sum_hessian_right

            value = 0. if node.value is None else node.value
            label = 'split_feature_index: {}'.format(feature_idx)
            label += r'\nbin threshold: {}'.format(bin_idx)
            label += r'\ngain: {:.3E}'.format(gain)
            label += r'\nvalue: {:.3f}'.format(value)
            label += r'\ncount: {:,}'.format(node.sample_indices.shape[0])
            label += r'\nsum gradients: {:.3E}'.format(sum_gradients)
            label += r'\nsum hessians: {:.3E}'.format(sum_hessians)

            graph.node(name, label=label)
            if node.value is None:  # not a leaf node
                add_grower_node(node.left_child, name, decision='<=')
                add_grower_node(node.right_child, name, decision='>')

            if parent is not None:
                graph.edge(parent, name, decision)

        if isinstance(thing, BaseHistGradientBoosting):
            est = thing
            # check_is_fitted(est)
            iteration = tree_index // est.n_trees_per_iteration_
            k = tree_index % est.n_trees_per_iteration_
            predictor_tree = est._predictors[iteration][k]
            add_predictor_node(0)
        elif isinstance(thing, TreePredictor):
            predictor_tree = thing
            add_predictor_node(0)
        elif isinstance(thing, TreeGrower):
            add_grower_node(thing.root)

    # make lightgbm tree
    if est_lightgbm is not None:
        import lightgbm as lb
        graph = lb.create_tree_digraph(
            est_lightgbm,
            tree_index=tree_index,
            show_info=['split_gain', 'internal_value', 'internal_count',
                       'leaf_count'],
            **kwargs)
    else:
        graph = Digraph(**kwargs)

    # make sklearn tree
    make_sklearn_tree(thing)

    graph.render(view=view)
Example #28
        "bagging_seed": 0,
        "boost_from_average": True,
        "metric": "mae",
        "verbosity": -1,
    }
    model = lgbm.train(params=params,
                       train_set=training_data,
                       num_boost_round=10**5,
                       valid_sets=[training_data, val_data],
                       early_stopping_rounds=200,
                       verbose_eval=10**4)
    a = model.predict(submission, num_iteration=model.best_iteration)
    prediction = prediction + a.reshape(-1, 1)

print("Feature Importance")
axes = lgbm.plot_importance(model)
plt.show()

print("Another boosting tree which has to be rendered ")
graph = lgbm.create_tree_digraph(model)
graph.render(view=True)

model.save_model('LGBM.txt')
print("Done with saving and printing everything")

prediction = prediction / n_fold
prediction = prediction.reshape(-1)
sample_submission[:, 1] = prediction
submission = pandas.DataFrame(sample_submission,
                              columns=['seg_id', 'time_to_failure'])
submission.to_csv("Submission.csv", index=None, sep=",")
Example #29
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()

print('Plotting 54th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm,
                   tree_index=53,
                   figsize=(15, 15),
                   show_info=['split_gain'])
plt.show()

print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
graph.render(view=True)
Example #30
        # s.write(str(jj) + '\t' + str(count_set[jj]) + '\t' + str(count_set[jj + 1]) + '\t' + str(auc) + '\n')
        AUC_set.append(auc)
median_tpr = np.median(tprs, axis=0)
mean_tpr = np.mean(tprs, axis=0)
median_tpr[-1] = 1.0
mean_tpr[-1] = 1.0
per_tpr = np.percentile(tprs, [25, 50, 75], axis=0)
median_auc = np.trapz(median_tpr, mean_fpr)
mean_auc = np.trapz(mean_tpr, mean_fpr)
plt.plot(mean_fpr, median_tpr, 'k', lw=3, label='median ROC')
plt.title(f'{str(median_auc)}({str(mean_auc)})')
plt.fill_between(mean_fpr,
                 per_tpr[0, :],
                 per_tpr[2, :],
                 color='g',
                 alpha=.2,
                 label='Quartile')
plt.legend(loc='lower right')
plt.show()

f = pd.DataFrame({
    'feature_name': x_train.columns,
    'feature_importance': clf.feature_importance()
})
f.sort_values(by='feature_importance', ascending=False).to_clipboard()
lgb.create_tree_digraph(clf, tree_index=1)
lgb.plot_tree(clf, tree_index=0, figsize=(100, 50))
plt.savefig('test.png')

###############