def __init__(
     self,
     mode,
     max_depth=3,
     feature_names=None,
     max_sentences=20000,
     exp_rand_tree_size=True,
     tree_generator=None,
 ):
     '''
     mode: 'classify' or 'regress'
     max_depth: maximum depth of the trained trees
     feature_names: names of the features
     max_sentences: maximum number of extracted sentences
     exp_rand_tree_size: if True, grow trees with exponentially distributed sizes
     tree_generator: tree generator model (overrides the options above)
     '''
     self.feature_names = feature_names
     self.mode = mode
     max_leafs = 2**max_depth
     num_trees = max_sentences // max_leafs
     if tree_generator is None:
         tree_generator = RandomForestClassifier(num_trees,
                                                 max_depth=max_depth)
     self.exp_rand_tree_size = exp_rand_tree_size
     self.rf = RuleFit(rfmode=mode,
                       tree_size=max_leafs,
                       max_rules=max_sentences,
                       tree_generator=tree_generator,
                       exp_rand_tree_size=exp_rand_tree_size,
                       fit_lasso=False,
                       Cs=10.**np.arange(-4, 1),
                       cv=3)
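
For intuition, this constructor sizes the random forest so that num_trees * max_leafs is roughly max_sentences; a quick check with the defaults above:

max_depth = 3
max_sentences = 20000
max_leafs = 2 ** max_depth              # 8 terminal leaves for a depth-3 tree
num_trees = max_sentences // max_leafs  # 20000 // 8 = 2500 trees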
Example n. 2
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 5
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)
        model = model[0]
        training_df, test_df = get_encoded_logs(job)
        feature_names = list(
            training_df.drop(['trace_id', 'label'], axis=1).columns.values)

        X_train = training_df.drop(['trace_id', 'label'], axis=1)
        Y_train = training_df['label']  # target column

        rf = RuleFit()
        columns = list(X_train.columns)

        X = X_train.values

        rf.fit(X, Y_train.values, feature_names=columns)
        rules = rf.get_rules()
        rules = rules[(rules.coef > 0.) & (rules.type != 'linear')]
        rules['effect'] = rules['coef'] * rules['support']
        pd.set_option('display.max_colwidth', None)
        print(rules.nlargest(10, 'effect'))
Example n. 3
import os
from multiprocessing import current_process

from rulefit import RuleFit
from sklearn.metrics import accuracy_score


def train_job(train_idx, test_idx, t_size, rf_mode, m_rules, r_seed, X, y,
              feas, n_samples):
    """
    Train and validate within a single fold.
    """
    p = current_process()
    print('process counter:', p._identity[0], 'pid:', os.getpid())
    # Initialize the estimator and fit it on the training fold
    rf = RuleFit(tree_size=t_size,
                 rfmode=rf_mode,
                 max_rules=m_rules,
                 random_state=r_seed)
    print(
        "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
        .format(rf.tree_generator, rf.max_rules, rf.tree_size,
                rf.random_state))
    rf.fit(X[train_idx], y[train_idx], feas)
    # Validate on the test fold (use the indices to drop padded fake samples)
    real_test_index = test_idx[test_idx < n_samples]
    batch_test_x = X[real_test_index]
    batch_test_y = y[real_test_index]
    batch_test_size = len(real_test_index)
    y_pred = rf.predict(batch_test_x)
    # Compute test-set accuracy
    accTest = accuracy_score(batch_test_y, y_pred)
    print("\nTest Accuracy:", "{:.6f}".format(accTest), "Test Size:",
          batch_test_size)

    print(
        "\n========================================================================="
    )
    # Return the test labels and predictions for downstream statistics
    return batch_test_y, y_pred
Example n. 4
    def _getRulesRulefit(df_aux, model_params):
        # Prepare data
        X_train = df_aux[feature_cols]
        y_train = df_aux["predictions"]

        # Fit model
        if "tree_size" not in model_params.keys():
            model_params["tree_size"] = len(feature_cols) * 2
        if "rfmode" not in model_params.keys():
            model_params["rfmode"] = "classify"
        rf = RuleFit(**model_params)
        rf.fit(X_train.values, y_train.values, feature_names=feature_cols)

        # Get rules
        print("Obtaining Rules using RuleFit...")
        rules_all = rf.get_rules()
        rules_all = rules_all[(rules_all.coef > 0) & (rules_all.importance > 0)]
        rules_all = rules_all.sort_values("support", ascending=False)
        rules_all = rules_all[rules_all["type"] == "rule"]
        rules_all["size_rules"] = rules_all.apply(
            lambda x: len(x["rule"].split("&")), axis=1)

        # Turn list of rules to dataframe
        print("Turning rules to hypercubes...")
        df_rules = turn_rules_to_df(list_rules=list(rules_all["rule"].values),
                                    list_cols=feature_cols)

        # Get corresponding rule size from the original rule extraction model,
        # not on the hypercubes obtained later
        df_rules["size_rules"] = list(rules_all["size_rules"].values)

        # Prune rules
        if simplify_rules:
            print("Prunning the rules obtained...")
            df_rules_pruned = df_rules.drop(columns=["size_rules"]).copy()
            df_rules_pruned = simplifyRules(df_rules_pruned, categorical_cols)
            df_rules_pruned = df_rules_pruned.reset_index().merge(
                df_rules.reset_index()[["index", "size_rules"]], how="left")
            df_rules_pruned.index = df_rules_pruned["index"]
            df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                   errors="ignore")
            df_rules = df_rules_pruned
        return df_rules
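
Note that turn_rules_to_df and simplifyRules come from elsewhere in this project and are not shown here. As a rough illustration of the rule-to-hypercube idea, here is a minimal sketch (the function name and parsing are hypothetical, not the project's actual helper), assuming RuleFit rule strings such as "x1 <= 1.5 & x2 > 2.0":

import re
import numpy as np
import pandas as pd

def turn_rules_to_df_sketch(list_rules, list_cols):
    # One row per rule; for each feature keep the [lower, upper] interval
    # implied by the rule's conditions (the rule's "hypercube").
    rows = []
    for rule in list_rules:
        bounds = {c: [-np.inf, np.inf] for c in list_cols}
        for cond in rule.split("&"):
            m = re.match(r"\s*(.+?)\s*(<=|<|>=|>)\s*(-?\d+\.?\d*)\s*$", cond)
            if m is None or m.group(1) not in bounds:
                continue  # skip conditions that cannot be parsed
            col, op, val = m.group(1), m.group(2), float(m.group(3))
            if op in ("<=", "<"):
                bounds[col][1] = min(bounds[col][1], val)
            else:
                bounds[col][0] = max(bounds[col][0], val)
        row = {}
        for c in list_cols:
            row[c + "_min"], row[c + "_max"] = bounds[c]
        rows.append(row)
    return pd.DataFrame(rows)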
Example n. 5

 def __init__(self,
              mode,
              max_depth=3,
              feature_names=None,
              max_rules=20000,
              exp_rand_tree_size=True,
              Cs=None,
              cv=None,
              tree_generator=None):
     super().__init__(mode,
                      max_depth=max_depth,
                      feature_names=feature_names,
                      max_rules=max_rules,
                      exp_rand_tree_size=exp_rand_tree_size,
                      Cs=Cs,
                      cv=cv,
                      tree_generator=tree_generator)
     if Cs is None:
         Cs = 10.**np.arange(-4, 1)
     if cv is None:
         cv = 3
     self.feature_names = feature_names
     self.mode = mode
     max_leafs = 2**max_depth
     num_trees = max_rules // max_leafs
     if tree_generator is None:
         tree_generator = RandomForestClassifier(num_trees,
                                                 max_depth=max_depth)
     self.exp_rand_tree_size = exp_rand_tree_size
     self.rf = RuleFit(rfmode=mode,
                       tree_size=max_leafs,
                       max_rules=max_rules,
                       tree_generator=tree_generator,
                       exp_rand_tree_size=exp_rand_tree_size,
                       fit_lasso=False,
                       Cs=Cs,
                       cv=cv)
Example n. 6
boston_data = pd.read_csv("boston.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.values

typ = 'classifier'  # 'regressor' or 'classifier'

if typ == 'regressor':
    rf = RuleFit(tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 random_state=1)
    rf.fit(X, y, feature_names=features)
    y_pred = rf.predict(X)
    insample_rmse = np.sqrt(np.sum((y_pred - y)**2) / len(y))
elif typ == 'classifier':
    y_class = y.copy()
    y_class[y_class < 21] = -1
    y_class[y_class >= 21] = +1
    N = X.shape[0]
    rf = RuleFit(tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='classify',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 random_state=1)
    rf.fit(X, y_class, feature_names=features)
    y_pred = rf.predict(X)
Example n. 7
def fitrf(train, labels):
    r = RuleFit()
    r.fit(train, labels)
    return r
Example n. 8
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor
from rulefit import RuleFit

boston_data = pd.read_csv("boston.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.values

gb = GradientBoostingRegressor(n_estimators=100,
                               max_depth=3,
                               learning_rate=0.01)
rf = RuleFit(tree_generator=gb)

rf.fit(X, y, feature_names=features)

rules = rf.get_rules()

rules = rules[rules.coef != 0].sort_values("support")
Example n. 9

rgb = RuleFit(Cs=None,
              cv=3,
              exp_rand_tree_size=True,
              lin_standardise=True,
              lin_trim_quantile=0.025,
              max_rules=2000,
              memory_par=0.01,
              model_type='rl',
              random_state=None,
              rfmode='regress',
              sample_fract='default',
              tree_generator=GradientBoostingRegressor(
                  alpha=0.9,
                  criterion='friedman_mse',
                  init=None,
                  learning_rate=0.02,
                  loss='ls',
                  max_depth=100,
                  max_features=None,
                  max_leaf_nodes=15,
                  min_impurity_decrease=0.0,
                  min_impurity_split=None,
                  min_samples_leaf=1,
                  min_samples_split=2,
                  min_weight_fraction_leaf=0.0,
                  n_estimators=500,
                  n_iter_no_change=None,
                  presort='auto',
                  random_state=572,
                  subsample=0.46436099318265595,
                  tol=0.0001,
                  validation_fraction=0.1,
                  verbose=0,
                  warm_start=False),
              tree_size=3)
Example n. 11
    # Cross-validation
    rs = KFold(n_splits=args.kfolds,
               shuffle=True,
               random_state=args.randomseed)
    # Generate the k-fold train/test indices
    resampled_index_set = rs.split(y_resampled)
    k_fold_step = 1  # fold counter
    # Buffers for the selected test labels and the corresponding predictions
    test_cache = pred_cache = np.array([], dtype=int)
    # Iterate over the k folds
    for train_index, test_index in resampled_index_set:
        print("\nFold:", k_fold_step)
        # Initialize the estimator and fit it on the training fold
        rf = RuleFit(tree_size=args.treesize,
                     rfmode=args.rfmode,
                     max_rules=args.maxrules,
                     random_state=args.randomseed)
        rf.fit(x_resampled[train_index], y_resampled[train_index], features)
        # Validate on the test fold (use the indices to drop padded fake samples)
        real_test_index = test_index[test_index < X.shape[0]]
        batch_test_x = x_resampled[real_test_index]
        batch_test_y = y_resampled[real_test_index]
        batch_test_size = len(real_test_index)
        y_pred = rf.predict(batch_test_x)
        # Compute test-set accuracy
        accTest = accuracy_score(batch_test_y, y_pred)
        print("\nFold:", k_fold_step, "Test Accuracy:",
              "{:.6f}".format(accTest), "Test Size:", batch_test_size)
        # Accumulate the test labels and predictions
        test_cache = np.concatenate((test_cache, batch_test_y))
        pred_cache = np.concatenate((pred_cache, y_pred))
Example n. 12
    features = dataset.columns[1:]
    print("\nDataset shape: ", dataset.shape, " Number of features: ",
          features.size)
    # Per-class counts (based on the first column)
    class_info = dataset.groupby(dataset.columns[0]).size()
    print('\n', class_info)
    # print("\nClass info:", np.unique(y, return_counts=True))
    print("\nTraining Start...")

    # Standardize the features
    scaler = StandardScaler().fit(X_origin)
    X = scaler.transform(X_origin)

    # Initialize the estimator
    rf = RuleFit(tree_size=args.treesize,
                 rfmode=args.rfmode,
                 max_rules=args.maxrules,
                 random_state=args.randomseed)
    # Train the model
    rf.fit(X, y, features)
    # Print the RuleFit training parameters
    print("\n=== Model parameters ===")
    print(
        "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
        .format(rf.tree_generator, rf.max_rules, rf.tree_size,
                rf.random_state))
    # Validate on the training data
    y_pred = rf.predict(X)
    # Report the evaluation statistics
    num_categories = class_info.values.size
    if num_categories > 2:
        utils.model_evaluation(num_categories, y, y_pred)
Example n. 14
    print('\n', df_sum_y)

    # Cross-validation
    rs = KFold(n_splits=args.kfolds,
               shuffle=True,
               random_state=args.randomseed)
    # Generate the k-fold train/test indices
    cv_index_set = rs.split(y)
    # Buffers for the selected test labels and the corresponding predictions
    test_cache = pred_cache = np.array([], dtype=int)
    # Build a process pool to train the folds in parallel
    pool = Pool(processes=args.kfolds)
    res = []
    # Initialize the estimator
    rf = RuleFit(tree_size=args.treesize,
                 rfmode=args.rfmode,
                 max_rules=args.maxrules,
                 random_state=args.randomseed)
    # Print the RuleFit model parameters
    print("\n=== Model parameters ===")
    print(
        "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
        .format(rf.tree_generator, rf.max_rules, rf.tree_size,
                rf.random_state))
    print("\nTraining Start...")
    for train_index, test_index in cv_index_set:
        result = pool.apply_async(train_job,
                                  args=(train_index, test_index),
                                  kwds=dict(estimator=rf,
                                            X=X,
                                            y=y,
                                            feas=features))
        res.append(result)
Example n. 15
perm = PermutationImportance(rf, random_state=1).fit(X_validation,
                                                     Y_validation)
results.write('\n\n\nRANDOM FOREST REGRESSOR PERMUTATION IMPORTANCE\n\n\n')
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data.columns.tolist())))

results.write(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data.columns.tolist())))

##########################################################
#### CREATE SHADOW MODEL IN FORM OF RULE FIT ALGORITHM ###
##########################################################

rf = RuleFit()
rf.fit(X_train, [int(i) for i in Y_train],
       feature_names=[
           'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'
       ])
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("support", ascending=False)
print('\n\nRule Fit algorithm rules\n' + str(rules))
results.write('\n\nRule Fit algorithm rules\n' + str(rules))

##############################################################
#### CREATE SHADOW MODEL IN FORM OF FORMAL CONCEPT LATTICE ###
##############################################################

from concepts import Context
Example n. 16
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 25 23:54:35 2018

@author: Melanie
"""

import numpy as np
import pandas as pd

from rulefit import RuleFit

boston_data = pd.read_csv("prism_numeric.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.as_matrix()

rf = RuleFit()
rf.fit(X, y, feature_names=features)
rf.predict(X)
rules = rf.get_rules()

rules = rules[rules.coef != 0].sort_values("support", ascending=False)

print(rules)
Example n. 17
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from rulefit import RuleFit


boston_data = pd.read_csv("boston.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.values

typ = 'regressor'  # 'regressor' or 'classifier'

if typ == 'regressor':
    rf = RuleFit(
        rfmode='regress',
        tree_generator=RandomForestRegressor()
    )
    rf.fit(X, y, feature_names=features)
    y_pred = rf.predict(X)
    insample_rmse = np.sqrt(np.sum((y_pred - y)**2)/len(y))
elif typ == 'classifier':
    y_class = y.copy()
    y_class[y_class < 21] = -1
    y_class[y_class >= 21] = +1
    N = X.shape[0]
    rf = RuleFit(rfmode='classify',
                 tree_generator=RandomForestClassifier())
    rf.fit(X, y_class, feature_names=features)
    y_pred = rf.predict(X)
    y_proba = rf.predict_proba(X)
Example n. 18

import numpy as np
import pandas as pd
from rulefit import RuleFit

## Create an artificial data set
n = 10000
x1 = np.random.normal(scale=1, size=n)
x2 = np.random.normal(loc=0, scale=1, size=n)
x3 = np.random.normal(size=n)
x4 = np.random.normal(size=n)

eps = np.random.normal(loc=0, scale=0.1, size=n)

y = 5 * ((x1 > 1).astype(int) * (x2 < -1).astype(int)) + 0.3 * x4 + eps

X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})

rf = RuleFit()
rf.fit(X.values, y, feature_names=X.columns)

rules = rf.get_rules(exclude_zero_coef=True)

print(rules)
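
Since y above is built as 5 * 1[x1 > 1] * 1[x2 < -1] + 0.3 * x4 + eps, a well-fitted model should rank a rule close to x1 > 1 & x2 <= -1 among its most important terms. A quick, illustrative check on the rules dataframe returned above:

# The planted x1/x2 interaction should appear near the top by importance.
top = rules.sort_values("importance", ascending=False).head(5)
print(top[["rule", "type", "coef", "support"]])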
Example n. 19
    # Cross-validation
    rs = KFold(n_splits=args.kfolds,
               shuffle=True,
               random_state=args.randomseed)
    # Generate the k-fold train/test indices
    cv_index_set = rs.split(y)
    k_fold_step = 1  # fold counter
    # Buffers for the selected test labels and the corresponding predictions
    test_cache = pred_cache = np.array([], dtype=int)
    # Iterate over the k folds
    for train_index, test_index in cv_index_set:
        print("\nFold:", k_fold_step)
        # Initialize the estimator and fit it on the training fold
        rf = RuleFit(tree_size=args.treesize,
                     rfmode=args.rfmode,
                     max_rules=args.maxrules,
                     random_state=args.randomseed)
        rf.fit(X[train_index], y[train_index], features)
        # Validate on the test fold
        y_pred = rf.predict(X[test_index])
        # Compute test-set accuracy
        accTest = accuracy_score(y[test_index], y_pred)
        print("\nFold:", k_fold_step, "Test Accuracy:",
              "{:.6f}".format(accTest), "Test Size:", test_index.size)
        # Accumulate the test labels and predictions
        test_cache = np.concatenate((test_cache, y[test_index]))
        pred_cache = np.concatenate((pred_cache, y_pred))
        print(
            "\n========================================================================="
        )
        # Advance the fold counter
        k_fold_step += 1
Example n. 20
import numpy as np
import pandas as pd

from rulefit import RuleFit

data = np.load('data.npy')
target = np.load('target.npy')

feature_name = ['QNH', 'TEMP', 'RH', 'absolute_temp', 'WS2A', 'CW2A']

train = data[200:, :]
test = data[:200, :]
train_target = target[200:]
test_target = target[:200]

from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(n_estimators=500,
                               max_depth=10,
                               learning_rate=0.01)

relu_fit = RuleFit()
relu_fit.max_iter = 4000
relu_fit.tree_generator = gb
relu_fit.fit(train, train_target, feature_names=feature_name)
f = relu_fit.predict(test)
ff = relu_fit.predict(train)
rule = relu_fit.get_rules()
truth = 0
for i in range(test_target.shape[0]):
    if abs(test_target[i] - f[i]) / test_target[i] < 0.1:
        truth += 1

print("truth: ", truth / test_target.shape[0])
#print(rule)
ruleset = pd.DataFrame(data=rule)
writer = pd.ExcelWriter('./rules.xlsx')
ruleset.to_excel(writer)
writer.save()
Example n. 21

class FeatureVec(object):
    "Feature-vector class."

    def __init__(
        self,
        mode,
        max_depth=3,
        feature_names=None,
        max_sentences=20000,
        exp_rand_tree_size=True,
        tree_generator=None,
    ):
        '''
        mode: 'classify' or 'regress'
        max_depth: maximum depth of the trained trees
        feature_names: names of the features
        max_sentences: maximum number of extracted sentences
        exp_rand_tree_size: if True, grow trees with exponentially distributed sizes
        tree_generator: tree generator model (overrides the options above)
        '''
        self.feature_names = feature_names
        self.mode = mode
        max_leafs = 2**max_depth
        num_trees = max_sentences // max_leafs
        if tree_generator is None:
            tree_generator = RandomForestClassifier(num_trees,
                                                    max_depth=max_depth)
        self.exp_rand_tree_size = exp_rand_tree_size
        self.rf = RuleFit(rfmode=mode,
                          tree_size=max_leafs,
                          max_rules=max_sentences,
                          tree_generator=tree_generator,
                          exp_rand_tree_size=exp_rand_tree_size,
                          fit_lasso=False,
                          Cs=10.**np.arange(-4, 1),
                          cv=3)

    def fit(self, X, y, restart=True, bagging=0):
        '''Fit the tree model.
        X: inputs
        y: outputs (integer class labels or real values)
        restart: if True, retrain the tree-generator model from scratch
        bagging: if > 0, apply bagging over trees to compute confidence intervals
        '''

        if not bagging:
            bagging = 0

        dimred = TruncatedSVD(2)
        self.rf.fit(X, y, restart=restart)
        rules = self.rf.get_rules()['rule'].values
        cm = cooccurance_matrix(rules, X.shape[-1])
        vectors = dimred.fit_transform(cm)
        vectors = normalize_angles(vectors)
        self.norms = np.clip(np.linalg.norm(vectors, axis=-1, keepdims=True),
                             1e-12, None)
        vectors /= np.max(self.norms)
        self.vectors = vectors
        self.importance = np.linalg.norm(self.vectors, axis=-1)
        self.angles = np.arctan2(self.vectors[:, 1], self.vectors[:, 0])
        self.stds = np.zeros(vectors.shape)
        self.predictor = self.rf.tree_generator
        if bagging:
            all_vectors = []
            for _ in range(bagging):
                self.rf.bag_trees(X, y)
                rules_bag = self.rf.get_rules()['rule'].values
                cm_bag = cooccurance_matrix(rules_bag, X.shape[-1])
                vectors_bag = dimred.fit_transform(cm_bag)
                vectors_bag = normalize_angles(vectors_bag)
                norms_bag = np.clip(
                    np.linalg.norm(vectors_bag, axis=-1, keepdims=True), 1e-12,
                    None)
                all_vectors.append(vectors_bag / norms_bag)
            self.stds = np.std(all_vectors, 0)

    def plot(self, dynamic=True, confidence=True, path=None):
        '''Plot the feature vectors.
        dynamic: if True the output is a dynamic HTML plot; otherwise a static image
        confidence: whether to show confidence intervals
        path: path to save the figure; for a dynamic plot it must be an .html file
        '''
        mx = 1.1
        angles = np.arctan2(self.vectors[:, 1], self.vectors[:, 0])
        max_angle = np.max(np.abs(angles))
        feature_names = self.feature_names + ['origin', '']
        plot_vectors = np.concatenate([self.vectors, [[0, 0], [0, 0]]])
        vectors_sizes = np.linalg.norm(plot_vectors, axis=-1)
        plot_angles = np.concatenate([angles, [-max_angle, max_angle]])
        plot_data = np.stack([
            plot_vectors[:, 1], plot_vectors[:, 0], plot_angles, feature_names
        ],
                             axis=-1)
        plot_df = pd.DataFrame(data=plot_data,
                               columns=['x', 'y', 'angles', 'names'])
        plot_df[["x", "y",
                 "angles"]] = plot_df[["x", "y",
                                       "angles"]].apply(pd.to_numeric)
        if dynamic:
            fig = px.scatter(
                plot_df,
                x='x',
                y='y',
                color='angles',
                width=1000,
                height=500,
                hover_name=feature_names,
                hover_data={
                    'x': False,
                    'y': False,
                    'angles': False,
                    'names': False
                },
                color_continuous_scale=px.colors.sequential.Rainbow)

            fig.update_yaxes(visible=False,
                             showticklabels=False,
                             range=[0, mx])
            fig.update_xaxes(visible=False,
                             showticklabels=False,
                             range=[-mx, mx])
        else:
            fig = px.scatter(
                plot_df,
                x='x',
                y='y',
                color='angles',
                width=1000,
                height=500,
                hover_name='names',
                hover_data={
                    'x': False,
                    'y': False,
                    'angles': False,
                    'names': False
                },
                color_continuous_scale=px.colors.sequential.Rainbow)
            max_name_len = max([len(i) for i in feature_names])
            for i in range(len(plot_vectors) - 2):
                if plot_vectors[:, 1][i] > 0:
                    name = feature_names[i] + ''.join(
                        [' '] * (max_name_len - len(feature_names[i])))
                    ax = plot_vectors[:, 1][i] + 0.2
                else:
                    name = ''.join([' '] *
                                   (max_name_len -
                                    len(feature_names[i]))) + feature_names[i]
                    ax = plot_vectors[:, 1][i] - 0.2
                if vectors_sizes[i] < 0.2:
                    continue
                fig.add_annotation(
                    x=plot_vectors[:, 1][i],
                    y=plot_vectors[:, 0][i],
                    text=name,
                    font=dict(size=15),
                    axref="x",
                    ayref="y",
                    ax=ax,
                    ay=plot_vectors[:, 0][i],
                    arrowhead=2,
                )
                fig.update_yaxes(visible=False,
                                 showticklabels=False,
                                 range=[0, mx])
                fig.update_xaxes(visible=False,
                                 showticklabels=False,
                                 range=[-mx, mx])
        fig.update_traces(marker=dict(size=10), textfont_size=15)
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(showlegend=False)
        for i in range(10):
            fig.add_shape(type='circle',
                          x0=(i + 1) / 10 * mx,
                          y0=(i + 1) / 10 * mx,
                          x1=-(i + 1) / 10 * mx,
                          y1=-(i + 1) / 10 * mx,
                          line_color="red",
                          opacity=0.5,
                          line=dict(dash='dot', width=3))
        if confidence:
            for vector, std, angle in zip(self.vectors, self.stds, angles):
                fig.add_shape(type='circle',
                              x0=vector[1] + 3 * std[1],
                              y0=vector[0] + 3 * std[0],
                              x1=vector[1] - 3 * std[1],
                              y1=vector[0] - 3 * std[0],
                              line_color='gray',
                              opacity=0.5,
                              line=dict(dash='solid', width=1))
        fig.show()
        if path:
            if len(path.split('/')) > 1 and not os.path.exists('/'.join(
                    path.split('/')[:-1])):
                os.makedirs('/'.join(path.split('/')[:-1]))
            if dynamic:
                assert path.split(
                    '.'
                )[-1] == 'html', 'For a dynamic figure, path should be an html file!'
                fig.write_html(path)
            else:
                fig.write_image(path)
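
A minimal end-to-end sketch of how this class appears to be used, based only on the signatures above (the toy data and output file name are made up for illustration; cooccurance_matrix, normalize_angles, TruncatedSVD and px must be importable for fit and plot to run):

import numpy as np

X = np.random.normal(size=(500, 6))          # toy inputs, illustrative only
y = (X[:, 0] > 0).astype(int)                # toy binary labels

fv = FeatureVec(mode='classify',
                max_depth=3,
                feature_names=['f%d' % i for i in range(6)])
fv.fit(X, y, restart=True, bagging=5)        # bagging > 0 adds confidence estimates
fv.plot(dynamic=False, confidence=True, path='feature_vec.png')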