Example #1
 def rf_discrete(self, x, y):
     """
     Discretize the input 1-D numpy array based on a tree model
     :param x: 1-D numpy array
     :param y: 1-D numpy array target variable
     :return: discretized 1-D numpy array and the list of intervals
     """
     res = np.array([0] * x.shape[-1], dtype=int)
     interval_list = []
     x = np.column_stack((x, res))
     # model = RandomForestRegressor(n_estimators=60, max_depth=10)
     model = DecisionTreeRegressor(max_depth=10)
     model.fit(x, y)
     prediction, bias, contribution = ti.predict(model, x)
     # print(prediction, "\n", bias, "\n", contribution)
     '''
     for i in range(n):
         point1, point2 = stats.scoreatpercentile(x, [i*100/n, (i+1)*100/n])
         x1 = x[np.where((x >= point1) & (x <= point2))]
         mask = np.in1d(x, x1)
         res[mask] = (i + 1)
         # logging.info("discrete: " + str(res) + str((point1, point2)))
         # logging.info("mask: " + str(mask))
     # logging.info("discrete_main: " + str(res))       
     '''
     # raise ValueError
     return res, interval_list
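The commented-out block above is the equal-frequency binning this helper was meant to perform. A minimal standalone sketch of that idea (the bin count n is a free parameter, not part of the original class) could be:

import numpy as np
from scipy import stats

def quantile_discretize(x, n=4):
    # assign every value of the 1-D array x to one of n equal-frequency bins
    res = np.zeros(x.shape[-1], dtype=int)
    intervals = []
    for i in range(n):
        lo, hi = stats.scoreatpercentile(x, [i * 100 / n, (i + 1) * 100 / n])
        mask = (x >= lo) & (x <= hi)
        res[mask] = i + 1
        intervals.append((lo, hi))
    return res, intervals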
Example #2
def analize_company_explain(loaded_model, y_predict_r, my_company_data):
    """
    :param loaded_model: machine learning model
    :param y_predict_r: rounded prediction for the company
    :param my_company_data: data for the company
    """
    y_predict_r = np.round(y_predict_r, 1)

    optimised_random_forest = loaded_model.steps[-1]
    prediction, bias, contributions = ti.predict(optimised_random_forest[1][1],
                                                 my_company_data)
    util.myprint(prediction)
    util.myprint(bias)
    local_res = list()

    y_pred_r_colors = list()
    for i in range(len(y_predict_r)):
        if y_predict_r[i] > ariskparametri.YearlyYellow:
            if y_predict_r[i] > ariskparametri.YearlyRed:
                y_pred_r_colors.append(ariskparametri.ReportBarRosso)
            else:
                y_pred_r_colors.append(ariskparametri.ReportBarGiallo)
        else:
            y_pred_r_colors.append(ariskparametri.ReportBarVerde)

    for i in range(len(contributions)):
        res0, res1 = map(list, zip(*contributions[i]))
        util.myprint(res0)
        local_res.append(res1)

    return local_res
Example #3
def test_that_tree_works():
    from treeinterpreter import treeinterpreter as ti
    # Code below compares refactored blog post to our wrapper implementation.
    # http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/

    # Fit tree
    boston = load_boston()
    rf = RandomForestRegressor()
    X, y = boston.data[:300], boston.target[:300]
    feature_names = boston.feature_names

    X_new = boston.data[[300, 309]]
    y_new = boston.target[[300, 309]]
    rf.fit(X, y)

    # Build expected local explanation
    prediction, bias, contributions = ti.predict(rf, X_new)

    # Build actual local explanation
    explainer = TreeInterpreter(rf, X, feature_names=feature_names)
    local_expl = explainer.explain_local(X_new, y_new)

    a_local_data = local_expl.data(key=0)
    assert all([
        feature_names[i] == a_local_data["names"][i]
        for i in range(len(feature_names))
    ])
    assert all([
        contributions[0, i] == a_local_data["scores"][i]
        for i in range(len(feature_names))
    ])
    assert a_local_data["extra"]["names"][0] == "Bias"
    assert a_local_data["extra"]["scores"][0] == bias[0]
def MeaningfulSampling(instance2explain, blackbox, training_data, N_samples):
    """
    This function performs dense data generation for the instance2explain.
    It starts by randomly generating data points from the distribution of the
    training data, then moves them closer to instance2explain by considering
    similarities between feature values and feature importance.
    """

    # Generating random data using the distribution of training data
    # Discretizing random data for comparison of feature values
    random_samples = RandomSampling(instance2explain, training_data, N_samples)
    random_samples_dc = QuartileDiscretization(random_samples)

    # Constructing a random forest classifier as surrogate model
    surrogate_model = RandomForestClassifier(n_estimators=10)
    surrogate_model.fit(random_samples, blackbox.predict(random_samples))

    # Extracting feature contributions using TreeInterpreter
    # Discretizing contributions for comparison of feature importance
    prediction, bias, contributions = ti.predict(surrogate_model,
                                                 random_samples)
    contributions_dc = SturgesDiscretization(contributions)

    # Making a dense neighborhood w.r.t instance2explain
    dense_samples = SampleManipulation(prediction, random_samples,
                                       random_samples_dc, contributions_dc)

    interpretable_dense_samples = InterpretableRepresentation(dense_samples)

    return interpretable_dense_samples, dense_samples
Example #5
def classify(datapath, commithash=None, index=None):
    """ Load model and classify single data point. Also determines
    most significant feature """
    # pylint: disable = too-many-locals
    clf = joblib.load('model.pkl')
    data, _, hashes, names = load_data(datapath)

    if commithash:
        temp, = np.where(hashes == commithash)
        sample = temp[0]
    elif index is not None:
        sample = index
    else:
        sample = 1

    prediction, _, contributions = ti.predict(clf, data[[sample]])
    label1 = np.array(contributions)[0, :, 0]
    label2 = np.array(contributions)[0, :, 1]

    if prediction[0][0] > prediction[0][1]:
        res = label1
        labeltext = 'clean'
    else:
        res = label2
        labeltext = 'buggy'

    top = max(res)
    index, = np.where(res == top)
    feature = names[index[0] + 1]

    print('Predicted result: ' + labeltext)
    print('Top factor: ' + feature)
Example #6
def predict(row):
    df = pd.DataFrame.from_dict([row], orient='columns')
    #open model RF
    with open('website/model.pkl', 'rb') as f:
        model = pickle.load(f)

    df1 = transform_test(df)

    #predict probability using model
    prediction = model.predict_proba(df1.values.reshape(1, -1))[0][1]

    row['prediction'] = prediction

    #determine contribution of features to prediction using treeinterpreter
    prediction, bias, contributions = ti.predict(model,
                                                 df1.values.reshape(1, -1))
    #empty list to hold important features which contributed to prediction
    important_features = []
    #names for features
    column_features = [
        'name_length', 'num_payouts', 'user_age', 'org_facebook',
        'org_twitter', 'body_length', 'gts', 'sale_duration', 'tickets_sold'
    ]
    #take the three features with the highest absolute contribution to class 1
    for score, name in sorted(
            zip(abs(contributions[0][:, 1]), column_features))[::-1][:3]:
        important_features.append(name)

    row['contributions'] = important_features

    #return the row, enriched with the fraud probability and the top contributing features,
    #so the caller can classify it later using the chosen threshold
    return row
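For a binary classifier, `contributions` returned by ti.predict has shape (n_samples, n_features, n_classes), so the top-3 selection above can be packaged as a small helper. This is an illustrative sketch; the helper name and the feature names are the caller's choice:

import numpy as np

def top_k_features(contributions, feature_names, k=3, class_index=1):
    # rank features by absolute contribution to the chosen class for the first sample
    scores = np.abs(contributions[0][:, class_index])
    order = np.argsort(scores)[::-1][:k]
    return [feature_names[i] for i in order]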
Example #7
def randomForstClassifier(data):

    labels = [
        'srch_id',
        # 'site_id',
        'prop_id',
        # 'prop_starrating',
        # 'prop_review_score',
        # 'prop_brand_bool',
        'prop_location_score1',
        'prop_location_score2',
        # 'position',
        'price_usd',
        # 'promotion_flag',
        # 'srch_saturday_night_bool'
        # 'random_bool',
        # 'click_bool',
        # 'booking_bool',
        # 'price_usd_normalized',
        # 'consumer'
        # 'Pclass'
        # 'score'
    ]

    # testdata = (testdata[labels])

    y = (data['score'])

    x = data[labels]

    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=42)
    # pprint(X_test)

    print("random forest")
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                n_estimators=1000,
                                max_depth=400,
                                verbose=0)

    rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)

    print('accuracy_score ', accuracy_score(y_test, predictions))
    print('confusion_matrix ', confusion_matrix(y_test, predictions))
    print('classification_report', classification_report(y_test, predictions))

    prediction, bias, contributions = ti.predict(rf, X_test)
    print("Prediction", prediction)
    print("Bias (trainset prior)", bias)
    print("Feature contributions:")
    for c, feature in zip(contributions[0], labels):
        print(feature, c)
    def test_tree_regressor(self):
        X = self.boston.data
        Y = self.boston.target
        testX = X[int(len(X) / 2):]

        #Predict for decision tree
        dt = DecisionTreeRegressor()
        dt.fit(X[:int(len(X) / 2)], Y[:int(len(X) / 2)])

        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))

        testX = X[-1:]
        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def iterforest(clf, i, metricarr):
  for i in range(i):
    clf_spec = RandomForestClassifier(max_depth=None, random_state=1, oob_score=True,
                                      n_estimators=10, class_weight="balanced")
    clf_spec.fit(X_train, y_train)
    y_pred_iter = pd.DataFrame(clf_spec.predict(X_test))
    #rf_probs = clf_spec.predict_proba(X_test)[:, 0]
    #rf_probs2 = clf_spec.predict_proba(X_test)[:, 1]

    classes_spr = list(unique_labels(y_test, np.ravel(y_pred_iter)))
    print(classification_report(y_test, y_pred_iter, target_names=classes_spr))
    metricarr.append(classification_report(y_test, y_pred_iter, target_names=classes_spr))
    np.set_printoptions(precision=2)

    plot_confusion_matrix(y_test, y_pred_iter, classes=classes_spr, normalize=False,
                      title='confusion matrix for Normal Random Forest iteration ' + str(i))
    plt.show()
    plt.show()
    
    
    from treeinterpreter import treeinterpreter as ti
    from collections import defaultdict 
    import random
    class1=defaultdict(list)
    class2=defaultdict(list)
    instance = X_test.sample(n=50000).values  # instance randomized to avoid RAM error
    prediction, bias, contributions = ti.predict(clf_spec, instance)
    #print ("Prediction", prediction)
    #print ("Bias (trainset prior)", bias)
    print ("Feature contributions for rf100:")
    test =list()
    count = 0
    for i in range(len(instance)):
      for c,feature in zip(contributions[i],header):
        #print (feature,c)
        fc_class1 = c[0]
        fc_class2=c[1]
        class1[feature].append(fc_class1)
        class2[feature].append(fc_class2)
      count+=1

    #np.sum(contributions,axis=1)
    print(count)

    import statistics as sts
    class1_fixed=list()
    class2_fixed=list()
    for header_name in header:
      array_value = class1[header_name]
      class2_array_value = class2[header_name]
      class1_fixed.append(sts.median(array_value))
      class2_fixed.append(sts.median(class2_array_value))      
    for i in range(len(class1_fixed)):
      if class1_fixed[i] >0:
        setnum1=class1_fixed[i]
      if class2_fixed[i] > 0:
        setnum2 = class2_fixed[i]
    def test_tree_classifier(self):
        X = self.iris.data
        Y = self.iris.target
        dt = DecisionTreeClassifier()
        dt.fit(X[:int(len(X) / 2)], Y[:int(len(X) / 2)])
        testX = X[int(len(X) / 2):int(len(X) / 2) + 1]
        base_prediction = dt.predict_proba(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)

        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
Example #13
def tree_interpret(model,X_test,cols=None):
    pred, bias, contrib = ti.predict(model, X_test)
    assert np.allclose(pred, np.sum(contrib, axis=1) + bias[0]), "contributions do not sum to the prediction"
    if cols is None:
        try:
            cols = X_test.columns
        except AttributeError:
            cols = np.arange(X_test.shape[1]).astype(str)

    _contrib = pd.DataFrame(contrib[:, :, 1], columns=cols)
    _pred = np.argmax(pred, axis=1)
    return _pred, bias, _contrib
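A possible end-to-end use of this helper, with sklearn's breast-cancer data standing in for the real inputs (the example's own np, pd and ti imports are assumed to be in scope):

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
model = RandomForestClassifier(n_estimators=10, random_state=0).fit(X[:400], data.target[:400])
_pred, bias, _contrib = tree_interpret(model, X[400:])
# features with the largest average class-1 contribution on the held-out rows
print(_contrib.abs().mean().sort_values(ascending=False).head())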
Example #14
 def test_forest_classifier_parallel(self):
     idx = np.arange(len(self.iris.data))
     np.random.shuffle(idx)
     X = self.iris.data[idx]
     Y = self.iris.target[idx]
     dt = RandomForestClassifier(max_depth=20, n_estimators=500)
     dt.fit(X[:len(X) // 2], Y[:len(X) // 2])
     testX = X[len(X) // 2:]
     base_prediction = dt.predict_proba(testX)
     pred, bias, contrib = treeinterpreter.predict(dt, testX, n_jobs=2)
     self.assertTrue(np.allclose(base_prediction, pred))
     self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
 def test_forest_classifier(self):
     idx = np.arange(len(self.iris.data))
     np.random.shuffle(idx)
     X = self.iris.data[idx]
     Y = self.iris.target[idx]
     dt = RandomForestClassifier(max_depth=3)
     dt.fit(X[:int(len(X)/2)], Y[:int(len(X)/2)])
     testX = X[int(len(X)/2):]
     base_prediction = dt.predict_proba(testX)
     pred, bias, contrib = treeinterpreter.predict(dt, testX)
     self.assertTrue(np.allclose(base_prediction, pred))
     self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
Example #16
def make_prediction():
    if request.method == 'POST':
        sample = []
        data = {}
        feature_names = [
            'satisfaction', 'evaluation', 'projectCount',
            'averageMonthlyHours', 'yearsAtCompany', 'workAccident',
            'promotion', 'salary', 'department'
        ]

        #print request.form
        satisfaction = request.form.get('Satisfaction')
        sample.append(float(satisfaction))
        evaluation = request.form.get('Evaluation')
        sample.append(float(evaluation))
        projectCount = request.form.get('Project Count')
        sample.append(int(projectCount))
        averageMonthlyHours = request.form.get('Average Monthly Hours')
        sample.append(int(averageMonthlyHours))
        yearsAtCompany = request.form.get('Years At Company')
        sample.append(int(yearsAtCompany))
        workAccident = request.form.get('Work Accident')
        sample.append(int(workAccident))
        promotion = request.form.get('Promotion')
        sample.append(int(promotion))
        salary = request.form.get('Salary')
        sample.append(int(salary))
        department = request.form.get('Department')
        sample.append(department)
        sampleDf = pd.DataFrame(sample)
        if len(sample) < 9:
            return render_template(
                'result.html', label1="Missing data or incorrect data entered")
        # make prediction
        prediction = model.predict(sampleDf.T)
        pred, bias, contributions = ti.predict(model, sampleDf.T)
        confidence = model.predict_proba(sampleDf.T)
        data["confidence_0"] = confidence[0][0]
        data["confidence_0"] = confidence[0][1]
        data["Prediction"] = prediction[0]
        #json_cities = json.dumps(city_array)
        #return render (request, 'plot3/plot_page.html', {"city_array" : json_cities})

        data[0] = {}
        data[1] = {}
        for c in range(len(contributions[0])):

            data[0][feature_names[c]] = round(contributions[0][c][0], 2)
            data[1][feature_names[c]] = round(contributions[0][c][1], 2)

        json_data = json.dumps(data)
        jsonify(data=data)
        return render_template('result.html', data=data)
Example #17
 def treeinterpreter(self):
     # the [:300] slice can be changed as needed; it is set this way only for convenience
     # e.g. use [200:300] to focus on the instances you are interested in
     prediction, bias, contributions = ti.predict(self.model, self.X[:300])
     contributionsum = np.sum(contributions, axis=0)
     contributiondict = dict(zip(self.names, contributionsum))
     featurerank = sorted(contributiondict.items(),
                          key=lambda kv: kv[1],
                          reverse=True)
     print("Features sorted by their score under treeinterpreter criteria:")
     print(featurerank)
     return featurerank
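Because positive and negative contributions cancel in a plain sum, a mean-absolute-contribution ranking is a common variant. A self-contained sketch on the diabetes data (illustrative only, not the model wrapped by this class):

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from treeinterpreter import treeinterpreter as ti

X, y = load_diabetes(return_X_y=True)
names = load_diabetes().feature_names
rf = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
_, _, contributions = ti.predict(rf, X[:300])
# average magnitude of each feature's contribution across the scored rows
ranking = sorted(zip(names, np.abs(contributions).mean(axis=0)),
                 key=lambda kv: kv[1], reverse=True)
print(ranking)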
 def test_forest_classifier_joint(self):
     idx = np.arange(len(self.iris.data))
     np.random.shuffle(idx)
     X = self.iris.data[idx]
     Y = self.iris.target[idx]
     dt = RandomForestClassifier(max_depth=3)
     dt.fit(X[:int(len(X)/2)], Y[:int(len(X)/2)])
     testX = X[int(len(X)/2):]
     base_prediction = dt.predict_proba(testX)
     pred, bias, contribs = treeinterpreter.predict(dt, testX, joint_contribution=True)
     self.assertTrue(np.allclose(base_prediction, pred))
     self.assertTrue(np.allclose(base_prediction, np.array([sum(contrib.values()) for contrib in contribs]) + bias))
Example #19
    def test_gradient_boosting_regressor(self):
        X = self.boston.data
        Y = self.boston.target
        testX = X[len(X) // 2:]

        dt = GradientBoostingRegressor(n_estimators=10)
        dt.fit(X[:len(X) // 2], Y[:len(X) // 2])

        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def get_interpretation_for_trees(model, row, names, n_bests):
    explanation = treeinterpreter.predict(model, row)
    explanation_sorted_with_columns = list(
        sorted([(elt1, elt2) for elt1, elt2 in zip(explanation[2][0], names)],
               key=lambda x: np.abs(x[0]),
               reverse=True))
    return {
        "contribution":
        [elt[0] for elt in explanation_sorted_with_columns][0:n_bests],
        "columns":
        [elt[1] for elt in explanation_sorted_with_columns][0:n_bests]
    }
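A quick way to exercise this function, assuming the snippet's own treeinterpreter and numpy imports are in scope and using the diabetes data purely as a stand-in:

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

X, y = load_diabetes(return_X_y=True)
names = load_diabetes().feature_names
rf = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
# top 3 features by absolute contribution for the first row
print(get_interpretation_for_trees(rf, X[:1], names, n_bests=3))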
Example #22
    def test_random_forest_regressor(self):
        X = self.boston.data
        Y = self.boston.target
        testX = X[len(X) // 2:]

        #Predict for decision tree
        dt = RandomForestRegressor(n_estimators=10)
        dt.fit(X[:len(X) // 2], Y[:len(X) // 2])

        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
    def test_forest_regressor(self):
        X = self.boston.data
        Y = self.boston.target
        testX = X[len(X) // 2:]

        #Predict for decision tree
        dt = RandomForestRegressor(n_estimators=10)
        dt.fit(X[:len(X) // 2], Y[:len(X) // 2])

        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
Example #24
def get_interpretation(X, rf_model):
    for i, row in X.iterrows():
        data_point = pd.DataFrame([row])
        # Once transposed, it will be the column name
        data_point = data_point.set_axis(['value_variable'])
        prediction, bias, contributions = ti.predict(rf_model, data_point)
        local_interpretation = data_point.append(
            pd.DataFrame([[round(c[1], 3) for c in contributions[0]]],
                         columns=data_point.columns.tolist(),
                         index=['contribution_variable'
                                ])).T.sort_values('contribution_variable',
                                                  ascending=False)
        print(local_interpretation)
Example #25
 def test_forest_classifier_joint(self):
     for ForestClassifier in (RandomForestClassifier, ExtraTreesClassifier):
         idx = np.arange(len(self.iris.data))
         np.random.shuffle(idx)
         X = self.iris.data[idx]
         Y = self.iris.target[idx]
         dt = ForestClassifier(max_depth=3)
         dt.fit(X[:int(len(X)/2)], Y[:int(len(X)/2)])
         testX = X[int(len(X)/2):]
         base_prediction = dt.predict_proba(testX)
         pred, bias, contribs = treeinterpreter.predict(dt, testX, joint_contribution=True)
         self.assertTrue(np.allclose(base_prediction, pred))
         self.assertTrue(np.allclose(base_prediction, np.array([sum(contrib.values()) for contrib in contribs]) + bias))
def rePredict(request):
    fixData = []
    temp = []
    # list of feature names
    featureList = totaldata.columns.values.tolist()
    # print(selData)
    fmin = []
    fmax = []
    for i in featureList:
        fmin.append(float(totaldata[i].min()))
        fmax.append(float(totaldata[i].max()))
    for i in featureList:
        temp.append(float(request.GET.get(i)))
    fixData.append(temp)
    p = {}
    rule = {}
    predict = []
    result = estimator.predict_proba(fixData).tolist()
    for j in range(len(estimator.estimators_)):
        m = estimator.estimators_[j].decision_path(fixData)
        predict.append(estimator.estimators_[j].predict_proba(fixData)[0].tolist())
        d = []
        r = {}  # maps feature index to its value range implied by the decision path, e.g. 0: [-1.5, 2]
        for i in m[0].indices:
            temp = []
            f = int(estimator.estimators_[j].tree_.feature[i])  # feature index
            threshold = estimator.estimators_[j].tree_.threshold[i]  # split threshold at this node
            if (estimator.estimators_[j].tree_.feature[i] >= 0):
                if f not in r.keys():
                    r[f] = [fmin[f], fmax[f]]
                temp.append(int(f))
                temp.append(float(threshold))
                if (fixData[0][f] <= threshold):
                    temp.append("<=")
                    if threshold < r[f][1]:
                        r[f][1] = float(threshold)
                else:
                    temp.append(">")
                    if threshold > r[f][0]:
                        r[f][0] = float(threshold)
                d.append(temp)
        p[j] = d
        rule[j] = r
    # contribution compute
    fixDataNDArray = np.array(fixData)
    prediction, bias, contributions = ti.predict(estimator, fixDataNDArray)
    print(prediction, bias, contributions)
    return JsonResponse({'path': p, 'rule': rule, 'predict': predict, 'featureList': featureList, 'result': result, 'data': fixData[0],
                         'prediction': prediction.tolist(), 'bias': bias.tolist(), 'contributions': contributions.tolist()},
                        safe=False)
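The per-tree loop above reconstructs the value range implied by each decision path. The same walk reduces to a few lines for a single fitted tree; this sketch uses the iris data purely for illustration:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
tree = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)
sample = X[:1]
for node in tree.decision_path(sample).indices:  # nodes visited by this sample
    f = tree.tree_.feature[node]
    if f >= 0:  # internal node; leaf nodes are marked with -2
        threshold = tree.tree_.threshold[node]
        op = "<=" if sample[0, f] <= threshold else ">"
        print(f"feature {f} {op} {threshold:.3f}")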
    def test_forest_regressor_joint(self):
        X = self.boston.data
        Y = self.boston.target
        testX = X[int(len(X)/2):]
        
        #Predict for decision tree
        dt = RandomForestRegressor(n_estimators=10)
        dt.fit(X[:int(len(X)/2)], Y[:int(len(X)/2)])

        base_prediction = dt.predict(testX)
        pred, bias, contribs = treeinterpreter.predict(dt, testX, joint_contribution=True)
        self.assertTrue(np.allclose(base_prediction, pred))
        
        self.assertTrue(np.allclose(base_prediction, np.array([sum(contrib.values()) for contrib in contribs]) + bias))
Example #28
    def _forest_importances(self, X, test_cache, result_channels, c, s):
        if isinstance(self.clf[c], (RandomForestRegressor, RandomForestClassifier)) and \
                isinstance(X, np.ndarray):
            features = np.ones(X.shape[1], dtype=bool)
            if 'feature_importance' not in result_channels[c][s]:
                result_channels[c][s]['feature_importance'] = np.full(X.shape[1], np.nan)
            for sel in self.selection[c]:
                features[features] = sel.get_support()
            contrib = ti.predict(self.clf[c], np.array(test_cache[c]))[2]
            result_channels[c][s]['feature_importance'][features] = \
                np.mean(contrib if self.is_regression else contrib[:, :, 0], axis=0)
        elif self.scheme.is_multiroi[c] and \
                isinstance(self.clf[c].base_estimator, (RandomForestRegressor,
                                                        RandomForestClassifier)):
            if 'feature_importance' not in result_channels[c][s]:
                result_channels[c][s]['feature_importance'] = dict()
            if sum(self.selection, []):
                features = dict()
                for sel in self.selection[c]:
                    for k, v in sel.get_support().items():
                        if k in test_cache[c].keys():
                            if k not in features:
                                features[k] = v
                            else:
                                features[k][features[k]] = v
            else:
                features = {k: np.ones(len(v[0]), dtype=bool)
                            for k, v in test_cache[c].items()}
            for k in test_cache[c].keys():
                if k not in result_channels[c][s]['feature_importance']:
                    result_channels[c][s]['feature_importance'][k] = \
                        np.full(len(features[k]), np.nan)
                contrib = ti.predict(self.clf[c].estimators_[k], test_cache[c][k])[2]
                result_channels[c][s]['feature_importance'][k][features[k]] = \
                    np.mean(contrib if self.is_regression else contrib[:, :, 0], axis=0)

        return result_channels
    def test_forest_regressor(self):
        for ForestRegressor in (RandomForestRegressor, ExtraTreesRegressor):
            X = self.boston.data
            Y = self.boston.target
            testX = X[int(len(X) / 2):]

            #Predict for decision tree
            dt = ForestRegressor(n_estimators=10)
            dt.fit(X[:int(len(X) / 2)], Y[:int(len(X) / 2)])

            base_prediction = dt.predict(testX)
            pred, bias, contrib = treeinterpreter.predict(dt, testX)
            self.assertTrue(np.allclose(base_prediction, pred.flatten()))
            self.assertTrue(
                np.allclose(pred.flatten(), bias + np.sum(contrib, axis=1)))
def Contribution(model_dict, data_to_anly, period):
    predict, biases, contributions = ti.predict(
        model_dict['P' + str(period) + 'RF'],
        data_to_anly.drop(['Yield'], axis=1))
    # sort each record's variables by contribution in descending order, pairing contribution values with variable names
    df_sort_contributions = cs.contributionSort(contributions,
                                                list(data_to_anly.columns))
    tmp = pd.DataFrame(
        data={
            'Actual': data_to_anly.loc[:, 'Yield'],
            'Predict': list(predict),
            'Biases': list(biases)
        })
    contribution = pd.concat([tmp, df_sort_contributions], axis=1)
    return contribution
Example #31
    def __calculate_variables_contribution(self, record):
        for i, row in record.iterrows():
            data_point = pd.DataFrame([row])
            data_point.set_axis(
                ['value_variable'],
                inplace=True)  # Once transposed, it will be the column name
            prediction, bias, contributions = ti.predict(
                self.rf_model, data_point)

            local_interpretation = data_point.append(
                pd.DataFrame([[round(c[1], 3) for c in contributions[0]]],
                             columns=data_point.columns.tolist(),
                             index=['contribution_variable'
                                    ])).T.sort_values('contribution_variable',
                                                      ascending=False)
        return local_interpretation
Example #32
    def test_forest_regressor_joint(self):
        return None
        for ForestRegressor in (RandomForestRegressor, ExtraTreesRegressor):
            X = self.boston.data
            Y = self.boston.target
            testX = X[int(len(X)/2):]
            
            #Predict for decision tree
            dt = ForestRegressor(n_estimators=10)
            dt.fit(X[:int(len(X)/2)], Y[:int(len(X)/2)])

            base_prediction = dt.predict(testX)
            pred, bias, contribs = treeinterpreter.predict(dt, testX, joint_contribution=True)
            self.assertTrue(np.allclose(base_prediction, pred))
            
            self.assertTrue(np.allclose(base_prediction, np.array([sum(contrib.values()) for contrib in contribs]) + bias))
Example #33
    def test_gradient_boosting_classifier(self):
        idx = np.arange(len(self.iris.data))
        np.random.shuffle(idx)
        X = self.iris.data[idx]
        Y = self.iris.target[idx]
        dt = GradientBoostingClassifier(max_depth=3)
        dt.fit(X[:len(X) // 2], Y[:len(X) // 2])
        testX = X[len(X) // 2:]
        base_prediction = dt.predict_proba(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        
        self.assertTrue(np.allclose(base_prediction, pred))

        # Need to convert score to proba 
        # using logistic function or similar
        sum_contrib = dt.loss_._score_to_proba(bias + np.sum(contrib, axis=1))
        self.assertTrue(np.allclose(pred, sum_contrib))
Example #34
	def tree_interp(test_df,model):
		"""
		Call treeinterpreter for RF model and build dataframe with output.

		ONLY WORKS FOR INDEP CONTRIBS FOR REGRESSION

		parameters:
			test_df, df: dataframe of test instances
			model: trained rf regression or classification model from sklearn
			joint, str: t/f, whether to get joint feature contributions or not
				PUT BACK IN WHEN YOU ENABLE THIS FEATURE

		returns: a dataframe with the sampleID, label, bias, prediction and
		contributions for all features, for each instance
		"""
		from treeinterpreter import treeinterpreter as ti

		# put labels, ID in contribution dataframe
		interp_df_half = test_df['Y'].to_frame()
		interp_df_half = interp_df_half.set_index(test_df.index)

		# get feature names to use as col names for contributions
		test_featureNames = test_df.columns.values.tolist()
		test_featureNames = test_featureNames[1:]

		print('\n\n===> Calculating independent feature contributions <===')
		# drop Y to format test data for ti
		test_X = test_df.drop(['Y'], axis=1)
		# call ti
		prediction, bias, contributions = ti.predict(model,test_X)

		# add results to contribution df
		interp_df_half['bias'] = bias.tolist()
		interp_df_half['prediction'] = prediction.flatten().tolist()

		# make df of contributions and all other columns to concatenate
		contrib_df = pd.DataFrame(contributions,index = test_df.index,
							columns=test_featureNames)

		# make df where columns are ID, label, bias, prediction, contributions
		local_interp_df = pd.concat([interp_df_half, contrib_df], axis=1)

		print(f'Snapshot of the interpretation dataframe: {local_interp_df.head()}')

		return local_interp_df
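A sketch of calling this routine, assuming tree_interp is reachable at module level and that the label column is named 'Y' and comes first; the diabetes data below is only a stand-in:

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

X, y = load_diabetes(return_X_y=True)
df = pd.DataFrame(X, columns=load_diabetes().feature_names)
df.insert(0, 'Y', y)  # label must be the first column
rf = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
print(tree_interp(df.head(), rf).head())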
Example #35
def random_forest_regressor(numpy_df_train, numpy_df_test, y_train, y_test):
    from treeinterpreter import treeinterpreter as ti
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import classification_report,confusion_matrix

    rf = RandomForestRegressor(verbose=True)
    rf.fit(numpy_df_train, y_train)

    print('Feature Importance ', rf.feature_importances_)

    print('Generating predictions')
    prediction, bias, contributions = ti.predict(rf, numpy_df_test)
    print('Predictions generated')

    idx = test.index
    predictions_df = pd.DataFrame(data=prediction[0:], index = idx, columns=['predicted_value'])
    return predictions_df
Example #36
def predict_sklearn_wTreeInterpreter(ext_df, features_to_include):
    with open(MODEL_PATH, "rb") as pklr:
        clf = pickle.load(pklr)

    try:
        from treeinterpreter import treeinterpreter
        ext_df["pred_Bs"], bias, contribution = treeinterpreter.predict(
            clf, ext_df[features_to_include])

        return contribution
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type,
                                  exc_value,
                                  exc_traceback,
                                  file=sys.stdout)

    return
Example #37
def get_detailed_report(train, trained_filename):
    """
    :param train: data to run the prediction and analysis on
    :param trained_filename: pickle file with the trained machine learning model
    """
    loaded_model = pickle.load(open(trained_filename, 'rb'))

    optimised_random_forest = loaded_model.steps[-1]
    prediction, bias, contributions = ti.predict(optimised_random_forest[1][1],
                                                 train)
    util.myprint(prediction)
    util.myprint(bias)
    indices = [
        'Ricavi delle vendite', 'EBITDA/Vendite (%)', 'EBITDA migl EUR',
        'Indice di liquidità', 'Indice corrente',
        'Indice di indebitam. a lungo',
        'Indice di copertura delle immob. (patrimoniale)',
        'Grado di ammortamento', 'Debiti v/banche su fatt. (%)',
        'Grado di copertura degli interessi passivi',
        'Giac. media delle scorte (gg)', 'Giorni copertura scorte (gg)',
        'Redditività di tutto il capitale investito (ROI) (%)',
        'Flusso di cassa di gestione', 'Oneri finanz. su fatt. (%)'
    ]

    for i in range(5):

        print("Feature contributions:")
        res0, res1 = map(list, zip(*contributions[i]))
        util.myprint(res0)
        for value, feature in sorted(zip(res1, indices),
                                     key=lambda x: -abs(x[0])):
            print(feature, abs(round(value, 2)))
        print("-" * 20)

    res = list()
    for i in range(len(contributions)):
        res0, res1 = map(list, zip(*contributions[i]))
        util.myprint(res0)
        res.append(res1)

    det_res = pd.DataFrame(np.abs(res))

    return det_res
Example #38
    print("Feature Contribution (Random Forest)")
    clf = joblib.load(r'D:\SLIIT\SoftwareIndustry\df_model.pkl')

    df_emp = mysql_cn.read('select * from employeesit_predict')
    emp_id = df_emp['Employee_ID']
    emp_name = df_emp['Employee_Name']
    id = emp_id.tolist()
    e_name = emp_name.tolist()

    arr_con = np.zeros(len(features))
    arr_gain_lost = np.zeros(len(features))
    arr_feature = np.zeros(len(features))
    msg.update_message()
    for index in range(len(X)):
        i = X[index:index + 1]
        prediction, bias, contributions = ti.predict(clf, i)
        a = 0
        for c, feature in zip(contributions[0], features):
            arr_con[a] = round(c[1] * 100, 2)
            # arr_feature[a] = feature
            # arr_gain_lost[a]=round(c[1], 2)
            a = a + 1
            arr_con_list = arr_con.tolist()
        max_value = max(arr_con_list)
        max_index = arr_con_list.index(max_value)
        mysql_cn.insert_update(
            "INSERT INTO `employeesit_predict_feature_cont`(`Employee_ID`, `Employee_Name`, `Age`, `Gender`, `Marital_Status`, "
            "`Having_Degree`, `Job_Role`, `Department`, `WorkFrom`, `WorkTo`, `Tenure`, `Salary`, `Bonus`, `Claims`, `Worked_Project`, `No_of_Leaves`, "
            "`Distance`, `No_of_Parents`, `No_of_Children`, `No_of_Complaints`, `Bias`, `Prediction_Probability`, `Max_Feature`) "
            "VALUES('%s','%s',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'%s')" %
            (id[index], e_name[index], arr_con[0], arr_con[1], arr_con[2], arr_con[3], arr_con[4], arr_con[5],
Example #39
    print()
    print('Analyzing')
    #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them
    outliers = featureMatrix[featureMatrix.prediction == opts.anomalyclass].drop('prediction',axis=1)

    num_outliers = len(outliers.index)
    print('detected %d anomalies out of %d total rows (%.2f%%)' % (num_outliers, total_rows, (num_outliers * 1.0 / total_rows) * 100))

    if num_outliers == 0:
        sys.exit(0)

    if (opts.verbose) and type(clf) is RandomForestClassifier:
        print('investigating all the outliers')
        #investigate each outlier (determine the most influential columns in the prediction)
        prediction, bias, contributions = ti.predict(clf, outliers)
        print('done')
        print(contributions.shape)

    i=0
    #for each anomaly
    for index, row in outliers.iterrows():
        print('-----------------------------------------')
        print('line ', index)
        #find the row in the original data of the anomaly. print it out as CSV.
        print(pd.DataFrame(df.iloc[index]).T.to_csv(header=False, index=False))
        if (opts.verbose) and type(clf) is RandomForestClassifier:
            #if we are verbose print out the investigation by zipping the heavily weighted columns with the appropriate features
            #if we are verbose print out the investigation by zipping the heavily weighted columns with the appropriate features
            instancecontributions = zip(contributions[i], outliers.columns.values)
            print("Top feature contributions to anomaly class:")
            for (c, feature) in sorted(instancecontributions, key=lambda cf: cf[0][1], reverse=True)[:10]:
                print(feature, c[1])
# In[17]:

fit1


# In[37]:

instances = boston.data[[300, 309]]
print "Instance 0 prediction:", rf.predict(instances[0].reshape(1,13))
print "Instance 1 prediction:", rf.predict(instances[1].reshape(1,13))


# In[38]:

prediction, bias, contributions = ti.predict(rf, instances)


# In[40]:

for i in range(len(instances)):
    print "Instance", i
    print "Bias (trainset mean)", bias[i]
    print "Feature contributions:"
    for c, feature in sorted(zip(contributions[i], 
                                 boston.feature_names), 
                             key=lambda x: -abs(x[0])):
        print feature, round(c, 2)
    print "-"*20

Example #41
#    (y.size - x.shape[1] - 1 - 1))
    
#print("Residual sum of squares: %.2f"
#      % np.mean((rf.predict(x) - y) ** 2))

xy_df['predicted'] = rf.predict(x)
xy_df['delta'] = xy_df[y_feat] - xy_df['predicted']
#xy_df = xy_df.sort(columns = y_feat, ascending = False)

#xy_df.to_csv(y_feat + '_predict.txt', sep = '\t', decimal = ',')
################################################################################
# TreeInterpreter
# Calculate feature importances
feat_imp_df = pd.DataFrame(data = rf.feature_importances_, index = feat_names,\
    columns = ['feature_importances'])
prediction, bias, contributions = ti.predict(rf, x)
contr_df = pd.DataFrame(contributions, index = xy_df.index, columns = feat_names)
bias_df = pd.DataFrame(bias, index = xy_df.index, columns = ['bias'])
feat_stat_df = bias_df.join(contr_df)
feat_stat_df = pd.concat([feat_stat_df, feat_imp_df.transpose()])
feat_stat_df = feat_stat_df.T.sort_values(by = 'feature_importances',\
    na_position = 'first', ascending = False).T
#feat_stat_df.to_csv(y_feat + '_feature_statistics.txt', sep = '\t',\
#    decimal = ',')

result = pd.concat([xy_df, feat_stat_df], axis = 1)
result = result.sort_values(by = y_feat, ascending = False, na_position = 'last')
result.columns = [['DataSet', 'DataSet', 'DataSet', 'DataSet', 'DataSet', \
    'DataSet', 'DataSet', 'DataSet', 'DataSet', 'DataSet', 'DataSet',\
    'Feature_Statistics', 'Feature_Statistics', 'Feature_Statistics',\
    'Feature_Statistics', 'Feature_Statistics', 'Feature_Statistics',\