Beispiel #1
0
    def baseline(name):
        """Train a scikit-learn baseline regressor and evaluate it with LIME.

        Args:
            name: Key of the baseline to train: "lr" (LinearRegression),
                "dt" (DecisionTreeRegressor) or "rf" (RandomForestRegressor).

        Returns:
            A tuple ``(acc, causal_metric)``. ``acc`` is the mean squared
            error of the model on the test split (so lower is better, despite
            the name). ``causal_metric`` is element 0 of the second value
            returned by ``metrics_lime``.

        Raises:
            ValueError: If ``name`` is not one of the supported keys.
        """
        # Fail fast on an unknown key. Without this check ``model`` would be
        # left unbound and the function would crash later, after the dataset
        # had already been loaded, with an unrelated-looking error.
        supported = ("lr", "dt", "rf")
        if name not in supported:
            raise ValueError(
                "Unknown baseline {!r}; expected one of {}".format(
                    name, ", ".join(supported)))

        # NOTE: ``dataset`` and ``DATASET_PATH`` are not defined in this
        # function; they are taken from the enclosing scope.
        data = DataManager(DATASET_PATH + dataset + ".csv")

        # Wrapper for SKlearn models so that their interface matches metrics_maple and metrics_lime
        class Wrapper():
            def __init__(self, pred):
                self.pred = pred
                self.index = 0

            def predict(self, x):
                return self.pred(x)

            def set_index(self, i):
                self.index = i

            def predict_index(self, x):
                # Select the output column chosen via set_index().
                return np.squeeze(self.pred(x)[:, self.index])

        if name == "lr":
            model = LinearRegression()
        elif name == "dt":
            model = DecisionTreeRegressor()
        else:  # "rf" -- the only remaining supported key
            model = RandomForestRegressor()

        model.fit(data.X_train, data.y_train)

        predictions = model.predict(data.X_test)
        acc = mean_squared_error(data.y_test, predictions)

        if name == "lr":
            predict_fn = model.predict
        else:
            # For the tree-based models a trailing axis is added so that
            # Wrapper.predict_index can index columns.
            # NOTE(review): this presumably mirrors the shape the linear
            # model already returns for this data -- confirm.
            def predict_fn(x):
                return np.expand_dims(model.predict(x), 1)

        wrapper = Wrapper(predict_fn)

        _, lime_causal_metric, _ = metrics_lime(wrapper, data.X_train,
                                                data.X_test)

        return acc, lime_causal_metric[0]
Beispiel #2
0
    coefs = np.round(coefs, 2)
    print("Explanation: ")
    for i in range(coefs.shape[0]):
        print(i - 1, coefs[i])
    print("")


def mod(x, i, d):
    """Return a copy of ``x`` in which position ``i`` has been shifted by ``d``.

    The input ``x`` itself is not modified.
    """
    perturbed = np.array(x, copy=True)
    # In-place add keeps the dtype of the copied array.
    perturbed[i] += d
    return perturbed


# Load the data
np.random.seed(seed)  # Seed NumPy's RNG so the same data is obtained on every run
data = DataManager("../Datasets/housing.csv")

X_train = data.X_train
n_input = X_train.shape[1]

# Load the network shape
with open("../UCI-None/config.json", "r") as f:
    config_list = json.load(f)

config = config_list["housing"]

# shape = n_input, then config[1] repeated config[0] times, then a final 1
shape = [n_input] + [config[1]] * config[0] + [1]
Beispiel #3
0
                                     num_features=num_features,
                                     num_samples=num_samples)

    coef_pairs = exp.local_exp[1]
    for pair in coef_pairs:
        coefs[pair[0]] = pair[1]

    coefs = coefs / sd

    intercept = exp.intercept[1] - np.sum(coefs * u)

    return np.insert(coefs, 0, intercept)


# Load the data
dataset_csv = "../Datasets/" + name + ".csv"
data = DataManager(dataset_csv)

X_train = data.X_train
n_input = X_train.shape[1]

# Load the network shape: the JSON file holds one entry per dataset name

with open("../UCI-None/config.json", "r") as f:
    config_list = json.load(f)

config = config_list[name]

# Start with n_input, add config[1] a total of config[0] times, finish with 1
shape = [n_input]
shape.extend([config[1]] * config[0])
shape.append(1)
Beispiel #4
0
#%%
import warnings
import pandas as pd
from sklearn import preprocessing
from Data import DataManager
from Analyser import Analyser

# Let pandas print up to 100 rows and 51 columns before truncating the output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 51)
# NOTE(review): `output` and `size` are not used in the visible part of the
# script; presumably the output directory and a figure size -- confirm.
output = 'output/'
size = (150, 100)
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# %%
# 1. Read from Dataset
dataManager = DataManager()
dfFullData = dataManager.readData()
#%%
# 2. Analysing Data from Dataset
# Summary statistics (assumes readData() returns a pandas DataFrame).
dfFullData.describe()
#%%
# Column dtypes and non-null counts.
dfFullData.info()
#%%
# Based on the distplot 01_INITIAL_DistPlot.png, the boxplot 02_OUTLIER_BoxPlot.png, the data information above and the original dataset:
# 1. Some features in the dataset do not have a normal distribution, so their skew has to be corrected.
# 2. The true label, 'diagnosis', has binary values and the ratio of Yes to No is disproportionate, so stratification has to be done.
# 3. The range of numerical values in some features is wide, so they have to be scaled down.
# 4. The true label, 'diagnosis', has to be converted to numbers via label encoding since it has only 2 values.
# 5. There are some features with outliers, so the outliers have to be removed.
# 6. Features 'ID' and 'Unnamed' have to be dropped as they are not useful.
# 7. There are no empty cells.
# In addition, the following steps also have to be checked: