Example #1
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from mdlp.discretization import MDLP
# BayesianRuleList and compute_intervals ship with the pysbrl package;
# the exact import paths below are assumed.
from pysbrl import BayesianRuleList
from pysbrl.utils import compute_intervals


def test_BayesianRuleList2():
    dataset = load_breast_cancer()
    x, y = dataset['data'], dataset['target']
    feature_names = dataset['feature_names']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    discretizer = MDLP(random_state=42).fit(x_train, y_train)
    x_train_cat = discretizer.transform(x_train)
    category_names = compute_intervals(discretizer)
    rule_list = BayesianRuleList(seed=1, feature_names=feature_names,
                                 category_names=category_names, verbose=2)
    rule_list.fit(x_train_cat, y_train)
    print(rule_list)
    x_test_cat = discretizer.transform(x_test)

    print('acc: %.4f' % rule_list.score(x_test_cat, y_test))
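
For readers unfamiliar with the discretization step above, here is a minimal, self-contained sketch of how MDLP turns continuous features into integer bin indices before they are handed to the rule list. It uses only the MDLP calls already shown; the choice of the iris dataset is arbitrary.

import numpy as np
from sklearn.datasets import load_iris
from mdlp.discretization import MDLP

# Load a small labelled dataset with continuous features.
X, y = load_iris(return_X_y=True)

# Fit MDLP on the labelled data; each continuous column is cut into intervals
# chosen to be informative about y (minimum description length criterion).
discretizer = MDLP(random_state=42)
X_cat = discretizer.fit_transform(X, y)

# The result is an integer matrix: each value is the index of the interval
# the original continuous value fell into.
print(X_cat[:3])
print("bins per feature:", [len(np.unique(col)) for col in X_cat.T])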
Example #2
def bayesian_rule_list(X_train, y_train, X_test, y_test):
    from mdlp.discretization import MDLP
    from sklearn import preprocessing
    import pysbrl

    # One-hot encode categorical features
    # (apply_one_hot_encoding is a helper defined elsewhere in this module).
    X_train, X_test = apply_one_hot_encoding(X_train, X_test)

    # Then convert the class labels to integers
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Then discretize features
    transformer = MDLP()
    X_train = transformer.fit_transform(X_train, y_train)
    X_test = transformer.transform(X_test)

    brl = pysbrl.BayesianRuleList()
    brl.fit(X_train, y_train)

    print(brl)

    # The complexity is the number of split points plus the number of extra conditions
    # (e.g. "IF x1 > 0 AND x2 = 1 THEN ..." counts as 2, not 1), so we do not use brl.n_rules.
    brl_str = str(brl)
    brl_complexity = brl_str.count("IF") + brl_str.count("AND")

    # "scorer" is an accuracy-style helper defined elsewhere in this module.
    training_recreations = brl.predict(X_train)
    brl_training_recreating_pct = scorer(training_recreations, y_train) * 100
    testing_recreations = brl.predict(X_test)
    brl_testing_recreating_pct = scorer(testing_recreations, y_test) * 100

    return brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity
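
To make the complexity count above concrete, here is a small sketch on a hand-written rule-list string; the string format is illustrative only, the real text comes from str(brl).

# Illustrative only: a hand-written rule list in the IF/AND/THEN style
# referred to in the comment above.
example_rule_list = (
    "IF x1 > 0 AND x2 = 1 THEN positive\n"
    "ELSE IF x3 <= 2 THEN negative\n"
    "ELSE default positive"
)

# One unit per rule ("IF") plus one per extra condition ("AND"):
complexity = example_rule_list.count("IF") + example_rule_list.count("AND")
print(complexity)  # 3: two rules plus one extra AND condition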
class SupervisedDiscretizationStrategy(object):
    """
        A class used for supervised data discretization.
    """
    def __init__(self):
        self.transformer = MDLP()

    def discretize(self, data_set, validation_size, nb_bins=None):
        """ Discretize continuous attribute using MDLP method.

        Args:
            data_set: The data set containing continuous data.
            validation_size: The validation size of the newly created discretized data set.

        Returns:
            discretized_dataset: A DataSet object containing discretized data.
        """

        # Create strategy object to further create the discretized data set.
        galaxy_dataset_feature_strategy = GalaxyDataSetFeatureStrategy()

        # Get data from training set.
        X_train = data_set.train.get_features
        y_train = data_set.train.get_labels

        # Supervised discretization of the training data set using MDLP.
        X_train_discretized = self.transformer.fit_transform(X=X_train,
                                                             y=y_train)

        # Get data from validation set.
        X_valid = data_set.valid.get_features
        y_valid = data_set.valid.get_labels

        # Apply the MDLP cut points learned on the training data to the validation data.
        X_valid_discretized = self.transformer.transform(X=X_valid)

        # Merge both training and validation data.
        X = np.append(X_train_discretized, X_valid_discretized, axis=0)
        y = np.append(y_train, y_valid, axis=0)

        # Create a new data set.
        discretized_dataset = galaxy_dataset_feature_strategy.create_datasets(
            X, y, validation_size)

        return discretized_dataset
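
The DataSet and GalaxyDataSetFeatureStrategy types above are project-specific, so a full usage example is not reproduced here; the sketch below only illustrates, with plain NumPy arrays, the fit-on-train / transform-on-validation pattern that discretize() relies on (the dataset and variable names are made up for the illustration).

import numpy as np
from mdlp.discretization import MDLP
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

transformer = MDLP()
# Cut points are learned from the training labels only ...
X_tr_disc = transformer.fit_transform(X_tr, y_tr)
# ... and re-used unchanged on the validation data, as in discretize().
X_va_disc = transformer.transform(X_va)

# The two discretized blocks can then be merged, as the class does with np.append.
X_all = np.append(X_tr_disc, X_va_disc, axis=0)
y_all = np.append(y_tr, y_va, axis=0)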
plt.figure()
plot_confusion_matrix(cm_galaxy_test_k3u,
                      classes=['Smooth', 'Spiral'],
                      normalize=True,
                      title='Normalized confusion matrix (k = 3)')
plt.show()
# In[33]:
print("       Bayes Naif models with hold-out set ")

# Preprocess the train, validation (hold-out), and test sets
# First method: supervised discretization using MDLP
from mdlp.discretization import MDLP
mdlp = MDLP()
Xtrain_galaxy_MDLP = mdlp.fit_transform(X_train_galaxy, Y_train_galaxy)
Xtest_galaxy_MDLP = mdlp.transform(X_test_galaxy)
Xvalid_galaxy_MDLP = mdlp.transform(X_valid_galaxy)

# In[33]:
# Second method: unsupervised preprocessing with MinMaxScaler (scaling, not discretization)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Xtrain_galaxy_unsupervised = scaler.fit_transform(X_train_galaxy)
Xtest_galaxy_unsupervised = scaler.transform(X_test_galaxy)
Xvalid_galaxy_unsupervised = scaler.transform(X_valid_galaxy)
# In[33]:

# Gaussian Naive Bayes with 2 different parameter settings, i.e.
# 1. priors = probability of each class

# In[33]:
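
The comment above only names the parameter setting; the sketch below shows one way the priors could be estimated from the training class frequencies and passed to scikit-learn's GaussianNB, using the MDLP-discretized features defined earlier. The estimator choice follows the "Gaussian Naive Bayes" comment; it is an assumption, not code from the original notebook.

import numpy as np
from sklearn.naive_bayes import GaussianNB

# 1. priors = probability of each class, estimated from the training labels
#    (np.unique sorts the labels, matching the class order GaussianNB uses).
_, class_counts = np.unique(Y_train_galaxy, return_counts=True)
class_priors = class_counts / class_counts.sum()

gnb = GaussianNB(priors=class_priors)
gnb.fit(Xtrain_galaxy_MDLP, Y_train_galaxy)

# Evaluate on the hold-out (validation) set.
print("hold-out accuracy: %.4f" % gnb.score(Xvalid_galaxy_MDLP, Y_valid_galaxy))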