def test_BayesianRuleList2():
    """Smoke-test BayesianRuleList end-to-end on the breast-cancer dataset.

    Fits an MDLP discretizer on the training split, trains a rule list on
    the discretized features, prints the learned rules, and reports test
    accuracy on the held-out split.
    """
    dataset = load_breast_cancer()
    features, labels = dataset['data'], dataset['target']
    feature_names = dataset['feature_names']

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)

    # Supervised discretization: fit on train only, then reuse everywhere.
    discretizer = MDLP(random_state=42).fit(x_train, y_train)
    train_categorical = discretizer.transform(x_train)
    category_names = compute_intervals(discretizer)

    rule_list = BayesianRuleList(seed=1,
                                 feature_names=feature_names,
                                 category_names=category_names,
                                 verbose=2)
    rule_list.fit(train_categorical, y_train)
    print(rule_list)

    test_categorical = discretizer.transform(x_test)
    print('acc: %.4f' % rule_list.score(test_categorical, y_test))
def bayesian_rule_list(X_train, y_train, X_test, y_test):
    """Train a Bayesian Rule List and score how well it recreates labels.

    Pipeline: one-hot encode -> integer-encode labels -> MDLP-discretize
    features -> fit pysbrl.BayesianRuleList.

    Returns:
        (train_recreation_pct, test_recreation_pct, complexity) where
        complexity counts rule conditions parsed from the printed model.
    """
    from mdlp.discretization import MDLP
    from sklearn import preprocessing

    # One-hot encode the categorical columns first.
    X_train, X_test = apply_one_hot_encoding(X_train, X_test)

    # Classes must be integers for the rule-list learner.
    label_encoder = preprocessing.LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # Supervised discretization of the (encoded) features.
    discretizer = MDLP()
    X_train = discretizer.fit_transform(X_train, y_train)
    X_test = discretizer.transform(X_test)

    brl = pysbrl.BayesianRuleList()
    brl.fit(X_train, y_train)
    print(brl)

    # Complexity is the number of split points plus the number of extra
    # conditions (i.e. "if x1 > 0 and x2 = 1 then .." counts as 2, not 1),
    # so we count "IF"/"AND" tokens in the printed model instead of using
    # brl.n_rules.
    rule_text = str(brl)
    brl_complexity = rule_text.count("IF") + rule_text.count("AND")

    train_predictions = brl.predict(X_train)
    brl_training_recreating_pct = scorer(train_predictions, y_train) * 100
    test_predictions = brl.predict(X_test)
    brl_testing_recreating_pct = scorer(test_predictions, y_test) * 100

    return brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity
class SupervisedDiscretizationStrategy(object):
    """A class used for supervised data discretization."""

    def __init__(self):
        # Shared MDLP transformer: fit on training data, reused for validation.
        self.transformer = MDLP()

    def discretize(self, data_set, validation_size, nb_bins=None):
        """Discretize continuous attributes using the MDLP method.

        Args:
            data_set: The data set containing continuous data.
            validation_size: The validation size of the newly created
                discretized data set.
            nb_bins: Unused; kept for interface compatibility.

        Returns:
            discretized_dataset: A DataSet object containing discretized data.
        """
        # Strategy object used to assemble the final discretized data set.
        feature_strategy = GalaxyDataSetFeatureStrategy()

        # Supervised discretization (MDLP) fitted on the training portion.
        train_features = data_set.train.get_features
        train_labels = data_set.train.get_labels
        train_discretized = self.transformer.fit_transform(X=train_features,
                                                           y=train_labels)

        # Validation portion is transformed with the already-fitted cut points.
        valid_features = data_set.valid.get_features
        valid_labels = data_set.valid.get_labels
        valid_discretized = self.transformer.transform(X=valid_features)

        # Recombine both portions and rebuild a DataSet with the requested split.
        all_features = np.append(train_discretized, valid_discretized, axis=0)
        all_labels = np.append(train_labels, valid_labels, axis=0)

        discretized_dataset = feature_strategy.create_datasets(all_features,
                                                               all_labels,
                                                               validation_size)
        return discretized_dataset
plt.figure()
plot_confusion_matrix(cm_galaxy_test_k3u, classes=['Smooth', 'Spiral'],
                      normalize=True,
                      title='Confusion matrix, with normalization with k:3')
plt.show()

# In[33]:

print(" Bayes Naif models with hold-out set ")

# Scale data for train, validation (hold out), and test.
# First method of discretization: supervised, using MDLP.
from mdlp.discretization import MDLP

mdlp = MDLP()
Xtrain_galaxy_MDLP = mdlp.fit_transform(X_train_galaxy, Y_train_galaxy)
# BUG FIX: transform() takes only X (sklearn transformer contract); the
# original passed the test/validation labels, which is an API misuse and
# would leak label information into preprocessing if honored.
Xtest_galaxy_MDLP = mdlp.transform(X_test_galaxy)
Xvalid_galaxy_MDLP = mdlp.transform(X_valid_galaxy)

# In[33]:

# Second method of discretization: unsupervised, using MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
Xtrain_galaxy_unsupervised = scaler.fit_transform(X_train_galaxy)
Xtest_galaxy_unsupervised = scaler.transform(X_test_galaxy)
Xvalid_galaxy_unsupervised = scaler.transform(X_valid_galaxy)

# In[33]:

# Bayes naïf gaussien with 2 different parameters i.e.
# 1. priors = probability of each class

# In[33]: