class MDLPDiscretizer(BaseDiscretizer):
    """Discretizer that uses MDLP (Minimum Description Length Principle)
    to find supervised cut points for continuous features.

    MDLP is a supervised method, so ``labels`` is mandatory.
    """

    def __init__(self, data, categorical_features, feature_names, labels=None,
                 random_state=None):
        """Validate that labels are present and delegate to BaseDiscretizer.

        Raises:
            ValueError: if ``labels`` is None.
        """
        if labels is None:
            raise ValueError('Labels must be not None when using MDLPDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Compute MDLP bin boundaries for every feature.

        Args:
            data: 2-D array of feature values, one column per feature.
            labels: 1-D array of class labels used to supervise MDLP.

        Returns:
            A list with one sorted numpy array of finite cut points per
            feature column.
        """
        self.transformer = MDLP()
        discretized = self.transformer.fit_transform(data, labels)
        bins = []
        # BUG FIX: iterate per *feature* column. The original looped over
        # range(len(set(labels))) — the number of classes — which is only
        # correct when n_classes happens to equal n_features. It also
        # shadowed the outer loop index `i` in the interval loop.
        for feature_idx in range(discretized.shape[1]):
            intervals = set(
                self.transformer.cat2intervals(discretized, feature_idx))
            boundaries = set()
            for low, high in intervals:
                boundaries.add(low)
                boundaries.add(high)
            # The outermost interval edges are +/-inf; only interior cut
            # points are meaningful bin boundaries.
            boundaries.discard(float('inf'))
            boundaries.discard(float('-inf'))
            bins.append(np.array(sorted(boundaries)))
        return bins
def bayesian_rule_list(X_train, y_train, X_test, y_test):
    """Train a Bayesian Rule List on one-hot-encoded, MDLP-discretized data.

    Returns:
        (train recreation %, test recreation %, rule-list complexity).
    """
    from mdlp.discretization import MDLP
    from sklearn import preprocessing

    # One-hot encode categorical features first.
    X_train, X_test = apply_one_hot_encoding(X_train, X_test)

    # The rule-list learner needs integer-coded classes.
    label_encoder = preprocessing.LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # Supervised discretization of the remaining continuous features.
    discretizer = MDLP()
    X_train = discretizer.fit_transform(X_train, y_train)
    X_test = discretizer.transform(X_test)

    brl = pysbrl.BayesianRuleList()
    brl.fit(X_train, y_train)
    print(brl)

    # Complexity = number of split points + extra conditions:
    # "if x1 > 0 and x2 = 1 then .." counts as 2, not 1, which is why
    # brl.n_rules is not used here.
    brl_str = str(brl)
    brl_complexity = brl_str.count("IF") + brl_str.count("AND")

    brl_training_recreating_pct = scorer(brl.predict(X_train), y_train) * 100
    brl_testing_recreating_pct = scorer(brl.predict(X_test), y_test) * 100
    return brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity
def all_entropies(df1, total):
    """Compute the class entropy of every bin MDLP induces on each feature.

    The last column of ``df1`` supplies the supervision labels for MDLP;
    the 'class' column is used when counting members of each bin.

    Args:
        df1: DataFrame whose non-final columns are features.
        total: Normalization constant forwarded to ``cal_entropy``.

    Returns:
        All per-bin entropies, sorted ascending.
    """

    def _record(counts):
        # cal_entropy expects two class counts; pad with 0 when a bin
        # contains only one class (or is empty). This replaces six
        # copy-pasted occurrences of the same pad-then-append pattern.
        if len(counts) < 2:
            counts.append(0)
        entropies.append(cal_entropy(counts, total))

    mdlp = MDLP()
    mdlp.fit_transform(df1[df1.columns[:-1]].values, df1[df1.columns[-1]].values)
    entropies = []
    for x, cuts in enumerate(mdlp.cut_points_):
        if len(cuts) > 1:
            for j in range(len(cuts)):
                if j == 0:
                    # Left-open bin: everything at or below the first cut.
                    _record(df1[df1[x] <= cuts[j]]['class']
                            .value_counts().values.tolist())
                if j == len(cuts) - 1:
                    # Right-open bin: everything above the last cut.
                    _record(df1[df1[x] > cuts[j]]['class']
                            .value_counts().values.tolist())
                if j != len(cuts) - 1:
                    # Interior bin between two consecutive cuts.
                    _record(df1[(df1[x] > cuts[j]) & (df1[x] <= cuts[j + 1])]
                            ['class'].value_counts().values.tolist())
        if len(cuts) == 1:
            # Single cut splits the feature into exactly two bins.
            _record(df1[df1[x] <= cuts[0]]['class'].value_counts().values.tolist())
            _record(df1[df1[x] > cuts[0]]['class'].value_counts().values.tolist())
        if len(cuts) == 0:
            # No cut found: the whole column is one bin.
            _record(df1['class'].value_counts().values.tolist())
    return sorted(entropies)
def discMdlp(_df):
    """Discretize every non-'Class' column of ``_df`` with MDLP.

    The 'Class' column supervises the discretization and is carried over
    unchanged into the returned DataFrame.
    """
    feature_cols = [col for col in _df if col != 'Class']
    transformer = MDLP()
    discretized = transformer.fit_transform(_df[feature_cols], _df['Class'])
    result = pd.DataFrame(data=discretized, columns=feature_cols)
    result.loc[:, 'Class'] = pd.Series(_df['Class'])
    return result
def classifier(args):
    """Load the configured dataset, optionally discretize and plot it,
    train a multinomial naive Bayes model, and evaluate it.

    Args:
        args: parsed CLI arguments providing data_type, plot,
            discretization_mode, and discretization_bins.
    """
    info = datasets_info[args.data_type]
    df = pd.read_csv(info['path'])
    for drop_col in info['drop_columns']:
        df = df.drop(columns=df.columns[drop_col])

    class_col = df.columns[info['class_column']]
    y = df[class_col]
    X = df.drop(columns=class_col)

    if args.plot:
        sns.pairplot(df, hue=class_col)
        plt.show()

    # Discretize values before training.
    if args.discretization_bins > 0:
        if args.discretization_mode == DISC_MDLP:
            # Supervised discretization over all features at once.
            X = MDLP().fit_transform(X, y)
        else:
            # Unsupervised, column by column.
            for column in X:
                X[column] = discretization(args.discretization_mode,
                                           X[column],
                                           args.discretization_bins)

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

    if args.plot:
        # Square-ish histogram per feature.
        plt.figure(figsize=(10, 3))
        for feature in range(X_train.shape[1]):
            plt.subplot(1, len(X_train.columns), feature + 1)
            sns.distplot(X_train.values[:, feature])
        plt.show()

    # Fit multinomial naive Bayes (GaussianNB was also considered).
    model = MultinomialNB(alpha=1.0)
    model.fit(X_train, y_train)
    cross_validation(model, X, y)

    # Predict on the held-out test set and report.
    y_pred = model.predict(X_test)
    print(y_pred)
    evaluation(y_test, y_pred, args)
class SupervisedDiscretizationStrategy(object):
    """Supervised discretization of continuous features using MDLP."""

    def __init__(self):
        # One MDLP transformer, fit on train and reused for validation.
        self.transformer = MDLP()

    def discretize(self, data_set, validation_size, nb_bins=None):
        """Discretize continuous attributes using the MDLP method.

        Args:
            data_set: The data set containing continuous data.
            validation_size: The validation size of the newly created
                discretized data set.
            nb_bins: Unused; MDLP chooses its own number of bins. Kept
                for interface compatibility.

        Returns:
            discretized_dataset: A DataSet object containing discretized
            data.
        """
        # Strategy object used to build the resulting data set.
        galaxy_dataset_feature_strategy = GalaxyDataSetFeatureStrategy()

        # Fit MDLP on the training split, supervised by its labels.
        train_features = data_set.train.get_features
        train_labels = data_set.train.get_labels
        train_discretized = self.transformer.fit_transform(X=train_features,
                                                           y=train_labels)

        # Apply the fitted cut points to the validation split.
        valid_features = data_set.valid.get_features
        valid_labels = data_set.valid.get_labels
        valid_discretized = self.transformer.transform(X=valid_features)

        # Re-assemble a single data set from both splits.
        merged_features = np.append(train_discretized, valid_discretized, axis=0)
        merged_labels = np.append(train_labels, valid_labels, axis=0)
        discretized_dataset = galaxy_dataset_feature_strategy.create_datasets(
            merged_features, merged_labels, validation_size)
        return discretized_dataset
def grow(self, data, t_id, level, cur_performance):
    """Recursively grow tree `t_id` one level at a time, picking at each
    level the best cue/threshold split according to eval_point_split /
    eval_range_split, then recursing on the still-undecided rows.

    :param data: current data for future tree growth
    :param t_id: tree id
    :param level: level id
    :param cur_performance: metrics of the tree built so far
    :return: None
    """
    # Stop when the configured depth budget is exhausted.
    if level >= self.max_depth:
        return
    # No rows left to split — should not normally happen.
    if len(data) == 0:
        print "?????????????????????? Early Ends ???????????????????????"
        return
    self.tree_depths[t_id] = level
    # `decision` is the predetermined class this level predicts;
    # `structure` is the decision prefix used as a memoization key so
    # identical prefixes across trees are evaluated only once.
    decision = self.structures[t_id][level]
    structure = tuple(self.structures[t_id][:level + 1])
    cur_selected = self.computed_cache.get(structure, None)
    Y = data.as_matrix(columns=[self.target])
    if not cur_selected:
        # Evaluate every candidate cue (feature) not ignored / not the target.
        for cue in list(data):
            if cue in self.ignore or cue == self.target:
                continue
            if self.split_method == "MDLP":
                # Supervised discretization of this single cue column.
                mdlp = MDLP()
                X = data.as_matrix(columns=[cue])
                X_disc = mdlp.fit_transform(X, Y)
                X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                bins = np.unique(X_disc, axis=0)
                if len(
                        bins
                ) <= 1:  # MDLP return the whole range as one bin, use median instead.
                    threshold = data[cue].median()
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)
                    continue
                # print ", ".join([cue, str(bins)+" bins"])
                for bin in bins:
                    # Rows assigned to this bin, and their (lo, hi) interval.
                    indexes = np.where(X_disc == bin)[0]
                    interval = X_interval[indexes]
                    try:
                        # Sanity check: all rows of one bin should share
                        # one interval.
                        if len(np.unique(interval, axis=0)) != 1:
                            print "???????????????????????????????????????????????????"
                    except:
                        # NOTE(review): bare except silently swallows any
                        # failure of the uniqueness check.
                        print 'ha'
                    interval = interval[0]
                    if interval[0] == float('-inf'):
                        # Left-open bin -> point split at the upper edge.
                        threshold = interval[1]
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data,
                                cue, direction, threshold, decision)
                    elif interval[1] == float('inf'):
                        # Right-open bin -> point split at the lower edge.
                        threshold = interval[0]
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data,
                                cue, direction, threshold, decision)
                    else:
                        # Interior bin -> evaluate as a range split.
                        cur_selected = self.eval_range_split(
                            level, cur_selected, cur_performance, data, cue,
                            indexes, interval, decision)
                continue
            elif self.split_method == "percentile":
                # Candidate thresholds at every 5th percentile.
                thresholds = set(data[cue].quantile(
                    [x / 20.0 for x in range(1, 20)],
                    interpolation='midpoint'))
            else:
                thresholds = [data[cue].median()]
            # point split, e.g. median or x% percentiles.
            for threshold in thresholds:
                for direction in "><":
                    cur_selected = self.eval_point_split(
                        level, cur_selected, cur_performance, data, cue,
                        direction, threshold, decision)
        self.computed_cache[structure] = cur_selected
    # Record the winning rule and its training performance, then recurse
    # on the rows this level leaves undecided.
    self.selected[t_id][level] = cur_selected['rule']
    self.performance_on_train[t_id][level] = cur_selected[
        'metrics'] + get_performance(cur_selected['metrics'])
    self.grow(cur_selected['undecided'], t_id, level + 1,
              cur_selected['metrics'])
#coding=utf-8 import numpy as np from mdlp.discretization import MDLP from sklearn.datasets import load_iris column = np.array([1,2]) transformer = MDLP(column) iris = load_iris() X, y = iris.data, iris.target print y print type(X), type(y) X_disc = transformer.fit_transform(X,y) conv_X = transformer.fit_transform(X, y) print conv_X di = transformer.cut_points_ print transformer.cut_points_ for each in di: print len(di[each])
# NOTE(review): the first two lines complete a train_test_split(...) call
# whose opening line is outside this chunk — it appears to carve 40% of the
# galaxy data off for test+validation; confirm against the preceding lines.
    X_data_galaxy, Y_data_galaxy, test_size=0.4, random_state=0, shuffle=True,
    stratify=Y_data_galaxy)
# Split that held-out 40% evenly into test and validation sets (stratified).
X_test_galaxy, X_valid_galaxy, Y_test_galaxy, Y_valid_galaxy = train_test_split(
    X_valid_galaxy, Y_valid_galaxy, test_size=0.5, random_state=0, shuffle=True,
    stratify=Y_valid_galaxy)

# In[30]:

from sklearn.metrics import accuracy_score

# =============================================================================
# from sklearn.preprocessing import StandardScaler
# Xtrain_galaxy_s=X_train_galaxy
# Xtest_galaxy_s=X_test_galaxy
# Xvalid_galaxy_s=X_valid_galaxy
# =============================================================================

# Sanity check: run MDLP on the iris data set before using it on galaxy data.
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target
mdlp = MDLP()
conv_X = mdlp.fit_transform(X, y)
# NOTE(review): this first line completes a plot_confusion_matrix(...) call
# begun before this chunk.
    title='Confusion matrix, without normalization with k:3')
plt.figure()
plot_confusion_matrix(cm_galaxy_test_k3u, classes=['Smooth', 'Spiral'],
                      normalize=True,
                      title='Confusion matrix, with normalization with k:3')
plt.show()

# In[33]:
# In[33]:

print(" Bayes Naif models with hold-out set ")

# Scale data for train, validation (hold out), and test
# first method of discretization using MDLP
from mdlp.discretization import MDLP

mdlp = MDLP()
Xtrain_galaxy_MDLP = mdlp.fit_transform(X_train_galaxy, Y_train_galaxy)
# NOTE(review): transform() is usually called with features only; passing the
# label arrays here relies on the mdlp package's transform accepting (and
# ignoring) a second argument — verify against its signature.
Xtest_galaxy_MDLP = mdlp.transform(X_test_galaxy, Y_test_galaxy)
Xvalid_galaxy_MDLP = mdlp.transform(X_valid_galaxy, Y_valid_galaxy)

# In[33]:

# Second method of discretization using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
Xtrain_galaxy_unsupervised = scaler.fit_transform(X_train_galaxy)
Xtest_galaxy_unsupervised = scaler.transform(X_test_galaxy)
Xvalid_galaxy_unsupervised = scaler.transform(X_valid_galaxy)

# In[33]:

# Gaussian naive Bayes with 2 different parameters i.e.
# 1. priors = probability of each class
from mdlp.discretization import MDLP

train_raw = pd.read_csv("input/train.csv")
test_raw = pd.read_csv("input/test.csv")

# Drop rows with missing Age; only the Age feature itself is used to
# estimate the bins.
train_sur_age = train_raw[['Survived', 'Age']].dropna(axis=0)
survived = train_sur_age['Survived'].values
age = (train_sur_age['Age'].values).reshape(-1, 1)

n_bins = []
age_lim = []
n = 1000
# MDLP is randomized: repeat with different seeds to see how stable the
# learned Age bins are.
for seed in range(n):
    discretizer = MDLP(random_state=seed, continuous_features=None)
    age_disc = discretizer.fit_transform(age, survived)
    intervals = discretizer.cat2intervals(age_disc, 0)
    bin_count = len(set(intervals))
    n_bins.append(bin_count)
    if bin_count == 2:
        # Record the first sample's interval; its lower edge is the threshold.
        age_lim.append(intervals[0])
    elif bin_count > 2:
        print('\t ! more than two bins, n=', bin_count)

print('* estimated N bins:', set(n_bins))
print('\t mean', np.mean(1. * np.array(n_bins)))
print('* Age thresholds, frequencies')
lim_val = np.array(age_lim)[:, 0]
sum_not_inf = 0
for threshold in set(lim_val):
    print('\t', threshold, (1. * sum(lim_val == threshold)) / n)
    sum_not_inf = sum_not_inf + sum(lim_val == threshold)
# NOTE(review): `rownum += 1` is the tail of a loop that begins before this
# chunk (Python 2 code).
rownum += 1
print "continuous_features:", continuous_features
print "<----------------------------------------------------->"

# Think about how to improve !!!!!!!!!!
# Build X (continuous columns only) and y from the string-valued instances.
X = []
y = []
for each_ele in all_insts_str_list:
    # Last element of each instance string is the class value.
    y.append(float(each_ele[-1]))
    temp = []
    for index_cf, each_column in enumerate(continuous_features):
        temp.append(float(each_ele[each_column]))
    X.append(temp)

# Discretize the continuous features (`discretizationer` is built earlier).
X_disc = discretizationer.fit_transform(X, y)
X_disc = X_disc.tolist()
for index_X, each_inst in enumerate(X_disc):
    # Category codes come back as floats; coerce each row to ints
    # (Python 2 map returns a list).
    X_disc[index_X] = map(int, each_inst)
# print X_disc

# Number of discrete values per continuous feature = cut points + 1.
cut_dict = discretizationer.cut_points_
X_attr_count = []
for each in cut_dict:
    X_attr_count.append(len(cut_dict[each]) + 1)
print X_attr_count

# NOTE(review): the final statement is truncated mid-call (`list(`) — the
# rest of it lies beyond this chunk.
for index_val, each in enumerate(each_attrVal_array):
    for index_val_disc, each_val_disv in enumerate(X_attr_count):
        for each_col in continuous_features:
            if index_val_disc == each_col:
                each_attrVal_array[each_col] = list(
# Fill missing values (age_mean / embark_mode computed earlier; unknown
# cabins become "U").
df['Age'] = df['Age'].fillna(age_mean)
df['Embarked'] = df['Embarked'].fillna(embark_mode)
df['Cabin'] = df['Cabin'].fillna("U")

# Derive title / embarkation / deck features from the raw text columns.
df['Title'] = df['Name'].map(lambda name: substring_exist(name, TITLE_LIST))
df['Title'] = df.apply(replace_titles, axis=1)
df['Embarked'] = df.apply(replace_embark, axis=1)
df['Deck'] = df['Cabin'].map(lambda cabin: substring_exist(cabin, CABIN_LIST))
df['Deck'] = df.apply(replace_deck, axis=1)

# Family-derived features.
df['Family_Size'] = df['SibSp'] + df['Parch']
df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

# Supervised (MDLP) discretization of Age, using survival as the label.
discretizer = MDLP()
age_column = df["Age"].to_numpy().reshape(-1, 1)
survived_column = df["Survived"].to_numpy().reshape(-1, 1)
df["Age_disc"] = discretizer.fit_transform(age_column, survived_column)

df['Sex'] = df.apply(replace_sex, axis=1)
# Invert Pclass so that higher values mean higher class.
df["Pclass"] = df["Pclass"].map(lambda pclass: 1 / pclass)

df.to_csv("./data/train_neat.csv")
print(df.columns)