Example #1
class MDLPDiscretizer(BaseDiscretizer):
    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        if labels is None:
            raise ValueError('Labels must not be None when using '
                             'MDLPDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        self.transformer = MDLP()
        discretize_data = self.transformer.fit_transform(data, labels)
        bins = []
        for i in range(len(set(labels))):
            # Collect the interval bounds for feature i and keep only the
            # finite cut points.
            intervals = set(self.transformer.cat2intervals(discretize_data, i))
            feature_interval = set()
            for lower, upper in intervals:
                feature_interval.add(lower)
                feature_interval.add(upper)
            feature_interval.discard(float('inf'))
            feature_interval.discard(float('-inf'))
            bins.append(np.array(sorted(feature_interval)))
        return bins
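For context, cat2intervals maps the discretized codes back to (lower, upper) interval bounds whose outermost limits are -inf and +inf, which is why the bins method above discards the infinities before collecting cut points. A minimal sketch on the iris data, assuming the mdlp package and scikit-learn are installed:

from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
transformer = MDLP()
X_disc = transformer.fit_transform(X, y)

# Gather the interval bounds of feature 0 and keep only the finite cut points.
bounds = set()
for lower, upper in transformer.cat2intervals(X_disc, 0):
    bounds.update((lower, upper))
bounds.discard(float('-inf'))
bounds.discard(float('inf'))
print(sorted(bounds))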
Example #2
def bayesian_rule_list(X_train, y_train, X_test, y_test):
    import pysbrl
    from mdlp.discretization import MDLP
    from sklearn import preprocessing

    # First, one-hot encode the categorical features
    X_train, X_test = apply_one_hot_encoding(X_train, X_test)

    # Then need to convert classes to integers
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Then discretize features
    transformer = MDLP()
    X_train = transformer.fit_transform(X_train, y_train)
    X_test = transformer.transform(X_test)

    brl = pysbrl.BayesianRuleList()
    brl.fit(X_train, y_train)

    print(brl)

    # The complexity is the number of conditions across all rules
    # (e.g. "IF x1 > 0 AND x2 = 1 THEN ..." counts as 2, not 1), which is why we do not use brl.n_rules
    brl_str = str(brl)
    brl_complexity = brl_str.count("IF") + brl_str.count("AND")

    training_recreations = brl.predict(X_train)
    brl_training_recreating_pct = scorer(training_recreations, y_train) * 100
    testing_recreations = brl.predict(X_test)
    brl_testing_recreating_pct = scorer(testing_recreations, y_test) * 100

    return brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity
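The brl_complexity value above is a plain substring count over the printed rule list; a toy illustration with a hypothetical rule string (not actual pysbrl output) makes the convention concrete:

# Hypothetical printed rule list, for illustration only.
brl_str = (
    "IF x1 > 0 AND x2 = 1 THEN prob = 0.9\n"
    "ELSE IF x3 = 0 THEN prob = 0.4\n"
    "ELSE prob = 0.1"
)
# "IF" appears twice and "AND" once, so this rule list counts as 3 conditions.
complexity = brl_str.count("IF") + brl_str.count("AND")
print(complexity)  # 3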
Example #3
def all_entropies(df1, total):
    mdlp = MDLP()
    x_test = mdlp.fit_transform(df1[df1.columns[:-1]].values,
                                df1[df1.columns[-1]].values)
    entropies = []
    for x, y in mdlp.cut_points_.items():
        if len(y) > 1:
            for j, k in enumerate(y):
                if j == 0:
                    temp = df1[df1[x] <= y[j]]['class'].value_counts(
                    ).values.tolist()
                    if len(temp) > 1:
                        entropies.append(cal_entropy(temp, total))
                    else:
                        temp.append(0)
                        entropies.append(cal_entropy(temp, total))
                if j == len(y) - 1:
                    temp = df1[
                        df1[x] > y[j]]['class'].value_counts().values.tolist()
                    if len(temp) > 1:
                        entropies.append(cal_entropy(temp, total))
                    else:
                        temp.append(0)
                        entropies.append(cal_entropy(temp, total))

                if j != len(y) - 1:
                    temp = df1[(df1[x] > y[j])
                               & (df1[x] <= y[j + 1])]['class'].value_counts(
                               ).values.tolist()
                    if len(temp) > 1:
                        entropies.append(cal_entropy(temp, total))
                    else:
                        temp.append(0)
                        entropies.append(cal_entropy(temp, total))

        if len(y) == 1:
            temp = df1[df1[x] <= y[0]]['class'].value_counts().values.tolist()
            if len(temp) > 1:
                entropies.append(cal_entropy(temp, total))
            else:
                temp.append(0)
                entropies.append(cal_entropy(temp, total))
            temp = df1[df1[x] > y[0]]['class'].value_counts().values.tolist()
            if len(temp) > 1:
                entropies.append(cal_entropy(temp, total))
            else:
                temp.append(0)
                entropies.append(cal_entropy(temp, total))

        if len(y) == 0:
            temp = df1['class'].value_counts().values.tolist()
            if len(temp) > 1:
                entropies.append(cal_entropy(temp, total))
            else:
                temp.append(0)
                entropies.append(cal_entropy(temp, total))

    return sorted(entropies)
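cal_entropy is not defined in this snippet; from the call sites it takes a bin's class counts and the total number of rows. A hypothetical helper consistent with that usage (an assumption, not the original implementation) could look like this:

import math

def cal_entropy(counts, total):
    # Hypothetical helper: Shannon entropy of one bin's class counts,
    # weighted by the bin's share of all `total` rows (assumed behaviour).
    n = sum(counts)
    if n == 0:
        return 0.0
    entropy = -sum((c / n) * math.log2(c / n) for c in counts if c > 0)
    return (n / total) * entropy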
Example #4
def discMdlp(_df):
    featureVals = [x for x in _df if x != 'Class']
    transformer = MDLP()
    discretizedMap = {'Class': _df['Class']}

    discret = transformer.fit_transform(_df[featureVals], _df['Class'])
    nFrame = pd.DataFrame(data=discret, columns=featureVals)
    nFrame.loc[:, 'Class'] = pd.Series(_df['Class'])

    return nFrame
def classifier(args):
    dataset_info = datasets_info[args.data_type]

    df = pd.read_csv(dataset_info['path'])
    for drop_col in dataset_info['drop_columns']:
        df = df.drop(columns=df.columns[drop_col])
    y = df[df.columns[dataset_info['class_column']]]
    X = df.drop(columns=df.columns[dataset_info['class_column']])

    if args.plot:
        sns.pairplot(df, hue=df.columns[dataset_info['class_column']])
        plt.show()

    # Discretize values before training
    if args.discretization_bins > 0:
        if args.discretization_mode == DISC_MDLP:
            transformer = MDLP()
            X = transformer.fit_transform(X, y)
        else:
            for column in X:
                bins = discretization(args.discretization_mode, X[column],
                                      args.discretization_bins)
                X[column] = bins

    # Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)

    # Create a new figure and set the figsize argument so we get square-ish plots of the features.
    if args.plot:
        plt.figure(figsize=(10, 3))

    # Iterate over the features, creating a subplot with a histogram for each one.
    if args.plot:
        for feature in range(X_train.shape[1]):
            plt.subplot(1, len(X_train.columns), feature + 1)
            sns.histplot(X_train.values[:, feature], kde=True)
        plt.show()

    # Fitting Naive Bayes Classification to the Training set
    # classifier = GaussianNB()
    classifier = MultinomialNB(alpha=1.0)
    classifier.fit(X_train, y_train)

    cross_validation(classifier, X, y)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)

    evaluation(y_test, y_pred, args)
class SupervisedDiscretizationStrategy(object):
    """
        A class used for supervised data discretization.
    """
    def __init__(self):
        self.transformer = MDLP()

    def discretize(self, data_set, validation_size, nb_bins=None):
        """ Discretize continuous attribute using MDLP method.

        Args:
            data_set: The data set containing continuous data.
            validation_size: The validation size of the newly created discretized data set.
            nb_bins: Unused by the MDLP strategy; MDLP chooses the number of bins itself.

        Returns:
            discretized_dataset: A DataSet object containing discretized data.
        """

        # Create strategy object to further create the discretized data set.
        galaxy_dataset_feature_strategy = GalaxyDataSetFeatureStrategy()

        # Get data from training set.
        X_train = data_set.train.get_features
        y_train = data_set.train.get_labels

        # Supervised discretization of the training data set using MDLP.
        X_train_discretized = self.transformer.fit_transform(X=X_train,
                                                             y=y_train)

        # Get data from validation set.
        X_valid = data_set.valid.get_features
        y_valid = data_set.valid.get_labels

        # Discretize the validation set with the cut points learned on the
        # training set.
        X_valid_discretized = self.transformer.transform(X=X_valid)

        # Merge both training and validation data.
        X = np.append(X_train_discretized, X_valid_discretized, axis=0)
        y = np.append(y_train, y_valid, axis=0)

        # Create a new data set.
        discretized_dataset = galaxy_dataset_feature_strategy.create_datasets(
            X, y, validation_size)

        return discretized_dataset
Example #7
    def grow(self, data, t_id, level, cur_performance):
        """
        :param data: current data for future tree growth
        :param t_id: tree id
        :param level: level id
        :return: None
        """
        if level >= self.max_depth:
            return
        if len(data) == 0:
            print "?????????????????????? Early Ends ???????????????????????"
            return
        self.tree_depths[t_id] = level
        decision = self.structures[t_id][level]
        structure = tuple(self.structures[t_id][:level + 1])
        cur_selected = self.computed_cache.get(structure, None)
        Y = data[[self.target]].values
        if not cur_selected:
            for cue in list(data):
                if cue in self.ignore or cue == self.target:
                    continue
                if self.split_method == "MDLP":
                    mdlp = MDLP()
                    X = data[[cue]].values
                    X_disc = mdlp.fit_transform(X, Y)
                    X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                    bins = np.unique(X_disc, axis=0)
                    if len(bins) <= 1:
                        # MDLP returned the whole range as one bin; fall back to the median.
                        threshold = data[cue].median()
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data,
                                cue, direction, threshold, decision)
                        continue
                    # print ", ".join([cue, str(bins)+" bins"])
                    for bin in bins:
                        indexes = np.where(X_disc == bin)[0]
                        interval = X_interval[indexes]
                        try:
                            if len(np.unique(interval, axis=0)) != 1:
                                print("Warning: rows in the same MDLP bin map to different intervals.")
                        except Exception:
                            print("Could not verify interval uniqueness.")
                        interval = interval[0]
                        if interval[0] == float('-inf'):
                            threshold = interval[1]
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance, data,
                                    cue, direction, threshold, decision)
                        elif interval[1] == float('inf'):
                            threshold = interval[0]
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance, data,
                                    cue, direction, threshold, decision)
                        else:
                            cur_selected = self.eval_range_split(
                                level, cur_selected, cur_performance, data,
                                cue, indexes, interval, decision)
                    continue
                elif self.split_method == "percentile":
                    thresholds = set(data[cue].quantile(
                        [x / 20.0 for x in range(1, 20)],
                        interpolation='midpoint'))
                else:
                    thresholds = [data[cue].median()]
                # point split, e.g. median or x% percentiles.
                for threshold in thresholds:
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)

            self.computed_cache[structure] = cur_selected
        self.selected[t_id][level] = cur_selected['rule']
        self.performance_on_train[t_id][level] = cur_selected[
            'metrics'] + get_performance(cur_selected['metrics'])
        self.grow(cur_selected['undecided'], t_id, level + 1,
                  cur_selected['metrics'])
Example #8
# coding=utf-8
import numpy as np
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

column = np.array([1, 2])
transformer = MDLP(column)
iris = load_iris()
X, y = iris.data, iris.target
print(y)
print(type(X), type(y))
X_disc = transformer.fit_transform(X, y)

conv_X = transformer.fit_transform(X, y)
print(conv_X)
di = transformer.cut_points_

print(transformer.cut_points_)

for each in di:
    print(len(di[each]))
X_train_galaxy, X_valid_galaxy, Y_train_galaxy, Y_valid_galaxy = train_test_split(
    X_data_galaxy,
    Y_data_galaxy,
    test_size=0.4,
    random_state=0,
    shuffle=True,
    stratify=Y_data_galaxy)
X_test_galaxy, X_valid_galaxy, Y_test_galaxy, Y_valid_galaxy = train_test_split(
    X_valid_galaxy,
    Y_valid_galaxy,
    test_size=0.5,
    random_state=0,
    shuffle=True,
    stratify=Y_valid_galaxy)
# In[30]:

from sklearn.metrics import accuracy_score

# =============================================================================
# from sklearn.preprocessing import StandardScaler
# Xtrain_galaxy_s=X_train_galaxy
# Xtest_galaxy_s=X_test_galaxy
# Xvalid_galaxy_s=X_valid_galaxy
# =============================================================================
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
mdlp = MDLP()
conv_X = mdlp.fit_transform(X, y)
plot_confusion_matrix(cm_galaxy_test_k3u,
                      classes=['Smooth', 'Spiral'],
                      title='Confusion matrix, without normalization with k:3')
plt.figure()
plot_confusion_matrix(cm_galaxy_test_k3u,
                      classes=['Smooth', 'Spiral'],
                      normalize=True,
                      title='Confusion matrix, with normalization with k:3')
plt.show()
# In[33]:
print("       Bayes Naif models with hold-out set ")

# Preprocess the train, validation (hold-out), and test sets
# First method: supervised discretization using MDLP
from mdlp.discretization import MDLP
mdlp = MDLP()
Xtrain_galaxy_MDLP = mdlp.fit_transform(X_train_galaxy, Y_train_galaxy)
Xtest_galaxy_MDLP = mdlp.transform(X_test_galaxy)
Xvalid_galaxy_MDLP = mdlp.transform(X_valid_galaxy)

# In[33]:
# Second method: unsupervised scaling using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Xtrain_galaxy_unsupervised = scaler.fit_transform(X_train_galaxy)
Xtest_galaxy_unsupervised = scaler.transform(X_test_galaxy)
Xvalid_galaxy_unsupervised = scaler.transform(X_valid_galaxy)
# In[33]:

# Gaussian naive Bayes with 2 different parameters, i.e.
# 1. priors = probability of each class
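A minimal sketch of fitting a Gaussian naive Bayes with explicit class priors on the MDLP-discretized galaxy features; the prior computation, the gnb variable, and the score call are illustrative and only reuse the variable names defined above, they are not from the original notebook:

import numpy as np
from sklearn.naive_bayes import GaussianNB

# Illustrative: estimate class priors as the empirical class frequencies
# of the training labels, then pass them to GaussianNB.
classes, counts = np.unique(Y_train_galaxy, return_counts=True)
priors = counts / counts.sum()

gnb = GaussianNB(priors=priors)
gnb.fit(Xtrain_galaxy_MDLP, Y_train_galaxy)
print(gnb.score(Xvalid_galaxy_MDLP, Y_valid_galaxy))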
Example #11
from mdlp.discretization import MDLP

train_raw = pd.read_csv("input/train.csv")
test_raw = pd.read_csv("input/test.csv")

# drop NaNs, use only the Age feature itself to estimate bins
train_sur_age = train_raw[['Survived', 'Age']].dropna(axis=0)
survived = train_sur_age['Survived'].values
age = (train_sur_age['Age'].values).reshape(-1, 1)

n_bins = []
age_lim = []
n = 1000
for i in range(n):
    transformer = MDLP(random_state=i, continuous_features=None)
    age_dis = transformer.fit_transform(age, survived)
    age_bins = transformer.cat2intervals(age_dis, 0)
    n_bins.append(len(set(age_bins)))
    if len(set(age_bins)) == 2:
        age_lim.append(age_bins[0])
    elif len(set(age_bins)) > 2:
        print('\t ! more than two bins, n=', len(set(age_bins)))

print('* estimated N bins:', set(n_bins))
print('\t mean', np.mean(1. * np.array(n_bins)))
print('* Age thresholds, frequencies')
lim_val = np.array(age_lim)[:, 0]

sum_not_inf = 0
for val_i in set(lim_val):
    print('\t', val_i, (1. * sum(lim_val == val_i)) / n)
    sum_not_inf = sum_not_inf + sum(lim_val == val_i)
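Once a stable threshold has been identified, it can be applied back to the Age column; a minimal sketch using pandas.cut, where the cut point below is a placeholder to be replaced by the value printed by the loop above:

import pandas as pd

# Placeholder cut point; substitute the threshold reported above.
age_threshold = 6.5

train_raw['Age_binned'] = pd.cut(
    train_raw['Age'],
    bins=[float('-inf'), age_threshold, float('inf')],
    labels=[0, 1],
)
print(train_raw[['Age', 'Age_binned']].head())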
Example #12
    rownum += 1

print "continuous_features:", continuous_features
print "<----------------------------------------------------->"

# Think about how to improve !!!!!!!!!!
X = []
y = []
for each_ele in all_insts_str_list:
    y.append(float(each_ele[-1]))
    temp = []
    for index_cf, each_column in enumerate(continuous_features):
        temp.append(float(each_ele[each_column]))
    X.append(temp)

X_disc = discretizationer.fit_transform(X, y)
X_disc = X_disc.tolist()
for index_X, each_inst in enumerate(X_disc):
    X_disc[index_X] = list(map(int, each_inst))
# print(X_disc)
cut_dict = discretizationer.cut_points_
X_attr_count = []
for each in cut_dict:
    X_attr_count.append(len(cut_dict[each]) + 1)
print(X_attr_count)

for index_val, each in enumerate(each_attrVal_array):
    for index_val_disc, each_val_disv in enumerate(X_attr_count):
        for each_col in continuous_features:
            if index_val_disc == each_col:
                each_attrVal_array[each_col] = list(
Example #13
df['Age'] = df['Age'].fillna(age_mean)
df['Embarked'] = df['Embarked'].fillna(embark_mode)
df['Cabin'] = df['Cabin'].fillna("U")

df['Title'] = df['Name'].map(lambda x: substring_exist(x, TITLE_LIST))

df['Title'] = df.apply(replace_titles, axis=1)
df['Embarked'] = df.apply(replace_embark, axis=1)
df['Deck'] = df['Cabin'].map(lambda x: substring_exist(x, CABIN_LIST))
df['Deck'] = df.apply(replace_deck, axis=1)

df['Family_Size'] = df['SibSp'] + df['Parch']

df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

transformer = MDLP()
X_age = df["Age"].to_numpy().reshape(-1, 1)
y_age = df["Survived"].to_numpy()

disc = transformer.fit_transform(X_age, y_age)
df["Age_disc"] = disc.ravel()

df['Sex'] = df.apply(replace_sex, axis=1)

df["Pclass"] = df["Pclass"].map(lambda x: 1 / x)

df.to_csv("./data/train_neat.csv")

print(df.columns)
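As a quick check on the discretization above, the fitted transformer's cut_points_ attribute and a group-by on the new column show where MDLP placed the age boundaries and how survival differs across the bins (a small follow-up sketch reusing transformer and df from the snippet):

# Inspect the cut points MDLP learned for the single Age column (feature index 0)
# and the survival rate within each resulting age bin.
print(transformer.cut_points_)
print(df.groupby("Age_disc")["Survived"].mean())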