def getClassWeights(self, weightType, dataSet=None):

        if weightType not in [
                "Freq", "MedianFreq", "1x", "2x", "division", "relativeToMin",
                "quantile"
        ]:
            raise ValueError(
                "Wrong weights calc type given! Valid arguments are [Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]"
            )

        # get class weights because of imbalanced dataset (e.g. a lot of road and buildings)
        print("Calculate class ", weightType, " weights...")

        # For performance, only a 1/PART fraction of the training set is used
        # to estimate the weights (PART = 1 would use the full training set).
        PART = 10
        classCount = np.zeros(self.config["classes"])
        # count all the classes in every given mask image
        for i in range(int(self.config["trainSize"] / PART)):
            labelImg = self.getImage(i, "trainLabel").flatten()
            labelClassCount = np.bincount(labelImg,
                                          minlength=self.config["classes"])
            classCount += labelClassCount

            if i % int(1000 / PART) == 0:
                print("Label image ", i, "/", self.config["trainSize"] / PART)

        print("Class count: ", classCount.shape, classCount)

        #choose class weights type
        #Frequency
        if weightType == "Freq":
            classWeights = np.median(classCount) / classCount
        #Median Frequency
        elif weightType == "MedianFreq":
            classWeights = np.median(np.median(classCount) / classCount) / (
                np.median(classCount) / classCount)
        # Simple Total/ClassCount
        elif weightType == "1x":
            classWeights = 1 - (classCount / classCount.sum() * 1)
        # Simple Total/ClassCount doubled effect
        elif weightType == "2x":
            classWeights = 1 - (classCount / classCount.sum() * 2)
        # Simple Total/ClassCount divided by Minimum
        elif weightType == "division":
            classWeights = classCount.sum() / classCount
            #divide with minimum
            classWeights[classWeights == 1] = 999999
            classWeights /= classWeights.min()
        # weights are relative to the smallest class, which gets 1.0; very small
        # weights (< 0.1) are boosted by a factor of 10
        elif weightType == "relativeToMin":
            classWeights = classCount.min() / classCount
            print("Class weights: ", classWeights.shape, classWeights)
            classWeights[(classWeights < 0.1)] *= 10
        # sklearn's QuantileTransformer maps the counts into [0, 1]; the weight is
        # 1 minus that value, with a lower bound of 0.1
        elif weightType == "quantile":
            from sklearn.preprocessing import QuantileTransformer
            _scaler = QuantileTransformer()
            classCount = np.expand_dims(classCount, axis=1)
            classWeights = _scaler.fit_transform(classCount)
            classWeights = np.around(classWeights, decimals=4)
            classWeights = np.squeeze(classWeights)
            classWeights = 1 - classWeights
            classWeights[(classWeights < 0.1)] = 0.1

        else:
            raise ValueError(
                "Wrong weights calc type given! Valid arguments are [Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]"
            )

        # eliminate inf values
        classWeights[(classWeights == np.inf)] = 1
        print("Class weights: ", classWeights.shape, classWeights)
        np.save(
            "classWeights" + str(self.config["x"]) + str(self.config["y"]) +
            self.config["name"], classWeights)
import copy

import numpy as np
import pandas as pd
from scipy.special import boxcox1p
from sklearn.preprocessing import (OneHotEncoder, LabelEncoder, StandardScaler,
                                   MinMaxScaler, MaxAbsScaler, RobustScaler,
                                   QuantileTransformer)


class FeatureMap(object):
    def __init__(self, df):
        self.df = copy.deepcopy(df)
        self.onehot = None
        self.label_code = None
        self.col_label_dict = dict()
        self.min_max_scale = None
        self.max_abs_scale = None
        self.standard_scale = None
        self.robust_scale = None
        self.quantile_transform = None

    def log_map(self, col_need, col_replace=True):
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.apply(lambda x: np.log(x))
        else:
            col_need_extend = [col + "_log" for col in col_need]
            self.df[col_need_extend] = df_need.apply(lambda x: np.log(x))

    def box_cox_map(self, col_need, gamma=1.0, col_replace=True):
        """
        y = ((1+x)**gamma - 1) / gamma  if gamma != 0
            log(1+x)                    if gamma == 0
        ref: http://onlinestatbook.com/2/transformations/box-cox.html
        :param col_need:
        :param gamma:
        :param col_replace:
        :return:
        """
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.applymap(lambda x: boxcox1p(x, gamma))
        else:
            col_need_extend = [col + "_boxCox" for col in col_need]
            self.df[col_need_extend] = df_need.applymap(
                lambda x: boxcox1p(x, gamma))

    def onehot_encode(self, col_need, start_zero=True):
        """
        onehot encode DataFrame of which the columns you need
        note: the origin category should be integer in range(classes) or range(classes+1)
        :param col_need:
        :param start_zero: category is in range(classes)
        :return: new DataFrame without col_need, after onehot encoding,
                  start method is in accordance with start_zero
        """
        self.onehot = OneHotEncoder(sparse=False)
        array_onehot = self.onehot.fit_transform(self.df.loc[:, col_need])

        col_onehot = []

        for col_index in range(len(col_need)):
            if start_zero:
                for hot_index in range(self.onehot.n_values_[col_index]):
                    col_onehot.append(col_need[col_index] + str(hot_index))
            else:
                for hot_index in range(1, self.onehot.n_values_[col_index]):
                    col_onehot.append(col_need[col_index] + str(hot_index))

        self.df.drop(col_need, axis=1, inplace=True)

        df_onehot = pd.DataFrame(array_onehot,
                                 columns=col_onehot,
                                 index=self.df.index)
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def label_encode(self, col_need):
        """
        onehot encode DataFrame of which the columns you need
        :param col_need: length should be 1
        :return: new DataFrame without col_need, after label encoding, start from 0
        """
        assert isinstance(col_need, list) and len(col_need) == 1
        self.label_code = LabelEncoder()
        array_label_code = self.label_code.fit_transform(
            self.df.loc[:, col_need[0]])

        label_list = list(self.label_code.classes_)
        for i, x in enumerate(label_list):
            self.col_label_dict[col_need[0] + "_" +
                                str(i)] = col_need[0] + "_" + str(x)

        self.df.drop(col_need, axis=1, inplace=True)

        df_label_code = pd.DataFrame(array_label_code,
                                     columns=col_need,
                                     index=self.df.index)
        self.df = pd.concat([self.df, df_label_code], axis=1)

    def standard_scale_map(self, col_need, drop_origin_col=False):
        self.standard_scale = StandardScaler()
        array_standard = self.standard_scale.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_standard,
                        column_name=col_need,
                        suffix="_stdScale",
                        drop_origin_columns=drop_origin_col)

    def min_max_scale_map(self, col_need, drop_origin_col=False):
        self.min_max_scale = MinMaxScaler()
        array_min_max = self.min_max_scale.fit_transform(self.df.loc[:,
                                                                     col_need])
        self._scale_map(array=array_min_max,
                        column_name=col_need,
                        suffix="_minMaxScale",
                        drop_origin_columns=drop_origin_col)

    def max_abs_scale_map(self, col_need, drop_origin_col=False):
        self.max_abs_scale = MaxAbsScaler()
        array_max_abs = self.max_abs_scale.fit_transform(self.df.loc[:,
                                                                     col_need])
        self._scale_map(array=array_max_abs,
                        column_name=col_need,
                        suffix="_maxAbsScale",
                        drop_origin_columns=drop_origin_col)

    def robust_scale_map(self,
                         col_need,
                         quantile_range=(25, 75),
                         drop_origin_col=False):
        """
        This Scaler removes the median and scales the data according to
        the quantile range (defaults to IQR: Interquartile Range).
        The IQR is the range between the 1st quartile (25th quantile)
        and the 3rd quartile (75th quantile).
        :param col_need:
        :param quantile_range:
        :param drop_origin_col:
        :return:
        """
        self.robust_scale = RobustScaler(quantile_range=quantile_range)
        array_robust = self.robust_scale.fit_transform(self.df.loc[:,
                                                                   col_need])
        self._scale_map(array=array_robust,
                        column_name=col_need,
                        suffix="_robust_scale",
                        drop_origin_columns=drop_origin_col)

    def quantile_scale_map(self,
                           col_need,
                           distribution='uniform',
                           drop_origin_col=False):
        """

        :param col_need:
        :param distribution: 'uniform' (default) or 'normal'
        :param drop_origin_col:
        :return:
        """
        self.quantile_transform = QuantileTransformer(
            output_distribution=distribution)

        array_quantile = self.quantile_transform.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_quantile,
                        column_name=col_need,
                        suffix="_q{}Map".format(distribution.capitalize()),
                        drop_origin_columns=drop_origin_col)

    def _scale_map(self,
                   array,
                   column_name,
                   suffix,
                   drop_origin_columns=False):
        if drop_origin_columns:
            self.df.drop(column_name, axis=1, inplace=True)

        col = [col + suffix for col in column_name]
        df_scale = pd.DataFrame(array, columns=col, index=self.df.index)
        self.df = pd.concat([self.df, df_scale], axis=1)

    def quantile_floor_map(self, col_need, floor_num=5, drop_origin_col=False):
        """
        after quantile_scale_map with distribution='uniform', values are scaled into [0, 1];
        for tree models, one-hot encoding of the resulting integer bins is still needed
        :param col_need:
        :param floor_num: uniform floor map
        :param drop_origin_col
        :return:
        """
        bool0 = (self.df.loc[:, col_need] >= 0) & (self.df.loc[:, col_need] <=
                                                   1)
        assert bool0.all().all()
        assert all(x.endswith("_qUniformMap") for x in col_need)

        array_quantile_floor = (self.df.loc[:, col_need].values *
                                floor_num).astype(int)
        self._scale_map(array=array_quantile_floor,
                        column_name=col_need,
                        suffix="_qFloorMap",
                        drop_origin_columns=drop_origin_col)
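
# Usage sketch (illustrative, not part of the original class): build a tiny
# DataFrame and apply a few FeatureMap transforms. Column names and values are
# invented; onehot_encode is not exercised because it relies on the legacy
# OneHotEncoder API.
demo_df = pd.DataFrame({
    "price": [100.0, 250.0, 30.0, 980.0],
    "area": [20.0, 55.0, 12.0, 140.0],
    "category": ["a", "b", "c", "b"],
})
fm = FeatureMap(demo_df)
fm.log_map(["price"], col_replace=True)                    # replace price with log(price)
fm.standard_scale_map(["area"], drop_origin_col=True)      # area -> area_stdScale
fm.label_encode(["category"])                              # category -> integer codes
fm.quantile_scale_map(["price"], distribution="uniform")   # adds price_qUniformMap
print(fm.df.head())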
Example #3
    df['BS1' + str(i1)] = df1['ans1'].str[i1]

# remove original style name now
df.drop(['Style Name'], axis=1, inplace=True)
df = df.dropna(axis=0)

c = df.columns[df.dtypes.eq(object)]
df[c] = df[c].apply(pd.to_numeric, errors='coerce', axis=0)

scaler = QuantileTransformer()
#df3 = scaler.fit_transform(df)

X5 = np.array(df.drop(['Score'], axis=1))
y5 = np.array(df['Score'])

X3 = scaler.fit_transform(pd.DataFrame(X5))
#y3 = scaler.fit_transform(pd.DataFrame(y5))
y3 = 1.2 - np.log(y5)

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed

X_train, X_test, y_train, y_test = train_test_split(X5, y5, test_size=0.20)

# Data Munging Completed. ######################

from keras import layers, models


def build_model():

    model = models.Sequential()
    # 32 - width of the next layer, i.e. the number of units it contains; each unit acts as a unique filter.
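    # Plausible continuation (sketch): the original example is truncated here.
    # The layer sizes, activations and optimizer below are assumptions for a
    # small regressor on the tabular features prepared above.
    model.add(layers.Input(shape=(X3.shape[1],)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model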
Example #4
# Author: Franz Weidmann
# Info: Creates a one-class SVM classifier for each host to capture the host's
# normal state. All models are trained and saved with joblib into a pkl file.

import numpy as np
from sklearn import svm
import joblib
from sklearn.preprocessing import QuantileTransformer

trainData = np.load("../../data/data.npy")

# transform data for scaling and save the state of the transformer for each host
scalers = []
for h in range(trainData.shape[0]):
    _scaler = QuantileTransformer()
    trainData[h] = _scaler.fit_transform(trainData[h])
    scalers.append(_scaler)

# train a one-class SVM classifier for every host
models = []
for modelIndex in range(trainData.shape[0]):
    print("Creating model ", modelIndex)
    model = svm.OneClassSVM(kernel="rbf", verbose=True)
    model.fit(trainData[modelIndex])
    models.append(model)
    print("Trained model ", modelIndex)

joblib.dump([scalers, models], "models.pkl")
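
# Usage sketch (illustrative, not part of the original script): apply the saved
# scalers and one-class models to new observations. The file name "newData.npy"
# and its layout (hosts x samples x features) are assumptions.
scalers, models = joblib.load("models.pkl")
newData = np.load("../../data/newData.npy")
for h in range(newData.shape[0]):
    scaled = scalers[h].transform(newData[h])
    pred = models[h].predict(scaled)  # +1 = normal (inlier), -1 = anomalous (outlier)
    print("Host", h, "anomalous samples:", int((pred == -1).sum()))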