Example #1
def cv(model, x, y):
    errors = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        x_scaler = MaxAbsScaler()
        y_scaler = MaxAbsScaler()

        x_scaler.fit(x_train)
        y_scaler.fit(y_train)

        xx_train = x_scaler.transform(x_train)
        xx_test = x_scaler.transform(x_test)
        yy_train = y_scaler.transform(y_train)
        yy_test = y_scaler.transform(y_test)

        cv_model = sklearn.base.clone(model)
        cv_model.fit(xx_train, yy_train)

        yy_predicted = cv_model.predict(xx_test)

        error = math.sqrt(mean_squared_error(yy_test, yy_predicted))
        errors.append(error)
    return errors
Example #2
    def scale(self, X_train, X_test, type):
        if type == "StandardScaler":
            standardScaler = StandardScaler()
            standardScaler.fit(X_train)
            X_train = standardScaler.transform(X_train)
            X_test = standardScaler.transform(X_test)
            return X_train, X_test

        elif type == "MinMaxScaler":
            minMaxScaler = MinMaxScaler()
            minMaxScaler.fit(X_train)
            X_train = minMaxScaler.transform(X_train)
            X_test = minMaxScaler.transform(X_test)
            return X_train, X_test
        elif type == "MaxScaler":

            maxScaler = MaxAbsScaler()
            maxScaler.fit(X_train)
            X_train = maxScaler.transform(X_train)
            X_test = maxScaler.transform(X_test)
            return X_train, X_test

        elif type == "RobustScaler":
            robustScaler = RobustScaler()
            robustScaler.fit(X_train)
            X_train = robustScaler.transform(X_train)
            X_test = robustScaler.transform(X_test)
            return X_train, X_test
Example #3
def test_max_abs_scaler():
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X)
    assert np.allclose(X_t, X_t_)
Example #4
def ml_stratified_cv():
    #from sklearn.utils import check_random_state
    #rng = check_random_state(0)
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import MaxAbsScaler
    scaler = MaxAbsScaler()

    flag_scale = True

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    ytrue, ypred, score = [], [], []
    for itr, its in cv.split(X, y):
        Xtr, ytr = X[itr], y[itr]
        Xts, yts = X[its], y[its]

        if flag_scale:
            scaler.fit(Xtr)
            Xtr = scaler.transform(Xtr)
            Xts = scaler.transform(Xts)

        clf.fit(Xtr, ytr)
        ypr = clf.predict(Xts)
        sco = clf.decision_function(Xts)

        ytrue.append(yts)
        ypred.append(ypr)
        score.append(sco)

    ytrue = np.concatenate(ytrue)
    ypred = np.concatenate(ypred)
    score = np.concatenate(score)

    print(tw.clf_results_extended(ytrue, score))
Example #5
def scikit_clustering(number_of_clusters=3600):
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb'))
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    df[0] = df[0].astype('category')
    df[1] = df[1].astype('category')
    df[3] = df[3].astype('category')
    df[6] = df[6].astype('category')

    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    print(df.iloc[:, [0]].dtypes[0])

    clustering = AgglomerativeClustering(n_clusters=number_of_clusters,
                                         affinity=gower.gower_matrix,
                                         linkage='complete').fit(df)

    result = clustering.labels_
    clustering_result = {}
    for i in range(len(result)):
        if result[i] in clustering_result:
            clustering_result[result[i]] += [users_features_vectors[i]]
        else:
            clustering_result[result[i]] = [users_features_vectors[i]]
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
Example #6
def normalize_data(dataframe, mode):
    if mode == 'abs':
        from sklearn.preprocessing import MaxAbsScaler
        max_abs = MaxAbsScaler(copy=True)  #save for retransform later
        max_abs.fit(dataframe)
        data_norm = max_abs.transform(dataframe)

        return data_norm, max_abs

    if mode == 'robust':
        from sklearn.preprocessing import RobustScaler
        robust = RobustScaler(copy=True)  #save for retransform later
        robust.fit(dataframe)
        data_norm = robust.transform(dataframe)

        return data_norm, robust

    if mode == 'min_max':
        from sklearn.preprocessing import MinMaxScaler
        minmax = MinMaxScaler(feature_range=(0, 1),
                              copy=True)  #save for retransform later
        minmax.fit(dataframe)
        data_norm = minmax.transform(dataframe)

        return data_norm, minmax
    if mode == 'std':
        from sklearn.preprocessing import StandardScaler
        stdscaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        stdscaler.fit(dataframe)
        data_norm = stdscaler.transform(dataframe)

        return data_norm, stdscaler
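The scaler returned alongside the normalized data is what makes the "save for retransform later" comments work: sklearn scalers expose inverse_transform. A minimal usage sketch, assuming the normalize_data function above is in scope (toy data made up for illustration):

import pandas as pd

df = pd.DataFrame({'a': [1.0, -2.0, 4.0], 'b': [10.0, 0.0, -5.0]})

data_norm, scaler = normalize_data(df, mode='abs')
print(data_norm)                             # values scaled into [-1, 1]
print(scaler.inverse_transform(data_norm))   # recovers the original values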
Example #7
def scaler_dummy(dataset):

  scaler_mm = MinMaxScaler() 
  scaler_ma = MaxAbsScaler()
  scaler_sd = StandardScaler()
  scaler_rb = RobustScaler()

  numerical = list(dataset.columns)
  # work on independent copies so each scaler's output is stored separately
  data_transform_mm = dataset.copy()
  data_transform_ma = dataset.copy()
  data_transform_sd = dataset.copy()
  data_transform_rb = dataset.copy()


  scaler_mm.fit(dataset[numerical])
  scaler_ma.fit(dataset[numerical])
  scaler_sd.fit(dataset[numerical])
  scaler_rb.fit(dataset[numerical])


  data_transform_mm[numerical] = scaler_mm.transform(dataset[numerical])
  data_transform_ma[numerical] = scaler_ma.transform(dataset[numerical])
  data_transform_sd[numerical] = scaler_sd.transform(dataset[numerical])
  data_transform_rb[numerical] = scaler_rb.transform(dataset[numerical])


  ## get dummies

  features_final_mm = pd.get_dummies(data_transform_mm)
  features_final_ma = pd.get_dummies(data_transform_ma)
  features_final_sd = pd.get_dummies(data_transform_sd)
  features_final_rb = pd.get_dummies(data_transform_rb)

  return features_final_mm, features_final_ma, features_final_sd, features_final_rb
Example #8
def scikit_clustering_ver2(number_of_clusters=3600):
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb'))
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    df[0] = df[0].astype('category')
    df[1] = df[1].astype('category')
    df[3] = df[3].astype('category')
    df[6] = df[6].astype('category')

    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])

    clustering = KMeans(n_clusters=number_of_clusters, verbose=1).fit(df)

    result = clustering.labels_
    logging.info("result: {0}".format(result))
    clustering_result = {}
    for i in range(len(result)):
        if result[i] in clustering_result:
            clustering_result[result[i]] += [users_features_vectors[i]]
        else:
            clustering_result[result[i]] = [users_features_vectors[i]]
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
Example #9
class MaxAbsScalerPrim(primitive):
    def __init__(self, random_state=0):
        super(MaxAbsScalerPrim, self).__init__(name='MaxAbsScaler')
        self.id = 8
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = "Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity. This scaler can also be applied to sparse CSR or CSC matrices."
        self.hyperparams_run = {'default': True}
        self.scaler = MaxAbsScaler()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        for i in range(len(cols)):
            if not 'one_hot' in cols[i]:
                cols[i] = "{}_mxabsscale".format(cols[i])
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        final_output = {0: output}
        return final_output
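The description above stresses that max-abs scaling does not shift the data and therefore preserves sparsity. A small stand-alone sketch (independent of the primitive wrapper, with made-up data) showing scikit-learn's MaxAbsScaler applied directly to a sparse CSR matrix:

from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

X_sp = sparse.csr_matrix([[1.0, 0.0, -4.0],
                          [0.0, 2.0, 0.0],
                          [3.0, 0.0, 8.0]])
scaler = MaxAbsScaler().fit(X_sp)
X_scaled = scaler.transform(X_sp)
print(scaler.scale_)              # per-column max absolute values: [3. 2. 8.]
print(X_scaled.nnz == X_sp.nnz)   # True: zero entries stay zero, so sparsity is preserved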
Example #10
class ScalingAdder(BaseEstimator, TransformerMixin):
    def _create_scaler(self, scaler):
        if scaler == 'std':
            self._sc = StandardScaler()
        if scaler == 'minmax':
            self._sc = MinMaxScaler()
        if scaler == 'maxabs':
            self._sc = MaxAbsScaler()

    def __init__(self, scaler=None):
        self.scaler = scaler
        self._create_scaler(scaler)

    def set_params(self, scaler=None, **parameters):
        self.scaler = scaler
        self._create_scaler(scaler)
        return self

    def get_params(self, **kwargs):
        return {"scaler": self.scaler}

    def transform(self, X, **transform_params):
        if self.scaler is None:
            return X
        if (X.shape[1] > 1):
            return np.hstack((X[:, :1], self._sc.transform(X[:, 1:])))
        return np.hstack((X[:, :1], np.zeros(shape=(X.shape[0], 1))))

    def fit(self, X, y=None, **fit_params):
        if self.scaler is not None:
            if X.shape[1] > 1:
                self._sc.fit(X[:, 1:], y)
        return self
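A minimal usage sketch for the transformer above (assuming the ScalingAdder class and its sklearn imports are in scope, with made-up data), showing that the first column passes through untouched while the remaining columns are max-abs scaled:

import numpy as np

X = np.array([[1.0,  10.0, -4.0],
              [2.0,  20.0,  2.0],
              [3.0, -40.0,  4.0]])

adder = ScalingAdder(scaler='maxabs').fit(X)
print(adder.transform(X))
# column 0 is unchanged; columns 1 and 2 are divided by their max absolute values (40 and 4)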
Example #11
def scale_data(X, scaler=None):
    print('Performing data scaling...')
    if not scaler:
        scaler = MaxAbsScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler
Example #12
def get_data():

	df_train = pd.read_csv('data_original/aps_failure_training_set.csv')
	df_test = pd.read_csv('data_original/aps_failure_test_set.csv')
	# replace missing values ('na') with a sentinel value
	df_train.replace('na','-1', inplace=True)
	df_test.replace('na','-1', inplace=True)
	# categorical for label: 0: neg, 1: pos
	df_train['class'] = pd.Categorical(df_train['class']).codes
	df_test['class']  = pd.Categorical(df_test['class']).codes

	# split data into x and y
	Y_train = df_train['class'].copy(deep=True)
	X_train = df_train.copy(deep=True)
	X_train.drop(['class'], inplace=True, axis=1)

	Y_test = df_test['class'].copy(deep=True)
	X_test = df_test.copy(deep=True)
	X_test.drop(['class'], inplace=True, axis=1)

	# strings to float
	X_train = X_train.astype('float64')
	X_test  = X_test.astype('float64')

	# scale the dataset
	scaler = MaxAbsScaler()
	scaler.fit(X_train)
	X_train = scaler.transform(X_train)
	X_test  = scaler.transform(X_test)

	return X_train, Y_train, X_test, Y_test
Example #13
def use_MaxAbsScaler():
    # use the largest absolute value per feature: values are scaled into the range -1 to 1
    x = [[1., -1., 5.], [2., 0., -5.], [0., 1., -10]]

    scaler = MaxAbsScaler()
    scaler.fit(x)
    print(scaler.transform(x))
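For the toy matrix above the per-column maximum absolute values are 2, 1 and 10, so the printed result should match the following check (a small verification sketch):

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

x = np.array([[1., -1., 5.], [2., 0., -5.], [0., 1., -10.]])
expected = np.array([[0.5, -1.0, 0.5],
                     [1.0, 0.0, -0.5],
                     [0.0, 1.0, -1.0]])
assert np.allclose(MaxAbsScaler().fit_transform(x), expected)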
Example #14
class AutoMaxScaler(BaseEstimator, TransformerMixin):
    """
    Determine non-categorical columns and max scale the values.
    """
    def __init__(self,
                 ignore_columns: list = [],
                 uniqueness_thresshold: Optional[float] = None):
        """
        Args:
            uniqueness_thresshold: Columns with less unique values than this
                are considered categorical.
        """
        self.ignore_columns = ignore_columns
        self.uniqueness_thresshold = uniqueness_thresshold

    def fit(self, X, y=None):
        """
        Determine which columns to max-abs scale.
        """
        self.scaler_ = MaxAbsScaler(copy=True)
        self.columns_to_transform_ = get_numerical_columns(
            data_frame=X,
            ignore_columns=self.ignore_columns,
            uniqueness_thresshold=self.uniqueness_thresshold,
        )
        self.scaler_.fit(X[self.columns_to_transform_])
        return self

    def transform(self, X, y=None):
        """
        Max scale the columns and return copy.
        """
        data_subframe = X[self.columns_to_transform_]
        X[self.columns_to_transform_] = self.scaler_.transform(data_subframe)
        return X.copy()
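The class above delegates column selection to an external get_numerical_columns helper that is not shown here. A rough, self-contained sketch of the same idea (numeric columns with enough distinct values are max-abs scaled, the rest are left alone); the uniqueness check below is only an approximation of that helper's behaviour, with made-up data:

import pandas as pd
from sklearn.preprocessing import MaxAbsScaler

df = pd.DataFrame({
    'flag': [0, 1, 1, 0, 1],                  # few unique values -> treated as categorical
    'amount': [10.0, -3.0, 25.0, 4.0, -8.0],  # continuous -> scaled
})

uniqueness_threshold = 0.5
numeric_cols = [
    c for c in df.select_dtypes('number').columns
    if df[c].nunique() / len(df) >= uniqueness_threshold
]

df[numeric_cols] = MaxAbsScaler().fit_transform(df[numeric_cols])
print(df)   # only 'amount' is rescaled into [-1, 1]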
Example #15
class DataFrameScaler(BaseEstimator, TransformerMixin):
    '''Docstring of `DataFrameScaler`

    Scaling a pandas DataFrame.

    Args:
        scaler: Valid values are 
        "unit" - Centers to the mean and component wise scale 
        to unit variance;
        "0,1" - Scales data to given range. Use extra parameter 
        `feature_range=(min,max)` to set range;
        "-1,1" - Scales data to the range [-1, 1] by dividing 
        through the largest maximum value in each feature. 
        It is meant for data that is already centered at zero or 
        sparse data.
        Extra paramters will be passed to sklearn scalers if specified.
        ignore_cols: Columns that will not be scaled.
        By default, all categorical columns will be ignored.
        Specify this parameter to ignore numerical columns too.
        target_cols: Columns to scale. Ignored when `ignore_cols` is given.
    '''

    SCALERS = {
        'unit': StandardScaler,
        '0,1': MinMaxScaler,
        '-1,1': MaxAbsScaler
    }

    def __init__(self, scaler: str = 'unit', ignore_cols: List[str] = [], target_cols: List[str] = [], **kwargs):
        assert scaler in ['unit', '0,1', '-1,1'], 'Invalid scaler {}. See help for valid scalers.'.format(scaler)
        self.scaler = scaler
        self.ignore_cols = ignore_cols
        self.target_cols = ignore_cols and [] or target_cols
        self.kwargs = kwargs
    
    def fit(self, X: pd.DataFrame, y=None):
        if self.scaler == 'unit':
            self.scaler = StandardScaler(**{
                k: self.kwargs.get(k, d) for k,d in [('copy', True), ('with_mean', True), ('with_std', True)]
            })
        elif self.scaler == '0,1':
            self.scaler = MinMaxScaler(**{
                k: self.kwargs.get(k, d) for k, d in [('feature_range', (0, 1)), ('copy', True)]
            })
        elif self.scaler == '-1,1':
            self.scaler = MaxAbsScaler(**{
                k: self.kwargs.get(k, d) for k, d in [('copy', True)]
            })
        # self.scaler = self.SCALERS[self.scaler](**self.kwargs).fit(X)
        self.target_cols = self.ignore_cols and [col for col in X.select_dtypes(include=['number']) if not col in self.ignore_cols] or (
            self.target_cols and [col for col in X.select_dtypes(include=['number']) if col in self.target_cols] or X.select_dtypes(include=['number']).columns)
        self.scaler.fit(X[self.target_cols])
        return self
    
    def transform(self, X: pd.DataFrame, y=None):
        X = X.copy()
        X[self.target_cols] = self.scaler.transform(X[self.target_cols])
        return X
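A minimal usage sketch for DataFrameScaler, assuming the class above and its pandas/sklearn imports are in scope (toy data made up for illustration):

import pandas as pd

df = pd.DataFrame({'a': [1.0, -2.0, 4.0], 'b': [100.0, 50.0, -25.0]})

scaler = DataFrameScaler(scaler='-1,1')   # '-1,1' selects max-abs scaling
scaled = scaler.fit(df).transform(df)
print(scaled)   # each column divided by its max absolute value (4 and 100)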
Example #16
def ScaleMaxAbs(train_data):
    # similar to MinMaxScaler: places values in the range -1 to 1
    maxAbsScaler = MaxAbsScaler()
    maxAbsScaler.fit(train_data)
    dt = maxAbsScaler.transform(train_data)
    return maxAbsScaler, pd.DataFrame(dt,
                                      index=train_data.index,
                                      columns=train_data.columns)
Example #17
def test_max_abs_scaler_sparse():
    X_sparse = tosparse(X)
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X_sparse)
    assert np.allclose(X_t, X_t_.todense())
Example #18
    def test_max_abs_scaler(self):
        model = MaxAbsScaler()
        data = [[0., 0., 3.], [1., 1., 0.], [0., 2., 1.], [1., 0., 2.]]
        model.fit(data)
        model_onnx = convert_sklearn(model, 'scaler', [('input', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(numpy.array(data, dtype=numpy.float32),
                            model, basename="SklearnMaxAbsScaler")
Example #19
def max_abs_scaler_usecase():
    X_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])

    scaler = MaxAbsScaler()
    scaler.fit(X_train)

    print(scaler.max_abs_)
    print(scaler.scale_)
    print(scaler.transform(X_train))
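For the X_train above, both max_abs_ and scale_ should come out as [2., 1., 2.], and the transform divides each column by those values; a small verification sketch:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
scaler = MaxAbsScaler().fit(X_train)
assert np.allclose(scaler.max_abs_, [2., 1., 2.])
assert np.allclose(scaler.scale_, [2., 1., 2.])
assert np.allclose(scaler.transform(X_train),
                   [[0.5, -1., 1.], [1., 0., 0.], [0., 0.5, -0.5]])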
Example #20
class MaxAbsScaler(FeatureTransformAlgorithm):
    r"""Implementation of feature scaling by its maximum absolute value.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html#sklearn.preprocessing.MaxAbsScaler

    See Also:
        * :class:`niaaml.preprocessing.feature_transform.FeatureTransformAlgorithm`
    """
    Name = 'Maximum Absolute Scaler'

    def __init__(self, **kwargs):
        r"""Initialize MaxAbsScaler.
        """
        super(MaxAbsScaler, self).__init__()
        self.__max_abs_scaler = MAS()

    def fit(self, x, **kwargs):
        r"""Fit implemented transformation algorithm.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to fit transformation algorithm.
        """
        self.__max_abs_scaler.fit(x)

    def transform(self, x, **kwargs):
        r"""Transforms the given x data.

        Arguments:
            x (pandas.core.frame.DataFrame): Data to transform.

        Returns:
            pandas.core.frame.DataFrame: Transformed data.
        """

        return self.__max_abs_scaler.transform(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureTransformAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(
                self.__max_abs_scaler.get_params()))
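A minimal usage sketch for the wrapper above, assuming the class is in scope and MAS refers to sklearn.preprocessing.MaxAbsScaler (toy data made up for illustration):

import pandas as pd

x = pd.DataFrame({'f1': [1.0, -4.0, 2.0], 'f2': [0.5, 0.25, -1.0]})

ft = MaxAbsScaler()      # the niaaml wrapper defined above, not the sklearn class
ft.fit(x)
print(ft.transform(x))   # each feature divided by its max absolute value (4.0 and 1.0)
print(ft.to_string())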
Example #21
def preproces_avs(path="food-101/images/", shape=(80, 80), batch_size=1000):
    pre = MaxAbsScaler()
    imag_gen = ImageDataGenerator()
    gen = imag_gen.flow_from_directory(path,
                                       target_size=shape,
                                       batch_size=batch_size,
                                       class_mode=None)
    flat = shape[0] * shape[1] * 3
    pre.fit(next(gen).reshape((batch_size, flat)))
    return pre
Example #22
def maxAbsScaler(train_x, test_x):
    scaler = MaxAbsScaler()
    scaler.fit(train_x.data)
    train_x.data = scaler.transform(train_x.data)

    if test_x is not None:
        test_x.data = scaler.transform(test_x.data)

    print(pd.DataFrame(train_x.data).describe())
    return train_x.data
Example #23
    def preprocess_per_stamp(self, X, norm_opt):
        """
        Prepare preprocessors for each stamp.
        """

        programs = X.index

        # Get stamps
        stamps = self.get_unique_time_stamps(programs)

        # Init storage for preprocessors
        preprocs = {}

        # Iterate through stamps
        for stamp in stamps:
            # Find each program with this stamp
            progs = []
            for i in range(X.shape[0]):
                program = programs[i]
                tmp = program.split("-")
                for entry in tmp:
                    if entry == stamp:
                        progs.append(program)
            # Get data for these programs
            x = X.loc[progs].values  # FIXME
            # Initialize preprocessor
            if norm_opt == "max":
                preproc = MaxAbsScaler()
                preproc.fit(x)
                preprocs[stamp] = preproc

            elif norm_opt == "norm":
                preproc = Normalizer()
                preproc.fit(x)
                preprocs[stamp] = preproc

            elif norm_opt == "minmax":
                preproc = MinMaxScaler((0, 1))
                preproc.fit(x)
                preprocs[stamp] = preproc

            elif norm_opt == "standard":
                preproc = StandardScaler()
                preproc.fit(x)
                preprocs[stamp] = preproc

            elif norm_opt == "robust":
                preproc = RobustScaler()
                preproc.fit(x)
                preprocs[stamp] = preproc
            else:
                msg = "{} preprocessor not valid."
                raise ValueError(msg.format(norm_opt))
        return preprocs
Example #24
def ml_train_test_split():
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MaxAbsScaler
    scaler = MaxAbsScaler()

    Xtr, Xts, ytr, yts = train_test_split(X, y, test_size=0.3)
    flag_scale = True

    if flag_scale:
        scaler.fit(Xtr)
        Xtr = scaler.transform(Xtr)
        Xts = scaler.transform(Xts)
Example #25
def max_abs():
    x = [[1, -2, 3, 4, 5], [3, 4, -5, 6, 7], [1, 7, 2, -6, 2],
         [3, 8, 6, 2, -8]]
    print(x)

    scaler = MaxAbsScaler()
    scaler.fit(x)

    print(scaler.scale_)
    print(scaler.max_abs_)
    print(scaler.transform(x))

    pass
Example #26
def MaxAbs_Scaler(X_train, X_test):
    """
  Max min standardization: scale each feature by its maximum absolute value.
  :param X_train: array-like training data;
  :param X_test: array-like test data;
  :return: standardized training data and test data, and the scaler
  """
    scaler = MaxAbsScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, scaler
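A quick usage sketch with toy arrays, assuming the MaxAbs_Scaler function above is in scope:

import numpy as np

X_train = np.array([[1.0, -5.0], [2.0, 10.0], [-4.0, 5.0]])
X_test = np.array([[2.0, -10.0]])

X_train_s, X_test_s, scaler = MaxAbs_Scaler(X_train, X_test)
print(X_train_s)   # columns divided by 4 and 10, their max absolute values on the training set
print(X_test_s)    # test data scaled with the training-set maxima: [[0.5, -1.0]]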
Example #27
    def test_max_abs_scaler_floats(self):
        # Generate a random 2D array with values in [0, 1000)
        np.random.seed(0)
        data = np.random.rand(100, 200) * 1000
        data = np.array(data, dtype=np.float32)
        data_tensor = torch.from_numpy(data)

        model = MaxAbsScaler()
        model.fit(data)
        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(model.transform(data), torch_model.transform(data_tensor), rtol=1e-06, atol=1e-06)
Example #28
def plotPCA(X_train, y_train, X_test, y_test, outdir):
    #clf = loadClf(term, fold, clfName)
    #try:
    #    decision = clf.decision_function
    #    Vf = numpy.arange(-1.,1.1,0.1)
    #    V = (0.,)
    #except AttributeError:
    #    decision =  lambda x:clf.predict_proba(x)[:,0]
    #    Vf = numpy.arange(0.,1.05,0.05)
    #    V = (0.5,)
    scaler = MaxAbsScaler(copy=False)
    target_names = ("Positive", "Negative")
    term = outdir.parent.name.replace("_", " ")
    pca = PCA(n_components=2)
    pca.fit(X_train)
    scaler.fit(pca.transform(X_train))
    #delta = 0.025
    #a=numpy.arange(-1., 1., delta)
    #b=numpy.arange(-1., 1., delta)
    #A,B = numpy.meshgrid(a,b)
    #C=numpy.empty(A.shape)
    for X, y, n in ((X_train, y_train, 'training'), (X_test, y_test,
                                                     'testing')):
        X_r = scaler.transform(pca.transform(X))
        inlier = (numpy.abs(X_r[:, 0]) <= 1) & (numpy.abs(X_r[:, 1]) <= 1)
        #print(X_r)
        plt.clf()

        #for k,l in product(range(len(a)),range(len(b))):
        #    C[k][l] = decision(pca.inverse_transform(scaler.inverse_transform(((A[k][l],B[k][l]),))))
        #print(C)
        #cfp = plt.contourf(A,B,C,Vf,cmap=plt.cm.bone)
        #cfp.cmap.set_under('black')
        #cfp.cmap.set_over('white')
        #plt.contour(A,B,C,V,colors=("b",))
        #y=clf.predict(X)
        for c, i, target_name in zip("rg", (0, 1), target_names):
            plt.scatter(
                X_r[(y == i) & inlier, 0],
                X_r[(y == i) & inlier, 1],
                c=c,
                label=target_name,
                marker=",",
                s=1,  #0.8,#1/numpy.sqrt(2),
                #edgecolors='none',
                linewidth=0,
                alpha=0.7)
        plt.legend()
        plt.title('PCA for %s on %s data' % (term, n))
        plt.savefig(str(outdir / ('pca-%s.png' % (n, ))))
        plt.savefig(str(outdir / ('pca-%s.ps' % (n, ))))
Example #29
class CatBoost_BASIC:

    _scaler = MaxAbsScaler()
    _cb_model = 0
    _idxAccepted = 0

    def fit(self, X):

        idx = GetUniqueColumns(X)
        v = GetCSR_X_Variance(X)
        m = v > 0
        an = np.where(m)
        an = an[0]
        # Keep where in both lists
        s = set(idx)
        s = s.intersection(an)
        self._idxAccepted = np.array(list(s))

        self._scaler = MaxAbsScaler()
        self._scaler.fit(X)

    def transform(self, X):
        X = self._scaler.transform(X)
        X = X[:, self._idxAccepted]

        return X

    def __init__(self):
        pass

    def train_with_validation(self, X_train, y_train, X_test, y_test):

        c = CatBoostRegressor(iterations=50,
                              learning_rate=0.05,
                              depth=10,
                              eval_metric='RMSE',
                              random_seed=42,
                              bagging_temperature=0.2,
                              od_type='Iter',
                              metric_period=50,
                              od_wait=20)

        c.fit(np.array(X_train.todense()),
              np.array(y_train),
              eval_set=(np.array(X_test.todense()), np.array(y_test)),
              use_best_model=True,
              verbose=True)
        self._cb_model = c

    def predict(self, X_test):
        return self._cb_model.predict(np.array(X_test.todense()))
Example #31
def test_maxabsscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MaxAbsScaler
    # with sklearn.preprocessing.MaxAbsScaler

    maxabsscalerr = MaxAbsScalerR()
    maxabsscalerr.fit(np.concatenate(trajs))

    maxabsscaler = MaxAbsScaler()
    maxabsscaler.fit(trajs)

    y_ref1 = maxabsscalerr.transform(trajs[0])
    y1 = maxabsscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #33
def scale_data(x_train, x_test):
    """
    We only scale the continuous features. No need to scale binary features.
    Both arrays are modified in place.
    """
    idx_binary = []  # columns with boolean values
    for k in range(x_train.shape[1]):
        # a column counts as binary if casting it to bool leaves the values unchanged
        idx_binary.append(np.array_equal(x_train[:, k], x_train[:, k].astype(bool)))
    idx_cont = np.logical_not(idx_binary)

    sc = MaxAbsScaler()
    sc.fit(x_train[:, idx_cont])

    x_train[:, idx_cont] = sc.transform(x_train[:, idx_cont])
    x_test[:, idx_cont] = sc.transform(x_test[:, idx_cont])

    return
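The binary-column check above relies on a bool round-trip; a small illustration with made-up data:

import numpy as np

x = np.array([[0.0, 1.2, 3.0],
              [1.0, 0.4, 0.0],
              [1.0, 2.5, 7.5]])
for k in range(x.shape[1]):
    print(k, np.array_equal(x[:, k], x[:, k].astype(bool)))
# column 0 holds only 0/1 values, so it equals its bool cast (True);
# columns 1 and 2 contain other values, so the check returns False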
Example #34
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)


result1 = tpot_data.copy()

# Use Scikit-learn's MaxAbsScaler to scale the features
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0:
    scaler = MaxAbsScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result1 = pd.DataFrame(data=scaled_features)
    result1['class'] = tpot_data['class'].values
else:
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(10, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
Example #35
def _max_abs_scaler(column):
    sc = MaxAbsScaler()
    sc.fit(column.reshape(-1, 1))
    new_col = sc.transform(column.reshape(-1, 1))
    return new_col