import math

import sklearn.base
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import MaxAbsScaler


def cv(model, x, y):
    """10-fold CV with per-fold max-abs scaling of x and y; returns fold RMSEs."""
    errors = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Fit the scalers on the training fold only, to avoid data leakage.
        x_scaler = MaxAbsScaler()
        y_scaler = MaxAbsScaler()
        x_scaler.fit(x_train)
        y_scaler.fit(y_train)
        xx_train = x_scaler.transform(x_train)
        xx_test = x_scaler.transform(x_test)
        yy_train = y_scaler.transform(y_train)
        yy_test = y_scaler.transform(y_test)
        # Clone so each fold trains a fresh, unfitted copy of the model.
        cv_model = sklearn.base.clone(model)
        cv_model.fit(xx_train, yy_train)
        yy_predicted = cv_model.predict(xx_test)
        error = math.sqrt(mean_squared_error(yy_test, yy_predicted))
        errors.append(error)
    return errors
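# Hedged usage sketch for cv() above. The data and the Ridge model are
# invented for illustration; note y must be 2-D because cv() max-abs scales
# the target as well as the features.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
x = rng.randn(200, 5)
y = x @ rng.randn(5, 1) + 0.1 * rng.randn(200, 1)
fold_rmses = cv(Ridge(), x, y)
print("mean RMSE: %.4f (+/- %.4f)" % (np.mean(fold_rmses), np.std(fold_rmses)))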
def scale(self, X_train, X_test, type):
    # NOTE: `type` shadows the builtin; the name is kept for compatibility.
    scalers = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
        "MaxScaler": MaxAbsScaler,  # historical key name for MaxAbsScaler
        "RobustScaler": RobustScaler,
    }
    if type not in scalers:
        return None
    scaler = scalers[type]()
    scaler.fit(X_train)  # fit on training data only
    return scaler.transform(X_train), scaler.transform(X_test)
def test_max_abs_scaler():
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X)
    # Assert, rather than discard, the comparison so the test can fail.
    assert np.allclose(X_t, X_t_)
def ml_stratified_cv():
    # Ported from the removed sklearn.cross_validation API to
    # sklearn.model_selection; the old StratifiedKFold(y, n_folds=...)
    # constructor no longer exists.
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import MaxAbsScaler

    scaler = MaxAbsScaler()
    flag_scale = True
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    ytrue, ypred, score = [], [], []
    for itr, its in cv.split(X, y):
        Xtr, ytr = X[itr], y[itr]
        Xts, yts = X[its], y[its]
        if flag_scale:
            # Fit the scaler on the training fold only.
            scaler.fit(Xtr)
            Xtr = scaler.transform(Xtr)
            Xts = scaler.transform(Xts)
        clf.fit(Xtr, ytr)
        ypr = clf.predict(Xts)
        sco = clf.decision_function(Xts)
        ytrue.append(yts)
        ypred.append(ypr)
        score.append(sco)
    ytrue = np.concatenate(ytrue)
    ypred = np.concatenate(ypred)
    score = np.concatenate(score)
    print(tw.clf_results_extended(ytrue, score))
def scikit_clustering(number_of_clusters=3600):
    with open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb') as f:
        user_features = pickle.load(f)
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    # Columns 0, 1, 3, 6 are categorical; the rest are numeric.
    for col in (0, 1, 3, 6):
        df[col] = df[col].astype('category')
    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    print(df.iloc[:, [0]].dtypes[0])
    clustering = AgglomerativeClustering(n_clusters=number_of_clusters,
                                         affinity=gower.gower_matrix,
                                         linkage='complete').fit(df)
    result = clustering.labels_
    # Group the original feature vectors by cluster label.
    clustering_result = {}
    for i in range(len(result)):
        clustering_result.setdefault(result[i], []).append(users_features_vectors[i])
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
def normalize_data(dataframe, mode):
    """Scale `dataframe` with the scaler named by `mode`; return the scaled
    data together with the fitted scaler so it can be inverted later."""
    if mode == 'abs':
        from sklearn.preprocessing import MaxAbsScaler
        scaler = MaxAbsScaler(copy=True)
    elif mode == 'robust':
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler(copy=True)
    elif mode == 'min_max':
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    elif mode == 'std':
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    else:
        raise ValueError("unknown mode: {}".format(mode))
    scaler.fit(dataframe)  # save the fitted scaler for retransform later
    return scaler.transform(dataframe), scaler
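# Hedged usage sketch: the "retransform later" intent above suggests the
# returned scaler is meant for inverse_transform; the data here is made up.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, -4.0, 2.0], "b": [10.0, 0.0, -5.0]})
data_norm, fitted_scaler = normalize_data(df, mode="abs")
recovered = fitted_scaler.inverse_transform(data_norm)  # back to original units
print(np.allclose(recovered, df.values))                # True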
def scaler_dummy(dataset):
    numerical = list(dataset.columns)
    scalers = {
        'mm': MinMaxScaler(),
        'ma': MaxAbsScaler(),
        'sd': StandardScaler(),
        'rb': RobustScaler(),
    }
    results = {}
    for key, scaler in scalers.items():
        # Copy so the four transformed frames do not share (and overwrite)
        # the same underlying data as `dataset`.
        data_transform = dataset.copy()
        scaler.fit(dataset[numerical])
        data_transform[numerical] = scaler.transform(dataset[numerical])
        # get dummies
        results[key] = pd.get_dummies(data_transform)
    return results['mm'], results['ma'], results['sd'], results['rb']
def scikit_clustering_ver2(number_of_clusters=3600):
    with open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb') as f:
        user_features = pickle.load(f)
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    # Columns 0, 1, 3, 6 are categorical; the rest are numeric.
    for col in (0, 1, 3, 6):
        df[col] = df[col].astype('category')
    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    clustering = KMeans(n_clusters=number_of_clusters, verbose=1).fit(df)
    result = clustering.labels_
    logging.info("result: {0}".format(result))
    # Group the original feature vectors by cluster label.
    clustering_result = {}
    for i in range(len(result)):
        clustering_result.setdefault(result[i], []).append(users_features_vectors[i])
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
class MaxAbsScalerPrim(primitive):
    def __init__(self, random_state=0):
        super(MaxAbsScalerPrim, self).__init__(name='MaxAbsScaler')
        self.id = 8
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = ("Scale each feature by its maximum absolute value. "
                            "This estimator scales and translates each feature "
                            "individually such that the maximal absolute value of "
                            "each feature in the training set will be 1.0. It does "
                            "not shift/center the data, and thus does not destroy "
                            "any sparsity. This scaler can also be applied to "
                            "sparse CSR or CSC matrices.")
        self.hyperparams_run = {'default': True}
        self.scaler = MaxAbsScaler()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # TODO: inspect `data` to decide whether scaling is actually needed.
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        # Suffix non-one-hot column names so the transform is traceable.
        for i in range(len(cols)):
            if 'one_hot' not in cols[i]:
                cols[i] = "{}_mxabsscale".format(cols[i])
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']), columns=cols)
        return {0: output}
class ScalingAdder(BaseEstimator, TransformerMixin):
    """Scales all columns except the first; the first column passes through."""

    def _create_scaler(self, scaler):
        if scaler == 'std':
            self._sc = StandardScaler()
        if scaler == 'minmax':
            self._sc = MinMaxScaler()
        if scaler == 'maxabs':
            self._sc = MaxAbsScaler()

    def __init__(self, scaler=None):
        self.scaler = scaler
        self._create_scaler(scaler)

    def set_params(self, scaler=None, **parameters):
        self.scaler = scaler
        self._create_scaler(scaler)
        return self

    def get_params(self, **kwargs):
        return {"scaler": self.scaler}

    def transform(self, X, **transform_params):
        if self.scaler is None:
            return X
        if X.shape[1] > 1:
            return np.hstack((X[:, :1], self._sc.transform(X[:, 1:])))
        # Single-column input: keep the column and pad with a zero column.
        return np.hstack((X[:, :1], np.zeros(shape=(X.shape[0], 1))))

    def fit(self, X, y=None, **fit_params):
        if self.scaler is not None and X.shape[1] > 1:
            self._sc.fit(X[:, 1:], y)
        return self
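# Illustrative only: the point of ScalingAdder's get_params/set_params is to
# make the scaler choice a searchable hyperparameter. The data, pipeline, and
# classifier below are assumptions, not from the original.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X_demo = np.random.RandomState(0).rand(100, 4)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
pipe = Pipeline([("scale", ScalingAdder()), ("clf", LogisticRegression())])
grid = GridSearchCV(pipe, {"scale__scaler": [None, "std", "minmax", "maxabs"]}, cv=3)
grid.fit(X_demo, y_demo)
print(grid.best_params_)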
def scale_data(X, scaler=None):
    print('Performing data scaling...')
    if not scaler:
        # No fitted scaler supplied: fit a fresh one on this data.
        scaler = MaxAbsScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler
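# Sketch of the intended two-call pattern (inferred from the signature of
# scale_data() directly above): fit on the training split, then pass the
# fitted scaler back for the test split. The arrays are invented.
import numpy as np

X_train = np.array([[1.0, -2.0], [3.0, 4.0]])
X_test = np.array([[0.5, 8.0]])
X_train, fitted = scale_data(X_train)          # fits a new MaxAbsScaler
X_test, _ = scale_data(X_test, scaler=fitted)  # reuses the fitted scaler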
def get_data():
    df_train = pd.read_csv('data_original/aps_failure_training_set.csv')
    df_test = pd.read_csv('data_original/aps_failure_test_set.csv')

    # Replace missing-value markers in BOTH sets; otherwise the later
    # astype('float64') fails on the 'na' strings in the training data.
    df_train.replace('na', '-1', inplace=True)
    df_test.replace('na', '-1', inplace=True)

    # Categorical for label: 0: neg, 1: pos
    df_train['class'] = pd.Categorical(df_train['class']).codes
    df_test['class'] = pd.Categorical(df_test['class']).codes

    # Split data into X and y
    Y_train = df_train['class'].copy(deep=True)
    X_train = df_train.copy(deep=True)
    X_train.drop(['class'], inplace=True, axis=1)
    Y_test = df_test['class'].copy(deep=True)
    X_test = df_test.copy(deep=True)
    X_test.drop(['class'], inplace=True, axis=1)

    # Strings to float
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')

    # Scale the dataset; fit on the training set only
    scaler = MaxAbsScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, Y_train, X_test, Y_test
def use_MaxAbsScaler():
    # Uses the largest absolute value per feature: rescales to the [-1, 1] range.
    x = [[1., -1., 5.],
         [2., 0., -5.],
         [0., 1., -10.]]
    scaler = MaxAbsScaler()
    scaler.fit(x)
    print(scaler.transform(x))
class AutoMaxScaler(BaseEstimator, TransformerMixin):
    """Determine non-categorical columns and max-abs scale their values."""

    def __init__(self, ignore_columns: list = [], uniqueness_thresshold: Optional[float] = None):
        """
        Args:
            ignore_columns: Columns to exclude from scaling.
            uniqueness_thresshold: Columns with fewer unique values than this
                are considered categorical and are not scaled.
        """
        self.ignore_columns = ignore_columns
        self.uniqueness_thresshold = uniqueness_thresshold

    def fit(self, X, y=None):
        """Determine which columns to max-abs scale."""
        self.scaler_ = MaxAbsScaler(copy=True)
        self.columns_to_transform_ = get_numerical_columns(
            data_frame=X,
            ignore_columns=self.ignore_columns,
            uniqueness_thresshold=self.uniqueness_thresshold,
        )
        self.scaler_.fit(X[self.columns_to_transform_])
        return self

    def transform(self, X, y=None):
        """Max-abs scale the selected columns and return a copy."""
        data_subframe = X[self.columns_to_transform_]
        X[self.columns_to_transform_] = self.scaler_.transform(data_subframe)
        return X.copy()
class DataFrameScaler(BaseEstimator, TransformerMixin):
    '''Docstring of `DataFrameScaler`

    Scaling a pandas DataFrame.

    Args:
        scaler: Valid values are
            "unit" - Centers to the mean and component-wise scales to unit
                variance;
            "0,1" - Scales data to a given range. Use the extra parameter
                `feature_range=(min, max)` to set the range;
            "-1,1" - Scales data to the range [-1, 1] by dividing through the
                largest maximum absolute value in each feature. It is meant
                for data that is already centered at zero or for sparse data.
            Extra parameters are passed to the sklearn scalers if specified.
        ignore_cols: Columns that will not be scaled. By default, all
            categorical columns are ignored. Specify this parameter to
            ignore numerical columns too.
    '''
    SCALERS = {
        'unit': StandardScaler,
        '0,1': MinMaxScaler,
        '-1,1': MaxAbsScaler,
    }

    def __init__(self, scaler: str = 'unit', ignore_cols: List[str] = [],
                 target_cols: List[str] = [], **kwargs):
        assert scaler in ['unit', '0,1', '-1,1'], \
            'Invalid scaler {}. See help for valid scalers.'.format(scaler)
        self.scaler = scaler
        self.ignore_cols = ignore_cols
        self.target_cols = [] if ignore_cols else target_cols
        self.kwargs = kwargs

    def fit(self, X: pd.DataFrame, y=None):
        if self.scaler == 'unit':
            self.scaler = StandardScaler(**{
                k: self.kwargs.get(k, d)
                for k, d in [('copy', True), ('with_mean', True), ('with_std', True)]
            })
        elif self.scaler == '0,1':
            self.scaler = MinMaxScaler(**{
                k: self.kwargs.get(k, d)
                for k, d in [('feature_range', (0, 1)), ('copy', True)]
            })
        elif self.scaler == '-1,1':
            self.scaler = MaxAbsScaler(**{
                k: self.kwargs.get(k, d) for k, d in [('copy', True)]
            })
        numeric_cols = X.select_dtypes(include=['number']).columns
        if self.ignore_cols:
            self.target_cols = [col for col in numeric_cols
                                if col not in self.ignore_cols]
        elif self.target_cols:
            self.target_cols = [col for col in numeric_cols
                                if col in self.target_cols]
        else:
            self.target_cols = numeric_cols
        self.scaler.fit(X[self.target_cols])
        return self

    def transform(self, X: pd.DataFrame, y=None):
        X = X.copy()
        X[self.target_cols] = self.scaler.transform(X[self.target_cols])
        return X
def ScaleMaxAbs(train_data):
    # Similar to MinMaxScaler: places values in the [-1, 1] range.
    maxAbsScaler = MaxAbsScaler()
    maxAbsScaler.fit(train_data)
    dt = maxAbsScaler.transform(train_data)
    return maxAbsScaler, pd.DataFrame(dt, index=train_data.index,
                                      columns=train_data.columns)
def test_max_abs_scaler_sparse():
    X_sparse = tosparse(X)
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X_sparse)
    # Assert, rather than discard, the comparison so the test can fail.
    assert np.allclose(X_t, X_t_.todense())
def test_max_abs_scaler(self):
    model = MaxAbsScaler()
    data = [[0., 0., 3.],
            [1., 1., 0.],
            [0., 2., 1.],
            [1., 0., 2.]]
    model.fit(data)
    model_onnx = convert_sklearn(model, 'scaler',
                                 [('input', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(numpy.array(data, dtype=numpy.float32), model,
                        basename="SklearnMaxAbsScaler")
def max_abs_scaler_usecase():
    X_train = np.array([[1., -1., 2.],
                        [2., 0., 0.],
                        [0., 1., -1.]])
    scaler = MaxAbsScaler()
    scaler.fit(X_train)
    print(scaler.max_abs_)  # per-feature maximum absolute values
    print(scaler.scale_)    # per-feature divisors (same as max_abs_ here)
    print(scaler.transform(X_train))
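# Quick numeric check of the definition (illustrative, not from the original):
# MaxAbsScaler divides each column by its maximum absolute value, so the
# transform should match X / max(|X|) computed by hand.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X_check = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
manual = X_check / np.abs(X_check).max(axis=0)
print(np.allclose(MaxAbsScaler().fit_transform(X_check), manual))  # True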
class MaxAbsScaler(FeatureTransformAlgorithm):
    r"""Implementation of feature scaling by its maximum absolute value.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html#sklearn.preprocessing.MaxAbsScaler

    See Also:
        * :class:`niaaml.preprocessing.feature_transform.FeatureTransformAlgorithm`
    """
    Name = 'Maximum Absolute Scaler'

    def __init__(self, **kwargs):
        r"""Initialize MaxAbsScaler."""
        super(MaxAbsScaler, self).__init__()
        self.__max_abs_scaler = MAS()

    def fit(self, x, **kwargs):
        r"""Fit implemented transformation algorithm.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to fit transformation algorithm.
        """
        self.__max_abs_scaler.fit(x)

    def transform(self, x, **kwargs):
        r"""Transform the given x data.

        Arguments:
            x (pandas.core.frame.DataFrame): Data to transform.

        Returns:
            pandas.core.frame.DataFrame: Transformed data.
        """
        return self.__max_abs_scaler.transform(x)

    def to_string(self):
        r"""User-friendly representation of the object.

        Returns:
            str: User-friendly representation of the object.
        """
        return FeatureTransformAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__max_abs_scaler.get_params()))
def preproces_avs(path="food-101/images/", shape=(80, 80), batch_size=1000):
    pre = MaxAbsScaler()
    imag_gen = ImageDataGenerator()
    gen = imag_gen.flow_from_directory(path, target_size=shape,
                                       batch_size=batch_size, class_mode=None)
    # Flatten each RGB image to a single row before fitting the scaler.
    flat = shape[0] * shape[1] * 3
    pre.fit(next(gen).reshape((batch_size, flat)))
    return pre
def maxAbsScaler(train_x, test_x):
    scaler = MaxAbsScaler()
    scaler.fit(train_x.data)  # fit on training data only
    train_x.data = scaler.transform(train_x.data)
    if test_x is not None:
        test_x.data = scaler.transform(test_x.data)
    print(pd.DataFrame(train_x.data).describe())
    return train_x.data
def preprocess_per_stamp(self, X, norm_opt):
    """Prepare preprocessors for each stamp."""
    programs = X.index
    # Get stamps
    stamps = self.get_unique_time_stamps(programs)
    # Init storage for preprocessors
    preprocs = {}
    # Iterate through stamps
    for stamp in stamps:
        # Find each program with this stamp (once, even if the stamp
        # appears more than once in the program name)
        progs = []
        for i in range(X.shape[0]):
            program = programs[i]
            if stamp in program.split("-"):
                progs.append(program)
        # Get data for these programs
        x = X.loc[progs].values  # FIXME
        # Initialize and fit the preprocessor for this stamp
        if norm_opt == "max":
            preproc = MaxAbsScaler()
        elif norm_opt == "norm":
            preproc = Normalizer()
        elif norm_opt == "minmax":
            preproc = MinMaxScaler((0, 1))
        elif norm_opt == "standard":
            preproc = StandardScaler()
        elif norm_opt == "robust":
            preproc = RobustScaler()
        else:
            raise ValueError("{} preprocessor not valid.".format(norm_opt))
        preproc.fit(x)
        preprocs[stamp] = preproc
    return preprocs
def ml_train_test_split():
    # sklearn.cross_validation was removed; use sklearn.model_selection.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MaxAbsScaler

    scaler = MaxAbsScaler()
    Xtr, Xts, ytr, yts = train_test_split(X, y, test_size=0.3)
    flag_scale = True
    if flag_scale:
        # Fit on the training split only, then apply to both splits.
        scaler.fit(Xtr)
        Xtr = scaler.transform(Xtr)
        Xts = scaler.transform(Xts)
def max_abs():
    x = [[1, -2, 3, 4, 5],
         [3, 4, -5, 6, 7],
         [1, 7, 2, -6, 2],
         [3, 8, 6, 2, -8]]
    print(x)
    scaler = MaxAbsScaler()
    scaler.fit(x)
    print(scaler.scale_)
    print(scaler.max_abs_)
    print(scaler.transform(x))
def MaxAbs_Scaler(X_train, X_test):
    """
    Max-abs scaling: scale each feature by its maximum absolute value.

    :param X_train: array-like training data;
    :param X_test: array-like test data;
    :return: scaled training data and test data, and the fitted scaler
    """
    scaler = MaxAbsScaler()
    scaler.fit(X_train)  # fit on training data only
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, scaler
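# Minimal usage sketch for MaxAbs_Scaler() above; the arrays are invented.
import numpy as np

X_tr = np.array([[2.0, -4.0], [1.0, 8.0]])
X_te = np.array([[4.0, 2.0]])
X_tr_s, X_te_s, fitted = MaxAbs_Scaler(X_tr, X_te)
print(X_tr_s)          # each column now lies in [-1, 1]
print(fitted.scale_)   # per-column max absolute values: [2., 8.]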
def test_max_abs_scaler_floats(self):
    # Generate a random 2D array with values in [0, 1000)
    np.random.seed(0)
    data = np.random.rand(100, 200) * 1000
    data = np.array(data, dtype=np.float32)
    data_tensor = torch.from_numpy(data)

    model = MaxAbsScaler()
    model.fit(data)
    torch_model = hummingbird.ml.convert(model, "torch")
    self.assertIsNotNone(torch_model)
    np.testing.assert_allclose(model.transform(data),
                               torch_model.transform(data_tensor),
                               rtol=1e-06, atol=1e-06)
def plotPCA(X_train, y_train, X_test, y_test, outdir):
    #clf = loadClf(term, fold, clfName)
    #try:
    #    decision = clf.decision_function
    #    Vf = numpy.arange(-1., 1.1, 0.1)
    #    V = (0.,)
    #except AttributeError:
    #    decision = lambda x: clf.predict_proba(x)[:, 0]
    #    Vf = numpy.arange(0., 1.05, 0.05)
    #    V = (0.5,)
    scaler = MaxAbsScaler(copy=False)
    target_names = ("Positive", "Negative")
    term = outdir.parent.name.replace("_", " ")
    pca = PCA(n_components=2)
    pca.fit(X_train)
    scaler.fit(pca.transform(X_train))
    #delta = 0.025
    #a = numpy.arange(-1., 1., delta)
    #b = numpy.arange(-1., 1., delta)
    #A, B = numpy.meshgrid(a, b)
    #C = numpy.empty(A.shape)
    for X, y, n in ((X_train, y_train, 'training'), (X_test, y_test, 'testing')):
        X_r = scaler.transform(pca.transform(X))
        inlier = (numpy.abs(X_r[:, 0]) <= 1) & (numpy.abs(X_r[:, 1]) <= 1)
        plt.clf()
        #for k, l in product(range(len(a)), range(len(b))):
        #    C[k][l] = decision(pca.inverse_transform(scaler.inverse_transform(((A[k][l], B[k][l]),))))
        #cfp = plt.contourf(A, B, C, Vf, cmap=plt.cm.bone)
        #cfp.cmap.set_under('black')
        #cfp.cmap.set_over('white')
        #plt.contour(A, B, C, V, colors=("b",))
        #y = clf.predict(X)
        for c, i, target_name in zip("rg", (0, 1), target_names):
            plt.scatter(X_r[(y == i) & inlier, 0],
                        X_r[(y == i) & inlier, 1],
                        c=c, label=target_name, marker=",", s=1,
                        linewidth=0, alpha=0.7)
        plt.legend()
        plt.title('PCA for %s on %s data' % (term, n))
        plt.savefig(str(outdir / ('pca-%s.png' % (n,))))
        plt.savefig(str(outdir / ('pca-%s.ps' % (n,))))
class CatBoost_BASIC:
    def __init__(self):
        self._scaler = MaxAbsScaler()
        self._cb_model = None
        self._idxAccepted = None

    def fit(self, X):
        # Keep only columns that are unique AND have non-zero variance.
        idx = GetUniqueColumns(X)
        v = GetCSR_X_Variance(X)
        an = np.where(v > 0)[0]
        s = set(idx).intersection(an)
        self._idxAccepted = np.array(list(s))
        self._scaler = MaxAbsScaler()
        self._scaler.fit(X)

    def transform(self, X):
        X = self._scaler.transform(X)
        return X[:, self._idxAccepted]

    def train_with_validation(self, X_train, y_train, X_test, y_test):
        c = CatBoostRegressor(iterations=50, learning_rate=0.05, depth=10,
                              eval_metric='RMSE', random_seed=42,
                              bagging_temperature=0.2, od_type='Iter',
                              metric_period=50, od_wait=20)
        c.fit(np.array(X_train.todense()), np.array(y_train),
              eval_set=(np.array(X_test.todense()), np.array(y_test)),
              use_best_model=True, verbose=True)
        self._cb_model = c

    def predict(self, X_test):
        return self._cb_model.predict(np.array(X_test.todense()))
def test_maxabsscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MaxAbsScaler
    # with sklearn.preprocessing.MaxAbsScaler
    maxabsscalerr = MaxAbsScalerR()
    maxabsscalerr.fit(np.concatenate(trajs))
    maxabsscaler = MaxAbsScaler()
    maxabsscaler.fit(trajs)
    y_ref1 = maxabsscalerr.transform(trajs[0])
    y1 = maxabsscaler.transform(trajs)[0]
    np.testing.assert_array_almost_equal(y_ref1, y1)
def scale_data(x_train, x_test):
    """
    We only scale the continuous features; binary features are left as-is.
    Both arrays are modified in place.
    """
    idx_binary = []  # columns with boolean values
    for k in range(x_train.shape[1]):
        # A column is binary if casting it to bool leaves its values unchanged.
        idx_binary.append(np.array_equal(x_train[:, k], x_train[:, k].astype(bool)))
    idx_cont = np.logical_not(idx_binary)
    sc = MaxAbsScaler()
    sc.fit(x_train[:, idx_cont])  # fit on training data only
    x_train[:, idx_cont] = sc.transform(x_train[:, idx_cont])
    x_test[:, idx_cont] = sc.transform(x_test[:, idx_cont])
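# Hedged demo for the scale_data() variant directly above: a float array whose
# middle column is 0/1-valued, so only the outer columns get rescaled. The
# arrays are invented for illustration.
import numpy as np

x_tr = np.array([[10.0, 1.0, -5.0], [4.0, 0.0, 2.0]])
x_te = np.array([[2.0, 1.0, 1.0]])
scale_data(x_tr, x_te)  # modifies both arrays in place
print(x_tr)             # the binary middle column is untouched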
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # cross_validation was removed
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index,
                                                     stratify=tpot_data['class'].values,
                                                     train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Use Scikit-learn's MaxAbsScaler to scale the features
training_features = result1.loc[training_indices].drop('class', axis=1)
if len(training_features.columns.values) > 0:
    scaler = MaxAbsScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result1 = pd.DataFrame(data=scaled_features)
    # Reattach the label from the original frame; the rebuilt DataFrame
    # has no 'class' column of its own.
    result1['class'] = tpot_data['class'].values
else:
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(10, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values,
          result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
def _max_abs_scaler(column):
    sc = MaxAbsScaler()
    sc.fit(column.reshape(-1, 1))
    new_col = sc.transform(column.reshape(-1, 1))
    return new_col
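# Example call (assumed context): _max_abs_scaler expects a 1-D numpy array
# and returns it as a scaled column vector.
import numpy as np

col = np.array([3.0, -6.0, 1.5])
print(_max_abs_scaler(col))  # column divided by 6.0, shape (3, 1)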