def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X = np.ones(5)
    assert_array_equal(scale(X, with_mean=False), X)
def boston_DBSCAN(class_num=0):
    '''Return cluster `class_num` of the Boston housing dataset as standardized columns.

    :parameter class_num: cluster label (the cluster id returned by DBSCAN) to select
    :returns x_boston: features of cluster `class_num`, standardized, 13 columns
    :returns y_boston: target of cluster `class_num`, standardized, 1 column
    '''
    # Load the full dataset
    bostondata = load_boston()
    boston_X = bostondata.data
    boston_y = bostondata.target
    boston_full = np.c_[boston_X, boston_y]

    # Standardize the full dataset
    scale = StandardScaler()
    boston_full = scale.fit_transform(boston_full)

    # Reduce to 3 dimensions to make visual parameter tuning easier
    pca = PCA(n_components=3)
    boston_full3 = pca.fit_transform(boston_full)

    # Cluster
    clt = DBSCAN(eps=0.8, min_samples=5, n_jobs=4)
    label3 = clt.fit_predict(X=boston_full3)

    # Select the requested cluster: all 13 feature columns, then the target column
    group_boston = boston_full[label3 == class_num]
    x_boston = group_boston[:, :-1]
    y_boston = group_boston[:, -1]
    return x_boston, y_boston
def normalize_features(self, scaler: StandardScaler = None) -> StandardScaler:
    '''
    Normalizes the features of the dataset using a StandardScaler
    (subtract mean, divide by standard deviation).

    If a scaler is provided, uses that scaler to perform the normalization.
    Otherwise fits a scaler to the features in the dataset and then performs
    the normalization.

    :param scaler: A fitted StandardScaler. Used if provided. Otherwise a
        StandardScaler is fit on this dataset and is then used.
    :return: A fitted StandardScaler. If a scaler is provided, this is the
        same scaler. Otherwise, this is a scaler fit on this dataset.
    '''
    if not self.data or not self.data[0].features:
        return None

    if not scaler:
        scaler = StandardScaler()
        features = np.vstack([d.features for d in self.data])
        scaler.fit(features)

    for d in self.data:
        d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

    return scaler
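# Hedged, minimal sketch of the fit-once / reuse pattern that normalize_features
# implements, written against sklearn's StandardScaler directly so it runs
# standalone; the array names are illustrative and not part of the original
# dataset class.
import numpy as np
from sklearn.preprocessing import StandardScaler

train_features = np.random.randn(100, 8)
val_features = np.random.randn(20, 8)

feature_scaler = StandardScaler().fit(train_features)   # fit on training data only
train_scaled = feature_scaler.transform(train_features)
val_scaled = feature_scaler.transform(val_features)     # reuse the same statistics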
def fit(self, data, args):
    self.model = StandardScaler()

    with Timer() as t:
        self.model.fit(data.X_train, data.y_train)

    return t.interval
def read_file():
    file_content = pd.read_csv('train.csv')
    exc_cols = [u'Id', u'Response']
    cols = [c for c in file_content.columns if c not in exc_cols]
    train_datas = file_content.loc[:, cols]
    train_lables = file_content['Response'].values

    test_file = pd.read_csv('test.csv')
    test_ids = test_file['Id'].values
    test_datas = test_file.loc[:, [c for c in test_file.columns if c not in [u'Id']]]

    # Fill missing values with -1
    test_datas = test_datas.fillna(-1)
    train_datas = train_datas.fillna(-1)
    all_datas = pd.concat([train_datas, test_datas], axis=0)

    # Split the columns into categorical and numeric groups
    categoricalVariables = [
        "Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5",
        "Product_Info_6", "Product_Info_7", "Employment_Info_2", "Employment_Info_3",
        "Employment_Info_5", "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3",
        "InsuredInfo_4", "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7",
        "Insurance_History_1", "Insurance_History_2", "Insurance_History_3",
        "Insurance_History_4", "Insurance_History_7", "Insurance_History_8",
        "Insurance_History_9", "Family_Hist_1", "Medical_History_2",
        "Medical_History_3", "Medical_History_4", "Medical_History_5",
        "Medical_History_6", "Medical_History_7", "Medical_History_8",
        "Medical_History_9", "Medical_History_10", "Medical_History_11",
        "Medical_History_12", "Medical_History_13", "Medical_History_14",
        "Medical_History_16", "Medical_History_17", "Medical_History_18",
        "Medical_History_19", "Medical_History_20", "Medical_History_21",
        "Medical_History_22", "Medical_History_23", "Medical_History_25",
        "Medical_History_26", "Medical_History_27", "Medical_History_28",
        "Medical_History_29", "Medical_History_30", "Medical_History_31",
        "Medical_History_33", "Medical_History_34", "Medical_History_35",
        "Medical_History_36", "Medical_History_37", "Medical_History_38",
        "Medical_History_39", "Medical_History_40", "Medical_History_41"]
    all_file_data = all_datas.loc[:, [c for c in all_datas.columns if c not in categoricalVariables]]
    all_file_cate = all_datas.loc[:, [c for c in categoricalVariables]]

    # Standardize the numeric columns (assign the result back; fit_transform
    # returns a new array, so discarding it would leave the data unscaled)
    scalar_this = StandardScaler()
    all_file_data[:] = scalar_this.fit_transform(all_file_data)

    # Recombine the data
    train_datas = pd.concat([all_file_data[:train_datas.shape[0]],
                             all_file_cate[:train_datas.shape[0]]], axis=1)
    test_datas = pd.concat([all_file_data[file_content.shape[0]:],
                            all_file_cate[file_content.shape[0]:]], axis=1)

    # Vectorize
    train_datas = DictVectorizer().fit_transform(train_datas.to_dict(orient='records')).toarray()
    test_datas = DictVectorizer().fit_transform(test_datas.to_dict(orient='records')).toarray()

    return (train_datas, train_lables, test_ids, test_datas)
def load_UCI_Credit_Card_data(infile=None, balanced=True, seed=5):
    X = []
    y = []
    sids = []

    with open(infile, "r") as fi:
        fi.readline()
        reader = csv.reader(fi)
        for row in reader:
            sids.append(row[0])
            X.append(row[1:-1])
            y0 = int(row[-1])
            if y0 == 0:
                y0 = -1
            y.append(y0)
    y = np.array(y)

    if balanced:
        X, y = balance_X_y(X, y, seed)

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    encoder = OneHotEncoder(categorical_features=[1, 2, 3])
    encoder.fit(X)
    X = encoder.transform(X).toarray()

    X, y = shuffle_X_y(X, y, seed)

    scale_model = StandardScaler()
    X = scale_model.fit_transform(X)

    return X, np.expand_dims(y, axis=1)
def prepare_time_data(data):
    data_scaler = StandardScaler()
    data_concat = np.concatenate(data, axis=0)
    data_scaler.fit(data_concat)
    new_data = [data_scaler.transform(data_) for data_ in data]
    return data_scaler, new_data
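# Hedged usage sketch for prepare_time_data: variable-length (timesteps,
# features) sequences are standardized with statistics pooled across all
# sequences. The synthetic arrays below are illustrative only.
import numpy as np

sequences = [np.random.randn(50, 3), np.random.randn(80, 3)]
scaler, scaled_sequences = prepare_time_data(sequences)
print(scaler.mean_, [s.shape for s in scaled_sequences])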
def get_data(args, logger, debug):
    '''Get data.'''
    # Get data:
    train_data, val_data, test_data = _get_data(args, logger)
    debug(f'train size = {len(train_data):,} | val size = {len(val_data):,} |'
          f' test size = {len(test_data):,}')

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(args.data_df)
        debug('Class sizes')
        debug(class_sizes)

    # Scale features:
    if args.features_scaling:
        features_scaler = train_data.normalize_features()
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    # Initialise scaler and scale training targets by subtracting the mean and
    # dividing by the standard deviation (regression only):
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler()
        targets = scaler.fit_transform(train_data.targets())
        train_data.set_targets(targets)
    else:
        scaler = None

    return train_data, val_data, test_data, scaler, features_scaler
def _create_scaler(self, positivity):
    self.scaler_positivity = positivity
    if positivity is True:
        eps = 1e-9
        self._scaler = MinMaxScaler(feature_range=(eps, 1))
    else:
        self._scaler = StandardScaler()
    self.scaler_is_fitted = False
def __stdScaler(self):
    all_cols = list(self.data_df.columns.values)
    for col in all_cols:
        if col not in self.non_numeric_cols and col != 'time_to_failure':
            stdScaler = StandardScaler()
            stdScaler.fit(self.data_df[[col]])
            self.data_df[col] = stdScaler.transform(self.data_df[[col]])
    print('Standard Scaler applied ... ')
def imputeAndScale(X_train, X_test):
    imp = Imputer()
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)
    scaler = StandardScaler().fit(X_train)
    X_test = scaler.transform(X_test)
    X_train = scaler.transform(X_train)
    return X_train, X_test
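# Hedged usage sketch for imputeAndScale with synthetic data containing missing
# values; assumes the legacy `from sklearn.preprocessing import Imputer` that
# the function above relies on is available.
import numpy as np

X_train_demo = np.array([[1.0, np.nan], [2.0, 3.0], [np.nan, 4.0]])
X_test_demo = np.array([[1.5, np.nan]])
X_train_demo, X_test_demo = imputeAndScale(X_train_demo, X_test_demo)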
def test_scalar():
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    scalar = StandardScaler()
    training = pd.read_csv(TRAIN_FEATURES_CSV, nrows=200000)
    test = pd.read_csv(TEST_FEATURES_CSV)

    # normalize the values column by column; StandardScaler expects 2D input,
    # so each column is passed as a single-column frame and flattened back
    for column in TOTAL_TRAINING_FEATURE_COLUMNS:
        training[column] = scalar.fit_transform(training[[column]]).ravel()
        test[column] = scalar.transform(test[[column]]).ravel()
def make_models(X, y, y_bin): return dict(ols=LinearRegression().fit(X, y), lr_bin=LogisticRegression().fit(X, y_bin), lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y), lr_mn=LogisticRegression(solver='lbfgs', multi_class='multinomial').fit(X, y), svc=SVC(kernel='linear').fit(X, y_bin), svr=SVR(kernel='linear').fit(X, y), dtc=DecisionTreeClassifier(max_depth=4).fit(X, y), dtr=DecisionTreeRegressor(max_depth=4).fit(X, y), rfc=RandomForestClassifier(n_estimators=3, max_depth=3, random_state=1).fit(X, y), rfr=RandomForestRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X, y), gbc=GradientBoostingClassifier(n_estimators=3, max_depth=3, random_state=1).fit(X, y), gbr=GradientBoostingRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X, y), abc=AdaBoostClassifier(algorithm='SAMME', n_estimators=3, random_state=1).fit(X, y), abc2=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3, random_state=1).fit(X, y), abc3=AdaBoostClassifier(algorithm='SAMME', n_estimators=3, random_state=1).fit(X, y_bin), abc4=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3, random_state=1).fit(X, y_bin), km=KMeans(1).fit(X), km2=KMeans(5).fit(X), pc1=PCA(1).fit(X), pc2=PCA(2).fit(X), pc3=PCA(2, whiten=True).fit(X), mlr1=MLPRegressor([2], 'relu').fit(X, y), mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y), mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y), mlc=MLPClassifier([2, 2], 'tanh').fit(X, y), mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin), bin=Binarizer(0.5), mms=MinMaxScaler().fit(X), mas=MaxAbsScaler().fit(X), ss1=StandardScaler().fit(X), ss2=StandardScaler(with_mean=False).fit(X), ss3=StandardScaler(with_std=False).fit(X), n1=Normalizer('l1'), n2=Normalizer('l2'), n3=Normalizer('max'))
def retrieve_data(undersampling=False, ratio=1, random_state=None): ## Getting and reading csv-data files into a pandas dataframe path = os.path.dirname(os.path.realpath(__file__)) file1 = path + "/../data/creditcard_part1.csv" file2 = path + "/../data/creditcard_part2.csv" df1 = pd.read_csv(file1) df2 = pd.read_csv(file2) df = pd.concat((df1, df2), ignore_index=True) ## Finding the class balances class_counts = df.Class.value_counts() num_fraudulent = class_counts[1] num_non_fraudulent = class_counts[0] ## Splitting the dataset into design matrix X and targets y X = df.loc[:, df.columns != 'Class'].values y = df.loc[:, df.columns == 'Class'].values.ravel() #### StandardScaler is more useful for classification, and Normalizer is more useful for regression. standard_scaler = StandardScaler() X = standard_scaler.fit_transform(X) ### Undersampling to fix imbalanced class if undersampling: if random_state is not None: np.random.seed(random_state) if ratio > 1: raise ValueError("Undersampling ratio can't be larger than one") multiplier = int(1.0 / ratio) ## Randomized undersampling method indices_nonfraud = np.where(y == 0)[0] indices_fraud = np.where(y == 1)[0] np.random.shuffle(indices_nonfraud) indices_nonfraud_under = indices_nonfraud[:multiplier * num_fraudulent] indices_under = np.concatenate((indices_fraud, indices_nonfraud_under)) np.random.shuffle(indices_under) ## Using indices from undersampling method to create new balanced dataset X_under = X[indices_under] y_under = y[indices_under] ## Splitting the dataset into test and training sets X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.33, random_state=4) return X_train, X_test, y_train, y_test
def NUS_WIDE_load_two_party_data(data_dir, selected_labels, neg_label=-1, n_samples=-1): print("# load_two_party_data") Xa, Xb, y = get_labeled_data_with_2_party(data_dir=data_dir, selected_labels=selected_labels, n_samples=n_samples) scale_model = StandardScaler() Xa = scale_model.fit_transform(Xa) Xb = scale_model.fit_transform(Xb) y_ = [] pos_count = 0 neg_count = 0 for i in range(y.shape[0]): # the first label in y as the first class while the other labels as the second class if y[i, 0] == 1: y_.append(1) pos_count += 1 else: y_.append(neg_label) neg_count += 1 print("pos counts:", pos_count) print("neg counts:", neg_count) y = np.expand_dims(y_, axis=1) print("Xa shape:", Xa.shape) print("Xb shape:", Xb.shape) print("y shape:", y.shape) n_train = int(0.8 * Xa.shape[0]) print("# of train samples:", n_train) # print("# of test samples:", n_test) Xa_train, Xb_train = Xa[:n_train], Xb[:n_train] Xa_test, Xb_test = Xa[n_train:], Xb[n_train:] y_train, y_test = y[:n_train], y[n_train:] print("Xa_train.shape:", Xa_train.shape) print("Xb_train.shape:", Xb_train.shape) print("Xa_test.shape:", Xa_test.shape) print("Xb_test.shape:", Xb_test.shape) print("y_train.shape:", y_train.shape) print("y_test.shape:", y_test.shape) return [Xa_train, Xb_train, y_train], [Xa_test, Xb_test, y_test]
def test_scaler_without_copy():
    """Check that StandardScaler.fit does not change input"""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    X_csr = sparse.csr_matrix(X)

    X_copy = X.copy()
    StandardScaler(copy=False).fit(X)
    assert_array_equal(X, X_copy)

    X_csr_copy = X_csr.copy()
    StandardScaler(with_mean=False, copy=False).fit(X_csr)
    assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sparse.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = StandardScaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
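# Minimal sketch of the behaviour the test above relies on: centering would
# densify a sparse matrix, so StandardScaler only accepts sparse input with
# with_mean=False; with_mean=True raises ValueError.
import numpy as np
from scipy import sparse
from sklearn.preprocessing import StandardScaler

X_csr_demo = sparse.csr_matrix(np.random.RandomState(0).randn(4, 5))
X_demo_scaled = StandardScaler(with_mean=False).fit_transform(X_csr_demo)  # ok
# StandardScaler(with_mean=True).fit(X_csr_demo)  # would raise ValueError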
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if (y is not None):
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def make_model(classifier, **params):
    pipeline = Pipeline([
        ('feature_extractor', FeatureExtractor()),
        ('scaler', StandardScaler()),
        ('model', classifier(**params)),
    ])
    return pipeline
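# Hedged usage sketch for make_model; FeatureExtractor is the project's own
# transformer, so this only shows how the factory is meant to be called.
from sklearn.linear_model import LogisticRegression

model = make_model(LogisticRegression, C=1.0)
# model.fit(raw_samples, labels); predictions = model.predict(raw_samples)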
def __init__(self, copy=True, with_mean=True, with_std=True):
    self._hyperparams = {
        'copy': copy,
        'with_mean': with_mean,
        'with_std': with_std
    }
    self._wrapped_model = Op(**self._hyperparams)
def fit(self, x_train, y_train):
    self.processing_steps = [StandardScaler()]
    svr = SVR(kernel='rbf', gamma=0.1)
    # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
    # C = [2**i for i in np.arange(start=-5, stop=16, step=2)]
    # gamma = [2**i for i in np.arange(start=-15, stop=4, step=2)]
    # https://stats.stackexchange.com/questions/43943/
    # which-search-range-for-determining-svm-optimal-c-
    # and-gamma-parameters
    C = [2**i for i in [-3, -2, -1, 0, 1, 2, 3, 4, 5]]
    gamma = [2**i for i in [-5, -4, -3, -2, -1, 0, 1, 2, 3]]
    params = {"C": sp_uniform(0.125, 32),
              "gamma": sp_uniform(0.03125, 8)}
    params.update(self.kwargs)
    reg = RandomizedSearchCV(estimator=svr,
                             param_distributions=params,
                             n_iter=10,
                             scoring=self.score['function'],
                             cv=3,
                             iid=True)
    clf = MultiOutputRegressor(reg)
    self._update_pipeline_and_fit(x_train, y_train, [clf])
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in (StandardScaler(), Normalizer(), Binarizer()):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
def fit(self, x_train, y_train):
    self.processing_steps = [StandardScaler()]
    ann = MLPRegressor()
    params = {
        'hidden_layer_sizes': sp_randint(20, 150),
        'alpha': sp_uniform(0, 100),
        'max_iter': sp_randint(100, 2000),
        'solver': ['lbfgs'],
        # 'identity', 'logistic', 'tanh', 'relu'
        'activation': ['relu']
    }
    if 'hidden_layer_sizes' in self.kwargs:
        self.kwargs['hidden_layer_sizes'] = self.parsefunction(
            self.kwargs['hidden_layer_sizes'])
    params.update(self.kwargs)
    clf = RandomizedSearchCV(estimator=ann,
                             param_distributions=params,
                             n_iter=10,
                             scoring=self.score['function'],
                             cv=3,
                             iid=True)
    self._update_pipeline_and_fit(x_train, y_train, [clf])
def main(): args = parse() n_rollout = args.nrollout n_epoch = args.epoch savename = args.savename if args.savename is not None else 'model-' + str( n_rollout) + 'unroll' np.random.seed(1098) path = args.filename names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort'] with h5py.File(path, 'r') as f: (target_pos, target_speed, pos, vel, effort) = [[np.array(val) for val in f[name].values()] for name in names] x_target = np.array(target_pos) x_first = np.array([pos_[0] for pos_ in pos]) x_speed = np.array(target_speed).reshape((-1, 1)) aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort] x = np.concatenate((x_target, x_first, x_speed), axis=1) input_scaler = StandardScaler() x = input_scaler.fit_transform(x) output_scaler = StandardScaler() effort_concat = np.concatenate([a for a in effort], axis=0) output_scaler.fit(effort_concat) effort = [output_scaler.transform(eff) for eff in effort] y = pad_sequences(effort, padding='post', value=0.) aux_output = pad_sequences(aux_output, padding='post', value=0.) x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x, y, aux_output, test_size=0.2) y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)] y_aux_mask, y_aux_test_mask = [ np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test) ] model = MyModel(train=[x, [y, y_aux]], val=[x_test, [y_test, y_aux_test]], train_mask=[y_mask, y_aux_mask], val_mask=[y_test_mask, y_aux_test_mask], max_unroll=n_rollout, name=savename) if not os.path.exists('save'): os.makedirs('save') if args.train: model.fit(nb_epoch=n_epoch, batch_size=32) elif args.resume: model.resume(nb_epoch=n_epoch, batch_size=32)
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[3],
                          trainers=[dict(algo='rmsprop', **impatient)]),
        **regressor_params)
    check_regression(
        TheanetsRegressor(scaler=StandardScaler(),
                          trainers=[dict(algo='rmsprop', **impatient)]),
        **regressor_params)
def load_scalers(path: str) -> Tuple[StandardScaler, StandardScaler]:
    '''
    Loads the scalers a model was trained with.

    :param path: Path where model checkpoint is saved.
    :return: A tuple with the data scaler and the features scaler.
    '''
    state = torch.load(path, map_location=lambda storage, loc: storage)

    scaler = StandardScaler(state['data_scaler']['means'],
                            state['data_scaler']['stds']) \
        if state['data_scaler'] else None
    features_scaler = StandardScaler(state['features_scaler']['means'],
                                     state['features_scaler']['stds']) \
        if state['features_scaler'] else None

    return scaler, features_scaler
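# Hedged usage sketch for load_scalers; the checkpoint path is hypothetical,
# and StandardScaler here is the project's own class reconstructed from the
# stored means/stds (the attribute names below are assumptions, not sklearn's).
scaler, features_scaler = load_scalers('checkpoints/model.pt')
if features_scaler is not None:
    print(features_scaler.means, features_scaler.stds)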
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[20],
                          trainers=[{'optimize': 'rmsprop',
                                     'min_improvement': 0.1}]),
        **regressor_params)
    check_regression(TheanetsRegressor(scaler=StandardScaler()),
                     **regressor_params)
class StandardScalerImpl():

    def __init__(self, copy=True, with_mean=True, with_std=True):
        self._hyperparams = {
            'copy': copy,
            'with_mean': with_mean,
            'with_std': with_std
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
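# Hedged usage sketch for the StandardScalerImpl wrapper above, assuming Op is
# (or behaves like) sklearn's StandardScaler, as the hyperparameter names suggest.
import numpy as np

X_demo = np.random.randn(10, 4)
X_demo_scaled = StandardScalerImpl(with_mean=True, with_std=True).fit(X_demo).transform(X_demo)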
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[3],
                          trainers=[{'algo': 'rmsprop',
                                     'learning_rate': 0.1}]),
        **regressor_params)
    check_regression(TheanetsRegressor(scaler=StandardScaler()),
                     **regressor_params)
def test_warning_scaling_integers():
    # Check warning when scaling integer data
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "Data with input dtype uint8 was converted to float64"

    clean_warning_registry()
    assert_warns_message(DataConversionWarning, w, scale, X)
    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "assumes floating point values as input, got uint8"

    clean_warning_registry()
    assert_warns_message(UserWarning, w, scale, X)
    assert_warns_message(UserWarning, w, StandardScaler().fit, X)
    assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, StandardScaler().fit, X)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, MinMaxScaler().fit, X)
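# Minimal sketch of what these tests cover: scaling integer input converts it
# to float64 (and, in the sklearn versions these tests target, emits a
# conversion warning).
import numpy as np
from sklearn.preprocessing import StandardScaler

X_int = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)
print(StandardScaler().fit_transform(X_int).dtype)  # float64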
def test_center_kernel():
    """Test that KernelCenterer is equivalent to StandardScaler
    in feature space"""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = rng.random_sample((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
def train(self, dataset_filename, scale=True, feature_selector=None, feature_selection_params={}, feature_selection_threshold=.25, learning_params={}, optimize=True, optimization_params={}, scorers=['f1_score'], attribute_set=None, class_name=None, metaresults_prefix="./0-", **kwargs): plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf") data, labels = dataset_to_instances(dataset_filename, attribute_set, class_name, **kwargs) learner = self.learner #the class must remember the attribute_set and the class_name in order to reproduce the vectors self.attribute_set = attribute_set self.class_name = class_name #scale data to the mean if scale: log.info("Scaling datasets...") log.debug("Data shape before scaling: {}".format(data.shape)) self.scaler = StandardScaler() data = self.scaler.fit_transform(data) log.debug("Data shape after scaling: {}".format(data.shape)) log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, self.scaler.std_)) #avoid any NaNs and Infs that may have occurred due to the scaling data = np.nan_to_num(data) #feature selection if isinstance(feature_selection_params, basestring): feature_selection_params = eval(feature_selection_params) self.featureselector, data, metadata = self.run_feature_selection(data, labels, feature_selector, feature_selection_params, feature_selection_threshold, plot_filename) #initialize learning method and scoring functions and optimize self.learner, self.scorers = self.initialize_learning_method(learner, data, labels, learning_params, optimize, optimization_params, scorers) log.info("Data shape before fitting: {}".format(data.shape)) self.learner.fit(data, labels) self.fit = True return metadata
def test_scaler_2d_arrays(): """Test scaling of 2d array along first axis""" rng = np.random.RandomState(0) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has been copied assert_true(X_scaled is not X) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_scaled = scale(X, axis=1, with_std=False) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) X_scaled = scale(X, axis=1, with_std=True) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0]) # Check that the data hasn't been modified assert_true(X_scaled is not X) X_scaled = scaler.fit(X).transform(X, copy=False) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert_true(X_scaled is X) X = rng.randn(4, 5) X[:, 0] = 1.0 # first feature is a constant, non zero feature scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert_true(X_scaled is not X)
def test_scaler_without_centering(): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) assert_raises(ValueError, StandardScaler().fit, X_csr) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert_false(np.any(np.isnan(X_csr_scaled.data))) scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csr.transform(X_csc, copy=True) assert_false(np.any(np.isnan(X_csc_scaled.data))) assert_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.std_, scaler_csr.std_) assert_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.std_, scaler_csc.std_) assert_array_almost_equal( X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert_true(X_scaled is not X) assert_true(X_csr_scaled is not X_csr) X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert_true(X_csr_scaled_back is not X_csr) assert_true(X_csr_scaled_back is not X_csr_scaled) assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert_true(X_csc_scaled_back is not X_csc) assert_true(X_csc_scaled_back is not X_csc_scaled) assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
class SkRanker(Ranker, SkLearner): ''' Basic ranker wrapping scikit-learn functions ''' def train(self, dataset_filename, scale=True, feature_selector=None, feature_selection_params={}, feature_selection_threshold=.25, learning_params={}, optimize=True, optimization_params={}, scorers=['f1_score'], attribute_set=None, class_name=None, metaresults_prefix="./0-", **kwargs): plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf") data, labels = dataset_to_instances(dataset_filename, attribute_set, class_name, **kwargs) learner = self.learner #the class must remember the attribute_set and the class_name in order to reproduce the vectors self.attribute_set = attribute_set self.class_name = class_name #scale data to the mean if scale: log.info("Scaling datasets...") log.debug("Data shape before scaling: {}".format(data.shape)) self.scaler = StandardScaler() data = self.scaler.fit_transform(data) log.debug("Data shape after scaling: {}".format(data.shape)) log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, self.scaler.std_)) #avoid any NaNs and Infs that may have occurred due to the scaling data = np.nan_to_num(data) #feature selection if isinstance(feature_selection_params, basestring): feature_selection_params = eval(feature_selection_params) self.featureselector, data, metadata = self.run_feature_selection(data, labels, feature_selector, feature_selection_params, feature_selection_threshold, plot_filename) #initialize learning method and scoring functions and optimize self.learner, self.scorers = self.initialize_learning_method(learner, data, labels, learning_params, optimize, optimization_params, scorers) log.info("Data shape before fitting: {}".format(data.shape)) self.learner.fit(data, labels) self.fit = True return metadata def get_model_description(self): params = {} if self.scaler: params = self.scaler.get_params(deep=True) try: #these are for SVC if self.learner.kernel == "rbf": params["gamma"] = self.learner.gamma params["C"] = self.learner.C for i, n_support in enumerate(self.learner.n_support_): params["n_{}".format(i)] = n_support log.debug(len(self.learner.dual_coef_)) return params elif self.learner.kernel == "linear": coefficients = self.learner.coef_ att_coefficients = {} for attname, coeff in zip(self.attribute_set.get_names_pairwise(), coefficients[0]): att_coefficients[attname] = coeff return att_coefficients except AttributeError: pass try: #adaboost etc params = self.learner.get_params() numeric_params = OrderedDict() for key, value in params.iteritems(): try: value = float(value) except ValueError: continue numeric_params[key] = value return numeric_params except: pass return {} def get_ranked_sentence(self, parallelsentence, critical_attribute="rank_predicted", new_rank_name="rank_hard", del_orig_class_att=False, bidirectional_pairs=False, ties=True, reconstruct='hard'): """ """ if type(self.learner) == str: if self.classifier: self.learner = self.classifier # this is to provide backwards compatibility for old models # whose classes used differeent attribute names try: self.learner._dual_coef_ = self.learner.dual_coef_ self.learner._intercept_ = self.learner.intercept_ except AttributeError: # it's ok if the model doesn't have these variables pass try: # backwards compatibility for old LogisticRegression try_classes = self.learner.classes_ except AttributeError: self.learner.classes_ = [-1, 1] #de-compose multiranked sentence into pairwise comparisons pairwise_parallelsentences = 
parallelsentence.get_pairwise_parallelsentences(bidirectional_pairs=bidirectional_pairs, class_name=self.class_name, ties=ties) if len(parallelsentence.get_translations()) == 1: log.warning("Parallelsentence has only one target sentence") parallelsentence.tgt[0].add_attribute(new_rank_name, 1) return parallelsentence, {} elif len(parallelsentence.get_translations()) == 0: return parallelsentence, {} #list that will hold the pairwise parallel sentences including the learner's decision classified_pairwise_parallelsentences = [] resultvector = {} for pairwise_parallelsentence in pairwise_parallelsentences: #convert pairwise parallel sentence into an orange instance instance = parallelsentence_to_instance(pairwise_parallelsentence, attribute_set=self.attribute_set) #scale data instance to mean, based on trained scaler if self.scaler: try: instance = np.nan_to_num(instance) instance = self.scaler.transform(instance) except ValueError as e: log.error("Could not transform instance: {}, scikit replied: {}".format(instance, e)) #raise ValueError(e) pass try: if self.featureselector: instance = np.nan_to_num(instance) instance = self.featureselector.transform(instance) except AttributeError: pass log.debug('Instance = {}'.format(instance)) #make sure no NaN or inf appears in the instance instance = np.nan_to_num(instance) #run learner for this instance predicted_value = self.learner.predict(instance) try: distribution = dict(zip(self.learner.classes_, self.learner.predict_proba(instance)[0])) except AttributeError: #if learner does not support per-class probability (e.g. LinearSVC) assign 0.5 distribution = dict([(cl, 0.5) for cl in self.learner.classes_]) log.debug("Distribution: {}".format(distribution)) log.debug("Predicted value: {}".format(predicted_value)) #even if we have a binary learner, it may be that it cannot decide between two classes #for us, this means a tie if not bidirectional_pairs and distribution and len(distribution)==2 and float(distribution[1])==0.5: predicted_value = 0 distribution[predicted_value] = 0.5 log.debug("{}, {}, {}".format(pairwise_parallelsentence.get_system_names(), predicted_value, distribution)) #gather several metadata from the classification, which may be needed resultvector.update({'systems' : pairwise_parallelsentence.get_system_names(), 'value' : predicted_value, 'distribution': distribution, 'confidence': distribution[int(predicted_value)], # 'instance' : instance, }) #add the new predicted ranks as attributes of the new pairwise sentence pairwise_parallelsentence.add_attributes({"rank_predicted":predicted_value, "prob_-1":distribution[-1], "prob_1":distribution[1] }) classified_pairwise_parallelsentences.append(pairwise_parallelsentence) #gather all classified pairwise comparisons of into one parallel sentence again sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences) if reconstruct == 'hard': log.debug("Applying hard reconstruction to produce rank {}".format(new_rank_name)) ranked_sentence = sentenceset.get_multiranked_sentence(critical_attribute=critical_attribute, new_rank_name=new_rank_name, del_orig_class_att=del_orig_class_att) else: attribute1 = "prob_-1" attribute2 = "prob_1" log.debug("Applying soft reconstruction to produce rank {}".format(new_rank_name)) try: ranked_sentence = sentenceset.get_multiranked_sentence_with_soft_ranks(attribute1, attribute2, critical_attribute, new_rank_name, normalize_ranking=False) except: raise ValueError("Sentenceset {} from {} caused 
exception".format(classified_pairwise_parallelsentences, parallelsentence)) return ranked_sentence, resultvector
def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) with warnings.catch_warnings(record=True): scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert_false(np.any(np.isnan(X_csr_scaled.data))) with warnings.catch_warnings(record=True): scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csr.transform(X_csc, copy=True) assert_false(np.any(np.isnan(X_csc_scaled.data))) assert_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.std_, scaler_csr.std_) assert_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.std_, scaler_csc.std_) assert_array_almost_equal( X_scaled.mean(axis=0), [0., 1.109, 1.856, 21., 1.559], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0( X_csr_scaled.astype(np.float)) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert_true(X_scaled is not X) assert_true(X_csr_scaled is not X_csr) X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert_true(X_csr_scaled_back is not X_csr) assert_true(X_csr_scaled_back is not X_csr_scaled) assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert_true(X_csc_scaled_back is not X_csc) assert_true(X_csc_scaled_back is not X_csc_scaled) assert_array_almost_equal(X_csc_scaled_back.toarray(), X)