# make predictions on the haberman dataset with a scale + power-transform + logistic regression pipeline
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# load the dataset as a numpy array
def load_dataset(full_path):
    data = read_csv(full_path, header=None)
    # retrieve numpy array
    data = data.values
    # split into input and output elements
    X, y = data[:, :-1], data[:, -1]
    # label encode the target variable to have the classes 0 and 1
    y = LabelEncoder().fit_transform(y)
    return X, y

# define the location of the dataset
full_path = 'haberman.csv'
# load the dataset
X, y = load_dataset(full_path)
# fit the model
steps = [('t1', MinMaxScaler()), ('t2', PowerTransformer()), ('m', LogisticRegression(solver='lbfgs'))]
model = Pipeline(steps=steps)
model.fit(X, y)
# some survival cases
print('Survival Cases:')
data = [[31, 59, 2], [31, 65, 4], [34, 60, 1]]
for row in data:
    # make prediction
    yhat = model.predict_proba([row])
    # get percentage of survival
    p_survive = yhat[0, 0] * 100
    # summarize
    print('>data=%s, Survival=%.3f%%' % (row, p_survive))
# some non-survival cases
print('Non-Survival Cases:')
data = [[44, 64, 6], [34, 66, 9], [38, 69, 21]]
for row in data:
    # make prediction and report the survival probability, as above
    yhat = model.predict_proba([row])
    p_survive = yhat[0, 0] * 100
    print('>data=%s, Survival=%.3f%%' % (row, p_survive))
import pickle as pkl
from copy import copy
import numpy as np
import torch
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler


def get_data(ssX=None, batch_size=32, train=True, **kwargs):
    """inputs:
        batch_size: int
    return: (dataloader, test_dataloader)
    """
    plot_random = False if 'plot_random' not in kwargs else kwargs['plot_random']
    plot_resonant = not plot_random
    train_all = False if 'train_all' not in kwargs else kwargs['train_all']
    plot = False if 'plot' not in kwargs else kwargs['plot']
    if not train_all and ssX is None:
        plot_resonant = True
        plot_random = False

    if train_all:
        filename = 'data/combined.pkl'
    elif plot_resonant:
        filename = 'data/resonant_dataset.pkl'
    elif plot_random:
        filename = 'data/random_dataset.pkl'

    # These are generated by data_from_pkl.py
    loaded_data = pkl.load(open(filename, 'rb'))
    train_ssX = (ssX is None)
    fullX, fully = loaded_data['X'], loaded_data['y']
    if train_all:
        len_random = 17082  # Number of valid random examples (others have NaNs)
        random_data = np.arange(len(fullX)) >= (len(fullX) - len_random)

    # Differentiate megno
    if 'fix_megno' in kwargs and kwargs['fix_megno']:
        idx = [i for i, lab in enumerate(loaded_data['labels']) if 'megno' in lab][0]
        fullX[:, 1:, idx] -= fullX[:, :-1, idx]

    if 'include_derivatives' in kwargs and kwargs['include_derivatives']:
        derivative = fullX[:, 1:, :] - fullX[:, :-1, :]
        derivative = np.concatenate((derivative[:, [0], :], derivative), axis=1)
        fullX = np.concatenate((fullX, derivative), axis=2)

    # Hide fraction of test
    # MAKE SURE WE DO COPIES AFTER!!!!
    if train:
        if train_all:
            remy, finaly, remX, finalX, rem_random, final_random = train_test_split(
                fully, fullX, random_data, shuffle=True, test_size=1. / 10, random_state=0)
            trainy, testy, trainX, testX, train_random, test_random = train_test_split(
                remy, remX, rem_random, shuffle=True, test_size=1. / 10, random_state=1)
        else:
            remy, finaly, remX, finalX = train_test_split(
                fully, fullX, shuffle=True, test_size=1. / 10, random_state=0)
            trainy, testy, trainX, testX = train_test_split(
                remy, remX, shuffle=True, test_size=1. / 10, random_state=1)
    else:
        assert not train_all
        remy = fully
        finaly = fully
        testy = fully
        trainy = fully
        remX = fullX
        finalX = fullX
        testX = fullX
        trainX = fullX

    if plot:
        # Use test dataset for plotting, so put it in validation part:
        testX = finalX
        testy = finaly

    if train_ssX:
        if 'power_transform' in kwargs and kwargs['power_transform']:
            ssX = PowerTransformer(method='yeo-johnson')  # Power is best
        else:
            ssX = StandardScaler()  # Power is best

    n_t = trainX.shape[1]
    n_features = trainX.shape[2]
    if train_ssX:
        ssX.fit(trainX.reshape(-1, n_features)[::1539])

    ttrainy = trainy
    ttesty = testy
    ttrainX = ssX.transform(trainX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    ttestX = ssX.transform(testX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    if train_all:
        ttest_random = test_random
        ttrain_random = train_random

    tremX = ssX.transform(remX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    tremy = remy

    train_len = ttrainX.shape[0]
    X = Variable(torch.from_numpy(np.concatenate((ttrainX, ttestX))).type(torch.FloatTensor))
    y = Variable(torch.from_numpy(np.concatenate((ttrainy, ttesty))).type(torch.FloatTensor))
    if train_all:
        r = Variable(torch.from_numpy(np.concatenate((ttrain_random, ttest_random))).type(torch.BoolTensor))

    Xrem = Variable(torch.from_numpy(tremX).type(torch.FloatTensor))
    yrem = Variable(torch.from_numpy(tremy).type(torch.FloatTensor))

    idxes = np.s_[:]
    dataset = torch.utils.data.TensorDataset(X[:train_len, :, idxes], y[:train_len])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, pin_memory=True, num_workers=8)

    # Cut up dataset into only the random or resonant parts.
    # Only needed if plotting
    if (not plot) or (not train_all):
        test_dataset = torch.utils.data.TensorDataset(X[train_len:, :, idxes], y[train_len:])
    else:
        if plot_random:
            mask = r
        else:
            mask = ~r
        print(f'Plotting with {mask.sum()} total elements, when plot_random={plot_random}')
        test_dataset = torch.utils.data.TensorDataset(
            X[train_len:][r[train_len:]][:, :, idxes],
            y[train_len:][r[train_len:]])

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=3000,
                                                  shuffle=False, pin_memory=True, num_workers=8)
    kwargs['model'].ssX = copy(ssX)
    return dataloader, test_dataloader
# Create a 1D array the same length as y, filled with zeros, named yinput
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros.html
target = np.zeros((len(yorig), 1)).ravel()

# Set the value of yinput to 1 for the indices of positive that had True written to them at line 36
target[positive] = 1
total_aggressive = np.sum(positive)

# This is not strictly needed, as yinput was initialised with 0s, but is included for completeness.
# It sets the value of yinput to 0 for the indices of negative that were set to True at line 39
target[negative] = 0
total_even = np.sum(negative)

# look at scaling
if scaler == "power":
    print("Power Transform Scaler")
    X_scaler = PowerTransformer(method='yeo-johnson')
    Xorig = scaleData(Xorig, X_scaler)
elif scaler == "norm":
    print("Normalizer Scaler")
    X_scaler = Normalizer()
    Xorig = scaleData(Xorig, X_scaler)
elif scaler == "robo":
    print("Robust Scaler")
    X_scaler = RobustScaler(copy=True, quantile_range=(25.0, 75.0),
                            with_centering=True, with_scaling=True)
    Xorig = scaleData(Xorig, X_scaler)
elif scaler == "standard":
    print("Standard Scaler")
    X_scaler = StandardScaler(with_mean=True, with_std=True)
# Take only 2 features to make visualization easier
# Feature 0 has a long tail distribution.
# Feature 5 has a few but very large outliers.
X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling', StandardScaler().fit_transform(X)),
    ('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
     RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
     QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)

# plasma does not exist in matplotlib < 1.5
cmap = getattr(cm, 'plasma_r', cm.hot_r)
iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive",
    [(MaxAbsScaler(), maxabs_scale, True, False),
     (MinMaxScaler(), minmax_scale, False, False),
     (StandardScaler(), scale, False, False),
     (StandardScaler(with_mean=False), scale, True, False),
     (PowerTransformer('yeo-johnson'), power_transform, False, False),
     (PowerTransformer('box-cox'), power_transform, False, True),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
     (RobustScaler(), robust_scale, False, False),
     (RobustScaler(with_centering=False), robust_scale, True, False)])
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
]
base_data_pipe_post = [
    ('flatten', preprocess.Flattenor()),
]

print(f"\n{c*10} Starting TrainingManager with Grid Search {c*10}\n")

dpipez = [
    Pipeline(base_data_pipe_pre + [
        ('basic_green', extract.ColorChannelz()),
    ] + base_data_pipe_post + [
        ('scaler', StandardScaler()),
    ]),
    Pipeline(base_data_pipe_pre + [
        ('basic_green', extract.ColorChannelz()),
    ] + base_data_pipe_post + [
        ('power', PowerTransformer()),
    ]),
    ## TODO: recheck size remaps
    # Pipeline( base_data_pipe_pre+[('color_chan', extract.FundusColorChannelz() ),]+base_data_pipe_post +[('scaler', StandardScaler()), ] ),
    # Pipeline( base_data_pipe_pre+[('eigenz_chan', extract.EigenzChannelz(topn=70) ),]+base_data_pipe_post +[('scaler', StandardScaler()), ] ),
    # Pipeline( base_data_pipe_pre+[('patch_chan', extract.PatchifyChannelz(nx_patchez=12) ),]+base_data_pipe_post +[('scaler', StandardScaler()), ] )
]

mpipez = [
    (Pipeline([('flatten', preprocess.Flattenor()), ('svm', svm.SVC())]),
     {'kernel': ('linear', 'rbf'), 'C': [1, 10]}),
    ##
    (Pipeline([('flatten', preprocess.Flattenor()), ('logit', LogisticRegression())]),
     {'C': [1, 10]
from queencity20.utils.getData import *
from queencity20.utils.remove_correlated import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import numpy as np  # needed for the outlier filter below (may also come in via the star imports)

df = getTrainingData()
df.head()

allButTarget = [c for c in df.columns if c != "target"]
means = df.mean(skipna=True)
fdf = df.fillna(means)

from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
fdf.loc[:, allButTarget] = pt.fit_transform(fdf.loc[:, allButTarget])
fdf = diffCols(fdf)

#corcols = list(set(find_correlation(fdf.drop("target", axis=1), threshold=0.9)))
#fdf = fdf.drop(corcols, axis=1)
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

X = fdf.drop(["target"], axis=1)
fdf = fdf[~np.any(np.logical_or(X > 2.5, X < -2.5), axis=1)]
X = fdf.drop(["target"], axis=1)
y = fdf["target"]
def power_transform(data):
    pt = PowerTransformer(standardize=False)
    return pt.fit_transform(data)
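# A minimal usage sketch for the helper above (it assumes PowerTransformer is imported from
# sklearn.preprocessing in the same module); the sample data here is purely illustrative.
import numpy as np
from sklearn.preprocessing import PowerTransformer

skewed = np.random.lognormal(size=(100, 2))  # strongly right-skewed toy data
transformed = power_transform(skewed)        # Yeo-Johnson by default, standardize=False
print(transformed.shape)                     # -> (100, 2)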
def fit_yeo_johnson_transformer(train_imputed_numeric_df: pd.DataFrame):
    yeo_johnson_transformer = PowerTransformer(method="yeo-johnson", copy=True)
    yeo_johnson_transformer.fit(train_imputed_numeric_df)
    return yeo_johnson_transformer
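# Hedged usage sketch: fit on the (imputed, numeric) training frame only, then reuse the
# fitted transformer for every split. The two DataFrames below are hypothetical.
import pandas as pd
from sklearn.preprocessing import PowerTransformer

train_df = pd.DataFrame({"a": [1.0, 2.0, 10.0, 50.0], "b": [0.1, 0.2, 0.3, 5.0]})
test_df = pd.DataFrame({"a": [3.0, 40.0], "b": [0.2, 4.0]})

yj = fit_yeo_johnson_transformer(train_df)
train_t = yj.transform(train_df)  # lambdas estimated from the training split
test_t = yj.transform(test_df)    # same lambdas, so no information leaks from test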
list_symmetry_score.append(symmetry_score)
list_colour_scores.append(colour_score)

df = pd.DataFrame({
    "Asymmetry score": list_symmetry_score,
    "Border score": list_border_score,
    "Colour score": list_colour_scores
})

# Predict labels for each fold using the KNN algorithm
X = X_full = df.to_numpy()

# From index 2 onward so that melanoma info is not included
# Load labels
control_group = np.array(control_group)
y = control_group

distributions = [('Data after power transformation (Box-Cox)',
                  PowerTransformer(method='box-cox').fit_transform(X))]

for i in distributions:
    titel = i[0]
    methode = i[0]
    X = i[1]
    X[:, 0] = X[:, 0] * 1.6
    X[:, 1] = X[:, 1] * 2.0
    X[:, 2] = X[:, 2] * 2.0
    symmetrie = X[:, 0]
    kleur = X[:, 2]
    border = X[:, 1]
    kf = StratifiedShuffleSplit(n_splits=1, test_size=0.4)

list_border_score.append(border_score)
def predefined_ops():
    '''return dict of user defined non-default instances of operators
    '''
    clean = {
        'clean': Cleaner(dtype_filter='not_datetime', na1='null', na2='mean', drop_uid=True),
        'cleanNA': Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean': Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn': Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),
        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10, int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5, int_bins=True),  # 5 bin tree cut edges encoder
    }

    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder': RandomUnderSampler(),
        'nearmiss': NearMiss(version=3),
        'pcart': InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest': FunctionSampler(_outlier_rejection,
                                        kw_args={'method': 'IsolationForest', 'contamination': 0.1}),
        'inlierLocal': FunctionSampler(_outlier_rejection,
                                       kw_args={'method': 'LocalOutlierFactor', 'contamination': 0.1}),
        'inlierEllip': FunctionSampler(_outlier_rejection,
                                       kw_args={'method': 'EllipticEnvelope', 'contamination': 0.1}),
        'inlierOsvm': FunctionSampler(_outlier_rejection,
                                      kw_args={'method': 'OneClassSVM', 'contamination': 0.1}),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm
        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }

    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }

    # select from model
    feature_m = {
        'fwoe': SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog': SelectFromModel(LogisticRegression(penalty='l1', solver='saga', C=1e-2)),
        'fsgd': SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb': SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree', max_depth=2, n_estimators=50)),
        'frf': SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),
        # fixed number of features
        'fxgb20': SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'), max_features=20),
        'frf20': SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5), max_features=20),
        'frf10': SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5), max_features=10),
        'fRFElog': RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb': RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }

    # Univariate feature selection
    feature_u = {
        'fchi2': GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf': GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf': GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    imp = {
        "impXGB": XGBClassifier(n_jobs=-1, booster='gbtree', max_depth=2, n_estimators=50),
        "impRF": ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
def produce_smoted():
    sample_map = {1: 500, 2: 500, 3: 500, 4: 500, 5: 500,
                  6: 500, 7: 500, 8: 500, 9: 500, 10: 500}

    # Read data
    X = pd.read_csv('train_data.csv', header=None)
    y = pd.read_csv('train_labels.csv', header=None)

    # Combine for shuffling and partitioning
    data = pd.concat([y, X], axis='columns', ignore_index=True)
    # Shuffle for more reliable validation later
    data = data.sample(frac=1).reset_index(drop=True)

    # Let's partition. 1st part is used to train with SMOTE, 2nd (smaller) part is used to validate
    train, test = train_test_split(data, test_size=0.3, random_state=0)

    # Find x & y
    x_train = train.drop(labels=0, axis='columns')
    y_train = train[[0]]
    x_test = test.drop(labels=0, axis='columns')
    y_test = test[[0]]

    # Let's try SMOTE
    X_resampled, y_resampled = BorderlineSMOTE().fit_resample(x_train, y_train)
    X_resampled = pd.DataFrame(X_resampled)
    y_resampled = pd.DataFrame(y_resampled)
    training_data = pd.concat([y_resampled, X_resampled], axis='columns',
                              ignore_index=True).sample(frac=1).reset_index(drop=True)

    # Rhythm patterns
    rhythm = training_data.iloc[:, 1:169].copy()
    # Chroma
    chroma_cleaned = training_data.iloc[:, 169:205]
    # MFCCs
    mfcc_cleaned = training_data.iloc[:, 221:265].copy()
    cleaned_x_training = pd.concat([rhythm, chroma_cleaned, mfcc_cleaned],
                                   axis='columns', ignore_index=True)

    # Outlier detection
    threshold = 3
    for col in range(cleaned_x_training.shape[1]):
        mean = np.mean(cleaned_x_training.iloc[:, col])
        z = np.abs(stats.zscore(cleaned_x_training.iloc[:, col]))
        rows = np.where(z > threshold)
        for row in rows:
            cleaned_x_training.at[row, col] = mean

    # Scaling
    scaler = PowerTransformer()
    scaled_data = scaler.fit_transform(cleaned_x_training)
    scaled_x_training = pd.DataFrame(scaled_data)

    # NOW SAME OPERATIONS FOR VALIDATION DATA
    validation_data = pd.concat([y_test, x_test], axis='columns',
                                ignore_index=True).sample(frac=1).reset_index(drop=True)

    # Rhythm patterns
    rhythm = validation_data.iloc[:, 1:169].copy()
    # Chroma
    chroma_cleaned = validation_data.iloc[:, 169:193]
    # MFCCs
    mfcc_cleaned = validation_data.iloc[:, 221:265].copy()
    cleaned_x_validation = pd.concat([rhythm, chroma_cleaned, mfcc_cleaned],
                                     axis='columns', ignore_index=True)

    # Outlier detection
    threshold = 3
    for col_val in range(cleaned_x_validation.shape[1]):
        mean_val = np.mean(cleaned_x_validation.iloc[:, col_val])
        z_val = np.abs(stats.zscore(cleaned_x_validation.iloc[:, col_val]))
        rows_val = np.where(z_val > threshold)
        for row_val in rows_val:
            cleaned_x_validation.at[row_val, col_val] = mean_val

    # Scaling
    scaler = PowerTransformer()
    scaled_data = scaler.fit_transform(cleaned_x_validation)
    scaled_x_validation = pd.DataFrame(scaled_data)

    return scaled_x_training, scaled_x_validation, training_data[[0]], validation_data[[0]]
df_final = pd.concat([x, y], axis=1)
df_final = df_final.rename(columns={f'internet_traffic_{cell_id}': 'y'})
df_final['y_sh'] = df_final['y'].shift(periods=-1)
df_final = df_final.dropna()

# SPLITTING AND PREPARING DATASET
size = len(df_final)
X_train = df_final.drop(['y_sh'], axis=1)[:int(0.8 * size)]
y_train = df_final['y_sh'][:int(0.8 * size)]
X_test = df_final.drop(['y_sh'], axis=1)[int(0.8 * size):]
y_test = df_final['y_sh'][int(0.8 * size):]

scaler = PowerTransformer()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

n_features = X_train.shape[1]

# PREPARING DATA TO INPUT INTO NNET
X_train = X_train.reshape(-1, 1, n_features)
X_test = X_test.reshape(-1, 1, n_features)

# %%
file_path = f'C:\\Users\\patri\\Documents\\Github\\milan-telecom-analysis\\results\\model_results\\{neighorrs}_neighbors_id_{cell_id}.h5'
network = keras.models.load_model(file_path,
def __init__(self, name='YeoJohnson'):
    super().__init__(name)
    self.inplace = True
    self.power = PowerTransformer(method='yeo-johnson', standardize=False)
# %%
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# scaler = [StandardScaler()]
classifier_test = [SVC()]

#=================Scaler
scaler = [
    StandardScaler(),
    MinMaxScaler(),
    MaxAbsScaler(),
    RobustScaler(quantile_range=(25, 75)),
    PowerTransformer(method='yeo-johnson'),
    # PowerTransformer(method='box-cox'),
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

# %%
#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
def __init__(self, preprocess_type=None, extend_data=False, short_end=False):
    self.config = Config()

    # prepare input data
    config_path = self.config.get_filepath("", "config.yaml")
    config_file = open(config_path, 'r')
    yaml_config = yaml.load(config_file, Loader=yaml.SafeLoader)

    self.training_dataset_names = [d['name'] for d in yaml_config['training_datasets']]
    self.training_dataset_start_pos = [d['start_position'] for d in yaml_config['training_datasets']]
    self.test_dataset_names = [d['name'] for d in yaml_config['test_datasets']]
    self.test_dataset_start_pos = [d['start_position'] for d in yaml_config['test_datasets']]

    self.dataset_names = np.concatenate(
        (self.training_dataset_names, self.test_dataset_names))  # do we need these?
    self.dataset_start_pos = np.concatenate(
        (self.training_dataset_start_pos, self.test_dataset_start_pos))  # do we need these?

    # read in all pickle files
    self.all_pd = []
    for dataset_name in self.dataset_names:
        self.all_pd.append(pd.read_pickle(self.config.get_filepath_data(dataset_name)))

    if extend_data:
        training_dataset_names_copy = np.array(self.training_dataset_names, copy=True)
        # create a copy of the data shifted up by 10
        for i, dataset_name in enumerate(training_dataset_names_copy):
            self.dataset_names = np.append(self.dataset_names, dataset_name + "_" + str(10))
            self.training_dataset_names = np.append(self.training_dataset_names, dataset_name + "_" + str(10))
            self.dataset_start_pos = np.append(self.dataset_start_pos, self.training_dataset_start_pos[i])
            self.training_dataset_start_pos.append(self.training_dataset_start_pos[i])
            self.all_pd.append(self.all_pd[i].copy() + 10)

    self.dict_datasets = dict(zip(self.dataset_names, np.arange(len(self.dataset_names))))

    self.enable_difference = False

    self._feature_range = [0, 1]
    self.normalisation_scalers = []
    for _ in self.dataset_names:
        self.normalisation_scalers.append(MinMaxScaler(feature_range=self.feature_range))
    self.enable_normalisation_scaler = False
    self.enable_ignore_price = False  # scale each curve to feature_range

    self.power_transformer = PowerTransformer()
    self.enable_power_transform = False

    self.standardisation_scalers = []
    for _ in self.dataset_names:
        self.standardisation_scalers.append(StandardScaler())
    self.enable_standardisation_scaler = False

    self.enable_log_returns = False
    self.mult_factor = 10  # 5
    self.add_factor = 25  # 6
    self.enable_log = False
    self.enable_pct_change = False
    self.enable_curve_smoothing = False
    self.short_end = short_end

    # now setup PreprocessType settings
    if preprocess_type is PreprocessType.NORMALISATION_OVER_TENORS:
        self.enable_normalisation_scaler = True
        self.feature_range = [0, 1]
    elif preprocess_type is PreprocessType.NORMALISATION_OVER_CURVES:
        self.enable_normalisation_scaler = True
        self.feature_range = [0, 1]
        self.enable_ignore_price = True
    elif preprocess_type is PreprocessType.STANDARDISATION_OVER_TENORS:
        self.enable_standardisation_scaler = True
    elif preprocess_type is PreprocessType.LOG_RETURNS_OVER_TENORS:
        self.enable_log_returns = True
def gaussian_scaler(train, test, method='yeo-johnson'):
    scaler = PowerTransformer(method, standardize=False, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
def chang_hug_map(X, hex_colors, FONT_SIZE=12, BINS=30):
    '''
    Function that applies Chang & Hug map of preprocessing data to a normal distribution:
    REF: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py

    Parameters:
    * X = features
    * hex_colors = hexadecimal colors to be used for each feature
    * FONT_SIZE = size of font on plots
    * BINS = number of bins on histogram plots
    '''
    # setting preprocessing methods: PowerTransformer (Box-Cox, Yeo-Johnson); QuantileTransformer
    scaler = MinMaxScaler(feature_range=(1, 2))
    boxcox = PowerTransformer(method='box-cox')
    bc = Pipeline(steps=[('s', scaler), ('bc', boxcox)])
    yj = PowerTransformer(method='yeo-johnson')
    rng = np.random.RandomState(304)
    qt = QuantileTransformer(n_quantiles=500, output_distribution='normal', random_state=rng)

    # adding distributions of columns
    distributions = []
    for i in range(0, len(X.columns)):
        name = X.columns[i]
        array = X[X.columns[i]].to_numpy().reshape(-1, 1)
        distributions.append((name, array))

    colors = hex_colors

    # generating the plot
    fig, axes = plt.subplots(nrows=12, ncols=15, figsize=(35, 25))  # cols = num of preprocessing methods + original
    axes = axes.flatten()
    axes_idxs = [
        (0, 15, 30, 45), (1, 16, 31, 46), (2, 17, 32, 47), (3, 18, 33, 48), (4, 19, 34, 49),
        (5, 20, 35, 50),  # first set
        (6, 21, 36, 51), (7, 22, 37, 52), (8, 23, 38, 53), (9, 24, 39, 54), (10, 25, 40, 55),
        (11, 26, 41, 56), (12, 27, 42, 57), (13, 28, 43, 58), (14, 29, 44, 59),
        (60, 75, 90, 105), (61, 76, 91, 106), (62, 77, 92, 107), (63, 78, 93, 108), (64, 79, 94, 109),
        (65, 80, 95, 110),  # second set
        (66, 81, 96, 111), (67, 82, 97, 112), (68, 83, 98, 113), (69, 84, 99, 114), (70, 85, 100, 115),
        (71, 86, 101, 116), (72, 87, 102, 117), (73, 88, 103, 118), (74, 89, 104, 119),
        (120, 135, 150, 165), (121, 136, 151, 166), (122, 137, 152, 167), (123, 138, 153, 168),
        (124, 139, 154, 169), (125, 140, 155, 170), (126, 141, 156, 171), (127, 142, 157, 172),
        (128, 143, 158, 173), (129, 144, 159, 174), (130, 145, 160, 175), (131, 146, 161, 176),
        (132, 147, 162, 177), (133, 148, 163, 178), (134, 149, 164, 179)
    ]
    axes_list = [(axes[i], axes[j], axes[k], axes[l]) for (i, j, k, l) in axes_idxs]

    for distribution, color, axes in zip(distributions, colors, axes_list):
        name, X_col = distribution
        X_train, X_test = train_test_split(X_col, test_size=0.2, random_state=rng)

        # perform power and quantile transforms
        X_trans_bc = bc.fit(X_train).transform(X_test)
        lmbda_bc = round(bc.named_steps['bc'].lambdas_[0], 2)
        X_trans_yj = yj.fit(X_train).transform(X_test)
        lmbda_yj = round(yj.lambdas_[0], 2)
        X_trans_qt = qt.fit(X_train).transform(X_test)

        ax_original, ax_bc, ax_yj, ax_qt = axes
        ax_original.hist(X_train, color=color, bins=BINS)
        ax_original.set_title(name, fontsize=FONT_SIZE)
        ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

        for ax, X_trans, meth_name, lmbda in zip(
                (ax_bc, ax_yj, ax_qt),
                (X_trans_bc, X_trans_yj, X_trans_qt),
                ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
                (lmbda_bc, lmbda_yj, None)):
            ax.hist(X_trans, color=color, bins=BINS)
            title = f'After {meth_name}'
            if lmbda is not None:
                title += f'\n$\\lambda$ = {lmbda}'
            ax.set_title(title, fontsize=FONT_SIZE)
            ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
            ax.set_xlim([-3.5, 3.5])

    # Setting the last plots as empty
    for i in range(-10, 0):
        ax_original, ax_bc, ax_yj, ax_qt = axes_list[i]
        ax_original.axis('off')
        ax_bc.axis('off')
        ax_yj.axis('off')
        ax_qt.axis('off')

    # Export and last adjustments
    plt.tight_layout()
    plt.savefig('fig/09_col_trf.png')
    plt.show()
train_x = dataset_train[:, data_train.columns.isin(variables_x)]
train_y = dataset_train[:, data_train.columns.isin(variables_y)].reshape(-1)
test_x = dataset_test[:, data_train.columns.isin(variables_x)]
test_y = dataset_test[:, data_train.columns.isin(variables_y)].reshape(-1)

#
# train_x = StandardScaler().fit_transform(train_x)
# test_x = StandardScaler().fit_transform(test_x)

# train_x = MinMaxScaler().fit_transform(train_x)
# test_x = MinMaxScaler().fit_transform(test_x)

# train_x = QuantileTransformer().fit_transform(train_x)
# test_x = QuantileTransformer().fit_transform(test_x)

# train_x = PowerTransformer().fit_transform(train_x)
test_x = PowerTransformer().fit_transform(test_x)

#
myFile = open('../data/power_m_o_test_x.csv', 'w')
with myFile:
    writer = csv.writer(myFile)
    writer.writerows(test_x)

myFile2 = open('../data/power_m_o_train_x.csv', 'w')
with myFile2:
    writer2 = csv.writer(myFile2)
    writer2.writerows(train_x)

# train_x = PowerTransformer().fit_transform(train_x)
iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive,
                                omit_kwargs):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBRFClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

t = np.array(list(df['creatinine_phosphokinase'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
creatinine_phosphokinase = pt.fit_transform(t)
df['creatinine_phosphokinase'] = creatinine_phosphokinase

t = np.array(list(df['serum_creatinine'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
serum_creatinine = pt.fit_transform(t)
df['serum_creatinine'] = serum_creatinine

df.drop(columns=['sex', 'diabetes'], inplace=True)

X = df.iloc[:, 0:10].values
Y = df['DEATH_EVENT'].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=6)

xrclf = XGBRFClassifier()
xrclf.fit(x_train, y_train)
def __init__(self):
    self.pt = PowerTransformer()
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9566871315015497
exported_pipeline = make_pipeline(
    PowerTransformer(),
    StandardScaler(),
    RobustScaler(),
    GaussianProcessRegressor(kernel=Matern(length_scale=4.0, nu=2.5),
                             n_restarts_optimizer=185, normalize_y=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
'''
Log transformation

In the previous exercises you scaled the data linearly, which will not affect the data's shape. This works well if your data is normally distributed (or close to normally distributed), an assumption that a lot of machine learning models make.

Sometimes you will work with data that closely conforms to normality, e.g. the height or weight of a population. On the other hand, many variables in the real world do not follow this pattern, e.g. wages or the age of a population. In this exercise you will use a log transform on the ConvertedSalary column in the so_numeric_df DataFrame, as it has a large amount of its data centered around the lower values but also contains very high values. These distributions are said to have a long right tail.

Instructions
100 XP
Import PowerTransformer from sklearn's preprocessing module.
Instantiate the PowerTransformer() as pow_trans.
Fit the PowerTransformer on the ConvertedSalary column of so_numeric_df.
Transform the same column with the scaler you just fit.
'''
SOLUTION

# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate PowerTransformer
pow_trans = PowerTransformer()

# Train the transform on the data
pow_trans.fit(so_numeric_df[['ConvertedSalary']])

# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(so_numeric_df[['ConvertedSalary']])

# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
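# A small follow-up sketch, not part of the original exercise: the fitted PowerTransformer can
# also map transformed values back to the original scale via inverse_transform (so_numeric_df
# and pow_trans are assumed from the solution above).
original_scale = pow_trans.inverse_transform(so_numeric_df[['ConvertedSalary_LG']])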
yhat = model3.transform(tmpX[n_train:], tmpY[n_train:])
model3.zscore(yhat, tmpY[n_train:])
print(f"{c*10} End Model3 <-- new, hyperparams {c*10}\n")

# ## TODO: classifier/regressor/clusterer/etc Mixin requirements
# piper = Pipeline(['model', model2])
# print( piper )
# piper.fit_transform(tmpX, tmpY)

print(f"\n{c*10} Starting TrainingManager with Grid Search {c*10}\n")

import preprocess, extract
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import svm

dpipez = [
    Pipeline([('scaler', StandardScaler()), ]),
    Pipeline([('power', PowerTransformer()), ])
]
mpipez = [
    (Pipeline([('flatten', preprocess.Flattenor()), ('svm', svm.SVC())]),
     {'kernel': ('linear', 'rbf'), 'C': [1, 10]}),
    ##
    (Pipeline([('flatten', preprocess.Flattenor()), ('logit', LogisticRegression())]),
     {'C': [1, 10]}),
    ##
    (Pipeline([('reshaper', preprocess.Reshapeor((1, -1))), ('tensorfy', preprocess.ToTensor()), ('zmodel', model2)]),
     {})
]  # *tmpX[0].shape
print(mpipez)

mgr = ZTrainingManager()
mgr.build_permutationz(data_pipez=dpipez, model_pipez=mpipez)
mgr.run([x.cpu().numpy().ravel() for x in tmpX],
        [y.cpu().numpy().ravel() for y in tmpY],
        train_test_split=1.)
print(f"{c*10} End ZTrainingManager {c*10}\n")
def gaussian_scaler(train, test):
    scaler = PowerTransformer(method='yeo-johnson', standardize=False, copy=True).fit(train)
    train_scaled, test_scaled = transform_scaler(train, test, scaler)
    return train_scaled, test_scaled, scaler
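# transform_scaler is referenced above but not shown in this snippet; a minimal sketch of what
# such a helper might look like (an assumption, keeping DataFrame columns and index intact):
import pandas as pd

def transform_scaler(train, test, scaler):
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return train_scaled, test_scaled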
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer, minmax_scale

print(__doc__)

N_SAMPLES = 3000
FONT_SIZE = 6
BINS = 100

pt = PowerTransformer(method='box-cox')
rng = np.random.RandomState(304)
size = (N_SAMPLES, 1)

# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)
def transform(cols, cols_to_transform, scaler):
    values = scaler.transform(cols)
    return df[['name', 'alignment']].join(pd.DataFrame(values, columns=cols_to_transform))


scalers = [
    StandardScaler(),    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
    MinMaxScaler(),      # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
    RobustScaler(),      # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    PowerTransformer(),  # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
    Normalizer(),        # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
]

_df = df[['name', 'alignment', 'weight', 'height']].dropna()
cols_to_transform = ['weight', 'height']
df_to_scale = _df[cols_to_transform]

for scaler in scalers:
    scaled_values = scaler.fit_transform(df_to_scale)
    scaled_values = pd.DataFrame(scaled_values, columns=cols_to_transform)
    df_transformed = _df[['name', 'alignment']].join(scaled_values)
    plot_weight_vs_height(df_transformed, str(scaler.__class__.__name__))
# -
def to_normal(train, test, features, method="yeo-johnson"):
    # method can be box-cox
    pt = PowerTransformer(method=method)
    train[features] = pt.fit_transform(train[features])
    test[features] = pt.transform(test[features])
    return train, test
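# Hedged usage sketch for to_normal; the DataFrames and column names below are hypothetical.
import pandas as pd
from sklearn.preprocessing import PowerTransformer

train_df = pd.DataFrame({"income": [20000.0, 35000.0, 42000.0, 500000.0], "age": [22.0, 31.0, 45.0, 60.0]})
test_df = pd.DataFrame({"income": [28000.0, 90000.0], "age": [27.0, 52.0]})

# fit on train only; the same fitted lambdas are reused for the test split
train_df, test_df = to_normal(train_df, test_df, features=["income", "age"])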
def post(self, init_dataset=init_dataset):
    """
    POST HTTP request
    @param init_dataset: Init dataset loaded
    @return: 200 code with history of the fit
    """
    start_time = time.time()

    # Init the main parameters of training
    test_size = request.args.get('test_size', default=0.2, type=float)
    batch_size = request.args.get('batch_size', default=512, type=int)
    epochs = request.args.get('epochs', default=20, type=int)
    frac = request.args.get('frac', default=1, type=float)

    # Test of frac feature
    if not 1 >= frac >= 0.0001:
        return ioObj.generic_err400_with_resp(
            'Wrong format of frac feature. Please, provide a float number in (0,1]', start_time)
    LOG.info("Loaded formatted data",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    # Test of test_size feature
    if not 1 >= test_size >= 0.0001:
        return ioObj.generic_err400_with_resp(
            'Wrong format of test_size feature. Please, provide a float number in (0,1]', start_time)
    LOG.info("Loaded formatted data",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    # Fetch frac of the dataset
    frac_dataset = init_dataset.sample(frac=frac)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(frac_dataset.iloc[:, 3:-1],
                                                        frac_dataset.iloc[:, -1],
                                                        test_size=test_size)
    LOG.info("Train test split",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    # Init of two PowerTransformer objects
    model_new.def_scaler(PowerTransformer(), PowerTransformer())

    # Fit on X_train and transform both test and train samples
    X_train = model_new.scalerX.fit_transform(X_train)
    X_test = model_new.scalerX.transform(X_test)

    # Fit on y_train and transform both test and train samples
    y_train = model_new.scalerY.fit_transform(y_train.to_numpy().reshape(-1, 1))
    y_test = model_new.scalerY.transform(y_test.to_numpy().reshape(-1, 1))
    LOG.info("Transformed input and output data",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    # Set the model regressor and compile
    regressor = model_new.model
    regressor.compile(optimizer='adam', loss='mse', metrics=['mae', model_new.coeff_determination])
    LOG.info("Compiled the new model and about to fit it",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    # Fit regressor
    history = regressor.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stopping],
        validation_data=(X_test, y_test)
    )
    LOG.info("Fitted model. Output the results",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    # Format the results
    final_results = {}
    for key in history.history.keys():
        final_results[key] = history.history[key]

    # Serialize format of model, weights, and PowerTransformers
    with open(mdl_new_name, "w") as json_file:
        json_file.write(regressor.to_json())
    regressor.save_weights(mdl_new_weights_filename)
    pickle.dump(model_new.scalerX, open(scaler_new_X_filename, 'wb'))
    pickle.dump(model_new.scalerY, open(scaler_new_Y_filename, 'wb'))

    # Load files into MongoDB
    new_model_db.load_local_files()
    LOG.info("Loaded files of model, weights and PowerTransformers. Outputting the results",
             extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

    return ioObj.generic_resp(200, 'application/json', ioObj.json_d(ioObj.success_message(final_results)))