class DFPowerTransformer(BaseEstimator, TransformerMixin):
    """DataFrame-preserving wrapper around scikit-learn's ``PowerTransformer``.

    Only the selected columns are transformed; every other column of the
    input frame is passed through untouched.  Skewness and kurtosis of the
    selected columns are recorded in ``stat_df`` at fit time and extended
    with the post-transform values whenever ``transform`` is called.

    Parameters
    ----------
    columns : list-like or None
        Columns to transform.  ``None`` means "all columns of the frame
        passed to ``fit``".
    **kwargs
        Forwarded verbatim to ``PowerTransformer``.

    Attributes
    ----------
    model : PowerTransformer
        The wrapped transformer.
    transform_cols : list or None
        Columns actually transformed (set by ``fit``); ``None`` until fitted.
    stat_df : DataFrame or None
        Per-column statistics: ``Skewness``/``Kurtosis`` after ``fit`` plus
        ``Skewness (Transformed)``/``Kurtosis (Transformed)`` after
        ``transform``.
    """

    # Columns of ``stat_df`` that are computed once, at fit time.
    _FIT_STAT_COLUMNS = ['Skewness', 'Kurtosis']

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = PowerTransformer(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def _check_is_fitted(self):
        """Raise ``NotFittedError`` unless ``fit`` has been called."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )

    def fit(self, X, y=None):
        """Fit the wrapped transformer on the selected columns of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # NOTE(review): this overwrites the constructor argument, so a second
        # fit on a frame with different columns reuses the first frame's
        # column list.  Kept as-is because callers may read ``columns``.
        self.columns = X.columns if self.columns is None else self.columns
        # Keep the frame's own column order.
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Rule of thumb for skewness
        # (reference: GoodData "Normality Testing - Skewness and Kurtosis"):
        #   highly skewed:            skewness < -1  or  skewness > 1
        #   moderately skewed:        -1 < skewness < -0.5  or  0.5 < skewness < 1
        #   approximately symmetric:  -0.5 < skewness < 0.5
        skew_df = X[self.transform_cols].skew().to_frame(name='Skewness')
        # pandas reports *excess* kurtosis (Fisher's definition), so a normal
        # distribution scores 0 here, not 3.
        kurt_df = X[self.transform_cols].kurt().to_frame(name='Kurtosis')
        self.stat_df = skew_df.merge(kurt_df,
                                     left_index=True,
                                     right_index=True,
                                     how='left')
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns power-transformed.

        Also refreshes the "(Transformed)" columns of ``stat_df`` with the
        skewness and kurtosis of the transformed data.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(
            X[self.transform_cols])

        # Transformed skewness & kurtosis
        transformed = new_X[self.transform_cols]
        skew_df = transformed.skew().to_frame(name='Skewness (Transformed)')
        kurt_df = transformed.kurt().to_frame(name='Kurtosis (Transformed)')
        stat_df = skew_df.merge(kurt_df,
                                left_index=True,
                                right_index=True,
                                how='left')
        # Rebuild from the fit-time columns only.  Merging into the full
        # table would collide with the "(Transformed)" columns left by an
        # earlier transform() call and yield "_x"/"_y" suffixed duplicates.
        fit_stats = self.stat_df[self._FIT_STAT_COLUMNS]
        self.stat_df = fit_stats.merge(stat_df,
                                       left_index=True,
                                       right_index=True,
                                       how='left')
        return new_X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its transformed copy (``y`` is ignored)."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Return a copy of ``X`` with the selected columns mapped back.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(
            X[self.transform_cols])
        return new_X
# Requires: pip install hyperopt
from hyperopt import fmin, hp, tpe
from sklearn.model_selection import StratifiedKFold

# Cross-validation scaffolding (splitter is shuffled and unseeded).
nfolds = 5
skf = StratifiedKFold(n_splits=nfolds, shuffle=True)
acc = []

# Maui workflow adapted from the official vignette:
# https://github.com/BIMSBbioinfo/maui/blob/master/vignette/maui_vignette.ipynb
import maui
import maui.utils

print(f'Maui version: {maui.__version__}')

# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
from sklearn.preprocessing import PowerTransformer

# log2(x + 1) keeps zero intensities finite.  (Original note: "really weird
# scaling".)
dfSILAClog2 = np.log2(dfSILAC + 1)

# fit() returns the estimator itself, so printing `pt` shows the same repr
# that printing the result of fit() would.
pt = PowerTransformer().fit(dfSILAClog2)
print(pt)
dfSILACtf = pt.transform(dfSILAClog2)
print(pt.lambdas_)

# The model below is fed the maui-scaled frame, not the power-transformed one.
dfSILAClog2tf = maui.utils.scale(dfSILAClog2)

from keras import backend as K
import tensorflow as tf

# Thread pinning, disabled:
# K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=12, inter_op_parallelism_threads=12)))

maui_model = maui.Maui(n_hidden=[1100], n_latent=70, epochs=400)
z = maui_model.fit_transform({'mRNA': dfSILAClog2tf})
maui_model.hist.plot()

# NOTE(review): `ami_y` looks like it expects reference labels, yet the
# latent factors `z` are passed here -- confirm against the maui docs.
maui_model.cluster(ami_y=z)
maui_model.kmeans_scores.plot()

import seaborn as sns

sns.clustermap(maui_model.z_)
def grid_search(datFile, splitFile):
    """Grid-search KNN, SVM and random-forest classifiers on a fixed split.

    The data file is a header-less CSV whose last two columns are the sample
    id and the class label; every preceding column is a feature.  The split
    file assigns each ``modelId`` to ``train``, ``test`` or ``val``.
    Features are standardised and then Yeo-Johnson transformed, with both
    steps fitted on the training rows only.  Each hyper-parameter grid is
    scored on the validation rows through a ``PredefinedSplit``.

    Parameters
    ----------
    datFile : str or path-like
        CSV with the feature columns followed by ``sample`` and ``class``.
    splitFile : str or path-like
        CSV with at least the columns ``id``, ``synsetId``, ``subSynsetId``,
        ``modelId`` and ``split``.

    Returns
    -------
    dict
        Fitted ``GridSearchCV`` objects keyed by ``"knn"``, ``"svm"`` and
        ``"random_forest"``.  The best estimator, score and parameters of
        each search are also printed, in that order.
    """
    # LOADING DATA FILE
    df = pd.read_csv(datFile, header=None)
    feature_cols = ["z{}".format(i) for i in range(len(df.columns) - 2)]
    df.columns = feature_cols + ["sample", "class"]

    # LOADING TRAIN_TEST_VALIDATION SPLIT FILE
    split = pd.read_csv(splitFile)
    split = split.drop(["id", "synsetId", "subSynsetId"], axis=1)

    def _subset(split_name):
        # Features and labels of the samples assigned to ``split_name``.
        model_ids = split.loc[split["split"] == split_name, "modelId"]
        rows = df.loc[df["sample"].isin(model_ids)]
        return rows.drop(["sample", "class"], axis=1), rows["class"]

    # The test split is deliberately left untouched: model selection only
    # needs the training and validation rows.
    X_train, y_train = _subset("train")
    X_val, y_val = _subset("val")

    # STANDARDIZATION (statistics from the training rows only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)

    # SKEW REMOVAL
    pt = PowerTransformer(method="yeo-johnson", standardize=False)
    pt.fit(X_train)
    X_train = pt.transform(X_train)
    X_val = pt.transform(X_val)

    # TRAIN + VAL
    X_trainval = np.concatenate([X_train, X_val])
    y_trainval = pd.concat([y_train, y_val])
    # -1 = always in the training fold, 0 = the single validation fold.
    prefold = [-1] * X_train.shape[0] + [0] * X_val.shape[0]
    ps = PredefinedSplit(prefold)

    searches = (
        ("knn", KNN(), {
            "p": [1, 2],
            "n_neighbors": [5, 6, 7, 8, 9, 10],
            "weights": ["uniform", "distance"],
        }),
        ("svm", SVM(), {
            "C": [1.0, 2.0, 3.0, 4.0],
            "kernel": ["rbf", "poly"],
            "gamma": ["scale"],
        }),
        ("random_forest", RandomForestClassifier(), {
            "n_estimators": [500, 600],
            "min_samples_split": [2],
            "min_samples_leaf": [1],
            # "sqrt" is what the former "auto" meant for classifiers;
            # "auto" itself is no longer accepted by recent scikit-learn.
            "max_features": ["sqrt"],
        }),
    )

    results = {}
    for name, estimator, param_grid in searches:
        grid = GridSearchCV(estimator,
                            param_grid=param_grid,
                            n_jobs=-1,
                            cv=ps)
        grid.fit(X_trainval, y_trainval)
        print(grid.best_estimator_)
        print(grid.best_score_)
        print(grid.best_params_)
        results[name] = grid
    return results
def yeo_johnson_target_transformer(self):
    """Return a Yeo-Johnson ``PowerTransformer`` fitted on the training target.

    The target column ``self.target`` of ``self.train_data`` is reshaped
    into a single-feature 2-D array before fitting.
    """
    target_values = np.array(self.train_data[self.target]).reshape(-1, 1)
    transformer = PowerTransformer(method="yeo-johnson", copy=True)
    transformer.fit(target_values)
    return transformer
def prepare_data(self, df, look_back, freq_period, first=0, seq2seq=False):
    '''
    Impute, scale and STL-decompose the signal, then window each component.

    The ``y`` column is imputed, scaled, and decomposed into trend, seasonal
    and residual sub-signals so that each can be modelled separately.  The
    three components are written back to ``df`` as the columns ``trend``,
    ``seasonal`` and ``residual`` and stored on ``self``.

    Parameters
    ----------
    df : DataFrame
        Dataframe containing the historical data in a column ``y``.
        Modified in place.
    look_back : int
        Input length of the model.
    freq_period : int
        Seasonal period handed to STL.  An even value is bumped to the next
        odd number.
    first : int, optional
        1 forces a new scaler to be fitted and saved; otherwise a scaler
        already saved in ``self.directory`` is reused when present.
    seq2seq : bool, optional
        When true, windows are built with ``sequence_dataframe`` (using
        ``self.len_pred``) instead of ``decoupe_dataframe``.

    Returns
    -------
    trend_x : array
        Model inputs built from the trend component.
    trend_y : array
        Values to be predicted during training for the trend component.
    seasonal_x : array
        Same as trend_x but for the seasonal component.
    seasonal_y : array
        Same as trend_y but for the seasonal component.
    residual_x : array
        Same as trend_x but for the residual component.
    residual_y : array
        Same as trend_y but for the residual component.

    The exact array shapes are determined by the windowing helper.
    '''
    self.seq2seq = seq2seq
    imputer = KNNImputer(n_neighbors=2, weights="uniform")
    df.loc[:, "y"] = imputer.fit_transform(np.array(df["y"]).reshape(-1, 1))

    scalerfile = self.directory + '/scaler_pred.sav'
    y_column = np.reshape(np.array(df["y"]), (-1, 1))
    if first == 1 or not os.path.isfile(scalerfile):
        # Only wide-ranging signals get a power transform.
        if (df["y"].max() - df["y"].min()) > 100:
            if self.verbose == 1:
                print("PowerTransformation scaler used")
            scaler = PowerTransformer()
        else:
            if self.verbose == 1:
                print("Identity scaler used")
            scaler = IdentityTransformer()
        self.scaler2 = scaler.fit(y_column)
        with open(scalerfile, 'wb') as handle:
            pickle.dump(self.scaler2, handle)
    else:
        # NOTE: unpickling executes arbitrary code -- the scaler file must
        # come from a trusted location.
        with open(scalerfile, "rb") as handle:
            self.scaler2 = pickle.load(handle)
    Y = self.scaler2.transform(y_column)

    # The period is forced to be odd before it is handed to STL.
    if freq_period % 2 == 0:
        freq_period = freq_period + 1
    decomposition = STL(Y, period=freq_period).fit()
    df.loc[:, 'trend'] = decomposition.trend
    df.loc[:, 'seasonal'] = decomposition.seasonal
    df.loc[:, 'residual'] = decomposition.resid
    self.trend = np.asarray(df.loc[:, 'trend'])
    self.seasonal = np.asarray(df.loc[:, 'seasonal'])
    self.residual = np.asarray(df.loc[:, 'residual'])

    if not self.seq2seq:
        trend_x, trend_y = decoupe_dataframe(df["trend"], look_back)
        seasonal_x, seasonal_y = decoupe_dataframe(df["seasonal"], look_back)
        residual_x, residual_y = decoupe_dataframe(df["residual"], look_back)
    else:
        trend_x, trend_y = sequence_dataframe(df["trend"], look_back, self.len_pred)
        seasonal_x, seasonal_y = sequence_dataframe(df["seasonal"], look_back, self.len_pred)
        residual_x, residual_y = sequence_dataframe(df["residual"], look_back, self.len_pred)

    if self.verbose == 1:
        print("prepared")
    return trend_x, trend_y, seasonal_x, seasonal_y, residual_x, residual_y
# In[46]:

# Rows with a maximum heart rate below 85 (notebook cell output).
y = data[data["max_heart_rate achieved"] < 85]
y

# DATA PREPARATION

# In[47]:

from sklearn.preprocessing import PowerTransformer

# Despite the names, this is PowerTransformer with its default settings
# rather than a logarithm.
log = PowerTransformer()
data['log_depression'] = log.fit_transform(data[['st_deprssion']])
data.drop(columns='st_deprssion', inplace=True)

# In[48]:

cnts_feature = [
    'age',
    'resting_blood_pressure',
    'cholestoral',
    'max_heart_rate achieved',
    'log_depression',
]
# Everything that is neither continuous nor the target is categorical.
cat_feature = [
    column for column in data.columns
    if not (column in cnts_feature or column == 'target')
]

# In[49]:

# One-hot encode the categorical columns.
data = pd.get_dummies(data, columns=cat_feature)
# 3. RobustScaler
# Centering and scaling statistics are percentile based, so a handful of
# very large marginal outliers does not influence them.
scaler3 = RobustScaler()
X3 = scaler3.fit_transform(X)
df3 = pd.DataFrame(X3, columns=column_names)
print(df3.describe())
# Axes restricted to the range -2 to 3.
sns.jointplot(data=df3, x='MedInc', y='AveOccup', xlim=[-2, 3], ylim=[-2, 3])

# 4. PowerTransformer
# Applies a power transformation to each feature to make the data more
# Gaussian-like.
scaler4 = PowerTransformer()
X4 = scaler4.fit_transform(X)
df4 = pd.DataFrame(X4, columns=column_names)
print(df4.describe())
sns.jointplot(data=df4, x='MedInc', y='AveOccup')

# 5. QuantileTransformer
# Has an additional output_distribution parameter that allows matching a
# Gaussian distribution instead of a uniform one.
scaler5 = QuantileTransformer()
X5 = scaler5.fit_transform(X)
df5 = pd.DataFrame(X5, columns=column_names)
print(df5.describe())
sns.jointplot(data=df5, x='MedInc', y='AveOccup')
コメント: baseline ver2.1作成後のEDA """ # ライブラリのインポート from sklearn.preprocessing import PowerTransformer # durationのヒストグラム plt.hist(train['duration'], bins=50) plt.title('duration') plt.show() # Yeo-Johnson変換 pt = PowerTransformer(method='yeo-johnson') data = train['duration'].values.reshape(-1, 1) pt.fit(data) train['duration'] = pt.transform(data) # Yeo-Johnson変換後のdurationのヒストグラム plt.hist(train['duration'], bins=50) plt.title('duration(Yeo-Johnson)') plt.show() # campaignのヒストグラム plt.hist(train['campaign'], bins=50) plt.title('campaign') plt.show() # Box-Cox変換 pt = PowerTransformer(method='box-cox') data = train['campaign'].values.reshape(-1, 1)
print(y[:10]) print('************************************') print(np.max(x), np.min(x)) # 711.0 / 0.0 print(dataset.feature_names) # print(dataset.DESCR) # 데이터 전처리 (MinMaxScaler ; (x - min) / (max - min) -> 0 <= x' <= 1) from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer from sklearn.preprocessing import MaxAbsScaler, PowerTransformer # scaler = StandardScaler() # scaler = MaxAbsScaler() scaler = PowerTransformer(method='yeo-johnson') # scaler = PowerTransformer(method='box-cox') # only be applied to strictly positive data scaler.fit(x) x = scaler.transform(x) # Minmax # print(np.max(x), np.min(x)) # 711.0 / 0.0 -> 1.0 / 0.0 # print(np.max(x[0])) # 0.99999999999999999 # print(np.max(x), np.min(x)) # 9.933930601860268 -3.9071933049810337 print(np.max(x[0])) # 0.44105193260704206 from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
class DualNNRec(RecommenderBase, ABC):
    """Base class for a two-label recommender combining tokens and features.

    Subclasses provide the network through ``_get_model``.  This class
    handles feature normalisation (Yeo-Johnson), the epoch loop, validation,
    checkpointing and prediction.

    Parameters
    ----------
    weight_decay : float
        Weight decay applied to every parameter except biases and
        ``LayerNorm.weight``.
    lr : float
        Learning rate of the AdamW optimizer.
    cap_length : int
        Passed as ``cap`` to the dataset classes.
    eps : float
        Epsilon of the AdamW optimizer.
    num_warmup_steps : int
        Warm-up steps of the linear learning-rate schedule.
    epochs : int
        Number of training epochs.
    ffnn_params : dict or None
        Stored as-is for subclasses.
    """

    # TODO add support for Early Stopping

    # Number of labels predicted jointly by the model.
    _N_LABELS = 2

    def __init__(
        self,
        weight_decay: float = 0.0,
        lr: float = 2e-5,
        cap_length: int = 128,
        eps: float = 1e-8,
        num_warmup_steps: int = 0,
        epochs: int = 4,
        ffnn_params: dict = None,
    ):
        super().__init__()
        self.device = self._find_device()
        self.weight_decay = weight_decay
        self.lr = lr
        self.eps = eps
        self.num_warmup_steps = num_warmup_steps
        self.epochs = epochs
        self.cap_length = cap_length
        self.scaler = PowerTransformer(method='yeo-johnson')
        self.ffnn_params = ffnn_params
        self.model = None

        # Set the seed value all over the place to make this reproducible.
        self.seed_val = 42
        random.seed(self.seed_val)
        np.random.seed(self.seed_val)
        torch.manual_seed(self.seed_val)

    @abstractmethod
    def _get_model(self, ffnn_input_size):
        """Build and return the network for the given FFNN input width."""
        pass

    def _find_device(self):
        """Return the CUDA device when one is available, else the CPU."""
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('We will use the GPU:', torch.cuda.get_device_name(0))
        else:
            print('No GPU available, using the CPU instead.')
            device = torch.device("cpu")
        return device

    def _normalize_features(self, df, is_train=False):
        """Yeo-Johnson transform ``df``; fit the scaler first if ``is_train``.

        The returned frame keeps the column names of ``df`` but gets a fresh
        default index.
        """
        if is_train:
            print("Fitting yeo-jhonson scaler")
            self.scaler.fit(df)
        return pd.DataFrame(self.scaler.transform(df), columns=df.columns)

    def load_model(self):
        pass

    # TODO add support for cat features
    def fit(self,
            df_train_features: pd.DataFrame,
            df_train_tokens_reader: pd.io.parsers.TextFileReader,
            df_train_label: pd.DataFrame,
            df_val_features: pd.DataFrame,
            df_val_tokens_reader: pd.io.parsers.TextFileReader,
            df_val_label: pd.DataFrame,
            save_filename: str,
            cat_feature_set: set,
            normalize: bool = True,
            train_batches_to_skip: int = 0,
            val_batches_to_skip: int = 0,
            pretrained_model_dict_path: str = None,
            pretrained_optimizer_dict_path: str = None):
        """Train the model, validating and checkpointing after every epoch.

        Model and optimizer state dicts are written to ``./saved_models``
        under names derived from ``save_filename``.  ``cat_feature_set`` is
        currently unused (see the TODO above).

        Returns
        -------
        list of dict
            One entry per epoch with the losses and timings.
        """
        self.df_train_label = df_train_label
        self.df_val_label = df_val_label

        print(df_train_features)
        print(df_val_features)

        assert len(
            df_train_label.columns) == 2, "it needs 2 labels in train df."
        assert len(df_val_label.columns) == 2, "it needs 2 labels in val df."
        assert len(df_train_features.columns) == len(df_val_features.columns), \
            "df_train_features and df_val_features have different number of columns"

        if normalize:
            df_train_features = self._normalize_features(df_train_features,
                                                         is_train=True)
            df_val_features = self._normalize_features(df_val_features)

        print(df_train_features)
        print(df_val_features)

        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(self.seed_val)

        ffnn_input_size = HIDDEN_SIZE_BERT + df_train_features.shape[1]
        self.model = self._get_model(ffnn_input_size=ffnn_input_size)

        if pretrained_model_dict_path is not None:
            print(f"Loading pretrained model : {pretrained_model_dict_path}")
            # map_location lets a checkpoint saved on GPU load on a CPU host.
            self.model.load_state_dict(
                torch.load(pretrained_model_dict_path,
                           map_location=self.device))

        # Same device the batches are moved to in train()/validation().
        self.model.to(self.device)

        train_dataset = CustomDatasetDualCap(
            df_features=df_train_features,
            df_tokens_reader=df_train_tokens_reader,
            df_label=df_train_label,
            cap=self.cap_length,
            batches_to_skip=train_batches_to_skip)

        val_dataset = CustomDatasetDualCap(
            df_features=df_val_features,
            df_tokens_reader=df_val_tokens_reader,
            df_label=df_val_label,
            cap=self.cap_length,
            batches_to_skip=val_batches_to_skip)

        train_dataloader, validation_dataloader = create_data_loaders(
            train_dataset,
            val_dataset,
            batch_size=df_train_tokens_reader.chunksize)

        # Prepare optimizer and schedule (linear warmup and decay).
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': self.weight_decay
        }, {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.lr,
                          eps=self.eps)

        if pretrained_optimizer_dict_path is not None:
            print(
                f"Loading pretrained optimizer : {pretrained_optimizer_dict_path}"
            )
            optimizer.load_state_dict(
                torch.load(pretrained_optimizer_dict_path,
                           map_location=self.device))

        # Total number of training steps is [number of batches] x [number of
        # epochs] (not the number of training samples).
        total_steps = len(train_dataloader) * self.epochs

        # Honour the configured warm-up instead of a hard-coded 0.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.num_warmup_steps,
            num_training_steps=total_steps)

        training_stats = []
        total_t0 = time.time()

        for epoch_i in range(0, self.epochs):
            # ---------------- Training ----------------
            print("")
            print('======== Epoch {:} / {:} ========'.format(
                epoch_i + 1, self.epochs))
            print('Training...')

            avg_train_loss, training_time = self.train(
                self.model, train_dataloader, optimizer, scheduler)

            # ---------------- Validation ----------------
            print("")
            print("Running Validation...")

            avg_val_loss, validation_time = self.validation(
                model=self.model, validation_dataloader=validation_dataloader)

            curr_stats = {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
            training_stats.append(curr_stats)

            # Checkpoint after every epoch (same file names each time).
            pathlib.Path('./saved_models').mkdir(parents=True, exist_ok=True)
            model_path = f"./saved_models/saved_model_{save_filename}"
            optimizer_path = f"./saved_models/saved_optimizer_{save_filename}"
            print(f"Saving model : {model_path}")
            torch.save(self.model.state_dict(), model_path)
            torch.save(optimizer.state_dict(), optimizer_path)

            # Epoch summary for the (currently disabled) telegram update.
            bot_string = "DistilBertDoubleInput NN - dual_label \n ---------------- \n"
            bot_string = bot_string + str(self.model)
            bot_string = bot_string + "Weight decay: " + str(
                self.weight_decay) + "\n"
            bot_string = bot_string + "Learning rate: " + str(self.lr) + "\n"
            bot_string = bot_string + "Epsilon: " + str(
                self.eps) + "\n ---------------- \n"
            bot_string = bot_string + "\n".join(
                [key + ": " + str(curr_stats[key])
                 for key in curr_stats]) + "\n\n"
            bot_string = bot_string + "Saved to : " + model_path
            # telegram_bot_send_update(bot_string)

        print("")
        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(
            format_time(time.time() - total_t0)))

        return training_stats

    def _collect_batch(self, output_list, b_labels, preds_chunks,
                       labels_chunks):
        """Append this batch's predictions and labels to the per-label lists."""
        label_matrix = b_labels.detach().cpu().numpy()
        for i in range(self._N_LABELS):
            preds_chunks[i].append(output_list[i][2])
            labels_chunks[i].append(label_matrix[:, i])

    def _report_epoch(self, preds_chunks, labels_chunks):
        """Evaluate each label on the collected batches and print the stats."""
        for i in range(self._N_LABELS):
            # One concatenation per label instead of one per batch.
            prauc, rce, conf, max_pred, min_pred, avg = self.evaluate(
                preds=np.hstack(preds_chunks[i]),
                labels=np.hstack(labels_chunks[i]))
            print(f"\n------- LABEL {i + 1} -------")
            print(f"PRAUC : {prauc}"
                  f"\nRCE : {rce}"
                  f"\nMIN : {min_pred}"
                  f"\nMAX : {max_pred}"
                  f"\nAVG : {avg}")

    def train(self, model, train_dataloader, optimizer, scheduler):
        """Run one training epoch.

        Returns
        -------
        tuple
            ``(average training loss, formatted epoch duration)``.
        """
        t0 = time.time()
        total_train_loss = 0

        # train() only switches the mode (dropout/batchnorm behaviour); it
        # does not perform any training by itself.
        model.train()

        preds_chunks = [[] for _ in range(self._N_LABELS)]
        labels_chunks = [[] for _ in range(self._N_LABELS)]

        for step, batch in tqdm(enumerate(train_dataloader),
                                total=len(train_dataloader)):
            # `batch` holds four tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: features
            #   [3]: labels
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_features = batch[2].to(self.device)
            b_labels = batch[3].to(self.device)

            # Gradients accumulate by default, so clear them before each
            # backward pass.
            model.zero_grad()

            output_list = model(input_ids=b_input_ids,
                                input_features=b_features,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            # NOTE(review): the optimised loss is read from the first
            # label's output only -- presumably it is the joint loss;
            # confirm against the model implementation.
            loss = output_list[0][0]
            total_train_loss += loss.item()

            self._collect_batch(output_list, b_labels, preds_chunks,
                                labels_chunks)

            loss.backward()

            # Clip the gradient norm to 1.0 to help prevent exploding
            # gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            # Update the learning rate.
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        print("TRAINING STATISTICS FOR EPOCH")
        self._report_epoch(preds_chunks, labels_chunks)

        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        return avg_train_loss, training_time

    def validation(self, model, validation_dataloader):
        """Run one pass over the validation set without gradient tracking.

        Returns
        -------
        tuple
            ``(average validation loss, formatted duration)``.
        """
        t0 = time.time()

        # Evaluation mode: dropout layers behave differently.
        model.eval()

        total_eval_loss = 0
        preds_chunks = [[] for _ in range(self._N_LABELS)]
        labels_chunks = [[] for _ in range(self._N_LABELS)]

        for step, batch in tqdm(enumerate(validation_dataloader),
                                total=len(validation_dataloader)):
            # `batch` holds four tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: features
            #   [3]: labels
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_features = batch[2].to(self.device)
            b_labels = batch[3].to(self.device)

            # No compute graph is needed outside of training.
            with torch.no_grad():
                output_list = model(
                    input_ids=b_input_ids,
                    input_features=b_features,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                )

            loss = output_list[0][0]
            total_eval_loss += loss.item()

            self._collect_batch(output_list, b_labels, preds_chunks,
                                labels_chunks)

        avg_val_loss = total_eval_loss / len(validation_dataloader)

        print("VALIDATION STATISTICS FOR EPOCH")
        self._report_epoch(preds_chunks, labels_chunks)

        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        return avg_val_loss, validation_time

    def evaluate(self, preds, labels=None):
        """Compute the evaluation metrics of ``preds`` against ``labels``.

        Returns
        -------
        tuple
            ``(prauc, rce, confusion matrix, max prediction, min prediction,
            average prediction)``.

        Raises
        ------
        ValueError
            If ``labels`` is ``None``.
        RuntimeError
            If no model has been trained or loaded yet.
        """
        # Fail explicitly rather than on an unbound local at the return.
        if labels is None:
            raise ValueError("No labels passed, cannot perform evaluation.")
        if self.model is None:
            raise RuntimeError("No model trained, cannot perform evaluation.")

        # Class containing the metrics.
        cm = CoMe(preds, labels)

        prauc = cm.compute_prauc()
        rce = cm.compute_rce()
        # Confusion matrix
        conf = cm.confMatrix()
        # Prediction stats
        max_pred, min_pred, avg = cm.computeStatistics()

        return prauc, rce, conf, max_pred, min_pred, avg

    def get_prediction(self,
                       df_test_features: pd.DataFrame,
                       df_test_tokens_reader: pd.io.parsers.TextFileReader,
                       pretrained_model_dict_path: str = None,
                       normalize: bool = True):
        """Return sigmoid predictions for the test set.

        When ``pretrained_model_dict_path`` is given, a fresh model is built
        and its weights are loaded from that path; otherwise the model
        trained by ``fit`` is used.

        Returns
        -------
        numpy.ndarray or None
            Predictions of all batches stacked vertically, or ``None`` when
            the dataloader yields no batch.
        """
        if normalize:
            df_test_features = self._normalize_features(df_test_features)

        if pretrained_model_dict_path is None:
            assert self.model is not None, "You are trying to predict without training."
        else:
            ffnn_input_size = HIDDEN_SIZE_BERT + df_test_features.shape[1]
            self.model = self._get_model(ffnn_input_size=ffnn_input_size)
            self.model.load_state_dict(
                torch.load(pretrained_model_dict_path,
                           map_location=self.device))
            # Follow the detected device so CPU-only hosts work too.
            self.model.to(self.device)

        self.model.eval()

        test_dataset = CustomTestDatasetCap(
            df_features=df_test_features,
            df_tokens_reader=df_test_tokens_reader,
            cap=self.cap_length)

        test_dataloader = DataLoader(
            test_dataset,
            # Select batches sequentially.
            sampler=SequentialSampler(test_dataset),
            # Generate predictions with this batch size.
            batch_size=df_test_tokens_reader.chunksize)

        pred_chunks = []
        for step, batch in tqdm(enumerate(test_dataloader),
                                total=len(test_dataloader)):
            # `batch` holds: [0] input ids, [1] attention masks, [2] features.
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_features = batch[2].to(self.device)

            with torch.no_grad():
                curr_logits = self.model(input_ids=b_input_ids,
                                         input_features=b_features,
                                         attention_mask=b_input_mask)

            # The first output element holds the logits (pre-activation).
            curr_logits = curr_logits[0]
            curr_preds = torch.sigmoid(curr_logits)
            pred_chunks.append(curr_preds.detach().cpu().numpy())

        return np.vstack(pred_chunks) if pred_chunks else None
from sklearn.model_selection import train_test_split

# 80/20 train/test split, then 80/20 train/validation split of the training
# part; both use the same seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=66)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, train_size=0.8, random_state=66)

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, RobustScaler, PowerTransformer

# Alternatives tried:
# scaler = QuantileTransformer(n_quantiles=100)
# scaler = RobustScaler()
scaler = PowerTransformer()
# Fit on the training data only, then apply to every split.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, BatchNormalization

# Fully connected ReLU stack; the first layer takes 11 input features.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(11, )),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
])
marginal_x="rug", marginal_y="histogram") # >> fig_density.show() # Show density heatmap for cities fig_city = pltx.density_heatmap(data.head(30000), x="ORIGIN", y="DEST", marginal_y="histogram") # >> fig_city.show() # Explore the skewness skew = data.skew() print('Skewness:', skew) # Fix using a yj transformation numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] num_cols = data.select_dtypes(include=numerics) pt = PowerTransformer(method='yeo-johnson') skewed_features = [] for feature, skew in skew.items(): if skew >= 1.5 and feature in num_cols.columns.values and feature != 'YEAR': skewed_features.append(feature) pt = PowerTransformer() pt.fit(data[skewed_features]) data[skewed_features] = pt.transform(data[skewed_features]) print('Skewness after normalization:', data.skew())
def chang_hug_map(X, hex_colors, FONT_SIZE=12, BINS=30):
    '''
    Plot every feature before and after three "map to normal" transforms.

    For each column of X four histograms are drawn, stacked vertically:
    the raw training values, then the held-out values after Box-Cox
    (preceded by a MinMax scaling to [1, 2] so inputs are strictly
    positive), after Yeo-Johnson, and after a normal-output quantile
    transform. The figure is saved to 'fig/09_col_trf.png' and shown.

    REF: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py

    Parameters:
    * X = features (DataFrame; one plotted column set per column)
    * hex_colors = hexadecimal colors to be used for each feature
    * FONT_SIZE = size of font on plots
    * BINS = number of bins on histogram plots

    The grid is fixed at 12 x 15 axes, i.e. room for 45 features; features
    (or colors) beyond that are silently ignored by the zip below.
    '''
    # setting preprocessing methods: PowerTransformer (Box-Cox, Yeo-Johnson); QuantileTransformer
    scaler = MinMaxScaler(feature_range=(1, 2))
    boxcox = PowerTransformer(method='box-cox')
    bc = Pipeline(steps=[('s', scaler), ('bc', boxcox)])
    yj = PowerTransformer(method='yeo-johnson')
    # NOTE: the same RandomState instance drives both the quantile
    # transformer and every train/test split, so call order matters for
    # reproducibility.
    rng = np.random.RandomState(304)
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=rng)

    # adding distributions of columns: (name, column as an (n, 1) array)
    distributions = [(name, X[name].to_numpy().reshape(-1, 1))
                     for name in X.columns]

    colors = hex_colors

    # generating the plot
    n_rows, n_cols = 12, 15
    rows_per_set = 4  # original + one row per preprocessing method
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(35, 25))
    flat_axes = axes.flatten()

    # Each feature occupies one column of a 4-row band; bands are filled
    # left to right, top to bottom: (0, 15, 30, 45), (1, 16, 31, 46), ...
    axes_list = [
        tuple(flat_axes[(band * rows_per_set + row) * n_cols + col]
              for row in range(rows_per_set))
        for band in range(n_rows // rows_per_set)
        for col in range(n_cols)
    ]

    n_plotted = 0
    for distribution, color, axes_set in zip(distributions, colors, axes_list):
        n_plotted += 1
        name, X_col = distribution
        X_train, X_test = train_test_split(X_col,
                                           test_size=0.2,
                                           random_state=rng)

        # perform power and quantile transforms
        X_trans_bc = bc.fit(X_train).transform(X_test)
        lmbda_bc = round(bc.named_steps['bc'].lambdas_[0], 2)
        X_trans_yj = yj.fit(X_train).transform(X_test)
        lmbda_yj = round(yj.lambdas_[0], 2)
        X_trans_qt = qt.fit(X_train).transform(X_test)

        ax_original, ax_bc, ax_yj, ax_qt = axes_set

        ax_original.hist(X_train, color=color, bins=BINS)
        ax_original.set_title(name, fontsize=FONT_SIZE)
        ax_original.tick_params(axis='both',
                                which='major',
                                labelsize=FONT_SIZE)

        for ax, X_trans, meth_name, lmbda in zip(
                (ax_bc, ax_yj, ax_qt),
                (X_trans_bc, X_trans_yj, X_trans_qt),
                ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
                (lmbda_bc, lmbda_yj, None)):
            ax.hist(X_trans, color=color, bins=BINS)
            title = f'After {meth_name}'
            if lmbda is not None:
                # Escaped backslash: same text as before, without relying on
                # the invalid "\l" escape sequence.
                title += f'\n$\\lambda$ = {lmbda}'
            ax.set_title(title, fontsize=FONT_SIZE)
            ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
            ax.set_xlim([-3.5, 3.5])

    # Blank out every axis set that received no feature. (Previously the
    # last 10 sets were always hidden, which is only right for 35 features.)
    for unused_set in axes_list[n_plotted:]:
        for ax in unused_set:
            ax.axis('off')

    # Export and last adjustments
    plt.tight_layout()
    plt.savefig('fig/09_col_trf.png')
    plt.show()
class PreprocessData:
    """Load the datasets listed in ``config.yaml`` and scale / un-scale them.

    Which transformations are applied is controlled by the ``enable_*``
    flags; a few common combinations are selected through
    ``preprocess_type``. ``scale_data`` applies the enabled steps and
    ``rescale_data`` undoes them (in reverse order, where implemented).
    """

    def __init__(self,
                 preprocess_type=None,
                 extend_data=False,
                 short_end=False):
        """
        Parameters
        ----------
        preprocess_type : PreprocessType or None
            Preset that switches on the matching ``enable_*`` flags.
        extend_data : bool
            If True, every training dataset is duplicated with all values
            shifted up by 10 and registered under ``"<name>_10"``.
        short_end : bool
            If True, ``get_data`` keeps only the first column of each dataset.
        """
        self.config = Config()

        # prepare input data
        config_path = self.config.get_filepath("", "config.yaml")
        # Context manager so the config file handle is always released.
        with open(config_path, 'r') as config_file:
            yaml_config = yaml.load(config_file, Loader=yaml.SafeLoader)

        self.training_dataset_names = [
            d['name'] for d in yaml_config['training_datasets']
        ]
        self.training_dataset_start_pos = [
            d['start_position'] for d in yaml_config['training_datasets']
        ]
        self.test_dataset_names = [
            d['name'] for d in yaml_config['test_datasets']
        ]
        self.test_dataset_start_pos = [
            d['start_position'] for d in yaml_config['test_datasets']
        ]
        self.dataset_names = np.concatenate(
            (self.training_dataset_names,
             self.test_dataset_names))  # do we need these?
        self.dataset_start_pos = np.concatenate(
            (self.training_dataset_start_pos,
             self.test_dataset_start_pos))  # do we need these?

        # read in all pickle files
        self.all_pd = [
            pd.read_pickle(self.config.get_filepath_data(dataset_name))
            for dataset_name in self.dataset_names
        ]

        if extend_data:
            # Snapshot the names first: the attribute is extended in the loop.
            training_dataset_names_copy = np.array(self.training_dataset_names,
                                                   copy=True)

            # create a copy of the data shifted up by 10
            for i, dataset_name in enumerate(training_dataset_names_copy):
                self.dataset_names = np.append(self.dataset_names,
                                               dataset_name + "_" + str(10))
                self.training_dataset_names = np.append(
                    self.training_dataset_names, dataset_name + "_" + str(10))
                self.dataset_start_pos = np.append(
                    self.dataset_start_pos, self.training_dataset_start_pos[i])
                self.training_dataset_start_pos.append(
                    self.training_dataset_start_pos[i])
                self.all_pd.append(self.all_pd[i].copy() + 10)

        # dataset name -> position in all_pd / the per-dataset scaler lists
        self.dict_datasets = dict(
            zip(self.dataset_names, np.arange(len(self.dataset_names))))

        self.enable_difference = False

        self._feature_range = [0, 1]
        self.normalisation_scalers = [
            MinMaxScaler(feature_range=self.feature_range)
            for _ in self.dataset_names
        ]
        self.enable_normalisation_scaler = False
        self.enable_ignore_price = False  # scale each curve to feature_range

        self.power_transformer = PowerTransformer()
        self.enable_power_transform = False

        self.standardisation_scalers = [
            StandardScaler() for _ in self.dataset_names
        ]
        self.enable_standardisation_scaler = False

        self.enable_log_returns = False
        self.mult_factor = 10  # 5
        self.add_factor = 25  # 6

        self.enable_log = False

        self.enable_pct_change = False

        self.enable_curve_smoothing = False

        self.short_end = short_end

        # now setup PreprocessType settings
        if preprocess_type is PreprocessType.NORMALISATION_OVER_TENORS:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
        elif preprocess_type is PreprocessType.NORMALISATION_OVER_CURVES:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
            self.enable_ignore_price = True
        elif preprocess_type is PreprocessType.STANDARDISATION_OVER_TENORS:
            self.enable_standardisation_scaler = True
        elif preprocess_type is PreprocessType.LOG_RETURNS_OVER_TENORS:
            self.enable_log_returns = True

    @property
    def feature_range(self):  # implements the get - this name is *the* name
        """Target range used by the per-dataset ``MinMaxScaler`` instances."""
        return self._feature_range

    @feature_range.setter
    def feature_range(self, value):  # name must be the same
        # Changing the range replaces every normalisation scaler with a
        # fresh, unfitted one.
        self._feature_range = value
        for i, _ in enumerate(self.dataset_names):
            self.normalisation_scalers[i] = MinMaxScaler(feature_range=value)

    def get_data(self,
                 training_dataset_names=None,
                 test_dataset_names=None,
                 chunks_of=None):
        """Return raw and scaled data for the requested datasets.

        Parameters
        ----------
        training_dataset_names, test_dataset_names : str, sequence or None
            Dataset names to load; ``None`` falls back to the names read
            from the config. A single string is wrapped in an array.
        chunks_of : int or None
            If given, the slice end is truncated to a multiple of this value.

        Returns
        -------
        tuple
            ``(training_data, test_data, training_data_scaled,
            test_data_scaled, training_dataset_names, test_dataset_names,
            maturities)``; or ``(training_data_scaled, maturities)`` when no
            test dataset names are available at all.
        """
        if training_dataset_names is None:
            training_dataset_names = self.training_dataset_names
        if isinstance(training_dataset_names, str):
            training_dataset_names = np.array([training_dataset_names])

        if test_dataset_names is None:
            test_dataset_names = self.test_dataset_names
        if test_dataset_names is None and self.test_dataset_names is None:
            test_dataset_names = []
        if isinstance(test_dataset_names, str):
            test_dataset_names = np.array([test_dataset_names])

        training_data = []
        test_data = []
        training_data_scaled = []
        test_data_scaled = []
        for key, value in self.dict_datasets.items():
            start_position = self.dataset_start_pos[value]
            end_position = None
            if chunks_of is not None:
                # NOTE(review): this is a length measured from start_position
                # but is used as an absolute slice end below - confirm this is
                # intended when start_position > 0.
                end_position = chunks_of * (
                    (self.all_pd[value].shape[0] - start_position) //
                    chunks_of)

            if key in training_dataset_names:
                # we take the log returns of each data set and scale wrt first dataset
                new_training_data = self.all_pd[value].copy(
                )[start_position:end_position]
                if self.short_end:
                    new_training_data = new_training_data.iloc[:, 0]
                new_training_data_scaled = self.scale_data(
                    new_training_data, value, True)

                training_data.append(new_training_data)
                training_data_scaled.append(new_training_data_scaled)

            if key in test_dataset_names:
                new_test_data = self.all_pd[value].copy(
                )[start_position:end_position]
                if self.short_end:
                    new_test_data = new_test_data.iloc[:, 0]
                new_test_data_scaled = self.scale_data(
                    new_test_data, value,
                    True)  # todo: should we scale test data wrt training data?

                test_data.append(new_test_data)
                test_data_scaled.append(new_test_data_scaled)

        maturities = self.all_pd[0].columns.values / (30 * 12)  # for years

        if test_dataset_names is not None:
            return training_data, test_data, training_data_scaled, test_data_scaled, training_dataset_names, test_dataset_names, maturities
        else:
            return training_data_scaled, maturities

    # def rescale_data_inputter(self, data, datasets=None):
    #     rescaled_data = []
    #     if datasets == "train":
    #         for i, name in enumerate(self.training_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #
    #     elif datasets == "test":
    #         for i, name in enumerate(self.test_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             # self.scale_data(self, data, dataset_num=pos)
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #
    #     return rescaled_data

    def scale_data(self, data, dataset_name=None, should_fit=False):
        """Apply every enabled transformation to ``data``.

        Parameters
        ----------
        data : numpy.ndarray, pandas object or list
            A list is scaled element-wise, pairing each element with the
            corresponding entry of ``dataset_name``.
        dataset_name : str, int, list or None
            Dataset name, or directly its integer position; selects which
            per-dataset scaler is used.
        should_fit : bool
            If True the scalers are (re)fitted on ``data`` before transforming.

        Returns
        -------
        pandas.DataFrame or list of pandas.DataFrame
        """
        # if given a numpy array, convert it to a dataframe first
        if type(data) is np.ndarray:
            _data = pd.DataFrame(data=data)
        elif isinstance(data, list):
            _data_list = []
            # if isinstance(dataset_name, list):
            for _data, _dataset_name in zip(data, dataset_name):
                _data_list.append(
                    self.scale_data(_data, _dataset_name, should_fit))
            # else:
            #     for _data in data:
            #         _data_list.append(self.scale_data(_data, should_fit, dataset_name))
            return _data_list
        else:
            _data = data.copy()

        time = _data.axes[0].tolist()
        # maturities = _data.columns.values

        # 999 is a "not found" sentinel; it is out of range for the scaler
        # lists, so an unknown name fails loudly with an IndexError later.
        dataset_num = 999
        if dataset_name is not None:
            if isinstance(dataset_name, numbers.Integral):
                dataset_num = dataset_name
            else:
                for key, value in self.dict_datasets.items():
                    if key == dataset_name:
                        dataset_num = value

        if self.enable_log:
            _data = _data.apply(np.log)

        if self.enable_difference:
            _data = _data.diff(axis=1)
            _data = _data.fillna(0)

        if self.enable_pct_change:
            _data = _data.pct_change()
            _data = _data.fillna(0)

        if self.enable_log_returns:
            # add_factor shifts the values so the ratio stays positive and
            # the log below is defined
            shift = (_data.shift(0) + self.add_factor) / (_data.shift(1) +
                                                          self.add_factor)
            shift = shift.dropna()

            if not (np.array(shift) > 0).all():
                # some values are non-positive... this will break the log
                print("NON-POSITIVE VALUES FOUND, CANNOT PASS THROUGH LOG!!")
                print(np.min(_data))
                print(shift)

            _data = self.mult_factor * np.log(shift)

        # dropna above may have removed the first row, so refresh the index
        time = _data.axes[0].tolist()

        # now use only numpy, convert pandas to numpy array
        _data = _data.values

        if self.short_end and len(_data.shape) == 1:
            _data = _data.reshape(-1, 1)

        if self.enable_standardisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.standardisation_scalers[dataset_num].fit(_data)
                _data = self.standardisation_scalers[dataset_num].transform(
                    _data)
            else:
                # Each row is fitted and scaled on its own, so the scaler
                # only remembers the last row afterwards.
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.standardisation_scalers[dataset_num].fit(
                        row_as_column)
                    temp = self.standardisation_scalers[dataset_num].transform(
                        row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_normalisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.normalisation_scalers[dataset_num].fit(_data)
                _data = self.normalisation_scalers[dataset_num].transform(
                    _data)
            else:
                # Same row-by-row scheme as for standardisation above.
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.normalisation_scalers[dataset_num].fit(row_as_column)
                    temp = self.normalisation_scalers[dataset_num].transform(
                        row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_power_transform:
            if should_fit:
                self.power_transformer.fit(_data)
            _data = self.power_transformer.transform(_data)

        df = pd.DataFrame(data=_data, index=np.array(time))

        return df

    def rescale_data(self,
                     data,
                     dataset_name=None,
                     start_value=None,
                     index=None,
                     columns=None):
        """Undo the enabled transformations applied by ``scale_data``.

        Parameters
        ----------
        data : array-like or pandas.DataFrame
            Scaled values. For a DataFrame, its index/columns are reused for
            the result unless overridden.
        dataset_name : str, int or None
            Dataset name or integer position; selects the per-dataset scaler
            and, for log returns, the default start value.
        start_value : array-like or None
            First level used to rebuild values from log returns.
        index, columns : array-like or None
            If both are known the result is returned as a DataFrame.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame

        Raises
        ------
        NotImplementedError
            For standardisation combined with ``enable_ignore_price``.
        """
        if isinstance(data, pd.DataFrame):
            if columns is None:
                columns = data.columns.values
            if index is None:
                index = data.index.values

        if type(data) is np.ndarray:
            temp_data = data
        else:
            temp_data = np.array(data)

        if self.short_end and len(temp_data.shape) == 1:
            temp_data = temp_data.reshape(-1, 1)

        dataset_num = 999
        if dataset_name is not None:
            # Accept an integer position as well, mirroring scale_data.
            if isinstance(dataset_name, numbers.Integral):
                dataset_num = dataset_name
            else:
                for key, value in self.dict_datasets.items():
                    if key == dataset_name:
                        dataset_num = value

        if self.enable_difference:
            temp_data = temp_data  # TODO: inverse difference

        if self.enable_power_transform:
            temp_data = self.power_transformer.inverse_transform(temp_data)

        if self.enable_normalisation_scaler:

            # we need to scale each rolling window manually
            if self.enable_ignore_price:
                # rescale each curve individually
                data_min = self.all_pd[dataset_num].min(axis=1)
                data_max = self.all_pd[dataset_num].max(axis=1)
                a = self.feature_range[0]
                b = self.feature_range[1]
                for i in np.arange(temp_data.shape[0]):
                    temp_data[i] = (
                        (temp_data[i] - a) /
                        (b - a)) * (data_max[i] - data_min[i]) + data_min[i]
            else:
                if len(temp_data.shape) == 3:
                    # 3-D input: invert one 2-D slice at a time
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.normalisation_scalers[dataset_num].
                            inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)

                else:
                    temp_data = self.normalisation_scalers[
                        dataset_num].inverse_transform(temp_data)

        if self.enable_standardisation_scaler:
            # temp_data = self.standardisation_scaler.inverse_transform(temp_data)
            if self.enable_ignore_price:
                raise NotImplementedError
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.standardisation_scalers[dataset_num].
                            inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)

                else:
                    temp_data = self.standardisation_scalers[
                        dataset_num].inverse_transform(temp_data)

        if self.enable_log:
            temp_data = np.exp(temp_data)

        if self.enable_log_returns:

            # if start_value is not assigned but dataset_name is, use the first value of the dataset as start_value
            if dataset_name is not None and start_value is None:
                _start_value = self.all_pd[dataset_num].iloc[0]
            elif start_value is not None:
                _start_value = start_value
            else:
                _start_value = 1.

            # print("shapes, log-return rescale", temp_data.shape, _start_value.shape, _start_value[0].shape)

            # Compare dimensions with ==, not "is": identity of ints is an
            # implementation detail.
            n_dims = len(temp_data.shape)
            if n_dims == 1:
                z = np.exp(temp_data / self.mult_factor)

                z = np.insert(
                    np.array(z), 0, _start_value[0] +
                    self.add_factor)  # instead of the usual _start_value
                temp_data = np.cumprod(z) - self.add_factor
                temp_data = pd.DataFrame(data=temp_data,
                                         index=self.all_pd[dataset_num].index)
                # print(temp_data.head(10))

            elif n_dims == 2:  # when taking log-returns on an individual batch, todo: check
                if self.short_end:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z,
                                  0,
                                  _start_value[0] + self.add_factor,
                                  axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor
                else:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z,
                                  0,
                                  _start_value + self.add_factor,
                                  axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor

            elif n_dims > 2:  # when taking log-returns on multiple batches
                z = np.exp(temp_data[:, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor, axis=1)
                temp_data = np.cumprod(z, axis=1) - self.add_factor
            else:
                # Only reachable for 0-dimensional input.
                z = np.exp(temp_data[0, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor)
                temp_data = np.cumprod(z) - self.add_factor

            # print("log returns undo...", _start_value, temp_data[0])

        if self.enable_curve_smoothing:
            curve_smooth = []
            for curve in temp_data:
                curve_smooth.append(savgol_filter(
                    curve, 23, 5))  # window size 23, polynomial order 5
            temp_data = np.array(curve_smooth)

        if index is not None and columns is not None:
            return pd.DataFrame(temp_data, index=index, columns=columns)
        else:
            return temp_data
# Log transformation
# In the previous exercises you scaled the data linearly, which will not
# affect the data's shape. This works great if your data is normally
# distributed (or closely normally distributed), an assumption that a lot of
# machine learning models make. Sometimes you will work with data that closely
# conforms to normality, e.g. the height or weight of a population. On the
# other hand, many variables in the real world do not follow this pattern,
# e.g. wages or age of a population. In this exercise you will use a log
# transform on the ConvertedSalary column in the so_numeric_df DataFrame as it
# has a large amount of its data centered around the lower values, but
# contains very high values also. These distributions are said to have a long
# right tail.

# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate the transformer and train it on the salary column in one step
# (fit returns the transformer itself).
pow_trans = PowerTransformer().fit(so_numeric_df[['ConvertedSalary']])

# Apply the power transform to the data, storing it next to the raw column
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(
    so_numeric_df[['ConvertedSalary']])

# Plot the data before and after the transformation
so_numeric_df.hist(column=['ConvertedSalary', 'ConvertedSalary_LG'])
plt.show()

# Percentage based outlier removal
# One way to ensure a small portion of data is not having an overly adverse
# effect is by removing a certain percentage of the largest and/or smallest
# values in the column. This can be achieved by finding the relevant quantile
# and trimming the data using it with a mask. This approach is particularly
# useful if you are concerned that the highest values in your dataset should
# be avoided. When using this approach, you must remember that even if there
# are no outliers, this will still remove the same top N percentage from the
# dataset.

# Find the 95th quantile
del df_iterator gc.collect() return ret ##############################Train######################################## train_path = "/data/recsys2020/history_nn/TrainXGB.csv" train_dict = generate_dict_np(train_path) ##Fit scalers scaler_f = PowerTransformer(copy=False) start_time = time.time() s = len(train_dict['features']) scaler_f.fit(train_dict['features'][np.random.choice(s, int(0.1 * s))].astype( np.float64, copy=False)) print("Elapsed: {0}".format(inhour(time.time() - start_time))) print("fit feature scaler") ##Save scalers with open('/data/recsys2020/history_nn/f_scaler.pkl', 'wb') as f: pickle.dump(scaler_f, f, protocol=4) ##Fit scalers ## Load scalers # with open('/data/recsys2020/history_nn/f_scaler.pkl', 'rb') as f: # scaler_f = pickle.load(f) ## Load scalers ##Apply scalers to train set start_time = time.time() train_dict['features'] = scaler_f.transform(train_dict['features'])
'darkorchid' ] fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2)) axes = axes.flatten() axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21), (13, 16, 19, 22), (14, 17, 20, 23)] axes_list = [(axes[i], axes[j], axes[k], axes[l]) for (i, j, k, l) in axes_idxs] for distribution, color, axes in zip(distributions, colors, axes_list): name, X = distribution X_train, X_test = train_test_split(X, test_size=.5) # perform power transforms and quantile transform X_trans_bc = bc.fit(X_train).transform(X_test) lmbda_bc = round(bc.lambdas_[0], 2) X_trans_yj = yj.fit(X_train).transform(X_test) lmbda_yj = round(yj.lambdas_[0], 2) X_trans_qt = qt.fit(X_train).transform(X_test) ax_original, ax_bc, ax_yj, ax_qt = axes ax_original.hist(X_train, color=color, bins=BINS) ax_original.set_title(name, fontsize=FONT_SIZE) ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) for ax, X_trans, meth_name, lmbda in zip( (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt), ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'), (lmbda_bc, lmbda_yj, None)):
def get_data(
        ssX=None,
        batch_size=32,
        train=True,
        **kwargs):
    """Build the training and evaluation dataloaders.

    inputs:
        ssX: fitted scaler to reuse; if None a new one is fitted on the
            training split (PowerTransformer when ``power_transform`` is
            set, StandardScaler otherwise).
        batch_size: int, batch size of the training dataloader.
        train: if False, no splitting is done and every split aliases the
            full dataset (not allowed together with ``train_all``).
        kwargs:
            model: required; receives a copy of the scaler as ``model.ssX``.
            plot_random, train_all, plot, fix_megno, include_derivatives,
            power_transform: optional boolean switches, all default False.

    return:
        (dataloader, test_dataloader)
    """
    plot_random = kwargs.get('plot_random', False)
    plot_resonant = not plot_random
    train_all = kwargs.get('train_all', False)
    plot = kwargs.get('plot', False)
    if not train_all and ssX is None:
        # A fresh scaler is always fitted on the resonant dataset.
        plot_resonant = True
        plot_random = False

    if train_all:
        filename = 'data/combined.pkl'
    elif plot_resonant:
        filename = 'data/resonant_dataset.pkl'
    elif plot_random:
        filename = 'data/random_dataset.pkl'

    # These are generated by data_from_pkl.py
    # NOTE(review): pickle executes arbitrary code on load; only use with
    # trusted, locally generated files.
    with open(filename, 'rb') as data_file:
        loaded_data = pkl.load(data_file)

    train_ssX = (ssX is None)

    fullX, fully = loaded_data['X'], loaded_data['y']

    if train_all:
        len_random = 17082  # Number of valid random examples (others have NaNs)
        # Boolean flag per example: True for the trailing len_random rows.
        random_data = np.arange(len(fullX)) >= (len(fullX) - len_random)

    # Differentiate megno
    if kwargs.get('fix_megno'):
        idx = [i for i, lab in enumerate(loaded_data['labels']) if 'megno' in lab][0]
        fullX[:, 1:, idx] -= fullX[:, :-1, idx]

    if kwargs.get('include_derivatives'):
        derivative = fullX[:, 1:, :] - fullX[:, :-1, :]
        # Repeat the first difference so the time axis keeps its length.
        derivative = np.concatenate((
            derivative[:, [0], :],
            derivative), axis=1)
        fullX = np.concatenate((
            fullX, derivative),
            axis=2)

    # Hide fraction of test
    # MAKE SURE WE DO COPIES AFTER!!!!
    if train:
        if train_all:
            remy, finaly, remX, finalX, rem_random, final_random = train_test_split(fully, fullX, random_data, shuffle=True, test_size=1./10, random_state=0)
            trainy, testy, trainX, testX, train_random, test_random = train_test_split(remy, remX, rem_random, shuffle=True, test_size=1./10, random_state=1)
        else:
            remy, finaly, remX, finalX = train_test_split(fully, fullX, shuffle=True, test_size=1./10, random_state=0)
            trainy, testy, trainX, testX = train_test_split(remy, remX, shuffle=True, test_size=1./10, random_state=1)
    else:
        assert not train_all
        remy = fully
        finaly = fully
        testy = fully
        trainy = fully
        remX = fullX
        finalX = fullX
        testX = fullX
        trainX = fullX

    if plot:
        # Use test dataset for plotting, so put it in validation part:
        testX = finalX
        testy = finaly
        if train_all:
            # Keep the random/resonant flags aligned with the swapped-in
            # examples; otherwise the mask below has the wrong length.
            test_random = final_random

    if train_ssX:
        if kwargs.get('power_transform'):
            ssX = PowerTransformer(method='yeo-johnson')  # Power is best
        else:
            ssX = StandardScaler()

    n_t = trainX.shape[1]
    n_features = trainX.shape[2]

    if train_ssX:
        # Fit on every 1539th flattened time step only.
        ssX.fit(trainX.reshape(-1, n_features)[::1539])

    ttrainy = trainy
    ttesty = testy
    ttrainX = ssX.transform(trainX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    ttestX = ssX.transform(testX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    if train_all:
        ttest_random = test_random
        ttrain_random = train_random

    train_len = ttrainX.shape[0]
    X = Variable(torch.from_numpy(np.concatenate((ttrainX, ttestX))).type(torch.FloatTensor))
    y = Variable(torch.from_numpy(np.concatenate((ttrainy, ttesty))).type(torch.FloatTensor))
    if train_all:
        r = Variable(torch.from_numpy(np.concatenate((ttrain_random, ttest_random))).type(torch.BoolTensor))

    idxes = np.s_[:]
    dataset = torch.utils.data.TensorDataset(X[:train_len, :, idxes], y[:train_len])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=8)

    # Cut up dataset into only the random or resonant parts.
    # Only needed if plotting OR
    if (not plot) or (not train_all):
        test_dataset = torch.utils.data.TensorDataset(X[train_len:, :, idxes], y[train_len:])
    else:
        mask = r if plot_random else ~r
        print(f'Plotting with {mask.sum()} total elements, when plot_random={plot_random}')
        # Select with `mask` (previously `r` was always used, so asking for
        # the resonant subset still returned the random one).
        test_mask = mask[train_len:]
        test_dataset = torch.utils.data.TensorDataset(X[train_len:][test_mask][:, :, idxes], y[train_len:][test_mask])

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=3000, shuffle=False, pin_memory=True, num_workers=8)

    kwargs['model'].ssX = copy(ssX)

    return dataloader, test_dataloader
# With the increase in income, ccavg also increases, and people tend to take more loans.
sns.scatterplot(data=df, x='CCAvg', y='Income', hue='PersonalLoan')

# NO CORRELATION BETWEEN A CUSTOMER USING INTERNET BANKING FACILITIES AND TAKING A PERSONAL LOAN.
sns.countplot(data=df, x='Online', hue='PersonalLoan')

sns.boxplot(data=df, x='PersonalLoan', y='CCAvg')

"""# **NECESSARY TRANSFORMATIONS FOR FEATURE VARIABLES**"""

# Target is the loan flag; every other column is a feature.
y = df['PersonalLoan']
x = df.drop(['PersonalLoan'], axis=1)

from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson (without standardising) on each of the two columns, one fresh
# transformer per column, plotting the resulting distribution each time.
for skewed_col in ('Income', 'CCAvg'):
    pt = PowerTransformer(method="yeo-johnson", standardize=False)
    col_2d = x[skewed_col].values.reshape(-1, 1)
    pt.fit(col_2d)
    x[skewed_col] = pt.transform(col_2d)
    sns.distplot(x[skewed_col])

# Bucket the mortgage amount into 100-wide bins labelled 0..6 and replace the
# raw column with the bucket index.
x['Mortgage_Int'] = pd.cut(x['Mortgage'],
                           bins=list(range(0, 701, 100)),
                           labels=list(range(7)),
                           include_lowest=True)
x.drop(columns='Mortgage', inplace=True)
sns.distplot(x.Mortgage_Int)
# Plot styling: seaborn defaults with a slightly smaller "talk" context.
sns.reset_defaults()
#sns.set_style('whitegrid')
#sns.set_context('talk')
sns.set_context(context='talk', font_scale=0.7)
tfd = tfp.distributions

nametrain = '/Users/aklimase/Documents/USGS/data/cybertrainyeti10_residfeb.csv'
nametest = '/Users/aklimase/Documents/USGS/data/cybertestyeti10_residfeb.csv'
(train_data1, test_data1,
 train_targets1, test_targets1,
 feature_names) = readindata(nametrain, nametest, n=12)

#%%
# Preprocessing: transform the input data to be Gaussian shaped. The
# transformer is fitted on the training inputs and reused for the test inputs.
pt = PowerTransformer()
aa = pt.fit(train_data1[:, :])
train_data, test_data = aa.transform(train_data1), aa.transform(test_data1)

# First 5000 target rows of each set.
train_targets = train_targets1[:5000]
test_targets = test_targets1[:5000]

# First target column only, kept 2-D with shape (1, n_samples).
y_test = test_targets1.T[:1]
y_train = train_targets1.T[:1]

# Per-feature [minima, maxima] of the transformed training inputs.
x_range = [[min(feature) for feature in train_data.T],
           [max(feature) for feature in train_data.T]]

# First 5000 transformed input rows of each set.
x_train = train_data[:5000]
x_test = test_data[:5000]
# Transform the data using the fitted scaler
so_numeric_df['Age_SS'] = SS_scaler.transform(so_numeric_df[['Age']])

# Compare the original and transformed column
print(so_numeric_df.loc[:, ['Age_SS', 'Age']].head())

## Log transformation

# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate PowerTransformer and train it on the salary column
# (fit returns the transformer itself)
pow_trans = PowerTransformer().fit(so_numeric_df[["ConvertedSalary"]])

# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(
    so_numeric_df[['ConvertedSalary']])

# Plot the data before and after the transformation
so_numeric_df.hist(column=['ConvertedSalary', 'ConvertedSalary_LG'])
plt.show()

### Removing outliers

## Percentage based outlier removal

# Find the 95th quantile
quantile = so_numeric_df['ConvertedSalary'].quantile(q=0.95)
def single_results(datFiles, splitFile):
    """Train and score three classifiers on each feature file.

    Parameters
    ----------
    datFiles : iterable of str
        Paths to headerless CSV files whose last two columns are the sample
        id and the class label; all preceding columns are features.
    splitFile : str
        CSV with at least the columns ``id``, ``synsetId``, ``subSynsetId``,
        ``split`` (``train`` / ``test`` / ``val``) and ``modelId``.

    For every file the test accuracy (in percent, two decimals) of each
    classifier is printed and written to ``<datFile with .dat -> _ans.csv>``.
    Returns None.
    """
    # LOADING TRAIN_TEST_VALIDATION SPLIT FILE
    split = pd.read_csv(splitFile)
    split = split.drop(["id", "synsetId", "subSynsetId"], axis=1)

    # SETTING SPLIT VARIABLES
    train = split.loc[split["split"] == "train"]
    test = split.loc[split["split"] == "test"]
    val = split.loc[split["split"] == "val"]

    for datFile in datFiles:
        # LOADING DATA FILE
        df = pd.read_csv(datFile, header=None)
        n_features = len(df.columns) - 2
        feats = ["z{}".format(x) for x in range(n_features)]
        cols = feats + ["sample", "class"]
        df.columns = cols

        # SPLITTING SETS
        train_set = df.loc[df["sample"].isin(train["modelId"])]
        test_set = df.loc[df["sample"].isin(test["modelId"])]
        val_set = df.loc[df["sample"].isin(val["modelId"])]

        X_train = train_set.drop(["sample", "class"], axis=1)
        y_train = train_set["class"]

        X_test = test_set.drop(["sample", "class"], axis=1)
        y_test = test_set["class"]

        X_val = val_set.drop(["sample", "class"], axis=1)
        y_val = val_set["class"]

        # REMOVE ZERO VARIANCE
        # Fit on the training set only and reuse the same column selection
        # for test/val; re-fitting per set could drop different columns and
        # leave the sets with mismatched feature spaces.
        selector = VarianceThreshold()
        X_train = selector.fit_transform(X_train)
        X_test = selector.transform(X_test)
        X_val = selector.transform(X_val)

        # STANDARDIZATION
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        X_val = scaler.transform(X_val)

        # SKEW REMOVAL
        pt = PowerTransformer(method="yeo-johnson", standardize=False)
        pt.fit(X_train)
        X_train = pt.transform(X_train)
        X_test = pt.transform(X_test)
        X_val = pt.transform(X_val)

        # CLASSIFIERS
        classifiers = {
            "kNN": KNN(n_neighbors=8, weights="distance"),
            "SVM": SVM(C=3, gamma="scale", kernel="rbf"),
            "RFC": RandomForest(n_estimators=500),
        }

        ans = {
            "classifier": [],
            "accuracy": [],
        }

        for name, classifier in classifiers.items():
            # CLASSIFICATION
            classifier.fit(X_train, y_train)
            accuracy = classifier.score(X_test, y_test)
            accuracy = round(100 * accuracy, 2)

            ans["classifier"].append(name)
            ans["accuracy"].append(accuracy)

            print("{}\t{}\t{}".format(datFile, name, accuracy))

        # One results file per input file; `ans` and `classifiers` are
        # rebuilt on the next iteration, so no explicit `del` is needed.
        ans = pd.DataFrame(ans)
        ans.to_csv(datFile.replace(".dat", "_ans.csv"), index=None)
def train():
    """Train the price model on ``listings.csv`` and persist its artefacts.

    Writes to the working directory:
    * ``powerTransform.joblib`` - Yeo-Johnson transformer fitted on the
      training prices,
    * ``le_<column>.joblib`` - one label encoder per categorical column,
      fitted on the training rows,
    * ``model.joblib`` - the fitted LGBMRegressor.

    Returns None.
    """
    seed = 0
    df = pd.read_csv('listings.csv')
    train_df, test_df = train_test_split(df,
                                         test_size=0.2,
                                         random_state=seed,
                                         shuffle=True)

    # Drop unnecessary columns
    kept_cols = [
        'neighbourhood_group', 'neighbourhood', 'room_type', 'minimum_nights',
        'price'
    ]
    train_df = train_df[kept_cols]
    test_df = test_df[kept_cols]

    # Power Transform
    X_train = train_df.drop(['price'], axis=1)
    y_train = train_df['price'].values
    X_test = test_df.drop(['price'], axis=1)
    y_test = test_df['price'].values
    num_cols = X_train._get_numeric_data().columns.tolist()

    # Separate transformers for features and target, each fitted exactly
    # once on the training data (previously a single object was refitted).
    feature_pt = PowerTransformer(method='yeo-johnson')
    X_train[num_cols] = feature_pt.fit_transform(X_train[num_cols])
    X_test[num_cols] = feature_pt.transform(X_test[num_cols])

    target_pt = PowerTransformer(method='yeo-johnson')
    y_train = target_pt.fit_transform(y_train.reshape(-1, 1))
    y_test = target_pt.transform(y_test.reshape(-1, 1))
    # saving the fitted target transformer
    joblib.dump(target_pt, 'powerTransform.joblib')

    # Label Encoder
    cat_cols_train = X_train.select_dtypes(
        include=['string', 'object']).columns.tolist()
    cat_cols_test = X_test.select_dtypes(
        include=['string', 'object']).columns.tolist()

    for col in cat_cols_train:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype('string'))
        joblib.dump(le, 'le_{}.joblib'.format(col))

        if col in cat_cols_test:
            # Encode the test set with the *training* codes so both sets
            # share one mapping; labels never seen in training become -1
            # instead of receiving unrelated codes from a separate fit.
            code_of = {label: code for code, label in enumerate(le.classes_)}
            X_test[col] = [
                code_of.get(label, -1)
                for label in X_test[col].astype('string')
            ]

    # Outliers: drop training rows whose transformed price is below -4
    X_train['price'] = y_train.ravel().tolist()
    X_train.drop(X_train[(X_train['price'] < -4)].index, inplace=True)
    y_train = X_train['price']
    X_train.drop('price', axis=1, inplace=True)

    # Model
    X_train = X_train.values
    y_train = y_train.values

    model = LGBMRegressor(max_depth=10, num_leaves=20, random_state=0)
    model.fit(X_train, y_train)
    joblib.dump(model, "model.joblib")
def yeo_johnson_transformer(self):
    """Return a Yeo-Johnson PowerTransformer fitted on ``self.train_imputed_numeric_df``."""
    # fit() hands back the transformer itself, so build-and-fit can be chained.
    return PowerTransformer(
        method="yeo-johnson",
        copy=True,
    ).fit(self.train_imputed_numeric_df)
data.dtypes

# astype dataset: the nine descriptive columns are treated as categoricals
for category_col in ('REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM',
                     'HOM_SIDO_NM', 'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC'):
    data[category_col] = data[category_col].astype('category')

# Transformation: Box-Cox (no standardisation) on columns 9..11.
# Box-Cox requires strictly positive inputs.
pt = PowerTransformer(method='box-cox', standardize=False)
pt.fit(data.iloc[:, 9:12])
pt_int_data = pt.transform(data.iloc[:, 9:12])
# Fitted per-column lambdas; the attribute is `lambdas_` (there is no
# `lambda_`, which raised AttributeError).
pt.lambdas_

# Group by
category_data = pd.DataFrame(data.iloc[:, :9])
# Reuse data's index so the concat below aligns row-for-row even when the
# index is not a default RangeIndex.
pt_int_data = pd.DataFrame(pt_int_data,
                           columns=data.columns[9:12],
                           index=data.index)
pt_data = pd.concat([category_data, pt_int_data], axis=1)
pt_data = pt_data.sort_values(by='REG_YYMM')
# NOTE(review): this groups by *every* column of `data`, including the three
# transformed ones, which leaves nothing to sum - presumably only the first
# nine (categorical) columns were intended; confirm before changing.
groupby_pt = pt_data.groupby(list(data.columns), observed=True)
sum_groupby_pt = groupby_pt.sum()
pt_data.REG_YYMM.value_counts()

# Shaping
# X = np.column_stack((lol, X[:,2])) # # difference data # y_gauss = JP_highZ(X[:,0], (X[:, 1] / (X[:,0]**(1/3)) ), X[:,2]) *1000 # y_gauss = y_gauss.reshape(len(y_gauss),1) # y = y_gauss - y_og #Scaling X scaler = MinMaxScaler(feature_range=(0, 1)) X_scaled = scaler.fit_transform(X) scaler_x = scaler.fit(X) #scaling y scaler2 = PowerTransformer() #scaler2 = MinMaxScaler(feature_range=(0,1)) scaler_y = scaler2.fit(y) y_scaled = scaler_y.transform(y) scaler_y2 = scaler.fit(y_scaled) y_scaled = scaler_y2.transform(y_scaled) # create model def baseline_model(): model = Sequential() model.add( Dense(200, input_dim=2, kernel_initializer='he_uniform', activation='relu')) #model.add(Dropout(0.2))
class PowerTransformer(BaseEstimator, TransformerMixin):
    """
    Power transform (Yeo-Johnson or Box-Cox) built on the wrapped ``PT`` transformer.

    Compared with using ``PT`` directly, this wrapper accepts lists,
    ``Series`` and ``DataFrame`` input, lets the caller supply the lambda
    values instead of estimating them, forces constant columns to
    ``lambda = 1`` and returns a ``DataFrame`` when given one.

    References
    ----------
    G.E.P. Box and D.R. Cox, “An Analysis of Transformations”,
    Journal of the Royal Statistical Society B, 26, 211-252 (1964).
    """

    def __init__(self,
                 *,
                 method='yeo-johnson',
                 standardize=False,
                 lmd=None,
                 tolerance=(-np.inf, np.inf),
                 on_err=None):
        """
        Parameters
        ----------
        method: 'yeo-johnson' or 'box-cox'
            ‘yeo-johnson’ works with positive and negative values
            ‘box-cox’ only works with strictly positive values
        standardize: boolean
            Normalize to standard normal or not.
            Recommend using a separate `standard` function instead of using this option.
        lmd: float, list or 1-dim ndarray
            You might assign each input xs with a specific lmd yourself.
            A scalar is applied to every column.
            Leave None (default) to use an inferred value.
            See `PowerTransformer`_ for details.
        tolerance: tuple
            Tolerance of lmd. Set None to accept any.
            Default is **(-np.inf, np.inf)** but recommend **(-2, 2)** for Box-cox transform.
            Stored on the instance; ``fit`` in this class does not consult it.
        on_err: None or str
            Error handle when try to inference lambda. Can be None or **log**, **nan** or **raise** by string.
            **log** will return the logarithmic transform of xs that have a min shift to 1.
            **nan** return ``ndarray`` with shape xs.shape filled with ``np.nan``.
            **raise** raise a FloatingPointError. You can catch it yourself.
            Default(None) will return the input series without scale transform.
            Stored on the instance; ``fit`` in this class does not consult it.

        .. _PowerTransformer:
            https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
        """
        # Public copies under the constructor argument names: the
        # BaseEstimator machinery (get_params / repr / clone) reads
        # parameters back through exactly these attributes.
        self.method = method
        self.standardize = standardize
        self.lmd = lmd
        self.tolerance = tolerance
        self.on_err = on_err

        self._tolerance = tolerance
        self._pt = PT(method=method, standardize=standardize)
        self._lmd = lmd
        self._shape = None
        self._on_err = on_err

    def _check_type(self, x):
        """Coerce ``x`` to an ndarray with at least two dimensions.

        Lists are converted to float arrays, ``DataFrame``/``Series`` are
        unwrapped to their values and 1-dim input becomes a single column.

        Raises
        ------
        TypeError
            If ``x`` is none of list, ``DataFrame``, ``Series`` or ``ndarray``.
        """
        if isinstance(x, list):
            # The builtin `float` replaces the `np.float` alias, which no
            # longer exists in current NumPy releases.
            x = np.array(x, dtype=float)
        elif isinstance(x, (DataFrame, Series)):
            x = x.values

        if not isinstance(x, np.ndarray):
            raise TypeError(
                'parameter `X` should be a `DataFrame`, `Series`, `ndarray` or list object '
                'but got {}'.format(type(x)))
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        return x

    def fit(self, x, y=None):
        """
        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature transformation
        y : ignored
            Accepted only so the estimator can be used where a ``fit(X, y)``
            signature is expected.

        Returns
        -------
        self : object
            Fitted scaler.

        Raises
        ------
        ValueError
            If a sequence ``lmd`` was supplied whose length differs from the
            number of columns in ``x``.
        """
        x = self._pt._check_input(self._check_type(x), in_fit=True)

        # forcing constant column vectors to have no transformation (lambda=1)
        constant_cols = [
            i for i, col in enumerate(x.T) if np.all(col == col[0])
        ]

        if self._lmd is None:
            self._pt.fit(x)
        elif np.ndim(self._lmd) == 0:
            # Any scalar (int, float, NumPy scalar) is broadcast to all columns.
            self._pt.lambdas_ = np.full(x.shape[1], float(self._lmd))
        elif x.shape[1] != len(self._lmd):
            raise ValueError(
                'shape[1] of parameter `X` should be {} but got {}'.format(
                    len(self._lmd), x.shape[1]))
        else:
            # NOTE(review): with user-supplied lambdas the wrapped transformer
            # is never fitted itself; confirm that `standardize=True` works in
            # this configuration.
            self._pt.lambdas_ = np.array(self._lmd, dtype=float)

        if constant_cols:
            self._pt.lambdas_[constant_cols] = 1.
        return self

    def transform(self, x):
        """Apply the fitted power transform.

        Returns a ``DataFrame`` with the input's index and columns when ``x``
        is a ``DataFrame``; otherwise whatever the wrapped transformer returns.
        """
        ret = self._pt.transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret

    def inverse_transform(self, x):
        """Undo the fitted power transform.

        Returns a ``DataFrame`` with the input's index and columns when ``x``
        is a ``DataFrame``; otherwise whatever the wrapped transformer returns.
        """
        ret = self._pt.inverse_transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret
def get_outliers(
        data,
        STD_NORM,
        side,
        METHOD='yeo-johnson',
        PLOT=False,
        title=None,
        title_fontsize=None,
        x_label=None,
        y_label=None,
        label_fontsize=None
):
    """Find outliers on one tail of ``data`` using a power transform.

    The data is power-transformed, a threshold is placed at
    ``mean + STD_NORM * std`` of the transformed values, and that threshold is
    mapped back to the original scale. Values beyond it on the requested side
    are returned.

    Args:
        data: Values to screen. Must provide ``to_numpy()`` and support
            arithmetic and boolean-mask indexing (presumably a
            ``pandas.Series`` -- verify against callers).
        STD_NORM: Number of standard deviations from the mean of the
            transformed data at which the threshold is placed. May be negative.
        side: ``'left'`` to return values below the threshold, ``'right'`` to
            return values above it.
        METHOD: Power-transform method, ``'yeo-johnson'`` or ``'box-cox'``.
            For ``'box-cox'`` with non-positive data, the data is shifted so
            that its minimum becomes 1 before fitting; the threshold is
            shifted back afterwards.
        PLOT: When true, show histograms before/after the transform and a
            Q-Q plot of the transformed data.
        title: Title of the first plot.
        title_fontsize: Font size of the title.
        x_label: X-axis label of the first plot.
        y_label: Accepted for interface compatibility; not used by the plots.
        label_fontsize: Font size of the axis labels.

    Returns:
        tuple: ``(outliers, data_thres)`` -- the selected entries of ``data``
        and the threshold expressed on the original scale.

    Raises:
        AssertionError: If ``side`` is neither ``'left'`` nor ``'right'``.
        ValueError: Same condition, when assertions are disabled.
    """
    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    # ==================================================
    # Error checking
    assert side == 'left' or side == 'right', "'side' argument has to be either 'left' or 'right'"

    # ==================================================
    # Box-Cox needs strictly positive input: if the minimum is not positive,
    # fit on a copy shifted by a constant k. The caller's values are left
    # untouched so the returned outliers are exactly the original entries.
    k = None
    fit_data = data
    if METHOD == 'box-cox':
        data_min = min(data)
        if data_min <= 0:
            k = 1 - data_min
            fit_data = data + k

    # ----- Transform data
    pt = PowerTransformer(method=METHOD)
    fit_column = fit_data.to_numpy().reshape(-1, 1)
    # Find optimal lambda value for transform
    pt.fit(fit_column)
    # Transform data to a normal distribution
    data_trans = pt.transform(fit_column)

    # ----- Compute threshold to remove data above or below threshold
    # (computed on the ndarray on purpose: ndarray.std is the population
    # standard deviation)
    data_trans_thres = data_trans.mean() + STD_NORM * data_trans.std()

    # Transform threshold back to original distribution
    data_thres = pt.inverse_transform(
        np.array(data_trans_thres).reshape(1, -1))
    data_thres = data_thres.flatten()[0]

    # If the data was shifted before fitting, shift the threshold back by the
    # same constant.
    if k is not None:
        data_thres = data_thres - k

    if side == 'left':
        outliers = data[data < data_thres]
    elif side == 'right':
        outliers = data[data > data_thres]
    else:
        raise ValueError("Argument side has to be 'left'or 'right' ")

    if PLOT:
        # Plotting dependencies are imported only when a plot is requested,
        # so the numeric result does not require them.
        import importlib

        import matplotlib.pyplot as plt
        import pandas as pd
        import seaborn as sns
        from statsmodels.graphics.gofplots import qqplot

        import colourPals as cp
        importlib.reload(cp)

        # Flatten so the transformed data can be handled as a series
        data_trans = pd.Series(data_trans.flatten())

        FIG_SIZE = 3

        sns.set_style("darkgrid")
        sns.set_context("notebook")
        fig, ax = plt.subplots(nrows=3,
                               figsize=(FIG_SIZE * 2, FIG_SIZE * 3),
                               dpi=300)

        # Plot data before transformation
        sns.distplot(data, rug=True, kde=False, ax=ax[0],
                     color=cp.cbPaired['blue'])
        ax[0].axvline(x=data_thres, c=cp.cbPaired['red'])
        ax[0].set_title(title, fontsize=title_fontsize)
        ax[0].set_xlabel(x_label, fontsize=label_fontsize)
        ax[0].set_ylabel("Frequency", fontsize=label_fontsize)

        # Plot data after transformation
        sns.distplot(data_trans, rug=True, kde=False, ax=ax[1],
                     color=cp.cbPaired['purple'])
        ax[1].axvline(x=data_trans_thres, c=cp.cbPaired['red'])
        ax[1].set_xlabel(f"{METHOD.capitalize()} Transformed",
                         fontsize=label_fontsize)
        ax[1].set_ylabel("Frequency", fontsize=label_fontsize)

        # Plot qqplot of the transformed data
        qqplot(data_trans, ax=ax[2], line='s', color=cp.cbPaired['purple'])

        plt.tight_layout()
        plt.show()

    return outliers, data_thres
def load_data_from_folder(
    folder_path,
    text_cols,
    tokenizer,
    label_col,
    label_list=None,
    categorical_cols=None,
    numerical_cols=None,
    sep_text_token_str=' ',
    categorical_encode_type='ohe',
    numerical_transformer_method='quantile_normal',
    empty_text_values=None,
    replace_empty_text=None,
    max_token_length=None,
    debug=False,
):
    """
    Function to load tabular and text data from a specified folder

    Loads train, test and/or validation text and tabular data from specified
    folder path into TorchTextDataset class and does categorical and numerical
    data preprocessing if specified. Inside the folder, there is expected to be
    a train.csv, and test.csv (and if given val.csv) containing the training,
    testing, and validation sets respectively. The first column of each file
    is read as the index.

    Args:
        folder_path (str): The path to the folder containing `train.csv`, and `test.csv` (and if given `val.csv`)
        text_cols (:obj:`list` of :obj:`str`): The column names in the dataset that contain text
            from which we want to load
        tokenizer (:obj:`transformers.tokenization_utils.PreTrainedTokenizer`):
            HuggingFace tokenizer used to tokenize the input texts as specified by text_cols
        label_col (str): The column name of the label, for classification the column should have
            int values from 0 to n_classes-1 as the label for each class.
            For regression the column can have any numerical value
        label_list (:obj:`list` of :obj:`str`, optional): Used for classification;
            the names of the classes indexed by the values in label_col.
        categorical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain categorical features. The features can be already prepared numerically, or
            could be preprocessed by the method specified by categorical_encode_type.
            When empty or None, no categorical encoding is performed here.
        numerical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that contain numerical features.
            These columns should contain only numeric values.
        sep_text_token_str (str, optional): The string token that is used to separate between the
            different text columns for a given data example. For Bert for example,
            this could be the [SEP] token.
        categorical_encode_type (str, optional): Given categorical_cols, this specifies
            what method we want to preprocess our categorical features.
            choices: [ 'ohe', 'binary', None]
            see encode_features.CategoricalFeatures for more details
        numerical_transformer_method (str, optional): Given numerical_cols, this specifies
            what method we want to use for normalizing our numerical data.
            choices: ['yeo_johnson', 'box_cox', 'quantile_normal', None]
            (the string ``'none'`` is accepted as an alias of ``None``)
            see https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
            for more details
        empty_text_values (:obj:`list` of :obj:`str`, optional): specifies what texts should be considered as
            missing which would be replaced by replace_empty_text
        replace_empty_text (str, optional): The value of the string that will replace the texts
            that match with those in empty_text_values. If this argument is None then
            the text that match with empty_text_values will be skipped
        max_token_length (int, optional): The token length to pad or truncate to on the
            input text
        debug (bool, optional): Whether or not to load a smaller debug version of the dataset

    Returns:
        :obj:`tuple` of `tabular_torch_dataset.TorchTextDataset`:
            This tuple contains the training, validation and testing sets.
            The val dataset is :obj:`None` if there is no `val.csv` in folder_path

    Raises:
        ValueError: If numerical_transformer_method is not one of the
            supported choices.
    """
    train_df = pd.read_csv(join(folder_path, 'train.csv'), index_col=0)
    test_df = pd.read_csv(join(folder_path, 'test.csv'), index_col=0)
    if exists(join(folder_path, 'val.csv')):
        val_df = pd.read_csv(join(folder_path, 'val.csv'), index_col=0)
    else:
        val_df = None

    # Encode the categorical columns over the union of all splits so every
    # split ends up with the same encoded columns. Skipped when there are no
    # categorical columns to encode.
    if categorical_cols and categorical_encode_type in ('ohe', 'binary'):
        dfs = [df for df in [train_df, val_df, test_df] if df is not None]
        data_df = pd.concat(dfs, axis=0)

        if categorical_encode_type == 'ohe':
            original_cols = set(data_df.columns)
            data_df = pd.get_dummies(data_df,
                                     columns=categorical_cols,
                                     dummy_na=True)
            # The encoded features are exactly the columns get_dummies added;
            # matching by name prefix could also pick up unrelated columns
            # that merely start with a categorical column's name.
            categorical_cols = [
                col for col in data_df.columns if col not in original_cols
            ]
        else:
            cat_feat_processor = CategoricalFeatures(data_df, categorical_cols,
                                                     'binary')
            vals = cat_feat_processor.fit_transform()
            cat_df = pd.DataFrame(vals, columns=cat_feat_processor.feat_names)
            # Give the encoded frame the same row labels as data_df so the
            # column-wise concat pairs rows one-to-one instead of aligning on
            # a fresh 0..n-1 index.
            cat_df.index = data_df.index
            data_df = pd.concat([data_df, cat_df], axis=1)
            categorical_cols = cat_feat_processor.feat_names

        # Split back by position: the concat order is train, val, test.
        # Label-based lookup would return rows from several splits whenever
        # the files share index values.
        n_train = len(train_df)
        n_val = 0 if val_df is None else len(val_df)
        train_df = data_df.iloc[:n_train].copy()
        if val_df is not None:
            val_df = data_df.iloc[n_train:n_train + n_val].copy()
        test_df = data_df.iloc[n_train + n_val:].copy()

        # Encoding is done; downstream loading must not encode again.
        categorical_encode_type = None

    if numerical_transformer_method is None or numerical_transformer_method == 'none':
        numerical_transformer = None
    else:
        if numerical_transformer_method == 'yeo_johnson':
            numerical_transformer = PowerTransformer(method='yeo-johnson')
        elif numerical_transformer_method == 'box_cox':
            numerical_transformer = PowerTransformer(method='box-cox')
        elif numerical_transformer_method == 'quantile_normal':
            numerical_transformer = QuantileTransformer(
                output_distribution='normal')
        else:
            raise ValueError(f'preprocessing transformer method '
                             f'{numerical_transformer_method} not implemented')
        # Fit on the training split only.
        num_feats = load_num_feats(train_df, convert_to_func(numerical_cols))
        numerical_transformer.fit(num_feats)

    def _to_dataset(df):
        # Every split is loaded with the same settings and fitted transformer.
        return load_data(df, text_cols, tokenizer, label_col, label_list,
                         categorical_cols, numerical_cols, sep_text_token_str,
                         categorical_encode_type, numerical_transformer,
                         empty_text_values, replace_empty_text,
                         max_token_length, debug)

    train_dataset = _to_dataset(train_df)
    test_dataset = _to_dataset(test_df)
    val_dataset = _to_dataset(val_df) if val_df is not None else None

    return train_dataset, val_dataset, test_dataset
'seagreen', 'royalblue', 'darkorchid'] fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2)) axes = axes.flatten() axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21), (13, 16, 19, 22), (14, 17, 20, 23)] axes_list = [(axes[i], axes[j], axes[k], axes[l]) for (i, j, k, l) in axes_idxs] for distribution, color, axes in zip(distributions, colors, axes_list): name, X = distribution X_train, X_test = train_test_split(X, test_size=.5) # perform power transforms and quantile transform X_trans_bc = bc.fit(X_train).transform(X_test) lmbda_bc = round(bc.lambdas_[0], 2) X_trans_yj = yj.fit(X_train).transform(X_test) lmbda_yj = round(yj.lambdas_[0], 2) X_trans_qt = qt.fit(X_train).transform(X_test) ax_original, ax_bc, ax_yj, ax_qt = axes ax_original.hist(X_train, color=color, bins=BINS) ax_original.set_title(name, fontsize=FONT_SIZE) ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) for ax, X_trans, meth_name, lmbda in zip( (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt), ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
def fit_yeo_johnson_transformer(train_imputed_numeric_df: pd.DataFrame):
    """Return a Yeo-Johnson ``PowerTransformer`` fitted on the given frame."""
    # ``fit`` hands back the estimator itself, so the fitted instance can be
    # returned directly.
    return PowerTransformer(
        method="yeo-johnson",
        copy=True,
    ).fit(train_imputed_numeric_df)